/// other block.
bool isLayoutSuccessor(const MachineBasicBlock *MBB) const;
+ /// Return the successor of this block if it has a single successor.
+ /// Otherwise return a null pointer.
+ ///
+ const MachineBasicBlock *getSingleSuccessor() const;
+ /// Non-const overload of the above; implemented in terms of the const
+ /// version via the usual const_cast idiom so the lookup logic lives in
+ /// exactly one place.
+ MachineBasicBlock *getSingleSuccessor() {
+ return const_cast<MachineBasicBlock *>(
+ static_cast<const MachineBasicBlock *>(this)->getSingleSuccessor());
+ }
+
/// Return the fallthrough block if the block can implicitly
/// transfer control to the block after it by falling off the end of
/// it. This should return null if it can reach the block after
return std::next(I) == MachineFunction::const_iterator(MBB);
}
+// Return the unique successor of this block, or null when the block has
+// zero or more than one successor.
+const MachineBasicBlock *MachineBasicBlock::getSingleSuccessor() const {
+  if (Successors.size() != 1)
+    return nullptr;
+  return Successors.front();
+}
+
MachineBasicBlock *MachineBasicBlock::getFallThrough() {
MachineFunction::iterator Fallthrough = getIterator();
++Fallthrough;
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/Sequence.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/DebugCounter.h"
DenseSet<MachineInstr *> TrackedWaitcntSet;
DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
+ DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
+ MachineLoopInfo *MLI;
MachinePostDominatorTree *PDT;
struct BlockInfo {
(void)ForceVMCounter;
}
+ bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets);
+ bool isPreheaderToFlush(MachineBasicBlock &MBB,
+ WaitcntBrackets &ScoreBrackets);
bool runOnMachineFunction(MachineFunction &MF) override;
StringRef getPassName() const override {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
+ AU.addRequired<MachineLoopInfo>();
AU.addRequired<MachinePostDominatorTree>();
MachineFunctionPass::getAnalysisUsage(AU);
}
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
bool generateWaitcntInstBefore(MachineInstr &MI,
WaitcntBrackets &ScoreBrackets,
- MachineInstr *OldWaitcntInstr);
+ MachineInstr *OldWaitcntInstr,
+ bool FlushVmCnt);
+ bool generateWaitcntBlockEnd(MachineBasicBlock &Block,
+ WaitcntBrackets &ScoreBrackets,
+ MachineInstr *OldWaitcntInstr);
+ bool generateWaitcnt(AMDGPU::Waitcnt Wait,
+ MachineBasicBlock::instr_iterator It,
+ MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
+ MachineInstr *OldWaitcntInstr);
void updateEventWaitcntAfter(MachineInstr &Inst,
WaitcntBrackets *ScoreBrackets);
bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
WaitcntBrackets &ScoreBrackets);
bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
MachineInstr &OldWaitcntInstr,
- AMDGPU::Waitcnt &Wait, const MachineInstr *MI);
+ AMDGPU::Waitcnt &Wait,
+ MachineBasicBlock::instr_iterator It);
};
} // end anonymous namespace
INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
false)
return new SIInsertWaitcnts();
}
-/// Combine consecutive waitcnt instructions that precede \p MI and follow
+/// Combine consecutive waitcnt instructions that precede \p It and follow
/// \p OldWaitcntInstr and apply any extra wait from waitcnt that were added
/// by previous passes. Currently this pass conservatively assumes that these
/// preexisting waitcnt are required for correctness.
-bool SIInsertWaitcnts::applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
- MachineInstr &OldWaitcntInstr,
- AMDGPU::Waitcnt &Wait,
- const MachineInstr *MI) {
+bool SIInsertWaitcnts::applyPreexistingWaitcnt(
+ WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
+ AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) {
bool Modified = false;
MachineInstr *WaitcntInstr = nullptr;
MachineInstr *WaitcntVsCntInstr = nullptr;
- for (auto II = OldWaitcntInstr.getIterator(), NextI = std::next(II);
- &*II != MI; II = NextI, ++NextI) {
- if (II->isMetaInstruction())
+
+ for (auto &II :
+ make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
+ if (II.isMetaInstruction())
continue;
- if (II->getOpcode() == AMDGPU::S_WAITCNT) {
+ if (II.getOpcode() == AMDGPU::S_WAITCNT) {
// Conservatively update required wait if this waitcnt was added in an
// earlier pass. In this case it will not exist in the tracked waitcnt
// set.
- if (!TrackedWaitcntSet.count(&*II)) {
- unsigned IEnc = II->getOperand(0).getImm();
+ if (!TrackedWaitcntSet.count(&II)) {
+ unsigned IEnc = II.getOperand(0).getImm();
AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
Wait = Wait.combined(OldWait);
}
// Merge consecutive waitcnt of the same type by erasing multiples.
if (!WaitcntInstr) {
- WaitcntInstr = &*II;
+ WaitcntInstr = &II;
} else {
- II->eraseFromParent();
+ II.eraseFromParent();
Modified = true;
}
} else {
- assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
- assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
- if (!TrackedWaitcntSet.count(&*II)) {
+ assert(II.getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
+ assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
+ if (!TrackedWaitcntSet.count(&II)) {
unsigned OldVSCnt =
- TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->getImm();
+ TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt);
}
if (!WaitcntVsCntInstr) {
- WaitcntVsCntInstr = &*II;
+ WaitcntVsCntInstr = &II;
} else {
- II->eraseFromParent();
+ II.eraseFromParent();
Modified = true;
}
}
Wait.LgkmCnt = ~0u;
Wait.ExpCnt = ~0u;
- LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
- << "Old Instr: " << *MI << "New Instr: " << *WaitcntInstr
- << '\n');
+ LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
+ ? dbgs() << "applyPreexistingWaitcnt\n"
+ << "New Instr at block end: " << *WaitcntInstr
+ << '\n'
+ : dbgs() << "applyPreexistingWaitcnt\n"
+ << "Old Instr: " << *It
+ << "New Instr: " << *WaitcntInstr << '\n');
+
} else {
WaitcntInstr->eraseFromParent();
Modified = true;
ScoreBrackets.applyWaitcnt(Wait);
Wait.VsCnt = ~0u;
- LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
- << "Old Instr: " << *MI
- << "New Instr: " << *WaitcntVsCntInstr << '\n');
+ LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
+ ? dbgs() << "applyPreexistingWaitcnt\n"
+ << "New Instr at block end: "
+ << *WaitcntVsCntInstr << '\n'
+ : dbgs() << "applyPreexistingWaitcnt\n"
+ << "Old Instr: " << *It
+ << "New Instr: " << *WaitcntVsCntInstr << '\n');
} else {
WaitcntVsCntInstr->eraseFromParent();
Modified = true;
/// and if so what the value of each counter is.
/// The "score bracket" is bound by the lower bound and upper bound
/// scores (*_score_LB and *_score_ub respectively).
-bool SIInsertWaitcnts::generateWaitcntInstBefore(
- MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
- MachineInstr *OldWaitcntInstr) {
+/// If FlushVmCnt is true, that means that we want to generate a s_waitcnt to
+/// flush the vmcnt counter here.
+bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
+ WaitcntBrackets &ScoreBrackets,
+ MachineInstr *OldWaitcntInstr,
+ bool FlushVmCnt) {
setForceEmitWaitcnt();
if (MI.isMetaInstruction())
return false;
AMDGPU::Waitcnt Wait;
- bool Modified = false;
// FIXME: This should have already been handled by the memory legalizer.
// Removing this currently doesn't affect any lit tests, but we need to
if (ForceEmitWaitcnt[VS_CNT])
Wait.VsCnt = 0;
- if (OldWaitcntInstr) {
+ if (FlushVmCnt) {
+ unsigned UB = ScoreBrackets.getScoreUB(VM_CNT);
+ unsigned LB = ScoreBrackets.getScoreLB(VM_CNT);
+ if (UB - LB != 0)
+ Wait.VmCnt = 0;
+ }
+
+ return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
+ OldWaitcntInstr);
+}
+
+// Emit a waitcnt that flushes the vmcnt counter at the end of \p Block, if
+// one is needed.
+bool SIInsertWaitcnts::generateWaitcntBlockEnd(MachineBasicBlock &Block,
+                                               WaitcntBrackets &ScoreBrackets,
+                                               MachineInstr *OldWaitcntInstr) {
+  // If the VM_CNT score bracket is empty (lower bound equals upper bound),
+  // there is no outstanding vmem event to wait for.
+  if (ScoreBrackets.getScoreUB(VM_CNT) == ScoreBrackets.getScoreLB(VM_CNT))
+    return false;
+
+  AMDGPU::Waitcnt Wait;
+  Wait.VmCnt = 0;
+
+  return generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
+                         OldWaitcntInstr);
+}
+
+bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
+ MachineBasicBlock::instr_iterator It,
+ MachineBasicBlock &Block,
+ WaitcntBrackets &ScoreBrackets,
+ MachineInstr *OldWaitcntInstr) {
+ bool Modified = false;
+ const DebugLoc &DL = Block.findDebugLoc(It);
+
+ if (OldWaitcntInstr)
// Try to merge the required wait with preexisting waitcnt instructions.
// Also erase redundant waitcnt.
Modified =
- applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, &MI);
- } else {
- // Update waitcnt brackets after determining the required wait.
+ applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
+ else
ScoreBrackets.applyWaitcnt(Wait);
- }
// ExpCnt can be merged into VINTERP.
- if (Wait.ExpCnt != ~0u && SIInstrInfo::isVINTERP(MI)) {
- MachineOperand *WaitExp = TII->getNamedOperand(MI, AMDGPU::OpName::waitexp);
+ if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
+ SIInstrInfo::isVINTERP(*It)) {
+ MachineOperand *WaitExp =
+ TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
if (Wait.ExpCnt < WaitExp->getImm()) {
WaitExp->setImm(Wait.ExpCnt);
Modified = true;
Wait.ExpCnt = ~0u;
LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
- << "Update Instr: " << MI);
+ << "Update Instr: " << *It);
}
// Build new waitcnt instructions unless no wait is needed or the old waitcnt
// instruction was modified to handle the required wait.
if (Wait.hasWaitExceptVsCnt()) {
unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
- auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
- MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
- .addImm(Enc);
+ auto SWaitInst =
+ BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
TrackedWaitcntSet.insert(SWaitInst);
Modified = true;
- LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
- << "Old Instr: " << MI
- << "New Instr: " << *SWaitInst << '\n');
+ LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
+ if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
+ dbgs() << "New Instr: " << *SWaitInst << '\n');
}
if (Wait.hasWaitVsCnt()) {
assert(ST->hasVscnt());
- auto SWaitInst =
- BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
- TII->get(AMDGPU::S_WAITCNT_VSCNT))
- .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
- .addImm(Wait.VsCnt);
+ auto SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
+ .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+ .addImm(Wait.VsCnt);
TrackedWaitcntSet.insert(SWaitInst);
Modified = true;
- LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
- << "Old Instr: " << MI
- << "New Instr: " << *SWaitInst << '\n');
+ LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
+ if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
+ dbgs() << "New Instr: " << *SWaitInst << '\n');
}
-
return Modified;
}
continue;
}
+ bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
+ isPreheaderToFlush(Block, ScoreBrackets);
+
// Generate an s_waitcnt instruction to be placed before Inst, if needed.
- Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
+ Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
+ FlushVmCnt);
OldWaitcntInstr = nullptr;
// Restore vccz if it's not known to be correct already.
++Iter;
}
+ if (Block.getFirstTerminator() == Block.end() &&
+ isPreheaderToFlush(Block, ScoreBrackets))
+ Modified |= generateWaitcntBlockEnd(Block, ScoreBrackets, OldWaitcntInstr);
+
return Modified;
}
+// Return true if the given machine basic block is a preheader of a loop in
+// which we want to flush the vmcnt counter, and false otherwise.
+// The result is cached in PreheadersToFlush so each block is analyzed once.
+bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
+                                          WaitcntBrackets &ScoreBrackets) {
+  // A single hash lookup both consults and primes the cache: try_emplace
+  // returns the existing entry if one is present, and otherwise inserts a
+  // default (false) entry whose value we update below. This replaces the
+  // previous count()/operator[] pattern, which performed up to three lookups.
+  auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false);
+  if (!IsInserted)
+    return Iterator->second;
+
+  // Only a block with a unique successor can be a preheader.
+  MachineBasicBlock *Succ = MBB.getSingleSuccessor();
+  if (!Succ)
+    return false;
+
+  MachineLoop *Loop = MLI->getLoopFor(Succ);
+  if (!Loop)
+    return false;
+
+  if (Loop->getLoopPreheader() == &MBB &&
+      shouldFlushVmCnt(Loop, ScoreBrackets)) {
+    Iterator->second = true;
+    return true;
+  }
+
+  return false;
+}
+
+// Return true if it is better to flush the vmcnt counter in the preheader of
+// the given loop. We currently decide to flush in two situations:
+// 1. The loop contains vmem store(s), no vmem load and at least one use of a
+// vgpr containing a value that is loaded outside of the loop. (Only on
+// targets with no vscnt counter).
+// 2. The loop contains vmem load(s), but the loaded values are not used in the
+// loop, and at least one use of a vgpr containing a value that is loaded
+// outside of the loop.
+bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
+ WaitcntBrackets &Brackets) {
+ bool HasVMemLoad = false;
+ bool HasVMemStore = false;
+ bool UsesVgprLoadedOutside = false;
+ // VgprUse: vgprs read by any instruction in the loop.
+ // VgprDef: vgprs written by vmem loads inside the loop.
+ // A register appearing in both sets means a value loaded inside the loop is
+ // also consumed inside it, which invalidates both situations above.
+ DenseSet<Register> VgprUse;
+ DenseSet<Register> VgprDef;
+
+ for (MachineBasicBlock *MBB : ML->blocks()) {
+ for (MachineInstr &MI : *MBB) {
+ if (SIInstrInfo::isVMEM(MI)) {
+ if (MI.mayLoad())
+ HasVMemLoad = true;
+ if (MI.mayStore())
+ HasVMemStore = true;
+ }
+ for (unsigned I = 0; I < MI.getNumOperands(); I++) {
+ MachineOperand &Op = MI.getOperand(I);
+ if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg()))
+ continue;
+ RegInterval Interval = Brackets.getRegInterval(&MI, TII, MRI, TRI, I);
+ // Vgpr use
+ if (Op.isUse()) {
+ for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ // If we find a register that is loaded inside the loop, 1. and 2.
+ // are invalidated and we can exit.
+ if (VgprDef.contains(RegNo))
+ return false;
+ VgprUse.insert(RegNo);
+ // If at least one of Op's registers is in the score brackets, the
+ // value is likely loaded outside of the loop.
+ if (Brackets.getRegScore(RegNo, VM_CNT) > 0) {
+ UsesVgprLoadedOutside = true;
+ break;
+ }
+ }
+ }
+ // VMem load vgpr def
+ else if (SIInstrInfo::isVMEM(MI) && MI.mayLoad() && Op.isDef())
+ for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ // If we find a register that is loaded inside the loop, 1. and 2.
+ // are invalidated and we can exit.
+ if (VgprUse.contains(RegNo))
+ return false;
+ VgprDef.insert(RegNo);
+ }
+ }
+ }
+ }
+ // Situation 1: stores only, no vscnt counter on this target.
+ if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
+ return true;
+ // Situation 2: loads whose results are unused in the loop (a used-in-loop
+ // load would have returned false above via the VgprUse/VgprDef check).
+ return HasVMemLoad && UsesVgprLoadedOutside;
+}
+
bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
ST = &MF.getSubtarget<GCNSubtarget>();
TII = ST->getInstrInfo();
MRI = &MF.getRegInfo();
IV = AMDGPU::getIsaVersion(ST->getCPU());
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ MLI = &getAnalysis<MachineLoopInfo>();
PDT = &getAnalysis<MachinePostDominatorTree>();
ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
; GCN-O0-NEXT: Insert fentry calls
; GCN-O0-NEXT: Insert XRay ops
; GCN-O0-NEXT: SI Memory Legalizer
+; GCN-O0-NEXT: MachineDominator Tree Construction
+; GCN-O0-NEXT: Machine Natural Loop Construction
; GCN-O0-NEXT: MachinePostDominator Tree Construction
; GCN-O0-NEXT: SI insert wait instructions
; GCN-O0-NEXT: Insert required mode register values
-; GCN-O0-NEXT: MachineDominator Tree Construction
; GCN-O0-NEXT: SI Final Branch Preparation
; GCN-O0-NEXT: Post RA hazard recognizer
; GCN-O0-NEXT: Branch relaxation pass
; GCN-O1-NEXT: Insert fentry calls
; GCN-O1-NEXT: Insert XRay ops
; GCN-O1-NEXT: SI Memory Legalizer
+; GCN-O1-NEXT: MachineDominator Tree Construction
+; GCN-O1-NEXT: Machine Natural Loop Construction
; GCN-O1-NEXT: MachinePostDominator Tree Construction
; GCN-O1-NEXT: SI insert wait instructions
; GCN-O1-NEXT: Insert required mode register values
; GCN-O1-NEXT: SI Insert Hard Clauses
-; GCN-O1-NEXT: MachineDominator Tree Construction
; GCN-O1-NEXT: SI Final Branch Preparation
; GCN-O1-NEXT: SI peephole optimizations
; GCN-O1-NEXT: Post RA hazard recognizer
; GCN-O1-OPTS-NEXT: Insert fentry calls
; GCN-O1-OPTS-NEXT: Insert XRay ops
; GCN-O1-OPTS-NEXT: SI Memory Legalizer
+; GCN-O1-OPTS-NEXT: MachineDominator Tree Construction
+; GCN-O1-OPTS-NEXT: Machine Natural Loop Construction
; GCN-O1-OPTS-NEXT: MachinePostDominator Tree Construction
; GCN-O1-OPTS-NEXT: SI insert wait instructions
; GCN-O1-OPTS-NEXT: Insert required mode register values
; GCN-O1-OPTS-NEXT: SI Insert Hard Clauses
-; GCN-O1-OPTS-NEXT: MachineDominator Tree Construction
; GCN-O1-OPTS-NEXT: SI Final Branch Preparation
; GCN-O1-OPTS-NEXT: SI peephole optimizations
; GCN-O1-OPTS-NEXT: Post RA hazard recognizer
; GCN-O2-NEXT: Insert fentry calls
; GCN-O2-NEXT: Insert XRay ops
; GCN-O2-NEXT: SI Memory Legalizer
+; GCN-O2-NEXT: MachineDominator Tree Construction
+; GCN-O2-NEXT: Machine Natural Loop Construction
; GCN-O2-NEXT: MachinePostDominator Tree Construction
; GCN-O2-NEXT: SI insert wait instructions
; GCN-O2-NEXT: Insert required mode register values
; GCN-O2-NEXT: SI Insert Hard Clauses
-; GCN-O2-NEXT: MachineDominator Tree Construction
; GCN-O2-NEXT: SI Final Branch Preparation
; GCN-O2-NEXT: SI peephole optimizations
; GCN-O2-NEXT: Post RA hazard recognizer
; GCN-O3-NEXT: Insert fentry calls
; GCN-O3-NEXT: Insert XRay ops
; GCN-O3-NEXT: SI Memory Legalizer
+; GCN-O3-NEXT: MachineDominator Tree Construction
+; GCN-O3-NEXT: Machine Natural Loop Construction
; GCN-O3-NEXT: MachinePostDominator Tree Construction
; GCN-O3-NEXT: SI insert wait instructions
; GCN-O3-NEXT: Insert required mode register values
; GCN-O3-NEXT: SI Insert Hard Clauses
-; GCN-O3-NEXT: MachineDominator Tree Construction
; GCN-O3-NEXT: SI Final Branch Preparation
; GCN-O3-NEXT: SI peephole optimizations
; GCN-O3-NEXT: Post RA hazard recognizer
--- /dev/null
+# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX10 %s
+
+---
+
+# The loop contains a store and a use of a value loaded outside of the loop.
+# We expect the waitcnt for the use to be hoisted on GFX9, but not on GFX10+
+# because we have the vscnt counter.
+
+# GFX9-LABEL: waitcnt_vm_loop
+# GFX9-LABEL: bb.0:
+# GFX9: S_WAITCNT 39
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 39
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop
+# GFX10-LABEL: bb.0:
+# GFX10-NOT: S_WAITCNT 16
+# GFX10-LABEL: bb.1:
+# GFX10: S_WAITCNT 16
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop
+body: |
+ bb.0:
+ successors: %bb.1
+
+ $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.1, %bb.2
+
+ BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
+ S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+
+...
+---
+
+# Same as before, but the loop preheader has no terminator.
+
+# GFX9-LABEL: waitcnt_vm_loop_noterm
+# GFX9-LABEL: bb.0:
+# GFX9: S_WAITCNT 39
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 39
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop_noterm
+# GFX10-LABEL: bb.0:
+# GFX10-NOT: S_WAITCNT 16
+# GFX10-LABEL: bb.1:
+# GFX10: S_WAITCNT 16
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop_noterm
+body: |
+ bb.0:
+ successors: %bb.1
+
+ $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+
+ bb.1:
+ successors: %bb.1, %bb.2
+
+ BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
+ S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+
+...
+---
+
+# Same as before but there is a preexisting waitcnt in the preheader.
+
+# GFX9-LABEL: waitcnt_vm_loop_noterm_wait
+# GFX9-LABEL: bb.0:
+# GFX9: S_WAITCNT 39
+# GFX9-NOT: S_WAITCNT 39
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 39
+# GFX9-LABEL: bb.2:
+name: waitcnt_vm_loop_noterm_wait
+body: |
+ bb.0:
+ successors: %bb.1
+
+ $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+ S_WAITCNT 3952
+
+ bb.1:
+ successors: %bb.1, %bb.2
+
+ BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
+ S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+
+...
+---
+
+# The loop contains a store, a load, and uses values loaded both inside and
+# outside the loop.
+# We do not expect the waitcnt to be hoisted out of the loop.
+
+# GFX9-LABEL: waitcnt_vm_loop_load
+# GFX9-LABEL: bb.0:
+# GFX9-NOT: S_WAITCNT 39
+# GFX9-LABEL: bb.1:
+# GFX9: S_WAITCNT 39
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop_load
+# GFX10-LABEL: bb.0:
+# GFX10-NOT: S_WAITCNT 16
+# GFX10-LABEL: bb.1:
+# GFX10: S_WAITCNT 16
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop_load
+body: |
+ bb.0:
+ successors: %bb.1
+
+ $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.1, %bb.2
+
+ BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr7 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr7, implicit $exec
+ S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+
+...
+---
+
+# The loop contains a use of a value loaded outside of the loop, and no store
+# nor load.
+# We do not expect the waitcnt to be hoisted out of the loop.
+
+# GFX9-LABEL: waitcnt_vm_loop_no_store
+# GFX9-LABEL: bb.0:
+# GFX9-NOT: S_WAITCNT 39
+# GFX9-LABEL: bb.1:
+# GFX9: S_WAITCNT 39
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop_no_store
+# GFX10-LABEL: bb.0:
+# GFX10-NOT: S_WAITCNT 16
+# GFX10-LABEL: bb.1:
+# GFX10: S_WAITCNT 16
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop_no_store
+body: |
+ bb.0:
+ successors: %bb.1
+
+ $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.1, %bb.2
+
+ $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
+ S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+
+...
+---
+
+# The loop contains a store, no load, and doesn't use any value loaded inside
+# or outside of the loop. There is only one use of the loaded value in the
+# exit block.
+# We don't expect any s_waitcnt vmcnt in the loop body or preheader, but expect
+# one in the exit block.
+
+
+# GFX9-LABEL: waitcnt_vm_loop_no_use
+# GFX9-LABEL: bb.0:
+# GFX9-NOT: S_WAITCNT 39
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 39
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop_no_use
+# GFX10-LABEL: bb.0:
+# GFX10-NOT: S_WAITCNT 16
+# GFX10-LABEL: bb.1:
+# GFX10-NOT: S_WAITCNT 16
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop_no_use
+body: |
+ bb.0:
+ successors: %bb.1
+
+ $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.1, %bb.2
+
+ BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr1 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
+ S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+ S_BRANCH %bb.2
+
+ bb.2:
+ $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ S_ENDPGM 0
+
+...
+---
+
+# The loop loads a value that is not used in the loop, and uses a value loaded
+# outside of the loop.
+# We expect the waitcnt to be hoisted out of the loop to wait a single time before
+# the loop is executed and avoid waiting for the load to complete on each
+# iteration.
+
+# GFX9-LABEL: waitcnt_vm_loop2
+# GFX9-LABEL: bb.0:
+# GFX9: S_WAITCNT 39
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 39
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop2
+# GFX10-LABEL: bb.0:
+# GFX10: S_WAITCNT 16
+# GFX10-LABEL: bb.1:
+# GFX10-NOT: S_WAITCNT 16
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop2
+body: |
+ bb.0:
+ successors: %bb.1
+
+ $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.1, %bb.2
+
+ $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
+ $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+ S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+
+...
+---
+
+# Same as before with an additional store in the loop. We still expect the
+# waitcnt instructions to be hoisted.
+
+# GFX9-LABEL: waitcnt_vm_loop2_store
+# GFX9-LABEL: bb.0:
+# GFX9: S_WAITCNT 39
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 39
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop2_store
+# GFX10-LABEL: bb.0:
+# GFX10: S_WAITCNT 16
+# GFX10-LABEL: bb.1:
+# GFX10-NOT: S_WAITCNT 16
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop2_store
+body: |
+ bb.0:
+ successors: %bb.1
+
+ $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.1, %bb.2
+
+ $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
+ $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+ BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+ S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+
+...
+---
+
+# Same as loop2 but the value loaded inside the loop is also used in the loop.
+# We do not expect the waitcnt to be hoisted out of the loop.
+
+# GFX9-LABEL: waitcnt_vm_loop2_use_in_loop
+# GFX9-LABEL: bb.0:
+# GFX9-NOT: S_WAITCNT 39
+# GFX9-LABEL: bb.1:
+# GFX9: S_WAITCNT 39
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop2_use_in_loop
+# GFX10-LABEL: bb.0:
+# GFX10-NOT: S_WAITCNT 16
+# GFX10-LABEL: bb.1:
+# GFX10: S_WAITCNT 16
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop2_use_in_loop
+body: |
+ bb.0:
+ successors: %bb.1
+
+ $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.1, %bb.2
+
+ $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
+ $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr4 = V_ADD_U32_e32 $vgpr5, $vgpr1, implicit $exec
+ S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+
+...
+---
+
+# The loop contains a use of a value loaded outside of the loop, but we already
+# waited for that load to complete. The loop also loads a value that is not used
+# in the loop. We do not expect any waitcnt in the loop.
+
+# GFX9-LABEL: waitcnt_vm_loop2_nowait
+# GFX9-LABEL: bb.0:
+# GFX9: S_WAITCNT 39
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 39
+# GFX9-LABEL: bb.2:
+# GFX9-NOT: S_WAITCNT 39
+# GFX9-LABEL: bb.3:
+
+# GFX10-LABEL: waitcnt_vm_loop2_nowait
+# GFX10-LABEL: bb.0:
+# GFX10: S_WAITCNT 16
+# GFX10-LABEL: bb.1:
+# GFX10-NOT: S_WAITCNT 16
+# GFX10-LABEL: bb.2:
+# GFX10-NOT: S_WAITCNT 16
+# GFX10-LABEL: bb.3:
+name: waitcnt_vm_loop2_nowait
+body: |
+ bb.0:
+ successors: %bb.1
+
+ $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2
+
+ $vgpr3 = V_ADD_U32_e32 $vgpr4, $vgpr5, implicit $exec
+ $vgpr3 = V_ADD_U32_e32 $vgpr4, $vgpr5, implicit $exec
+ $vgpr3 = V_ADD_U32_e32 $vgpr4, $vgpr5, implicit $exec
+
+ S_BRANCH %bb.2
+
+ bb.2:
+ successors: %bb.2, %bb.3
+
+ $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
+ $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+ S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+ S_BRANCH %bb.3
+
+ bb.3:
+ S_ENDPGM 0
+
+...
+---
+
+# Similar test case but for register intervals.
+
+# GFX9-LABEL: waitcnt_vm_loop2_reginterval
+# GFX9-LABEL: bb.0:
+# GFX9: S_WAITCNT 39
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 39
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop2_reginterval
+# GFX10-LABEL: bb.0:
+# GFX10: S_WAITCNT 16
+# GFX10-LABEL: bb.1:
+# GFX10-NOT: S_WAITCNT 16
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop2_reginterval
+body: |
+ bb.0:
+ successors: %bb.1
+
+ $vgpr0_vgpr1_vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX4 $vgpr10_vgpr11, 0, 0, implicit $exec
+
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.1, %bb.2
+
+ $vgpr10 = COPY $vgpr0
+
+ $vgpr4_vgpr5_vgpr6_vgpr7 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+ S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+
+...
+---
+
+# Similar test case but for register intervals.
+
+# GFX9-LABEL: waitcnt_vm_loop2_reginterval2
+# GFX9-LABEL: bb.0:
+# GFX9-NOT: S_WAITCNT 39
+# GFX9-LABEL: bb.1:
+# GFX9: S_WAITCNT 39
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop2_reginterval2
+# GFX10-LABEL: bb.0:
+# GFX10-NOT: S_WAITCNT 16
+# GFX10-LABEL: bb.1:
+# GFX10: S_WAITCNT 16
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop2_reginterval2
+body: |
+ bb.0:
+ successors: %bb.1
+
+ $vgpr0_vgpr1_vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX4 $vgpr10_vgpr11, 0, 0, implicit $exec
+
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.1, %bb.2
+
+ $vgpr10 = COPY $vgpr0
+
+ $vgpr4_vgpr5_vgpr6_vgpr7 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+ $vgpr11 = COPY $vgpr7
+ S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+
+...
+---
+
+# The loop loads a value that is not used in the loop, but uses a value loaded
+# outside of it. We expect the s_waitcnt instruction to be hoisted.
+# A s_waitcnt vmcnt(0) is generated to flush in the preheader, but for this
+# specific test case, it would be better to use vmcnt(1) instead. This is
+# currently not implemented.
+
+# GFX9-LABEL: waitcnt_vm_zero
+# GFX9-LABEL: bb.0:
+# GFX9: S_WAITCNT 3952
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 39
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_zero
+# GFX10-LABEL: bb.0:
+# GFX10: S_WAITCNT 16240
+# GFX10-LABEL: bb.1:
+# GFX10-NOT: S_WAITCNT 16240
+# GFX10-LABEL: bb.2:
+
+name: waitcnt_vm_zero
+body: |
+ bb.0:
+ successors: %bb.1
+
+ $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr1, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.1, %bb.2
+
+ $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr3, implicit $exec
+ $vgpr2 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr3, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec
+ S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+
+...