From 1bc6e71f32ebadefc319c65b3f650e08e39d6aa7 Mon Sep 17 00:00:00 2001 From: Mark Searles Date: Thu, 19 Apr 2018 15:42:30 +0000 Subject: [PATCH] [AMDGPU] Do not only rely on BB number when finding bottom loop We should also check that the "bottom" basic block of a loopis a successor of the "header" basic block, otherwise we don't propagate the information correctly when the CFG is complex. This fixes an important rendering problem with Wolfsentein 2, because of one vector-memory wait was missing. Differential Revision: https://reviews.llvm.org/D43831 llvm-svn: 330337 --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 65 +++++++++++++++------- .../test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir | 59 ++++++++++++++++++++ 2 files changed, 104 insertions(+), 20 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 5eae119..bed4a70 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -367,7 +367,7 @@ private: DenseMap> BlockWaitcntBracketsMap; - DenseSet BlockWaitcntProcessedSet; + std::vector BlockWaitcntProcessedSet; DenseMap> LoopWaitcntDataMap; @@ -403,7 +403,8 @@ public: void updateEventWaitCntAfter(MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets); void mergeInputScoreBrackets(MachineBasicBlock &Block); - MachineBasicBlock *loopBottom(const MachineLoop *Loop); + bool isLoopBottom(const MachineLoop *Loop, const MachineBasicBlock *Block); + unsigned countNumBottomBlocks(const MachineLoop *Loop); void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block); void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst); bool isWaitcntStronger(unsigned LHS, unsigned RHS); @@ -1568,15 +1569,29 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) { } } -/// Return the "bottom" block of a loop. This differs from -/// MachineLoop::getBottomBlock in that it works even if the loop is -/// discontiguous. -MachineBasicBlock *SIInsertWaitcnts::loopBottom(const MachineLoop *Loop) { - MachineBasicBlock *Bottom = Loop->getHeader(); - for (MachineBasicBlock *MBB : Loop->blocks()) - if (MBB->getNumber() > Bottom->getNumber()) - Bottom = MBB; - return Bottom; +/// Return true if the given basic block is a "bottom" block of a loop. This +/// differs from MachineLoop::getBottomBlock in that it works even if the loop +/// is discontiguous. This also handles multiple back-edges for the same +/// "header" block of a loop. +bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop, + const MachineBasicBlock *Block) { + for (MachineBasicBlock *MBB : Loop->blocks()) { + if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) { + return true; + } + } + return false; +} + +/// Count the number of "bottom" basic blocks of a loop. +unsigned SIInsertWaitcnts::countNumBottomBlocks(const MachineLoop *Loop) { + unsigned Count = 0; + for (MachineBasicBlock *MBB : Loop->blocks()) { + if (MBB->isSuccessor(Loop->getHeader())) { + Count++; + } + } + return Count; } // Generate s_waitcnt instructions where needed. @@ -1685,7 +1700,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, // Check if we need to force convergence at loop footer. MachineLoop *ContainingLoop = MLI->getLoopFor(&Block); - if (ContainingLoop && loopBottom(ContainingLoop) == &Block) { + if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) { LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get(); WaitcntData->print(); DEBUG(dbgs() << '\n';); @@ -1773,6 +1788,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { TrackedWaitcntSet.clear(); BlockVisitedSet.clear(); VCCZBugHandledSet.clear(); + LoopWaitcntDataMap.clear(); // Walk over the blocks in reverse post-dominator order, inserting // s_waitcnt where needed. @@ -1799,21 +1815,30 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { // If we are walking into the block from before the loop, then guarantee // at least 1 re-walk over the loop to propagate the information, even if // no S_WAITCNT instructions were generated. - if (ContainingLoop && ContainingLoop->getHeader() == &MBB && J < I && - (!BlockWaitcntProcessedSet.count(&MBB))) { - BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true); - DEBUG(dbgs() << "set-revisit: Block" - << ContainingLoop->getHeader()->getNumber() << '\n';); + if (ContainingLoop && ContainingLoop->getHeader() == &MBB) { + unsigned Count = countNumBottomBlocks(ContainingLoop); + + // If the loop has multiple back-edges, and so more than one "bottom" + // basic block, we have to guarantee a re-walk over every blocks. + if ((std::count(BlockWaitcntProcessedSet.begin(), + BlockWaitcntProcessedSet.end(), &MBB) < Count)) { + BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true); + DEBUG(dbgs() << "set-revisit: Block" + << ContainingLoop->getHeader()->getNumber() << '\n';); + } } // Walk over the instructions. insertWaitcntInBlock(MF, MBB); // Flag that waitcnts have been processed at least once. - BlockWaitcntProcessedSet.insert(&MBB); + BlockWaitcntProcessedSet.push_back(&MBB); - // See if we want to revisit the loop. - if (ContainingLoop && loopBottom(ContainingLoop) == &MBB) { + // See if we want to revisit the loop. If a loop has multiple back-edges, + // we shouldn't revisit the same "bottom" basic block. + if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) && + std::count(BlockWaitcntProcessedSet.begin(), + BlockWaitcntProcessedSet.end(), &MBB) == 1) { MachineBasicBlock *EntryBB = ContainingLoop->getHeader(); BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get(); if (EntrySB && EntrySB->getRevisitLoop()) { diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir new file mode 100644 index 0000000..2d9ec03 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir @@ -0,0 +1,59 @@ +# RUN: llc -o - %s -march=amdgcn -mcpu=fiji -run-pass=si-insert-waitcnts -verify-machineinstrs | FileCheck -check-prefix=GCN %s + +# GCN-LABEL: waitcnt-back-edge-loop +# GCN: bb.2 +# GCN: S_WAITCNT 112 +# GCN: $vgpr5 = V_CVT_I32_F32_e32 killed $vgpr5, implicit $exec + +--- +name: waitcnt-back-edge-loop +body: | + bb.0: + successors: %bb.1 + + $vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr1_vgpr2 + $vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr1_vgpr2 + $vgpr4 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* null`, addrspace 1) + $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* null`, addrspace 1) + $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 3, killed $sgpr4, implicit $exec + $vgpr3 = V_CNDMASK_B32_e64 -1082130432, 1065353216, killed $sgpr0_sgpr1, implicit $exec + $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec, implicit $exec + S_BRANCH %bb.1 + + bb.3: + successors: %bb.1 + + $vgpr5 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* null`, addrspace 1) + + bb.1: + successors: %bb.5, %bb.2 + + $vgpr5 = V_CVT_I32_F32_e32 killed $vgpr5, implicit $exec + V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec + $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc + S_CBRANCH_VCCZ %bb.5, implicit killed $vcc + + bb.2: + successors: %bb.4, %bb.3 + + V_CMP_EQ_U32_e32 9, killed $vgpr5, implicit-def $vcc, implicit $exec + $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc + S_CBRANCH_VCCZ %bb.3, implicit killed $vcc + + bb.4: + successors: %bb.3, %bb.1 + + $vgpr5 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* null`, addrspace 1) + $vgpr4 = V_CVT_I32_F32_e32 $vgpr5, implicit $exec + V_CMP_EQ_U32_e32 2, killed $vgpr4, implicit-def $vcc, implicit $exec + $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc + $vgpr4 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $exec + S_CBRANCH_VCCZ %bb.1, implicit killed $vcc + S_BRANCH %bb.3 + + bb.5: + + $vgpr4 = V_MAC_F32_e32 killed $vgpr0, killed $vgpr3, killed $vgpr4, implicit $exec + EXP_DONE 12, killed $vgpr4, undef $vgpr0, undef $vgpr0, undef $vgpr0, 0, 0, 15, implicit $exec + S_ENDPGM +... -- 2.7.4