From 2bfcacf0ad362956277a1c2c9ba00ddc453a42ce Mon Sep 17 00:00:00 2001
From: Carl Ritson
Date: Fri, 3 Jul 2020 12:25:33 +0900
Subject: [PATCH] [AMDGPU] Insert PS early exit at end of control flow

Exit early if the exec mask is zero at the end of control flow.
Mark the ends of control flow during control flow lowering and
convert these to exits during the insert skips pass.

Reviewed By: nhaehnle

Differential Revision: https://reviews.llvm.org/D82737
---
 llvm/lib/Target/AMDGPU/SIInsertSkips.cpp       |   9 ++
 llvm/lib/Target/AMDGPU/SIInstructions.td       |   3 +
 llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp  | 181 ++++++++++++++++++-------
 llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll |   6 +-
 llvm/test/CodeGen/AMDGPU/skip-if-dead.ll       |   6 +-
 5 files changed, 153 insertions(+), 52 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
index d4371ef..e208385 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -447,6 +447,15 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
         break;
       }
 
+      case AMDGPU::SI_KILL_CLEANUP:
+        if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS &&
+            dominatesAllReachable(MBB)) {
+          KillInstrs.push_back(&MI);
+        } else {
+          MI.eraseFromParent();
+        }
+        break;
+
       default:
         break;
       }
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 2b053f8..ec378379 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -379,6 +379,9 @@ multiclass PseudoInstKill <dag ins> {
 defm SI_KILL_I1 : PseudoInstKill <(ins SCSrc_i1:$src, i1imm:$killvalue)>;
 defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>;
 
+let Defs = [EXEC] in
+def SI_KILL_CLEANUP : SPseudoInstSI <(outs), (ins)>;
+
 let Defs = [EXEC,VCC] in
 def SI_ILLEGAL_COPY : SPseudoInstSI <
   (outs unknown:$dst), (ins unknown:$src),
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 1e90e6b..36d52ac 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -89,8 +89,10 @@ private:
   MachineRegisterInfo *MRI = nullptr;
   SetVector<MachineInstr*> LoweredEndCf;
   DenseSet<Register> LoweredIf;
+  SmallSet<MachineInstr *, 32> NeedsKillCleanup;
 
   const TargetRegisterClass *BoolRC = nullptr;
+  bool InsertKillCleanups;
   unsigned AndOpc;
   unsigned OrOpc;
   unsigned XorOpc;
@@ -111,6 +113,8 @@ private:
 
   void combineMasks(MachineInstr &MI);
 
+  void process(MachineInstr &MI);
+
   // Skip to the next instruction, ignoring debug instructions, and trivial
   // block boundaries (blocks that have one (typically fallthrough) successor,
   // and the successor has one predecessor.
@@ -160,36 +164,36 @@ static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) {
 
 char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
 
-static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI,
-                       const SIInstrInfo *TII) {
-  Register SaveExecReg = MI.getOperand(0).getReg();
-  auto U = MRI->use_instr_nodbg_begin(SaveExecReg);
-
-  if (U == MRI->use_instr_nodbg_end() ||
-      std::next(U) != MRI->use_instr_nodbg_end() ||
-      U->getOpcode() != AMDGPU::SI_END_CF)
-    return false;
-
-  // Check for SI_KILL_*_TERMINATOR on path from if to endif.
-  // if there is any such terminator simplififcations are not safe.
-  auto SMBB = MI.getParent();
-  auto EMBB = U->getParent();
+static bool hasKill(const MachineBasicBlock *Begin,
+                    const MachineBasicBlock *End, const SIInstrInfo *TII) {
   DenseSet<const MachineBasicBlock*> Visited;
-  SmallVector<MachineBasicBlock *, 4> Worklist(SMBB->succ_begin(),
-                                               SMBB->succ_end());
+  SmallVector<MachineBasicBlock *, 4> Worklist(Begin->succ_begin(),
+                                               Begin->succ_end());
 
   while (!Worklist.empty()) {
     MachineBasicBlock *MBB = Worklist.pop_back_val();
 
-    if (MBB == EMBB || !Visited.insert(MBB).second)
+    if (MBB == End || !Visited.insert(MBB).second)
       continue;
-    for(auto &Term : MBB->terminators())
+    for (auto &Term : MBB->terminators())
       if (TII->isKillTerminator(Term.getOpcode()))
-        return false;
+        return true;
 
     Worklist.append(MBB->succ_begin(), MBB->succ_end());
   }
 
+  return false;
+}
+
+static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI) {
+  Register SaveExecReg = MI.getOperand(0).getReg();
+  auto U = MRI->use_instr_nodbg_begin(SaveExecReg);
+
+  if (U == MRI->use_instr_nodbg_end() ||
+      std::next(U) != MRI->use_instr_nodbg_end() ||
+      U->getOpcode() != AMDGPU::SI_END_CF)
+    return false;
+
   return true;
 }
 
@@ -207,7 +211,35 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
   // If there is only one use of save exec register and that use is SI_END_CF,
   // we can optimize SI_IF by returning the full saved exec mask instead of
   // just cleared bits.
-  bool SimpleIf = isSimpleIf(MI, MRI, TII);
+  bool SimpleIf = isSimpleIf(MI, MRI);
+
+  if (InsertKillCleanups) {
+    // Check for SI_KILL_*_TERMINATOR on full path of control flow and
+    // flag the associated SI_END_CF for insertion of a kill cleanup.
+    auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg);
+    while (UseMI->getOpcode() != AMDGPU::SI_END_CF) {
+      assert(std::next(UseMI) == MRI->use_instr_nodbg_end());
+      assert(UseMI->getOpcode() == AMDGPU::SI_ELSE);
+      MachineOperand &NextExec = UseMI->getOperand(0);
+      Register NextExecReg = NextExec.getReg();
+      if (NextExec.isDead()) {
+        assert(!SimpleIf);
+        break;
+      }
+      UseMI = MRI->use_instr_nodbg_begin(NextExecReg);
+    }
+    if (UseMI->getOpcode() == AMDGPU::SI_END_CF) {
+      if (hasKill(MI.getParent(), UseMI->getParent(), TII)) {
+        NeedsKillCleanup.insert(&*UseMI);
+        SimpleIf = false;
+      }
+    }
+  } else if (SimpleIf) {
+    // Check for SI_KILL_*_TERMINATOR on path from if to endif.
+    // if there is any such terminator simplifications are not safe.
+    auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg);
+    SimpleIf = !hasKill(MI.getParent(), UseMI->getParent(), TII);
+  }
 
   // Add an implicit def of exec to discourage scheduling VALU after this which
   // will interfere with trying to form s_and_saveexec_b64 later.
@@ -427,6 +459,8 @@ SILowerControlFlow::skipIgnoreExecInstsTrivialSucc(
 
     auto E = B->end();
     for ( ; It != E; ++It) {
+      if (It->getOpcode() == AMDGPU::SI_KILL_CLEANUP)
+        continue;
       if (TII->mayReadEXEC(*MRI, *It))
         break;
     }
@@ -461,8 +495,18 @@ void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
 
   LoweredEndCf.insert(NewMI);
 
-  if (LIS)
+  // If this ends control flow which contains kills (as flagged in emitIf)
+  // then insert an SI_KILL_CLEANUP immediately following the exec mask
+  // manipulation. This can be lowered to early termination if appropriate.
+  MachineInstr *CleanUpMI = nullptr;
+  if (NeedsKillCleanup.count(&MI))
+    CleanUpMI = BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::SI_KILL_CLEANUP));
+
+  if (LIS) {
     LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
+    if (CleanUpMI)
+      LIS->InsertMachineInstrInMaps(*CleanUpMI);
+  }
 
   MI.eraseFromParent();
 
@@ -553,6 +597,56 @@ void SILowerControlFlow::optimizeEndCf() {
   }
 }
 
+void SILowerControlFlow::process(MachineInstr &MI) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineBasicBlock::iterator I(MI);
+  MachineInstr *Prev = (I != MBB.begin()) ? &*(std::prev(I)) : nullptr;
+
+  switch (MI.getOpcode()) {
+  case AMDGPU::SI_IF:
+    emitIf(MI);
+    break;
+
+  case AMDGPU::SI_ELSE:
+    emitElse(MI);
+    break;
+
+  case AMDGPU::SI_IF_BREAK:
+    emitIfBreak(MI);
+    break;
+
+  case AMDGPU::SI_LOOP:
+    emitLoop(MI);
+    break;
+
+  case AMDGPU::SI_END_CF:
+    emitEndCf(MI);
+    break;
+
+  default:
+    assert(false && "Attempt to process unsupported instruction");
+    break;
+  }
+
+  MachineBasicBlock::iterator Next;
+  for (I = Prev ? Prev->getIterator() : MBB.begin(); I != MBB.end(); I = Next) {
+    Next = std::next(I);
+    MachineInstr &MaskMI = *I;
+    switch (MaskMI.getOpcode()) {
+    case AMDGPU::S_AND_B64:
+    case AMDGPU::S_OR_B64:
+    case AMDGPU::S_AND_B32:
+    case AMDGPU::S_OR_B32:
+      // Cleanup bit manipulations on exec mask
+      combineMasks(MaskMI);
+      break;
+    default:
+      I = MBB.end();
+      break;
+    }
+  }
+}
+
 bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   TII = ST.getInstrInfo();
@@ -562,6 +656,8 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
   LIS = getAnalysisIfAvailable<LiveIntervals>();
   MRI = &MF.getRegInfo();
   BoolRC = TRI->getBoolRC();
+  InsertKillCleanups =
+      MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
 
   if (ST.isWave32()) {
     AndOpc = AMDGPU::S_AND_B32;
@@ -583,62 +679,49 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
     Exec = AMDGPU::EXEC;
   }
 
+  SmallVector<MachineInstr *, 32> Worklist;
+
   MachineFunction::iterator NextBB;
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
        BI != BE; BI = NextBB) {
     NextBB = std::next(BI);
     MachineBasicBlock &MBB = *BI;
 
-    MachineBasicBlock::iterator I, Next, Last;
-
-    for (I = MBB.begin(), Last = MBB.end(); I != MBB.end(); I = Next) {
+    MachineBasicBlock::iterator I, Next;
+    for (I = MBB.begin(); I != MBB.end(); I = Next) {
       Next = std::next(I);
       MachineInstr &MI = *I;
 
      switch (MI.getOpcode()) {
      case AMDGPU::SI_IF:
-        emitIf(MI);
+        process(MI);
        break;

      case AMDGPU::SI_ELSE:
-        emitElse(MI);
-        break;
-
      case AMDGPU::SI_IF_BREAK:
-        emitIfBreak(MI);
-        break;
-
      case AMDGPU::SI_LOOP:
-        emitLoop(MI);
-        break;
-
      case AMDGPU::SI_END_CF:
-        emitEndCf(MI);
+        // Only build worklist if SI_IF instructions must be processed first.
+        if (InsertKillCleanups)
+          Worklist.push_back(&MI);
+        else
+          process(MI);
        break;

-      case AMDGPU::S_AND_B64:
-      case AMDGPU::S_OR_B64:
-      case AMDGPU::S_AND_B32:
-      case AMDGPU::S_OR_B32:
-        // Cleanup bit manipulations on exec mask
-        combineMasks(MI);
-        Last = I;
-        continue;
-
      default:
-        Last = I;
-        continue;
+        break;
      }
-
-      // Replay newly inserted code to combine masks
-      Next = (Last == MBB.end()) ? MBB.begin() : Last;
     }
   }
 
+  for (MachineInstr *MI : Worklist)
+    process(*MI);
+
   optimizeEndCf();
 
   LoweredEndCf.clear();
   LoweredIf.clear();
+  NeedsKillCleanup.clear();
 
   return true;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
index a2358f3..172e6bf 100644
--- a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
@@ -61,9 +61,11 @@ loop:
   br label %loop
 }
 
-; In case there's an epilog, we shouldn't have to do this.
+; Check that the epilog is the final block
 ; CHECK-LABEL: return_nonvoid
-; CHECK-NOT: exp null off, off, off, off done vm
+; CHECK: exp null off, off, off, off done vm
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: BB{{[0-9]+}}_{{[0-9]+}}:
 define amdgpu_ps float @return_nonvoid(float %0) #0 {
 main_body:
   %cmp = fcmp olt float %0, 1.000000e+01
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index fee3158..f178259 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -470,7 +470,11 @@ bb9:                                              ; preds = %bb4
 }
 
 ; CHECK-LABEL: {{^}}cbranch_kill:
-; CHECK-NOT: exp null off, off, off, off done vm
+; CHECK: ; %bb.{{[0-9]+}}: ; %export
+; CHECK-NEXT: s_or_b64
+; CHECK-NEXT: s_cbranch_execz [[EXIT:BB[0-9]+_[0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: exp null off, off, off, off done vm
 define amdgpu_ps void @cbranch_kill(i32 inreg %0, <2 x float> %1) {
 .entry:
   %val0 = extractelement <2 x float> %1, i32 0
-- 
2.7.4
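
Illustration (not part of the upstream patch): the code shape this change
produces is what the amended CHECK lines above test for. A minimal sketch of
the machine code around a lowered SI_END_CF that was flagged for kill cleanup
in a pixel shader, assuming hypothetical labels and registers:

    s_or_b64 exec, exec, s[2:3]           ; SI_END_CF: restore saved exec bits
    s_cbranch_execz BB0_6                 ; SI_KILL_CLEANUP lowered by
                                          ; SIInsertSkips: take the early exit
                                          ; if no lanes remain live
    ...                                   ; remaining shader code
  BB0_6:
    exp null off, off, off, off done vm   ; null export signals termination
    s_endpgm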