[MachineSink] Use SkipPHIsAndLabels for sink insertion points
author Carl Ritson <carl.ritson@amd.com>
Wed, 16 Feb 2022 02:56:54 +0000 (11:56 +0900)
committer Carl Ritson <carl.ritson@amd.com>
Wed, 16 Feb 2022 03:44:22 +0000 (12:44 +0900)
For AMDGPU the insertion point for a block may not be the first
non-PHI instruction.  This happens when a block contains EXEC
mask manipulation related to control flow (converging lanes).

Use SkipPHIsAndLabels to determine the block insertion point
so that the target can skip any block prologue instructions.

Reviewed By: rampitec, ruiling

Differential Revision: https://reviews.llvm.org/D119399

llvm/lib/CodeGen/MachineSink.cpp
llvm/test/CodeGen/AMDGPU/sink-after-control-flow.mir [new file with mode: 0644]

index e584ebe..7ed33f9 100644 (file)
@@ -1272,7 +1272,8 @@ bool MachineSinking::SinkIntoLoop(MachineLoop *L, MachineInstr &I) {
   }
 
   LLVM_DEBUG(dbgs() << "LoopSink: Sinking instruction!\n");
-  SinkBlock->splice(SinkBlock->getFirstNonPHI(), Preheader, I);
+  SinkBlock->splice(SinkBlock->SkipPHIsAndLabels(SinkBlock->begin()), Preheader,
+                    I);
 
   // The instruction is moved from its basic block, so do not retain the
   // debug information.
@@ -1392,9 +1393,8 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
   }
 
   // Determine where to insert into. Skip phi nodes.
-  MachineBasicBlock::iterator InsertPos = SuccToSinkTo->begin();
-  while (InsertPos != SuccToSinkTo->end() && InsertPos->isPHI())
-    ++InsertPos;
+  MachineBasicBlock::iterator InsertPos =
+      SuccToSinkTo->SkipPHIsAndLabels(SuccToSinkTo->begin());
 
   // Collect debug users of any vreg that this inst defines.
   SmallVector<MIRegs, 4> DbgUsersToSink;
@@ -1796,7 +1796,8 @@ bool PostRAMachineSinking::tryToSinkCopy(MachineBasicBlock &CurBB,
     // Clear the kill flag if SrcReg is killed between MI and the end of the
     // block.
     clearKillFlags(&MI, CurBB, UsedOpsInCopy, UsedRegUnits, TRI);
-    MachineBasicBlock::iterator InsertPos = SuccBB->getFirstNonPHI();
+    MachineBasicBlock::iterator InsertPos =
+        SuccBB->SkipPHIsAndLabels(SuccBB->begin());
     performSink(MI, *SuccBB, InsertPos, DbgValsToSink);
     updateLiveIn(&MI, SuccBB, UsedOpsInCopy, DefedRegsInCopy);
 
diff --git a/llvm/test/CodeGen/AMDGPU/sink-after-control-flow.mir b/llvm/test/CodeGen/AMDGPU/sink-after-control-flow.mir
new file mode 100644 (file)
index 0000000..4feef21
--- /dev/null
@@ -0,0 +1,122 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass=machine-sink -o - %s | FileCheck -check-prefixes=GFX10 %s
+
+# Test that MachineSink pass respects block prologues when sinking instructions.
+# Specifically an instruction must not be sunk before exec mask manipulation.
+
+---
+name:            _amdgpu_hs_main
+alignment:       1
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  ; GFX10-LABEL: name: _amdgpu_hs_main
+  ; GFX10: bb.0:
+  ; GFX10-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 8
+  ; GFX10-NEXT:   [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[DEF]], 8, 5, implicit $exec
+  ; GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 5
+  ; GFX10-NEXT:   [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[V_BFE_U32_e64_]], killed [[S_MOV_B32_1]], implicit $exec
+  ; GFX10-NEXT:   [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_NE_U32_e64_]], -1, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_XOR_B32_]], $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[S_AND_B32_]], implicit-def $scc
+  ; GFX10-NEXT:   $exec_lo = S_MOV_B32_term [[S_AND_B32_]]
+  ; GFX10-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX10-NEXT:   S_BRANCH %bb.1
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.1:
+  ; GFX10-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; GFX10-NEXT:   S_BRANCH %bb.2
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.2:
+  ; GFX10-NEXT:   successors: %bb.3(0x40000000), %bb.4(0x40000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_1]], implicit-def $scc
+  ; GFX10-NEXT:   [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[S_MOV_B32_]], [[DEF]], implicit $exec
+  ; GFX10-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 31
+  ; GFX10-NEXT:   [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[V_BFE_U32_e64_]], killed [[S_MOV_B32_2]], implicit $exec
+  ; GFX10-NEXT:   [[S_XOR_B32_2:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_NE_U32_e64_1]], -1, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_XOR_B32_2]], $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_XOR_B32_3:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[S_AND_B32_1]], implicit-def $scc
+  ; GFX10-NEXT:   $exec_lo = S_MOV_B32_term [[S_AND_B32_1]]
+  ; GFX10-NEXT:   S_CBRANCH_EXECZ %bb.4, implicit $exec
+  ; GFX10-NEXT:   S_BRANCH %bb.3
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.3:
+  ; GFX10-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   S_BRANCH %bb.4
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.4:
+  ; GFX10-NEXT:   successors: %bb.5(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_3]], implicit-def $scc
+  ; GFX10-NEXT:   [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; GFX10-NEXT:   [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 16
+  ; GFX10-NEXT:   [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 4
+  ; GFX10-NEXT:   [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHL_ADD_U32_e64 [[V_LSHRREV_B32_e64_]], [[S_MOV_B32_4]], killed [[S_MOV_B32_3]], implicit $exec
+  ; GFX10-NEXT:   S_BRANCH %bb.5
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.5:
+  ; GFX10-NEXT:   $exec_lo = S_OR_B32 $exec_lo, [[DEF2]], implicit-def $scc
+  ; GFX10-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.4(0x40000000), %bb.5(0x40000000)
+
+    %0:sgpr_32 = IMPLICIT_DEF
+    %14:sreg_32 = IMPLICIT_DEF
+    %15:vgpr_32 = IMPLICIT_DEF
+    %16:sreg_32 = S_MOV_B32 8
+    %17:vgpr_32 = V_LSHRREV_B32_e64 %16, %15, implicit $exec
+    %18:vgpr_32 = V_BFE_U32_e64 %15, 8, 5, implicit $exec
+    %19:sreg_32 = S_MOV_B32 5
+    %20:sreg_32 = V_CMP_NE_U32_e64 %18, killed %19, implicit $exec
+    %21:sreg_32 = S_XOR_B32 %20, -1, implicit-def $scc
+    %22:sreg_32 = S_AND_B32 %21, $exec_lo, implicit-def $scc
+    %23:sreg_32 = S_XOR_B32 $exec_lo, %22, implicit-def $scc
+    $exec_lo = S_MOV_B32_term %22
+    S_CBRANCH_EXECZ %bb.5, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4:
+    successors: %bb.5(0x80000000)
+
+    S_BRANCH %bb.5
+
+  bb.5:
+    successors: %bb.6(0x40000000), %bb.7(0x40000000)
+
+    $exec_lo = S_OR_B32 $exec_lo, %23, implicit-def $scc
+    %24:sreg_32 = S_MOV_B32 31
+    %25:sreg_32 = V_CMP_NE_U32_e64 %18, killed %24, implicit $exec
+    %26:sreg_32 = S_XOR_B32 %25, -1, implicit-def $scc
+    %27:sreg_32 = S_AND_B32 %26, $exec_lo, implicit-def $scc
+    %28:sreg_32 = S_XOR_B32 $exec_lo, %27, implicit-def $scc
+    $exec_lo = S_MOV_B32_term %27
+    S_CBRANCH_EXECZ %bb.7, implicit $exec
+    S_BRANCH %bb.6
+
+  bb.6:
+    successors: %bb.7(0x80000000)
+
+    S_BRANCH %bb.7
+
+  bb.7:
+    successors: %bb.8(0x80000000)
+
+    $exec_lo = S_OR_B32 $exec_lo, %28, implicit-def $scc
+    %29:sreg_32 = S_MOV_B32 16
+    %30:sreg_32 = S_MOV_B32 4
+    %31:vgpr_32 = nuw nsw V_LSHL_ADD_U32_e64 %17, %30, killed %29, implicit $exec
+    S_BRANCH %bb.8
+
+  bb.8:
+    $exec_lo = S_OR_B32 $exec_lo, %14, implicit-def $scc
+    S_ENDPGM 0
+
+...