From 3f8ef57bede94445b1a1042c987cc914a886e7ff Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 14 Jul 2023 11:05:24 -0400 Subject: [PATCH] MachineSink: Fix sinking VGPR def out of a divergent loop This fixes sinking a VGPR def out of a loop past the reconvergence point at the SI_END_CF. There was a prior fix which introduced blockPrologueInterferes (D121277) to fix the same basic problem for the post RA sink. This also had the special case isIgnorableUse case which was incorrect, because in some contexts the exec use is not ignorable. I'm thinking about a new way to represent this which will avoid needing hasIgnorableUse and isBasicBlockPrologue, which would function more like the exception handling. Fixes: SWDEV-407790 https://reviews.llvm.org/D155343 --- llvm/lib/CodeGen/MachineSink.cpp | 15 +++++++++++---- ...ine-sink-loop-var-out-of-divergent-loop-swdev407790.ll | 2 +- ...ne-sink-loop-var-out-of-divergent-loop-swdev407790.mir | 2 +- llvm/test/CodeGen/AMDGPU/sink-after-control-flow.mir | 2 +- 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp index 3b0166e..8da97dc 100644 --- a/llvm/lib/CodeGen/MachineSink.cpp +++ b/llvm/lib/CodeGen/MachineSink.cpp @@ -288,8 +288,7 @@ static bool blockPrologueInterferes(const MachineBasicBlock *BB, if (!Reg) continue; if (MO.isUse()) { - if (Reg.isPhysical() && - (TII->isIgnorableUse(MO) || (MRI && MRI->isConstantPhysReg(Reg)))) + if (Reg.isPhysical() && MRI && MRI->isConstantPhysReg(Reg)) continue; if (PI->modifiesRegister(Reg, TRI)) return true; @@ -1007,16 +1006,24 @@ MachineSinking::FindSuccToSinkTo(MachineInstr &MI, MachineBasicBlock *MBB, if (MBB == SuccToSinkTo) return nullptr; + if (!SuccToSinkTo) + return nullptr; + // It's not safe to sink instructions to EH landing pad. Control flow into // landing pad is implicitly defined. - if (SuccToSinkTo && SuccToSinkTo->isEHPad()) + if (SuccToSinkTo->isEHPad()) return nullptr; // It ought to be okay to sink instructions into an INLINEASM_BR target, but // only if we make sure that MI occurs _before_ an INLINEASM_BR instruction in // the source block (which this code does not yet do). So for now, forbid // doing so. - if (SuccToSinkTo && SuccToSinkTo->isInlineAsmBrIndirectTarget()) + if (SuccToSinkTo->isInlineAsmBrIndirectTarget()) + return nullptr; + + MachineBasicBlock::const_iterator InsertPos = + SuccToSinkTo->SkipPHIsAndLabels(SuccToSinkTo->begin()); + if (blockPrologueInterferes(SuccToSinkTo, InsertPos, MI, TRI, TII, MRI)) return nullptr; return SuccToSinkTo; diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll index b8e74bc..e2456b7 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll @@ -21,7 +21,6 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49 ; CHECK-NEXT: .LBB0_1: ; %Flow ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; CHECK-NEXT: v_add_nc_u32_e32 v4, -4, v4 ; CHECK-NEXT: .LBB0_2: ; %Flow1 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 @@ -54,6 +53,7 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49 ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_add_nc_u32_e32 v4, s9, v2 ; CHECK-NEXT: v_cmp_ge_u32_e64 s4, v4, v0 +; CHECK-NEXT: v_add_nc_u32_e32 v4, -4, v4 ; CHECK-NEXT: s_or_b32 s8, s4, s8 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 ; CHECK-NEXT: s_cbranch_execz .LBB0_1 diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir index 037a285..cc14b4a 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir @@ -42,6 +42,7 @@ body: | ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ + ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec ; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK killed [[SI_IF1]], [[SI_IF]], implicit-def dead $scc ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.5 @@ -51,7 +52,6 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.4 ; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]] ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/sink-after-control-flow.mir b/llvm/test/CodeGen/AMDGPU/sink-after-control-flow.mir index 4feef21..ee3d7ae 100644 --- a/llvm/test/CodeGen/AMDGPU/sink-after-control-flow.mir +++ b/llvm/test/CodeGen/AMDGPU/sink-after-control-flow.mir @@ -17,6 +17,7 @@ body: | ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 8 + ; GFX10-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[S_MOV_B32_]], [[DEF]], implicit $exec ; GFX10-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[DEF]], 8, 5, implicit $exec ; GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 5 ; GFX10-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[V_BFE_U32_e64_]], killed [[S_MOV_B32_1]], implicit $exec @@ -37,7 +38,6 @@ body: | ; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_1]], implicit-def $scc - ; GFX10-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[S_MOV_B32_]], [[DEF]], implicit $exec ; GFX10-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 31 ; GFX10-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[V_BFE_U32_e64_]], killed [[S_MOV_B32_2]], implicit $exec ; GFX10-NEXT: [[S_XOR_B32_2:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_NE_U32_e64_1]], -1, implicit-def $scc -- 2.7.4