From e9b236f411c5683d270a381bf810ba3c8f3ed12c Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Fri, 24 Jul 2020 20:13:04 -0400
Subject: [PATCH] AMDGPU: Check for other defs when folding conditions into
 s_andn2_b64

We can't fold the masked compare value through the select if the select
condition is re-defed after the and instruction. This fixes a verifier
error caused by using the outgoing value defined in the block.

I'm not sure why this pass is bothering to handle physregs. It's making
this more complex and forces extra liveness computation.
---
 .../Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp   |  34 +++-
 .../AMDGPU/optimize-exec-mask-pre-ra-loop-phi.mir  | 201 +++++++++++++++++++++
 2 files changed, 234 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/optimize-exec-mask-pre-ra-loop-phi.mir

diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
index 8af00fc..74546be 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
@@ -77,6 +77,32 @@ static bool isFullExecCopy(const MachineInstr& MI, const GCNSubtarget& ST) {
   return false;
 }
 
+// See if there is a def between \p AndIdx and \p SelIdx that needs to live
+// beyond \p AndIdx.
+static bool isDefBetween(const LiveRange &LR, SlotIndex AndIdx,
+                         SlotIndex SelIdx) {
+  LiveQueryResult AndLRQ = LR.Query(AndIdx);
+  return (!AndLRQ.isKill() && AndLRQ.valueIn() != LR.Query(SelIdx).valueOut());
+}
+
+// FIXME: Why do we bother trying to handle physical registers here?
+static bool isDefBetween(const SIRegisterInfo &TRI,
+                         LiveIntervals *LIS, Register Reg,
+                         const MachineInstr &Sel, const MachineInstr &And) {
+  SlotIndex AndIdx = LIS->getInstructionIndex(And);
+  SlotIndex SelIdx = LIS->getInstructionIndex(Sel);
+
+  if (Reg.isVirtual())
+    return isDefBetween(LIS->getInterval(Reg), AndIdx, SelIdx);
+
+  for (MCRegUnitIterator UI(Reg, &TRI); UI.isValid(); ++UI) {
+    if (isDefBetween(LIS->getRegUnit(*UI), AndIdx, SelIdx))
+      return true;
+  }
+
+  return false;
+}
+
 // Optimize sequence
 //    %sel = V_CNDMASK_B32_e64 0, 1, %cc
 //    %cmp = V_CMP_NE_U32 1, %1
@@ -158,10 +184,16 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB,
       Op1->getImm() != 0 || Op2->getImm() != 1)
     return AMDGPU::NoRegister;
 
+  Register CCReg = CC->getReg();
+
+  // If there was a def between the select and the and, we would need to move it
+  // to fold this.
+  if (isDefBetween(*TRI, LIS, CCReg, *Sel, *And))
+    return AMDGPU::NoRegister;
+
   LLVM_DEBUG(dbgs() << "Folding sequence:\n\t" << *Sel << '\t' << *Cmp << '\t'
                     << *And);
 
-  Register CCReg = CC->getReg();
   LIS->RemoveMachineInstrFromMaps(*And);
   MachineInstr *Andn2 =
       BuildMI(MBB, *And, And->getDebugLoc(), TII->get(Andn2Opc),
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-exec-mask-pre-ra-loop-phi.mir b/llvm/test/CodeGen/AMDGPU/optimize-exec-mask-pre-ra-loop-phi.mir
new file mode 100644
index 0000000..0c62d66
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/optimize-exec-mask-pre-ra-loop-phi.mir
@@ -0,0 +1,201 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -verify-machineinstrs -run-pass=si-optimize-exec-masking-pre-ra,si-optimize-exec-masking-pre-ra -o - %s | FileCheck %s
+
+# FIXME: Second run of the pass is a workaround for a bug in
+# -run-pass. The verifier doesn't detect broken LiveIntervals, see bug
+# 46873
+
+
+# Cannot fold this without moving the def of %7 after the and.
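+#
+# A rough sketch of the hazard (register names here are illustrative; the
+# exact MIR follows below): the select condition is re-defined between the
+# V_CMP and the S_AND, so rewriting the S_AND into an S_ANDN2 of the
+# condition would read the re-defined value instead of the value that fed
+# the V_CNDMASK:
+#
+#   %cond = ...                                 ; loop-carried value
+#   %sel = V_CNDMASK_B32_e64 0, 0, 0, 1, %cond
+#   V_CMP_NE_U32_e32 1, %sel, implicit-def $vcc
+#   %cond = S_MOV_B64 0                         ; re-def before the and
+#   $vcc = S_AND_B64 $exec, $vcc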
+---
+name: no_fold_andn2_select_condition_live_out_phi
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: no_fold_andn2_select_condition_live_out_phi
+  ; CHECK: bb.0:
+  ; CHECK: successors: %bb.2(0x80000000)
+  ; CHECK: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1
+  ; CHECK: undef %1.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK: S_BRANCH %bb.2
+  ; CHECK: bb.1:
+  ; CHECK: S_ENDPGM 0
+  ; CHECK: bb.2:
+  ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_MOV_B64_]], implicit $exec
+  ; CHECK: V_CMP_NE_U32_e32 1, [[V_CNDMASK_B32_e64_]], implicit-def $vcc, implicit $exec
+  ; CHECK: %1.sub1:vreg_64 = COPY %1.sub0
+  ; CHECK: DS_WRITE_B64_gfx9 undef %3:vgpr_32, %1, 0, 0, implicit $exec :: (store 8, addrspace 3)
+  ; CHECK: ATOMIC_FENCE 4, 2
+  ; CHECK: [[S_MOV_B64_1:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0
+  ; CHECK: $vcc = S_AND_B64 $exec, $vcc, implicit-def dead $scc
+  ; CHECK: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+  ; CHECK: S_BRANCH %bb.2
+  bb.0:
+    successors: %bb.2
+
+    %7:sreg_64_xexec = S_MOV_B64 -1
+    undef %5.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+    S_BRANCH %bb.2
+
+  bb.1:
+    S_ENDPGM 0
+
+  bb.2:
+    successors: %bb.1, %bb.2
+
+    %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %7, implicit $exec
+    V_CMP_NE_U32_e32 1, %4, implicit-def $vcc, implicit $exec
+    %5.sub1:vreg_64 = COPY %5.sub0
+    DS_WRITE_B64_gfx9 undef %6:vgpr_32, %5, 0, 0, implicit $exec :: (store 8, addrspace 3)
+    ATOMIC_FENCE 4, 2
+    %7:sreg_64_xexec = S_MOV_B64 0
+    $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+    S_BRANCH %bb.2
+
+...
+
+# It's OK to fold this, since the phi def is after the andn2 insert point.
+---
+name: fold_andn2_select_condition_live_out_phi_reorder
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: fold_andn2_select_condition_live_out_phi_reorder
+  ; CHECK: bb.0:
+  ; CHECK: successors: %bb.2(0x80000000)
+  ; CHECK: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1
+  ; CHECK: undef %1.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK: S_BRANCH %bb.2
+  ; CHECK: bb.1:
+  ; CHECK: S_ENDPGM 0
+  ; CHECK: bb.2:
+  ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK: %1.sub1:vreg_64 = COPY %1.sub0
+  ; CHECK: DS_WRITE_B64_gfx9 undef %3:vgpr_32, %1, 0, 0, implicit $exec :: (store 8, addrspace 3)
+  ; CHECK: ATOMIC_FENCE 4, 2
+  ; CHECK: $vcc = S_ANDN2_B64 $exec, [[S_MOV_B64_]], implicit-def dead $scc
+  ; CHECK: [[S_MOV_B64_1:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0
+  ; CHECK: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+  ; CHECK: S_BRANCH %bb.2
+  bb.0:
+    successors: %bb.2
+
+    %7:sreg_64_xexec = S_MOV_B64 -1
+    undef %5.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+    S_BRANCH %bb.2
+
+  bb.1:
+    S_ENDPGM 0
+
+  bb.2:
+    successors: %bb.1, %bb.2
+
+    %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %7, implicit $exec
+    V_CMP_NE_U32_e32 1, %4, implicit-def $vcc, implicit $exec
+    %5.sub1:vreg_64 = COPY %5.sub0
+    DS_WRITE_B64_gfx9 undef %6:vgpr_32, %5, 0, 0, implicit $exec :: (store 8, addrspace 3)
+    ATOMIC_FENCE 4, 2
+    $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc
+    %7:sreg_64_xexec = S_MOV_B64 0
+    S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+    S_BRANCH %bb.2
+
+...
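+
+# The same two cases as above, but with the select condition held in a
+# physical register pair ($sgpr4_sgpr5) instead of a virtual register,
+# exercising the physreg handling noted in the FIXME in the pass.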
+
+---
+name: no_fold_andn2_select_condition_live_out_phi_physreg
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: no_fold_andn2_select_condition_live_out_phi_physreg
+  ; CHECK: bb.0:
+  ; CHECK: successors: %bb.2(0x80000000)
+  ; CHECK: $sgpr4_sgpr5 = S_MOV_B64 -1
+  ; CHECK: undef %0.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK: S_BRANCH %bb.2
+  ; CHECK: bb.1:
+  ; CHECK: S_ENDPGM 0
+  ; CHECK: bb.2:
+  ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK: liveins: $sgpr4_sgpr5
+  ; CHECK: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr4_sgpr5, implicit $exec
+  ; CHECK: V_CMP_NE_U32_e32 1, [[V_CNDMASK_B32_e64_]], implicit-def $vcc, implicit $exec
+  ; CHECK: %0.sub1:vreg_64 = COPY %0.sub0
+  ; CHECK: DS_WRITE_B64_gfx9 undef %2:vgpr_32, %0, 0, 0, implicit $exec :: (store 8, addrspace 3)
+  ; CHECK: ATOMIC_FENCE 4, 2
+  ; CHECK: $sgpr4_sgpr5 = S_MOV_B64 0
+  ; CHECK: $vcc = S_AND_B64 $exec, $vcc, implicit-def dead $scc
+  ; CHECK: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+  ; CHECK: S_BRANCH %bb.2
+  bb.0:
+    successors: %bb.2
+
+    $sgpr4_sgpr5 = S_MOV_B64 -1
+    undef %5.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+    S_BRANCH %bb.2
+
+  bb.1:
+    S_ENDPGM 0
+
+  bb.2:
+    successors: %bb.1, %bb.2
+    liveins: $sgpr4_sgpr5
+
+    %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr4_sgpr5, implicit $exec
+    V_CMP_NE_U32_e32 1, %4, implicit-def $vcc, implicit $exec
+    %5.sub1:vreg_64 = COPY %5.sub0
+    DS_WRITE_B64_gfx9 undef %6:vgpr_32, %5, 0, 0, implicit $exec :: (store 8, addrspace 3)
+    ATOMIC_FENCE 4, 2
+    $sgpr4_sgpr5 = S_MOV_B64 0
+    $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+    S_BRANCH %bb.2
+
+...
+
+---
+name: fold_andn2_select_condition_live_out_phi_physreg_reorder
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: fold_andn2_select_condition_live_out_phi_physreg_reorder
+  ; CHECK: bb.0:
+  ; CHECK: successors: %bb.2(0x80000000)
+  ; CHECK: $sgpr4_sgpr5 = S_MOV_B64 -1
+  ; CHECK: undef %0.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK: S_BRANCH %bb.2
+  ; CHECK: bb.1:
+  ; CHECK: S_ENDPGM 0
+  ; CHECK: bb.2:
+  ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK: liveins: $sgpr4_sgpr5
+  ; CHECK: %0.sub1:vreg_64 = COPY %0.sub0
+  ; CHECK: DS_WRITE_B64_gfx9 undef %2:vgpr_32, %0, 0, 0, implicit $exec :: (store 8, addrspace 3)
+  ; CHECK: ATOMIC_FENCE 4, 2
+  ; CHECK: $vcc = S_ANDN2_B64 $exec, $sgpr4_sgpr5, implicit-def dead $scc
+  ; CHECK: $sgpr4_sgpr5 = S_MOV_B64 0
+  ; CHECK: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+  ; CHECK: S_BRANCH %bb.2
+  bb.0:
+    successors: %bb.2
+
+    $sgpr4_sgpr5 = S_MOV_B64 -1
+    undef %5.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+    S_BRANCH %bb.2
+
+  bb.1:
+    S_ENDPGM 0
+
+  bb.2:
+    successors: %bb.1, %bb.2
+    liveins: $sgpr4_sgpr5
+
+    %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr4_sgpr5, implicit $exec
+    V_CMP_NE_U32_e32 1, %4, implicit-def $vcc, implicit $exec
+    %5.sub1:vreg_64 = COPY %5.sub0
+    DS_WRITE_B64_gfx9 undef %6:vgpr_32, %5, 0, 0, implicit $exec :: (store 8, addrspace 3)
+    ATOMIC_FENCE 4, 2
+    $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc
+    $sgpr4_sgpr5 = S_MOV_B64 0
+    S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+    S_BRANCH %bb.2
+
+...
-- 
2.7.4