From 3a18665748342d8a55c3000dcc2c444ba1de2049 Mon Sep 17 00:00:00 2001
From: Carl Ritson
Date: Fri, 17 Jul 2020 11:12:12 +0900
Subject: [PATCH] [AMDGPU] Translate s_and/s_andn2 to s_mov in vcc optimisation

When SCC is dead but VCC is required, replace s_and / s_andn2
with s_mov into VCC when the mask value is 0 or -1.

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D83850
---
 llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp      |  16 ++-
 llvm/test/CodeGen/AMDGPU/infinite-loop.ll         |   2 +-
 llvm/test/CodeGen/AMDGPU/insert-skip-from-vcc.mir | 120 ++++++++++++++++++++++
 llvm/test/CodeGen/AMDGPU/wave32.ll                |   4 +-
 4 files changed, 137 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index f31c722..c9bec36 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -70,6 +70,7 @@ bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
   const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
   const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
   const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
+  const unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
 
   MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
                                       E = MBB.rend();
@@ -136,9 +137,20 @@ bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
   if (A->getOpcode() == AndN2)
     MaskValue = ~MaskValue;
 
-  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) &&
-      MI.killsRegister(CondReg, TRI))
+  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC)) {
+    if (!MI.killsRegister(CondReg, TRI)) {
+      // Replace AND with MOV
+      if (MaskValue == 0) {
+        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
+            .addImm(0);
+      } else {
+        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
+            .addReg(ExecReg);
+      }
+    }
+    // Remove AND instruction
     A->eraseFromParent();
+  }
 
   bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
   if (SReg == ExecReg) {
diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
index 6d63ca5..d82d905 100644
--- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
@@ -158,7 +158,7 @@ define amdgpu_kernel void @infinite_loop_nest_ret(i32 addrspace(1)* %out) {
 ; SI-NEXT:  ; %bb.4: ; %loop.exit.guard
 ; SI-NEXT:    ; in Loop: Header=BB3_2 Depth=1
 ; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
-; SI-NEXT:    s_and_b64 vcc, exec, 0
+; SI-NEXT:    s_mov_b64 vcc, 0
 ; SI-NEXT:    s_branch BB3_2
 ; SI-NEXT:  BB3_5: ; %UnifiedReturnBlock
 ; SI-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/insert-skip-from-vcc.mir b/llvm/test/CodeGen/AMDGPU/insert-skip-from-vcc.mir
index ecfd59d..c6bb9dd 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-skip-from-vcc.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-skip-from-vcc.mir
@@ -415,3 +415,123 @@ body: |
     S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
     S_ENDPGM 0
 ...
+---
+# GCN-LABEL: name: and_0_mov
+# GCN: bb.2:
+# GCN-NOT: S_AND
+# GCN: $vcc = S_MOV_B64 0
+# GCN-NEXT: S_BRANCH %bb.1
+name: and_0_mov
+body: |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $sgpr0_sgpr1 = S_MOV_B64 0
+    $vcc = S_AND_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
+    S_CBRANCH_VCCZ %bb.1, implicit $vcc
+    S_ENDPGM 0
+...
+---
+# GCN-LABEL: name: andn2_m1_mov
+# GCN: bb.2:
+# GCN-NOT: S_ANDN2
+# GCN: $vcc = S_MOV_B64 0
+# GCN-NEXT: S_BRANCH %bb.1
+name: andn2_m1_mov
+body: |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $sgpr0_sgpr1 = S_MOV_B64 -1
+    $vcc = S_ANDN2_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
+    S_CBRANCH_VCCZ %bb.1, implicit $vcc
+    S_ENDPGM 0
+...
+---
+# GCN-LABEL: name: and_m1_mov
+# GCN: bb.2:
+# GCN-NOT: S_AND
+# GCN: $vcc = S_MOV_B64 $exec
+# GCN-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec
+name: and_m1_mov
+body: |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $sgpr0_sgpr1 = S_MOV_B64 -1
+    $vcc = S_AND_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
+    S_CBRANCH_VCCZ %bb.1, implicit $vcc
+    S_ENDPGM 0
+...
+---
+# GCN-LABEL: name: andn2_0_mov
+# GCN: bb.2:
+# GCN-NOT: S_ANDN2
+# GCN: $vcc = S_MOV_B64 $exec
+# GCN-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec
+name: andn2_0_mov
+body: |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $sgpr0_sgpr1 = S_MOV_B64 0
+    $vcc = S_ANDN2_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
+    S_CBRANCH_VCCZ %bb.1, implicit $vcc
+    S_ENDPGM 0
+...
+---
+# GCN-LABEL: name: and_0_scc_req
+# GCN: bb.2:
+# GCN-NOT: $vcc = S_MOV_
+# GCN: S_AND_
+# GCN-NEXT: S_BRANCH %bb.1
+name: and_0_scc_req
+body: |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $sgpr0_sgpr1 = S_MOV_B64 0
+    $vcc = S_AND_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc
+    S_CBRANCH_VCCZ %bb.1, implicit $vcc
+    S_ENDPGM 0
+...
+---
+# GCN-LABEL: name: andn2_m1_scc_req
+# GCN: bb.2:
+# GCN-NOT: $vcc = S_MOV_
+# GCN: S_ANDN2_
+# GCN-NEXT: S_BRANCH %bb.1
+name: andn2_m1_scc_req
+body: |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $sgpr0_sgpr1 = S_MOV_B64 -1
+    $vcc = S_ANDN2_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc
+    S_CBRANCH_VCCZ %bb.1, implicit $vcc
+    S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 388a75d..896b974 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -836,8 +836,8 @@ define amdgpu_ps void @test_wqm_vote(float %a) {
 }
 
 ; GCN-LABEL: {{^}}test_branch_true:
-; GFX1032: s_and_b32 vcc_lo, exec_lo, -1
-; GFX1064: s_and_b64 vcc, exec, -1
+; GFX1032: s_mov_b32 vcc_lo, exec_lo
+; GFX1064: s_mov_b64 vcc, exec
define amdgpu_kernel void @test_branch_true() #2 {
 entry:
   br i1 true, label %for.end, label %for.body.lr.ph
-- 
2.7.4
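
The rewrite rests on two mask identities: exec & 0 == 0 and exec & -1 == exec,
where s_andn2 first complements its constant operand. Below is a minimal
standalone C++ sketch of just that constant-folding decision (illustrative
only: VccWrite and foldVccWrite are hypothetical names, not part of the pass):

#include <cassert>
#include <cstdint>

// Which value the replacement s_mov writes into vcc.
enum class VccWrite { Zero, Exec };

// s_and   vcc, exec, mask  computes vcc = exec & mask
// s_andn2 vcc, exec, mask  computes vcc = exec & ~mask
// For a constant mask of 0 or -1 the AND degenerates to a copy, so when the
// instruction's SCC def is dead it can be replaced by an s_mov.
VccWrite foldVccWrite(int64_t MaskValue, bool IsAndN2) {
  if (IsAndN2)
    MaskValue = ~MaskValue; // normalize s_andn2 to the s_and form
  assert((MaskValue == 0 || MaskValue == -1) && "peephole only fires here");
  return MaskValue == 0 ? VccWrite::Zero : VccWrite::Exec;
}

int main() {
  assert(foldVccWrite(0, false) == VccWrite::Zero);  // s_and   vcc, exec, 0
  assert(foldVccWrite(-1, true) == VccWrite::Zero);  // s_andn2 vcc, exec, -1
  assert(foldVccWrite(-1, false) == VccWrite::Exec); // s_and   vcc, exec, -1
  assert(foldVccWrite(0, true) == VccWrite::Exec);   // s_andn2 vcc, exec, 0
}

The four *_mov tests in insert-skip-from-vcc.mir exercise exactly these four
cases; the two *_scc_req tests check that the fold is suppressed when the SCC
result of the AND is not dead.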