From 57431c9680c8cde0f6cf0e85bed8507b307b080f Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Wed, 10 Aug 2016 19:11:42 +0000
Subject: [PATCH] AMDGPU: Change insertion point of si_mask_branch

Insert before the skip branch if one is created. This is a somewhat
more natural placement relative to the skip branches, and makes it
possible to implement analyzeBranch for skip blocks.

The test changes are mostly due to a quirk where the block label is
not emitted if there is a terminator that is not also a branch.

llvm-svn: 278273
---
 llvm/lib/Target/AMDGPU/SIInstructions.td           |  4 ++--
 llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp      | 27 ++++++++++++++--------
 llvm/test/CodeGen/AMDGPU/convergent-inlineasm.ll   | 10 +++++---
 llvm/test/CodeGen/AMDGPU/skip-if-dead.ll           | 11 +++++----
 .../AMDGPU/uniform-loop-inside-nonuniform.ll       | 14 +++++++----
 llvm/test/CodeGen/AMDGPU/valu-i1.ll                | 10 ++++----
 llvm/test/CodeGen/AMDGPU/wqm.ll                    |  2 +-
 7 files changed, 49 insertions(+), 29 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index f6c2719..bea6149 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1777,9 +1777,9 @@ let hasSideEffects = 1 in {
 // replaced with exec mask operations.
 def SI_MASK_BRANCH : PseudoInstSI <
   (outs), (ins brtarget:$target, SReg_64:$dst)> {
-  let isBranch = 1;
+  let isBranch = 0;
   let isTerminator = 1;
-  let isBarrier = 1;
+  let isBarrier = 0;
   let SALU = 1;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 0f9d091..23043ea 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -80,7 +80,7 @@ private:
   bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);
 
-  void Skip(MachineInstr &From, MachineOperand &To);
+  MachineInstr *Skip(MachineInstr &From, MachineOperand &To);
   bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);
 
   void If(MachineInstr &MI);
@@ -182,14 +182,15 @@ bool SILowerControlFlow::shouldSkip(MachineBasicBlock *From,
   return false;
 }
 
-void SILowerControlFlow::Skip(MachineInstr &From, MachineOperand &To) {
-
+MachineInstr *SILowerControlFlow::Skip(MachineInstr &From, MachineOperand &To) {
   if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
-    return;
+    return nullptr;
 
-  DebugLoc DL = From.getDebugLoc();
-  BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
+  const DebugLoc &DL = From.getDebugLoc();
+  MachineInstr *Skip =
+    BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
     .addOperand(To);
+  return Skip;
 }
 
 bool SILowerControlFlow::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
@@ -242,10 +243,13 @@ void SILowerControlFlow::If(MachineInstr &MI) {
     .addReg(AMDGPU::EXEC)
     .addReg(Reg);
 
-  Skip(MI, MI.getOperand(2));
+  MachineInstr *SkipInst = Skip(MI, MI.getOperand(2));
+
+  // Insert before the new branch instruction.
+  MachineInstr *InsPt = SkipInst ? SkipInst : &MI;
 
   // Insert a pseudo terminator to help keep the verifier happy.
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
+  BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
     .addOperand(MI.getOperand(2))
     .addReg(Reg);
 
@@ -275,10 +279,13 @@ void SILowerControlFlow::Else(MachineInstr &MI) {
     .addReg(AMDGPU::EXEC)
     .addReg(Dst);
 
-  Skip(MI, MI.getOperand(2));
+  MachineInstr *SkipInst = Skip(MI, MI.getOperand(2));
+
+  // Insert before the new branch instruction.
+  MachineInstr *InsPt = SkipInst ? SkipInst : &MI;
 
   // Insert a pseudo terminator to help keep the verifier happy.
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
+  BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
     .addOperand(MI.getOperand(2))
     .addReg(Dst);
 
diff --git a/llvm/test/CodeGen/AMDGPU/convergent-inlineasm.ll b/llvm/test/CodeGen/AMDGPU/convergent-inlineasm.ll
index 55a38e5..d0f98e5 100644
--- a/llvm/test/CodeGen/AMDGPU/convergent-inlineasm.ll
+++ b/llvm/test/CodeGen/AMDGPU/convergent-inlineasm.ll
@@ -4,7 +4,8 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
 ; GCN-LABEL: {{^}}convergent_inlineasm:
 ; GCN: BB#0:
 ; GCN: v_cmp_ne_i32_e64
-; GCN: BB#1:
+; GCN: ; mask branch
+; GCN: BB{{[0-9]+_[0-9]+}}:
 define void @convergent_inlineasm(i64 addrspace(1)* nocapture %arg) {
 bb:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -22,9 +23,12 @@ bb5: ; preds = %bb3, %bb
 }
 
 ; GCN-LABEL: {{^}}nonconvergent_inlineasm:
-; GCN: BB#1:
+; GCN: ; mask branch
+
+; GCN: BB{{[0-9]+_[0-9]+}}:
 ; GCN: v_cmp_ne_i32_e64
-; GCN: BB1_2:
+
+; GCN: BB{{[0-9]+_[0-9]+}}:
 define void @nonconvergent_inlineasm(i64 addrspace(1)* nocapture %arg) {
 bb:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index 4ba4ac7..425527d 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -202,8 +202,11 @@ exit:
 ; CHECK: v_cmp_eq_i32_e32 vcc, 0, v0
 ; CHECK-NEXT: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], vcc
 ; CHECK-NEXT: s_xor_b64 [[SAVEEXEC]], exec, [[SAVEEXEC]]
-; CHECK-NEXT: s_cbranch_execz [[EXIT:BB[0-9]+_[0-9]+]]
-; CHECK-NEXT: ; mask branch [[EXIT]]
+; CHECK-NEXT: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
+; CHECK-NEXT: s_cbranch_execz [[EXIT]]
+
+; CHECK: {{BB[0-9]+_[0-9]+}}: ; %bb.preheader
+; CHECK: s_mov_b32
 
 ; CHECK: [[LOOP_BB:BB[0-9]+_[0-9]+]]:
@@ -353,7 +356,7 @@ bb7: ; preds = %bb4
 ; CHECK: mask branch [[END:BB[0-9]+_[0-9]+]]
 ; CHECK-NOT: branch
 
-; CHECK: ; BB#3: ; %bb8
+; CHECK: BB{{[0-9]+_[0-9]+}}: ; %bb8
 ; CHECK: buffer_store_dword
 
 ; CHECK: [[END]]:
@@ -387,4 +390,4 @@ declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32
 declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) nounwind
 
 attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
\ No newline at end of file
+attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll b/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
index 9f2f0d6..84a4c21 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
@@ -5,6 +5,11 @@
 ; CHECK-LABEL: {{^}}test1:
 ; CHECK: v_cmp_ne_i32_e32 vcc, 0
 ; CHECK: s_and_saveexec_b64
+; CHECK-NEXT: s_xor_b64
+; CHECK-NEXT: ; mask branch
+; CHECK-NEXT: s_cbranch_execz
+
+; CHECK-NEXT: BB{{[0-9]+_[0-9]+}}: ; %loop_body.preheader
 
 ; CHECK: [[LOOP_BODY_LABEL:BB[0-9]+_[0-9]+]]:
 ; CHECK: s_and_b64 vcc, exec, vcc
@@ -30,10 +35,11 @@ out:
   ret void
 }
 
-;CHECK-LABEL: {{^}}test2:
-;CHECK: s_and_saveexec_b64
-;CHECK: s_xor_b64
-;CHECK-NEXT: s_cbranch_execz
+; CHECK-LABEL: {{^}}test2:
+; CHECK: s_and_saveexec_b64
+; CHECK-NEXT: s_xor_b64
+; CHECK-NEXT: ; mask branch
+; CHECK-NEXT: s_cbranch_execz
 define void @test2(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 main_body:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
diff --git a/llvm/test/CodeGen/AMDGPU/valu-i1.ll b/llvm/test/CodeGen/AMDGPU/valu-i1.ll
index 02a1278..35e06fa 100644
--- a/llvm/test/CodeGen/AMDGPU/valu-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/valu-i1.ll
@@ -47,7 +47,7 @@ end:
 ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
 ; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
 
-; SI: ; BB#1
+; SI: BB{{[0-9]+_[0-9]+}}:
 ; SI: buffer_store_dword
 ; SI: s_endpgm
@@ -68,7 +68,7 @@ exit:
   ret void
 }
 
-; SI-LABEL: @simple_test_v_loop
+; SI-LABEL: {{^}}simple_test_v_loop:
 ; SI: v_cmp_ne_i32_e32 vcc, 0, v{{[0-9]+}}
 ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
 ; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
@@ -106,7 +106,7 @@ exit:
   ret void
 }
 
-; SI-LABEL: @multi_vcond_loop
+; SI-LABEL: {{^}}multi_vcond_loop:
 
 ; Load loop limit from buffer
 ; Branch to exit if uniformly not taken
@@ -118,7 +118,7 @@ exit:
 ; SI: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]
 
 ; Initialize inner condition to false
-; SI: ; BB#1:
+; SI: BB{{[0-9]+_[0-9]+}}: ; %bb10.preheader
 ; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0{{$}}
 ; SI: s_mov_b64 [[COND_STATE:s\[[0-9]+:[0-9]+\]]], [[ZERO]]
@@ -133,7 +133,7 @@ exit:
 ; SI: s_xor_b64 [[ORNEG2]], exec, [[ORNEG2]]
 ; SI: s_cbranch_execz [[LABEL_FLOW:BB[0-9]+_[0-9]+]]
 
-; SI: BB#3:
+; SI: BB{{[0-9]+_[0-9]+}}: ; %bb20
 ; SI: buffer_store_dword
 ; SI: v_cmp_ge_i64_e32 [[CMP:s\[[0-9]+:[0-9]+\]|vcc]]
 ; SI: s_or_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[CMP]], [[COND_STATE]]
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 809a7ba..9b4c7c3 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -123,7 +123,7 @@ END:
 ;CHECK-NEXT: s_and_b64 [[SAVED]], exec, [[SAVED]]
 ;CHECK-NEXT: s_xor_b64 exec, exec, [[SAVED]]
 ;CHECK-NEXT: mask branch [[END_BB:BB[0-9]+_[0-9]+]]
-;CHECK-NEXT: ; BB#3: ; %ELSE
+;CHECK-NEXT: BB{{[0-9]+_[0-9]+}}: ; %ELSE
 ;CHECK: store_dword
 ;CHECK: [[END_BB]]: ; %END
 ;CHECK: s_or_b64 exec, exec,
-- 
2.7.4
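
For context, the heart of the patch is the new insertion-point choice. The
C++ sketch below is not new code; it condenses the If() lowering from
SILowerControlFlow.cpp above. MBB, MI, DL, and Reg are locals of
SILowerControlFlow::If(), and the terminator orderings in the comments are
taken from the updated CHECK lines in skip-if-dead.ll (BBn_m stands for
whatever block label the target block gets; it is illustrative only).

  // Condensed from SILowerControlFlow::If() in this patch.
  //
  // Terminator order before the patch:    ...and after it:
  //   s_cbranch_execz BBn_m                 SI_MASK_BRANCH BBn_m ("; mask branch")
  //   SI_MASK_BRANCH BBn_m                  s_cbranch_execz BBn_m
  MachineInstr *SkipInst = Skip(MI, MI.getOperand(2));

  // Skip() now returns the s_cbranch_execz it built (or nullptr when no
  // skip is needed), so the pseudo terminator can be placed in front of
  // the real branch instead of in front of MI.
  MachineInstr *InsPt = SkipInst ? SkipInst : &MI;

  BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
      .addOperand(MI.getOperand(2))
      .addReg(Reg);

Because SI_MASK_BRANCH also drops isBranch and isBarrier in
SIInstructions.td, it no longer looks like a real branch or block barrier to
generic codegen. That is what makes an analyzeBranch implementation for skip
blocks possible, as the commit message notes, and it is why the updated tests
now match explicit BB{{[0-9]+_[0-9]+}}: labels where the old checks matched
"; BB#n:" comments.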