#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugLoc.h"
+#include "llvm/InitializePasses.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
const SIRegisterInfo *TRI = nullptr;
const SIInstrInfo *TII = nullptr;
unsigned SkipThreshold = 0;
+ MachineDominatorTree *MDT = nullptr;
bool shouldSkip(const MachineBasicBlock &From,
const MachineBasicBlock &To) const;
- bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);
+ bool dominatesAllReachable(MachineBasicBlock &MBB);
+ void skipIfDead(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ DebugLoc DL);
- void kill(MachineInstr &MI);
-
- MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) const;
+ bool kill(MachineInstr &MI);
bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
MachineFunctionPass::getAnalysisUsage(AU);
}
};
char SIInsertSkips::ID = 0;
-INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE,
- "SI insert s_cbranch_execz instructions", false, false)
+INITIALIZE_PASS_BEGIN(SIInsertSkips, DEBUG_TYPE,
+ "SI insert s_cbranch_execz instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(SIInsertSkips, DEBUG_TYPE,
+ "SI insert s_cbranch_execz instructions", false, false)
char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;
return false;
}
-bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
- MachineBasicBlock &MBB = *MI.getParent();
- MachineFunction *MF = MBB.getParent();
-
- if (MF->getFunction().getCallingConv() != CallingConv::AMDGPU_PS ||
- !shouldSkip(MBB, MBB.getParent()->back()))
- return false;
-
- MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());
-
- const DebugLoc &DL = MI.getDebugLoc();
-
- // If the exec mask is non-zero, skip the next two instructions
- BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
- .addMBB(&NextBB);
+/// Check whether \p MBB dominates all blocks that are reachable from it.
+///
+/// Used by the caller as the legality condition for the early-exit insertion
+/// after a kill: if some reachable block were not dominated by \p MBB, a wave
+/// could reach it without having executed the kill in \p MBB.
+bool SIInsertSkips::dominatesAllReachable(MachineBasicBlock &MBB) {
+ // depth_first walks the successor graph starting at MBB, so every block a
+ // wave can reach after MBB is tested; one failed dominance query rejects.
+ for (MachineBasicBlock *Other : depth_first(&MBB)) {
+ if (!MDT->dominates(&MBB, Other))
+ return false;
+ }
+ return true;
+}
- MachineBasicBlock::iterator Insert = SkipBB->begin();
+/// Insert an "if exec=0 { null export; s_endpgm }" sequence before the given
+/// iterator. Only applies to pixel shaders.
+void SIInsertSkips::skipIfDead(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, DebugLoc DL) {
+ MachineFunction *MF = MBB.getParent();
+ assert(MF->getFunction().getCallingConv() == CallingConv::AMDGPU_PS);
+
+ // Currently, SI_KILL_*_TERMINATOR is expected to occur only as the last
+ // terminator of a basic block. If this ever changes, we need to optionally
+ // split MBB here.
+ assert(I == MBB.end());
+
+ // It is possible for an SI_KILL_*_TERMINATOR to sit at the bottom of a
+ // basic block that has no further successors (e.g., there was an
+ // `unreachable` there in IR). This can happen with original source of the
+ // form:
+ //
+ // if (uniform_condition) {
+ // write_to_memory();
+ // discard;
+ // }
+ //
+ // In this case, we write the "null export; s_endpgm" skip code in the
+ // already-existing basic block.
+ auto NextBBI = std::next(MBB.getIterator());
+ // NOTE(review): this assumes MBB has a layout successor; if MBB were the
+ // last block in the function, NextBBI == MF->end() and &*NextBBI below
+ // would be invalid — confirm callers guarantee a following block exists.
+ bool NoSuccessor = llvm::find(MBB.successors(), &*NextBBI) == MBB.succ_end();
+ MachineBasicBlock *SkipBB;
+
+ if (NoSuccessor) {
+ SkipBB = &MBB;
+ } else {
+ // Create a new basic block that will contain the "null export; s_endpgm"
+ // and set up the branching to go around it.
+ SkipBB = MF->CreateMachineBasicBlock();
+ MF->insert(NextBBI, SkipBB);
- // Exec mask is zero: Export to NULL target...
- BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP_DONE))
- .addImm(0x09) // V_008DFC_SQ_EXP_NULL
- .addReg(AMDGPU::VGPR0, RegState::Undef)
- .addReg(AMDGPU::VGPR0, RegState::Undef)
- .addReg(AMDGPU::VGPR0, RegState::Undef)
- .addReg(AMDGPU::VGPR0, RegState::Undef)
- .addImm(1) // vm
- .addImm(0) // compr
- .addImm(0); // en
+ // Live waves (exec != 0) branch over SkipBB to the next layout block.
+ BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&*NextBBI);
+ MBB.addSuccessor(SkipBB);
- // ... and terminate wavefront.
- BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);
+ // Keep the dominator tree valid: MBB is SkipBB's sole predecessor, so it
+ // is also its immediate dominator (the pass preserves MachineDominatorTree).
+ MDT->addNewBlock(SkipBB, &MBB);
+ }
- return true;
+ // Generate "null export; s_endpgm".
+ BuildMI(SkipBB, DL, TII->get(AMDGPU::EXP_DONE))
+ .addImm(0x09) // V_008DFC_SQ_EXP_NULL
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addImm(1) // vm
+ .addImm(0) // compr
+ .addImm(0); // en
+ BuildMI(SkipBB, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);
}
-void SIInsertSkips::kill(MachineInstr &MI) {
+/// Translate a SI_KILL_*_TERMINATOR into exec-manipulating instructions.
+/// Return true unless the terminator is a no-op.
+bool SIInsertSkips::kill(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
I.addImm(0); // omod
}
- break;
+ return true;
}
case AMDGPU::SI_KILL_I1_TERMINATOR: {
const MachineFunction *MF = MI.getParent()->getParent();
int64_t Imm = Op.getImm();
assert(Imm == 0 || Imm == -1);
- if (Imm == KillVal)
+ if (Imm == KillVal) {
BuildMI(MBB, &MI, DL, TII->get(ST.isWave32() ? AMDGPU::S_MOV_B32
: AMDGPU::S_MOV_B64), Exec)
.addImm(0);
- break;
+ return true;
+ }
+ return false;
}
unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec)
.addReg(Exec)
.add(Op);
- break;
+ return true;
}
default:
llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR");
}
}
-MachineBasicBlock *SIInsertSkips::insertSkipBlock(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
- MachineFunction *MF = MBB.getParent();
-
- MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
- MachineFunction::iterator MBBI(MBB);
- ++MBBI;
-
- MF->insert(MBBI, SkipBB);
- MBB.addSuccessor(SkipBB);
-
- return SkipBB;
-}
-
// Returns true if a branch over the block was inserted.
bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
MachineBasicBlock &SrcMBB) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
+ MDT = &getAnalysis<MachineDominatorTree>();
SkipThreshold = SkipThresholdFlag;
- bool HaveKill = false;
- bool MadeChange = false;
-
- // Track depth of exec mask, divergent branches.
- SmallVector<MachineBasicBlock *, 16> ExecBranchStack;
-
- MachineFunction::iterator NextBB;
-
MachineBasicBlock *EmptyMBBAtEnd = nullptr;
+ SmallVector<MachineInstr *, 4> KillInstrs;
+ bool MadeChange = false;
- for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
- BI != BE; BI = NextBB) {
- NextBB = std::next(BI);
- MachineBasicBlock &MBB = *BI;
- bool HaveSkipBlock = false;
-
- if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) {
- // Reached convergence point for last divergent branch.
- ExecBranchStack.pop_back();
- }
-
- if (HaveKill && ExecBranchStack.empty()) {
- HaveKill = false;
-
- // TODO: Insert skip if exec is 0?
- }
-
+ for (MachineBasicBlock &MBB : MF) {
MachineBasicBlock::iterator I, Next;
for (I = MBB.begin(); I != MBB.end(); I = Next) {
Next = std::next(I);
-
MachineInstr &MI = *I;
switch (MI.getOpcode()) {
- case AMDGPU::S_CBRANCH_EXECZ:
- ExecBranchStack.push_back(MI.getOperand(0).getMBB());
- break;
case AMDGPU::SI_MASK_BRANCH:
- ExecBranchStack.push_back(MI.getOperand(0).getMBB());
MadeChange |= skipMaskBranch(MI, MBB);
break;
// Optimize out branches to the next block.
// FIXME: Shouldn't this be handled by BranchFolding?
if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
+ assert(&MI == &MBB.back());
MI.eraseFromParent();
- } else if (HaveSkipBlock) {
- // Remove the given unconditional branch when a skip block has been
- // inserted after the current one and let skip the two instructions
- // performing the kill if the exec mask is non-zero.
- MI.eraseFromParent();
+ MadeChange = true;
}
break;
case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
- case AMDGPU::SI_KILL_I1_TERMINATOR:
+ case AMDGPU::SI_KILL_I1_TERMINATOR: {
MadeChange = true;
- kill(MI);
-
- if (ExecBranchStack.empty()) {
- if (NextBB != BE && skipIfDead(MI, *NextBB)) {
- HaveSkipBlock = true;
- NextBB = std::next(BI);
- BE = MF.end();
- }
+ bool CanKill = kill(MI);
+
+ // Check if we can add an early "if exec=0 { end shader }".
+ //
+ // Note that we _always_ do this if it is correct, even if the kill
+ // happens fairly late in the shader, because the null export should
+ // generally still be cheaper than normal export(s).
+ //
+ // TODO: The dominatesAllReachable check is conservative: if the
+ // dominance is only missing due to _uniform_ branches, we could
+ // in fact insert the early-exit as well.
+ if (CanKill &&
+ MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS &&
+ dominatesAllReachable(MBB)) {
+ // Mark the instruction for kill-if-dead insertion. We delay this
+ // change because it modifies the CFG.
+ KillInstrs.push_back(&MI);
} else {
- HaveKill = true;
+ MI.eraseFromParent();
}
-
- MI.eraseFromParent();
break;
+ }
case AMDGPU::SI_RETURN_TO_EPILOG:
// FIXME: Should move somewhere else
// Graphics shaders returning non-void shouldn't contain S_ENDPGM,
// because external bytecode will be appended at the end.
- if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
+ if (&MBB != &MF.back() || &MI != &MBB.back()) {
// SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block at
// the end and jump there.
if (!EmptyMBBAtEnd) {
}
MBB.addSuccessor(EmptyMBBAtEnd);
- BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
+ BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
.addMBB(EmptyMBBAtEnd);
- I->eraseFromParent();
+ MI.eraseFromParent();
}
break;
}
}
+ for (MachineInstr *Kill : KillInstrs) {
+ skipIfDead(*Kill->getParent(), std::next(Kill->getIterator()),
+ Kill->getDebugLoc());
+ Kill->eraseFromParent();
+ }
+ KillInstrs.clear();
+
return MadeChange;
}
; CHECK-LABEL: {{^}}test_kill_depth_0_imm_neg:
; CHECK-NEXT: ; %bb.0:
; CHECK-NEXT: s_mov_b64 exec, 0
+; CHECK-NEXT: s_cbranch_execnz BB1_2
; CHECK-NEXT: ; %bb.1:
+; CHECK-NEXT: exp null off, off, off, off done vm
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: BB1_2:
; CHECK-NEXT: s_endpgm
define amdgpu_ps void @test_kill_depth_0_imm_neg() #0 {
call void @llvm.amdgcn.kill(i1 false)
; CHECK-LABEL: {{^}}test_kill_depth_0_imm_neg_x2:
; CHECK-NEXT: ; %bb.0:
; CHECK-NEXT: s_mov_b64 exec, 0
-; CHECK-NEXT: ; %bb.1:
+; CHECK-NEXT: s_cbranch_execnz BB2_2
+; CHECK: exp null
+; CHECK-NEXT: s_endpgm
+; CHECK: BB2_2:
; CHECK-NEXT: s_mov_b64 exec, 0
-; CHECK-NEXT: ; %bb.2:
+; CHECK-NEXT: s_cbranch_execnz BB2_4
+; CHECK: exp null
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: BB2_4:
; CHECK-NEXT: s_endpgm
define amdgpu_ps void @test_kill_depth_0_imm_neg_x2() #0 {
call void @llvm.amdgcn.kill(i1 false)
; CHECK-LABEL: {{^}}test_kill_depth_var:
; CHECK-NEXT: ; %bb.0:
; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
-; CHECK-NEXT: ; %bb.1:
+; CHECK-NEXT: s_cbranch_execnz BB3_2
+; CHECK: exp null
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: BB3_2:
; CHECK-NEXT: s_endpgm
define amdgpu_ps void @test_kill_depth_var(float %x) #0 {
%cmp = fcmp olt float %x, 0.0
; CHECK-LABEL: {{^}}test_kill_depth_var_x2_same:
; CHECK-NEXT: ; %bb.0:
; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
-; CHECK-NEXT: ; %bb.1:
+; CHECK-NEXT: s_cbranch_execnz BB4_2
+; CHECK: exp null
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: BB4_2:
; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
-; CHECK-NEXT: ; %bb.2:
+; CHECK-NEXT: s_cbranch_execnz BB4_4
+; CHECK: exp null
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: BB4_4:
; CHECK-NEXT: s_endpgm
define amdgpu_ps void @test_kill_depth_var_x2_same(float %x) #0 {
%cmp = fcmp olt float %x, 0.0
ret void
}
+; FIXME: Ideally only one early-exit would be emitted
; CHECK-LABEL: {{^}}test_kill_depth_var_x2:
; CHECK-NEXT: ; %bb.0:
; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
-; CHECK-NEXT: ; %bb.1:
+; CHECK-NEXT: s_cbranch_execnz BB5_2
+; CHECK: exp null
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: BB5_2:
; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v1
-; CHECK-NEXT: ; %bb.2:
+; CHECK-NEXT: s_cbranch_execnz BB5_4
+; CHECK: exp null
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: BB5_4:
; CHECK-NEXT: s_endpgm
define amdgpu_ps void @test_kill_depth_var_x2(float %x, float %y) #0 {
%cmp.x = fcmp olt float %x, 0.0
; CHECK: v_nop_e64
; CHECK: v_cmpx_gt_f32_e32 vcc, 0, v7
-; CHECK-NEXT: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]]
-; CHECK-NEXT: ; %bb.2:
-; CHECK-NEXT: exp null off, off, off, off done vm
-; CHECK-NEXT: s_endpgm
-; CHECK-NEXT: {{^}}[[SPLIT_BB]]:
-; CHECK-NEXT: s_endpgm
-define amdgpu_ps void @test_kill_control_flow(i32 inreg %arg) #0 {
+; TODO: We could do an early-exit here (the branch above is uniform!)
+; CHECK-NOT: exp null
+
+; CHECK: v_mov_b32_e32 v0, 1.0
+define amdgpu_ps float @test_kill_control_flow(i32 inreg %arg) #0 {
entry:
%cmp = icmp eq i32 %arg, 0
br i1 %cmp, label %bb, label %exit
br label %exit
exit:
- ret void
+ ret float 1.0
}
; CHECK-LABEL: {{^}}test_kill_control_flow_remainder:
; CHECK: v_mov_b32_e64 v8, -1
; CHECK: ;;#ASMEND
; CHECK: v_cmpx_gt_f32_e32 vcc, 0, v7
-; CHECK-NEXT: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]]
-; CHECK-NEXT: ; %bb.2:
-; CHECK-NEXT: exp null off, off, off, off done vm
-; CHECK-NEXT: s_endpgm
+; TODO: We could do an early-exit here (the branch above is uniform!)
+; CHECK-NOT: exp null
-; CHECK-NEXT: {{^}}[[SPLIT_BB]]:
; CHECK: buffer_store_dword v8
; CHECK: v_mov_b32_e64 v9, -2
; CHECK-LABEL: {{^}}complex_loop:
; CHECK: s_mov_b64 exec, 0
-; The following is an error, since it happens nested inside the loop:
-; CHECK-NEXT: s_cbranch_execnz
-; CHECK-NEXT: ; %bb.{{[0-9]+}}
-; CHECK-NEXT: exp null
+; CHECK-NOT: exp null
define amdgpu_ps void @complex_loop(i32 inreg %cmpa, i32 %cmpb, i32 %cmpc) {
.entry:
%flaga = icmp sgt i32 %cmpa, 0