void getAnalysisUsage(AnalysisUsage &AU) const override;
BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU,
ArrayRef<BasicBlock *> ReturningBlocks,
- bool InsertExport, StringRef Name);
+ StringRef Name);
bool runOnFunction(Function &F) override;
};
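For reference, the analyses that runOnFunction queries below (LegacyDivergenceAnalysis and the post-dominator tree, with the dominator tree consumed through DomTreeUpdater) would be declared in getAnalysisUsage. A minimal sketch of that wiring, assuming the usual legacy-pass-manager idiom rather than the exact in-tree body:

// Sketch only: declare the analyses that runOnFunction uses below.
void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<DominatorTreeWrapperPass>();     // consumed via DomTreeUpdater
  AU.addRequired<PostDominatorTreeWrapperPass>(); // exit blocks are the PDT roots
  AU.addRequired<LegacyDivergenceAnalysis>();     // uniform vs. divergent branches
  FunctionPass::getAnalysisUsage(AU);
}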
return true;
}
-static void removeDoneExport(Function &F) {
- ConstantInt *BoolFalse = ConstantInt::getFalse(F.getContext());
- for (BasicBlock &BB : F) {
- for (Instruction &I : BB) {
- if (IntrinsicInst *Intrin = llvm::dyn_cast<IntrinsicInst>(&I)) {
- if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp) {
- Intrin->setArgOperand(6, BoolFalse); // done
- } else if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr) {
- Intrin->setArgOperand(4, BoolFalse); // done
- }
- }
- }
- }
-}
-
BasicBlock *AMDGPUUnifyDivergentExitNodes::unifyReturnBlockSet(
Function &F, DomTreeUpdater &DTU, ArrayRef<BasicBlock *> ReturningBlocks,
- bool InsertExport, StringRef Name) {
+ StringRef Name) {
// Otherwise, we need to insert a new basic block into the function, add PHI
// nodes (if the function returns values), and convert all of the return
// instructions into unconditional branches.
BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
IRBuilder<> B(NewRetBlock);
- if (InsertExport) {
- // Ensure that there's only one "done" export in the shader by removing the
- // "done" bit set on the original final export. More than one "done" export
- // can lead to undefined behavior.
- removeDoneExport(F);
-
- Value *Undef = UndefValue::get(B.getFloatTy());
- B.CreateIntrinsic(Intrinsic::amdgcn_exp, { B.getFloatTy() },
- {
- B.getInt32(AMDGPU::Exp::ET_NULL),
- B.getInt32(0), // enabled channels
- Undef, Undef, Undef, Undef, // values
- B.getTrue(), // done
- B.getTrue(), // valid mask
- });
- }
-
PHINode *PN = nullptr;
if (F.getReturnType()->isVoidTy()) {
B.CreateRetVoid();
} else {
// If the function doesn't return void... add a PHI node to the block...
PN = B.CreatePHI(F.getReturnType(), ReturningBlocks.size(),
"UnifiedRetVal");
- assert(!InsertExport);
B.CreateRet(PN);
}
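The remainder of unifyReturnBlockSet is not part of this hunk; it rewires each returning block to branch into NewRetBlock and merges return values into the PHI. A hedged sketch of that step, reusing the names above but not claiming to be the verbatim in-tree code:

  // Illustrative sketch: replace each return with a branch to the unified block.
  SmallVector<DominatorTree::UpdateType, 4> Updates;
  for (BasicBlock *BB : ReturningBlocks) {
    // In the non-void case, merge this block's return value into the PHI.
    if (PN)
      PN->addIncoming(BB->getTerminator()->getOperand(0), BB);

    // Drop the return and branch to NewRetBlock instead.
    BB->getTerminator()->eraseFromParent();
    BranchInst::Create(NewRetBlock, BB);
    Updates.push_back({DominatorTree::Insert, BB, NewRetBlock});
  }
  DTU.applyUpdates(Updates);
  return NewRetBlock;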
auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
- // If there's only one exit, we don't need to do anything, unless this is a
- // pixel shader and that exit is an infinite loop, since we still have to
- // insert an export in that case.
- if (PDT.root_size() <= 1 && F.getCallingConv() != CallingConv::AMDGPU_PS)
+ // If there's only one exit, we don't need to do anything.
+ if (PDT.root_size() <= 1)
return false;
LegacyDivergenceAnalysis &DA = getAnalysis<LegacyDivergenceAnalysis>();
// Loop over all of the blocks in a function, tracking all of the blocks that
// return.
SmallVector<BasicBlock *, 4> ReturningBlocks;
- SmallVector<BasicBlock *, 4> UniformlyReachedRetBlocks;
SmallVector<BasicBlock *, 4> UnreachableBlocks;
// Dummy return block for infinite loop.
BasicBlock *DummyReturnBB = nullptr;
- bool InsertExport = false;
-
bool Changed = false;
std::vector<DominatorTree::UpdateType> Updates;
if (isa<ReturnInst>(BB->getTerminator())) {
if (!isUniformlyReached(DA, *BB))
ReturningBlocks.push_back(BB);
- else
- UniformlyReachedRetBlocks.push_back(BB);
} else if (isa<UnreachableInst>(BB->getTerminator())) {
if (!isUniformlyReached(DA, *BB))
UnreachableBlocks.push_back(BB);
"DummyReturnBlock", &F);
Type *RetTy = F.getReturnType();
Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
-
- // For pixel shaders, the producer guarantees that an export is
- // executed before each return instruction. However, if there is an
- // infinite loop and we insert a return ourselves, we need to uphold
- // that guarantee by inserting a null export. This can happen e.g. in
- // an infinite loop with kill instructions, which is supposed to
- // terminate. However, we don't need to do this if there is a non-void
- // return value, since then there is an epilog afterwards which will
- // still export.
- //
- // Note: In the case where only some threads enter the infinite loop,
- // this can result in the null export happening redundantly after the
- // original exports. However, The last "real" export happens after all
- // the threads that didn't enter an infinite loop converged, which
- // means that the only extra threads to execute the null export are
- // threads that entered the infinite loop, and they only could've
- // exited through being killed which sets their exec bit to 0.
- // Therefore, unless there's an actual infinite loop, which can have
- // invalid results, or there's a kill after the last export, which we
- // assume the frontend won't do, this export will have the same exec
- // mask as the last "real" export, and therefore the valid mask will be
- // overwritten with the same value and will still be correct. Also,
- // even though this forces an extra unnecessary export wait, we assume
- // that this happens rare enough in practice to that we don't have to
- // worry about performance.
- if (F.getCallingConv() == CallingConv::AMDGPU_PS &&
- RetTy->isVoidTy()) {
- InsertExport = true;
- }
-
ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
ReturningBlocks.push_back(DummyReturnBB);
}
if (ReturningBlocks.empty())
return Changed; // No blocks return
- if (ReturningBlocks.size() == 1 && !InsertExport)
+ if (ReturningBlocks.size() == 1)
return Changed; // Already has a single return block
- // Unify returning blocks. If we are going to insert the export it is also
- // necessary to include blocks that are uniformly reached, because in addition
- // to inserting the export the "done" bits on existing exports will be cleared
- // and we do not want to end up with the normal export in a non-unified,
- // uniformly reached block with the "done" bit cleared.
- auto BlocksToUnify = std::move(ReturningBlocks);
- if (InsertExport) {
- llvm::append_range(BlocksToUnify, UniformlyReachedRetBlocks);
- }
-
- unifyReturnBlockSet(F, DTU, BlocksToUnify, InsertExport,
- "UnifiedReturnBlock");
+ unifyReturnBlockSet(F, DTU, ReturningBlocks, "UnifiedReturnBlock");
return true;
}
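The classification above keys off isUniformlyReached: only exit blocks that can be reached through a divergent branch need to be unified. A sketch of what such a helper can look like, assuming the LegacyDivergenceAnalysis::isUniform query and a walk over predecessors; illustrative only, not necessarily the exact in-tree implementation:

// Sketch: a block is uniformly reached if every terminator on every path
// into it is uniform according to the divergence analysis.
static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA,
                               BasicBlock &BB) {
  SmallVector<BasicBlock *, 8> Stack(pred_begin(&BB), pred_end(&BB));
  SmallPtrSet<BasicBlock *, 8> Visited;

  while (!Stack.empty()) {
    BasicBlock *Top = Stack.pop_back_val();
    if (!DA.isUniform(Top->getTerminator()))
      return false; // reached through a divergent branch

    for (BasicBlock *Pred : predecessors(Top))
      if (Visited.insert(Pred).second)
        Stack.push_back(Pred);
  }
  return true;
}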
; CHECK-NEXT: ; %bb.4: ; %end
; CHECK-NEXT: v_mov_b32_e32 v0, 1.0
; CHECK-NEXT: v_mov_b32_e32 v1, 0
-; CHECK-NEXT: exp mrt0 v1, v1, v1, v0 vm
+; CHECK-NEXT: exp mrt0 v1, v1, v1, v0 done vm
; CHECK-NEXT: BB0_5: ; %UnifiedReturnBlock
-; CHECK-NEXT: s_waitcnt expcnt(0)
-; CHECK-NEXT: s_or_b64 exec, exec, s[0:1]
-; CHECK-NEXT: exp null off, off, off, off done vm
; CHECK-NEXT: s_endpgm
; CHECK-NEXT: BB0_6:
; CHECK-NEXT: s_mov_b64 exec, 0
; CHECK-NEXT: s_cbranch_execz BB1_5
; CHECK-NEXT: ; %bb.4: ; %end
; CHECK-NEXT: v_mov_b32_e32 v0, 0
-; CHECK-NEXT: exp mrt0 v0, off, v0, off compr vm
+; CHECK-NEXT: exp mrt0 v0, off, v0, off done compr vm
; CHECK-NEXT: BB1_5: ; %UnifiedReturnBlock
-; CHECK-NEXT: s_waitcnt expcnt(0)
-; CHECK-NEXT: s_or_b64 exec, exec, s[0:1]
-; CHECK-NEXT: exp null off, off, off, off done vm
; CHECK-NEXT: s_endpgm
; CHECK-NEXT: BB1_6:
; CHECK-NEXT: s_mov_b64 exec, 0
; CHECK-NEXT: BB2_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
-; CHECK-NEXT: s_cbranch_scc0 BB2_4
+; CHECK-NEXT: s_cbranch_scc0 BB2_3
; CHECK-NEXT: ; %bb.2: ; %loop
; CHECK-NEXT: ; in Loop: Header=BB2_1 Depth=1
; CHECK-NEXT: s_mov_b64 exec, 0
-; CHECK-NEXT: s_mov_b64 vcc, exec
-; CHECK-NEXT: s_cbranch_execnz BB2_1
-; CHECK-NEXT: ; %bb.3: ; %UnifiedReturnBlock
-; CHECK-NEXT: exp null off, off, off, off done vm
-; CHECK-NEXT: s_endpgm
-; CHECK-NEXT: BB2_4:
+; CHECK-NEXT: s_branch BB2_1
+; CHECK-NEXT: BB2_3:
; CHECK-NEXT: s_mov_b64 exec, 0
; CHECK-NEXT: exp null off, off, off, off done vm
; CHECK-NEXT: s_endpgm
; IR-LABEL: @uniformly_reached_export
; IR-NEXT: .entry:
-; IR: br i1 [[CND:%.*]], label %[[EXP:.*]], label %[[FLOW:.*]]
-
-; IR: [[FLOW]]:
-; IR-NEXT: phi
-; IR-NEXT: br i1 [[CND2:%.*]], label %[[LOOP:.*]], label %UnifiedReturnBlock
+; IR: br i1 [[CND:%.*]], label %[[LOOP:.*]], label %[[EXP:.*]]
; IR: [[LOOP]]:
-; IR-NEXT: br i1 false, label %[[FLOW1:.*]], label %[[LOOP]]
+; IR-NEXT: br i1 false, label %DummyReturnBlock, label %[[LOOP]]
; IR: [[EXP]]:
-; IR-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg false, i1 immarg true)
-; IR-NEXT: br label %[[FLOW]]
-
-; IR: [[FLOW1]]:
-; IR-NEXT: br label %UnifiedReturnBlock
+; IR-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true)
+; IR-NEXT: ret void
-; IR: UnifiedReturnBlock:
-; IR-NEXT: call void @llvm.amdgcn.exp.f32(i32 9, i32 0, float undef, float undef, float undef, float undef, i1 true, i1 true)
+; IR: DummyReturnBlock:
; IR-NEXT: ret void
define amdgpu_ps void @uniformly_reached_export(float inreg %tmp25) {