bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
- if (PDT.getRoots().size() <= 1)
+
+ // If there's only one exit, we don't need to do anything, unless this is a
+ // pixel shader and that exit is an infinite loop, since we still have to
+ // insert an export in that case.
+ if (PDT.getRoots().size() <= 1 &&
+ F.getCallingConv() != CallingConv::AMDGPU_PS)
return false;
LegacyDivergenceAnalysis &DA = getAnalysis<LegacyDivergenceAnalysis>();
if (ReturningBlocks.empty())
return false; // No blocks return
- if (ReturningBlocks.size() == 1)
+ if (ReturningBlocks.size() == 1 && !InsertExport)
return false; // Already has a single return block
const TargetTransformInfo &TTI
ret void
}
+; test the case where there's only a kill in an infinite loop
+; CHECK-LABEL: only_kill
+; CHECK: exp null off, off, off, off done vm
+; CHECK-NEXT: s_endpgm
+; SIInsertSkips inserts an extra null export here, but it should be harmless.
+; CHECK: exp null off, off, off, off done vm
+; CHECK-NEXT: s_endpgm
+define amdgpu_ps void @only_kill() #0 {
+main_body:
+ br label %loop
+
+loop:
+ call void @llvm.amdgcn.kill(i1 false) #3
+ br label %loop
+}
+
; In case there's an epilog, we shouldn't have to do this.
; CHECK-LABEL: return_nonvoid
; CHECK-NOT: exp null off, off, off, off done vm
; IR-NEXT: [[DOT01:%.*]] = phi float [ 0.000000e+00, [[DOTLOOPEXIT]] ], [ [[N29:%.*]], [[TRANSITIONBLOCK:%.*]] ]
; IR-NEXT: [[N29]] = fadd float [[DOT01]], 1.000000e+00
; IR-NEXT: [[N30:%.*]] = fcmp ogt float [[N29]], 4.000000e+00
-; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK]], label [[DUMMYRETURNBLOCK:%.*]]
+; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK]], label [[UNIFIEDRETURNBLOCK:%.*]]
; IR: TransitionBlock:
; IR-NEXT: br i1 [[N30]], label [[DOTLOOPEXIT]], label [[N28]]
; IR: n31:
; IR-NEXT: ret void
-; IR: DummyReturnBlock:
+; IR: UnifiedReturnBlock:
+; IR-NEXT: call void @llvm.amdgcn.exp.f32(i32 9, i32 0, float undef, float undef, float undef, float undef, i1 true, i1 true)
; IR-NEXT: ret void
;
.entry: