--- /dev/null
+//===- AMDGPUUnifyDivergentExitNodes.cpp ----------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a variant of the UnifyFunctionExitNodes pass. Rather than ensuring
+// there is at most one ret and one unreachable instruction, it ensures there is
+// at most one divergent exiting block.
+//
+// StructurizeCFG can't deal with multi-exit regions formed by branches to
+// multiple return nodes. It is not desirable to structurize regions with
+// uniform branches, so unifying those to the same return block as divergent
+// branches inhibits use of scalar branching. It still can't deal with the case
+// where one branch goes to return, and one unreachable. Replace unreachable in
+// this case with a return.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-unify-divergent-exit-nodes"
+
+namespace {
+
+/// Function pass that unifies the exit blocks reached through divergent
+/// control flow; the transformation itself lives in runOnFunction().
+class AMDGPUUnifyDivergentExitNodes : public FunctionPass {
+public:
+  AMDGPUUnifyDivergentExitNodes() : FunctionPass(ID) {
+    // Run this pass's initialization against the global pass registry.
+    initializeAMDGPUUnifyDivergentExitNodesPass(*PassRegistry::getPassRegistry());
+  }
+
+  /// Pass identification, replacement for typeid.
+  static char ID;
+
+  /// Performs the unification on \p F.
+  bool runOnFunction(Function &F) override;
+
+  /// Declares the analyses this pass requires and the ones it preserves.
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+};
+
+}
+
+char AMDGPUUnifyDivergentExitNodes::ID = 0;
+
+// Register the pass and list every analysis that getAnalysisUsage() marks as
+// required as an initialization dependency. TargetTransformInfoWrapperPass is
+// required there as well, so it is listed here alongside the other two.
+INITIALIZE_PASS_BEGIN(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
+                     "Unify divergent function exit nodes", false, false)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
+                    "Unify divergent function exit nodes", false, false)
+
+// Reference to the pass ID exposed in the llvm namespace.
+char &llvm::AMDGPUUnifyDivergentExitNodesID = AMDGPUUnifyDivergentExitNodes::ID;
+
+/// Declares which analyses the pass consumes and which ones remain valid
+/// after it has run.
+void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const {
+  // Analyses consumed by runOnFunction().
+  // TODO: Preserve dominator tree.
+  AU.addRequired<PostDominatorTreeWrapperPass>();
+  AU.addRequired<DivergenceAnalysis>();
+  AU.addRequired<TargetTransformInfoWrapperPass>();
+
+  // Only blocks and branch edges are rewritten; no divergent values change.
+  AU.addPreserved<DivergenceAnalysis>();
+
+  // The non-critical-edgeness property is kept.
+  AU.addPreservedID(BreakCriticalEdgesID);
+
+  // LowerSwitch is part of the same cluster of orthogonal transforms.
+  AU.addPreservedID(LowerSwitchID);
+
+  FunctionPass::getAnalysisUsage(AU);
+}
+
+/// \returns true if \p BB is reachable through only uniform branches.
+///
+/// Walks the CFG backwards from \p BB over all of its transitive predecessors
+/// and gives up as soon as one of them ends in a terminator that \p DA does
+/// not report as uniform. The terminator of \p BB itself is not inspected.
+/// XXX - Is there a more efficient way to find this?
+static bool isUniformlyReached(const DivergenceAnalysis &DA,
+                               BasicBlock &BB) {
+  SmallVector<BasicBlock *, 8> Stack;
+  SmallPtrSet<BasicBlock *, 8> Visited;
+
+  // Seed the walk with the direct predecessors. They are recorded as visited
+  // right away so that no block is queued twice, whether it is listed several
+  // times as a predecessor or is reached again later in the walk.
+  for (BasicBlock *Pred : predecessors(&BB)) {
+    if (Visited.insert(Pred).second)
+      Stack.push_back(Pred);
+  }
+
+  while (!Stack.empty()) {
+    BasicBlock *Top = Stack.pop_back_val();
+    // A single non-uniform branch on the way is enough to answer "no".
+    if (!DA.isUniform(Top->getTerminator()))
+      return false;
+
+    for (BasicBlock *Pred : predecessors(Top)) {
+      if (Visited.insert(Pred).second)
+        Stack.push_back(Pred);
+    }
+  }
+
+  return true;
+}
+
+/// Appends a new block called \p Name to \p F that contains the only return,
+/// and makes every block in \p ReturningBlocks branch to it instead of
+/// returning itself.
+///
+/// When the function returns a value, the individual return operands are
+/// merged through a PHI node placed in the new block. Afterwards each
+/// rewritten block is handed to SimplifyCFG.
+///
+/// \returns the newly created return block.
+static BasicBlock *unifyReturnBlockSet(Function &F,
+                                       ArrayRef<BasicBlock *> ReturningBlocks,
+                                       const TargetTransformInfo &TTI,
+                                       StringRef Name) {
+  LLVMContext &Ctx = F.getContext();
+  BasicBlock *UnifiedBB = BasicBlock::Create(Ctx, Name, &F);
+
+  // A value-returning function needs a PHI to collect the results; for a void
+  // function the pointer stays null and the return carries no operand.
+  PHINode *MergedVal = nullptr;
+  if (!F.getReturnType()->isVoidTy()) {
+    MergedVal = PHINode::Create(F.getReturnType(), ReturningBlocks.size(),
+                                "UnifiedRetVal");
+    UnifiedBB->getInstList().push_back(MergedVal);
+  }
+  ReturnInst::Create(Ctx, MergedVal, UnifiedBB);
+
+  // Turn every old return into an unconditional branch to the unified block,
+  // forwarding the returned value (if any) into the PHI first.
+  for (BasicBlock *RetBB : ReturningBlocks) {
+    if (MergedVal)
+      MergedVal->addIncoming(RetBB->getTerminator()->getOperand(0), RetBB);
+
+    RetBB->getInstList().pop_back(); // Drop the return instruction.
+    BranchInst::Create(UnifiedBB, RetBB);
+  }
+
+  // Clean up a possible branch to an unconditional branch to the return.
+  for (BasicBlock *RetBB : ReturningBlocks)
+    SimplifyCFG(RetBB, TTI, 2);
+
+  return UnifiedBB;
+}
+
+/// Merges the exit blocks of \p F that are reached through divergent control
+/// flow.
+///
+/// Only post-dominator tree roots that end in a return or an unreachable are
+/// looked at, and only those for which isUniformlyReached() answers false.
+/// Several such unreachable blocks are folded into one
+/// "UnifiedUnreachableBlock". If a return block was collected as well, the
+/// unreachable is replaced by a call to llvm.amdgcn.unreachable followed by a
+/// return, and that block is added to the return set. Finally, two or more
+/// collected return blocks are funneled into a single "UnifiedReturnBlock".
+///
+/// \returns true if the IR of \p F was modified.
+bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
+  auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+  // With at most one root there are no multiple exits to unify.
+  if (PDT.getRoots().size() <= 1)
+    return false;
+
+  DivergenceAnalysis &DA = getAnalysis<DivergenceAnalysis>();
+
+  // Loop over all of the post-dominator tree roots, tracking the ones that
+  // return or are unreachable and are not reached uniformly.
+  SmallVector<BasicBlock *, 4> ReturningBlocks;
+  SmallVector<BasicBlock *, 4> UnreachableBlocks;
+
+  for (BasicBlock *BB : PDT.getRoots()) {
+    if (isa<ReturnInst>(BB->getTerminator())) {
+      if (!isUniformlyReached(DA, *BB))
+        ReturningBlocks.push_back(BB);
+    } else if (isa<UnreachableInst>(BB->getTerminator())) {
+      if (!isUniformlyReached(DA, *BB))
+        UnreachableBlocks.push_back(BB);
+    }
+  }
+
+  // Tracks whether the IR was touched, so that the early exits below report
+  // modifications made while handling the unreachable blocks.
+  bool Changed = false;
+
+  if (!UnreachableBlocks.empty()) {
+    BasicBlock *UnreachableBlock = nullptr;
+
+    if (UnreachableBlocks.size() == 1) {
+      UnreachableBlock = UnreachableBlocks.front();
+    } else {
+      UnreachableBlock = BasicBlock::Create(F.getContext(),
+                                            "UnifiedUnreachableBlock", &F);
+      new UnreachableInst(F.getContext(), UnreachableBlock);
+
+      for (BasicBlock *BB : UnreachableBlocks) {
+        BB->getInstList().pop_back(); // Remove the unreachable inst.
+        BranchInst::Create(UnreachableBlock, BB);
+      }
+      Changed = true;
+    }
+
+    if (!ReturningBlocks.empty()) {
+      // Don't create a new unreachable inst if we have a return. The
+      // structurizer/annotator can't handle the multiple exits
+
+      Type *RetTy = F.getReturnType();
+      Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
+      UnreachableBlock->getInstList().pop_back(); // Remove the unreachable inst.
+
+      Function *UnreachableIntrin =
+        Intrinsic::getDeclaration(F.getParent(), Intrinsic::amdgcn_unreachable);
+
+      // Insert a call to an intrinsic tracking that this is an unreachable
+      // point, in case we want to kill the active lanes or something later.
+      CallInst::Create(UnreachableIntrin, {}, "", UnreachableBlock);
+
+      // Don't create a scalar trap. We would only want to trap if this code was
+      // really reached, but a scalar trap would happen even if no lanes
+      // actually reached here.
+      ReturnInst::Create(F.getContext(), RetVal, UnreachableBlock);
+      ReturningBlocks.push_back(UnreachableBlock);
+      Changed = true;
+    }
+  }
+
+  // Now handle return blocks.
+  if (ReturningBlocks.empty())
+    return Changed; // No blocks return
+
+  if (ReturningBlocks.size() == 1)
+    return Changed; // Already has a single return block
+
+  const TargetTransformInfo &TTI
+    = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+  unifyReturnBlockSet(F, ReturningBlocks, TTI, "UnifiedReturnBlock");
+  return true;
+}
--- /dev/null
+; RUN: opt -mtriple=amdgcn-- -S -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow %s | FileCheck -check-prefix=IR %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Add extra verifier runs. There were some cases where invalid IR
+; was produced but happened to be fixed by the later passes.
+
+; Make sure divergent control flow with multiple exits from a region
+; is properly handled. UnifyFunctionExitNodes should be run before
+; StructurizeCFG.
+
+; IR-LABEL: @multi_divergent_region_exit_ret_ret(
+; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
+; IR: %2 = extractvalue { i1, i64 } %1, 0
+; IR: %3 = extractvalue { i1, i64 } %1, 1
+; IR: br i1 %2, label %LeafBlock1, label %Flow
+
+; IR: Flow:
+; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
+; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
+; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
+; IR: %7 = extractvalue { i1, i64 } %6, 0
+; IR: %8 = extractvalue { i1, i64 } %6, 1
+; IR: br i1 %7, label %LeafBlock, label %Flow1
+
+; IR: LeafBlock:
+; IR: br label %Flow1
+
+; IR: LeafBlock1:
+; IR: br label %Flow{{$}}
+
+; IR: Flow2:
+; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %19)
+; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
+; IR: %13 = extractvalue { i1, i64 } %12, 0
+; IR: %14 = extractvalue { i1, i64 } %12, 1
+; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock
+
+; IR: exit0:
+; IR: store volatile i32 9, i32 addrspace(1)* undef
+; IR: br label %UnifiedReturnBlock
+
+; IR: Flow1:
+; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ]
+; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %8)
+; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
+; IR: %18 = extractvalue { i1, i64 } %17, 0
+; IR: %19 = extractvalue { i1, i64 } %17, 1
+; IR: br i1 %18, label %exit1, label %Flow2
+
+; IR: exit1:
+; IR: store volatile i32 17, i32 addrspace(3)* undef
+; IR: br label %Flow2
+
+; IR: UnifiedReturnBlock:
+; IR: call void @llvm.amdgcn.end.cf(i64 %14)
+; IR: ret void
+
+
+; GCN-LABEL: {{^}}multi_divergent_region_exit_ret_ret:
+; GCN: v_cmp_lt_i32_e32 vcc, 1
+; GCN: s_and_saveexec_b64
+; GCN: s_xor_b64
+
+
+; FIXME: Why is this compare essentially repeated?
+; GCN: v_cmp_eq_u32_e32 vcc, 1, [[REG:v[0-9]+]]
+; GCN-NEXT: v_cmp_ne_u32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1, [[REG]]
+; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
+; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1
+
+; GCN: ; %Flow1
+; GCN-NEXT: s_or_b64 exec, exec
+; GCN: v_cmp_ne_u32_e32 vcc, 0
+
+; GCN: ; %exit1
+; GCN: ds_write_b32
+
+; GCN: %Flow2
+; GCN-NEXT: s_or_b64 exec, exec
+; GCN: v_cmp_ne_u32_e32 vcc, 0
+; GCN-NEXT: s_and_saveexec_b64
+; GCN-NEXT: s_xor_b64
+
+; GCN: ; %exit0
+; GCN: buffer_store_dword
+
+; GCN: ; %UnifiedReturnBlock
+; GCN-NEXT: s_or_b64 exec, exec
+; GCN-NEXT: s_endpgm
+; All three conditional branches compare %tmp16, which is loaded through an
+; address chain that starts at llvm.amdgcn.workitem.id.x. Both leaf blocks can
+; reach either exit block, and both exit blocks end in "ret void".
+define amdgpu_kernel void @multi_divergent_region_exit_ret_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
+entry:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tmp1 = add i32 0, %tmp
+ %tmp2 = zext i32 %tmp1 to i64
+ %tmp3 = add i64 0, %tmp2
+ %tmp4 = shl i64 %tmp3, 32
+ %tmp5 = ashr exact i64 %tmp4, 32
+ %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
+ %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+ %tmp8 = sext i32 %tmp7 to i64
+ %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
+ %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+ %tmp13 = zext i32 %tmp10 to i64
+ %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
+ %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+ %Pivot = icmp slt i32 %tmp16, 2
+ br i1 %Pivot, label %LeafBlock, label %LeafBlock1
+
+LeafBlock: ; preds = %entry
+ %SwitchLeaf = icmp eq i32 %tmp16, 1
+ br i1 %SwitchLeaf, label %exit0, label %exit1
+
+LeafBlock1: ; preds = %entry
+ %SwitchLeaf2 = icmp eq i32 %tmp16, 2
+ br i1 %SwitchLeaf2, label %exit0, label %exit1
+
+exit0: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 9, i32 addrspace(1)* undef
+ ret void
+
+exit1: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 17, i32 addrspace(3)* undef
+ ret void
+}
+
+; IR-LABEL: @multi_divergent_region_exit_unreachable_unreachable(
+; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
+
+; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
+
+; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %19)
+; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
+; IR: br i1 %13, label %exit0, label %UnifiedUnreachableBlock
+
+
+; IR: UnifiedUnreachableBlock:
+; IR-NEXT: unreachable
+
+
+; FIXME: Probably should insert an s_endpgm anyway.
+; GCN-LABEL: {{^}}multi_divergent_region_exit_unreachable_unreachable:
+; GCN: ; %UnifiedUnreachableBlock
+; GCN-NEXT: .Lfunc_end
+; Same control-flow shape as the ret/ret case, with every branch comparing the
+; workitem-id-derived load %tmp16, but both exit blocks end in "unreachable"
+; instead of a return.
+define amdgpu_kernel void @multi_divergent_region_exit_unreachable_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
+entry:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tmp1 = add i32 0, %tmp
+ %tmp2 = zext i32 %tmp1 to i64
+ %tmp3 = add i64 0, %tmp2
+ %tmp4 = shl i64 %tmp3, 32
+ %tmp5 = ashr exact i64 %tmp4, 32
+ %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
+ %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+ %tmp8 = sext i32 %tmp7 to i64
+ %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
+ %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+ %tmp13 = zext i32 %tmp10 to i64
+ %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
+ %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+ %Pivot = icmp slt i32 %tmp16, 2
+ br i1 %Pivot, label %LeafBlock, label %LeafBlock1
+
+LeafBlock: ; preds = %entry
+ %SwitchLeaf = icmp eq i32 %tmp16, 1
+ br i1 %SwitchLeaf, label %exit0, label %exit1
+
+LeafBlock1: ; preds = %entry
+ %SwitchLeaf2 = icmp eq i32 %tmp16, 2
+ br i1 %SwitchLeaf2, label %exit0, label %exit1
+
+exit0: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 9, i32 addrspace(1)* undef
+ unreachable
+
+exit1: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 17, i32 addrspace(3)* undef
+ unreachable
+}
+
+; IR-LABEL: @multi_exit_region_divergent_ret_uniform_ret(
+; IR: %divergent.cond0 = icmp slt i32 %tmp16, 2
+; IR: llvm.amdgcn.if
+; IR: br i1
+
+; IR: {{^}}Flow:
+; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
+; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
+; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
+; IR: br i1 %7, label %LeafBlock, label %Flow1
+
+; IR: {{^}}LeafBlock:
+; IR: %divergent.cond1 = icmp eq i32 %tmp16, 1
+; IR: %9 = xor i1 %divergent.cond1, true
+; IR: br label %Flow1
+
+; IR: LeafBlock1:
+; IR: %uniform.cond0 = icmp eq i32 %arg3, 2
+; IR: %10 = xor i1 %uniform.cond0, true
+; IR: br label %Flow
+
+; IR: Flow2:
+; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %19)
+; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
+; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock
+
+; IR: exit0:
+; IR: store volatile i32 9, i32 addrspace(1)* undef
+; IR: br label %UnifiedReturnBlock
+
+; IR: {{^}}Flow1:
+; IR: %15 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %4, %Flow ]
+; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %8)
+; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
+; IR: %18 = extractvalue { i1, i64 } %17, 0
+; IR: %19 = extractvalue { i1, i64 } %17, 1
+; IR: br i1 %18, label %exit1, label %Flow2
+
+; IR: exit1:
+; IR: store volatile i32 17, i32 addrspace(3)* undef
+; IR: br label %Flow2
+
+; IR: UnifiedReturnBlock:
+; IR: call void @llvm.amdgcn.end.cf(i64 %14)
+; IR: ret void
+; The entry and LeafBlock branches compare the workitem-id-derived load
+; %tmp16, while the LeafBlock1 branch compares only the kernel argument %arg3.
+; Both exit blocks return.
+define amdgpu_kernel void @multi_exit_region_divergent_ret_uniform_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 {
+entry:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tmp1 = add i32 0, %tmp
+ %tmp2 = zext i32 %tmp1 to i64
+ %tmp3 = add i64 0, %tmp2
+ %tmp4 = shl i64 %tmp3, 32
+ %tmp5 = ashr exact i64 %tmp4, 32
+ %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
+ %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+ %tmp8 = sext i32 %tmp7 to i64
+ %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
+ %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+ %tmp13 = zext i32 %tmp10 to i64
+ %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
+ %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+ %divergent.cond0 = icmp slt i32 %tmp16, 2
+ br i1 %divergent.cond0, label %LeafBlock, label %LeafBlock1
+
+LeafBlock: ; preds = %entry
+ %divergent.cond1 = icmp eq i32 %tmp16, 1
+ br i1 %divergent.cond1, label %exit0, label %exit1
+
+LeafBlock1: ; preds = %entry
+ %uniform.cond0 = icmp eq i32 %arg3, 2
+ br i1 %uniform.cond0, label %exit0, label %exit1
+
+exit0: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 9, i32 addrspace(1)* undef
+ ret void
+
+exit1: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 17, i32 addrspace(3)* undef
+ ret void
+}
+
+; IR-LABEL: @multi_exit_region_uniform_ret_divergent_ret(
+; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
+; IR: br i1 %2, label %LeafBlock1, label %Flow
+
+; IR: Flow:
+; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
+; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
+; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
+
+; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %19)
+; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
+
+; Mirror image of the divergent_ret_uniform_ret case: here the LeafBlock
+; branch compares only the kernel argument %arg3, while the entry and
+; LeafBlock1 branches compare the workitem-id-derived load %tmp16.
+define amdgpu_kernel void @multi_exit_region_uniform_ret_divergent_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 {
+entry:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tmp1 = add i32 0, %tmp
+ %tmp2 = zext i32 %tmp1 to i64
+ %tmp3 = add i64 0, %tmp2
+ %tmp4 = shl i64 %tmp3, 32
+ %tmp5 = ashr exact i64 %tmp4, 32
+ %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
+ %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+ %tmp8 = sext i32 %tmp7 to i64
+ %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
+ %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+ %tmp13 = zext i32 %tmp10 to i64
+ %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
+ %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+ %Pivot = icmp slt i32 %tmp16, 2
+ br i1 %Pivot, label %LeafBlock, label %LeafBlock1
+
+LeafBlock: ; preds = %entry
+ %SwitchLeaf = icmp eq i32 %arg3, 1
+ br i1 %SwitchLeaf, label %exit0, label %exit1
+
+LeafBlock1: ; preds = %entry
+ %SwitchLeaf2 = icmp eq i32 %tmp16, 2
+ br i1 %SwitchLeaf2, label %exit0, label %exit1
+
+exit0: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 9, i32 addrspace(1)* undef
+ ret void
+
+exit1: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 17, i32 addrspace(3)* undef
+ ret void
+}
+
+; IR-LABEL: @multi_divergent_region_exit_ret_ret_return_value(
+; IR: Flow2:
+; IR: %11 = phi float [ 2.000000e+00, %exit1 ], [ undef, %Flow1 ]
+; IR: %12 = phi i1 [ false, %exit1 ], [ %16, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %20)
+
+; IR: UnifiedReturnBlock:
+; IR: %UnifiedRetVal = phi float [ %11, %Flow2 ], [ 1.000000e+00, %exit0 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %15)
+; IR: ret float %UnifiedRetVal
+; Value-returning variant: every branch compares the %vgpr argument, and the
+; two exit blocks return different float constants (1.0 and 2.0), so unifying
+; them needs a PHI for the return value.
+define amdgpu_ps float @multi_divergent_region_exit_ret_ret_return_value(i32 %vgpr) #0 {
+entry:
+ %Pivot = icmp slt i32 %vgpr, 2
+ br i1 %Pivot, label %LeafBlock, label %LeafBlock1
+
+LeafBlock: ; preds = %entry
+ %SwitchLeaf = icmp eq i32 %vgpr, 1
+ br i1 %SwitchLeaf, label %exit0, label %exit1
+
+LeafBlock1: ; preds = %entry
+ %SwitchLeaf2 = icmp eq i32 %vgpr, 2
+ br i1 %SwitchLeaf2, label %exit0, label %exit1
+
+exit0: ; preds = %LeafBlock, %LeafBlock1
+ store i32 9, i32 addrspace(1)* undef
+ ret float 1.0
+
+exit1: ; preds = %LeafBlock, %LeafBlock1
+ store i32 17, i32 addrspace(3)* undef
+ ret float 2.0
+}
+
+; IR-LABEL: @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(
+
+; GCN-LABEL: {{^}}uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value:
+; GCN: s_cmp_gt_i32 s0, 1
+; GCN: s_cbranch_scc0 [[FLOW:BB[0-9]+_[0-9]+]]
+
+; GCN: v_cmp_ne_u32_e32 vcc, 7, v0
+
+; GCN: {{^}}[[FLOW]]:
+; GCN: s_cbranch_vccnz [[FLOW1:BB[0-9]+]]
+
+; GCN: v_mov_b32_e32 v0, 2.0
+; GCN: s_or_b64 exec, exec
+; GCN: s_and_b64 exec, exec
+; GCN: v_mov_b32_e32 v0, 1.0
+
+; GCN: {{^BB[0-9]+_[0-9]+}}: ; %UnifiedReturnBlock
+; GCN-NEXT: s_or_b64 exec, exec
+; GCN-NEXT: ; return
+
+; The entry branch compares the inreg argument %sgpr, while both leaf branches
+; compare the %vgpr argument. The exit blocks return 1.0 and 2.0.
+define amdgpu_ps float @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(i32 inreg %sgpr, i32 %vgpr) #0 {
+entry:
+ %uniform.cond = icmp slt i32 %sgpr, 2
+ br i1 %uniform.cond, label %LeafBlock, label %LeafBlock1
+
+LeafBlock: ; preds = %entry
+ %divergent.cond0 = icmp eq i32 %vgpr, 3
+ br i1 %divergent.cond0, label %exit0, label %exit1
+
+LeafBlock1: ; preds = %entry
+ %divergent.cond1 = icmp eq i32 %vgpr, 7
+ br i1 %divergent.cond1, label %exit0, label %exit1
+
+exit0: ; preds = %LeafBlock, %LeafBlock1
+ store i32 9, i32 addrspace(1)* undef
+ ret float 1.0
+
+exit1: ; preds = %LeafBlock, %LeafBlock1
+ store i32 17, i32 addrspace(3)* undef
+ ret float 2.0
+}
+
+; IR-LABEL: @multi_divergent_region_exit_ret_unreachable(
+; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
+
+; IR: Flow:
+; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
+; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
+; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
+
+; IR: Flow2:
+; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %19)
+; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
+; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock
+
+; IR: exit0:
+; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef
+; IR-NEXT: br label %UnifiedReturnBlock
+
+; IR: Flow1:
+; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ]
+; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %8)
+; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
+; IR: %18 = extractvalue { i1, i64 } %17, 0
+; IR: %19 = extractvalue { i1, i64 } %17, 1
+; IR: br i1 %18, label %exit1, label %Flow2
+
+; IR: exit1:
+; IR-NEXT: store volatile i32 9, i32 addrspace(1)* undef
+; IR-NEXT: call void @llvm.amdgcn.unreachable()
+; IR-NEXT: br label %Flow2
+
+; IR: UnifiedReturnBlock:
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14)
+; IR-NEXT: ret void
+; Mixed exits: every branch compares the workitem-id-derived load %tmp16;
+; exit0 returns while exit1 ends in "unreachable".
+define amdgpu_kernel void @multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
+entry:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tmp1 = add i32 0, %tmp
+ %tmp2 = zext i32 %tmp1 to i64
+ %tmp3 = add i64 0, %tmp2
+ %tmp4 = shl i64 %tmp3, 32
+ %tmp5 = ashr exact i64 %tmp4, 32
+ %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
+ %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+ %tmp8 = sext i32 %tmp7 to i64
+ %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
+ %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+ %tmp13 = zext i32 %tmp10 to i64
+ %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
+ %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+ %Pivot = icmp slt i32 %tmp16, 2
+ br i1 %Pivot, label %LeafBlock, label %LeafBlock1
+
+LeafBlock: ; preds = %entry
+ %SwitchLeaf = icmp eq i32 %tmp16, 1
+ br i1 %SwitchLeaf, label %exit0, label %exit1
+
+LeafBlock1: ; preds = %entry
+ %SwitchLeaf2 = icmp eq i32 %tmp16, 2
+ br i1 %SwitchLeaf2, label %exit0, label %exit1
+
+exit0: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 17, i32 addrspace(3)* undef
+ ret void
+
+exit1: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 9, i32 addrspace(1)* undef
+ unreachable
+}
+
+; The non-uniformity of the branch to the exiting blocks requires
+; looking at transitive predecessors.
+
+; IR-LABEL: @indirect_multi_divergent_region_exit_ret_unreachable(
+
+; IR: exit0: ; preds = %Flow2
+; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef
+; IR-NEXT: br label %UnifiedReturnBlock
+
+
+; IR: indirect.exit1:
+; IR: %load = load volatile i32, i32 addrspace(1)* undef
+; IR: store volatile i32 %load, i32 addrspace(1)* undef
+; IR: store volatile i32 9, i32 addrspace(1)* undef
+; IR: call void @llvm.amdgcn.unreachable()
+; IR-NEXT: br label %Flow2
+
+; IR: UnifiedReturnBlock: ; preds = %exit0, %Flow2
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14)
+; IR-NEXT: ret void
+; Like the ret/unreachable case, except that the unreachable block exit1 is
+; only entered through the intermediate block indirect.exit1, which ends in an
+; unconditional branch. The conditional branches sit one level further up.
+define amdgpu_kernel void @indirect_multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
+entry:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tmp1 = add i32 0, %tmp
+ %tmp2 = zext i32 %tmp1 to i64
+ %tmp3 = add i64 0, %tmp2
+ %tmp4 = shl i64 %tmp3, 32
+ %tmp5 = ashr exact i64 %tmp4, 32
+ %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
+ %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+ %tmp8 = sext i32 %tmp7 to i64
+ %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
+ %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+ %tmp13 = zext i32 %tmp10 to i64
+ %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
+ %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+ %Pivot = icmp slt i32 %tmp16, 2
+ br i1 %Pivot, label %LeafBlock, label %LeafBlock1
+
+LeafBlock: ; preds = %entry
+ %SwitchLeaf = icmp eq i32 %tmp16, 1
+ br i1 %SwitchLeaf, label %exit0, label %indirect.exit1
+
+LeafBlock1: ; preds = %entry
+ %SwitchLeaf2 = icmp eq i32 %tmp16, 2
+ br i1 %SwitchLeaf2, label %exit0, label %indirect.exit1
+
+exit0: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 17, i32 addrspace(3)* undef
+ ret void
+
+indirect.exit1: ; preds = %LeafBlock, %LeafBlock1
+ %load = load volatile i32, i32 addrspace(1)* undef
+ store volatile i32 %load, i32 addrspace(1)* undef
+ br label %exit1
+
+exit1: ; preds = %indirect.exit1
+ store volatile i32 9, i32 addrspace(1)* undef
+ unreachable
+}
+
+; IR-LABEL: @multi_divergent_region_exit_ret_switch(
+; The entry block ends in a switch on the workitem-id-derived load %tmp16
+; rather than a conditional branch. The switch targets both leaf blocks and
+; also both exit blocks directly (exit0 via case 3, exit1 via the default).
+define amdgpu_kernel void @multi_divergent_region_exit_ret_switch(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
+entry:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tmp1 = add i32 0, %tmp
+ %tmp2 = zext i32 %tmp1 to i64
+ %tmp3 = add i64 0, %tmp2
+ %tmp4 = shl i64 %tmp3, 32
+ %tmp5 = ashr exact i64 %tmp4, 32
+ %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
+ %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+ %tmp8 = sext i32 %tmp7 to i64
+ %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
+ %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+ %tmp13 = zext i32 %tmp10 to i64
+ %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
+ %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+ switch i32 %tmp16, label %exit1
+ [ i32 1, label %LeafBlock
+ i32 2, label %LeafBlock1
+ i32 3, label %exit0 ]
+
+LeafBlock: ; preds = %entry
+ %SwitchLeaf = icmp eq i32 %tmp16, 1
+ br i1 %SwitchLeaf, label %exit0, label %exit1
+
+LeafBlock1: ; preds = %entry
+ %SwitchLeaf2 = icmp eq i32 %tmp16, 2
+ br i1 %SwitchLeaf2, label %exit0, label %exit1
+
+exit0: ; preds = %entry, %LeafBlock, %LeafBlock1
+ store volatile i32 17, i32 addrspace(3)* undef
+ ret void
+
+exit1: ; preds = %entry, %LeafBlock, %LeafBlock1
+ store volatile i32 9, i32 addrspace(1)* undef
+ unreachable
+}
+
+; IR-LABEL: @divergent_multi_ret_nest_in_uniform_triangle(
+; The outer branch compares only the kernel argument %arg0 and selects between
+; a plain return block and an inner region. The inner branch compares the
+; workitem id and leads to two further return blocks.
+define amdgpu_kernel void @divergent_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 {
+entry:
+ %uniform.cond0 = icmp eq i32 %arg0, 4
+ br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret
+
+divergent.multi.exit.region:
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %divergent.cond0 = icmp eq i32 %id.x, 0
+ br i1 %divergent.cond0, label %divergent.ret0, label %divergent.ret1
+
+divergent.ret0:
+ store volatile i32 11, i32 addrspace(3)* undef
+ ret void
+
+divergent.ret1:
+ store volatile i32 42, i32 addrspace(3)* undef
+ ret void
+
+uniform.ret:
+ store volatile i32 9, i32 addrspace(1)* undef
+ ret void
+}
+
+; IR-LABEL: @divergent_complex_multi_ret_nest_in_uniform_triangle(
+; Same outer shape as the simple nest-in-uniform-triangle case (the entry
+; branch compares only %arg0), but the inner region has more structure: an
+; if/then/endif chain whose conditions come from the workitem id and from
+; volatile float loads, ending in divergent.ret0.
+define amdgpu_kernel void @divergent_complex_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 {
+entry:
+ %uniform.cond0 = icmp eq i32 %arg0, 4
+ br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret
+
+divergent.multi.exit.region:
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %divergent.cond0 = icmp eq i32 %id.x, 0
+ br i1 %divergent.cond0, label %divergent.if, label %divergent.ret1
+
+divergent.if:
+ %vgpr0 = load volatile float, float addrspace(1)* undef
+ %divergent.cond1 = fcmp ogt float %vgpr0, 1.0
+ br i1 %divergent.cond1, label %divergent.then, label %divergent.endif
+
+divergent.then:
+ %vgpr1 = load volatile float, float addrspace(1)* undef
+ %divergent.cond2 = fcmp olt float %vgpr1, 4.0
+ store volatile i32 33, i32 addrspace(1)* undef
+ br i1 %divergent.cond2, label %divergent.ret0, label %divergent.endif
+
+divergent.endif:
+ store volatile i32 38, i32 addrspace(1)* undef
+ br label %divergent.ret0
+
+divergent.ret0:
+ store volatile i32 11, i32 addrspace(3)* undef
+ ret void
+
+divergent.ret1:
+ store volatile i32 42, i32 addrspace(3)* undef
+ ret void
+
+uniform.ret:
+ store volatile i32 9, i32 addrspace(1)* undef
+ ret void
+}
+
+; IR-LABEL: @uniform_complex_multi_ret_nest_in_divergent_triangle(
+; IR: Flow1: ; preds = %uniform.ret1, %uniform.multi.exit.region
+; IR: %8 = phi i1 [ false, %uniform.ret1 ], [ true, %uniform.multi.exit.region ]
+; IR: br i1 %8, label %uniform.if, label %Flow2
+
+; IR: Flow: ; preds = %uniform.then, %uniform.if
+; IR: %11 = phi i1 [ %10, %uniform.then ], [ %9, %uniform.if ]
+; IR: br i1 %11, label %uniform.endif, label %uniform.ret0
+
+; IR: UnifiedReturnBlock: ; preds = %Flow3, %Flow2
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %6)
+; IR-NEXT: ret void
+; Inverse nesting: the entry branch compares the workitem id, while the
+; conditions inside the nested region depend only on the kernel argument %arg0
+; and on volatile loads from addrspace(2). The function has three return blocks.
+define amdgpu_kernel void @uniform_complex_multi_ret_nest_in_divergent_triangle(i32 %arg0) #0 {
+entry:
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %divergent.cond0 = icmp eq i32 %id.x, 0
+ br i1 %divergent.cond0, label %uniform.multi.exit.region, label %divergent.ret
+
+uniform.multi.exit.region:
+ %uniform.cond0 = icmp eq i32 %arg0, 4
+ br i1 %uniform.cond0, label %uniform.if, label %uniform.ret1
+
+uniform.if:
+ %sgpr0 = load volatile i32, i32 addrspace(2)* undef
+ %uniform.cond1 = icmp slt i32 %sgpr0, 1
+ br i1 %uniform.cond1, label %uniform.then, label %uniform.endif
+
+uniform.then:
+ %sgpr1 = load volatile i32, i32 addrspace(2)* undef
+ %uniform.cond2 = icmp sge i32 %sgpr1, 4
+ store volatile i32 33, i32 addrspace(1)* undef
+ br i1 %uniform.cond2, label %uniform.ret0, label %uniform.endif
+
+uniform.endif:
+ store volatile i32 38, i32 addrspace(1)* undef
+ br label %uniform.ret0
+
+uniform.ret0:
+ store volatile i32 11, i32 addrspace(3)* undef
+ ret void
+
+uniform.ret1:
+ store volatile i32 42, i32 addrspace(3)* undef
+ ret void
+
+divergent.ret:
+ store volatile i32 9, i32 addrspace(1)* undef
+ ret void
+}
+
+; IR-LABEL: @multi_divergent_unreachable_exit(
+; IR: UnifiedUnreachableBlock:
+; IR-NEXT: call void @llvm.amdgcn.unreachable()
+; IR-NEXT: br label %UnifiedReturnBlock
+
+; IR: UnifiedReturnBlock:
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64
+; IR-NEXT: ret void
+; A switch on the workitem id sends two of its cases straight to unreachable
+; blocks (bb1, bb2). The default goes to bb3, which switches on undef between a
+; return (bb4) and a third unreachable block (bb5).
+define amdgpu_kernel void @multi_divergent_unreachable_exit() #0 {
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+ switch i32 %tmp, label %bb3 [
+ i32 2, label %bb1
+ i32 0, label %bb2
+ ]
+
+bb1: ; preds = %bb
+ unreachable
+
+bb2: ; preds = %bb
+ unreachable
+
+bb3: ; preds = %bb
+ switch i32 undef, label %bb5 [
+ i32 2, label %bb4
+ ]
+
+bb4: ; preds = %bb3
+ ret void
+
+bb5: ; preds = %bb3
+ unreachable
+}
+
+; Workitem-id intrinsic; the tests in this file compare or switch on its
+; result to form the conditions they name as divergent.
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+; #0 is attached to the test functions, #1 to the intrinsic declaration.
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
; This should end with a no-op sequence of exec mask manipulations.
; The mask should be in its original state after executing the unreachable block.
-; GCN-LABEL: {{^}}main:
+
+; GCN-LABEL: {{^}}uniform_br_trivial_ret_divergent_br_trivial_unreachable:
; GCN: s_cbranch_scc1 [[RET_BB:BB[0-9]+_[0-9]+]]
+; GCN-NEXT: ; %else
+
; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
; GCN-NEXT: s_xor_b64 [[XOR_EXEC:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_EXEC]]
-; GCN-NEXT: ; mask branch [[UNREACHABLE_BB:BB[0-9]+_[0-9]+]]
+; GCN-NEXT: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]]
-; GCN: [[RET_BB]]:
-; GCN-NEXT: s_branch [[FINAL_BB:BB[0-9]+_[0-9]+]]
+; GCN: BB{{[0-9]+_[0-9]+}}: ; %unreachable.bb
+; GCN-NEXT: ; divergent unreachable
-; GCN-NEXT: [[UNREACHABLE_BB]]:
-; GCN-NEXT: [[FINAL_BB]]:
+; GCN-NEXT: {{^}}[[FLOW]]: ; %Flow
+; GCN-NEXT: s_or_b64 exec, exec
+
+; GCN-NEXT: [[RET_BB]]:
+; GCN-NEXT: ; return
; GCN-NEXT: .Lfunc_end0
-define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <8 x i32>] addrspace(2)* byval %arg2, i32 addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, i32 %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 {
+; amdgpu_ps function in which the entry block branches on the inreg argument
+; %arg17 (named uniform.cond) to ret.bb or else, and else branches on an fcmp
+; result (named divergent.cond) to ret.bb or unreachable.bb. Both exit blocks
+; contain only their terminator.
+define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @uniform_br_trivial_ret_divergent_br_trivial_unreachable([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <8 x i32>] addrspace(2)* byval %arg2, i32 addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, i32 inreg %arg17, i32 %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 {
+entry:
+ %i.i = extractelement <2 x i32> %arg7, i32 0
+ %j.i = extractelement <2 x i32> %arg7, i32 1
+ %i.f.i = bitcast i32 %i.i to float
+ %j.f.i = bitcast i32 %j.i to float
+ %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 1, i32 0, i32 %arg5) #2
+ %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 1, i32 0, i32 %arg5) #2
+ %p87 = fmul float undef, %p2.i
+ %p88 = fadd float %p87, undef
+ %p93 = fadd float %p88, undef
+ %p97 = fmul float %p93, undef
+ %p102 = fsub float %p97, undef
+ %p104 = fmul float %p102, undef
+ %p106 = fadd float 0.000000e+00, %p104
+ %p108 = fadd float undef, %p106
+ %uniform.cond = icmp slt i32 %arg17, 0
+ br i1 %uniform.cond, label %ret.bb, label %else
+
+; The second branch condition is derived from the interpolated value %p108.
+else: ; preds = %entry
+ %p124 = fmul float %p108, %p108
+ %p125 = fsub float %p124, undef
+ %divergent.cond = fcmp olt float %p125, 0.000000e+00
+ br i1 %divergent.cond, label %ret.bb, label %unreachable.bb
+
+unreachable.bb: ; preds = %else
+ unreachable
+
+ret.bb: ; preds = %else, %entry
+ ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef
+}
+
+; GCN-LABEL: {{^}}uniform_br_nontrivial_ret_divergent_br_nontrivial_unreachable:
+; GCN: s_cbranch_vccnz [[RET_BB:BB[0-9]+_[0-9]+]]
+
+; GCN: ; BB#{{[0-9]+}}: ; %else
+; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
+; GCN-NEXT: s_xor_b64 [[XOR_EXEC:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_EXEC]]
+; GCN-NEXT: ; mask branch [[FLOW1:BB[0-9]+_[0-9]+]]
+
+; GCN-NEXT: ; %unreachable.bb
+; GCN: ds_write_b32
+; GCN: s_waitcnt
+; GCN: ; divergent unreachable
+
+; GCN: ; %ret.bb
+; GCN: store_dword
+
+; GCN: ; %UnifiedReturnBlock
+; GCN-NEXT: s_or_b64 exec, exec
+; GCN-NEXT: ; return
+; GCN-NEXT: .Lfunc_end
+; amdgpu_ps function in which main_body branches on the inreg argument %arg18
+; (named uniform.cond) to ret.bb or else, and else branches on an fcmp result
+; (named divergent.cond) to ret.bb or unreachable.bb. Unlike a bare
+; terminator-only exit, each exit block here performs a volatile store first.
+define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @uniform_br_nontrivial_ret_divergent_br_nontrivial_unreachable([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <8 x i32>] addrspace(2)* byval %arg2, i32 addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, i32 inreg %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 {
main_body:
%i.i = extractelement <2 x i32> %arg7, i32 0
%j.i = extractelement <2 x i32> %arg7, i32 1
%p104 = fmul float %p102, undef
%p106 = fadd float 0.000000e+00, %p104
%p108 = fadd float undef, %p106
- br i1 undef, label %ENDIF69, label %ELSE
+ %uniform.cond = icmp slt i32 %arg18, 0
+ br i1 %uniform.cond, label %ret.bb, label %else
-ELSE: ; preds = %main_body
+else: ; preds = %main_body
%p124 = fmul float %p108, %p108
%p125 = fsub float %p124, undef
- %p126 = fcmp olt float %p125, 0.000000e+00
- br i1 %p126, label %ENDIF69, label %ELSE41
+ %divergent.cond = fcmp olt float %p125, 0.000000e+00
+ br i1 %divergent.cond, label %ret.bb, label %unreachable.bb
-ELSE41: ; preds = %ELSE
+; Volatile addrspace(3) store executed before the unreachable terminator.
+unreachable.bb: ; preds = %else
+ store volatile i32 8, i32 addrspace(3)* undef
unreachable
-ENDIF69: ; preds = %ELSE, %main_body
+; Volatile addrspace(1) store executed before the return.
+ret.bb: ; preds = %else, %main_body
+ store volatile i32 11, i32 addrspace(1)* undef
ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef
}