From: Matt Arsenault Date: Fri, 24 Mar 2017 19:52:05 +0000 (+0000) Subject: AMDGPU: Unify divergent function exits. X-Git-Tag: llvmorg-5.0.0-rc1~9221 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=b8f8dbc227e8d08d6685bb2bc3131ac86e3ac24e;p=platform%2Fupstream%2Fllvm.git AMDGPU: Unify divergent function exits. StructurizeCFG can't handle cases with multiple returns creating regions with multiple exits. Create a copy of UnifyFunctionExitNodes that only unifies exit nodes that skips exit nodes with uniform branch sources. llvm-svn: 298729 --- diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 3f6fa3a..d825a4c 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -705,6 +705,9 @@ def int_amdgcn_loop : Intrinsic<[llvm_i1_ty], def int_amdgcn_end_cf : Intrinsic<[], [llvm_i64_ty], [IntrConvergent]>; +// Represent unreachable in a divergent region. +def int_amdgcn_unreachable : Intrinsic<[], [], [IntrConvergent]>; + // Emit 2.5 ulp, no denormal division. Should only be inserted by // pass based on !fpmath metadata. def int_amdgcn_fdiv_fast : Intrinsic< diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 3e1e64a0..99d71f5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -123,6 +123,9 @@ extern char &SIDebuggerInsertNopsID; void initializeSIInsertWaitsPass(PassRegistry&); extern char &SIInsertWaitsID; +void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry&); +extern char &AMDGPUUnifyDivergentExitNodesID; + ImmutablePass *createAMDGPUAAWrapperPass(); void initializeAMDGPUAAWrapperPassPass(PassRegistry&); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 93c2814..46e458a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -132,6 +132,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeSIInsertSkipsPass(*PR); initializeSIDebuggerInsertNopsPass(*PR); initializeSIOptimizeExecMaskingPass(*PR); + initializeAMDGPUUnifyDivergentExitNodesPass(*PR); initializeAMDGPUAAWrapperPassPass(*PR); } @@ -673,6 +674,10 @@ bool GCNPassConfig::addPreISel() { // supported. const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine(); addPass(createAMDGPUAnnotateKernelFeaturesPass(&TM)); + + // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit + // regions formed by them. + addPass(&AMDGPUUnifyDivergentExitNodesID); addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions addPass(createSinkingPass()); addPass(createSITypeRewriter()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp new file mode 100644 index 0000000..309913f --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -0,0 +1,225 @@ +//===- AMDGPUUnifyDivergentExitNodes.cpp ----------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This is a variant of the UnifyDivergentExitNodes pass. Rather than ensuring +// there is at most one ret and one unreachable instruction, it ensures there is +// at most one divergent exiting block. 
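+//
+// As a rough sketch of the transform (the blocks here are invented for
+// illustration, but the names match what this pass creates), two divergent
+// returns such as
+//
+//   exit0:                             exit1:
+//     ret void                           ret void
+//
+// are rewritten to branch to one new block:
+//
+//   exit0:                             exit1:
+//     br label %UnifiedReturnBlock       br label %UnifiedReturnBlock
+//
+//   UnifiedReturnBlock:
+//     ret void
+//
+// For non-void functions a phi named UnifiedRetVal merges the returned values.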
+// +// StructurizeCFG can't deal with multi-exit regions formed by branches to +// multiple return nodes. It is not desirable to structurize regions with +// uniform branches, so unifying those to the same return block as divergent +// branches inhibits use of scalar branching. It still can't deal with the case +// where one branch goes to return, and one unreachable. Replace unreachable in +// this case with a return. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Type.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-unify-divergent-exit-nodes" + +namespace { + +class AMDGPUUnifyDivergentExitNodes : public FunctionPass { +public: + static char ID; // Pass identification, replacement for typeid + AMDGPUUnifyDivergentExitNodes() : FunctionPass(ID) { + initializeAMDGPUUnifyDivergentExitNodesPass(*PassRegistry::getPassRegistry()); + } + + // We can preserve non-critical-edgeness when we unify function exit nodes + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnFunction(Function &F) override; +}; + +} + +char AMDGPUUnifyDivergentExitNodes::ID = 0; +INITIALIZE_PASS_BEGIN(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE, + "Unify divergent function exit nodes", false, false) +INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) +INITIALIZE_PASS_END(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE, + "Unify divergent function exit nodes", false, false) + +char &llvm::AMDGPUUnifyDivergentExitNodesID = AMDGPUUnifyDivergentExitNodes::ID; + +void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{ + // TODO: Preserve dominator tree. + AU.addRequired(); + + AU.addRequired(); + + // No divergent values are changed, only blocks and branch edges. + AU.addPreserved(); + + // We preserve the non-critical-edgeness property + AU.addPreservedID(BreakCriticalEdgesID); + + // This is a cluster of orthogonal Transforms + AU.addPreservedID(LowerSwitchID); + FunctionPass::getAnalysisUsage(AU); + + AU.addRequired(); +} + +/// \returns true if \p BB is reachable through only uniform branches. +/// XXX - Is there a more efficient way to find this? +static bool isUniformlyReached(const DivergenceAnalysis &DA, + BasicBlock &BB) { + SmallVector Stack; + SmallPtrSet Visited; + + for (BasicBlock *Pred : predecessors(&BB)) + Stack.push_back(Pred); + + while (!Stack.empty()) { + BasicBlock *Top = Stack.pop_back_val(); + if (!DA.isUniform(Top->getTerminator())) + return false; + + for (BasicBlock *Pred : predecessors(Top)) { + if (Visited.insert(Pred).second) + Stack.push_back(Pred); + } + } + + return true; +} + +static BasicBlock *unifyReturnBlockSet(Function &F, + ArrayRef ReturningBlocks, + const TargetTransformInfo &TTI, + StringRef Name) { + // Otherwise, we need to insert a new basic block into the function, add a PHI + // nodes (if the function returns values), and convert all of the return + // instructions into unconditional branches. 
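+  // In this copy of the pass the caller only collects exit blocks that are not
+  // reached purely through uniform branches, so every return funneled through
+  // here is a divergent exit.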
+ // + BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F); + + PHINode *PN = nullptr; + if (F.getReturnType()->isVoidTy()) { + ReturnInst::Create(F.getContext(), nullptr, NewRetBlock); + } else { + // If the function doesn't return void... add a PHI node to the block... + PN = PHINode::Create(F.getReturnType(), ReturningBlocks.size(), + "UnifiedRetVal"); + NewRetBlock->getInstList().push_back(PN); + ReturnInst::Create(F.getContext(), PN, NewRetBlock); + } + + // Loop over all of the blocks, replacing the return instruction with an + // unconditional branch. + // + for (BasicBlock *BB : ReturningBlocks) { + // Add an incoming element to the PHI node for every return instruction that + // is merging into this new block... + if (PN) + PN->addIncoming(BB->getTerminator()->getOperand(0), BB); + + BB->getInstList().pop_back(); // Remove the return insn + BranchInst::Create(NewRetBlock, BB); + } + + for (BasicBlock *BB : ReturningBlocks) { + // Cleanup possible branch to unconditional branch to the return. + SimplifyCFG(BB, TTI, 2); + } + + return NewRetBlock; +} + +bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) { + auto &PDT = getAnalysis().getPostDomTree(); + if (PDT.getRoots().size() <= 1) + return false; + + DivergenceAnalysis &DA = getAnalysis(); + + // Loop over all of the blocks in a function, tracking all of the blocks that + // return. + // + SmallVector ReturningBlocks; + SmallVector UnreachableBlocks; + + for (BasicBlock *BB : PDT.getRoots()) { + if (isa(BB->getTerminator())) { + if (!isUniformlyReached(DA, *BB)) + ReturningBlocks.push_back(BB); + } else if (isa(BB->getTerminator())) { + if (!isUniformlyReached(DA, *BB)) + UnreachableBlocks.push_back(BB); + } + } + + if (!UnreachableBlocks.empty()) { + BasicBlock *UnreachableBlock = nullptr; + + if (UnreachableBlocks.size() == 1) { + UnreachableBlock = UnreachableBlocks.front(); + } else { + UnreachableBlock = BasicBlock::Create(F.getContext(), + "UnifiedUnreachableBlock", &F); + new UnreachableInst(F.getContext(), UnreachableBlock); + + for (BasicBlock *BB : UnreachableBlocks) { + BB->getInstList().pop_back(); // Remove the unreachable inst. + BranchInst::Create(UnreachableBlock, BB); + } + } + + if (!ReturningBlocks.empty()) { + // Don't create a new unreachable inst if we have a return. The + // structurizer/annotator can't handle the multiple exits + + Type *RetTy = F.getReturnType(); + Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy); + UnreachableBlock->getInstList().pop_back(); // Remove the unreachable inst. + + Function *UnreachableIntrin = + Intrinsic::getDeclaration(F.getParent(), Intrinsic::amdgcn_unreachable); + + // Insert a call to an intrinsic tracking that this is an unreachable + // point, in case we want to kill the active lanes or something later. + CallInst::Create(UnreachableIntrin, {}, "", UnreachableBlock); + + // Don't create a scalar trap. We would only want to trap if this code was + // really reached, but a scalar trap would happen even if no lanes + // actually reached here. + ReturnInst::Create(F.getContext(), RetVal, UnreachableBlock); + ReturningBlocks.push_back(UnreachableBlock); + } + } + + // Now handle return blocks. 
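+  // At this point ReturningBlocks may also contain the return block synthesized
+  // from the divergent unreachables above; merging is only needed if more than
+  // one divergent return remains.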
+ if (ReturningBlocks.empty()) + return false; // No blocks return + + if (ReturningBlocks.size() == 1) + return false; // Already has a single return block + + const TargetTransformInfo &TTI + = getAnalysis().getTTI(F); + + unifyReturnBlockSet(F, ReturningBlocks, TTI, "UnifiedReturnBlock"); + return true; +} diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index ffdcadd..689fc97 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -58,6 +58,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUInstrInfo.cpp AMDGPUPromoteAlloca.cpp AMDGPURegisterInfo.cpp + AMDGPUUnifyDivergentExitNodes.cpp GCNHazardRecognizer.cpp GCNSchedStrategy.cpp R600ClauseMergePass.cpp diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td index 1db2212..b83a1fe 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td +++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td @@ -138,19 +138,19 @@ class InstSI pattern = []> - : InstSI { +class PseudoInstSI pattern = [], string asm = ""> + : InstSI { let isPseudo = 1; let isCodeGenOnly = 1; } -class SPseudoInstSI pattern = []> - : PseudoInstSI { +class SPseudoInstSI pattern = [], string asm = ""> + : PseudoInstSI { let SALU = 1; } -class VPseudoInstSI pattern = []> - : PseudoInstSI { +class VPseudoInstSI pattern = [], string asm = ""> + : PseudoInstSI { let VALU = 1; let Uses = [EXEC]; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 40d35bf..e2e0895 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3803,16 +3803,11 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { if (DescSize != 0 && DescSize != 4) return DescSize; - if (Opc == AMDGPU::WAVE_BARRIER) - return 0; - // 4-byte instructions may have a 32-bit literal encoded after them. Check // operands that coud ever be literals. if (isVALU(MI) || isSALU(MI)) { - if (isFixedSize(MI)) { - assert(DescSize == 4); + if (isFixedSize(MI)) return DescSize; - } int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); if (Src0Idx == -1) @@ -3835,7 +3830,6 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { return 4; switch (Opc) { - case AMDGPU::SI_MASK_BRANCH: case TargetOpcode::IMPLICIT_DEF: case TargetOpcode::KILL: case TargetOpcode::DBG_VALUE: diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index cc76648..9e343aa 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -152,6 +152,8 @@ def WAVE_BARRIER : SPseudoInstSI<(outs), (ins), let mayStore = 1; let isBarrier = 1; let isConvergent = 1; + let FixedSize = 1; + let Size = 0; } // SI pseudo instructions. These are used by the CFG structurizer pass @@ -159,14 +161,15 @@ def WAVE_BARRIER : SPseudoInstSI<(outs), (ins), // Dummy terminator instruction to use after control flow instructions // replaced with exec mask operations. 
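+// Note: SI_MASK_BRANCH (like WAVE_BARRIER above) is now marked FixedSize with
+// Size = 0, which is what lets the SIInstrInfo::getInstSizeInBytes change in
+// this commit drop its per-opcode special cases.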
-def SI_MASK_BRANCH : PseudoInstSI < +def SI_MASK_BRANCH : VPseudoInstSI < (outs), (ins brtarget:$target)> { let isBranch = 0; let isTerminator = 1; let isBarrier = 0; - let Uses = [EXEC]; let SchedRW = []; let hasNoSchedulingInfo = 1; + let FixedSize = 1; + let Size = 0; } let isTerminator = 1 in { @@ -260,6 +263,14 @@ def SI_PS_LIVE : PseudoInstSI < let SALU = 1; } +def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins), + [(int_amdgcn_unreachable)], + "; divergent unreachable"> { + let Size = 0; + let hasNoSchedulingInfo = 1; + let FixedSize = 1; +} + // Used as an isel pseudo to directly emit initialization with an // s_mov_b32 rather than a copy of another initialized // register. MachineCSE skips copies, and we don't want to have to diff --git a/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll b/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll index 94616a4..68b77ea 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll @@ -15,12 +15,16 @@ ; GCN: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[OTHERCC]] ; GCN: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[AND]] ; GCN: s_xor_b64 {{s\[[0-9]+:[0-9]+\]}}, exec, [[SAVED]] -; -; TODO: The following sequence is a bug (missing s_endpgm)! -; -; GCN: s_branch [[BB:BB[0-9]+_[0-9]+]] -; GCN: [[BB]]: -; GCN-NEXT: .Lfunc_end0: +; GCN: ; mask branch [[BB5:BB[0-9]+_[0-9]+]] + +; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; %bb4 +; GCN: ds_write_b32 +; GCN: s_waitcnt + +; GCN-NEXT: [[BB5]] +; GCN: s_or_b64 exec, exec +; GCN-NEXT: s_endpgm +; GCN-NEXT: .Lfunc_end define amdgpu_ps void @ham(float %arg, float %arg1) #0 { bb: %tmp = fcmp ogt float %arg, 0.000000e+00 @@ -29,6 +33,7 @@ bb: br i1 %tmp3, label %bb4, label %bb5 bb4: ; preds = %bb + store volatile i32 4, i32 addrspace(3)* undef unreachable bb5: ; preds = %bb diff --git a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll new file mode 100644 index 0000000..9d0b6b3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll @@ -0,0 +1,710 @@ +; RUN: opt -mtriple=amdgcn-- -S -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow %s | FileCheck -check-prefix=IR %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; Add an extra verifier runs. There were some cases where invalid IR +; was produced but happened to be fixed by the later passes. + +; Make sure divergent control flow with multiple exits from a region +; is properly handled. UnifyFunctionExitNodes should be run before +; StructurizeCFG. 
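+;
+; The common expectation below is that divergent exits are funneled into a
+; single UnifiedReturnBlock (or UnifiedUnreachableBlock) before structurization,
+; while exits reached only through uniform branches are left alone.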
+ +; IR-LABEL: @multi_divergent_region_exit_ret_ret( +; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0) +; IR: %2 = extractvalue { i1, i64 } %1, 0 +; IR: %3 = extractvalue { i1, i64 } %1, 1 +; IR: br i1 %2, label %LeafBlock1, label %Flow + +; IR: Flow: +; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] +; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ] +; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) +; IR: %7 = extractvalue { i1, i64 } %6, 0 +; IR: %8 = extractvalue { i1, i64 } %6, 1 +; IR: br i1 %7, label %LeafBlock, label %Flow1 + +; IR: LeafBlock: +; IR: br label %Flow1 + +; IR: LeafBlock1: +; IR: br label %Flow{{$}} + +; IR: Flow2: +; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %19) +; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) +; IR: %13 = extractvalue { i1, i64 } %12, 0 +; IR: %14 = extractvalue { i1, i64 } %12, 1 +; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock + +; IR: exit0: +; IR: store volatile i32 9, i32 addrspace(1)* undef +; IR: br label %UnifiedReturnBlock + +; IR: Flow1: +; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ] +; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ] +; IR: call void @llvm.amdgcn.end.cf(i64 %8) +; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16) +; IR: %18 = extractvalue { i1, i64 } %17, 0 +; IR: %19 = extractvalue { i1, i64 } %17, 1 +; IR: br i1 %18, label %exit1, label %Flow2 + +; IR: exit1: +; IR: store volatile i32 17, i32 addrspace(3)* undef +; IR: br label %Flow2 + +; IR: UnifiedReturnBlock: +; IR: call void @llvm.amdgcn.end.cf(i64 %14) +; IR: ret void + + +; GCN-LABEL: {{^}}multi_divergent_region_exit_ret_ret: +; GCN: v_cmp_lt_i32_e32 vcc, 1 +; GCN: s_and_saveexec_b64 +; GCN: s_xor_b64 + + +; FIXME: Why is this compare essentially repeated? 
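+; (Presumably because the Flow phis need both the condition and its negation,
+; and each ends up materialized as its own compare/select pair.)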
+; GCN: v_cmp_eq_u32_e32 vcc, 1, [[REG:v[0-9]+]] +; GCN-NEXT: v_cmp_ne_u32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1, [[REG]] +; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc +; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1 + +; GCN: ; %Flow1 +; GCN-NEXT: s_or_b64 exec, exec +; GCN: v_cmp_ne_u32_e32 vcc, 0 + +; GCN: ; %exit1 +; GCN: ds_write_b32 + +; GCN: %Flow2 +; GCN-NEXT: s_or_b64 exec, exec +; GCN: v_cmp_ne_u32_e32 vcc, 0 +; GCN-NEXT: s_and_saveexec_b64 +; GCN-NEXT: s_xor_b64 + +; GCN: ; %exit0 +; GCN: buffer_store_dword + +; GCN: ; %UnifiedReturnBlock +; GCN-NEXT: s_or_b64 exec, exec +; GCN-NEXT: s_endpgm +define amdgpu_kernel void @multi_divergent_region_exit_ret_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 { +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %tmp1 = add i32 0, %tmp + %tmp2 = zext i32 %tmp1 to i64 + %tmp3 = add i64 0, %tmp2 + %tmp4 = shl i64 %tmp3, 32 + %tmp5 = ashr exact i64 %tmp4, 32 + %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5 + %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4 + %tmp8 = sext i32 %tmp7 to i64 + %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8 + %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4 + %tmp13 = zext i32 %tmp10 to i64 + %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13 + %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16 + %Pivot = icmp slt i32 %tmp16, 2 + br i1 %Pivot, label %LeafBlock, label %LeafBlock1 + +LeafBlock: ; preds = %entry + %SwitchLeaf = icmp eq i32 %tmp16, 1 + br i1 %SwitchLeaf, label %exit0, label %exit1 + +LeafBlock1: ; preds = %entry + %SwitchLeaf2 = icmp eq i32 %tmp16, 2 + br i1 %SwitchLeaf2, label %exit0, label %exit1 + +exit0: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 9, i32 addrspace(1)* undef + ret void + +exit1: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 17, i32 addrspace(3)* undef + ret void +} + +; IR-LABEL: @multi_divergent_region_exit_unreachable_unreachable( +; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0) + +; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) + +; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %19) +; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) +; IR: br i1 %13, label %exit0, label %UnifiedUnreachableBlock + + +; IR: UnifiedUnreachableBlock: +; IR-NEXT: unreachable + + +; FIXME: Probably should insert an s_endpgm anyway. 
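+; (As the checks below show, the unified unreachable block currently falls
+; straight through to the end of the function without terminating the wave.)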
+; GCN-LABEL: {{^}}multi_divergent_region_exit_unreachable_unreachable: +; GCN: ; %UnifiedUnreachableBlock +; GCN-NEXT: .Lfunc_end +define amdgpu_kernel void @multi_divergent_region_exit_unreachable_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 { +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %tmp1 = add i32 0, %tmp + %tmp2 = zext i32 %tmp1 to i64 + %tmp3 = add i64 0, %tmp2 + %tmp4 = shl i64 %tmp3, 32 + %tmp5 = ashr exact i64 %tmp4, 32 + %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5 + %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4 + %tmp8 = sext i32 %tmp7 to i64 + %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8 + %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4 + %tmp13 = zext i32 %tmp10 to i64 + %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13 + %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16 + %Pivot = icmp slt i32 %tmp16, 2 + br i1 %Pivot, label %LeafBlock, label %LeafBlock1 + +LeafBlock: ; preds = %entry + %SwitchLeaf = icmp eq i32 %tmp16, 1 + br i1 %SwitchLeaf, label %exit0, label %exit1 + +LeafBlock1: ; preds = %entry + %SwitchLeaf2 = icmp eq i32 %tmp16, 2 + br i1 %SwitchLeaf2, label %exit0, label %exit1 + +exit0: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 9, i32 addrspace(1)* undef + unreachable + +exit1: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 17, i32 addrspace(3)* undef + unreachable +} + +; IR-LABEL: @multi_exit_region_divergent_ret_uniform_ret( +; IR: %divergent.cond0 = icmp slt i32 %tmp16, 2 +; IR: llvm.amdgcn.if +; IR: br i1 + +; IR: {{^}}Flow: +; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] +; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ] +; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) +; IR: br i1 %7, label %LeafBlock, label %Flow1 + +; IR: {{^}}LeafBlock: +; IR: %divergent.cond1 = icmp eq i32 %tmp16, 1 +; IR: %9 = xor i1 %divergent.cond1, true +; IR: br label %Flow1 + +; IR: LeafBlock1: +; IR: %uniform.cond0 = icmp eq i32 %arg3, 2 +; IR: %10 = xor i1 %uniform.cond0, true +; IR: br label %Flow + +; IR: Flow2: +; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %19) +; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) +; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock + +; IR: exit0: +; IR: store volatile i32 9, i32 addrspace(1)* undef +; IR: br label %UnifiedReturnBlock + +; IR: {{^}}Flow1: +; IR: %15 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %4, %Flow ] +; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ] +; IR: call void @llvm.amdgcn.end.cf(i64 %8) +; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16) +; IR: %18 = extractvalue { i1, i64 } %17, 0 +; IR: %19 = extractvalue { i1, i64 } %17, 1 +; IR: br i1 %18, label %exit1, label %Flow2 + +; IR: exit1: +; IR: store volatile i32 17, i32 addrspace(3)* undef +; IR: br label %Flow2 + +; IR: UnifiedReturnBlock: +; IR: call void @llvm.amdgcn.end.cf(i64 %14) +; IR: ret void +define amdgpu_kernel void @multi_exit_region_divergent_ret_uniform_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 { +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %tmp1 = add i32 0, %tmp + %tmp2 = zext i32 %tmp1 to i64 + %tmp3 = add i64 0, %tmp2 + %tmp4 = shl i64 %tmp3, 32 + %tmp5 = ashr exact i64 %tmp4, 32 + %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5 + %tmp7 
= load i32, i32 addrspace(1)* %tmp6, align 4 + %tmp8 = sext i32 %tmp7 to i64 + %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8 + %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4 + %tmp13 = zext i32 %tmp10 to i64 + %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13 + %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16 + %divergent.cond0 = icmp slt i32 %tmp16, 2 + br i1 %divergent.cond0, label %LeafBlock, label %LeafBlock1 + +LeafBlock: ; preds = %entry + %divergent.cond1 = icmp eq i32 %tmp16, 1 + br i1 %divergent.cond1, label %exit0, label %exit1 + +LeafBlock1: ; preds = %entry + %uniform.cond0 = icmp eq i32 %arg3, 2 + br i1 %uniform.cond0, label %exit0, label %exit1 + +exit0: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 9, i32 addrspace(1)* undef + ret void + +exit1: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 17, i32 addrspace(3)* undef + ret void +} + +; IR-LABEL: @multi_exit_region_uniform_ret_divergent_ret( +; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0) +; IR: br i1 %2, label %LeafBlock1, label %Flow + +; IR: Flow: +; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] +; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ] +; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) + +; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %19) +; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) + +define amdgpu_kernel void @multi_exit_region_uniform_ret_divergent_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 { +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %tmp1 = add i32 0, %tmp + %tmp2 = zext i32 %tmp1 to i64 + %tmp3 = add i64 0, %tmp2 + %tmp4 = shl i64 %tmp3, 32 + %tmp5 = ashr exact i64 %tmp4, 32 + %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5 + %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4 + %tmp8 = sext i32 %tmp7 to i64 + %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8 + %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4 + %tmp13 = zext i32 %tmp10 to i64 + %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13 + %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16 + %Pivot = icmp slt i32 %tmp16, 2 + br i1 %Pivot, label %LeafBlock, label %LeafBlock1 + +LeafBlock: ; preds = %entry + %SwitchLeaf = icmp eq i32 %arg3, 1 + br i1 %SwitchLeaf, label %exit0, label %exit1 + +LeafBlock1: ; preds = %entry + %SwitchLeaf2 = icmp eq i32 %tmp16, 2 + br i1 %SwitchLeaf2, label %exit0, label %exit1 + +exit0: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 9, i32 addrspace(1)* undef + ret void + +exit1: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 17, i32 addrspace(3)* undef + ret void +} + +; IR-LABEL: @multi_divergent_region_exit_ret_ret_return_value( +; IR: Flow2: +; IR: %11 = phi float [ 2.000000e+00, %exit1 ], [ undef, %Flow1 ] +; IR: %12 = phi i1 [ false, %exit1 ], [ %16, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %20) + +; IR: UnifiedReturnBlock: +; IR: %UnifiedRetVal = phi float [ %11, %Flow2 ], [ 1.000000e+00, %exit0 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %15) +; IR: ret float %UnifiedRetVal +define amdgpu_ps float @multi_divergent_region_exit_ret_ret_return_value(i32 %vgpr) #0 { +entry: + %Pivot = icmp slt i32 %vgpr, 2 + br i1 %Pivot, label %LeafBlock, label %LeafBlock1 + +LeafBlock: ; preds = %entry + %SwitchLeaf = icmp eq i32 %vgpr, 1 + br i1 %SwitchLeaf, label %exit0, label 
%exit1 + +LeafBlock1: ; preds = %entry + %SwitchLeaf2 = icmp eq i32 %vgpr, 2 + br i1 %SwitchLeaf2, label %exit0, label %exit1 + +exit0: ; preds = %LeafBlock, %LeafBlock1 + store i32 9, i32 addrspace(1)* undef + ret float 1.0 + +exit1: ; preds = %LeafBlock, %LeafBlock1 + store i32 17, i32 addrspace(3)* undef + ret float 2.0 +} + +; IR-LABEL: @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value( + +; GCN-LABEL: {{^}}uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value: +; GCN: s_cmp_gt_i32 s0, 1 +; GCN: s_cbranch_scc0 [[FLOW:BB[0-9]+_[0-9]+]] + +; GCN: v_cmp_ne_u32_e32 vcc, 7, v0 + +; GCN: {{^}}[[FLOW]]: +; GCN: s_cbranch_vccnz [[FLOW1:BB[0-9]+]] + +; GCN: v_mov_b32_e32 v0, 2.0 +; GCN: s_or_b64 exec, exec +; GCN: s_and_b64 exec, exec +; GCN: v_mov_b32_e32 v0, 1.0 + +; GCN: {{^BB[0-9]+_[0-9]+}}: ; %UnifiedReturnBlock +; GCN-NEXT: s_or_b64 exec, exec +; GCN-NEXT: ; return + +define amdgpu_ps float @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(i32 inreg %sgpr, i32 %vgpr) #0 { +entry: + %uniform.cond = icmp slt i32 %sgpr, 2 + br i1 %uniform.cond, label %LeafBlock, label %LeafBlock1 + +LeafBlock: ; preds = %entry + %divergent.cond0 = icmp eq i32 %vgpr, 3 + br i1 %divergent.cond0, label %exit0, label %exit1 + +LeafBlock1: ; preds = %entry + %divergent.cond1 = icmp eq i32 %vgpr, 7 + br i1 %divergent.cond1, label %exit0, label %exit1 + +exit0: ; preds = %LeafBlock, %LeafBlock1 + store i32 9, i32 addrspace(1)* undef + ret float 1.0 + +exit1: ; preds = %LeafBlock, %LeafBlock1 + store i32 17, i32 addrspace(3)* undef + ret float 2.0 +} + +; IR-LABEL: @multi_divergent_region_exit_ret_unreachable( +; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0) + +; IR: Flow: +; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] +; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ] +; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) + +; IR: Flow2: +; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %19) +; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) +; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock + +; IR: exit0: +; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef +; IR-NEXT: br label %UnifiedReturnBlock + +; IR: Flow1: +; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ] +; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ] +; IR: call void @llvm.amdgcn.end.cf(i64 %8) +; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16) +; IR: %18 = extractvalue { i1, i64 } %17, 0 +; IR: %19 = extractvalue { i1, i64 } %17, 1 +; IR: br i1 %18, label %exit1, label %Flow2 + +; IR: exit1: +; IR-NEXT: store volatile i32 9, i32 addrspace(1)* undef +; IR-NEXT: call void @llvm.amdgcn.unreachable() +; IR-NEXT: br label %Flow2 + +; IR: UnifiedReturnBlock: +; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14) +; IR-NEXT: ret void +define amdgpu_kernel void @multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 { +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %tmp1 = add i32 0, %tmp + %tmp2 = zext i32 %tmp1 to i64 + %tmp3 = add i64 0, %tmp2 + %tmp4 = shl i64 %tmp3, 32 + %tmp5 = ashr exact i64 %tmp4, 32 + %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5 + %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4 + %tmp8 = sext i32 %tmp7 to i64 + %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8 + %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4 + %tmp13 = 
zext i32 %tmp10 to i64 + %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13 + %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16 + %Pivot = icmp slt i32 %tmp16, 2 + br i1 %Pivot, label %LeafBlock, label %LeafBlock1 + +LeafBlock: ; preds = %entry + %SwitchLeaf = icmp eq i32 %tmp16, 1 + br i1 %SwitchLeaf, label %exit0, label %exit1 + +LeafBlock1: ; preds = %entry + %SwitchLeaf2 = icmp eq i32 %tmp16, 2 + br i1 %SwitchLeaf2, label %exit0, label %exit1 + +exit0: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 17, i32 addrspace(3)* undef + ret void + +exit1: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 9, i32 addrspace(1)* undef + unreachable +} + +; The non-uniformity of the branch to the exiting blocks requires +; looking at transitive predecessors. + +; IR-LABEL: @indirect_multi_divergent_region_exit_ret_unreachable( + +; IR: exit0: ; preds = %Flow2 +; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef +; IR-NEXT: br label %UnifiedReturnBlock + + +; IR: indirect.exit1: +; IR: %load = load volatile i32, i32 addrspace(1)* undef +; IR: store volatile i32 %load, i32 addrspace(1)* undef +; IR: store volatile i32 9, i32 addrspace(1)* undef +; IR: call void @llvm.amdgcn.unreachable() +; IR-NEXT: br label %Flow2 + +; IR: UnifiedReturnBlock: ; preds = %exit0, %Flow2 +; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14) +; IR-NEXT: ret void +define amdgpu_kernel void @indirect_multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 { +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %tmp1 = add i32 0, %tmp + %tmp2 = zext i32 %tmp1 to i64 + %tmp3 = add i64 0, %tmp2 + %tmp4 = shl i64 %tmp3, 32 + %tmp5 = ashr exact i64 %tmp4, 32 + %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5 + %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4 + %tmp8 = sext i32 %tmp7 to i64 + %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8 + %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4 + %tmp13 = zext i32 %tmp10 to i64 + %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13 + %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16 + %Pivot = icmp slt i32 %tmp16, 2 + br i1 %Pivot, label %LeafBlock, label %LeafBlock1 + +LeafBlock: ; preds = %entry + %SwitchLeaf = icmp eq i32 %tmp16, 1 + br i1 %SwitchLeaf, label %exit0, label %indirect.exit1 + +LeafBlock1: ; preds = %entry + %SwitchLeaf2 = icmp eq i32 %tmp16, 2 + br i1 %SwitchLeaf2, label %exit0, label %indirect.exit1 + +exit0: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 17, i32 addrspace(3)* undef + ret void + +indirect.exit1: + %load = load volatile i32, i32 addrspace(1)* undef + store volatile i32 %load, i32 addrspace(1)* undef + br label %exit1 + +exit1: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 9, i32 addrspace(1)* undef + unreachable +} + +; IR-LABEL: @multi_divergent_region_exit_ret_switch( +define amdgpu_kernel void @multi_divergent_region_exit_ret_switch(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 { +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %tmp1 = add i32 0, %tmp + %tmp2 = zext i32 %tmp1 to i64 + %tmp3 = add i64 0, %tmp2 + %tmp4 = shl i64 %tmp3, 32 + %tmp5 = ashr exact i64 %tmp4, 32 + %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5 + %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4 + %tmp8 = sext i32 %tmp7 to i64 + %tmp9 = 
getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8 + %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4 + %tmp13 = zext i32 %tmp10 to i64 + %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13 + %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16 + switch i32 %tmp16, label %exit1 + [ i32 1, label %LeafBlock + i32 2, label %LeafBlock1 + i32 3, label %exit0 ] + +LeafBlock: ; preds = %entry + %SwitchLeaf = icmp eq i32 %tmp16, 1 + br i1 %SwitchLeaf, label %exit0, label %exit1 + +LeafBlock1: ; preds = %entry + %SwitchLeaf2 = icmp eq i32 %tmp16, 2 + br i1 %SwitchLeaf2, label %exit0, label %exit1 + +exit0: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 17, i32 addrspace(3)* undef + ret void + +exit1: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 9, i32 addrspace(1)* undef + unreachable +} + +; IR-LABEL: @divergent_multi_ret_nest_in_uniform_triangle( +define amdgpu_kernel void @divergent_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 { +entry: + %uniform.cond0 = icmp eq i32 %arg0, 4 + br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret + +divergent.multi.exit.region: + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %divergent.cond0 = icmp eq i32 %id.x, 0 + br i1 %divergent.cond0, label %divergent.ret0, label %divergent.ret1 + +divergent.ret0: + store volatile i32 11, i32 addrspace(3)* undef + ret void + +divergent.ret1: + store volatile i32 42, i32 addrspace(3)* undef + ret void + +uniform.ret: + store volatile i32 9, i32 addrspace(1)* undef + ret void +} + +; IR-LABEL: @divergent_complex_multi_ret_nest_in_uniform_triangle( +define amdgpu_kernel void @divergent_complex_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 { +entry: + %uniform.cond0 = icmp eq i32 %arg0, 4 + br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret + +divergent.multi.exit.region: + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %divergent.cond0 = icmp eq i32 %id.x, 0 + br i1 %divergent.cond0, label %divergent.if, label %divergent.ret1 + +divergent.if: + %vgpr0 = load volatile float, float addrspace(1)* undef + %divergent.cond1 = fcmp ogt float %vgpr0, 1.0 + br i1 %divergent.cond1, label %divergent.then, label %divergent.endif + +divergent.then: + %vgpr1 = load volatile float, float addrspace(1)* undef + %divergent.cond2 = fcmp olt float %vgpr1, 4.0 + store volatile i32 33, i32 addrspace(1)* undef + br i1 %divergent.cond2, label %divergent.ret0, label %divergent.endif + +divergent.endif: + store volatile i32 38, i32 addrspace(1)* undef + br label %divergent.ret0 + +divergent.ret0: + store volatile i32 11, i32 addrspace(3)* undef + ret void + +divergent.ret1: + store volatile i32 42, i32 addrspace(3)* undef + ret void + +uniform.ret: + store volatile i32 9, i32 addrspace(1)* undef + ret void +} + +; IR-LABEL: @uniform_complex_multi_ret_nest_in_divergent_triangle( +; IR: Flow1: ; preds = %uniform.ret1, %uniform.multi.exit.region +; IR: %8 = phi i1 [ false, %uniform.ret1 ], [ true, %uniform.multi.exit.region ] +; IR: br i1 %8, label %uniform.if, label %Flow2 + +; IR: Flow: ; preds = %uniform.then, %uniform.if +; IR: %11 = phi i1 [ %10, %uniform.then ], [ %9, %uniform.if ] +; IR: br i1 %11, label %uniform.endif, label %uniform.ret0 + +; IR: UnifiedReturnBlock: ; preds = %Flow3, %Flow2 +; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %6) +; IR-NEXT: ret void +define amdgpu_kernel void @uniform_complex_multi_ret_nest_in_divergent_triangle(i32 %arg0) #0 { +entry: + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + 
%divergent.cond0 = icmp eq i32 %id.x, 0 + br i1 %divergent.cond0, label %uniform.multi.exit.region, label %divergent.ret + +uniform.multi.exit.region: + %uniform.cond0 = icmp eq i32 %arg0, 4 + br i1 %uniform.cond0, label %uniform.if, label %uniform.ret1 + +uniform.if: + %sgpr0 = load volatile i32, i32 addrspace(2)* undef + %uniform.cond1 = icmp slt i32 %sgpr0, 1 + br i1 %uniform.cond1, label %uniform.then, label %uniform.endif + +uniform.then: + %sgpr1 = load volatile i32, i32 addrspace(2)* undef + %uniform.cond2 = icmp sge i32 %sgpr1, 4 + store volatile i32 33, i32 addrspace(1)* undef + br i1 %uniform.cond2, label %uniform.ret0, label %uniform.endif + +uniform.endif: + store volatile i32 38, i32 addrspace(1)* undef + br label %uniform.ret0 + +uniform.ret0: + store volatile i32 11, i32 addrspace(3)* undef + ret void + +uniform.ret1: + store volatile i32 42, i32 addrspace(3)* undef + ret void + +divergent.ret: + store volatile i32 9, i32 addrspace(1)* undef + ret void +} + +; IR-LABEL: @multi_divergent_unreachable_exit( +; IR: UnifiedUnreachableBlock: +; IR-NEXT: call void @llvm.amdgcn.unreachable() +; IR-NEXT: br label %UnifiedReturnBlock + +; IR: UnifiedReturnBlock: +; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 +; IR-NEXT: ret void +define amdgpu_kernel void @multi_divergent_unreachable_exit() #0 { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + switch i32 %tmp, label %bb3 [ + i32 2, label %bb1 + i32 0, label %bb2 + ] + +bb1: ; preds = %bb + unreachable + +bb2: ; preds = %bb + unreachable + +bb3: ; preds = %bb + switch i32 undef, label %bb5 [ + i32 2, label %bb4 + ] + +bb4: ; preds = %bb3 + ret void + +bb5: ; preds = %bb3 + unreachable +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/ret_jump.ll b/llvm/test/CodeGen/AMDGPU/ret_jump.ll index 2aad3ce..f2fbacb 100644 --- a/llvm/test/CodeGen/AMDGPU/ret_jump.ll +++ b/llvm/test/CodeGen/AMDGPU/ret_jump.ll @@ -4,20 +4,78 @@ ; This should end with an no-op sequence of exec mask manipulations ; Mask should be in original state after executed unreachable block -; GCN-LABEL: {{^}}main: + +; GCN-LABEL: {{^}}uniform_br_trivial_ret_divergent_br_trivial_unreachable: ; GCN: s_cbranch_scc1 [[RET_BB:BB[0-9]+_[0-9]+]] +; GCN-NEXT: ; %else + ; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc ; GCN-NEXT: s_xor_b64 [[XOR_EXEC:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_EXEC]] -; GCN-NEXT: ; mask branch [[UNREACHABLE_BB:BB[0-9]+_[0-9]+]] +; GCN-NEXT: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]] -; GCN: [[RET_BB]]: -; GCN-NEXT: s_branch [[FINAL_BB:BB[0-9]+_[0-9]+]] +; GCN: BB{{[0-9]+_[0-9]+}}: ; %unreachable.bb +; GCN-NEXT: ; divergent unreachable -; GCN-NEXT: [[UNREACHABLE_BB]]: -; GCN-NEXT: [[FINAL_BB]]: +; GCN-NEXT: {{^}}[[FLOW]]: ; %Flow +; GCN-NEXT: s_or_b64 exec, exec + +; GCN-NEXT: [[RET_BB]]: +; GCN-NEXT: ; return ; GCN-NEXT: .Lfunc_end0 -define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <8 x i32>] addrspace(2)* byval %arg2, i32 addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, i32 %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 
{ +define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @uniform_br_trivial_ret_divergent_br_trivial_unreachable([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <8 x i32>] addrspace(2)* byval %arg2, i32 addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, i32 inreg %arg17, i32 %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 { +entry: + %i.i = extractelement <2 x i32> %arg7, i32 0 + %j.i = extractelement <2 x i32> %arg7, i32 1 + %i.f.i = bitcast i32 %i.i to float + %j.f.i = bitcast i32 %j.i to float + %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 1, i32 0, i32 %arg5) #2 + %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 1, i32 0, i32 %arg5) #2 + %p87 = fmul float undef, %p2.i + %p88 = fadd float %p87, undef + %p93 = fadd float %p88, undef + %p97 = fmul float %p93, undef + %p102 = fsub float %p97, undef + %p104 = fmul float %p102, undef + %p106 = fadd float 0.000000e+00, %p104 + %p108 = fadd float undef, %p106 + %uniform.cond = icmp slt i32 %arg17, 0 + br i1 %uniform.cond, label %ret.bb, label %else + +else: ; preds = %main_body + %p124 = fmul float %p108, %p108 + %p125 = fsub float %p124, undef + %divergent.cond = fcmp olt float %p125, 0.000000e+00 + br i1 %divergent.cond, label %ret.bb, label %unreachable.bb + +unreachable.bb: ; preds = %else + unreachable + +ret.bb: ; preds = %else, %main_body + ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef +} + +; GCN-LABEL: {{^}}uniform_br_nontrivial_ret_divergent_br_nontrivial_unreachable: +; GCN: s_cbranch_vccnz [[RET_BB:BB[0-9]+_[0-9]+]] + +; GCN: ; BB#{{[0-9]+}}: ; %else +; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc +; GCN-NEXT: s_xor_b64 [[XOR_EXEC:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_EXEC]] +; GCN-NEXT: ; mask branch [[FLOW1:BB[0-9]+_[0-9]+]] + +; GCN-NEXT: ; %unreachable.bb +; GCN: ds_write_b32 +; GCN: s_waitcnt +; GCN: ; divergent unreachable + +; GCN: ; %ret.bb +; GCN: store_dword + +; GCN: ; %UnifiedReturnBlock +; GCN-NEXT: s_or_b64 exec, exec +; GCN-NEXT: ; return +; GCN-NEXT: .Lfunc_end +define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @uniform_br_nontrivial_ret_divergent_br_nontrivial_unreachable([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <8 x i32>] addrspace(2)* byval %arg2, i32 addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, i32 inreg %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 { main_body: %i.i = extractelement <2 x i32> %arg7, i32 0 %j.i = extractelement <2 x i32> %arg7, i32 1 @@ -33,18 +91,21 @@ main_body: %p104 = fmul float %p102, undef %p106 = fadd float 0.000000e+00, %p104 %p108 = fadd float undef, %p106 - br i1 undef, label %ENDIF69, label %ELSE + %uniform.cond = icmp slt i32 %arg18, 0 + br i1 %uniform.cond, label %ret.bb, label %else -ELSE: ; preds = 
%main_body +else: ; preds = %main_body %p124 = fmul float %p108, %p108 %p125 = fsub float %p124, undef - %p126 = fcmp olt float %p125, 0.000000e+00 - br i1 %p126, label %ENDIF69, label %ELSE41 + %divergent.cond = fcmp olt float %p125, 0.000000e+00 + br i1 %divergent.cond, label %ret.bb, label %unreachable.bb -ELSE41: ; preds = %ELSE +unreachable.bb: ; preds = %else + store volatile i32 8, i32 addrspace(3)* undef unreachable -ENDIF69: ; preds = %ELSE, %main_body +ret.bb: ; preds = %else, %main_body + store volatile i32 11, i32 addrspace(1)* undef ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef } diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll index 2bc734c..5c6663d 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll @@ -6,7 +6,7 @@ ; OPT-NOT: call i1 @llvm.amdgcn.loop ; GCN-LABEL: {{^}}annotate_unreachable_noloop: -; GCN: s_cbranch_vccnz +; GCN: s_cbranch_scc1 ; GCN-NOT: s_endpgm ; GCN: .Lfunc_end0 define amdgpu_kernel void @annotate_unreachable_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 { @@ -37,9 +37,14 @@ bb5: ; preds = %bb3, %bb1 ; OPT-NOT: call i1 @llvm.amdgcn.loop ; GCN-LABEL: {{^}}annotate_ret_noloop: -; GCN: s_cbranch_scc1 -; GCN: s_endpgm -; GCN: .Lfunc_end1 +; GCN: load_dwordx4 +; GCN: v_cmp_nlt_f32 +; GCN: s_and_saveexec_b64 +; GCN: ; mask branch [[UNIFIED_RET:BB[0-9]+_[0-9]+]] +; GCN-NEXT: [[UNIFIED_RET]]: +; GCN-NEXT: s_or_b64 exec, exec +; GCN-NEXT: s_endpgm +; GCN: .Lfunc_end define amdgpu_kernel void @annotate_ret_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -49,6 +54,38 @@ bb1: ; preds = %bb %tmp2 = sext i32 %tmp to i64 %tmp3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i64 %tmp2 %tmp4 = load <4 x float>, <4 x float> addrspace(1)* %tmp3, align 16 + %tmp5 = extractelement <4 x float> %tmp4, i32 1 + store volatile <4 x float> %tmp4, <4 x float> addrspace(1)* undef + %cmp = fcmp ogt float %tmp5, 1.0 + br i1 %cmp, label %bb5, label %bb3 + +bb3: ; preds = %bb1 + %tmp6 = extractelement <4 x float> %tmp4, i32 2 + %tmp7 = fcmp olt float %tmp6, 0.000000e+00 + br i1 %tmp7, label %bb4, label %bb5 ; crash goes away if these are swapped + +bb4: ; preds = %bb3 + ret void + +bb5: ; preds = %bb3, %bb1 + ret void +} + +; OPT-LABEL: @uniform_annotate_ret_noloop( +; OPT-NOT: call i1 @llvm.amdgcn.loop + +; GCN-LABEL: {{^}}uniform_annotate_ret_noloop: +; GCN: s_cbranch_scc1 +; GCN: s_endpgm +; GCN: .Lfunc_end +define amdgpu_kernel void @uniform_annotate_ret_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg, i32 %tmp) #0 { +bb: + br label %bb1 + +bb1: ; preds = %bb + %tmp2 = sext i32 %tmp to i64 + %tmp3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i64 %tmp2 + %tmp4 = load <4 x float>, <4 x float> addrspace(1)* %tmp3, align 16 br i1 undef, label %bb5, label %bb3 bb3: ; preds = %bb1 diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll index 89c1eeb..cb010cf 100644 --- a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll +++ b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll @@ -4,16 +4,17 @@ ; GCN: v_cmp_eq_u32 ; GCN: s_and_saveexec_b64 ; GCN: s_xor_b64 -; 
GCN: ; mask branch [[RET:BB[0-9]+]] -; GCN: s_branch [[UNREACHABLE:BB[0-9]+_[0-9]+]] +; GCN: ; mask branch [[RET:BB[0-9]+_[0-9]+]] -; GCN: [[RET]] -; GCN: s_or_b64 exec, exec -; GCN: s_endpgm - -; GCN: [[UNREACHABLE]]: +; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; %unreachable ; GCN: ds_write_b32 +; GCN: ; divergent unreachable ; GCN: s_waitcnt + +; GCN-NEXT: [[RET]]: ; %UnifiedReturnBlock +; GCN-NEXT: s_or_b64 exec, exec +; GCN: s_endpgm + define amdgpu_kernel void @lower_control_flow_unreachable_terminator() #0 { bb: %tmp15 = tail call i32 @llvm.amdgcn.workitem.id.y() @@ -29,18 +30,19 @@ ret: } ; GCN-LABEL: {{^}}lower_control_flow_unreachable_terminator_swap_block_order: -; GCN: v_cmp_eq_u32 +; GCN: v_cmp_ne_u32 ; GCN: s_and_saveexec_b64 ; GCN: s_xor_b64 -; GCN: ; mask branch [[UNREACHABLE:BB[0-9]+_[0-9]+]] +; GCN: ; mask branch [[RETURN:BB[0-9]+_[0-9]+]] -; GCN-NEXT: ; %ret -; GCN-NEXT: s_endpgm - -; GCN-NEXT: [[UNREACHABLE]]: -; GCN-NEXT: s_or_b64 exec, exec +; GCN-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %unreachable ; GCN: ds_write_b32 +; GCN: ; divergent unreachable ; GCN: s_waitcnt + +; GCN: [[RETURN]]: +; GCN-NEXT: s_or_b64 exec, exec +; GCN-NEXT: s_endpgm define amdgpu_kernel void @lower_control_flow_unreachable_terminator_swap_block_order() #0 { bb: %tmp15 = tail call i32 @llvm.amdgcn.workitem.id.y() @@ -55,7 +57,29 @@ unreachable: unreachable } -; Function Attrs: nounwind readnone +; GCN-LABEL: {{^}}uniform_lower_control_flow_unreachable_terminator: +; GCN: s_cmp_lg_u32 +; GCN: s_cbranch_scc0 [[UNREACHABLE:BB[0-9]+_[0-9]+]] + +; GCN-NEXT: BB#{{[0-9]+}}: ; %ret +; GCN-NEXT: s_endpgm + +; GCN: [[UNREACHABLE]]: +; GCN: ds_write_b32 +; GCN: s_waitcnt +define amdgpu_kernel void @uniform_lower_control_flow_unreachable_terminator(i32 %arg0) #0 { +bb: + %tmp63 = icmp eq i32 %arg0, 32 + br i1 %tmp63, label %unreachable, label %ret + +unreachable: + store volatile i32 0, i32 addrspace(3)* undef, align 4 + unreachable + +ret: + ret void +} + declare i32 @llvm.amdgcn.workitem.id.y() #1 attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/valu-i1.ll b/llvm/test/CodeGen/AMDGPU/valu-i1.ll index 41220ff..aad260c 100644 --- a/llvm/test/CodeGen/AMDGPU/valu-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/valu-i1.ll @@ -64,29 +64,100 @@ end: ret void } -; SI-LABEL: @simple_test_v_if +; SI-LABEL: {{^}}simple_test_v_if: ; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}} ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc ; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]] +; SI: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]] -; SI: BB{{[0-9]+_[0-9]+}}: +; SI-NEXT: BB{{[0-9]+_[0-9]+}}: ; SI: buffer_store_dword -; SI: s_endpgm +; SI-NEXT: s_waitcnt -; SI: BB1_2: +; SI-NEXT: {{^}}[[EXIT]]: ; SI: s_or_b64 exec, exec, [[BR_SREG]] ; SI: s_endpgm define amdgpu_kernel void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %is.0 = icmp ne i32 %tid, 0 - br i1 %is.0, label %store, label %exit + br i1 %is.0, label %then, label %exit + +then: + %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid + store i32 999, i32 addrspace(1)* %gep + br label %exit + +exit: + ret void +} + +; FIXME: It would be better to endpgm in the then block. 
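+; (With the exits unified, the then block below now falls through to the shared
+; s_or_b64 + s_endpgm rather than ending the wave as soon as its store is done.)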
+ +; SI-LABEL: {{^}}simple_test_v_if_ret_else_ret: +; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}} +; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc +; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]] +; SI: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]] + +; SI-NEXT: BB{{[0-9]+_[0-9]+}}: +; SI: buffer_store_dword +; SI-NEXT: s_waitcnt + +; SI-NEXT: {{^}}[[EXIT]]: +; SI: s_or_b64 exec, exec, [[BR_SREG]] +; SI: s_endpgm +define amdgpu_kernel void @simple_test_v_if_ret_else_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %is.0 = icmp ne i32 %tid, 0 + br i1 %is.0, label %then, label %exit + +then: + %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid + store i32 999, i32 addrspace(1)* %gep + ret void + +exit: + ret void +} + +; Final block has more than a ret to execute. This was miscompiled +; before function exit blocks were unified since the endpgm would +; terminate the then wavefront before reaching the store. + +; SI-LABEL: {{^}}simple_test_v_if_ret_else_code_ret: +; SI: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}} +; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc +; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]] +; SI: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]] + +; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %exit +; SI: ds_write_b32 +; SI: s_waitcnt + +; SI-NEXT: {{^}}[[FLOW]]: +; SI-NEXT: s_or_saveexec_b64 +; SI-NEXT: s_xor_b64 exec, exec +; SI-NEXT: ; mask branch [[UNIFIED_RETURN:BB[0-9]+_[0-9]+]] + +; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %then +; SI: buffer_store_dword +; SI-NEXT: s_waitcnt + +; SI-NEXT: {{^}}[[UNIFIED_RETURN]]: ; %UnifiedReturnBlock +; SI: s_or_b64 exec, exec +; SI: s_endpgm +define amdgpu_kernel void @simple_test_v_if_ret_else_code_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %is.0 = icmp ne i32 %tid, 0 + br i1 %is.0, label %then, label %exit -store: +then: %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid store i32 999, i32 addrspace(1)* %gep ret void exit: + store volatile i32 7, i32 addrspace(3)* undef ret void }