From b32a5666a8ee5e1ce00918f9f2835b29973f6f64 Mon Sep 17 00:00:00 2001
From: Brendon Cahoon
Date: Thu, 27 Oct 2022 16:27:50 -0500
Subject: [PATCH] [AMDGPU] Unify uniform return and divergent unreachable blocks

This patch fixes a "failed to annotate CFG" error in SIAnnotateControlFlow.
The problem occurs when there are divergent and uniform unreachable/return
blocks in the same region. In this case, AMDGPUUnifyDivergentExitNodes does
not create a unified block, so the region contains multiple exits.
StructurizeCFG does not work properly when there are multiple exits, so the
necessary CFG transformations do not occur along divergent control flow.
Subsequently, SIAnnotateControlFlow processes the path to the divergent exit
block, but may only partially process blocks along a uniform control-flow
path to another exit block.

This patch fixes the bug by creating a single exit block when there is a
divergent exit block in the function.
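
For illustration, a minimal sketch of the CFG shape involved (this example
is not from the patch or its tests; the function name, block names, and the
use of @llvm.trap and of a scalar load standing in for a uniform value are
all invented):

  declare void @llvm.trap()

  define void @sketch(i32 %arg) {
  entry:
    ; Scalar load from a constant address: a uniform value.
    %uni = load i32, ptr addrspace(4) null, align 4
    %ucc = icmp eq i32 %uni, 0
    br i1 %ucc, label %uniform.ret, label %divergent.path

  uniform.ret:                    ; uniformly reached return exit
    ret void

  divergent.path:
    ; A function argument is conservatively divergent in a callee.
    %dcc = icmp eq i32 %arg, 0
    br i1 %dcc, label %trap.a, label %trap.b

  trap.a:                         ; divergently reached unreachable exit
    call void @llvm.trap()
    unreachable

  trap.b:                        ; divergently reached unreachable exit
    unreachable
  }

Previously, the unreachable blocks were unified but the uniformly reached
return block was left out, so the region still had two exits. With this
change, the presence of any divergent exit causes the return block to be
unified as well (into UnifiedUnreachableBlock/UnifiedReturnBlock), giving
StructurizeCFG a single-exit region.
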
Differential Revision: https://reviews.llvm.org/D136892
---
 .../AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp       |  19 +-
 .../CodeGen/AMDGPU/GlobalISel/bool-legalization.ll |  44 ++--
 ...-opt-cannot-create-empty-or-backward-segment.ll |   7 +-
 .../AMDGPU/si-unify-exit-return-unreachable.ll     | 222 +++++++++++++++++++++
 llvm/test/CodeGen/AMDGPU/skip-if-dead.ll           |  97 ++++-----
 .../AMDGPU/unstructured-cfg-def-use-issue.ll       | 130 +++++++++---
 6 files changed, 400 insertions(+), 119 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
index 753e137..accc18e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -206,17 +206,22 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
   bool Changed = false;
   std::vector<DominatorTree::UpdateType> Updates;
 
+  // TODO: For now we unify all exit blocks, even though they are uniformly
+  // reachable, if there are any exits not uniformly reached. This is to work
+  // around the limitation of the structurizer, which cannot handle multiple
+  // function exits. After the structurizer is able to handle multiple
+  // function exits, we should only unify UnreachableBlocks that are not
+  // uniformly reachable.
+  bool HasDivergentExitBlock = llvm::any_of(
+      PDT.roots(), [&](auto BB) { return !isUniformlyReached(DA, *BB); });
+
   for (BasicBlock *BB : PDT.roots()) {
     if (isa<ReturnInst>(BB->getTerminator())) {
-      if (!isUniformlyReached(DA, *BB))
+      if (HasDivergentExitBlock)
         ReturningBlocks.push_back(BB);
     } else if (isa<UnreachableInst>(BB->getTerminator())) {
-      // TODO: For now we unify UnreachableBlocks even though they are uniformly
-      // reachable. This is to workaround the limitation of structurizer, which
-      // can not handle multiple function exits. After structurizer is able to
-      // handle multiple function exits, we should only unify UnreachableBlocks
-      // that are not uniformly reachable.
-      UnreachableBlocks.push_back(BB);
+      if (HasDivergentExitBlock)
+        UnreachableBlocks.push_back(BB);
     } else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
 
       ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext());
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll
index 9fd2653..5227d74 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll
@@ -50,28 +50,20 @@ define amdgpu_ps i32 @select_sgpr_trunc_and_cond(i32 inreg %a.0, i32 inreg %a.1,
 define amdgpu_kernel void @sgpr_trunc_brcond(i32 %cond) {
 ; GCN-LABEL: sgpr_trunc_brcond:
 ; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s1, s[0:1], 0x9
-; GCN-NEXT: s_mov_b32 s0, -1
+; GCN-NEXT: s_load_dword s0, s[0:1], 0x9
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_xor_b32 s1, s1, -1
-; GCN-NEXT: s_and_b32 s1, s1, 1
-; GCN-NEXT: s_cmp_lg_u32 s1, 0
-; GCN-NEXT: s_cbranch_scc0 .LBB3_2
-; GCN-NEXT: ; %bb.1: ; %bb1
-; GCN-NEXT: v_mov_b32_e32 v0, 1
-; GCN-NEXT: s_mov_b32 s0, 0
-; GCN-NEXT: flat_store_dword v[0:1], v0
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: .LBB3_2: ; %Flow
 ; GCN-NEXT: s_xor_b32 s0, s0, -1
 ; GCN-NEXT: s_and_b32 s0, s0, 1
 ; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_cbranch_scc1 .LBB3_4
-; GCN-NEXT: ; %bb.3: ; %bb0
+; GCN-NEXT: s_cbranch_scc1 .LBB3_2
+; GCN-NEXT: ; %bb.1: ; %bb0
 ; GCN-NEXT: v_mov_b32_e32 v0, 0
 ; GCN-NEXT: flat_store_dword v[0:1], v0
 ; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: .LBB3_4: ; %UnifiedUnreachableBlock
+; GCN-NEXT: .LBB3_2: ; %bb1
+; GCN-NEXT: v_mov_b32_e32 v0, 1
+; GCN-NEXT: flat_store_dword v[0:1], v0
+; GCN-NEXT: s_waitcnt vmcnt(0)
 entry:
   %trunc = trunc i32 %cond to i1
   br i1 %trunc, label %bb0, label %bb1
@@ -90,27 +82,19 @@ define amdgpu_kernel void @brcond_sgpr_trunc_and(i32 %cond0, i32 %cond1) {
 ; GCN: ; %bb.0: ; %entry
 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_and_b32 s1, s0, s1
-; GCN-NEXT: s_xor_b32 s1, s1, -1
-; GCN-NEXT: s_and_b32 s1, s1, 1
-; GCN-NEXT: s_mov_b32 s0, -1
-; GCN-NEXT: s_cmp_lg_u32 s1, 0
-; GCN-NEXT: s_cbranch_scc0 .LBB4_2
-; GCN-NEXT: ; %bb.1: ; %bb1
-; GCN-NEXT: v_mov_b32_e32 v0, 1
-; GCN-NEXT: s_mov_b32 s0, 0
-; GCN-NEXT: flat_store_dword v[0:1], v0
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: .LBB4_2: ; %Flow
+; GCN-NEXT: s_and_b32 s0, s0, s1
 ; GCN-NEXT: s_xor_b32 s0, s0, -1
 ; GCN-NEXT: s_and_b32 s0, s0, 1
 ; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_cbranch_scc1 .LBB4_4
-; GCN-NEXT: ; %bb.3: ; %bb0
+; GCN-NEXT: s_cbranch_scc1 .LBB4_2
+; GCN-NEXT: ; %bb.1: ; %bb0
 ; GCN-NEXT: v_mov_b32_e32 v0, 0
 ; GCN-NEXT: flat_store_dword v[0:1], v0
 ; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: .LBB4_4: ; %UnifiedUnreachableBlock
+; GCN-NEXT: .LBB4_2: ; %bb1
+; GCN-NEXT: v_mov_b32_e32 v0, 1
+; GCN-NEXT: flat_store_dword v[0:1], v0
+; GCN-NEXT: s_waitcnt vmcnt(0)
 entry:
   %trunc0 = trunc i32 %cond0 to i1
   %trunc1 = trunc i32 %cond1 to i1
diff --git a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll
index 8c32b6c..b4c00f3 100644
--- a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll
+++ b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll
@@ -112,16 +112,17 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1
 ; CHECK-NEXT: s_cbranch_vccnz .LBB0_22
 ; CHECK-NEXT: ; %bb.18: ; %loop.exit.guard5
 ; CHECK-NEXT: s_and_b64 vcc, exec, s[18:19]
-; CHECK-NEXT: s_cbranch_vccnz .LBB0_22
+; CHECK-NEXT: s_cbranch_vccnz .LBB0_23
 ; CHECK-NEXT: ; %bb.19: ; %bb17
 ; CHECK-NEXT: s_and_b64 vcc, exec, s[6:7]
 ; CHECK-NEXT: s_cbranch_vccz .LBB0_21
 ; CHECK-NEXT: ; %bb.20: ; %bb19
 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0
 ; CHECK-NEXT: s_cbranch_vccz .LBB0_22
-; CHECK-NEXT: .LBB0_21: ; %bb21
+; CHECK-NEXT: .LBB0_21: ; %bb18
 ; CHECK-NEXT: s_endpgm
-; CHECK-NEXT: .LBB0_22: ; %UnifiedUnreachableBlock
+; CHECK-NEXT: .LBB0_22: ; %bb20
+; CHECK-NEXT: .LBB0_23: ; %bb12
 bb:
   br label %bb6
diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll b/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll
new file mode 100644
index 0000000..c3d3993
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll
@@ -0,0 +1,222 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow -verify -S %s -o - | FileCheck -check-prefix=IR %s
+
+; A test with a divergent unreachable block and a uniform return block. The
+; compiler needs to create a region that includes them so that StructurizeCFG
+; correctly transforms the CFG, and then SIAnnotateControlFlow does not fail
+; during annotation.
+
+define void @my_func(i32 %0) {
+; IR-LABEL: @my_func(
+; IR-NEXT: entry:
+; IR-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) null, align 8
+; IR-NEXT: br label [[NODEBLOCK:%.*]]
+; IR: NodeBlock:
+; IR-NEXT: [[PIVOT:%.*]] = icmp sge i32 [[TMP1]], 1
+; IR-NEXT: br i1 [[PIVOT]], label [[LEAFBLOCK1:%.*]], label [[FLOW:%.*]]
+; IR: LeafBlock1:
+; IR-NEXT: [[SWITCHLEAF2:%.*]] = icmp ne i32 [[TMP1]], 1
+; IR-NEXT: br label [[FLOW]]
+; IR: Flow:
+; IR-NEXT: [[TMP2:%.*]] = phi i1 [ [[SWITCHLEAF2]], [[LEAFBLOCK1]] ], [ false, [[NODEBLOCK]] ]
+; IR-NEXT: [[TMP3:%.*]] = phi i1 [ false, [[LEAFBLOCK1]] ], [ true, [[NODEBLOCK]] ]
+; IR-NEXT: br i1 [[TMP3]], label [[LEAFBLOCK:%.*]], label [[FLOW11:%.*]]
+; IR: LeafBlock:
+; IR-NEXT: [[SWITCHLEAF:%.*]] = icmp eq i32 [[TMP1]], 0
+; IR-NEXT: br i1 [[SWITCHLEAF]], label [[SW_BB2:%.*]], label [[FLOW12:%.*]]
+; IR: Flow11:
+; IR-NEXT: [[TMP4:%.*]] = phi i1 [ [[TMP9:%.*]], [[FLOW12]] ], [ false, [[FLOW]] ]
+; IR-NEXT: [[TMP5:%.*]] = phi i1 [ [[TMP10:%.*]], [[FLOW12]] ], [ [[TMP2]], [[FLOW]] ]
+; IR-NEXT: [[TMP6:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP5]])
+; IR-NEXT: [[TMP7:%.*]] = extractvalue { i1, i64 } [[TMP6]], 0
+; IR-NEXT: [[TMP8:%.*]] = extractvalue { i1, i64 } [[TMP6]], 1
+; IR-NEXT: br i1 [[TMP7]], label [[DO_BODY:%.*]], label [[FLOW17:%.*]]
+; IR: sw.bb2:
+; IR-NEXT: br label [[NODEBLOCK7:%.*]]
+; IR: Flow12:
+; IR-NEXT: [[TMP9]] = phi i1 [ [[TMP24:%.*]], [[FLOW15:%.*]] ], [ false, [[LEAFBLOCK]] ]
+; IR-NEXT: [[TMP10]] = phi i1 [ [[TMP25:%.*]], [[FLOW15]] ], [ true, [[LEAFBLOCK]] ]
+; IR-NEXT: br label [[FLOW11]]
+; IR: NodeBlock7:
+; IR-NEXT: [[PIVOT8:%.*]] = icmp sge i32 [[TMP0:%.*]], 2
+; IR-NEXT: [[TMP11:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[PIVOT8]])
+; IR-NEXT: [[TMP12:%.*]] = extractvalue { i1, i64 } [[TMP11]], 0
+; IR-NEXT: [[TMP13:%.*]] = extractvalue { i1, i64 } [[TMP11]], 1
+; IR-NEXT: br i1 [[TMP12]], label [[LEAFBLOCK5:%.*]], label [[FLOW13:%.*]]
+; IR: LeafBlock5:
+; IR-NEXT: [[SWITCHLEAF6:%.*]] = icmp eq i32 [[TMP0]], 2
+; IR-NEXT: br label [[FLOW13]]
+; IR: Flow13:
+; IR-NEXT: [[TMP14:%.*]] = phi i1 [ true, [[LEAFBLOCK5]] ], [ false, [[NODEBLOCK7]] ]
+; IR-NEXT: [[TMP15:%.*]] = phi i1 [ [[SWITCHLEAF6]], [[LEAFBLOCK5]] ], [ false, [[NODEBLOCK7]] ]
+; IR-NEXT: [[TMP16:%.*]] = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 [[TMP13]])
+; IR-NEXT: [[TMP17:%.*]] = extractvalue { i1, i64 } [[TMP16]], 0
+; IR-NEXT: [[TMP18:%.*]] = extractvalue { i1, i64 } [[TMP16]], 1
+; IR-NEXT: br i1 [[TMP17]], label [[LEAFBLOCK3:%.*]], label [[FLOW14:%.*]]
+; IR: LeafBlock3:
+; IR-NEXT: [[SWITCHLEAF4:%.*]] = icmp eq i32 [[TMP0]], 0
+; IR-NEXT: [[SWITCHLEAF4_INV:%.*]] = xor i1 [[SWITCHLEAF4]], true
+; IR-NEXT: br label [[FLOW14]]
+; IR: Flow14:
+; IR-NEXT: [[TMP19:%.*]] = phi i1 [ [[SWITCHLEAF4_INV]], [[LEAFBLOCK3]] ], [ [[TMP14]], [[FLOW13]] ]
+; IR-NEXT: [[TMP20:%.*]] = phi i1 [ [[SWITCHLEAF4]], [[LEAFBLOCK3]] ], [ [[TMP15]], [[FLOW13]] ]
+; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP18]])
+; IR-NEXT: [[TMP21:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP20]])
+; IR-NEXT: [[TMP22:%.*]] = extractvalue { i1, i64 } [[TMP21]], 0
+; IR-NEXT: [[TMP23:%.*]] = extractvalue { i1, i64 } [[TMP21]], 1
+; IR-NEXT: br i1 [[TMP22]], label [[LAND_LHS_TRUE_I:%.*]], label [[FLOW15]]
+; IR: land.lhs.true.i:
+; IR-NEXT: br label [[LEAFBLOCK9:%.*]]
+; IR: Flow15:
+; IR-NEXT: [[TMP24]] = phi i1 [ [[TMP29:%.*]], [[FLOW16:%.*]] ], [ false, [[FLOW14]] ]
+; IR-NEXT: [[TMP25]] = phi i1 [ [[TMP30:%.*]], [[FLOW16]] ], [ [[TMP19]], [[FLOW14]] ]
+; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP23]])
+; IR-NEXT: br label [[FLOW12]]
+; IR: LeafBlock9:
+; IR-NEXT: [[SWITCHLEAF10:%.*]] = icmp sgt i32 [[TMP0]], 1
+; IR-NEXT: [[TMP26:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[SWITCHLEAF10]])
+; IR-NEXT: [[TMP27:%.*]] = extractvalue { i1, i64 } [[TMP26]], 0
+; IR-NEXT: [[TMP28:%.*]] = extractvalue { i1, i64 } [[TMP26]], 1
+; IR-NEXT: br i1 [[TMP27]], label [[DO_BODY_I_I_I_I:%.*]], label [[FLOW16]]
+; IR: do.body.i.i.i.i:
+; IR-NEXT: tail call fastcc void null()
+; IR-NEXT: br label [[FLOW16]]
+; IR: Flow16:
+; IR-NEXT: [[TMP29]] = phi i1 [ true, [[DO_BODY_I_I_I_I]] ], [ false, [[LEAFBLOCK9]] ]
+; IR-NEXT: [[TMP30]] = phi i1 [ false, [[DO_BODY_I_I_I_I]] ], [ true, [[LEAFBLOCK9]] ]
+; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP28]])
+; IR-NEXT: br label [[FLOW15]]
+; IR: do.body:
+; IR-NEXT: tail call fastcc void null()
+; IR-NEXT: br label [[FLOW17]]
+; IR: Flow17:
+; IR-NEXT: [[TMP31:%.*]] = phi i1 [ true, [[DO_BODY]] ], [ [[TMP4]], [[FLOW11]] ]
+; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP8]])
+; IR-NEXT: [[TMP32:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP31]])
+; IR-NEXT: [[TMP33:%.*]] = extractvalue { i1, i64 } [[TMP32]], 0
+; IR-NEXT: [[TMP34:%.*]] = extractvalue { i1, i64 } [[TMP32]], 1
+; IR-NEXT: br i1 [[TMP33]], label [[UNIFIEDUNREACHABLEBLOCK:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]]
+; IR: UnifiedUnreachableBlock:
+; IR-NEXT: call void @llvm.amdgcn.unreachable()
+; IR-NEXT: br label [[UNIFIEDRETURNBLOCK]]
+; IR: UnifiedReturnBlock:
+; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP34]])
+; IR-NEXT: ret void
+;
+; GCN-LABEL: my_func:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: s_load_dword s10, s[4:5], 0x0
+; GCN-NEXT: s_mov_b64 s[8:9], -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_lt_i32 s10, 1
+; GCN-NEXT: s_mov_b64 s[6:7], 0
+; GCN-NEXT: s_cbranch_scc1 .LBB0_7
+; GCN-NEXT: ; %bb.1: ; %LeafBlock1
+; GCN-NEXT: s_cmp_lg_u32 s10, 1
+; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
+; GCN-NEXT: s_mov_b64 vcc, exec
+; GCN-NEXT: s_cbranch_execz .LBB0_8
+; GCN-NEXT: .LBB0_2: ; %Flow11
+; GCN-NEXT: s_and_saveexec_b64 s[8:9], s[6:7]
+; GCN-NEXT: .LBB0_3: ; %do.body
+; GCN-NEXT: s_or_b64 s[4:5], s[4:5], exec
+; GCN-NEXT: .LBB0_4: ; %Flow17
+; GCN-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GCN-NEXT: ; %bb.5: ; %UnifiedUnreachableBlock
+; GCN-NEXT: ; divergent unreachable
+; GCN-NEXT: ; %bb.6: ; %UnifiedReturnBlock
+; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: .LBB0_7: ; %Flow
+; GCN-NEXT: s_andn2_b64 vcc, exec, s[8:9]
+; GCN-NEXT: s_cbranch_vccnz .LBB0_2
+; GCN-NEXT: .LBB0_8: ; %LeafBlock
+; GCN-NEXT: s_cmp_eq_u32 s10, 0
+; GCN-NEXT: s_cbranch_scc1 .LBB0_10
+; GCN-NEXT: ; %bb.9:
+; GCN-NEXT: s_mov_b64 s[6:7], -1
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: s_and_saveexec_b64 s[8:9], s[6:7]
+; GCN-NEXT: s_cbranch_execnz .LBB0_3
+; GCN-NEXT: s_branch .LBB0_4
+; GCN-NEXT: .LBB0_10: ; %NodeBlock7
+; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 1, v0
+; GCN-NEXT: s_mov_b64 s[8:9], 0
+; GCN-NEXT: s_mov_b64 s[6:7], 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: ; %bb.11: ; %LeafBlock5
+; GCN-NEXT: s_mov_b64 s[6:7], exec
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
+; GCN-NEXT: s_and_b64 s[8:9], vcc, exec
+; GCN-NEXT: ; %bb.12: ; %Flow13
+; GCN-NEXT: s_andn2_saveexec_b64 s[10:11], s[4:5]
+; GCN-NEXT: ; %bb.13: ; %LeafBlock3
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
+; GCN-NEXT: s_andn2_b64 s[6:7], s[6:7], exec
+; GCN-NEXT: s_andn2_b64 s[8:9], s[8:9], exec
+; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT: s_and_b64 s[12:13], vcc, exec
+; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5]
+; GCN-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
+; GCN-NEXT: ; %bb.14: ; %Flow14
+; GCN-NEXT: s_or_b64 exec, exec, s[10:11]
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: s_and_saveexec_b64 s[10:11], s[8:9]
+; GCN-NEXT: s_cbranch_execz .LBB0_18
+; GCN-NEXT: ; %bb.15: ; %LeafBlock9
+; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 1, v0
+; GCN-NEXT: s_mov_b64 s[8:9], -1
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: s_and_saveexec_b64 s[12:13], vcc
+; GCN-NEXT: ; %bb.16: ; %do.body.i.i.i.i
+; GCN-NEXT: s_mov_b64 s[4:5], exec
+; GCN-NEXT: s_xor_b64 s[8:9], exec, -1
+; GCN-NEXT: ; %bb.17: ; %Flow16
+; GCN-NEXT: s_or_b64 exec, exec, s[12:13]
+; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT: s_andn2_b64 s[6:7], s[6:7], exec
+; GCN-NEXT: s_and_b64 s[8:9], s[8:9], exec
+; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
+; GCN-NEXT: .LBB0_18: ; %Flow15
+; GCN-NEXT: s_or_b64 exec, exec, s[10:11]
+; GCN-NEXT: s_and_saveexec_b64 s[8:9], s[6:7]
+; GCN-NEXT: s_cbranch_execnz .LBB0_3
+; GCN-NEXT: s_branch .LBB0_4
+entry:
+  %1 = load i32, ptr addrspace(4) null, align 8
+  switch i32 %1, label %do.body [
+    i32 1, label %sw.bb
+    i32 0, label %sw.bb2
+  ]
+
+sw.bb:
+  ret void
+
+sw.bb2:
+  switch i32 %0, label %do.body [
+    i32 0, label %land.lhs.true.i
+    i32 2, label %land.lhs.true.i
+  ]
+
+land.lhs.true.i:
+  switch i32 %0, label %do.body.i.i.i.i [
+    i32 0, label %do.body
+    i32 1, label %do.body
+  ]
+
+do.body.i.i.i.i:
+  tail call fastcc void null()
+  unreachable
+
+do.body:
+  tail call fastcc void null()
+  unreachable
+
+}
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index 2aa8121..7080c84 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -1292,45 +1292,40 @@ define amdgpu_ps void @no_skip_no_successors(float inreg %arg, float inreg %arg1
 ; SI-LABEL: no_skip_no_successors:
 ; SI: ; %bb.0: ; %bb
 ; SI-NEXT: v_cmp_nge_f32_e64 s[4:5], s1, 0
-; SI-NEXT: s_mov_b64 s[2:3], exec
-; SI-NEXT: s_mov_b64 s[0:1], -1
 ; SI-NEXT: s_and_b64 vcc, exec, s[4:5]
 ; SI-NEXT: s_cbranch_vccz .LBB12_3
-; SI-NEXT: ; %bb.1: ; %Flow
-; SI-NEXT: s_and_b64 vcc, exec, s[0:1]
-; SI-NEXT: s_cbranch_vccnz .LBB12_4
-; SI-NEXT: .LBB12_2: ; %UnifiedUnreachableBlock
-; SI-NEXT: .LBB12_3: ; %bb3
-; SI-NEXT: s_branch .LBB12_2
-; SI-NEXT: .LBB12_4: ; %bb6
+; SI-NEXT: ; %bb.1: ; %bb6
+; SI-NEXT: s_mov_b64 s[2:3], exec
 ; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], exec
-; SI-NEXT: s_cbranch_scc0 .LBB12_6
-; SI-NEXT: ; %bb.5: ; %bb6
+; SI-NEXT: s_cbranch_scc0 .LBB12_5
+; SI-NEXT: ; %bb.2: ; %bb6
 ; SI-NEXT: s_mov_b64 exec, 0
-; SI-NEXT: .LBB12_6:
+; SI-NEXT: .LBB12_3: ; %bb3
+; SI-NEXT: v_mov_b32_e32 v0, 0x3e7ae148
+; SI-NEXT: v_cmp_nge_f32_e32 vcc, s0, v0
+; SI-NEXT: s_and_b64 vcc, exec, vcc
+; SI-NEXT: ; %bb.4: ; %bb5
+; SI-NEXT: .LBB12_5:
 ; SI-NEXT: s_mov_b64 exec, 0
 ; SI-NEXT: exp null off, off, off, off done vm
 ; SI-NEXT: s_endpgm
 ;
 ; GFX10-WAVE64-LABEL: no_skip_no_successors:
 ; GFX10-WAVE64: ; %bb.0: ; %bb
-; GFX10-WAVE64-NEXT: v_cmp_nge_f32_e64 s[0:1], s1, 0
-; GFX10-WAVE64-NEXT: s_mov_b64 s[2:3], exec
-; GFX10-WAVE64-NEXT: s_and_b64 vcc, exec, s[0:1]
-; GFX10-WAVE64-NEXT: s_mov_b64 s[0:1], -1
+; GFX10-WAVE64-NEXT: v_cmp_nge_f32_e64 s[4:5], s1, 0
+; GFX10-WAVE64-NEXT: s_and_b64 vcc, exec, s[4:5]
 ; GFX10-WAVE64-NEXT: s_cbranch_vccz .LBB12_3
-; GFX10-WAVE64-NEXT: ; %bb.1: ; %Flow
-; GFX10-WAVE64-NEXT: s_and_b64 vcc, exec, s[0:1]
-; GFX10-WAVE64-NEXT: s_cbranch_vccnz .LBB12_4
-; GFX10-WAVE64-NEXT: .LBB12_2: ; %UnifiedUnreachableBlock
-; GFX10-WAVE64-NEXT: .LBB12_3: ; %bb3
-; GFX10-WAVE64-NEXT: s_branch .LBB12_2
-; GFX10-WAVE64-NEXT: .LBB12_4: ; %bb6
+; GFX10-WAVE64-NEXT: ; %bb.1: ; %bb6
+; GFX10-WAVE64-NEXT: s_mov_b64 s[2:3], exec
 ; GFX10-WAVE64-NEXT: s_andn2_b64 s[2:3], s[2:3], exec
-; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB12_6
-; GFX10-WAVE64-NEXT: ; %bb.5: ; %bb6
+; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB12_5
+; GFX10-WAVE64-NEXT: ; %bb.2: ; %bb6
 ; GFX10-WAVE64-NEXT: s_mov_b64 exec, 0
-; GFX10-WAVE64-NEXT: .LBB12_6:
+; GFX10-WAVE64-NEXT: .LBB12_3: ; %bb3
+; GFX10-WAVE64-NEXT: v_cmp_nle_f32_e64 s[0:1], 0x3e7ae148, s0
+; GFX10-WAVE64-NEXT: s_and_b64 vcc, exec, s[0:1]
+; GFX10-WAVE64-NEXT: ; %bb.4: ; %bb5
+; GFX10-WAVE64-NEXT: .LBB12_5:
 ; GFX10-WAVE64-NEXT: s_mov_b64 exec, 0
 ; GFX10-WAVE64-NEXT: exp null off, off, off, off done vm
 ; GFX10-WAVE64-NEXT: s_endpgm
@@ -1338,46 +1333,42 @@ define amdgpu_ps void @no_skip_no_successors(float inreg %arg, float inreg %arg1
 ; GFX10-WAVE32-LABEL: no_skip_no_successors:
 ; GFX10-WAVE32: ; %bb.0: ; %bb
 ; GFX10-WAVE32-NEXT: v_cmp_nge_f32_e64 s1, s1, 0
-; GFX10-WAVE32-NEXT: s_mov_b32 s0, exec_lo
 ; GFX10-WAVE32-NEXT: s_and_b32 vcc_lo, exec_lo, s1
-; GFX10-WAVE32-NEXT: s_mov_b32 s1, -1
 ; GFX10-WAVE32-NEXT: s_cbranch_vccz .LBB12_3
-; GFX10-WAVE32-NEXT: ; %bb.1: ; %Flow
-; GFX10-WAVE32-NEXT: s_and_b32 vcc_lo, exec_lo, s1
-; GFX10-WAVE32-NEXT: s_cbranch_vccnz .LBB12_4
-; GFX10-WAVE32-NEXT: .LBB12_2: ; %UnifiedUnreachableBlock
-; GFX10-WAVE32-NEXT: .LBB12_3: ; %bb3
-; GFX10-WAVE32-NEXT: s_branch .LBB12_2
-; GFX10-WAVE32-NEXT: .LBB12_4: ; %bb6
-; GFX10-WAVE32-NEXT: s_andn2_b32 s0, s0, exec_lo
-; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB12_6
-; GFX10-WAVE32-NEXT: ; %bb.5: ; %bb6
+; GFX10-WAVE32-NEXT: ; %bb.1: ; %bb6
+; GFX10-WAVE32-NEXT: s_mov_b32 s2, exec_lo
+; GFX10-WAVE32-NEXT: s_andn2_b32 s2, s2, exec_lo
+; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB12_5
+; GFX10-WAVE32-NEXT: ; %bb.2: ; %bb6
 ; GFX10-WAVE32-NEXT: s_mov_b32 exec_lo, 0
-; GFX10-WAVE32-NEXT: .LBB12_6:
+; GFX10-WAVE32-NEXT: .LBB12_3: ; %bb3
+; GFX10-WAVE32-NEXT: v_cmp_nle_f32_e64 s0, 0x3e7ae148, s0
+; GFX10-WAVE32-NEXT: s_and_b32 vcc_lo, exec_lo, s0
+; GFX10-WAVE32-NEXT: ; %bb.4: ; %bb5
+; GFX10-WAVE32-NEXT: .LBB12_5:
 ; GFX10-WAVE32-NEXT: s_mov_b32 exec_lo, 0
 ; GFX10-WAVE32-NEXT: exp null off, off, off, off done vm
 ; GFX10-WAVE32-NEXT: s_endpgm
 ;
 ; GFX11-LABEL: no_skip_no_successors:
 ; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: v_cmp_nge_f32_e64 s[0:1], s1, 0
-; GFX11-NEXT: s_mov_b64 s[2:3], exec
+; GFX11-NEXT: v_cmp_nge_f32_e64 s[4:5], s1, 0
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: s_and_b64 vcc, exec, s[0:1]
-; GFX11-NEXT: s_mov_b64 s[0:1], -1
+; GFX11-NEXT: s_and_b64 vcc, exec, s[4:5]
 ; GFX11-NEXT: s_cbranch_vccz .LBB12_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_b64 vcc, exec, s[0:1]
-; GFX11-NEXT: s_cbranch_vccnz .LBB12_4
-; GFX11-NEXT: .LBB12_2: ; %UnifiedUnreachableBlock
-; GFX11-NEXT: .LBB12_3: ; %bb3
-; GFX11-NEXT: s_branch .LBB12_2
-; GFX11-NEXT: .LBB12_4: ; %bb6
+; GFX11-NEXT: ; %bb.1: ; %bb6
+; GFX11-NEXT: s_mov_b64 s[2:3], exec
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: s_and_not1_b64 s[2:3], s[2:3], exec
-; GFX11-NEXT: s_cbranch_scc0 .LBB12_6
-; GFX11-NEXT: ; %bb.5: ; %bb6
+; GFX11-NEXT: s_cbranch_scc0 .LBB12_5
+; GFX11-NEXT: ; %bb.2: ; %bb6
 ; GFX11-NEXT: s_mov_b64 exec, 0
-; GFX11-NEXT: .LBB12_6:
+; GFX11-NEXT: .LBB12_3: ; %bb3
+; GFX11-NEXT: v_cmp_nle_f32_e64 s[0:1], 0x3e7ae148, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: s_and_b64 vcc, exec, s[0:1]
+; GFX11-NEXT: ; %bb.4: ; %bb5
+; GFX11-NEXT: .LBB12_5:
 ; GFX11-NEXT: s_mov_b64 exec, 0
 ; GFX11-NEXT: exp mrt0 off, off, off, off done
 ; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
index 9c4147c..48e7db5 100644
--- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
+++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
@@ -7,57 +7,135 @@ define hidden void @widget() {
 ; GCN: ; %bb.0: ; %bb
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT: s_mov_b64 exec, s[16:17]
-; GCN-NEXT: v_writelane_b32 v40, s33, 2
+; GCN-NEXT: v_writelane_b32 v40, s33, 16
 ; GCN-NEXT: s_mov_b32 s33, s32
 ; GCN-NEXT: s_addk_i32 s32, 0x400
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT: v_writelane_b32 v40, s30, 0
 ; GCN-NEXT: v_writelane_b32 v40, s31, 1
+; GCN-NEXT: v_writelane_b32 v40, s34, 2
+; GCN-NEXT: v_writelane_b32 v40, s35, 3
+; GCN-NEXT: v_writelane_b32 v40, s36, 4
+; GCN-NEXT: v_writelane_b32 v40, s37, 5
+; GCN-NEXT: v_writelane_b32 v40, s38, 6
+; GCN-NEXT: v_writelane_b32 v40, s39, 7
+; GCN-NEXT: v_writelane_b32 v40, s40, 8
+; GCN-NEXT: v_writelane_b32 v40, s41, 9
+; GCN-NEXT: v_writelane_b32 v40, s42, 10
+; GCN-NEXT: v_writelane_b32 v40, s43, 11
+; GCN-NEXT: v_writelane_b32 v40, s44, 12
+; GCN-NEXT: v_writelane_b32 v40, s45, 13
+; GCN-NEXT: v_writelane_b32 v40, s46, 14
+; GCN-NEXT: v_writelane_b32 v40, s47, 15
+; GCN-NEXT: v_mov_b32_e32 v41, v31
+; GCN-NEXT: s_mov_b32 s42, s15
+; GCN-NEXT: s_mov_b32 s43, s14
+; GCN-NEXT: s_mov_b32 s44, s13
+; GCN-NEXT: s_mov_b32 s45, s12
+; GCN-NEXT: s_mov_b64 s[34:35], s[10:11]
+; GCN-NEXT: s_mov_b64 s[36:37], s[8:9]
+; GCN-NEXT: s_mov_b64 s[38:39], s[6:7]
+; GCN-NEXT: s_mov_b64 s[40:41], s[4:5]
 ; GCN-NEXT: v_mov_b32_e32 v0, 0
 ; GCN-NEXT: v_mov_b32_e32 v1, 0
 ; GCN-NEXT: flat_load_dword v0, v[0:1]
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: s_mov_b64 s[8:9], -1
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 21, v0
-; GCN-NEXT: v_readfirstlane_b32 s16, v0
-; GCN-NEXT: s_cbranch_vccz .LBB0_3
-; GCN-NEXT: ; %bb.1: ; %bb4
-; GCN-NEXT: s_cmp_lg_u32 s16, 9
-; GCN-NEXT: s_cbranch_scc1 .LBB0_4
-; GCN-NEXT: ; %bb.2: ; %bb7
-; GCN-NEXT: s_getpc_b64 s[16:17]
-; GCN-NEXT: s_add_u32 s16, s16, wibble@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s17, s17, wibble@rel32@hi+12
-; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT: s_branch .LBB0_7
-; GCN-NEXT: .LBB0_3: ; %bb2
-; GCN-NEXT: s_cmp_eq_u32 s16, 21
-; GCN-NEXT: s_cbranch_scc1 .LBB0_6
-; GCN-NEXT: .LBB0_4: ; %bb9
+; GCN-NEXT: s_mov_b64 s[46:47], 0
+; GCN-NEXT: s_mov_b64 s[6:7], 0
+; GCN-NEXT: s_cbranch_vccz .LBB0_9
+; GCN-NEXT: ; %bb.1: ; %Flow
+; GCN-NEXT: s_andn2_b64 vcc, exec, s[8:9]
+; GCN-NEXT: s_cbranch_vccz .LBB0_10
+; GCN-NEXT: .LBB0_2: ; %Flow1
+; GCN-NEXT: s_andn2_b64 vcc, exec, s[6:7]
+; GCN-NEXT: s_cbranch_vccnz .LBB0_4
+; GCN-NEXT: .LBB0_3: ; %bb9
 ; GCN-NEXT: s_getpc_b64 s[16:17]
 ; GCN-NEXT: s_add_u32 s16, s16, wibble@rel32@lo+4
 ; GCN-NEXT: s_addc_u32 s17, s17, wibble@rel32@hi+12
+; GCN-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GCN-NEXT: s_mov_b64 s[6:7], s[38:39]
+; GCN-NEXT: s_mov_b64 s[8:9], s[36:37]
+; GCN-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GCN-NEXT: s_mov_b32 s12, s45
+; GCN-NEXT: s_mov_b32 s13, s44
+; GCN-NEXT: s_mov_b32 s14, s43
+; GCN-NEXT: s_mov_b32 s15, s42
+; GCN-NEXT: v_mov_b32_e32 v31, v41
 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT: v_cmp_lt_f32_e32 vcc, 0, v0
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execnz .LBB0_7
-; GCN-NEXT: ; %bb.5: ; %bb9.bb12_crit_edge
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: .LBB0_6: ; %bb12
+; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: s_andn2_b64 s[6:7], s[46:47], exec
+; GCN-NEXT: s_and_b64 s[8:9], vcc, exec
+; GCN-NEXT: s_or_b64 s[46:47], s[6:7], s[8:9]
+; GCN-NEXT: .LBB0_4: ; %Flow2
+; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[46:47]
+; GCN-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GCN-NEXT: s_cbranch_execz .LBB0_6
+; GCN-NEXT: ; %bb.5: ; %bb12
 ; GCN-NEXT: v_mov_b32_e32 v2, 0
 ; GCN-NEXT: v_mov_b32_e32 v0, 0
 ; GCN-NEXT: v_mov_b32_e32 v1, 0
 ; GCN-NEXT: flat_store_dword v[0:1], v2
-; GCN-NEXT: .LBB0_7: ; %UnifiedReturnBlock
+; GCN-NEXT: .LBB0_6: ; %Flow3
+; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
+; GCN-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GCN-NEXT: s_cbranch_vccnz .LBB0_8
+; GCN-NEXT: ; %bb.7: ; %bb7
+; GCN-NEXT: s_getpc_b64 s[16:17]
+; GCN-NEXT: s_add_u32 s16, s16, wibble@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s17, s17, wibble@rel32@hi+12
+; GCN-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GCN-NEXT: s_mov_b64 s[6:7], s[38:39]
+; GCN-NEXT: s_mov_b64 s[8:9], s[36:37]
+; GCN-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GCN-NEXT: s_mov_b32 s12, s45
+; GCN-NEXT: s_mov_b32 s13, s44
+; GCN-NEXT: s_mov_b32 s14, s43
+; GCN-NEXT: s_mov_b32 s15, s42
+; GCN-NEXT: v_mov_b32_e32 v31, v41
+; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GCN-NEXT: .LBB0_8: ; %UnifiedReturnBlock
+; GCN-NEXT: v_readlane_b32 s47, v40, 15
+; GCN-NEXT: v_readlane_b32 s46, v40, 14
+; GCN-NEXT: v_readlane_b32 s45, v40, 13
+; GCN-NEXT: v_readlane_b32 s44, v40, 12
+; GCN-NEXT: v_readlane_b32 s43, v40, 11
+; GCN-NEXT: v_readlane_b32 s42, v40, 10
+; GCN-NEXT: v_readlane_b32 s41, v40, 9
+; GCN-NEXT: v_readlane_b32 s40, v40, 8
+; GCN-NEXT: v_readlane_b32 s39, v40, 7
+; GCN-NEXT: v_readlane_b32 s38, v40, 6
+; GCN-NEXT: v_readlane_b32 s37, v40, 5
+; GCN-NEXT: v_readlane_b32 s36, v40, 4
+; GCN-NEXT: v_readlane_b32 s35, v40, 3
+; GCN-NEXT: v_readlane_b32 s34, v40, 2
 ; GCN-NEXT: v_readlane_b32 s31, v40, 1
 ; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT: s_addk_i32 s32, 0xfc00
-; GCN-NEXT: v_readlane_b32 s33, v40, 2
+; GCN-NEXT: v_readlane_b32 s33, v40, 16
 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: .LBB0_9: ; %bb2
+; GCN-NEXT: v_cmp_eq_u32_e64 s[46:47], 21, v0
+; GCN-NEXT: v_cmp_ne_u32_e64 s[6:7], 21, v0
+; GCN-NEXT: s_mov_b64 vcc, exec
+; GCN-NEXT: s_cbranch_execnz .LBB0_2
+; GCN-NEXT: .LBB0_10: ; %bb4
+; GCN-NEXT: s_mov_b64 s[4:5], -1
+; GCN-NEXT: v_cmp_ne_u32_e64 s[6:7], 9, v0
+; GCN-NEXT: s_andn2_b64 vcc, exec, s[6:7]
+; GCN-NEXT: s_cbranch_vccz .LBB0_3
+; GCN-NEXT: s_branch .LBB0_4
 ; SI-OPT-LABEL: @widget(
 ; SI-OPT-NEXT: bb:
 ; SI-OPT-NEXT: [[TMP:%.*]] = load i32, i32 addrspace(1)* null, align 16
-- 
2.7.4