if (!isUniformlyReached(DA, *BB))
ReturningBlocks.push_back(BB);
} else if (isa<UnreachableInst>(BB->getTerminator())) {
- if (!isUniformlyReached(DA, *BB))
- UnreachableBlocks.push_back(BB);
+ // TODO: For now we unify UnreachableBlocks even though they are uniformly
+ // reachable. This works around a limitation of the structurizer, which
+ // cannot handle multiple function exits. Once the structurizer is able to
+ // handle multiple function exits, we should only unify UnreachableBlocks
+ // that are not uniformly reachable.
+ UnreachableBlocks.push_back(BB);
} else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext());
define amdgpu_kernel void @sgpr_trunc_brcond(i32 %cond) {
; GCN-LABEL: sgpr_trunc_brcond:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s0, s[0:1], 0x9
+; GCN-NEXT: s_load_dword s1, s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s0, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_xor_b32 s1, s1, -1
+; GCN-NEXT: s_and_b32 s1, s1, 1
+; GCN-NEXT: s_cmp_lg_u32 s1, 0
+; GCN-NEXT: s_cbranch_scc0 .LBB3_2
+; GCN-NEXT: ; %bb.1: ; %bb1
+; GCN-NEXT: v_mov_b32_e32 v0, 1
+; GCN-NEXT: s_mov_b32 s0, 0
+; GCN-NEXT: flat_store_dword v[0:1], v0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: .LBB3_2: ; %Flow
; GCN-NEXT: s_xor_b32 s0, s0, -1
; GCN-NEXT: s_and_b32 s0, s0, 1
; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_cbranch_scc1 .LBB3_2
-; GCN-NEXT: ; %bb.1: ; %bb0
+; GCN-NEXT: s_cbranch_scc1 .LBB3_4
+; GCN-NEXT: ; %bb.3: ; %bb0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: flat_store_dword v[0:1], v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: .LBB3_2: ; %bb1
-; GCN-NEXT: v_mov_b32_e32 v0, 1
-; GCN-NEXT: flat_store_dword v[0:1], v0
-; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: .LBB3_4: ; %UnifiedUnreachableBlock
entry:
%trunc = trunc i32 %cond to i1
br i1 %trunc, label %bb0, label %bb1
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_and_b32 s0, s0, s1
+; GCN-NEXT: s_and_b32 s1, s0, s1
+; GCN-NEXT: s_xor_b32 s1, s1, -1
+; GCN-NEXT: s_and_b32 s1, s1, 1
+; GCN-NEXT: s_mov_b32 s0, -1
+; GCN-NEXT: s_cmp_lg_u32 s1, 0
+; GCN-NEXT: s_cbranch_scc0 .LBB4_2
+; GCN-NEXT: ; %bb.1: ; %bb1
+; GCN-NEXT: v_mov_b32_e32 v0, 1
+; GCN-NEXT: s_mov_b32 s0, 0
+; GCN-NEXT: flat_store_dword v[0:1], v0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: .LBB4_2: ; %Flow
; GCN-NEXT: s_xor_b32 s0, s0, -1
; GCN-NEXT: s_and_b32 s0, s0, 1
; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_cbranch_scc1 .LBB4_2
-; GCN-NEXT: ; %bb.1: ; %bb0
+; GCN-NEXT: s_cbranch_scc1 .LBB4_4
+; GCN-NEXT: ; %bb.3: ; %bb0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: flat_store_dword v[0:1], v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: .LBB4_2: ; %bb1
-; GCN-NEXT: v_mov_b32_e32 v0, 1
-; GCN-NEXT: flat_store_dword v[0:1], v0
-; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: .LBB4_4: ; %UnifiedUnreachableBlock
entry:
%trunc0 = trunc i32 %cond0 to i1
%trunc1 = trunc i32 %cond1 to i1
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -stop-after=amdgpu-unify-divergent-exit-nodes | FileCheck %s --check-prefix=UNIFY
+; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -verify-machineinstrs | FileCheck %s
+
+declare void @llvm.trap()
+declare i32 @llvm.amdgcn.workitem.id.x()
+
+define amdgpu_kernel void @kernel(i32 %a, i32 addrspace(1)* %x, i32 noundef %n) {
+; This used to bypass the structurization process because the structurizer is
+; unable to handle a multiple-exit CFG. It should now be structurized correctly.
+; UNIFY-LABEL: define amdgpu_kernel void @kernel
+; UNIFY-LABEL: entry:
+; UNIFY: %tid = call i32 @llvm.amdgcn.workitem.id.x()
+; UNIFY-NEXT: %cmp = icmp eq i32 %n.load, 256
+; UNIFY-NEXT: br i1 %cmp, label %if.then, label %if.else
+; UNIFY-LABEL: if.then:
+; UNIFY-NEXT: %cmp1 = icmp eq i32 %a.load, 0
+; UNIFY-NEXT: br i1 %cmp1, label %if.end6.sink.split, label %cond.false
+; UNIFY-LABEL: cond.false:
+; UNIFY-NEXT: call void @llvm.trap()
+; UNIFY-NEXT: br label %UnifiedUnreachableBlock
+; UNIFY-LABEL: if.else:
+; UNIFY-NEXT: %cmp2 = icmp ult i32 %tid, 10
+; UNIFY-NEXT: br i1 %cmp2, label %if.then3, label %UnifiedReturnBlock
+; UNIFY-LABEL: if.then3:
+; UNIFY-NEXT: %cmp1.i7 = icmp eq i32 %a.load, 0
+; UNIFY-NEXT: br i1 %cmp1.i7, label %if.end6.sink.split, label %cond.false.i8
+; UNIFY-LABEL: cond.false.i8:
+; UNIFY-NEXT: call void @llvm.trap()
+; UNIFY-NEXT: br label %UnifiedUnreachableBlock
+; UNIFY-LABEL: if.end6.sink.split:
+; UNIFY-NEXT: %idxprom = sext i32 %tid to i64
+; UNIFY-NEXT: %x1 = getelementptr inbounds i32, i32 addrspace(1)* %x.load, i64 %idxprom
+; UNIFY-NEXT: store i32 %a.load, i32 addrspace(1)* %x1, align 4
+; UNIFY-NEXT: br label %UnifiedReturnBlock
+; UNIFY-LABEL: UnifiedUnreachableBlock:
+; UNIFY-NEXT: call void @llvm.amdgcn.unreachable()
+; UNIFY-NEXT: br label %UnifiedReturnBlock
+; UNIFY-LABEL: UnifiedReturnBlock:
+; UNIFY-NEXT: ret void
+
+; CHECK-LABEL: kernel:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_load_dword s0, s[4:5], 0x10
+; CHECK-NEXT: s_load_dword s10, s[4:5], 0x0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_cmpk_lg_i32 s0, 0x100
+; CHECK-NEXT: s_cbranch_scc0 .LBB0_6
+; CHECK-NEXT: ; %bb.1: ; %if.else
+; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 10, v0
+; CHECK-NEXT: s_mov_b64 s[6:7], 0
+; CHECK-NEXT: s_mov_b64 s[2:3], 0
+; CHECK-NEXT: s_mov_b64 s[0:1], 0
+; CHECK-NEXT: s_and_saveexec_b64 s[8:9], vcc
+; CHECK-NEXT: s_cbranch_execz .LBB0_5
+; CHECK-NEXT: ; %bb.2: ; %if.then3
+; CHECK-NEXT: s_cmp_lg_u32 s10, 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB0_14
+; CHECK-NEXT: ; %bb.3:
+; CHECK-NEXT: s_mov_b64 s[2:3], 0
+; CHECK-NEXT: s_mov_b64 s[0:1], -1
+; CHECK-NEXT: .LBB0_4: ; %Flow3
+; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; CHECK-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; CHECK-NEXT: .LBB0_5: ; %Flow2
+; CHECK-NEXT: s_or_b64 exec, exec, s[8:9]
+; CHECK-NEXT: s_and_b64 vcc, exec, s[6:7]
+; CHECK-NEXT: s_cbranch_vccz .LBB0_8
+; CHECK-NEXT: s_branch .LBB0_7
+; CHECK-NEXT: .LBB0_6:
+; CHECK-NEXT: s_mov_b64 s[2:3], 0
+; CHECK-NEXT: s_mov_b64 s[0:1], 0
+; CHECK-NEXT: s_cbranch_execz .LBB0_8
+; CHECK-NEXT: .LBB0_7: ; %if.then
+; CHECK-NEXT: s_cmp_lg_u32 s10, 0
+; CHECK-NEXT: s_mov_b64 s[0:1], -1
+; CHECK-NEXT: s_cbranch_scc1 .LBB0_13
+; CHECK-NEXT: .LBB0_8: ; %Flow4
+; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[2:3]
+; CHECK-NEXT: .LBB0_9: ; %UnifiedUnreachableBlock
+; CHECK-NEXT: ; divergent unreachable
+; CHECK-NEXT: .LBB0_10: ; %Flow6
+; CHECK-NEXT: s_or_b64 exec, exec, s[6:7]
+; CHECK-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
+; CHECK-NEXT: s_cbranch_execz .LBB0_12
+; CHECK-NEXT: ; %bb.11: ; %if.end6.sink.split
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT: v_mov_b32_e32 v1, s10
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_dword v0, v1, s[0:1]
+; CHECK-NEXT: .LBB0_12: ; %UnifiedReturnBlock
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: .LBB0_13: ; %cond.false
+; CHECK-NEXT: s_mov_b64 s[0:1], 0
+; CHECK-NEXT: s_or_b64 s[2:3], s[2:3], exec
+; CHECK-NEXT: s_trap 2
+; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[2:3]
+; CHECK-NEXT: s_cbranch_execnz .LBB0_9
+; CHECK-NEXT: s_branch .LBB0_10
+; CHECK-NEXT: .LBB0_14: ; %cond.false.i8
+; CHECK-NEXT: s_mov_b64 s[2:3], -1
+; CHECK-NEXT: s_mov_b64 s[0:1], 0
+; CHECK-NEXT: s_trap 2
+; CHECK-NEXT: s_branch .LBB0_4
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %cmp = icmp eq i32 %n, 256
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ %cmp1 = icmp eq i32 %a, 0
+ br i1 %cmp1, label %if.end6.sink.split, label %cond.false
+
+cond.false:
+ call void @llvm.trap()
+ unreachable
+
+if.else:
+ %cmp2 = icmp ult i32 %tid, 10
+ br i1 %cmp2, label %if.then3, label %if.end6
+
+if.then3:
+ %cmp1.i7 = icmp eq i32 %a, 0
+ br i1 %cmp1.i7, label %if.end6.sink.split, label %cond.false.i8
+
+cond.false.i8:
+ call void @llvm.trap()
+ unreachable
+
+if.end6.sink.split:
+ %x1 = getelementptr inbounds i32, i32 addrspace(1)* %x, i32 %tid
+ store i32 %a, i32 addrspace(1)* %x1, align 4
+ br label %if.end6
+
+if.end6:
+ ret void
+}
; SI-LABEL: no_skip_no_successors:
; SI: ; %bb.0: ; %bb
; SI-NEXT: v_cmp_nge_f32_e64 s[4:5], s1, 0
+; SI-NEXT: s_mov_b64 s[2:3], exec
+; SI-NEXT: s_mov_b64 s[0:1], -1
; SI-NEXT: s_and_b64 vcc, exec, s[4:5]
; SI-NEXT: s_cbranch_vccz .LBB12_3
-; SI-NEXT: ; %bb.1: ; %bb6
-; SI-NEXT: s_mov_b64 s[2:3], exec
+; SI-NEXT: ; %bb.1: ; %Flow
+; SI-NEXT: s_and_b64 vcc, exec, s[0:1]
+; SI-NEXT: s_cbranch_vccnz .LBB12_4
+; SI-NEXT: .LBB12_2: ; %UnifiedUnreachableBlock
+; SI-NEXT: .LBB12_3: ; %bb3
+; SI-NEXT: s_branch .LBB12_2
+; SI-NEXT: .LBB12_4: ; %bb6
; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], exec
-; SI-NEXT: s_cbranch_scc0 .LBB12_5
-; SI-NEXT: ; %bb.2: ; %bb6
+; SI-NEXT: s_cbranch_scc0 .LBB12_6
+; SI-NEXT: ; %bb.5: ; %bb6
; SI-NEXT: s_mov_b64 exec, 0
-; SI-NEXT: .LBB12_3: ; %bb3
-; SI-NEXT: v_mov_b32_e32 v0, 0x3e7ae148
-; SI-NEXT: v_cmp_nge_f32_e32 vcc, s0, v0
-; SI-NEXT: s_and_b64 vcc, exec, vcc
-; SI-NEXT: ; %bb.4: ; %bb5
-; SI-NEXT: .LBB12_5:
+; SI-NEXT: .LBB12_6:
; SI-NEXT: s_mov_b64 exec, 0
; SI-NEXT: exp null off, off, off, off done vm
; SI-NEXT: s_endpgm
;
; GFX10-WAVE64-LABEL: no_skip_no_successors:
; GFX10-WAVE64: ; %bb.0: ; %bb
-; GFX10-WAVE64-NEXT: v_cmp_nge_f32_e64 s[4:5], s1, 0
-; GFX10-WAVE64-NEXT: s_and_b64 vcc, exec, s[4:5]
-; GFX10-WAVE64-NEXT: s_cbranch_vccz .LBB12_3
-; GFX10-WAVE64-NEXT: ; %bb.1: ; %bb6
+; GFX10-WAVE64-NEXT: v_cmp_nge_f32_e64 s[0:1], s1, 0
; GFX10-WAVE64-NEXT: s_mov_b64 s[2:3], exec
+; GFX10-WAVE64-NEXT: s_and_b64 vcc, exec, s[0:1]
+; GFX10-WAVE64-NEXT: s_mov_b64 s[0:1], -1
+; GFX10-WAVE64-NEXT: s_cbranch_vccz .LBB12_3
+; GFX10-WAVE64-NEXT: ; %bb.1: ; %Flow
+; GFX10-WAVE64-NEXT: s_and_b64 vcc, exec, s[0:1]
+; GFX10-WAVE64-NEXT: s_cbranch_vccnz .LBB12_4
+; GFX10-WAVE64-NEXT: .LBB12_2: ; %UnifiedUnreachableBlock
+; GFX10-WAVE64-NEXT: .LBB12_3: ; %bb3
+; GFX10-WAVE64-NEXT: s_branch .LBB12_2
+; GFX10-WAVE64-NEXT: .LBB12_4: ; %bb6
; GFX10-WAVE64-NEXT: s_andn2_b64 s[2:3], s[2:3], exec
-; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB12_5
-; GFX10-WAVE64-NEXT: ; %bb.2: ; %bb6
+; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB12_6
+; GFX10-WAVE64-NEXT: ; %bb.5: ; %bb6
; GFX10-WAVE64-NEXT: s_mov_b64 exec, 0
-; GFX10-WAVE64-NEXT: .LBB12_3: ; %bb3
-; GFX10-WAVE64-NEXT: v_cmp_nle_f32_e64 s[0:1], 0x3e7ae148, s0
-; GFX10-WAVE64-NEXT: s_and_b64 vcc, exec, s[0:1]
-; GFX10-WAVE64-NEXT: ; %bb.4: ; %bb5
-; GFX10-WAVE64-NEXT: .LBB12_5:
+; GFX10-WAVE64-NEXT: .LBB12_6:
; GFX10-WAVE64-NEXT: s_mov_b64 exec, 0
; GFX10-WAVE64-NEXT: exp null off, off, off, off done vm
; GFX10-WAVE64-NEXT: s_endpgm
; GFX10-WAVE32-LABEL: no_skip_no_successors:
; GFX10-WAVE32: ; %bb.0: ; %bb
; GFX10-WAVE32-NEXT: v_cmp_nge_f32_e64 s1, s1, 0
+; GFX10-WAVE32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-WAVE32-NEXT: s_and_b32 vcc_lo, exec_lo, s1
+; GFX10-WAVE32-NEXT: s_mov_b32 s1, -1
; GFX10-WAVE32-NEXT: s_cbranch_vccz .LBB12_3
-; GFX10-WAVE32-NEXT: ; %bb.1: ; %bb6
-; GFX10-WAVE32-NEXT: s_mov_b32 s2, exec_lo
-; GFX10-WAVE32-NEXT: s_andn2_b32 s2, s2, exec_lo
-; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB12_5
-; GFX10-WAVE32-NEXT: ; %bb.2: ; %bb6
-; GFX10-WAVE32-NEXT: s_mov_b32 exec_lo, 0
+; GFX10-WAVE32-NEXT: ; %bb.1: ; %Flow
+; GFX10-WAVE32-NEXT: s_and_b32 vcc_lo, exec_lo, s1
+; GFX10-WAVE32-NEXT: s_cbranch_vccnz .LBB12_4
+; GFX10-WAVE32-NEXT: .LBB12_2: ; %UnifiedUnreachableBlock
; GFX10-WAVE32-NEXT: .LBB12_3: ; %bb3
-; GFX10-WAVE32-NEXT: v_cmp_nle_f32_e64 s0, 0x3e7ae148, s0
-; GFX10-WAVE32-NEXT: s_and_b32 vcc_lo, exec_lo, s0
-; GFX10-WAVE32-NEXT: ; %bb.4: ; %bb5
-; GFX10-WAVE32-NEXT: .LBB12_5:
+; GFX10-WAVE32-NEXT: s_branch .LBB12_2
+; GFX10-WAVE32-NEXT: .LBB12_4: ; %bb6
+; GFX10-WAVE32-NEXT: s_andn2_b32 s0, s0, exec_lo
+; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB12_6
+; GFX10-WAVE32-NEXT: ; %bb.5: ; %bb6
+; GFX10-WAVE32-NEXT: s_mov_b32 exec_lo, 0
+; GFX10-WAVE32-NEXT: .LBB12_6:
; GFX10-WAVE32-NEXT: s_mov_b32 exec_lo, 0
; GFX10-WAVE32-NEXT: exp null off, off, off, off done vm
; GFX10-WAVE32-NEXT: s_endpgm
;
; GFX11-LABEL: no_skip_no_successors:
; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: v_cmp_nge_f32_e64 s[4:5], s1, 0
+; GFX11-NEXT: v_cmp_nge_f32_e64 s[0:1], s1, 0
+; GFX11-NEXT: s_mov_b64 s[2:3], exec
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: s_and_b64 vcc, exec, s[4:5]
+; GFX11-NEXT: s_and_b64 vcc, exec, s[0:1]
+; GFX11-NEXT: s_mov_b64 s[0:1], -1
; GFX11-NEXT: s_cbranch_vccz .LBB12_3
-; GFX11-NEXT: ; %bb.1: ; %bb6
-; GFX11-NEXT: s_mov_b64 s[2:3], exec
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: ; %bb.1: ; %Flow
+; GFX11-NEXT: s_and_b64 vcc, exec, s[0:1]
+; GFX11-NEXT: s_cbranch_vccnz .LBB12_4
+; GFX11-NEXT: .LBB12_2: ; %UnifiedUnreachableBlock
+; GFX11-NEXT: .LBB12_3: ; %bb3
+; GFX11-NEXT: s_branch .LBB12_2
+; GFX11-NEXT: .LBB12_4: ; %bb6
; GFX11-NEXT: s_and_not1_b64 s[2:3], s[2:3], exec
-; GFX11-NEXT: s_cbranch_scc0 .LBB12_5
-; GFX11-NEXT: ; %bb.2: ; %bb6
+; GFX11-NEXT: s_cbranch_scc0 .LBB12_6
+; GFX11-NEXT: ; %bb.5: ; %bb6
; GFX11-NEXT: s_mov_b64 exec, 0
-; GFX11-NEXT: .LBB12_3: ; %bb3
-; GFX11-NEXT: v_cmp_nle_f32_e64 s[0:1], 0x3e7ae148, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: s_and_b64 vcc, exec, s[0:1]
-; GFX11-NEXT: ; %bb.4: ; %bb5
-; GFX11-NEXT: .LBB12_5:
+; GFX11-NEXT: .LBB12_6:
; GFX11-NEXT: s_mov_b64 exec, 0
; GFX11-NEXT: exp mrt0 off, off, off, off done
; GFX11-NEXT: s_endpgm
;
; GCN-LABEL: name: test
; GCN: bb.{{[0-9]+}}.entry:
+ ; GCN: bb.{{[0-9]+}}.Flow1:
; GCN: bb.{{[0-9]+}}.entry.true.blk:
; GCN: bb.{{[0-9]+}}.entry.false.blk:
; GCN: bb.{{[0-9]+}}.switch.blk:
; GCN-NOT: bb.{{[0-9]+}}.unreach.blk:
; GCN-NOT: PHI
- ; GCN: bb.{{[0-9]+}}.exit:
+ ; GCN: bb.{{[0-9]+}}.Flow:
+ ; GCN: bb.{{[0-9]+}}.UnifiedReturnBlock:
entry:
%idx = tail call i32 @llvm.amdgcn.workitem.id.x() #0
br i1 undef, label %entry.true.blk, label %entry.false.blk