From b8616e40daf7a4c910f5fc0201c7ddd64082aaf0 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 18 Mar 2021 17:00:14 +0000 Subject: [PATCH] [AMDGPU] Add atomic optimizer nouse tests Add some atomic optimizer tests where there is no use of the result of the atomic operation, which is a common case in real code. NFC. Differential Revision: https://reviews.llvm.org/D98952 --- .../AMDGPU/atomic_optimizations_local_pointer.ll | 640 +++++++++++++++------ 1 file changed, 476 insertions(+), 164 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index b797e3e..f3de201 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -575,6 +575,162 @@ entry: ret void } +define amdgpu_kernel void @add_i32_varying_nouse() { +; GFX7LESS-LABEL: add_i32_varying_nouse: +; GFX7LESS: ; %bb.0: ; %entry +; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo +; GFX7LESS-NEXT: s_mov_b32 m0, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: ds_add_u32 v1, v0 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_endpgm +; +; GFX8-LABEL: add_i32_varying_nouse: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: s_not_b64 exec, exec +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: s_not_b64 exec, exec +; GFX8-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: s_nop 1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: s_nop 1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: s_nop 1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: s_nop 1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8-NEXT: s_nop 1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8-NEXT: v_readlane_b32 s2, v1, 63 +; GFX8-NEXT: s_mov_b64 exec, s[0:1] +; GFX8-NEXT: s_mov_b32 s0, s2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_cbranch_execz BB3_2 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: ds_add_u32 v0, v2 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: BB3_2: +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: add_i32_varying_nouse: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: s_not_b64 exec, exec +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_not_b64 exec, exec +; GFX9-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s2, v1, 63 +; GFX9-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_cbranch_execz BB3_2 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_add_u32 v0, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: BB3_2: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: add_i32_varying_nouse: +; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_not_b64 exec, exec +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: s_not_b64 exec, exec +; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1064-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-NEXT: v_readlane_b32 s2, v1, 63 +; GFX1064-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 +; GFX1064-NEXT: s_mov_b32 s0, s2 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_cbranch_execz BB3_2 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-NEXT: ds_add_u32 v0, v3 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: buffer_gl0_inv +; GFX1064-NEXT: BB3_2: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: add_i32_varying_nouse: +; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032-NEXT: v_readlane_b32 s1, v1, 31 +; GFX1032-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s0, s1 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_cbranch_execz BB3_2 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v3, s0 +; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-NEXT: ds_add_u32 v0, v3 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: buffer_gl0_inv +; GFX1032-NEXT: BB3_2: +; GFX1032-NEXT: s_endpgm +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel + ret void +} + define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; ; @@ -587,7 +743,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz BB3_2 +; GFX7LESS-NEXT: s_cbranch_execz BB4_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo @@ -598,7 +754,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: BB3_2: +; GFX7LESS-NEXT: BB4_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 @@ -622,7 +778,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB3_2 +; GFX8-NEXT: s_cbranch_execz BB4_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 @@ -633,7 +789,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB3_2: +; GFX8-NEXT: BB4_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 @@ -656,7 +812,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB3_2 +; GFX9-NEXT: s_cbranch_execz BB4_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 @@ -666,7 +822,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB3_2: +; GFX9-NEXT: BB4_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 @@ -689,7 +845,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz BB3_2 +; GFX1064-NEXT: s_cbranch_execz BB4_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo @@ -701,7 +857,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB3_2: +; GFX1064-NEXT: BB4_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 @@ -721,7 +877,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB3_2 +; GFX1032-NEXT: s_cbranch_execz BB4_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo @@ -733,7 +889,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB3_2: +; GFX1032-NEXT: BB4_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 @@ -762,7 +918,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz BB4_2 +; GFX7LESS-NEXT: s_cbranch_execz BB5_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo @@ -777,7 +933,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: BB4_2: +; GFX7LESS-NEXT: BB5_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 @@ -805,7 +961,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz BB4_2 +; GFX8-NEXT: s_cbranch_execz BB5_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v1, s6 @@ -820,7 +976,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB4_2: +; GFX8-NEXT: BB5_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s4, s0 @@ -848,7 +1004,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz BB4_2 +; GFX9-NEXT: s_cbranch_execz BB5_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -862,7 +1018,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB4_2: +; GFX9-NEXT: BB5_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 @@ -890,7 +1046,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz BB4_2 +; GFX1064-NEXT: s_cbranch_execz BB5_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo @@ -906,7 +1062,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB4_2: +; GFX1064-NEXT: BB5_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -931,7 +1087,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB4_2 +; GFX1032-NEXT: s_cbranch_execz BB5_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 ; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo @@ -947,7 +1103,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB4_2: +; GFX1032-NEXT: BB5_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -1062,7 +1218,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz BB6_2 +; GFX7LESS-NEXT: s_cbranch_execz BB7_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 @@ -1072,7 +1228,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: BB6_2: +; GFX7LESS-NEXT: BB7_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 @@ -1092,7 +1248,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz BB6_2 +; GFX8-NEXT: s_cbranch_execz BB7_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX8-NEXT: s_mul_i32 s2, s2, 5 @@ -1102,7 +1258,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB6_2: +; GFX8-NEXT: BB7_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 @@ -1123,7 +1279,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz BB6_2 +; GFX9-NEXT: s_cbranch_execz BB7_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: s_mul_i32 s2, s2, 5 @@ -1132,7 +1288,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB6_2: +; GFX9-NEXT: BB7_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 @@ -1153,7 +1309,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz BB6_2 +; GFX1064-NEXT: s_cbranch_execz BB7_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo @@ -1164,7 +1320,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB6_2: +; GFX1064-NEXT: BB7_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 @@ -1184,7 +1340,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB6_2 +; GFX1032-NEXT: s_cbranch_execz BB7_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo @@ -1195,7 +1351,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB6_2: +; GFX1032-NEXT: BB7_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 @@ -1225,7 +1381,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX7LESS-NEXT: s_cbranch_execz BB7_2 +; GFX7LESS-NEXT: s_cbranch_execz BB8_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s1, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -1236,7 +1392,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: BB7_2: +; GFX7LESS-NEXT: BB8_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 @@ -1257,7 +1413,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX8-NEXT: s_cbranch_execz BB7_2 +; GFX8-NEXT: s_cbranch_execz BB8_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1268,7 +1424,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB7_2: +; GFX8-NEXT: BB8_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 @@ -1289,7 +1445,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz BB7_2 +; GFX9-NEXT: s_cbranch_execz BB8_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s3, s[6:7] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1299,7 +1455,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB7_2: +; GFX9-NEXT: BB8_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 @@ -1321,7 +1477,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz BB7_2 +; GFX1064-NEXT: s_cbranch_execz BB8_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[6:7] ; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo @@ -1333,7 +1489,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB7_2: +; GFX1064-NEXT: BB8_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1355,7 +1511,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB7_2 +; GFX1032-NEXT: s_cbranch_execz BB8_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 ; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo @@ -1367,7 +1523,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB7_2: +; GFX1032-NEXT: BB8_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -1431,7 +1587,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB8_2 +; GFX8-NEXT: s_cbranch_execz BB9_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s4 @@ -1439,7 +1595,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB8_2: +; GFX8-NEXT: BB9_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -1482,14 +1638,14 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB8_2 +; GFX9-NEXT: s_cbranch_execz BB9_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB8_2: +; GFX9-NEXT: BB9_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -1541,7 +1697,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz BB8_2 +; GFX1064-NEXT: s_cbranch_execz BB9_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 @@ -1551,7 +1707,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: ds_sub_rtn_u32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB8_2: +; GFX1064-NEXT: BB9_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 @@ -1592,7 +1748,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB8_2 +; GFX1032-NEXT: s_cbranch_execz BB9_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 @@ -1601,7 +1757,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: ds_sub_rtn_u32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB8_2: +; GFX1032-NEXT: BB9_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 @@ -1618,6 +1774,162 @@ entry: ret void } +define amdgpu_kernel void @sub_i32_varying_nouse() { +; GFX7LESS-LABEL: sub_i32_varying_nouse: +; GFX7LESS: ; %bb.0: ; %entry +; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo +; GFX7LESS-NEXT: s_mov_b32 m0, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: ds_sub_u32 v1, v0 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_endpgm +; +; GFX8-LABEL: sub_i32_varying_nouse: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: s_not_b64 exec, exec +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: s_not_b64 exec, exec +; GFX8-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: s_nop 1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: s_nop 1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: s_nop 1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: s_nop 1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8-NEXT: s_nop 1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8-NEXT: v_readlane_b32 s2, v1, 63 +; GFX8-NEXT: s_mov_b64 exec, s[0:1] +; GFX8-NEXT: s_mov_b32 s0, s2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_cbranch_execz BB10_2 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: ds_sub_u32 v0, v2 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: BB10_2: +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: sub_i32_varying_nouse: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: s_not_b64 exec, exec +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_not_b64 exec, exec +; GFX9-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s2, v1, 63 +; GFX9-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_cbranch_execz BB10_2 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_sub_u32 v0, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: BB10_2: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: sub_i32_varying_nouse: +; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_not_b64 exec, exec +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: s_not_b64 exec, exec +; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1064-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-NEXT: v_readlane_b32 s2, v1, 63 +; GFX1064-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 +; GFX1064-NEXT: s_mov_b32 s0, s2 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_cbranch_execz BB10_2 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-NEXT: ds_sub_u32 v0, v3 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: buffer_gl0_inv +; GFX1064-NEXT: BB10_2: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: sub_i32_varying_nouse: +; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032-NEXT: v_readlane_b32 s1, v1, 31 +; GFX1032-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s0, s1 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_cbranch_execz BB10_2 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v3, s0 +; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-NEXT: ds_sub_u32 v0, v3 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: buffer_gl0_inv +; GFX1032-NEXT: BB10_2: +; GFX1032-NEXT: s_endpgm +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel + ret void +} + define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; ; @@ -1630,7 +1942,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz BB9_2 +; GFX7LESS-NEXT: s_cbranch_execz BB11_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo @@ -1641,7 +1953,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: BB9_2: +; GFX7LESS-NEXT: BB11_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 @@ -1665,7 +1977,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB9_2 +; GFX8-NEXT: s_cbranch_execz BB11_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 @@ -1676,7 +1988,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB9_2: +; GFX8-NEXT: BB11_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s3, v2 @@ -1700,7 +2012,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB9_2 +; GFX9-NEXT: s_cbranch_execz BB11_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 @@ -1710,7 +2022,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB9_2: +; GFX9-NEXT: BB11_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s3, v2 @@ -1734,7 +2046,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz BB9_2 +; GFX1064-NEXT: s_cbranch_execz BB11_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo @@ -1746,7 +2058,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB9_2: +; GFX1064-NEXT: BB11_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 @@ -1769,7 +2081,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB9_2 +; GFX1032-NEXT: s_cbranch_execz BB11_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo @@ -1781,7 +2093,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB9_2: +; GFX1032-NEXT: BB11_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 @@ -1813,7 +2125,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz BB10_2 +; GFX7LESS-NEXT: s_cbranch_execz BB12_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo @@ -1828,7 +2140,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: BB10_2: +; GFX7LESS-NEXT: BB12_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 @@ -1856,7 +2168,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz BB10_2 +; GFX8-NEXT: s_cbranch_execz BB12_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v1, s6 @@ -1871,7 +2183,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB10_2: +; GFX8-NEXT: BB12_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s4, s0 @@ -1899,7 +2211,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz BB10_2 +; GFX9-NEXT: s_cbranch_execz BB12_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1913,7 +2225,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB10_2: +; GFX9-NEXT: BB12_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 @@ -1941,7 +2253,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive ; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz BB10_2 +; GFX1064-NEXT: s_cbranch_execz BB12_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo @@ -1957,7 +2269,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive ; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB10_2: +; GFX1064-NEXT: BB12_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1982,7 +2294,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive ; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB10_2 +; GFX1032-NEXT: s_cbranch_execz BB12_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 ; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo @@ -1998,7 +2310,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive ; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB10_2: +; GFX1032-NEXT: BB12_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -2148,7 +2460,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB12_2 +; GFX8-NEXT: s_cbranch_execz BB14_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s4 @@ -2156,7 +2468,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB12_2: +; GFX8-NEXT: BB14_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -2199,14 +2511,14 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB12_2 +; GFX9-NEXT: s_cbranch_execz BB14_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB12_2: +; GFX9-NEXT: BB14_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -2258,7 +2570,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz BB12_2 +; GFX1064-NEXT: s_cbranch_execz BB14_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 @@ -2268,7 +2580,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: ds_and_rtn_b32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB12_2: +; GFX1064-NEXT: BB14_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 @@ -2309,7 +2621,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB12_2 +; GFX1032-NEXT: s_cbranch_execz BB14_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 @@ -2318,7 +2630,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: ds_and_rtn_b32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB12_2: +; GFX1032-NEXT: BB14_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 @@ -2382,7 +2694,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB13_2 +; GFX8-NEXT: s_cbranch_execz BB15_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s4 @@ -2390,7 +2702,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB13_2: +; GFX8-NEXT: BB15_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -2433,14 +2745,14 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB13_2 +; GFX9-NEXT: s_cbranch_execz BB15_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB13_2: +; GFX9-NEXT: BB15_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -2492,7 +2804,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz BB13_2 +; GFX1064-NEXT: s_cbranch_execz BB15_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 @@ -2502,7 +2814,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: ds_or_rtn_b32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB13_2: +; GFX1064-NEXT: BB15_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 @@ -2543,7 +2855,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB13_2 +; GFX1032-NEXT: s_cbranch_execz BB15_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 @@ -2552,7 +2864,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: ds_or_rtn_b32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB13_2: +; GFX1032-NEXT: BB15_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 @@ -2616,7 +2928,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB14_2 +; GFX8-NEXT: s_cbranch_execz BB16_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s4 @@ -2624,7 +2936,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB14_2: +; GFX8-NEXT: BB16_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -2667,14 +2979,14 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB14_2 +; GFX9-NEXT: s_cbranch_execz BB16_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB14_2: +; GFX9-NEXT: BB16_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -2726,7 +3038,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz BB14_2 +; GFX1064-NEXT: s_cbranch_execz BB16_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 @@ -2736,7 +3048,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: ds_xor_rtn_b32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB14_2: +; GFX1064-NEXT: BB16_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 @@ -2777,7 +3089,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB14_2 +; GFX1032-NEXT: s_cbranch_execz BB16_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 @@ -2786,7 +3098,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: ds_xor_rtn_b32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB14_2: +; GFX1032-NEXT: BB16_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 @@ -2850,7 +3162,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB15_2 +; GFX8-NEXT: s_cbranch_execz BB17_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s4 @@ -2858,7 +3170,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB15_2: +; GFX8-NEXT: BB17_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -2901,14 +3213,14 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB15_2 +; GFX9-NEXT: s_cbranch_execz BB17_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB15_2: +; GFX9-NEXT: BB17_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -2962,7 +3274,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz BB15_2 +; GFX1064-NEXT: s_cbranch_execz BB17_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 @@ -2972,7 +3284,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: ds_max_rtn_i32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB15_2: +; GFX1064-NEXT: BB17_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 @@ -3015,7 +3327,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB15_2 +; GFX1032-NEXT: s_cbranch_execz BB17_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 @@ -3024,7 +3336,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: ds_max_rtn_i32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB15_2: +; GFX1032-NEXT: BB17_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 @@ -3052,7 +3364,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz BB16_2 +; GFX7LESS-NEXT: s_cbranch_execz BB18_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 @@ -3061,7 +3373,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: BB16_2: +; GFX7LESS-NEXT: BB18_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 @@ -3087,7 +3399,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB16_2 +; GFX8-NEXT: s_cbranch_execz BB18_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -3096,7 +3408,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB16_2: +; GFX8-NEXT: BB18_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -3122,7 +3434,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB16_2 +; GFX9-NEXT: s_cbranch_execz BB18_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 ; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -3130,7 +3442,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB16_2: +; GFX9-NEXT: BB18_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -3156,7 +3468,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz BB16_2 +; GFX1064-NEXT: s_cbranch_execz BB18_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 ; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -3166,7 +3478,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB16_2: +; GFX1064-NEXT: BB18_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 @@ -3189,7 +3501,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB16_2 +; GFX1032-NEXT: s_cbranch_execz BB18_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 ; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -3199,7 +3511,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB16_2: +; GFX1032-NEXT: BB18_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 @@ -3267,7 +3579,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB17_2 +; GFX8-NEXT: s_cbranch_execz BB19_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s4 @@ -3275,7 +3587,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB17_2: +; GFX8-NEXT: BB19_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -3318,14 +3630,14 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB17_2 +; GFX9-NEXT: s_cbranch_execz BB19_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB17_2: +; GFX9-NEXT: BB19_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -3379,7 +3691,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz BB17_2 +; GFX1064-NEXT: s_cbranch_execz BB19_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 @@ -3389,7 +3701,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: ds_min_rtn_i32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB17_2: +; GFX1064-NEXT: BB19_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 @@ -3432,7 +3744,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB17_2 +; GFX1032-NEXT: s_cbranch_execz BB19_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 @@ -3441,7 +3753,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: ds_min_rtn_i32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB17_2: +; GFX1032-NEXT: BB19_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 @@ -3469,7 +3781,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz BB18_2 +; GFX7LESS-NEXT: s_cbranch_execz BB20_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 @@ -3478,7 +3790,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: BB18_2: +; GFX7LESS-NEXT: BB20_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 @@ -3504,7 +3816,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB18_2 +; GFX8-NEXT: s_cbranch_execz BB20_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -3513,7 +3825,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB18_2: +; GFX8-NEXT: BB20_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 @@ -3539,7 +3851,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB18_2 +; GFX9-NEXT: s_cbranch_execz BB20_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 ; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -3547,7 +3859,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB18_2: +; GFX9-NEXT: BB20_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 @@ -3573,7 +3885,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz BB18_2 +; GFX1064-NEXT: s_cbranch_execz BB20_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 ; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -3583,7 +3895,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB18_2: +; GFX1064-NEXT: BB20_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 @@ -3606,7 +3918,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB18_2 +; GFX1032-NEXT: s_cbranch_execz BB20_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 ; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -3616,7 +3928,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB18_2: +; GFX1032-NEXT: BB20_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 @@ -3684,7 +3996,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB19_2 +; GFX8-NEXT: s_cbranch_execz BB21_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s4 @@ -3692,7 +4004,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB19_2: +; GFX8-NEXT: BB21_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -3735,14 +4047,14 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB19_2 +; GFX9-NEXT: s_cbranch_execz BB21_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB19_2: +; GFX9-NEXT: BB21_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -3794,7 +4106,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz BB19_2 +; GFX1064-NEXT: s_cbranch_execz BB21_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 @@ -3804,7 +4116,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: ds_max_rtn_u32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB19_2: +; GFX1064-NEXT: BB21_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 @@ -3845,7 +4157,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB19_2 +; GFX1032-NEXT: s_cbranch_execz BB21_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 @@ -3854,7 +4166,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: ds_max_rtn_u32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB19_2: +; GFX1032-NEXT: BB21_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 @@ -3882,7 +4194,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz BB20_2 +; GFX7LESS-NEXT: s_cbranch_execz BB22_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 @@ -3891,7 +4203,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: BB20_2: +; GFX7LESS-NEXT: BB22_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 @@ -3916,7 +4228,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB20_2 +; GFX8-NEXT: s_cbranch_execz BB22_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -3925,7 +4237,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB20_2: +; GFX8-NEXT: BB22_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -3950,7 +4262,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB20_2 +; GFX9-NEXT: s_cbranch_execz BB22_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 ; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -3958,7 +4270,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB20_2: +; GFX9-NEXT: BB22_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -3983,7 +4295,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz BB20_2 +; GFX1064-NEXT: s_cbranch_execz BB22_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 ; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -3993,7 +4305,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB20_2: +; GFX1064-NEXT: BB22_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 @@ -4016,7 +4328,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB20_2 +; GFX1032-NEXT: s_cbranch_execz BB22_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 ; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -4026,7 +4338,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB20_2: +; GFX1032-NEXT: BB22_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 @@ -4094,7 +4406,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB21_2 +; GFX8-NEXT: s_cbranch_execz BB23_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s4 @@ -4102,7 +4414,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB21_2: +; GFX8-NEXT: BB23_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -4145,14 +4457,14 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB21_2 +; GFX9-NEXT: s_cbranch_execz BB23_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB21_2: +; GFX9-NEXT: BB23_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -4204,7 +4516,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz BB21_2 +; GFX1064-NEXT: s_cbranch_execz BB23_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 @@ -4214,7 +4526,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: ds_min_rtn_u32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB21_2: +; GFX1064-NEXT: BB23_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 @@ -4255,7 +4567,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB21_2 +; GFX1032-NEXT: s_cbranch_execz BB23_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 @@ -4264,7 +4576,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: ds_min_rtn_u32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB21_2: +; GFX1032-NEXT: BB23_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 @@ -4292,7 +4604,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz BB22_2 +; GFX7LESS-NEXT: s_cbranch_execz BB24_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 @@ -4301,7 +4613,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: BB22_2: +; GFX7LESS-NEXT: BB24_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 @@ -4326,7 +4638,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB22_2 +; GFX8-NEXT: s_cbranch_execz BB24_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -4335,7 +4647,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB22_2: +; GFX8-NEXT: BB24_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 @@ -4360,7 +4672,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB22_2 +; GFX9-NEXT: s_cbranch_execz BB24_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 ; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -4368,7 +4680,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB22_2: +; GFX9-NEXT: BB24_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 @@ -4393,7 +4705,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz BB22_2 +; GFX1064-NEXT: s_cbranch_execz BB24_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 ; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -4403,7 +4715,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB22_2: +; GFX1064-NEXT: BB24_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 @@ -4426,7 +4738,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB22_2 +; GFX1032-NEXT: s_cbranch_execz BB24_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 ; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -4436,7 +4748,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB22_2: +; GFX1032-NEXT: BB24_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -- 2.7.4