From d53699cc4570b18e41dbda4a2ee643a8db5ef66f Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Sat, 4 Feb 2023 08:09:16 -0400
Subject: [PATCH] AMDGPU: Add some regression tests that infinite-looped the
 combiner

Prevent a future patch from introducing an infinite combine loop.
---
 llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll | 441 ++++++++++++++++++++++++++
 1 file changed, 441 insertions(+)

diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
index 3e066cf..cc32c25 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -2780,11 +2780,452 @@ define <2 x half> @fadd_select_fneg_fneg_v2f16(i32 %arg0, <2 x half> %x, <2 x ha
   ret <2 x half> %add
 }
 
+; --------------------------------------------------------------------------------
+; select tests
+; --------------------------------------------------------------------------------
+
+define amdgpu_kernel void @s_fneg_select_infloop_regression_f32(float %arg, i1 %arg1, ptr addrspace(1) %ptr) {
+; SI-LABEL: s_fneg_select_infloop_regression_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: v_bfrev_b32_e32 v0, 1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_bitcmp1_b32 s1, 0
+; SI-NEXT: v_mov_b32_e32 v1, s0
+; SI-NEXT: s_cselect_b64 vcc, -1, 0
+; SI-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: flat_store_dword v[0:1], v2
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fneg_select_infloop_regression_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: v_bfrev_b32_e32 v0, 1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_bitcmp1_b32 s1, 0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: s_cselect_b64 vcc, -1, 0
+; VI-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+  %i = select i1 %arg1, float 0.0, float %arg
+  %i2 = fneg float %i
+  %i3 = select i1 %arg1, float 0.0, float %i2
+  store float %i3, ptr addrspace(1) %ptr, align 4
+  ret void
+}
+
+define float @v_fneg_select_infloop_regression_f32(float %arg, i1 %arg1) {
+; GCN-LABEL: v_fneg_select_infloop_regression_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 1, v1
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GCN-NEXT: v_bfrev_b32_e32 v1, 1
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+  %i = select i1 %arg1, float 0.0, float %arg
+  %i2 = fneg float %i
+  %i3 = select i1 %arg1, float 0.0, float %i2
+  ret float %i3
+}
+
+define amdgpu_kernel void @s_fneg_select_infloop_regression_f64(double %arg, i1 %arg1, ptr addrspace(1) %ptr) {
+; SI-LABEL: s_fneg_select_infloop_regression_f64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_bitcmp1_b32 s4, 0
+; SI-NEXT: s_cselect_b32 s3, 0, s3
+; SI-NEXT: s_cselect_b32 s2, 0, s2
+; SI-NEXT: s_cselect_b32 s3, 0x80000000, s3
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: s_xor_b32 s2, s3, 0x80000000
+; SI-NEXT: v_mov_b32_e32 v3, s1
+; SI-NEXT: v_mov_b32_e32 v1, s2
+; SI-NEXT: v_mov_b32_e32 v2, s0
+; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fneg_select_infloop_regression_f64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_bitcmp1_b32 s4, 0
+; VI-NEXT: s_cselect_b32 s3, 0, s3
+; VI-NEXT: s_cselect_b32 s2, 0, s2
+; VI-NEXT: s_cselect_b32 s3, 0x80000000, s3
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: s_xor_b32 s2, s3, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
+  %i = select i1 %arg1, double 0.0, double %arg
+  %i2 = fneg double %i
+  %i3 = select i1 %arg1, double 0.0, double %i2
+  store double %i3, ptr addrspace(1) %ptr, align 4
+  ret void
+}
+
+define double @v_fneg_select_infloop_regression_f64(double %arg, i1 %arg1) {
+; GCN-LABEL: v_fneg_select_infloop_regression_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v2, 1, v2
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
+; GCN-NEXT: v_bfrev_b32_e32 v2, 1
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+  %i = select i1 %arg1, double 0.0, double %arg
+  %i2 = fneg double %i
+  %i3 = select i1 %arg1, double 0.0, double %i2
+  ret double %i3
+}
+
+define amdgpu_kernel void @s_fneg_select_infloop_regression_f16(half %arg, i1 %arg1, ptr addrspace(1) %ptr) {
+; SI-LABEL: s_fneg_select_infloop_regression_f16:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s2, s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
+; SI-NEXT: v_bfrev_b32_e32 v1, 1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
+; SI-NEXT: s_bitcmp1_b32 s2, 16
+; SI-NEXT: s_cselect_b64 vcc, -1, 0
+; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-NEXT: v_cvt_f16_f32_e64 v2, -v0
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: flat_store_short v[0:1], v2
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fneg_select_infloop_regression_f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s2, s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; VI-NEXT: v_mov_b32_e32 v0, 0x8000
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_bitcmp1_b32 s2, 16
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: s_cselect_b64 vcc, -1, 0
+; VI-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; VI-NEXT: v_xor_b32_e32 v2, 0x8000, v0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: flat_store_short v[0:1], v2
+; VI-NEXT: s_endpgm
+  %i = select i1 %arg1, half 0.0, half %arg
+  %i2 = fneg half %i
+  %i3 = select i1 %arg1, half 0.0, half %i2
+  store half %i3, ptr addrspace(1) %ptr, align 4
+  ret void
+}
+
+define half @v_fneg_select_infloop_regression_f16(half %arg, i1 %arg1) {
+; SI-LABEL: v_fneg_select_infloop_regression_f16:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_and_b32_e32 v1, 1, v1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; SI-NEXT: v_bfrev_b32_e32 v1, 1
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_fneg_select_infloop_regression_f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_and_b32_e32 v1, 1, v1
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; VI-NEXT: v_mov_b32_e32 v1, 0x8000
+; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+  %i = select i1 %arg1, half 0.0, half %arg
+  %i2 = fneg half %i
+  %i3 = select i1 %arg1, half 0.0, half %i2
+  ret half %i3
+}
+
+define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f16(<2 x half> %arg, i1 %arg1, ptr addrspace(1) %ptr) {
+; SI-LABEL: s_fneg_select_infloop_regression_v2f16:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_and_b32 s1, 1, s1
+; SI-NEXT: s_cselect_b32 s0, 0, s0
+; SI-NEXT: s_xor_b32 s0, s0, 0x80008000
+; SI-NEXT: s_cmp_eq_u32 s1, 1
+; SI-NEXT: s_cselect_b32 s0, 0, s0
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: v_mov_b32_e32 v2, s0
+; SI-NEXT: flat_store_dword v[0:1], v2
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fneg_select_infloop_regression_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_and_b32 s1, 1, s1
+; VI-NEXT: s_cselect_b32 s0, 0, s0
+; VI-NEXT: s_xor_b32 s0, s0, 0x80008000
+; VI-NEXT: s_cmp_eq_u32 s1, 1
+; VI-NEXT: s_cselect_b32 s0, 0, s0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+  %i = select i1 %arg1, <2 x half> zeroinitializer, <2 x half> %arg
+  %i2 = fneg <2 x half> %i
+  %i3 = select i1 %arg1, <2 x half> zeroinitializer, <2 x half> %i2
+  store <2 x half> %i3, ptr addrspace(1) %ptr, align 4
+  ret void
+}
+
+define <2 x half> @v_fneg_select_infloop_regression_v2f16(<2 x half> %arg, i1 %arg1) {
+; SI-LABEL: v_fneg_select_infloop_regression_v2f16:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_and_b32_e32 v1, 1, v2
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; SI-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
+; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, vcc
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_fneg_select_infloop_regression_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_and_b32_e32 v1, 1, v1
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; VI-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
+; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; VI-NEXT: s_setpc_b64 s[30:31]
+  %i = select i1 %arg1, <2 x half> zeroinitializer, <2 x half> %arg
+  %i2 = fneg <2 x half> %i
+  %i3 = select i1 %arg1, <2 x half> zeroinitializer, <2 x half> %i2
+  ret <2 x half> %i3
+}
+
+define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f32(<2 x float> %arg, i1 %arg1, ptr addrspace(1) %ptr) {
+; SI-LABEL: s_fneg_select_infloop_regression_v2f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_and_b32 s4, 1, s4
+; SI-NEXT: s_cselect_b32 s2, 0, s2
+; SI-NEXT: s_xor_b32 s2, s2, 0x80000000
+; SI-NEXT: s_cmp_eq_u32 s4, 1
+; SI-NEXT: s_cselect_b32 s3, 0, s3
+; SI-NEXT: s_cselect_b32 s2, 0, s2
+; SI-NEXT: s_xor_b32 s3, s3, 0x80000000
+; SI-NEXT: s_cmp_eq_u32 s4, 1
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: s_cselect_b32 s2, 0, s3
+; SI-NEXT: v_mov_b32_e32 v3, s1
+; SI-NEXT: v_mov_b32_e32 v1, s2
+; SI-NEXT: v_mov_b32_e32 v2, s0
+; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fneg_select_infloop_regression_v2f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_and_b32 s4, 1, s4
+; VI-NEXT: s_cselect_b32 s2, 0, s2
+; VI-NEXT: s_xor_b32 s2, s2, 0x80000000
+; VI-NEXT: s_cmp_eq_u32 s4, 1
+; VI-NEXT: s_cselect_b32 s3, 0, s3
+; VI-NEXT: s_cselect_b32 s2, 0, s2
+; VI-NEXT: s_xor_b32 s3, s3, 0x80000000
+; VI-NEXT: s_cmp_eq_u32 s4, 1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: s_cselect_b32 s2, 0, s3
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
+  %i = select i1 %arg1, <2 x float> zeroinitializer, <2 x float> %arg
+  %i2 = fneg <2 x float> %i
+  %i3 = select i1 %arg1, <2 x float> zeroinitializer, <2 x float> %i2
+  store <2 x float> %i3, ptr addrspace(1) %ptr, align 4
+  ret void
+}
+
+define <2 x float> @v_fneg_select_infloop_regression_v2f32(<2 x float> %arg, i1 %arg1) {
+; GCN-LABEL: v_fneg_select_infloop_regression_v2f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v2, 1, v2
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
+; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
+  %i = select i1 %arg1, <2 x float> zeroinitializer, <2 x float> %arg
+  %i2 = fneg <2 x float> %i
+  %i3 = select i1 %arg1, <2 x float> zeroinitializer, <2 x float> %i2
+  ret <2 x float> %i3
+}
+
+define amdgpu_kernel void @s_fabs_select_infloop_regression_f32(float %arg, i1 %arg1, ptr addrspace(1) %ptr) {
+; SI-LABEL: s_fabs_select_infloop_regression_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_bitcmp1_b32 s1, 0
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1]
+; SI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v0
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: flat_store_dword v[0:1], v2
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fabs_select_infloop_regression_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_bitcmp1_b32 s1, 0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1]
+; VI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+  %i = select i1 %arg1, float 0.0, float %arg
+  %i2 = call float @llvm.fabs.f32(float %i)
+  %i3 = select i1 %arg1, float 0.0, float %i2
+  store float %i3, ptr addrspace(1) %ptr, align 4
+  ret void
+}
+
+define float @v_fabs_select_infloop_regression_f32(float %arg, i1 %arg1) {
+; GCN-LABEL: v_fabs_select_infloop_regression_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 1, v1
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+  %i = select i1 %arg1, float 0.0, float %arg
+  %i2 = call float @llvm.fabs.f32(float %i)
+  %i3 = select i1 %arg1, float 0.0, float %i2
+  ret float %i3
+}
+
+define amdgpu_kernel void @s_fneg_fabs_select_infloop_regression(float %arg, i1 %arg1, ptr addrspace(1) %ptr) {
+; SI-LABEL: s_fneg_fabs_select_infloop_regression:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: v_bfrev_b32_e32 v0, 1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_bitcmp1_b32 s1, 0
+; SI-NEXT: v_mov_b32_e32 v1, s0
+; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; SI-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
+; SI-NEXT: v_cndmask_b32_e64 v0, |v1|, v0, s[0:1]
+; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: flat_store_dword v[0:1], v2
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fneg_fabs_select_infloop_regression:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: v_bfrev_b32_e32 v0, 1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_bitcmp1_b32 s1, 0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
+; VI-NEXT: v_cndmask_b32_e64 v0, |v1|, v0, s[0:1]
+; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+  %i = select i1 %arg1, float 0.0, float %arg
+  %i2 = call float @llvm.fabs.f32(float %i)
+  %neg.i2 = fneg float %i2
+  %i3 = select i1 %arg1, float 0.0, float %neg.i2
+  store float %i3, ptr addrspace(1) %ptr, align 4
+  ret void
+}
+
+define float @v_fneg_fabs_select_infloop_regression(float %arg, i1 %arg1) {
+; GCN-LABEL: v_fneg_fabs_select_infloop_regression:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 1, v1
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GCN-NEXT: v_bfrev_b32_e32 v1, 1
+; GCN-NEXT: v_cndmask_b32_e64 v0, |v0|, v1, vcc
+; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+  %i = select i1 %arg1, float 0.0, float %arg
+  %i2 = call float @llvm.fabs.f32(float %i)
+  %neg.i2 = fneg float %i2
+  %i3 = select i1 %arg1, float 0.0, float %neg.i2
+  ret float %i3
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 declare float @llvm.fma.f32(float, float, float) #1
 declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
 declare float @llvm.fmuladd.f32(float, float, float) #1
 declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) #1
+declare float @llvm.fabs.f32(float) #1
 declare float @llvm.sin.f32(float) #1
 declare float @llvm.trunc.f32(float) #1
 declare float @llvm.round.f32(float) #1
-- 
2.7.4
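
Note: all of the tests above exercise the same shape: a select between zero
and a value, an fneg (or an fabs, or an fneg of fabs) of that select, and a
second select against zero on the same condition. Distilled to a minimal
sketch in LLVM IR (the function name below is illustrative and not part of
the patch):

define float @fneg_select_infloop_reproducer(float %x, i1 %c) {
  ; select, negate, then select again on the same condition
  %sel0 = select i1 %c, float 0.0, float %x
  %neg = fneg float %sel0
  %sel1 = select i1 %c, float 0.0, float %neg
  ret float %sel1
}

Presumably the hazard is that a combine which sinks the fneg into the first
select (yielding select %c, -0.0, (fneg %x)) can be undone by the inverse
fold, so an unguarded combiner would ping-pong between the two forms
forever; the CHECK lines above pin down the current, terminating codegen.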