From 8e5a41e8271fb95411a2867db00980abf7fe04fd Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 16 Feb 2023 17:05:33 +0000 Subject: [PATCH] Revert "AMDGPU: Override getNegatedExpression constant handling" This reverts commit 11c3cead23783e65fb30e673d62771352078ff05. It was causing infinite loops in the DAG combiner. --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 7 -- llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll | 10 +- llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll | 25 +++-- llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll | 113 ++++++++++++++-------- llvm/test/CodeGen/AMDGPU/v_pack.ll | 2 +- 5 files changed, 89 insertions(+), 68 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 79fbbf5..bb02eb3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -793,13 +793,6 @@ SDValue AMDGPUTargetLowering::getNegatedExpression( NegatibleCost &Cost, unsigned Depth) const { switch (Op.getOpcode()) { - case ISD::ConstantFP: { - auto *C = cast<ConstantFPSDNode>(Op); - Cost = getConstantNegateCost(C); - APFloat V = C->getValueAPF(); - V.changeSign(); - return DAG.getConstantFP(V, SDLoc(Op), Op.getValueType()); - } case ISD::FMA: case ISD::FMAD: { // Negating a fma is not free if it has users without source mods. 
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll index b2a27b9..f6ac0f6 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll @@ -276,10 +276,10 @@ define amdgpu_kernel void @div_v4_c_by_x_25ulp(ptr addrspace(1) %arg) { } ; GCN-LABEL: {{^}}div_v4_c_by_minus_x_25ulp: -; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, s{{[0-9]+}}, s{{[0-9]+}}, -2.0{{$}} -; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, s{{[0-9]+}}, s{{[0-9]+}}, 2.0{{$}} -; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0, s{{[0-9]+}}, -2.0{{$}} -; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0, s{{[0-9]+}}, 2.0{{$}} +; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}} +; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}} +; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}} +; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}} ; GCN-DENORM-DAG: v_rcp_f32_e32 ; GCN-DENORM-DAG: v_rcp_f32_e32 @@ -301,7 +301,7 @@ define amdgpu_kernel void @div_v4_c_by_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-DAG: v_div_fmas_f32 ; GCN-DENORM-DAG: v_div_fmas_f32 ; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, -2.0{{$}} -; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, 2.0{{$}} +; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, -2.0{{$}} ; GCN-FLUSH-DAG: v_rcp_f32_e32 ; GCN-FLUSH-DAG: v_rcp_f32_e64 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll index f0d9d03..a4cc952 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll @@ -519,9 +519,9 @@ define amdgpu_ps half @fneg_fadd_0_f16(half inreg %tmp2, half inreg %tmp6, <4 x ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, s0 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NSZ-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, -1.0 +; SI-NSZ-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, 1.0 ; SI-NSZ-NEXT: v_rcp_f32_e32 v3, v2 -; SI-NSZ-NEXT: v_div_scale_f32 v4, 
vcc, -1.0, v0, -1.0 +; SI-NSZ-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 ; SI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NSZ-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; SI-NSZ-NEXT: v_fma_f32 v3, v5, v3, v3 @@ -531,8 +531,8 @@ define amdgpu_ps half @fneg_fadd_0_f16(half inreg %tmp2, half inreg %tmp6, <4 x ; SI-NSZ-NEXT: v_fma_f32 v2, -v2, v5, v4 ; SI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NSZ-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; SI-NSZ-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0 -; SI-NSZ-NEXT: v_mul_f32_e32 v0, 0, v0 +; SI-NSZ-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; SI-NSZ-NEXT: v_mul_f32_e32 v0, 0x80000000, v0 ; SI-NSZ-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, v1 ; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000 @@ -563,10 +563,9 @@ define amdgpu_ps half @fneg_fadd_0_f16(half inreg %tmp2, half inreg %tmp6, <4 x ; VI-NSZ-NEXT: v_mov_b32_e32 v2, s0 ; VI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7e00 ; VI-NSZ-NEXT: v_rcp_f32_e32 v0, v0 -; VI-NSZ-NEXT: v_sub_f32_e32 v0, 0x80000000, v0 ; VI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 -; VI-NSZ-NEXT: v_div_fixup_f16 v0, v0, s1, -1.0 -; VI-NSZ-NEXT: v_mul_f16_e32 v0, 0, v0 +; VI-NSZ-NEXT: v_div_fixup_f16 v0, v0, s1, 1.0 +; VI-NSZ-NEXT: v_mul_f16_e32 v0, 0x8000, v0 ; VI-NSZ-NEXT: v_cmp_nlt_f16_e64 vcc, -v0, s0 ; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; VI-NSZ-NEXT: v_cmp_nlt_f16_e32 vcc, 0, v0 @@ -599,15 +598,13 @@ define amdgpu_ps half @fneg_fadd_0_f16(half inreg %tmp2, half inreg %tmp6, <4 x ; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NSZ-NEXT: v_rcp_f32_e32 v0, v0 ; GFX11-NSZ-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NSZ-NEXT: v_sub_f32_e32 v0, 0x80000000, v0 ; GFX11-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NSZ-NEXT: v_div_fixup_f16 v0, v0, s1, 1.0 ; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NSZ-NEXT: v_div_fixup_f16 v0, v0, s1, -1.0 -; GFX11-NSZ-NEXT: 
v_mul_f16_e32 v0, 0, v0 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-NEXT: v_mul_f16_e32 v0, 0x8000, v0 ; GFX11-NSZ-NEXT: v_cmp_nlt_f16_e64 s1, -v0, s0 +; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NSZ-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0 ; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo ; GFX11-NSZ-NEXT: ; return to shader part epilog @@ -644,10 +641,10 @@ define amdgpu_ps half @fneg_fadd_0_nsz_f16(half inreg %tmp2, half inreg %tmp6, < ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, s1 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, s0 ; SI-NSZ-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NSZ-NEXT: v_rcp_f32_e32 v0, v0 -; SI-NSZ-NEXT: v_mul_f32_e32 v0, 0, v0 +; SI-NSZ-NEXT: v_mul_f32_e32 v0, 0x80000000, v0 ; SI-NSZ-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, v1 ; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll index 53f97ad..30177a5 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll @@ -203,9 +203,9 @@ define amdgpu_ps float @fneg_fadd_0_f32(float inreg %tmp2, float inreg %tmp6, <4 ; ; SI-NSZ-LABEL: fneg_fadd_0_f32: ; SI-NSZ: ; %bb.0: ; %.entry -; SI-NSZ-NEXT: v_div_scale_f32 v0, s[2:3], s1, s1, -1.0 +; SI-NSZ-NEXT: v_div_scale_f32 v0, s[2:3], s1, s1, 1.0 ; SI-NSZ-NEXT: v_rcp_f32_e32 v1, v0 -; SI-NSZ-NEXT: v_div_scale_f32 v2, vcc, -1.0, s1, -1.0 +; SI-NSZ-NEXT: v_div_scale_f32 v2, vcc, 1.0, s1, 1.0 ; SI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NSZ-NEXT: v_fma_f32 v3, -v0, v1, 1.0 ; SI-NSZ-NEXT: v_fma_f32 v1, v3, v1, v1 @@ -215,8 +215,8 
@@ define amdgpu_ps float @fneg_fadd_0_f32(float inreg %tmp2, float inreg %tmp6, <4 ; SI-NSZ-NEXT: v_fma_f32 v0, -v0, v3, v2 ; SI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NSZ-NEXT: v_div_fmas_f32 v0, v0, v1, v3 -; SI-NSZ-NEXT: v_div_fixup_f32 v0, v0, s1, -1.0 -; SI-NSZ-NEXT: v_mul_f32_e32 v0, 0, v0 +; SI-NSZ-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0 +; SI-NSZ-NEXT: v_mul_f32_e32 v0, 0x80000000, v0 ; SI-NSZ-NEXT: v_mov_b32_e32 v1, s0 ; SI-NSZ-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, s0 ; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc @@ -251,8 +251,8 @@ define amdgpu_ps float @fneg_fadd_0_f32(float inreg %tmp2, float inreg %tmp6, <4 ; ; VI-NSZ-LABEL: fneg_fadd_0_f32: ; VI-NSZ: ; %bb.0: ; %.entry -; VI-NSZ-NEXT: v_div_scale_f32 v0, s[2:3], s1, s1, -1.0 -; VI-NSZ-NEXT: v_div_scale_f32 v1, vcc, -1.0, s1, -1.0 +; VI-NSZ-NEXT: v_div_scale_f32 v0, s[2:3], s1, s1, 1.0 +; VI-NSZ-NEXT: v_div_scale_f32 v1, vcc, 1.0, s1, 1.0 ; VI-NSZ-NEXT: v_rcp_f32_e32 v2, v0 ; VI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; VI-NSZ-NEXT: v_fma_f32 v3, -v0, v2, 1.0 @@ -265,8 +265,8 @@ define amdgpu_ps float @fneg_fadd_0_f32(float inreg %tmp2, float inreg %tmp6, <4 ; VI-NSZ-NEXT: v_div_fmas_f32 v0, v0, v2, v3 ; VI-NSZ-NEXT: v_mov_b32_e32 v2, s0 ; VI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; VI-NSZ-NEXT: v_div_fixup_f32 v0, v0, s1, -1.0 -; VI-NSZ-NEXT: v_mul_f32_e32 v0, 0, v0 +; VI-NSZ-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0 +; VI-NSZ-NEXT: v_mul_f32_e32 v0, 0x80000000, v0 ; VI-NSZ-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, s0 ; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; VI-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 @@ -575,30 +575,32 @@ define amdgpu_ps double @fneg_fadd_0_f64(double inreg %tmp2, double inreg %tmp6, ; SI-SAFE-NEXT: s_mov_b32 s0, 0 ; SI-SAFE-NEXT: ; return to shader part epilog ; -; GCN-NSZ-LABEL: fneg_fadd_0_f64: -; GCN-NSZ: ; %bb.0: ; %.entry -; GCN-NSZ-NEXT: v_div_scale_f64 v[0:1], s[4:5], s[2:3], s[2:3], -1.0 -; GCN-NSZ-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] -; 
GCN-NSZ-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 -; GCN-NSZ-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[2:3] -; GCN-NSZ-NEXT: v_div_scale_f64 v[4:5], vcc, -1.0, s[2:3], -1.0 -; GCN-NSZ-NEXT: v_fma_f64 v[6:7], -v[0:1], v[2:3], 1.0 -; GCN-NSZ-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] -; GCN-NSZ-NEXT: v_mul_f64 v[6:7], v[4:5], v[2:3] -; GCN-NSZ-NEXT: v_fma_f64 v[0:1], -v[0:1], v[6:7], v[4:5] -; GCN-NSZ-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[6:7] -; GCN-NSZ-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NSZ-NEXT: v_mov_b32_e32 v3, s0 -; GCN-NSZ-NEXT: v_div_fixup_f64 v[0:1], v[0:1], s[2:3], -1.0 -; GCN-NSZ-NEXT: v_mul_f64 v[0:1], v[0:1], 0 -; GCN-NSZ-NEXT: v_cmp_nlt_f64_e64 vcc, -v[0:1], s[0:1] -; GCN-NSZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GCN-NSZ-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] -; GCN-NSZ-NEXT: s_and_b64 s[0:1], vcc, exec -; GCN-NSZ-NEXT: s_cselect_b32 s1, 0, 0x7ff80000 -; GCN-NSZ-NEXT: s_mov_b32 s0, 0 -; GCN-NSZ-NEXT: ; return to shader part epilog +; SI-NSZ-LABEL: fneg_fadd_0_f64: +; SI-NSZ: ; %bb.0: ; %.entry +; SI-NSZ-NEXT: v_div_scale_f64 v[0:1], s[4:5], s[2:3], s[2:3], 1.0 +; SI-NSZ-NEXT: s_mov_b32 s4, 0 +; SI-NSZ-NEXT: s_brev_b32 s5, 1 +; SI-NSZ-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; SI-NSZ-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SI-NSZ-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[2:3] +; SI-NSZ-NEXT: v_div_scale_f64 v[4:5], vcc, 1.0, s[2:3], 1.0 +; SI-NSZ-NEXT: v_fma_f64 v[6:7], -v[0:1], v[2:3], 1.0 +; SI-NSZ-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; SI-NSZ-NEXT: v_mul_f64 v[6:7], v[4:5], v[2:3] +; SI-NSZ-NEXT: v_fma_f64 v[0:1], -v[0:1], v[6:7], v[4:5] +; SI-NSZ-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[6:7] +; SI-NSZ-NEXT: v_mov_b32_e32 v2, s1 +; SI-NSZ-NEXT: v_mov_b32_e32 v3, s0 +; SI-NSZ-NEXT: v_div_fixup_f64 v[0:1], v[0:1], s[2:3], 1.0 +; SI-NSZ-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; SI-NSZ-NEXT: v_cmp_nlt_f64_e64 vcc, -v[0:1], s[0:1] +; SI-NSZ-NEXT: v_cndmask_b32_e32 
v1, v1, v2, vcc +; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; SI-NSZ-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] +; SI-NSZ-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-NSZ-NEXT: s_cselect_b32 s1, 0, 0x7ff80000 +; SI-NSZ-NEXT: s_mov_b32 s0, 0 +; SI-NSZ-NEXT: ; return to shader part epilog ; ; VI-SAFE-LABEL: fneg_fadd_0_f64: ; VI-SAFE: ; %bb.0: ; %.entry @@ -626,6 +628,33 @@ define amdgpu_ps double @fneg_fadd_0_f64(double inreg %tmp2, double inreg %tmp6, ; VI-SAFE-NEXT: s_cselect_b32 s1, 0, 0x7ff80000 ; VI-SAFE-NEXT: s_mov_b32 s0, 0 ; VI-SAFE-NEXT: ; return to shader part epilog +; +; VI-NSZ-LABEL: fneg_fadd_0_f64: +; VI-NSZ: ; %bb.0: ; %.entry +; VI-NSZ-NEXT: v_div_scale_f64 v[0:1], s[4:5], s[2:3], s[2:3], 1.0 +; VI-NSZ-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; VI-NSZ-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; VI-NSZ-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[2:3] +; VI-NSZ-NEXT: v_div_scale_f64 v[4:5], vcc, 1.0, s[2:3], 1.0 +; VI-NSZ-NEXT: v_fma_f64 v[6:7], -v[0:1], v[2:3], 1.0 +; VI-NSZ-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; VI-NSZ-NEXT: v_mul_f64 v[6:7], v[4:5], v[2:3] +; VI-NSZ-NEXT: v_fma_f64 v[0:1], -v[0:1], v[6:7], v[4:5] +; VI-NSZ-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[6:7] +; VI-NSZ-NEXT: v_mov_b32_e32 v2, s1 +; VI-NSZ-NEXT: v_mov_b32_e32 v3, s0 +; VI-NSZ-NEXT: v_div_fixup_f64 v[0:1], v[0:1], s[2:3], 1.0 +; VI-NSZ-NEXT: s_mov_b32 s2, 0 +; VI-NSZ-NEXT: s_brev_b32 s3, 1 +; VI-NSZ-NEXT: v_mul_f64 v[0:1], v[0:1], s[2:3] +; VI-NSZ-NEXT: v_cmp_nlt_f64_e64 vcc, -v[0:1], s[0:1] +; VI-NSZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; VI-NSZ-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] +; VI-NSZ-NEXT: s_and_b64 s[0:1], vcc, exec +; VI-NSZ-NEXT: s_cselect_b32 s1, 0, 0x7ff80000 +; VI-NSZ-NEXT: s_mov_b32 s0, 0 +; VI-NSZ-NEXT: ; return to shader part epilog .entry: %tmp7 = fdiv double 1.000000e+00, %tmp6 %tmp8 = fmul double 0.000000e+00, %tmp7 @@ -662,13 +691,14 @@ define amdgpu_ps double 
@fneg_fadd_0_nsz_f64(double inreg %tmp2, double inreg %t ; SI-NSZ-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] ; SI-NSZ-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 ; SI-NSZ-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; SI-NSZ-NEXT: v_mul_f64 v[2:3], v[0:1], -1.0 -; SI-NSZ-NEXT: v_fma_f64 v[4:5], -s[2:3], v[2:3], -1.0 -; SI-NSZ-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3] +; SI-NSZ-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 +; SI-NSZ-NEXT: s_mov_b32 s2, 0 +; SI-NSZ-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] +; SI-NSZ-NEXT: s_brev_b32 s3, 1 +; SI-NSZ-NEXT: v_mul_f64 v[0:1], v[0:1], s[2:3] ; SI-NSZ-NEXT: v_mov_b32_e32 v2, s1 -; SI-NSZ-NEXT: v_mul_f64 v[0:1], v[0:1], 0 -; SI-NSZ-NEXT: v_mov_b32_e32 v3, s0 ; SI-NSZ-NEXT: v_cmp_nlt_f64_e64 vcc, -v[0:1], s[0:1] +; SI-NSZ-NEXT: v_mov_b32_e32 v3, s0 ; SI-NSZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; SI-NSZ-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] @@ -684,12 +714,13 @@ define amdgpu_ps double @fneg_fadd_0_nsz_f64(double inreg %tmp2, double inreg %t ; VI-NSZ-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] ; VI-NSZ-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 ; VI-NSZ-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; VI-NSZ-NEXT: v_mul_f64 v[2:3], v[0:1], -1.0 -; VI-NSZ-NEXT: v_fma_f64 v[4:5], -s[2:3], v[2:3], -1.0 -; VI-NSZ-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3] +; VI-NSZ-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 +; VI-NSZ-NEXT: s_mov_b32 s2, 0 +; VI-NSZ-NEXT: s_brev_b32 s3, 1 +; VI-NSZ-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] ; VI-NSZ-NEXT: v_mov_b32_e32 v2, s1 ; VI-NSZ-NEXT: v_mov_b32_e32 v3, s0 -; VI-NSZ-NEXT: v_mul_f64 v[0:1], v[0:1], 0 +; VI-NSZ-NEXT: v_mul_f64 v[0:1], v[0:1], s[2:3] ; VI-NSZ-NEXT: v_cmp_nlt_f64_e64 vcc, -v[0:1], s[0:1] ; VI-NSZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc diff --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll index a94542a..5ec9284 
100644 --- a/llvm/test/CodeGen/AMDGPU/v_pack.ll +++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll @@ -63,7 +63,7 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrs ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f16_e32 v0, -2.0, v1 +; GCN-NEXT: v_subrev_f16_e32 v0, 2.0, v1 ; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2 ; GCN-NEXT: v_pack_b32_f16 v0, v0, v1 ; GCN-NEXT: ;;#ASMSTART -- 2.7.4