From 11c3cead23783e65fb30e673d62771352078ff05 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 13 Feb 2023 14:27:29 -0400 Subject: [PATCH] AMDGPU: Override getNegatedExpression constant handling Ignore the multiple use heuristics of the default implementation, and report cost based on inline immediates. This is mostly interesting for -0 vs. 0. Gets a few small improvements. fneg_fadd_0_f16 is a small regression. We could probably avoid this if we handled folding fneg into div_fixup. --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 7 ++ llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll | 10 +- llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll | 25 ++--- llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll | 113 ++++++++-------------- llvm/test/CodeGen/AMDGPU/v_pack.ll | 2 +- 5 files changed, 68 insertions(+), 89 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 58a53f1..1b1abc4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -793,6 +793,13 @@ SDValue AMDGPUTargetLowering::getNegatedExpression( NegatibleCost &Cost, unsigned Depth) const { switch (Op.getOpcode()) { + case ISD::ConstantFP: { + auto *C = cast<ConstantFPSDNode>(Op); + Cost = getConstantNegateCost(C); + APFloat V = C->getValueAPF(); + V.changeSign(); + return DAG.getConstantFP(V, SDLoc(Op), Op.getValueType()); + } case ISD::FMA: case ISD::FMAD: { // Negating a fma is not free if it has users without source mods. 
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll index f6ac0f6..b2a27b9 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll @@ -276,10 +276,10 @@ define amdgpu_kernel void @div_v4_c_by_x_25ulp(ptr addrspace(1) %arg) { } ; GCN-LABEL: {{^}}div_v4_c_by_minus_x_25ulp: -; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}} -; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}} -; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}} -; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}} +; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, s{{[0-9]+}}, s{{[0-9]+}}, -2.0{{$}} +; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, s{{[0-9]+}}, s{{[0-9]+}}, 2.0{{$}} +; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0, s{{[0-9]+}}, -2.0{{$}} +; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0, s{{[0-9]+}}, 2.0{{$}} ; GCN-DENORM-DAG: v_rcp_f32_e32 ; GCN-DENORM-DAG: v_rcp_f32_e32 @@ -301,7 +301,7 @@ define amdgpu_kernel void @div_v4_c_by_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-DAG: v_div_fmas_f32 ; GCN-DENORM-DAG: v_div_fmas_f32 ; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, -2.0{{$}} -; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, -2.0{{$}} +; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, 2.0{{$}} ; GCN-FLUSH-DAG: v_rcp_f32_e32 ; GCN-FLUSH-DAG: v_rcp_f32_e64 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll index a4cc952..f0d9d03 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll @@ -519,9 +519,9 @@ define amdgpu_ps half @fneg_fadd_0_f16(half inreg %tmp2, half inreg %tmp6, <4 x ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, s0 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NSZ-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, 1.0 +; SI-NSZ-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, -1.0 ; SI-NSZ-NEXT: v_rcp_f32_e32 v3, v2 -; SI-NSZ-NEXT: v_div_scale_f32 v4, 
vcc, 1.0, v0, 1.0 +; SI-NSZ-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0 ; SI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NSZ-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; SI-NSZ-NEXT: v_fma_f32 v3, v5, v3, v3 @@ -531,8 +531,8 @@ define amdgpu_ps half @fneg_fadd_0_f16(half inreg %tmp2, half inreg %tmp6, <4 x ; SI-NSZ-NEXT: v_fma_f32 v2, -v2, v5, v4 ; SI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NSZ-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; SI-NSZ-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 -; SI-NSZ-NEXT: v_mul_f32_e32 v0, 0x80000000, v0 +; SI-NSZ-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0 +; SI-NSZ-NEXT: v_mul_f32_e32 v0, 0, v0 ; SI-NSZ-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, v1 ; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000 @@ -563,9 +563,10 @@ define amdgpu_ps half @fneg_fadd_0_f16(half inreg %tmp2, half inreg %tmp6, <4 x ; VI-NSZ-NEXT: v_mov_b32_e32 v2, s0 ; VI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7e00 ; VI-NSZ-NEXT: v_rcp_f32_e32 v0, v0 +; VI-NSZ-NEXT: v_sub_f32_e32 v0, 0x80000000, v0 ; VI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 -; VI-NSZ-NEXT: v_div_fixup_f16 v0, v0, s1, 1.0 -; VI-NSZ-NEXT: v_mul_f16_e32 v0, 0x8000, v0 +; VI-NSZ-NEXT: v_div_fixup_f16 v0, v0, s1, -1.0 +; VI-NSZ-NEXT: v_mul_f16_e32 v0, 0, v0 ; VI-NSZ-NEXT: v_cmp_nlt_f16_e64 vcc, -v0, s0 ; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; VI-NSZ-NEXT: v_cmp_nlt_f16_e32 vcc, 0, v0 @@ -598,13 +599,15 @@ define amdgpu_ps half @fneg_fadd_0_f16(half inreg %tmp2, half inreg %tmp6, <4 x ; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NSZ-NEXT: v_rcp_f32_e32 v0, v0 ; GFX11-NSZ-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NSZ-NEXT: v_sub_f32_e32 v0, 0x80000000, v0 ; GFX11-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NSZ-NEXT: v_div_fixup_f16 v0, v0, s1, 1.0 ; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NSZ-NEXT: v_mul_f16_e32 v0, 0x8000, v0 -; GFX11-NSZ-NEXT: 
v_cmp_nlt_f16_e64 s1, -v0, s0 +; GFX11-NSZ-NEXT: v_div_fixup_f16 v0, v0, s1, -1.0 +; GFX11-NSZ-NEXT: v_mul_f16_e32 v0, 0, v0 ; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-NEXT: v_cmp_nlt_f16_e64 s1, -v0, s0 ; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1 +; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NSZ-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0 ; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo ; GFX11-NSZ-NEXT: ; return to shader part epilog @@ -641,10 +644,10 @@ define amdgpu_ps half @fneg_fadd_0_nsz_f16(half inreg %tmp2, half inreg %tmp6, < ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, s1 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, s0 ; SI-NSZ-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NSZ-NEXT: v_rcp_f32_e32 v0, v0 -; SI-NSZ-NEXT: v_mul_f32_e32 v0, 0x80000000, v0 +; SI-NSZ-NEXT: v_mul_f32_e32 v0, 0, v0 ; SI-NSZ-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, v1 ; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll index 30177a5..53f97ad 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll @@ -203,9 +203,9 @@ define amdgpu_ps float @fneg_fadd_0_f32(float inreg %tmp2, float inreg %tmp6, <4 ; ; SI-NSZ-LABEL: fneg_fadd_0_f32: ; SI-NSZ: ; %bb.0: ; %.entry -; SI-NSZ-NEXT: v_div_scale_f32 v0, s[2:3], s1, s1, 1.0 +; SI-NSZ-NEXT: v_div_scale_f32 v0, s[2:3], s1, s1, -1.0 ; SI-NSZ-NEXT: v_rcp_f32_e32 v1, v0 -; SI-NSZ-NEXT: v_div_scale_f32 v2, vcc, 1.0, s1, 1.0 +; SI-NSZ-NEXT: v_div_scale_f32 v2, vcc, -1.0, s1, -1.0 ; SI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NSZ-NEXT: v_fma_f32 v3, -v0, v1, 1.0 ; SI-NSZ-NEXT: v_fma_f32 v1, v3, v1, v1 @@ -215,8 +215,8 @@ define amdgpu_ps float 
@fneg_fadd_0_f32(float inreg %tmp2, float inreg %tmp6, <4 ; SI-NSZ-NEXT: v_fma_f32 v0, -v0, v3, v2 ; SI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NSZ-NEXT: v_div_fmas_f32 v0, v0, v1, v3 -; SI-NSZ-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0 -; SI-NSZ-NEXT: v_mul_f32_e32 v0, 0x80000000, v0 +; SI-NSZ-NEXT: v_div_fixup_f32 v0, v0, s1, -1.0 +; SI-NSZ-NEXT: v_mul_f32_e32 v0, 0, v0 ; SI-NSZ-NEXT: v_mov_b32_e32 v1, s0 ; SI-NSZ-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, s0 ; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc @@ -251,8 +251,8 @@ define amdgpu_ps float @fneg_fadd_0_f32(float inreg %tmp2, float inreg %tmp6, <4 ; ; VI-NSZ-LABEL: fneg_fadd_0_f32: ; VI-NSZ: ; %bb.0: ; %.entry -; VI-NSZ-NEXT: v_div_scale_f32 v0, s[2:3], s1, s1, 1.0 -; VI-NSZ-NEXT: v_div_scale_f32 v1, vcc, 1.0, s1, 1.0 +; VI-NSZ-NEXT: v_div_scale_f32 v0, s[2:3], s1, s1, -1.0 +; VI-NSZ-NEXT: v_div_scale_f32 v1, vcc, -1.0, s1, -1.0 ; VI-NSZ-NEXT: v_rcp_f32_e32 v2, v0 ; VI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; VI-NSZ-NEXT: v_fma_f32 v3, -v0, v2, 1.0 @@ -265,8 +265,8 @@ define amdgpu_ps float @fneg_fadd_0_f32(float inreg %tmp2, float inreg %tmp6, <4 ; VI-NSZ-NEXT: v_div_fmas_f32 v0, v0, v2, v3 ; VI-NSZ-NEXT: v_mov_b32_e32 v2, s0 ; VI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; VI-NSZ-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0 -; VI-NSZ-NEXT: v_mul_f32_e32 v0, 0x80000000, v0 +; VI-NSZ-NEXT: v_div_fixup_f32 v0, v0, s1, -1.0 +; VI-NSZ-NEXT: v_mul_f32_e32 v0, 0, v0 ; VI-NSZ-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, s0 ; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; VI-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 @@ -575,32 +575,30 @@ define amdgpu_ps double @fneg_fadd_0_f64(double inreg %tmp2, double inreg %tmp6, ; SI-SAFE-NEXT: s_mov_b32 s0, 0 ; SI-SAFE-NEXT: ; return to shader part epilog ; -; SI-NSZ-LABEL: fneg_fadd_0_f64: -; SI-NSZ: ; %bb.0: ; %.entry -; SI-NSZ-NEXT: v_div_scale_f64 v[0:1], s[4:5], s[2:3], s[2:3], 1.0 -; SI-NSZ-NEXT: s_mov_b32 s4, 0 -; SI-NSZ-NEXT: s_brev_b32 s5, 1 -; SI-NSZ-NEXT: 
v_rcp_f64_e32 v[2:3], v[0:1] -; SI-NSZ-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 -; SI-NSZ-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[2:3] -; SI-NSZ-NEXT: v_div_scale_f64 v[4:5], vcc, 1.0, s[2:3], 1.0 -; SI-NSZ-NEXT: v_fma_f64 v[6:7], -v[0:1], v[2:3], 1.0 -; SI-NSZ-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] -; SI-NSZ-NEXT: v_mul_f64 v[6:7], v[4:5], v[2:3] -; SI-NSZ-NEXT: v_fma_f64 v[0:1], -v[0:1], v[6:7], v[4:5] -; SI-NSZ-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[6:7] -; SI-NSZ-NEXT: v_mov_b32_e32 v2, s1 -; SI-NSZ-NEXT: v_mov_b32_e32 v3, s0 -; SI-NSZ-NEXT: v_div_fixup_f64 v[0:1], v[0:1], s[2:3], 1.0 -; SI-NSZ-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] -; SI-NSZ-NEXT: v_cmp_nlt_f64_e64 vcc, -v[0:1], s[0:1] -; SI-NSZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; SI-NSZ-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] -; SI-NSZ-NEXT: s_and_b64 s[0:1], vcc, exec -; SI-NSZ-NEXT: s_cselect_b32 s1, 0, 0x7ff80000 -; SI-NSZ-NEXT: s_mov_b32 s0, 0 -; SI-NSZ-NEXT: ; return to shader part epilog +; GCN-NSZ-LABEL: fneg_fadd_0_f64: +; GCN-NSZ: ; %bb.0: ; %.entry +; GCN-NSZ-NEXT: v_div_scale_f64 v[0:1], s[4:5], s[2:3], s[2:3], -1.0 +; GCN-NSZ-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; GCN-NSZ-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; GCN-NSZ-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[2:3] +; GCN-NSZ-NEXT: v_div_scale_f64 v[4:5], vcc, -1.0, s[2:3], -1.0 +; GCN-NSZ-NEXT: v_fma_f64 v[6:7], -v[0:1], v[2:3], 1.0 +; GCN-NSZ-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] +; GCN-NSZ-NEXT: v_mul_f64 v[6:7], v[4:5], v[2:3] +; GCN-NSZ-NEXT: v_fma_f64 v[0:1], -v[0:1], v[6:7], v[4:5] +; GCN-NSZ-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[6:7] +; GCN-NSZ-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NSZ-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NSZ-NEXT: v_div_fixup_f64 v[0:1], v[0:1], s[2:3], -1.0 +; GCN-NSZ-NEXT: v_mul_f64 v[0:1], v[0:1], 0 +; GCN-NSZ-NEXT: v_cmp_nlt_f64_e64 vcc, -v[0:1], s[0:1] +; GCN-NSZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NSZ-NEXT: 
v_cndmask_b32_e32 v0, v0, v3, vcc +; GCN-NSZ-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] +; GCN-NSZ-NEXT: s_and_b64 s[0:1], vcc, exec +; GCN-NSZ-NEXT: s_cselect_b32 s1, 0, 0x7ff80000 +; GCN-NSZ-NEXT: s_mov_b32 s0, 0 +; GCN-NSZ-NEXT: ; return to shader part epilog ; ; VI-SAFE-LABEL: fneg_fadd_0_f64: ; VI-SAFE: ; %bb.0: ; %.entry @@ -628,33 +626,6 @@ define amdgpu_ps double @fneg_fadd_0_f64(double inreg %tmp2, double inreg %tmp6, ; VI-SAFE-NEXT: s_cselect_b32 s1, 0, 0x7ff80000 ; VI-SAFE-NEXT: s_mov_b32 s0, 0 ; VI-SAFE-NEXT: ; return to shader part epilog -; -; VI-NSZ-LABEL: fneg_fadd_0_f64: -; VI-NSZ: ; %bb.0: ; %.entry -; VI-NSZ-NEXT: v_div_scale_f64 v[0:1], s[4:5], s[2:3], s[2:3], 1.0 -; VI-NSZ-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] -; VI-NSZ-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 -; VI-NSZ-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[2:3] -; VI-NSZ-NEXT: v_div_scale_f64 v[4:5], vcc, 1.0, s[2:3], 1.0 -; VI-NSZ-NEXT: v_fma_f64 v[6:7], -v[0:1], v[2:3], 1.0 -; VI-NSZ-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] -; VI-NSZ-NEXT: v_mul_f64 v[6:7], v[4:5], v[2:3] -; VI-NSZ-NEXT: v_fma_f64 v[0:1], -v[0:1], v[6:7], v[4:5] -; VI-NSZ-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[6:7] -; VI-NSZ-NEXT: v_mov_b32_e32 v2, s1 -; VI-NSZ-NEXT: v_mov_b32_e32 v3, s0 -; VI-NSZ-NEXT: v_div_fixup_f64 v[0:1], v[0:1], s[2:3], 1.0 -; VI-NSZ-NEXT: s_mov_b32 s2, 0 -; VI-NSZ-NEXT: s_brev_b32 s3, 1 -; VI-NSZ-NEXT: v_mul_f64 v[0:1], v[0:1], s[2:3] -; VI-NSZ-NEXT: v_cmp_nlt_f64_e64 vcc, -v[0:1], s[0:1] -; VI-NSZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; VI-NSZ-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] -; VI-NSZ-NEXT: s_and_b64 s[0:1], vcc, exec -; VI-NSZ-NEXT: s_cselect_b32 s1, 0, 0x7ff80000 -; VI-NSZ-NEXT: s_mov_b32 s0, 0 -; VI-NSZ-NEXT: ; return to shader part epilog .entry: %tmp7 = fdiv double 1.000000e+00, %tmp6 %tmp8 = fmul double 0.000000e+00, %tmp7 @@ -691,14 +662,13 @@ define amdgpu_ps double @fneg_fadd_0_nsz_f64(double inreg %tmp2, double inreg 
%t ; SI-NSZ-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] ; SI-NSZ-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 ; SI-NSZ-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; SI-NSZ-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 -; SI-NSZ-NEXT: s_mov_b32 s2, 0 -; SI-NSZ-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; SI-NSZ-NEXT: s_brev_b32 s3, 1 -; SI-NSZ-NEXT: v_mul_f64 v[0:1], v[0:1], s[2:3] +; SI-NSZ-NEXT: v_mul_f64 v[2:3], v[0:1], -1.0 +; SI-NSZ-NEXT: v_fma_f64 v[4:5], -s[2:3], v[2:3], -1.0 +; SI-NSZ-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3] ; SI-NSZ-NEXT: v_mov_b32_e32 v2, s1 -; SI-NSZ-NEXT: v_cmp_nlt_f64_e64 vcc, -v[0:1], s[0:1] +; SI-NSZ-NEXT: v_mul_f64 v[0:1], v[0:1], 0 ; SI-NSZ-NEXT: v_mov_b32_e32 v3, s0 +; SI-NSZ-NEXT: v_cmp_nlt_f64_e64 vcc, -v[0:1], s[0:1] ; SI-NSZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; SI-NSZ-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] @@ -714,13 +684,12 @@ define amdgpu_ps double @fneg_fadd_0_nsz_f64(double inreg %tmp2, double inreg %t ; VI-NSZ-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] ; VI-NSZ-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 ; VI-NSZ-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; VI-NSZ-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 -; VI-NSZ-NEXT: s_mov_b32 s2, 0 -; VI-NSZ-NEXT: s_brev_b32 s3, 1 -; VI-NSZ-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] +; VI-NSZ-NEXT: v_mul_f64 v[2:3], v[0:1], -1.0 +; VI-NSZ-NEXT: v_fma_f64 v[4:5], -s[2:3], v[2:3], -1.0 +; VI-NSZ-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3] ; VI-NSZ-NEXT: v_mov_b32_e32 v2, s1 ; VI-NSZ-NEXT: v_mov_b32_e32 v3, s0 -; VI-NSZ-NEXT: v_mul_f64 v[0:1], v[0:1], s[2:3] +; VI-NSZ-NEXT: v_mul_f64 v[0:1], v[0:1], 0 ; VI-NSZ-NEXT: v_cmp_nlt_f64_e64 vcc, -v[0:1], s[0:1] ; VI-NSZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc diff --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll index 5ec9284..a94542a 100644 --- 
a/llvm/test/CodeGen/AMDGPU/v_pack.ll +++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll @@ -63,7 +63,7 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrs ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_subrev_f16_e32 v0, 2.0, v1 +; GCN-NEXT: v_add_f16_e32 v0, -2.0, v1 ; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2 ; GCN-NEXT: v_pack_b32_f16 v0, v0, v1 ; GCN-NEXT: ;;#ASMSTART -- 2.7.4