From dec42ffa2869faab6384fd534d2d90072bf46db3 Mon Sep 17 00:00:00 2001 From: Ivan Kosarev Date: Tue, 20 Jun 2023 10:32:12 +0100 Subject: [PATCH] [AMDGPU][GFX11] Add test coverage for FMA instructions. Reviewed By: arsenm Differential Revision: https://reviews.llvm.org/D153269 --- .../AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll | 358 ++++ .../AMDGPU/GlobalISel/combine-fma-sub-mul.ll | 250 +++ llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll | 352 ++-- llvm/test/CodeGen/AMDGPU/fma-combine.ll | 2034 ++++++++++++++++++-- llvm/test/CodeGen/AMDGPU/fma.f16.ll | 107 + llvm/test/CodeGen/AMDGPU/fma.f64.ll | 37 +- 6 files changed, 2801 insertions(+), 337 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll index 88ab58d..a4261ad 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll @@ -3,6 +3,8 @@ ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -fp-contract=fast < %s | FileCheck -check-prefix=GFX10-CONTRACT %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -fp-contract=fast < %s | FileCheck -check-prefix=GFX11-CONTRACT %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX11-DENORM %s ; fadd (fma a, b, (fmul c, d)), e --> fma a, b, (fma c, d, e) ; fadd e, (fma a, b, (fmul c, d)) --> fma a, b, (fma c, d, e) @@ -40,6 +42,26 @@ define float @test_f32_add_mul(float %a, float %b, float %c, float %d, float %e) ; GFX10-DENORM-NEXT: v_fmac_f32_e32 v2, v0, v1 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-CONTRACT-LABEL: test_f32_add_mul: +; GFX11-CONTRACT: ; %bb.0: ; %.entry +; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CONTRACT-NEXT: v_fma_f32 v2, v2, v3, v4 +; GFX11-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-CONTRACT-NEXT: v_fmac_f32_e32 v2, v0, v1 +; GFX11-CONTRACT-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-DENORM-LABEL: test_f32_add_mul: +; GFX11-DENORM: ; %bb.0: ; %.entry +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-DENORM-NEXT: v_fma_f32 v2, v2, v3, v4 +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DENORM-NEXT: v_fmac_f32_e32 v2, v0, v1 +; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-DENORM-NEXT: s_setpc_b64 s[30:31] .entry: %x = fmul fast float %c, %d %y = call fast float @llvm.fmuladd.f32(float %a, float %b, float %x) @@ -80,6 +102,26 @@ define float @test_f32_add_mul_rhs(float %a, float %b, float %c, float %d, float ; GFX10-DENORM-NEXT: v_fmac_f32_e32 v2, v0, v1 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-CONTRACT-LABEL: test_f32_add_mul_rhs: +; GFX11-CONTRACT: ; %bb.0: ; %.entry +; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CONTRACT-NEXT: v_fma_f32 v2, v2, v3, v4 +; GFX11-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-CONTRACT-NEXT: v_fmac_f32_e32 v2, v0, v1 +; GFX11-CONTRACT-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-DENORM-LABEL: test_f32_add_mul_rhs: +; GFX11-DENORM: ; %bb.0: ; %.entry +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-DENORM-NEXT: v_fma_f32 v2, v2, v3, v4 +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DENORM-NEXT: v_fmac_f32_e32 v2, v0, v1 +; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-DENORM-NEXT: s_setpc_b64 s[30:31] .entry: %x = fmul fast float %c, %d %y = call fast float @llvm.fmuladd.f32(float %a, float %b, float %x) @@ -121,6 +163,27 @@ define half @test_half_add_mul(half %a, half %b, half %c, half %d, half %e) { ; GFX10-DENORM-NEXT: v_add_f16_e32 v0, v0, v2 ; GFX10-DENORM-NEXT: v_add_f16_e32 v0, v0, v4 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-CONTRACT-LABEL: test_half_add_mul: +; GFX11-CONTRACT: ; %bb.0: ; %.entry +; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CONTRACT-NEXT: v_fma_f16 v2, v2, v3, v4 +; GFX11-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-CONTRACT-NEXT: v_fmac_f16_e32 v2, v0, v1 +; GFX11-CONTRACT-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-DENORM-LABEL: test_half_add_mul: +; GFX11-DENORM: ; %bb.0: ; %.entry +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-DENORM-NEXT: v_mul_f16_e32 v2, v2, v3 +; GFX11-DENORM-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DENORM-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX11-DENORM-NEXT: v_add_f16_e32 v0, v0, v4 +; GFX11-DENORM-NEXT: s_setpc_b64 s[30:31] .entry: %x = fmul fast half %c, %d %y = call fast half @llvm.fmuladd.f16(half %a, half %b, half %x) @@ -162,6 +225,27 @@ define half @test_half_add_mul_rhs(half %a, half %b, half %c, half %d, half %e) ; GFX10-DENORM-NEXT: v_add_f16_e32 v0, v0, v2 ; GFX10-DENORM-NEXT: v_add_f16_e32 v0, v4, v0 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-CONTRACT-LABEL: test_half_add_mul_rhs: +; GFX11-CONTRACT: ; %bb.0: ; %.entry +; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CONTRACT-NEXT: v_fma_f16 v2, v2, v3, v4 +; GFX11-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-CONTRACT-NEXT: v_fmac_f16_e32 v2, v0, v1 +; GFX11-CONTRACT-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-DENORM-LABEL: test_half_add_mul_rhs: +; GFX11-DENORM: ; %bb.0: ; %.entry +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-DENORM-NEXT: v_mul_f16_e32 v2, v2, v3 +; GFX11-DENORM-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DENORM-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX11-DENORM-NEXT: v_add_f16_e32 v0, v4, v0 +; GFX11-DENORM-NEXT: s_setpc_b64 s[30:31] .entry: %x = fmul fast half %c, %d %y = call fast half @llvm.fmuladd.f16(half %a, half %b, half %x) @@ -199,6 +283,24 @@ define double @test_double_add_mul(double %a, double %b, double %c, double %d, d ; GFX10-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9] ; GFX10-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-CONTRACT-LABEL: test_double_add_mul: +; GFX11-CONTRACT: ; %bb.0: ; %.entry +; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; GFX11-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX11-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-DENORM-LABEL: test_double_add_mul: +; GFX11-DENORM: ; %bb.0: ; %.entry +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX11-DENORM-NEXT: s_setpc_b64 s[30:31] .entry: %x = fmul fast double %c, %d %y = call fast double @llvm.fmuladd.f64(double %a, double %b, double %x) @@ -236,6 +338,24 @@ define double @test_double_add_mul_rhs(double %a, double %b, double %c, double % ; GFX10-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9] ; GFX10-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-CONTRACT-LABEL: test_double_add_mul_rhs: +; GFX11-CONTRACT: ; %bb.0: ; %.entry +; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; GFX11-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX11-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-DENORM-LABEL: test_double_add_mul_rhs: +; GFX11-DENORM: ; %bb.0: ; %.entry +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX11-DENORM-NEXT: s_setpc_b64 s[30:31] .entry: %x = fmul fast double %c, %d %y = call fast double @llvm.fmuladd.f64(double %a, double %b, double %x) @@ -309,6 +429,38 @@ define <4 x float> @test_v4f32_add_mul(<4 x float> %a, <4 x float> %b, <4 x floa ; GFX10-DENORM-NEXT: v_mov_b32_e32 v2, v10 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v3, v11 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-CONTRACT-LABEL: test_v4f32_add_mul: +; GFX11-CONTRACT: ; %bb.0: ; %.entry +; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CONTRACT-NEXT: v_fma_f32 v8, v8, v12, v16 +; GFX11-CONTRACT-NEXT: v_fma_f32 v9, v9, v13, v17 +; GFX11-CONTRACT-NEXT: v_fma_f32 v10, v10, v14, v18 +; GFX11-CONTRACT-NEXT: v_fma_f32 v11, v11, v15, v19 +; GFX11-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-CONTRACT-NEXT: v_dual_fmac_f32 v8, v0, v4 :: v_dual_fmac_f32 v9, v1, v5 +; GFX11-CONTRACT-NEXT: v_dual_fmac_f32 v10, v2, v6 :: v_dual_fmac_f32 v11, v3, v7 +; GFX11-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-CONTRACT-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9 +; GFX11-CONTRACT-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11 +; GFX11-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-DENORM-LABEL: test_v4f32_add_mul: +; GFX11-DENORM: ; %bb.0: ; %.entry +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-DENORM-NEXT: v_fma_f32 v8, v8, v12, v16 +; GFX11-DENORM-NEXT: v_fma_f32 v9, v9, v13, v17 +; GFX11-DENORM-NEXT: v_fma_f32 v10, v10, v14, v18 +; GFX11-DENORM-NEXT: v_fma_f32 v11, v11, v15, v19 +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-DENORM-NEXT: v_dual_fmac_f32 v8, v0, v4 :: v_dual_fmac_f32 v9, v1, v5 +; GFX11-DENORM-NEXT: v_dual_fmac_f32 v10, v2, v6 :: v_dual_fmac_f32 v11, v3, v7 +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-DENORM-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9 +; GFX11-DENORM-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11 +; GFX11-DENORM-NEXT: s_setpc_b64 s[30:31] .entry: %x = fmul fast <4 x float> %c, %d %y = call fast <4 x float> @llvm.fmuladd.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %x) @@ -382,6 +534,38 @@ define <4 x float> @test_v4f32_add_mul_rhs(<4 x float> %a, <4 x float> %b, <4 x ; GFX10-DENORM-NEXT: v_mov_b32_e32 v2, v10 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v3, v11 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-CONTRACT-LABEL: test_v4f32_add_mul_rhs: +; GFX11-CONTRACT: ; %bb.0: ; %.entry +; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CONTRACT-NEXT: v_fma_f32 v8, v8, v12, v16 +; GFX11-CONTRACT-NEXT: v_fma_f32 v9, v9, v13, v17 +; GFX11-CONTRACT-NEXT: v_fma_f32 v10, v10, v14, v18 +; GFX11-CONTRACT-NEXT: v_fma_f32 v11, v11, v15, v19 +; GFX11-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-CONTRACT-NEXT: v_dual_fmac_f32 v8, v0, v4 :: v_dual_fmac_f32 v9, v1, v5 +; GFX11-CONTRACT-NEXT: v_dual_fmac_f32 v10, v2, v6 :: v_dual_fmac_f32 v11, v3, v7 +; GFX11-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-CONTRACT-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9 +; GFX11-CONTRACT-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11 +; GFX11-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-DENORM-LABEL: test_v4f32_add_mul_rhs: +; GFX11-DENORM: ; %bb.0: ; %.entry +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-DENORM-NEXT: v_fma_f32 v8, v8, v12, v16 +; GFX11-DENORM-NEXT: v_fma_f32 v9, v9, v13, v17 +; GFX11-DENORM-NEXT: v_fma_f32 v10, v10, v14, v18 +; GFX11-DENORM-NEXT: v_fma_f32 v11, v11, v15, v19 +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-DENORM-NEXT: v_dual_fmac_f32 v8, v0, v4 :: v_dual_fmac_f32 v9, v1, v5 +; GFX11-DENORM-NEXT: v_dual_fmac_f32 v10, v2, v6 :: v_dual_fmac_f32 v11, v3, v7 +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-DENORM-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9 +; GFX11-DENORM-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11 +; GFX11-DENORM-NEXT: s_setpc_b64 s[30:31] .entry: %x = fmul fast <4 x float> %c, %d %y = call fast <4 x float> @llvm.fmuladd.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %x) @@ -435,6 +619,33 @@ define <4 x half> @test_f16_add_mul(<4 x half> %a, <4 x half> %b, <4 x half> %c, ; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v0, v8 ; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v1, v9 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-CONTRACT-LABEL: test_f16_add_mul: +; GFX11-CONTRACT: ; %bb.0: ; %.entry +; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CONTRACT-NEXT: v_pk_fma_f16 v4, v4, v6, v8 +; GFX11-CONTRACT-NEXT: v_pk_fma_f16 v5, v5, v7, v9 +; GFX11-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4 +; GFX11-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5 +; GFX11-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-DENORM-LABEL: test_f16_add_mul: +; GFX11-DENORM: ; %bb.0: ; %.entry +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-DENORM-NEXT: v_pk_mul_f16 v4, v4, v6 +; GFX11-DENORM-NEXT: v_pk_mul_f16 v5, v5, v7 +; GFX11-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 +; GFX11-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-DENORM-NEXT: v_pk_add_f16 v0, v0, v4 +; GFX11-DENORM-NEXT: v_pk_add_f16 v1, v1, v5 +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-DENORM-NEXT: v_pk_add_f16 v0, v0, v8 +; GFX11-DENORM-NEXT: v_pk_add_f16 v1, v1, v9 +; GFX11-DENORM-NEXT: s_setpc_b64 s[30:31] .entry: %x = fmul fast <4 x half> %c, %d %y = call fast <4 x half> @llvm.fmuladd.v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %x) @@ -488,6 +699,33 @@ define <4 x half> @test_f16_add_mul_rhs(<4 x half> %a, <4 x half> %b, <4 x half> ; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v8, v0 ; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v9, v1 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-CONTRACT-LABEL: test_f16_add_mul_rhs: +; GFX11-CONTRACT: ; %bb.0: ; %.entry +; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CONTRACT-NEXT: v_pk_fma_f16 v4, v4, v6, v8 +; GFX11-CONTRACT-NEXT: v_pk_fma_f16 v5, v5, v7, v9 +; GFX11-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4 +; GFX11-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5 +; GFX11-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-DENORM-LABEL: test_f16_add_mul_rhs: +; GFX11-DENORM: ; %bb.0: ; %.entry +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-DENORM-NEXT: v_pk_mul_f16 v4, v4, v6 +; GFX11-DENORM-NEXT: v_pk_mul_f16 v5, v5, v7 +; GFX11-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 +; GFX11-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-DENORM-NEXT: v_pk_add_f16 v0, v0, v4 +; GFX11-DENORM-NEXT: v_pk_add_f16 v1, v1, v5 +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-DENORM-NEXT: v_pk_add_f16 v0, v8, v0 +; GFX11-DENORM-NEXT: v_pk_add_f16 v1, v9, v1 +; GFX11-DENORM-NEXT: s_setpc_b64 s[30:31] .entry: %x = fmul fast <4 x half> %c, %d %y = call fast <4 x half> @llvm.fmuladd.v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %x) @@ -603,6 +841,66 @@ define <4 x double> @test_f64_add_mul(<4 x double> %a, <4 x double> %b, <4 x dou ; GFX10-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21] ; GFX10-DENORM-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23] ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-CONTRACT-LABEL: test_f64_add_mul: +; GFX11-CONTRACT: ; %bb.0: ; %.entry +; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CONTRACT-NEXT: s_clause 0x8 +; GFX11-CONTRACT-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-CONTRACT-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-CONTRACT-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-CONTRACT-NEXT: scratch_load_b32 v34, off, s32 offset:12 +; GFX11-CONTRACT-NEXT: scratch_load_b32 v35, off, s32 offset:16 +; GFX11-CONTRACT-NEXT: scratch_load_b32 v36, off, s32 offset:20 +; GFX11-CONTRACT-NEXT: scratch_load_b32 v37, off, s32 offset:24 +; GFX11-CONTRACT-NEXT: scratch_load_b32 v38, off, s32 offset:28 +; GFX11-CONTRACT-NEXT: scratch_load_b32 v39, off, s32 offset:32 +; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(6) +; GFX11-CONTRACT-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33] +; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(4) +; GFX11-CONTRACT-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35] +; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(2) +; GFX11-CONTRACT-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37] +; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX11-CONTRACT-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39] +; GFX11-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17] +; GFX11-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19] +; GFX11-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21] +; GFX11-CONTRACT-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23] +; GFX11-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-DENORM-LABEL: test_f64_add_mul: +; GFX11-DENORM: ; %bb.0: ; %.entry +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-DENORM-NEXT: s_clause 0x8 +; GFX11-DENORM-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-DENORM-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-DENORM-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-DENORM-NEXT: scratch_load_b32 v34, off, s32 offset:12 +; GFX11-DENORM-NEXT: scratch_load_b32 v35, off, s32 offset:16 +; GFX11-DENORM-NEXT: scratch_load_b32 v36, off, s32 offset:20 +; GFX11-DENORM-NEXT: scratch_load_b32 v37, off, s32 offset:24 +; GFX11-DENORM-NEXT: scratch_load_b32 v38, off, s32 offset:28 +; GFX11-DENORM-NEXT: scratch_load_b32 v39, off, s32 offset:32 +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(6) +; GFX11-DENORM-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33] +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(4) +; GFX11-DENORM-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35] +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(2) +; GFX11-DENORM-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37] +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39] +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17] +; GFX11-DENORM-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19] +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21] +; GFX11-DENORM-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23] +; GFX11-DENORM-NEXT: s_setpc_b64 s[30:31] .entry: %x = fmul fast <4 x double> %c, %d %y = call fast <4 x double> @llvm.fmuladd.v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %x) @@ -718,6 +1016,66 @@ define <4 x double> @test_f64_add_mul_rhs(<4 x double> %a, <4 x double> %b, <4 x ; GFX10-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21] ; GFX10-DENORM-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23] ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-CONTRACT-LABEL: test_f64_add_mul_rhs: +; GFX11-CONTRACT: ; %bb.0: ; %.entry +; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CONTRACT-NEXT: s_clause 0x8 +; GFX11-CONTRACT-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-CONTRACT-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-CONTRACT-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-CONTRACT-NEXT: scratch_load_b32 v34, off, s32 offset:12 +; GFX11-CONTRACT-NEXT: scratch_load_b32 v35, off, s32 offset:16 +; GFX11-CONTRACT-NEXT: scratch_load_b32 v36, off, s32 offset:20 +; GFX11-CONTRACT-NEXT: scratch_load_b32 v37, off, s32 offset:24 +; GFX11-CONTRACT-NEXT: scratch_load_b32 v38, off, s32 offset:28 +; GFX11-CONTRACT-NEXT: scratch_load_b32 v39, off, s32 offset:32 +; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(6) +; GFX11-CONTRACT-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33] +; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(4) +; GFX11-CONTRACT-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35] +; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(2) +; GFX11-CONTRACT-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37] +; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX11-CONTRACT-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39] +; GFX11-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17] +; GFX11-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19] +; GFX11-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21] +; GFX11-CONTRACT-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23] +; GFX11-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-DENORM-LABEL: test_f64_add_mul_rhs: +; GFX11-DENORM: ; %bb.0: ; %.entry +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-DENORM-NEXT: s_clause 0x8 +; GFX11-DENORM-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-DENORM-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-DENORM-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-DENORM-NEXT: scratch_load_b32 v34, off, s32 offset:12 +; GFX11-DENORM-NEXT: scratch_load_b32 v35, off, s32 offset:16 +; GFX11-DENORM-NEXT: scratch_load_b32 v36, off, s32 offset:20 +; GFX11-DENORM-NEXT: scratch_load_b32 v37, off, s32 offset:24 +; GFX11-DENORM-NEXT: scratch_load_b32 v38, off, s32 offset:28 +; GFX11-DENORM-NEXT: scratch_load_b32 v39, off, s32 offset:32 +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(6) +; GFX11-DENORM-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33] +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(4) +; GFX11-DENORM-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35] +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(2) +; GFX11-DENORM-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37] +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39] +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17] +; GFX11-DENORM-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19] +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21] +; GFX11-DENORM-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23] +; GFX11-DENORM-NEXT: s_setpc_b64 s[30:31] .entry: %x = fmul fast <4 x double> %c, %d %y = call fast <4 x double> @llvm.fmuladd.v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %x) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll index 2e9a374..11a3b96 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll @@ -5,6 +5,8 @@ ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -fp-contract=fast < %s | FileCheck -check-prefix=GFX10-CONTRACT %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -fp-contract=fast < %s | FileCheck -check-prefix=GFX11-CONTRACT %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX11-DENORM %s ; fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) ; fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) @@ -50,6 +52,22 @@ define float @test_f32_sub_mul(float %x, float %y, float %z) { ; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-DENORM-NEXT: v_mad_f32 v0, v0, v1, -v2 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-CONTRACT-LABEL: test_f32_sub_mul: +; GFX11-CONTRACT: ; %bb.0: ; %.entry +; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, -v2 +; GFX11-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-DENORM-LABEL: test_f32_sub_mul: +; GFX11-DENORM: ; %bb.0: ; %.entry +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-DENORM-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX11-DENORM-NEXT: s_setpc_b64 s[30:31] .entry: %a = fmul float %x, %y %b = fsub float %a, %z @@ -97,6 +115,22 @@ define float @test_f32_sub_mul_rhs(float %x, float %y, float %z) { ; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-DENORM-NEXT: v_mad_f32 v0, -v0, v1, v2 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-CONTRACT-LABEL: test_f32_sub_mul_rhs: +; GFX11-CONTRACT: ; %bb.0: ; %.entry +; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CONTRACT-NEXT: v_fma_f32 v0, -v0, v1, v2 +; GFX11-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-DENORM-LABEL: test_f32_sub_mul_rhs: +; GFX11-DENORM: ; %bb.0: ; %.entry +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-DENORM-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-NEXT: v_sub_f32_e32 v0, v2, v0 +; GFX11-DENORM-NEXT: s_setpc_b64 s[30:31] .entry: %a = fmul float %x, %y %b = fsub float %z, %a @@ -145,6 +179,22 @@ define half @test_half_sub_mul(half %x, half %y, half %z) { ; GFX10-DENORM-NEXT: v_mul_f16_e32 v0, v0, v1 ; GFX10-DENORM-NEXT: v_sub_f16_e32 v0, v0, v2 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-CONTRACT-LABEL: test_half_sub_mul: +; GFX11-CONTRACT: ; %bb.0: ; %.entry +; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, -v2 +; GFX11-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-DENORM-LABEL: test_half_sub_mul: +; GFX11-DENORM: ; %bb.0: ; %.entry +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-DENORM-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-NEXT: v_sub_f16_e32 v0, v0, v2 +; GFX11-DENORM-NEXT: s_setpc_b64 s[30:31] .entry: %a = fmul half %x, %y %b = fsub half %a, %z @@ -193,6 +243,22 @@ define half @test_half_sub_mul_rhs(half %x, half %y, half %z) { ; GFX10-DENORM-NEXT: v_mul_f16_e32 v0, v0, v1 ; GFX10-DENORM-NEXT: v_sub_f16_e32 v0, v2, v0 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-CONTRACT-LABEL: test_half_sub_mul_rhs: +; GFX11-CONTRACT: ; %bb.0: ; %.entry +; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CONTRACT-NEXT: v_fma_f16 v0, -v0, v1, v2 +; GFX11-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-DENORM-LABEL: test_half_sub_mul_rhs: +; GFX11-DENORM: ; %bb.0: ; %.entry +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-DENORM-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-NEXT: v_sub_f16_e32 v0, v2, v0 +; GFX11-DENORM-NEXT: s_setpc_b64 s[30:31] .entry: %a = fmul half %x, %y %b = fsub half %z, %a @@ -242,6 +308,22 @@ define double @test_double_sub_mul(double %x, double %y, double %z) { ; GFX10-DENORM-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; GFX10-DENORM-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5] ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-CONTRACT-LABEL: test_double_sub_mul: +; GFX11-CONTRACT: ; %bb.0: ; %.entry +; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5] +; GFX11-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-DENORM-LABEL: test_double_sub_mul: +; GFX11-DENORM: ; %bb.0: ; %.entry +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-DENORM-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5] +; GFX11-DENORM-NEXT: s_setpc_b64 s[30:31] .entry: %a = fmul double %x, %y %b = fsub double %a, %z @@ -291,6 +373,22 @@ define double @test_double_sub_mul_rhs(double %x, double %y, double %z) { ; GFX10-DENORM-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; GFX10-DENORM-NEXT: v_add_f64 v[0:1], v[4:5], -v[0:1] ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-CONTRACT-LABEL: test_double_sub_mul_rhs: +; GFX11-CONTRACT: ; %bb.0: ; %.entry +; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CONTRACT-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], v[4:5] +; GFX11-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-DENORM-LABEL: test_double_sub_mul_rhs: +; GFX11-DENORM: ; %bb.0: ; %.entry +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-DENORM-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-NEXT: v_add_f64 v[0:1], v[4:5], -v[0:1] +; GFX11-DENORM-NEXT: s_setpc_b64 s[30:31] .entry: %a = fmul double %x, %y %b = fsub double %z, %a @@ -362,6 +460,27 @@ define <4 x float> @test_v4f32_sub_mul(<4 x float> %x, <4 x float> %y, <4 x floa ; GFX10-DENORM-NEXT: v_mad_f32 v2, v2, v6, -v10 ; GFX10-DENORM-NEXT: v_mad_f32 v3, v3, v7, -v11 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-CONTRACT-LABEL: test_v4f32_sub_mul: +; GFX11-CONTRACT: ; %bb.0: ; %.entry +; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CONTRACT-NEXT: v_fma_f32 v0, v0, v4, -v8 +; GFX11-CONTRACT-NEXT: v_fma_f32 v1, v1, v5, -v9 +; GFX11-CONTRACT-NEXT: v_fma_f32 v2, v2, v6, -v10 +; GFX11-CONTRACT-NEXT: v_fma_f32 v3, v3, v7, -v11 +; GFX11-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-DENORM-LABEL: test_v4f32_sub_mul: +; GFX11-DENORM: ; %bb.0: ; %.entry +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-DENORM-NEXT: v_dual_mul_f32 v0, v0, v4 :: v_dual_mul_f32 v1, v1, v5 +; GFX11-DENORM-NEXT: v_dual_mul_f32 v2, v2, v6 :: v_dual_mul_f32 v3, v3, v7 +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-DENORM-NEXT: v_dual_sub_f32 v0, v0, v8 :: v_dual_sub_f32 v1, v1, v9 +; GFX11-DENORM-NEXT: v_dual_sub_f32 v2, v2, v10 :: v_dual_sub_f32 v3, v3, v11 +; GFX11-DENORM-NEXT: s_setpc_b64 s[30:31] .entry: %a = fmul <4 x float> %x, %y %b = fsub <4 x float> %a, %z @@ -433,6 +552,27 @@ define <4 x float> @test_v4f32_sub_mul_rhs(<4 x float> %x, <4 x float> %y, <4 x ; GFX10-DENORM-NEXT: v_mad_f32 v2, -v2, v6, v10 ; GFX10-DENORM-NEXT: v_mad_f32 v3, -v3, v7, v11 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-CONTRACT-LABEL: test_v4f32_sub_mul_rhs: +; GFX11-CONTRACT: ; %bb.0: ; %.entry +; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CONTRACT-NEXT: v_fma_f32 v0, -v0, v4, v8 +; GFX11-CONTRACT-NEXT: v_fma_f32 v1, -v1, v5, v9 +; GFX11-CONTRACT-NEXT: v_fma_f32 v2, -v2, v6, v10 +; GFX11-CONTRACT-NEXT: v_fma_f32 v3, -v3, v7, v11 +; GFX11-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-DENORM-LABEL: test_v4f32_sub_mul_rhs: +; GFX11-DENORM: ; %bb.0: ; %.entry +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-DENORM-NEXT: v_dual_mul_f32 v0, v0, v4 :: v_dual_mul_f32 v1, v1, v5 +; GFX11-DENORM-NEXT: v_dual_mul_f32 v2, v2, v6 :: v_dual_mul_f32 v3, v3, v7 +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-DENORM-NEXT: v_dual_sub_f32 v0, v8, v0 :: v_dual_sub_f32 v1, v9, v1 +; GFX11-DENORM-NEXT: v_dual_sub_f32 v2, v10, v2 :: v_dual_sub_f32 v3, v11, v3 +; GFX11-DENORM-NEXT: s_setpc_b64 s[30:31] .entry: %a = fmul <4 x float> %x, %y %b = fsub <4 x float> %z, %a @@ -508,6 +648,35 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> % ; GFX10-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0 ; GFX10-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-CONTRACT-LABEL: test_v4f16_sub_mul: +; GFX11-CONTRACT: ; %bb.0: ; %.entry +; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4 neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX11-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5 neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX11-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-DENORM-LABEL: test_v4f16_sub_mul: +; GFX11-DENORM: ; %bb.0: ; %.entry +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 +; GFX11-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 +; GFX11-DENORM-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX11-DENORM-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-DENORM-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX11-DENORM-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX11-DENORM-NEXT: v_sub_f16_e32 v0, v0, v4 +; GFX11-DENORM-NEXT: v_sub_f16_e32 v1, v1, v5 +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-DENORM-NEXT: v_sub_f16_e32 v2, v6, v2 +; GFX11-DENORM-NEXT: v_sub_f16_e32 v3, v7, v3 +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-DENORM-NEXT: v_pack_b32_f16 v0, v0, v2 +; GFX11-DENORM-NEXT: v_pack_b32_f16 v1, v1, v3 +; GFX11-DENORM-NEXT: s_setpc_b64 s[30:31] .entry: %a = fmul <4 x half> %x, %y %b = fsub <4 x half> %a, %z @@ -583,6 +752,35 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal ; GFX10-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0 ; GFX10-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-CONTRACT-LABEL: test_v4f16_sub_mul_rhs: +; GFX11-CONTRACT: ; %bb.0: ; %.entry +; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX11-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX11-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-DENORM-LABEL: test_v4f16_sub_mul_rhs: +; GFX11-DENORM: ; %bb.0: ; %.entry +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 +; GFX11-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 +; GFX11-DENORM-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX11-DENORM-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-DENORM-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX11-DENORM-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX11-DENORM-NEXT: v_sub_f16_e32 v0, v4, v0 +; GFX11-DENORM-NEXT: v_sub_f16_e32 v1, v5, v1 +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-DENORM-NEXT: v_sub_f16_e32 v2, v2, v6 +; GFX11-DENORM-NEXT: v_sub_f16_e32 v3, v3, v7 +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-DENORM-NEXT: v_pack_b32_f16 v0, v0, v2 +; GFX11-DENORM-NEXT: v_pack_b32_f16 v1, v1, v3 +; GFX11-DENORM-NEXT: s_setpc_b64 s[30:31] .entry: %a = fmul <4 x half> %x, %y %b = fsub <4 x half> %z, %a @@ -662,6 +860,32 @@ define <4 x double> @test_v4f64_sub_mul(<4 x double> %x, <4 x double> %y, <4 x d ; GFX10-DENORM-NEXT: v_add_f64 v[4:5], v[4:5], -v[20:21] ; GFX10-DENORM-NEXT: v_add_f64 v[6:7], v[6:7], -v[22:23] ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-CONTRACT-LABEL: test_v4f64_sub_mul: +; GFX11-CONTRACT: ; %bb.0: ; %.entry +; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], -v[16:17] +; GFX11-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], -v[18:19] +; GFX11-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], -v[20:21] +; GFX11-CONTRACT-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], -v[22:23] +; GFX11-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-DENORM-LABEL: test_v4f64_sub_mul: +; GFX11-DENORM: ; %bb.0: ; %.entry +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-DENORM-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9] +; GFX11-DENORM-NEXT: v_mul_f64 v[2:3], v[2:3], v[10:11] +; GFX11-DENORM-NEXT: v_mul_f64 v[4:5], v[4:5], v[12:13] +; GFX11-DENORM-NEXT: v_mul_f64 v[6:7], v[6:7], v[14:15] +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-DENORM-NEXT: v_add_f64 v[0:1], v[0:1], -v[16:17] +; GFX11-DENORM-NEXT: v_add_f64 v[2:3], v[2:3], -v[18:19] +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-DENORM-NEXT: v_add_f64 v[4:5], v[4:5], -v[20:21] +; GFX11-DENORM-NEXT: v_add_f64 v[6:7], v[6:7], -v[22:23] +; GFX11-DENORM-NEXT: s_setpc_b64 s[30:31] .entry: %a = fmul <4 x double> %x, %y %b = fsub <4 x double> %a, %z @@ -741,6 +965,32 @@ define <4 x double> @test_v4f64_sub_mul_rhs(<4 x double> %x, <4 x double> %y, <4 ; GFX10-DENORM-NEXT: v_add_f64 v[4:5], v[20:21], -v[4:5] ; GFX10-DENORM-NEXT: v_add_f64 v[6:7], v[22:23], -v[6:7] ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-CONTRACT-LABEL: test_v4f64_sub_mul_rhs: +; GFX11-CONTRACT: ; %bb.0: ; %.entry +; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CONTRACT-NEXT: v_fma_f64 v[0:1], -v[0:1], v[8:9], v[16:17] +; GFX11-CONTRACT-NEXT: v_fma_f64 v[2:3], -v[2:3], v[10:11], v[18:19] +; GFX11-CONTRACT-NEXT: v_fma_f64 v[4:5], -v[4:5], v[12:13], v[20:21] +; GFX11-CONTRACT-NEXT: v_fma_f64 v[6:7], -v[6:7], v[14:15], v[22:23] +; GFX11-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-DENORM-LABEL: test_v4f64_sub_mul_rhs: +; GFX11-DENORM: ; %bb.0: ; %.entry +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-DENORM-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9] +; GFX11-DENORM-NEXT: v_mul_f64 v[2:3], v[2:3], v[10:11] +; GFX11-DENORM-NEXT: v_mul_f64 v[4:5], v[4:5], v[12:13] +; GFX11-DENORM-NEXT: v_mul_f64 v[6:7], v[6:7], v[14:15] +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-DENORM-NEXT: v_add_f64 v[0:1], v[16:17], -v[0:1] +; GFX11-DENORM-NEXT: v_add_f64 v[2:3], v[18:19], -v[2:3] +; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-DENORM-NEXT: v_add_f64 v[4:5], v[20:21], -v[4:5] +; GFX11-DENORM-NEXT: v_add_f64 v[6:7], v[22:23], -v[6:7] +; GFX11-DENORM-NEXT: s_setpc_b64 s[30:31] .entry: %a = fmul <4 x double> %x, %y %b = fsub <4 x double> %z, %a diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll index eda1355..3cf0a82 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll @@ -1,71 +1,140 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11 %s define amdgpu_ps float @_amdgpu_ps_main() #0 { -; GCN-LABEL: _amdgpu_ps_main: -; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: image_sample v[0:1], v[0:1], s[0:7], s[0:3] dmask:0x3 dim:SQ_RSRC_IMG_2D -; GCN-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: image_sample v2, v[0:1], s[0:7], s[0:3] dmask:0x4 dim:SQ_RSRC_IMG_2D -; GCN-NEXT: image_sample v3, v[0:1], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: image_load_mip v4, v[2:4], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D unorm -; GCN-NEXT: s_clause 0x3 -; GCN-NEXT: s_buffer_load_dword s24, s[0:3], 0x5c -; GCN-NEXT: s_buffer_load_dword s28, s[0:3], 0x7c -; GCN-NEXT: s_buffer_load_dword s29, s[0:3], 0xc0 -; GCN-NEXT: s_waitcnt_depctr 0xffe3 -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_buffer_load_dwordx4 s[0:3], s[0:3], 0x40 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: s_buffer_load_dwordx4 s[4:7], s[0:3], 0x50 -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_buffer_load_dword s0, s[0:3], 0x2c -; GCN-NEXT: v_sub_f32_e64 v5, s24, s28 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_clause 0x4 -; GCN-NEXT: s_buffer_load_dwordx4 s[8:11], s[0:3], 0x60 -; GCN-NEXT: s_buffer_load_dwordx4 s[12:15], s[0:3], 0x20 -; GCN-NEXT: s_buffer_load_dwordx4 s[16:19], s[0:3], 0x0 -; GCN-NEXT: s_buffer_load_dwordx4 s[20:23], s[0:3], 0x70 -; GCN-NEXT: s_buffer_load_dwordx4 s[24:27], s[0:3], 0x10 -; GCN-NEXT: v_fma_f32 v1, v1, v5, s28 -; GCN-NEXT: v_max_f32_e64 v6, s0, s0 clamp -; GCN-NEXT: v_add_f32_e64 v5, s29, -1.0 -; GCN-NEXT: v_sub_f32_e32 v8, s0, v1 -; GCN-NEXT: v_fma_f32 v7, -s2, v6, s6 -; GCN-NEXT: v_fma_f32 v5, v6, v5, 1.0 -; GCN-NEXT: v_mad_f32 v10, s2, v6, v2 -; GCN-NEXT: s_mov_b32 s0, 0x3c23d70a -; GCN-NEXT: v_fmac_f32_e32 v1, v6, v8 -; GCN-NEXT: v_fmac_f32_e32 v10, v7, v6 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v9, s10, v0 -; GCN-NEXT: v_fma_f32 v0, -v0, s10, s14 -; GCN-NEXT: v_mul_f32_e32 v8, s18, v2 -; GCN-NEXT: v_mul_f32_e32 v3, s22, v3 -; GCN-NEXT: v_fmac_f32_e32 v9, v0, v6 -; GCN-NEXT: v_sub_f32_e32 v0, v1, v5 -; GCN-NEXT: v_mul_f32_e32 v1, v8, v6 -; GCN-NEXT: v_mul_f32_e32 v7, v6, v3 -; GCN-NEXT: v_fma_f32 v3, -v6, v3, v9 -; GCN-NEXT: v_fmac_f32_e32 v5, v0, v6 -; GCN-NEXT: v_fma_f32 v0, v2, s26, -v1 -; GCN-NEXT: v_fmac_f32_e32 v7, v3, v6 -; GCN-NEXT: v_fmac_f32_e32 v1, v0, v6 -; GCN-NEXT: v_mul_f32_e32 v0, v2, v6 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f32_e32 v4, v4, v10 -; GCN-NEXT: v_mul_f32_e32 v3, v4, v6 -; GCN-NEXT: v_fmaak_f32 v4, s0, v5, 0x3ca3d70a -; GCN-NEXT: v_mul_f32_e32 v1, v3, v1 -; GCN-NEXT: v_mul_f32_e32 v2, v7, v4 -; GCN-NEXT: v_fmac_f32_e32 v1, v2, v0 -; GCN-NEXT: v_max_f32_e32 v0, 0, v1 -; GCN-NEXT: ; return to shader part epilog +; GFX10-LABEL: _amdgpu_ps_main: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: image_sample v[0:1], v[0:1], s[0:7], s[0:3] dmask:0x3 dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: image_sample v2, v[0:1], s[0:7], s[0:3] dmask:0x4 dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: image_sample v3, v[0:1], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: image_load_mip v4, v[2:4], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D unorm +; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: s_buffer_load_dword s24, s[0:3], 0x5c +; GFX10-NEXT: s_buffer_load_dword s28, s[0:3], 0x7c +; GFX10-NEXT: s_buffer_load_dword s29, s[0:3], 0xc0 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_buffer_load_dwordx4 s[0:3], s[0:3], 0x40 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_buffer_load_dwordx4 s[4:7], s[0:3], 0x50 +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_buffer_load_dword s0, s[0:3], 0x2c +; GFX10-NEXT: v_sub_f32_e64 v5, s24, s28 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x4 +; GFX10-NEXT: s_buffer_load_dwordx4 s[8:11], s[0:3], 0x60 +; GFX10-NEXT: s_buffer_load_dwordx4 s[12:15], s[0:3], 0x20 +; GFX10-NEXT: s_buffer_load_dwordx4 s[16:19], s[0:3], 0x0 +; GFX10-NEXT: s_buffer_load_dwordx4 s[20:23], s[0:3], 0x70 +; GFX10-NEXT: s_buffer_load_dwordx4 s[24:27], s[0:3], 0x10 +; GFX10-NEXT: v_fma_f32 v1, v1, v5, s28 +; GFX10-NEXT: v_max_f32_e64 v6, s0, s0 clamp +; GFX10-NEXT: v_add_f32_e64 v5, s29, -1.0 +; GFX10-NEXT: v_sub_f32_e32 v8, s0, v1 +; GFX10-NEXT: v_fma_f32 v7, -s2, v6, s6 +; GFX10-NEXT: v_fma_f32 v5, v6, v5, 1.0 +; GFX10-NEXT: v_mad_f32 v10, s2, v6, v2 +; GFX10-NEXT: s_mov_b32 s0, 0x3c23d70a +; GFX10-NEXT: v_fmac_f32_e32 v1, v6, v8 +; GFX10-NEXT: v_fmac_f32_e32 v10, v7, v6 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mul_f32_e32 v9, s10, v0 +; GFX10-NEXT: v_fma_f32 v0, -v0, s10, s14 +; GFX10-NEXT: v_mul_f32_e32 v8, s18, v2 +; GFX10-NEXT: v_mul_f32_e32 v3, s22, v3 +; GFX10-NEXT: v_fmac_f32_e32 v9, v0, v6 +; GFX10-NEXT: v_sub_f32_e32 v0, v1, v5 +; GFX10-NEXT: v_mul_f32_e32 v1, v8, v6 +; GFX10-NEXT: v_mul_f32_e32 v7, v6, v3 +; GFX10-NEXT: v_fma_f32 v3, -v6, v3, v9 +; GFX10-NEXT: v_fmac_f32_e32 v5, v0, v6 +; GFX10-NEXT: v_fma_f32 v0, v2, s26, -v1 +; GFX10-NEXT: v_fmac_f32_e32 v7, v3, v6 +; GFX10-NEXT: v_fmac_f32_e32 v1, v0, v6 +; GFX10-NEXT: v_mul_f32_e32 v0, v2, v6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_f32_e32 v4, v4, v10 +; GFX10-NEXT: v_mul_f32_e32 v3, v4, v6 +; GFX10-NEXT: v_fmaak_f32 v4, s0, v5, 0x3ca3d70a +; GFX10-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX10-NEXT: v_mul_f32_e32 v2, v7, v4 +; GFX10-NEXT: v_fmac_f32_e32 v1, v2, v0 +; GFX10-NEXT: v_max_f32_e32 v0, 0, v1 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: _amdgpu_ps_main: +; GFX11: ; %bb.0: ; %.entry +; GFX11-NEXT: image_sample v[0:1], v[0:1], s[0:7], s[0:3] dmask:0x3 dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: image_sample v2, v[0:1], s[0:7], s[0:3] dmask:0x4 dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: image_sample v3, v[0:1], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: image_load_mip v4, v[2:4], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D unorm +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_buffer_load_b32 s24, s[0:3], 0x5c +; GFX11-NEXT: s_buffer_load_b32 s28, s[0:3], 0x7c +; GFX11-NEXT: s_buffer_load_b32 s29, s[0:3], 0xc0 +; GFX11-NEXT: s_buffer_load_b128 s[0:3], s[0:3], 0x40 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_buffer_load_b128 s[4:7], s[0:3], 0x50 +; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], 0x2c +; GFX11-NEXT: v_sub_f32_e64 v5, s24, s28 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_buffer_load_b128 s[8:11], s[0:3], 0x60 +; GFX11-NEXT: s_buffer_load_b128 s[12:15], s[0:3], 0x20 +; GFX11-NEXT: s_buffer_load_b128 s[16:19], s[0:3], 0x0 +; GFX11-NEXT: s_buffer_load_b128 s[20:23], s[0:3], 0x70 +; GFX11-NEXT: v_fma_f32 v1, v1, v5, s28 +; GFX11-NEXT: v_max_f32_e64 v6, s0, s0 clamp +; GFX11-NEXT: s_buffer_load_b128 s[24:27], s[0:3], 0x10 +; GFX11-NEXT: v_add_f32_e64 v5, s29, -1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_sub_f32_e32 v8, s0, v1 +; GFX11-NEXT: v_fma_f32 v7, -s2, v6, s6 +; GFX11-NEXT: v_fma_f32 v10, s2, v6, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_fma_f32 v5, v6, v5, 1.0 +; GFX11-NEXT: s_mov_b32 s0, 0x3c23d70a +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mul_f32_e32 v9, s10, v0 +; GFX11-NEXT: v_fma_f32 v0, -v0, s10, s14 +; GFX11-NEXT: v_mul_f32_e32 v3, s22, v3 +; GFX11-NEXT: v_dual_fmac_f32 v1, v6, v8 :: v_dual_mul_f32 v8, s18, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_fmac_f32_e32 v9, v0, v6 +; GFX11-NEXT: v_dual_fmac_f32 v10, v7, v6 :: v_dual_mul_f32 v7, v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_sub_f32_e32 v0, v1, v5 +; GFX11-NEXT: v_fma_f32 v3, -v6, v3, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_fmac_f32_e32 v7, v3, v6 +; GFX11-NEXT: v_fmac_f32_e32 v5, v0, v6 +; GFX11-NEXT: v_mul_f32_e32 v1, v8, v6 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v4, v4, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mul_f32 v3, v4, v6 :: v_dual_fmaak_f32 v4, s0, v5, 0x3ca3d70a +; GFX11-NEXT: v_fma_f32 v0, v2, s26, -v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_fmac_f32_e32 v1, v0, v6 +; GFX11-NEXT: v_mul_f32_e32 v0, v2, v6 +; GFX11-NEXT: v_mul_f32_e32 v2, v7, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX11-NEXT: v_fmac_f32_e32 v1, v2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, 0, v1 +; GFX11-NEXT: ; return to shader part epilog .entry: %0 = call <3 x float> @llvm.amdgcn.image.sample.2d.v3f32.f32(i32 7, float undef, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) %.i2243 = extractelement <3 x float> %0, i32 2 @@ -168,14 +237,24 @@ define amdgpu_ps float @_amdgpu_ps_main() #0 { } define float @fmac_sequence_simple(float %a, float %b, float %c, float %d, float %e) #0 { -; GCN-LABEL: fmac_sequence_simple: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-NEXT: v_fma_f32 v2, v2, v3, v4 -; GCN-NEXT: v_fmac_f32_e32 v2, v0, v1 -; GCN-NEXT: v_mov_b32_e32 v0, v2 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: fmac_sequence_simple: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_fma_f32 v2, v2, v3, v4 +; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fmac_sequence_simple: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_fma_f32 v2, v2, v3, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fmac_f32_e32 v2, v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %t0 = fmul fast float %a, %b %t1 = fmul fast float %c, %d %t2 = fadd fast float %t0, %t1 @@ -184,15 +263,27 @@ define float @fmac_sequence_simple(float %a, float %b, float %c, float %d, float } define float @fmac_sequence_innermost_fmul(float %a, float %b, float %c, float %d, float %e, float %f, float %g) #0 { -; GCN-LABEL: fmac_sequence_innermost_fmul: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-NEXT: v_mad_f32 v2, v2, v3, v6 -; GCN-NEXT: v_fmac_f32_e32 v2, v0, v1 -; GCN-NEXT: v_fmac_f32_e32 v2, v4, v5 -; GCN-NEXT: v_mov_b32_e32 v0, v2 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: fmac_sequence_innermost_fmul: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mad_f32 v2, v2, v3, v6 +; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1 +; GFX10-NEXT: v_fmac_f32_e32 v2, v4, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fmac_sequence_innermost_fmul: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_fma_f32 v2, v2, v3, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fmac_f32_e32 v2, v0, v1 +; GFX11-NEXT: v_fmac_f32_e32 v2, v4, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %t0 = fmul fast float %a, %b %t1 = fmul fast float %c, %d %t2 = fadd fast float %t0, %t1 @@ -203,15 +294,27 @@ define float @fmac_sequence_innermost_fmul(float %a, float %b, float %c, float % } define float @fmac_sequence_innermost_fmul_swapped_operands(float %a, float %b, float %c, float %d, float %e, float %f, float %g) #0 { -; GCN-LABEL: fmac_sequence_innermost_fmul_swapped_operands: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-NEXT: v_mad_f32 v2, v2, v3, v6 -; GCN-NEXT: v_fmac_f32_e32 v2, v0, v1 -; GCN-NEXT: v_fmac_f32_e32 v2, v4, v5 -; GCN-NEXT: v_mov_b32_e32 v0, v2 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: fmac_sequence_innermost_fmul_swapped_operands: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mad_f32 v2, v2, v3, v6 +; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1 +; GFX10-NEXT: v_fmac_f32_e32 v2, v4, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fmac_sequence_innermost_fmul_swapped_operands: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_fma_f32 v2, v2, v3, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fmac_f32_e32 v2, v0, v1 +; GFX11-NEXT: v_fmac_f32_e32 v2, v4, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %t0 = fmul fast float %a, %b %t1 = fmul fast float %c, %d %t2 = fadd fast float %t0, %t1 @@ -222,12 +325,20 @@ define float @fmac_sequence_innermost_fmul_swapped_operands(float %a, float %b, } define amdgpu_ps float @fmac_sequence_innermost_fmul_sgpr(float inreg %a, float inreg %b, float inreg %c, float inreg %d, float inreg %e, float inreg %f, float %g) #0 { -; GCN-LABEL: fmac_sequence_innermost_fmul_sgpr: -; GCN: ; %bb.0: -; GCN-NEXT: v_mac_f32_e64 v0, s2, s3 -; GCN-NEXT: v_fmac_f32_e64 v0, s0, s1 -; GCN-NEXT: v_fmac_f32_e64 v0, s4, s5 -; GCN-NEXT: ; return to shader part epilog +; GFX10-LABEL: fmac_sequence_innermost_fmul_sgpr: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mac_f32_e64 v0, s2, s3 +; GFX10-NEXT: v_fmac_f32_e64 v0, s0, s1 +; GFX10-NEXT: v_fmac_f32_e64 v0, s4, s5 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: fmac_sequence_innermost_fmul_sgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_fmac_f32_e64 v0, s2, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fmac_f32_e64 v0, s0, s1 +; GFX11-NEXT: v_fmac_f32_e64 v0, s4, s5 +; GFX11-NEXT: ; return to shader part epilog %t0 = fmul fast float %a, %b %t1 = fmul fast float %c, %d %t2 = fadd fast float %t0, %t1 @@ -238,14 +349,25 @@ define amdgpu_ps float @fmac_sequence_innermost_fmul_sgpr(float inreg %a, float } define amdgpu_ps float @fmac_sequence_innermost_fmul_multiple_use(float inreg %a, float inreg %b, float inreg %c, float inreg %d, float inreg %e, float inreg %f, float %g) #0 { -; GCN-LABEL: fmac_sequence_innermost_fmul_multiple_use: -; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v1, s2, s3 -; GCN-NEXT: v_fmac_f32_e64 v1, s0, s1 -; GCN-NEXT: v_fma_f32 v2, s5, s4, v1 -; GCN-NEXT: v_fmac_f32_e32 v1, s5, v2 -; GCN-NEXT: v_add_f32_e32 v0, v1, v0 -; GCN-NEXT: ; return to shader part epilog +; GFX10-LABEL: fmac_sequence_innermost_fmul_multiple_use: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mul_f32_e64 v1, s2, s3 +; GFX10-NEXT: v_fmac_f32_e64 v1, s0, s1 +; GFX10-NEXT: v_fma_f32 v2, s5, s4, v1 +; GFX10-NEXT: v_fmac_f32_e32 v1, s5, v2 +; GFX10-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: fmac_sequence_innermost_fmul_multiple_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mul_f32_e64 v1, s2, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fmac_f32_e64 v1, s0, s1 +; GFX11-NEXT: v_fma_f32 v2, s5, s4, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fmac_f32_e32 v1, s5, v2 +; GFX11-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX11-NEXT: ; return to shader part epilog %t0 = fmul fast float %a, %b %t1 = fmul fast float %c, %d %t2 = fadd fast float %t0, %t1 @@ -261,12 +383,20 @@ define amdgpu_ps float @fmac_sequence_innermost_fmul_multiple_use(float inreg %a ; selecting it as a multiply. In some cases the multiply is better because ; SIFoldOperands can fold it into a previous instruction as an output modifier. define amdgpu_ps float @fma_vs_output_modifier(float %x, i32 %n) #0 { -; GCN-LABEL: fma_vs_output_modifier: -; GCN: ; %bb.0: -; GCN-NEXT: v_cvt_f32_i32_e64 v1, v1 mul:2 -; GCN-NEXT: v_mul_f32_e32 v0, v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-NEXT: ; return to shader part epilog +; GFX10-LABEL: fma_vs_output_modifier: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_cvt_f32_i32_e64 v1, v1 mul:2 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: fma_vs_output_modifier: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_cvt_f32_i32_e64 v1, v1 mul:2 +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: ; return to shader part epilog %s = sitofp i32 %n to float %m = fmul contract float %x, %x %a = fmul contract float %m, 2.0 diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll index 058b978..afffa5d 100644 --- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/fma-combine.ll @@ -1,6 +1,9 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefix=SI-FMA -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-FMA %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-NOFMA %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=ieee -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FMA %s ; FIXME: Remove enable-unsafe-fp-math in RUN line and add flags to IR instrs @@ -15,13 +18,42 @@ declare float @llvm.fma.f32(float, float, float) #0 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #0 ; (fadd (fmul x, y), z) -> (fma x, y, z) -; FUNC-LABEL: {{^}}combine_to_fma_f64_0: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}} -; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]] -; SI: buffer_store_dwordx2 [[RESULT]] define amdgpu_kernel void @combine_to_fma_f64_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { +; SI-LABEL: combine_to_fma_f64_0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7] +; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; GFX11-LABEL: combine_to_fma_f64_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX11-NEXT: global_store_b64 v6, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1 @@ -39,17 +71,54 @@ define amdgpu_kernel void @combine_to_fma_f64_0(ptr addrspace(1) noalias %out, p } ; (fadd (fmul x, y), z) -> (fma x, y, z) -; FUNC-LABEL: {{^}}combine_to_fma_f64_0_2use: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}} -; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}} -; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]] -; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[D]] -; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI: s_endpgm define amdgpu_kernel void @combine_to_fma_f64_0_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { +; SI-LABEL: combine_to_fma_f64_0_2use: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: v_fma_f64 v[6:7], v[2:3], v[4:5], v[6:7] +; SI-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; SI-NEXT: buffer_store_dwordx2 v[6:7], v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 offset:8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; +; GFX11-LABEL: combine_to_fma_f64_0_2use: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_fma_f64 v[4:5], v[0:1], v[2:3], v[4:5] +; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[6:7] +; GFX11-NEXT: global_store_b64 v8, v[4:5], s[0:1] dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1 @@ -72,13 +141,42 @@ define amdgpu_kernel void @combine_to_fma_f64_0_2use(ptr addrspace(1) noalias %o } ; (fadd x, (fmul y, z)) -> (fma y, z, x) -; FUNC-LABEL: {{^}}combine_to_fma_f64_1: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}} -; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]] -; SI: buffer_store_dwordx2 [[RESULT]] define amdgpu_kernel void @combine_to_fma_f64_1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { +; SI-LABEL: combine_to_fma_f64_1: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7] +; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; GFX11-LABEL: combine_to_fma_f64_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX11-NEXT: global_store_b64 v6, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1 @@ -96,13 +194,42 @@ define amdgpu_kernel void @combine_to_fma_f64_1(ptr addrspace(1) noalias %out, p } ; (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) -; FUNC-LABEL: {{^}}combine_to_fma_fsub_0_f64: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}} -; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]] -; SI: buffer_store_dwordx2 [[RESULT]] define amdgpu_kernel void @combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { +; SI-LABEL: combine_to_fma_fsub_0_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], -v[6:7] +; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; GFX11-LABEL: combine_to_fma_fsub_0_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5] +; GFX11-NEXT: global_store_b64 v6, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1 @@ -120,17 +247,54 @@ define amdgpu_kernel void @combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %o } ; (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) -; FUNC-LABEL: {{^}}combine_to_fma_fsub_f64_0_2use: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}} -; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}} -; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]] -; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]] -; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI: s_endpgm define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { +; SI-LABEL: combine_to_fma_fsub_f64_0_2use: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: v_fma_f64 v[6:7], v[2:3], v[4:5], -v[6:7] +; SI-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], -v[8:9] +; SI-NEXT: buffer_store_dwordx2 v[6:7], v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 offset:8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; +; GFX11-LABEL: combine_to_fma_fsub_f64_0_2use: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_fma_f64 v[4:5], v[0:1], v[2:3], -v[4:5] +; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[6:7] +; GFX11-NEXT: global_store_b64 v8, v[4:5], s[0:1] dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1 @@ -153,13 +317,42 @@ define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(ptr addrspace(1) noali } ; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) -; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}} -; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]] -; SI: buffer_store_dwordx2 [[RESULT]] define amdgpu_kernel void @combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { +; SI-LABEL: combine_to_fma_fsub_1_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[4:5], v[6:7] +; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; GFX11-LABEL: combine_to_fma_fsub_1_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], v[4:5] +; GFX11-NEXT: global_store_b64 v6, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1 @@ -177,17 +370,54 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %o } ; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) -; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64_2use: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}} -; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}} -; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]] -; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[D]] -; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI: s_endpgm define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { +; SI-LABEL: combine_to_fma_fsub_1_f64_2use: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], v[6:7] +; SI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[4:5], v[8:9] +; SI-NEXT: buffer_store_dwordx2 v[6:7], v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 offset:8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; +; GFX11-LABEL: combine_to_fma_fsub_1_f64_2use: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], v[4:5] +; GFX11-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], v[6:7] +; GFX11-NEXT: global_store_b64 v8, v[4:5], s[0:1] dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1 @@ -210,13 +440,42 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(ptr addrspace(1) noali } ; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) -; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}} -; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]] -; SI: buffer_store_dwordx2 [[RESULT]] define amdgpu_kernel void @combine_to_fma_fsub_2_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { +; SI-LABEL: combine_to_fma_fsub_2_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[4:5], -v[6:7] +; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; GFX11-LABEL: combine_to_fma_fsub_2_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], -v[4:5] +; GFX11-NEXT: global_store_b64 v6, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1 @@ -236,17 +495,54 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64(ptr addrspace(1) noalias %o } ; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) -; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_neg: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}} -; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}} -; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]] -; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[D]] -; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI: s_endpgm define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { +; SI-LABEL: combine_to_fma_fsub_2_f64_2uses_neg: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], -v[6:7] +; SI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[4:5], -v[8:9] +; SI-NEXT: buffer_store_dwordx2 v[6:7], v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 offset:8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; +; GFX11-LABEL: combine_to_fma_fsub_2_f64_2uses_neg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], -v[4:5] +; GFX11-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], -v[6:7] +; GFX11-NEXT: global_store_b64 v8, v[4:5], s[0:1] dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1 @@ -271,17 +567,54 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(ptr addrspace(1) } ; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) -; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_mul: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}} -; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}} -; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]] -; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]] -; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI: s_endpgm define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { +; SI-LABEL: combine_to_fma_fsub_2_f64_2uses_mul: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], -v[6:7] +; SI-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], -v[8:9] +; SI-NEXT: buffer_store_dwordx2 v[6:7], v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 offset:8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; +; GFX11-LABEL: combine_to_fma_fsub_2_f64_2uses_mul: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], -v[4:5] +; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[6:7] +; GFX11-NEXT: global_store_b64 v8, v[4:5], s[0:1] dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1 @@ -306,23 +639,102 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(ptr addrspace(1) } ; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z))) - -; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_0_f64: -; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} -; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} -; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}} -; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}} -; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32 glc{{$}} - -; SI-SAFE: v_mul_f64 [[TMP0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]] -; SI-SAFE: v_fma_f64 [[TMP1:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[TMP0]] -; SI-SAFE: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[TMP1]], -[[Z]] - -; SI-UNSAFE: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]], -[[Z]] -; SI-UNSAFE: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[FMA0]] - -; SI: buffer_store_dwordx2 [[RESULT]] define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { +; SI-NOFMA-LABEL: aggressive_combine_to_fma_fsub_0_f64: +; SI-NOFMA: ; %bb.0: +; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s6, 0 +; SI-NOFMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-NOFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-NOFMA-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NOFMA-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) +; SI-NOFMA-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) +; SI-NOFMA-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) +; SI-NOFMA-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc +; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) +; SI-NOFMA-NEXT: buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) +; SI-NOFMA-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NOFMA-NEXT: v_mul_f64 v[8:9], v[8:9], v[10:11] +; SI-NOFMA-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; SI-NOFMA-NEXT: v_add_f64 v[2:3], v[2:3], -v[6:7] +; SI-NOFMA-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; SI-NOFMA-NEXT: s_endpgm +; +; SI-FMA-LABEL: aggressive_combine_to_fma_fsub_0_f64: +; SI-FMA: ; %bb.0: +; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-FMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s6, 0 +; SI-FMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-FMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-FMA-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-FMA-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-FMA-NEXT: s_waitcnt vmcnt(0) +; SI-FMA-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-FMA-NEXT: s_waitcnt vmcnt(0) +; SI-FMA-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-FMA-NEXT: s_waitcnt vmcnt(0) +; SI-FMA-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc +; SI-FMA-NEXT: s_waitcnt vmcnt(0) +; SI-FMA-NEXT: buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-FMA-NEXT: s_waitcnt vmcnt(0) +; SI-FMA-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-FMA-NEXT: v_fma_f64 v[6:7], v[8:9], v[10:11], -v[6:7] +; SI-FMA-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7] +; SI-FMA-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; SI-FMA-NEXT: s_endpgm +; +; GFX11-NOFMA-LABEL: aggressive_combine_to_fma_fsub_0_f64: +; GFX11-NOFMA: ; %bb.0: +; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 +; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-NOFMA-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-NOFMA-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-NOFMA-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-NOFMA-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-NOFMA-NEXT: v_mul_f64 v[6:7], v[6:7], v[8:9] +; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NOFMA-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[6:7] +; GFX11-NOFMA-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5] +; GFX11-NOFMA-NEXT: global_store_b64 v10, v[0:1], s[0:1] +; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NOFMA-NEXT: s_endpgm +; +; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_0_f64: +; GFX11-FMA: ; %bb.0: +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 +; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc +; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-FMA-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc +; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-FMA-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc +; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-FMA-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc +; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-FMA-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc +; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-FMA-NEXT: v_fma_f64 v[4:5], v[6:7], v[8:9], -v[4:5] +; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FMA-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX11-FMA-NEXT: global_store_b64 v10, v[0:1], s[0:1] +; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FMA-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1 @@ -347,23 +759,102 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) ; fold (fsub x, (fma y, z, (fmul u, v))) ; -> (fma (fneg y), z, (fma (fneg u), v, x)) - -; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_1_f64: -; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} -; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} -; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}} -; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}} -; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32 glc{{$}} - -; SI-SAFE: v_mul_f64 [[TMP0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]] -; SI-SAFE: v_fma_f64 [[TMP1:v\[[0-9]+:[0-9]+\]]], [[Y]], [[Z]], [[TMP0]] -; SI-SAFE: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], -[[TMP1]] - -; SI-UNSAFE: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[U]], [[V]], [[X]] -; SI-UNSAFE: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[Y]], [[Z]], [[FMA0]] - -; SI: buffer_store_dwordx2 [[RESULT]] define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { +; SI-NOFMA-LABEL: aggressive_combine_to_fma_fsub_1_f64: +; SI-NOFMA: ; %bb.0: +; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s6, 0 +; SI-NOFMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-NOFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-NOFMA-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NOFMA-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) +; SI-NOFMA-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) +; SI-NOFMA-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) +; SI-NOFMA-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc +; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) +; SI-NOFMA-NEXT: buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) +; SI-NOFMA-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NOFMA-NEXT: v_mul_f64 v[8:9], v[8:9], v[10:11] +; SI-NOFMA-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; SI-NOFMA-NEXT: v_add_f64 v[2:3], v[2:3], -v[4:5] +; SI-NOFMA-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; SI-NOFMA-NEXT: s_endpgm +; +; SI-FMA-LABEL: aggressive_combine_to_fma_fsub_1_f64: +; SI-FMA: ; %bb.0: +; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-FMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s6, 0 +; SI-FMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-FMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-FMA-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-FMA-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-FMA-NEXT: s_waitcnt vmcnt(0) +; SI-FMA-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-FMA-NEXT: s_waitcnt vmcnt(0) +; SI-FMA-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-FMA-NEXT: s_waitcnt vmcnt(0) +; SI-FMA-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc +; SI-FMA-NEXT: s_waitcnt vmcnt(0) +; SI-FMA-NEXT: buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-FMA-NEXT: s_waitcnt vmcnt(0) +; SI-FMA-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-FMA-NEXT: v_fma_f64 v[2:3], -v[8:9], v[10:11], v[2:3] +; SI-FMA-NEXT: v_fma_f64 v[2:3], -v[4:5], v[6:7], v[2:3] +; SI-FMA-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; SI-FMA-NEXT: s_endpgm +; +; GFX11-NOFMA-LABEL: aggressive_combine_to_fma_fsub_1_f64: +; GFX11-NOFMA: ; %bb.0: +; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 +; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-NOFMA-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-NOFMA-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-NOFMA-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-NOFMA-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-NOFMA-NEXT: v_mul_f64 v[6:7], v[6:7], v[8:9] +; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NOFMA-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7] +; GFX11-NOFMA-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX11-NOFMA-NEXT: global_store_b64 v10, v[0:1], s[0:1] +; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NOFMA-NEXT: s_endpgm +; +; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_1_f64: +; GFX11-FMA: ; %bb.0: +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 +; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc +; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-FMA-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc +; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-FMA-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc +; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-FMA-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc +; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-FMA-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc +; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-FMA-NEXT: v_fma_f64 v[0:1], -v[6:7], v[8:9], v[0:1] +; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FMA-NEXT: v_fma_f64 v[0:1], -v[2:3], v[4:5], v[0:1] +; GFX11-FMA-NEXT: global_store_b64 v10, v[0:1], s[0:1] +; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FMA-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1 @@ -391,12 +882,87 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) ; Patterns (+ fneg variants): mul(add(1.0,x),y), mul(sub(1.0,x),y), mul(sub(x,1.0),y) ; -; FUNC-LABEL: {{^}}test_f32_mul_add_x_one_y: -; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]] -; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]] -; -; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]] define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out, +; SI-NOFMA-LABEL: test_f32_mul_add_x_one_y: +; SI-NOFMA: ; %bb.0: +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 +; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-NOFMA-NEXT: s_mov_b32 s12, s6 +; SI-NOFMA-NEXT: s_mov_b32 s13, s7 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 +; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc +; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 +; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NOFMA-NEXT: s_endpgm +; +; SI-FMA-LABEL: test_f32_mul_add_x_one_y: +; SI-FMA: ; %bb.0: +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 +; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-FMA-NEXT: s_mov_b32 s12, s6 +; SI-FMA-NEXT: s_mov_b32 s13, s7 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 +; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; SI-FMA-NEXT: s_waitcnt vmcnt(0) +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc +; SI-FMA-NEXT: s_waitcnt vmcnt(0) +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 +; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, v1 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-FMA-NEXT: s_endpgm +; +; GFX11-NOFMA-LABEL: test_f32_mul_add_x_one_y: +; GFX11-NOFMA: ; %bb.0: +; GFX11-NOFMA-NEXT: s_clause 0x1 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NOFMA-NEXT: s_endpgm +; +; GFX11-FMA-LABEL: test_f32_mul_add_x_one_y: +; GFX11-FMA: ; %bb.0: +; GFX11-FMA-NEXT: s_clause 0x1 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc +; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2 +; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[4:5] +; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) { %x = load volatile float, ptr addrspace(1) %in1 @@ -407,12 +973,87 @@ define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out, ret void } -; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_one: -; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]] -; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]] -; -; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]] define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out, +; SI-NOFMA-LABEL: test_f32_mul_y_add_x_one: +; SI-NOFMA: ; %bb.0: +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 +; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-NOFMA-NEXT: s_mov_b32 s12, s6 +; SI-NOFMA-NEXT: s_mov_b32 s13, s7 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 +; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc +; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 +; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NOFMA-NEXT: s_endpgm +; +; SI-FMA-LABEL: test_f32_mul_y_add_x_one: +; SI-FMA: ; %bb.0: +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 +; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-FMA-NEXT: s_mov_b32 s12, s6 +; SI-FMA-NEXT: s_mov_b32 s13, s7 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 +; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; SI-FMA-NEXT: s_waitcnt vmcnt(0) +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc +; SI-FMA-NEXT: s_waitcnt vmcnt(0) +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 +; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, v1 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-FMA-NEXT: s_endpgm +; +; GFX11-NOFMA-LABEL: test_f32_mul_y_add_x_one: +; GFX11-NOFMA: ; %bb.0: +; GFX11-NOFMA-NEXT: s_clause 0x1 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NOFMA-NEXT: s_endpgm +; +; GFX11-FMA-LABEL: test_f32_mul_y_add_x_one: +; GFX11-FMA: ; %bb.0: +; GFX11-FMA-NEXT: s_clause 0x1 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc +; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2 +; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[4:5] +; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) { %x = load volatile float, ptr addrspace(1) %in1 @@ -423,12 +1064,87 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out, ret void } -; FUNC-LABEL: {{^}}test_f32_mul_add_x_negone_y: -; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]] -; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]] -; -; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]] define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out, +; SI-NOFMA-LABEL: test_f32_mul_add_x_negone_y: +; SI-NOFMA: ; %bb.0: +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 +; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-NOFMA-NEXT: s_mov_b32 s12, s6 +; SI-NOFMA-NEXT: s_mov_b32 s13, s7 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 +; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 +; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) +; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0 +; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) +; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NOFMA-NEXT: s_endpgm +; +; SI-FMA-LABEL: test_f32_mul_add_x_negone_y: +; SI-FMA: ; %bb.0: +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 +; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-FMA-NEXT: s_mov_b32 s12, s6 +; SI-FMA-NEXT: s_mov_b32 s13, s7 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 +; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 +; SI-FMA-NEXT: s_waitcnt vmcnt(0) +; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-FMA-NEXT: s_endpgm +; +; GFX11-NOFMA-LABEL: test_f32_mul_add_x_negone_y: +; GFX11-NOFMA: ; %bb.0: +; GFX11-NOFMA-NEXT: s_clause 0x1 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NOFMA-NEXT: s_clause 0x1 +; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1) +; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, -1.0, v1 +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NOFMA-NEXT: s_endpgm +; +; GFX11-FMA-LABEL: test_f32_mul_add_x_negone_y: +; GFX11-FMA: ; %bb.0: +; GFX11-FMA-NEXT: s_clause 0x1 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FMA-NEXT: s_clause 0x1 +; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2 +; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) { %x = load float, ptr addrspace(1) %in1 @@ -439,12 +1155,87 @@ define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out, ret void } -; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_negone: -; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]] -; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]] -; -; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]] define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out, +; SI-NOFMA-LABEL: test_f32_mul_y_add_x_negone: +; SI-NOFMA: ; %bb.0: +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 +; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-NOFMA-NEXT: s_mov_b32 s12, s6 +; SI-NOFMA-NEXT: s_mov_b32 s13, s7 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 +; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 +; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) +; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0 +; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) +; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NOFMA-NEXT: s_endpgm +; +; SI-FMA-LABEL: test_f32_mul_y_add_x_negone: +; SI-FMA: ; %bb.0: +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 +; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-FMA-NEXT: s_mov_b32 s12, s6 +; SI-FMA-NEXT: s_mov_b32 s13, s7 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 +; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 +; SI-FMA-NEXT: s_waitcnt vmcnt(0) +; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-FMA-NEXT: s_endpgm +; +; GFX11-NOFMA-LABEL: test_f32_mul_y_add_x_negone: +; GFX11-NOFMA: ; %bb.0: +; GFX11-NOFMA-NEXT: s_clause 0x1 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NOFMA-NEXT: s_clause 0x1 +; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1) +; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, -1.0, v1 +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NOFMA-NEXT: s_endpgm +; +; GFX11-FMA-LABEL: test_f32_mul_y_add_x_negone: +; GFX11-FMA: ; %bb.0: +; GFX11-FMA-NEXT: s_clause 0x1 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FMA-NEXT: s_clause 0x1 +; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2 +; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) { %x = load float, ptr addrspace(1) %in1 @@ -455,12 +1246,87 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out, ret void } -; FUNC-LABEL: {{^}}test_f32_mul_sub_one_x_y: -; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]] -; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]] -; -; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]] define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out, +; SI-NOFMA-LABEL: test_f32_mul_sub_one_x_y: +; SI-NOFMA: ; %bb.0: +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 +; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-NOFMA-NEXT: s_mov_b32 s12, s6 +; SI-NOFMA-NEXT: s_mov_b32 s13, s7 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 +; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 +; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) +; SI-NOFMA-NEXT: v_sub_f32_e32 v0, 1.0, v0 +; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) +; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NOFMA-NEXT: s_endpgm +; +; SI-FMA-LABEL: test_f32_mul_sub_one_x_y: +; SI-FMA: ; %bb.0: +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 +; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-FMA-NEXT: s_mov_b32 s12, s6 +; SI-FMA-NEXT: s_mov_b32 s13, s7 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 +; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 +; SI-FMA-NEXT: s_waitcnt vmcnt(0) +; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, v1 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-FMA-NEXT: s_endpgm +; +; GFX11-NOFMA-LABEL: test_f32_mul_sub_one_x_y: +; GFX11-NOFMA: ; %bb.0: +; GFX11-NOFMA-NEXT: s_clause 0x1 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NOFMA-NEXT: s_clause 0x1 +; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1) +; GFX11-NOFMA-NEXT: v_sub_f32_e32 v1, 1.0, v1 +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NOFMA-NEXT: s_endpgm +; +; GFX11-FMA-LABEL: test_f32_mul_sub_one_x_y: +; GFX11-FMA: ; %bb.0: +; GFX11-FMA-NEXT: s_clause 0x1 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FMA-NEXT: s_clause 0x1 +; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, v2 +; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) { %x = load float, ptr addrspace(1) %in1 @@ -471,12 +1337,87 @@ define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out, ret void } -; FUNC-LABEL: {{^}}test_f32_mul_y_sub_one_x: -; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]] -; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]] -; -; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]] define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out, +; SI-NOFMA-LABEL: test_f32_mul_y_sub_one_x: +; SI-NOFMA: ; %bb.0: +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 +; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-NOFMA-NEXT: s_mov_b32 s12, s6 +; SI-NOFMA-NEXT: s_mov_b32 s13, s7 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 +; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 +; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) +; SI-NOFMA-NEXT: v_sub_f32_e32 v0, 1.0, v0 +; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) +; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NOFMA-NEXT: s_endpgm +; +; SI-FMA-LABEL: test_f32_mul_y_sub_one_x: +; SI-FMA: ; %bb.0: +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 +; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-FMA-NEXT: s_mov_b32 s12, s6 +; SI-FMA-NEXT: s_mov_b32 s13, s7 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 +; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 +; SI-FMA-NEXT: s_waitcnt vmcnt(0) +; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, v1 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-FMA-NEXT: s_endpgm +; +; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_one_x: +; GFX11-NOFMA: ; %bb.0: +; GFX11-NOFMA-NEXT: s_clause 0x1 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NOFMA-NEXT: s_clause 0x1 +; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1) +; GFX11-NOFMA-NEXT: v_sub_f32_e32 v1, 1.0, v1 +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NOFMA-NEXT: s_endpgm +; +; GFX11-FMA-LABEL: test_f32_mul_y_sub_one_x: +; GFX11-FMA: ; %bb.0: +; GFX11-FMA-NEXT: s_clause 0x1 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FMA-NEXT: s_clause 0x1 +; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, v2 +; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) { %x = load float, ptr addrspace(1) %in1 @@ -487,12 +1428,87 @@ define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out, ret void } -; FUNC-LABEL: {{^}}test_f32_mul_sub_negone_x_y: -; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]] -; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]] -; -; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]] define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out, +; SI-NOFMA-LABEL: test_f32_mul_sub_negone_x_y: +; SI-NOFMA: ; %bb.0: +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 +; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-NOFMA-NEXT: s_mov_b32 s12, s6 +; SI-NOFMA-NEXT: s_mov_b32 s13, s7 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 +; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 +; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) +; SI-NOFMA-NEXT: v_sub_f32_e32 v0, -1.0, v0 +; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) +; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NOFMA-NEXT: s_endpgm +; +; SI-FMA-LABEL: test_f32_mul_sub_negone_x_y: +; SI-FMA: ; %bb.0: +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 +; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-FMA-NEXT: s_mov_b32 s12, s6 +; SI-FMA-NEXT: s_mov_b32 s13, s7 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 +; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 +; SI-FMA-NEXT: s_waitcnt vmcnt(0) +; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, -v1 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-FMA-NEXT: s_endpgm +; +; GFX11-NOFMA-LABEL: test_f32_mul_sub_negone_x_y: +; GFX11-NOFMA: ; %bb.0: +; GFX11-NOFMA-NEXT: s_clause 0x1 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NOFMA-NEXT: s_clause 0x1 +; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1) +; GFX11-NOFMA-NEXT: v_sub_f32_e32 v1, -1.0, v1 +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NOFMA-NEXT: s_endpgm +; +; GFX11-FMA-LABEL: test_f32_mul_sub_negone_x_y: +; GFX11-FMA: ; %bb.0: +; GFX11-FMA-NEXT: s_clause 0x1 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FMA-NEXT: s_clause 0x1 +; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, -v2 +; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) { %x = load float, ptr addrspace(1) %in1 @@ -503,12 +1519,87 @@ define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out, ret void } -; FUNC-LABEL: {{^}}test_f32_mul_y_sub_negone_x: -; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]] -; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]] -; -; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]] define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out, +; SI-NOFMA-LABEL: test_f32_mul_y_sub_negone_x: +; SI-NOFMA: ; %bb.0: +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 +; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-NOFMA-NEXT: s_mov_b32 s12, s6 +; SI-NOFMA-NEXT: s_mov_b32 s13, s7 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 +; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 +; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) +; SI-NOFMA-NEXT: v_sub_f32_e32 v0, -1.0, v0 +; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) +; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NOFMA-NEXT: s_endpgm +; +; SI-FMA-LABEL: test_f32_mul_y_sub_negone_x: +; SI-FMA: ; %bb.0: +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 +; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-FMA-NEXT: s_mov_b32 s12, s6 +; SI-FMA-NEXT: s_mov_b32 s13, s7 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 +; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 +; SI-FMA-NEXT: s_waitcnt vmcnt(0) +; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, -v1 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-FMA-NEXT: s_endpgm +; +; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_negone_x: +; GFX11-NOFMA: ; %bb.0: +; GFX11-NOFMA-NEXT: s_clause 0x1 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NOFMA-NEXT: s_clause 0x1 +; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1) +; GFX11-NOFMA-NEXT: v_sub_f32_e32 v1, -1.0, v1 +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NOFMA-NEXT: s_endpgm +; +; GFX11-FMA-LABEL: test_f32_mul_y_sub_negone_x: +; GFX11-FMA: ; %bb.0: +; GFX11-FMA-NEXT: s_clause 0x1 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FMA-NEXT: s_clause 0x1 +; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, -v2 +; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) { %x = load float, ptr addrspace(1) %in1 @@ -519,12 +1610,87 @@ define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out, ret void } -; FUNC-LABEL: {{^}}test_f32_mul_sub_x_one_y: -; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]] -; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]] -; -; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]] define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out, +; SI-NOFMA-LABEL: test_f32_mul_sub_x_one_y: +; SI-NOFMA: ; %bb.0: +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 +; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-NOFMA-NEXT: s_mov_b32 s12, s6 +; SI-NOFMA-NEXT: s_mov_b32 s13, s7 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 +; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 +; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) +; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0 +; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) +; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NOFMA-NEXT: s_endpgm +; +; SI-FMA-LABEL: test_f32_mul_sub_x_one_y: +; SI-FMA: ; %bb.0: +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 +; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-FMA-NEXT: s_mov_b32 s12, s6 +; SI-FMA-NEXT: s_mov_b32 s13, s7 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 +; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 +; SI-FMA-NEXT: s_waitcnt vmcnt(0) +; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-FMA-NEXT: s_endpgm +; +; GFX11-NOFMA-LABEL: test_f32_mul_sub_x_one_y: +; GFX11-NOFMA: ; %bb.0: +; GFX11-NOFMA-NEXT: s_clause 0x1 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NOFMA-NEXT: s_clause 0x1 +; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1) +; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, -1.0, v1 +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NOFMA-NEXT: s_endpgm +; +; GFX11-FMA-LABEL: test_f32_mul_sub_x_one_y: +; GFX11-FMA: ; %bb.0: +; GFX11-FMA-NEXT: s_clause 0x1 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FMA-NEXT: s_clause 0x1 +; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2 +; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) { %x = load float, ptr addrspace(1) %in1 @@ -535,12 +1701,87 @@ define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out, ret void } -; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_one: -; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]] -; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]] -; -; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]] define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out, +; SI-NOFMA-LABEL: test_f32_mul_y_sub_x_one: +; SI-NOFMA: ; %bb.0: +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 +; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-NOFMA-NEXT: s_mov_b32 s12, s6 +; SI-NOFMA-NEXT: s_mov_b32 s13, s7 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 +; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 +; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) +; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0 +; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) +; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NOFMA-NEXT: s_endpgm +; +; SI-FMA-LABEL: test_f32_mul_y_sub_x_one: +; SI-FMA: ; %bb.0: +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 +; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-FMA-NEXT: s_mov_b32 s12, s6 +; SI-FMA-NEXT: s_mov_b32 s13, s7 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 +; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 +; SI-FMA-NEXT: s_waitcnt vmcnt(0) +; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-FMA-NEXT: s_endpgm +; +; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_x_one: +; GFX11-NOFMA: ; %bb.0: +; GFX11-NOFMA-NEXT: s_clause 0x1 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NOFMA-NEXT: s_clause 0x1 +; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1) +; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, -1.0, v1 +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NOFMA-NEXT: s_endpgm +; +; GFX11-FMA-LABEL: test_f32_mul_y_sub_x_one: +; GFX11-FMA: ; %bb.0: +; GFX11-FMA-NEXT: s_clause 0x1 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FMA-NEXT: s_clause 0x1 +; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2 +; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) { %x = load float, ptr addrspace(1) %in1 @@ -551,12 +1792,87 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out, ret void } -; FUNC-LABEL: {{^}}test_f32_mul_sub_x_negone_y: -; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]] -; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]] -; -; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]] define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out, +; SI-NOFMA-LABEL: test_f32_mul_sub_x_negone_y: +; SI-NOFMA: ; %bb.0: +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 +; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-NOFMA-NEXT: s_mov_b32 s12, s6 +; SI-NOFMA-NEXT: s_mov_b32 s13, s7 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 +; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 +; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) +; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) +; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NOFMA-NEXT: s_endpgm +; +; SI-FMA-LABEL: test_f32_mul_sub_x_negone_y: +; SI-FMA: ; %bb.0: +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 +; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-FMA-NEXT: s_mov_b32 s12, s6 +; SI-FMA-NEXT: s_mov_b32 s13, s7 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 +; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 +; SI-FMA-NEXT: s_waitcnt vmcnt(0) +; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, v1 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-FMA-NEXT: s_endpgm +; +; GFX11-NOFMA-LABEL: test_f32_mul_sub_x_negone_y: +; GFX11-NOFMA: ; %bb.0: +; GFX11-NOFMA-NEXT: s_clause 0x1 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NOFMA-NEXT: s_clause 0x1 +; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1) +; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NOFMA-NEXT: s_endpgm +; +; GFX11-FMA-LABEL: test_f32_mul_sub_x_negone_y: +; GFX11-FMA: ; %bb.0: +; GFX11-FMA-NEXT: s_clause 0x1 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FMA-NEXT: s_clause 0x1 +; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2 +; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[4:5] +; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) { %x = load float, ptr addrspace(1) %in1 @@ -567,12 +1883,87 @@ define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out, ret void } -; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_negone: -; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]] -; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]] -; -; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]] define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out, +; SI-NOFMA-LABEL: test_f32_mul_y_sub_x_negone: +; SI-NOFMA: ; %bb.0: +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 +; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-NOFMA-NEXT: s_mov_b32 s12, s6 +; SI-NOFMA-NEXT: s_mov_b32 s13, s7 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 +; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 +; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) +; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) +; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NOFMA-NEXT: s_endpgm +; +; SI-FMA-LABEL: test_f32_mul_y_sub_x_negone: +; SI-FMA: ; %bb.0: +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 +; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-FMA-NEXT: s_mov_b32 s12, s6 +; SI-FMA-NEXT: s_mov_b32 s13, s7 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 +; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 +; SI-FMA-NEXT: s_waitcnt vmcnt(0) +; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, v1 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-FMA-NEXT: s_endpgm +; +; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_x_negone: +; GFX11-NOFMA: ; %bb.0: +; GFX11-NOFMA-NEXT: s_clause 0x1 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NOFMA-NEXT: s_clause 0x1 +; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1) +; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NOFMA-NEXT: s_endpgm +; +; GFX11-FMA-LABEL: test_f32_mul_y_sub_x_negone: +; GFX11-FMA: ; %bb.0: +; GFX11-FMA-NEXT: s_clause 0x1 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FMA-NEXT: s_clause 0x1 +; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2 +; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[4:5] +; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) { %x = load float, ptr addrspace(1) %in1 @@ -587,14 +1978,106 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out, ; Interpolation Patterns: add(mul(x,t),mul(sub(1.0,t),y)) ; -; FUNC-LABEL: {{^}}test_f32_interp: -; SI-NOFMA: v_sub_f32_e32 [[VT1:v[0-9]]], 1.0, [[VT:v[0-9]]] -; SI-NOFMA: v_mul_f32_e32 [[VTY:v[0-9]]], [[VY:v[0-9]]], [[VT1]] -; SI-NOFMA: v_mac_f32_e32 [[VTY]], [[VX:v[0-9]]], [[VT]] -; -; SI-FMA: v_fma_f32 [[VR:v[0-9]]], -[[VT:v[0-9]]], [[VY:v[0-9]]], [[VY]] -; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VT]], [[VR]] define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out, +; SI-NOFMA-LABEL: test_f32_interp: +; SI-NOFMA: ; %bb.0: +; SI-NOFMA-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s10, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_mov_b32 s15, s11 +; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-NOFMA-NEXT: s_mov_b32 s16, s4 +; SI-NOFMA-NEXT: s_mov_b32 s17, s5 +; SI-NOFMA-NEXT: s_mov_b32 s4, s6 +; SI-NOFMA-NEXT: s_mov_b32 s5, s7 +; SI-NOFMA-NEXT: s_mov_b32 s6, s10 +; SI-NOFMA-NEXT: s_mov_b32 s7, s11 +; SI-NOFMA-NEXT: s_mov_b32 s12, s2 +; SI-NOFMA-NEXT: s_mov_b32 s13, s3 +; SI-NOFMA-NEXT: s_mov_b32 s18, s10 +; SI-NOFMA-NEXT: s_mov_b32 s19, s11 +; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-NOFMA-NEXT: buffer_load_dword v2, off, s[12:15], 0 +; SI-NOFMA-NEXT: s_mov_b32 s8, s0 +; SI-NOFMA-NEXT: s_mov_b32 s9, s1 +; SI-NOFMA-NEXT: s_waitcnt vmcnt(2) +; SI-NOFMA-NEXT: v_sub_f32_e32 v3, 1.0, v0 +; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) +; SI-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v3 +; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) +; SI-NOFMA-NEXT: v_mac_f32_e32 v1, v2, v0 +; SI-NOFMA-NEXT: buffer_store_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_endpgm +; +; SI-FMA-LABEL: test_f32_interp: +; SI-FMA: ; %bb.0: +; SI-FMA-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s10, -1 +; SI-FMA-NEXT: s_mov_b32 s18, s10 +; SI-FMA-NEXT: s_mov_b32 s19, s11 +; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-FMA-NEXT: s_mov_b32 s16, s4 +; SI-FMA-NEXT: s_mov_b32 s17, s5 +; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_mov_b32 s12, s2 +; SI-FMA-NEXT: s_mov_b32 s13, s3 +; SI-FMA-NEXT: s_mov_b32 s15, s11 +; SI-FMA-NEXT: s_mov_b32 s4, s6 +; SI-FMA-NEXT: s_mov_b32 s5, s7 +; SI-FMA-NEXT: s_mov_b32 s6, s10 +; SI-FMA-NEXT: s_mov_b32 s7, s11 +; SI-FMA-NEXT: buffer_load_dword v0, off, s[16:19], 0 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; SI-FMA-NEXT: buffer_load_dword v2, off, s[12:15], 0 +; SI-FMA-NEXT: s_mov_b32 s8, s0 +; SI-FMA-NEXT: s_mov_b32 s9, s1 +; SI-FMA-NEXT: s_waitcnt vmcnt(1) +; SI-FMA-NEXT: v_fma_f32 v0, -v1, v0, v0 +; SI-FMA-NEXT: s_waitcnt vmcnt(0) +; SI-FMA-NEXT: v_fma_f32 v0, v2, v1, v0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: s_endpgm +; +; GFX11-NOFMA-LABEL: test_f32_interp: +; GFX11-NOFMA: ; %bb.0: +; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NOFMA-NEXT: s_clause 0x2 +; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5] +; GFX11-NOFMA-NEXT: global_load_b32 v3, v0, s[2:3] +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(2) +; GFX11-NOFMA-NEXT: v_sub_f32_e32 v4, 1.0, v1 +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1) +; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NOFMA-NEXT: v_mul_f32_e32 v2, v2, v4 +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-NOFMA-NEXT: v_fmac_f32_e32 v2, v3, v1 +; GFX11-NOFMA-NEXT: global_store_b32 v0, v2, s[0:1] +; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NOFMA-NEXT: s_endpgm +; +; GFX11-FMA-LABEL: test_f32_interp: +; GFX11-FMA: ; %bb.0: +; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FMA-NEXT: s_clause 0x2 +; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[6:7] +; GFX11-FMA-NEXT: global_load_b32 v3, v0, s[2:3] +; GFX11-FMA-NEXT: s_waitcnt vmcnt(1) +; GFX11-FMA-NEXT: v_fma_f32 v1, -v2, v1, v1 +; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FMA-NEXT: v_fmac_f32_e32 v1, v3, v2 +; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %in3) { @@ -609,14 +2092,74 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out, ret void } -; FUNC-LABEL: {{^}}test_f64_interp: -; SI-NOFMA: v_add_f64 [[VT1:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], 1.0 -; SI-NOFMA: v_mul_f64 [[VTY:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VT1]] -; SI-NOFMA: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VTY]] -; -; SI-FMA: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VY]] -; SI-FMA: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VR]] define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out, +; SI-FMA-LABEL: test_f64_interp: +; SI-FMA: ; %bb.0: +; SI-FMA-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s10, -1 +; SI-FMA-NEXT: s_mov_b32 s18, s10 +; SI-FMA-NEXT: s_mov_b32 s19, s11 +; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-FMA-NEXT: s_mov_b32 s16, s4 +; SI-FMA-NEXT: s_mov_b32 s17, s5 +; SI-FMA-NEXT: s_mov_b32 s4, s6 +; SI-FMA-NEXT: s_mov_b32 s5, s7 +; SI-FMA-NEXT: s_mov_b32 s6, s10 +; SI-FMA-NEXT: s_mov_b32 s7, s11 +; SI-FMA-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0 +; SI-FMA-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 +; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_mov_b32 s12, s2 +; SI-FMA-NEXT: s_mov_b32 s13, s3 +; SI-FMA-NEXT: s_mov_b32 s15, s11 +; SI-FMA-NEXT: buffer_load_dwordx2 v[4:5], off, s[12:15], 0 +; SI-FMA-NEXT: s_mov_b32 s8, s0 +; SI-FMA-NEXT: s_mov_b32 s9, s1 +; SI-FMA-NEXT: s_waitcnt vmcnt(1) +; SI-FMA-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], v[0:1] +; SI-FMA-NEXT: s_waitcnt vmcnt(0) +; SI-FMA-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-FMA-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-FMA-NEXT: s_endpgm +; +; GFX11-NOFMA-LABEL: test_f64_interp: +; GFX11-NOFMA: ; %bb.0: +; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NOFMA-NEXT: s_clause 0x2 +; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v8, s[6:7] +; GFX11-NOFMA-NEXT: global_load_b64 v[2:3], v8, s[4:5] +; GFX11-NOFMA-NEXT: global_load_b64 v[4:5], v8, s[2:3] +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(2) +; GFX11-NOFMA-NEXT: v_add_f64 v[6:7], -v[0:1], 1.0 +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1) +; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NOFMA-NEXT: v_mul_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-NOFMA-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3] +; GFX11-NOFMA-NEXT: global_store_b64 v8, v[0:1], s[0:1] +; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NOFMA-NEXT: s_endpgm +; +; GFX11-FMA-LABEL: test_f64_interp: +; GFX11-FMA: ; %bb.0: +; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: v_mov_b32_e32 v6, 0 +; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FMA-NEXT: s_clause 0x2 +; GFX11-FMA-NEXT: global_load_b64 v[0:1], v6, s[4:5] +; GFX11-FMA-NEXT: global_load_b64 v[2:3], v6, s[6:7] +; GFX11-FMA-NEXT: global_load_b64 v[4:5], v6, s[2:3] +; GFX11-FMA-NEXT: s_waitcnt vmcnt(1) +; GFX11-FMA-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], v[0:1] +; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) +; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FMA-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; GFX11-FMA-NEXT: global_store_b64 v6, v[0:1], s[0:1] +; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %in3) { @@ -632,13 +2175,36 @@ define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out, } ; Make sure negative constant cancels out fneg -; SI-LABEL: {{^}}fma_neg_2.0_neg_a_b_f32: -; SI: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; SI: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; SI-NOT: [[A]] -; SI-NOT: [[B]] -; SI: v_fma_f32 v{{[0-9]+}}, [[A]], 2.0, [[B]] define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: fma_neg_2.0_neg_a_b_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_fma_f32 v2, v2, 2.0, v3 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; GFX11-LABEL: fma_neg_2.0_neg_a_b_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:4 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_fmac_f32_e32 v2, 2.0, v1 +; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 @@ -654,13 +2220,36 @@ define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr ad ret void } -; SI-LABEL: {{^}}fma_2.0_neg_a_b_f32: -; SI: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; SI: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; SI-NOT: [[A]] -; SI-NOT: [[B]] -; SI: v_fma_f32 v{{[0-9]+}}, [[A]], -2.0, [[B]] define amdgpu_kernel void @fma_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: fma_2.0_neg_a_b_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_fma_f32 v2, v2, -2.0, v3 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; GFX11-LABEL: fma_2.0_neg_a_b_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:4 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_fmac_f32_e32 v2, -2.0, v1 +; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 @@ -676,12 +2265,45 @@ define amdgpu_kernel void @fma_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrsp ret void } -; SI-LABEL: {{^}}fma_neg_b_c_v4f32: -; SI: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}} -; SI: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}} -; SI: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}} -; SI: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}} define amdgpu_kernel void @fma_neg_b_c_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #2 { +; SI-LABEL: fma_neg_b_c_v4f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v12, 4, v0 +; SI-NEXT: v_mov_b32_e32 v13, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[12:13], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dwordx4 v[4:7], v[12:13], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_load_dwordx4 v[8:11], v[12:13], s[4:7], 0 addr64 offset:48 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_fma_f32 v3, v11, -v3, -v7 +; SI-NEXT: v_fma_f32 v2, v10, -v2, -v6 +; SI-NEXT: v_fma_f32 v1, v9, -v1, -v5 +; SI-NEXT: v_fma_f32 v0, v8, -v0, -v4 +; SI-NEXT: buffer_store_dwordx4 v[0:3], v[12:13], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; GFX11-LABEL: fma_neg_b_c_v4f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 4, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: global_load_b128 v[0:3], v12, s[2:3] offset:16 +; GFX11-NEXT: global_load_b128 v[4:7], v12, s[2:3] +; GFX11-NEXT: global_load_b128 v[8:11], v12, s[2:3] offset:48 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_fma_f32 v3, v11, -v7, -v3 +; GFX11-NEXT: v_fma_f32 v2, v10, -v6, -v2 +; GFX11-NEXT: v_fma_f32 v1, v9, -v5, -v1 +; GFX11-NEXT: v_fma_f32 v0, v8, -v4, -v0 +; GFX11-NEXT: global_store_b128 v12, v[0:3], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr <4 x float>, ptr addrspace(1) %in, i32 %tid %gep.1 = getelementptr <4 x float>, ptr addrspace(1) %gep.0, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll index 25937773..a8a2610 100644 --- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll @@ -3,6 +3,8 @@ ; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GFX9,GFX9-GISEL ; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefixes=GFX10,GFX10-SDAG ; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefixes=GFX10,GFX10-GISEL +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL declare half @llvm.fma.f16(half, half, half) declare half @llvm.maxnum.f16(half, half) @@ -20,6 +22,13 @@ define half @test_fma(half %x, half %y, half %z) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_fma_f16 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_fma: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %r = call half @llvm.fma.f16(half %x, half %y, half %z) ret half %r } @@ -38,6 +47,13 @@ define half @test_fmac(half %x, half %y, half %z) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_fmac_f16_e32 v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_fmac: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_fmac_f16_e32 v0, v1, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %r = call half @llvm.fma.f16(half %y, half %z, half %x) ret half %r } @@ -64,6 +80,13 @@ define half @test_fmaak(half %x, half %y, half %z) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_fmaak: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200 +; GFX11-NEXT: s_setpc_b64 s[30:31] %r = call half @llvm.fma.f16(half %x, half %y, half 0xH4200) ret half %r } @@ -90,6 +113,13 @@ define half @test_fmamk(half %x, half %y, half %z) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_fmamk: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %r = call half @llvm.fma.f16(half %x, half 0xH4200, half %z) ret half %r } @@ -134,6 +164,37 @@ define i32 @test_D139469_f16(half %arg) { ; GFX10-GISEL-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: test_D139469_f16: +; GFX11-SDAG: ; %bb.0: ; %bb +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x211e +; GFX11-SDAG-NEXT: v_mul_f16_e32 v2, 0x291e, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_fmac_f16_e32 v1, 0x291e, v0 +; GFX11-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cmp_gt_f16_e64 s0, 0, v1 +; GFX11-SDAG-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_D139469_f16: +; GFX11-GISEL: ; %bb.0: ; %bb +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_movk_i32 s0, 0x291e +; GFX11-GISEL-NEXT: v_mul_f16_e32 v1, 0x291e, v0 +; GFX11-GISEL-NEXT: v_fmaak_f16 v0, s0, v0, 0x211e +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1 +; GFX11-GISEL-NEXT: v_cmp_gt_f16_e64 s0, 0, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] bb: %i = fmul contract half %arg, 0xH291E %i1 = fcmp olt half %i, 0xH0000 @@ -216,6 +277,52 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) { ; GFX10-GISEL-NEXT: s_or_b32 s4, s6, s5 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: test_D139469_v2f16: +; GFX11-SDAG: ; %bb.0: ; %bb +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_movk_i32 s0, 0x211e +; GFX11-SDAG-NEXT: v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1] +; GFX11-SDAG-NEXT: v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1 +; GFX11-SDAG-NEXT: v_cmp_gt_f16_e64 s0, 0, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-NEXT: v_cmp_gt_f16_e64 s1, 0, v2 +; GFX11-SDAG-NEXT: v_cmp_gt_f16_e64 s2, 0, v3 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_or_b32 s0, s1, s2 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_D139469_v2f16: +; GFX11-GISEL: ; %bb.0: ; %bb +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_mov_b32 s0, 0x291e291e +; GFX11-GISEL-NEXT: v_pk_mul_f16 v1, v0, 0x291e op_sel_hi:[1,0] +; GFX11-GISEL-NEXT: v_pk_fma_f16 v0, v0, s0, 0x211e op_sel_hi:[1,1,0] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1 +; GFX11-GISEL-NEXT: v_cmp_gt_f16_e64 s0, 0, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_cmp_gt_f16_e64 s1, 0, v2 +; GFX11-GISEL-NEXT: v_cmp_gt_f16_e64 s2, 0, v3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_or_b32 s0, s1, s2 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] bb: %i = fmul contract <2 x half> %arg, %i1 = fcmp olt <2 x half> %i, diff --git a/llvm/test/CodeGen/AMDGPU/fma.f64.ll b/llvm/test/CodeGen/AMDGPU/fma.f64.ll index 75bc89f..f623dd8 100644 --- a/llvm/test/CodeGen/AMDGPU/fma.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fma.f64.ll @@ -1,6 +1,7 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx90a -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GFX90A %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,SIGFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,SIGFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx90a -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GFX90A %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,SIGFX11 %s declare double @llvm.fma.f64(double, double, double) nounwind readnone declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone @@ -8,7 +9,7 @@ declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) n declare double @llvm.fabs.f64(double) nounwind readnone ; FUNC-LABEL: {{^}}fma_f64: -; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; SIGFX11: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} ; GFX90A: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @fma_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %in3) { @@ -21,8 +22,8 @@ define amdgpu_kernel void @fma_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, } ; FUNC-LABEL: {{^}}fma_v2f64: -; SI: v_fma_f64 -; SI: v_fma_f64 +; SIGFX11: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; SIGFX11: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} ; GFX90A: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} ; GFX90A: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @fma_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, @@ -36,10 +37,10 @@ define amdgpu_kernel void @fma_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %in } ; FUNC-LABEL: {{^}}fma_v4f64: -; SI: v_fma_f64 -; SI: v_fma_f64 -; SI: v_fma_f64 -; SI: v_fma_f64 +; SIGFX11: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; SIGFX11: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; SIGFX11: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; SIGFX11: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} ; GFX90A: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} ; GFX90A: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} ; GFX90A: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} @@ -81,8 +82,7 @@ define amdgpu_kernel void @fma_f64_abs_src1(ptr addrspace(1) %out, ptr addrspace } ; FUNC-LABEL: {{^}}fma_f64_abs_src2: -; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], \|v\[[0-9]+:[0-9]+\]\|}} -; GFX90A: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], \|v\[[0-9]+:[0-9]+\]\|}} +; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], \|v\[[0-9]+:[0-9]+\]\|}} define amdgpu_kernel void @fma_f64_abs_src2(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %in3) { %r0 = load double, ptr addrspace(1) %in1 @@ -121,8 +121,7 @@ define amdgpu_kernel void @fma_f64_neg_src1(ptr addrspace(1) %out, ptr addrspace } ; FUNC-LABEL: {{^}}fma_f64_neg_src2: -; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -; GFX90A: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} +; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @fma_f64_neg_src2(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %in3) { %r0 = load double, ptr addrspace(1) %in1 @@ -163,8 +162,7 @@ define amdgpu_kernel void @fma_f64_abs_neg_src1(ptr addrspace(1) %out, ptr addrs } ; FUNC-LABEL: {{^}}fma_f64_abs_neg_src2: -; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|}} -; GFX90A: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|}} +; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|}} define amdgpu_kernel void @fma_f64_abs_neg_src2(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %in3) { %r0 = load double, ptr addrspace(1) %in1 @@ -178,7 +176,7 @@ define amdgpu_kernel void @fma_f64_abs_neg_src2(ptr addrspace(1) %out, ptr addrs } ; FUNC-LABEL: {{^}}fma_f64_lit_src0: -; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], 2.0, v\[[0-9]+:[0-9]+\]}} +; SIGFX11: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], 2.0, v\[[0-9]+:[0-9]+\]}} ; GFX90A: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], 2.0, v\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @fma_f64_lit_src0(ptr addrspace(1) %out, ptr addrspace(1) %in2, ptr addrspace(1) %in3) { @@ -190,7 +188,7 @@ define amdgpu_kernel void @fma_f64_lit_src0(ptr addrspace(1) %out, } ; FUNC-LABEL: {{^}}fma_f64_lit_src1: -; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], 2.0, v\[[0-9]+:[0-9]+\]}} +; SIGFX11: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], 2.0, v\[[0-9]+:[0-9]+\]}} ; GFX90A: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], 2.0, v\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @fma_f64_lit_src1(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in3) { @@ -202,8 +200,7 @@ define amdgpu_kernel void @fma_f64_lit_src1(ptr addrspace(1) %out, ptr addrspace } ; FUNC-LABEL: {{^}}fma_f64_lit_src2: -; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], 2.0}} -; GFX90A: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], 2.0}} +; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], 2.0}} define amdgpu_kernel void @fma_f64_lit_src2(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) { %r0 = load double, ptr addrspace(1) %in1 -- 2.7.4