; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -fp-contract=fast < %s | FileCheck -check-prefix=GFX10-CONTRACT %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -fp-contract=fast < %s | FileCheck -check-prefix=GFX11-CONTRACT %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX11-DENORM %s
; fadd (fma a, b, (fmul c, d)), e --> fma a, b, (fma c, d, e)
; fadd e, (fma a, b, (fmul c, d)) --> fma a, b, (fma c, d, e)
; GFX10-DENORM-NEXT: v_fmac_f32_e32 v2, v0, v1
; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, v2
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-CONTRACT-LABEL: test_f32_add_mul:
+; GFX11-CONTRACT: ; %bb.0: ; %.entry
+; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CONTRACT-NEXT: v_fma_f32 v2, v2, v3, v4
+; GFX11-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-CONTRACT-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX11-CONTRACT-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-DENORM-LABEL: test_f32_add_mul:
+; GFX11-DENORM: ; %bb.0: ; %.entry
+; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-DENORM-NEXT: v_fma_f32 v2, v2, v3, v4
+; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DENORM-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-DENORM-NEXT: s_setpc_b64 s[30:31]
.entry:
%x = fmul fast float %c, %d
%y = call fast float @llvm.fmuladd.f32(float %a, float %b, float %x)
; GFX10-DENORM-NEXT: v_fmac_f32_e32 v2, v0, v1
; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, v2
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-CONTRACT-LABEL: test_f32_add_mul_rhs:
+; GFX11-CONTRACT: ; %bb.0: ; %.entry
+; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CONTRACT-NEXT: v_fma_f32 v2, v2, v3, v4
+; GFX11-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-CONTRACT-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX11-CONTRACT-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-DENORM-LABEL: test_f32_add_mul_rhs:
+; GFX11-DENORM: ; %bb.0: ; %.entry
+; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-DENORM-NEXT: v_fma_f32 v2, v2, v3, v4
+; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DENORM-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-DENORM-NEXT: s_setpc_b64 s[30:31]
.entry:
%x = fmul fast float %c, %d
%y = call fast float @llvm.fmuladd.f32(float %a, float %b, float %x)
; GFX10-DENORM-NEXT: v_add_f16_e32 v0, v0, v2
; GFX10-DENORM-NEXT: v_add_f16_e32 v0, v0, v4
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-CONTRACT-LABEL: test_half_add_mul:
+; GFX11-CONTRACT: ; %bb.0: ; %.entry
+; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CONTRACT-NEXT: v_fma_f16 v2, v2, v3, v4
+; GFX11-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-CONTRACT-NEXT: v_fmac_f16_e32 v2, v0, v1
+; GFX11-CONTRACT-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-DENORM-LABEL: test_half_add_mul:
+; GFX11-DENORM: ; %bb.0: ; %.entry
+; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-DENORM-NEXT: v_mul_f16_e32 v2, v2, v3
+; GFX11-DENORM-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DENORM-NEXT: v_add_f16_e32 v0, v0, v2
+; GFX11-DENORM-NEXT: v_add_f16_e32 v0, v0, v4
+; GFX11-DENORM-NEXT: s_setpc_b64 s[30:31]
.entry:
%x = fmul fast half %c, %d
%y = call fast half @llvm.fmuladd.f16(half %a, half %b, half %x)
; GFX10-DENORM-NEXT: v_add_f16_e32 v0, v0, v2
; GFX10-DENORM-NEXT: v_add_f16_e32 v0, v4, v0
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-CONTRACT-LABEL: test_half_add_mul_rhs:
+; GFX11-CONTRACT: ; %bb.0: ; %.entry
+; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CONTRACT-NEXT: v_fma_f16 v2, v2, v3, v4
+; GFX11-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-CONTRACT-NEXT: v_fmac_f16_e32 v2, v0, v1
+; GFX11-CONTRACT-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-DENORM-LABEL: test_half_add_mul_rhs:
+; GFX11-DENORM: ; %bb.0: ; %.entry
+; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-DENORM-NEXT: v_mul_f16_e32 v2, v2, v3
+; GFX11-DENORM-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DENORM-NEXT: v_add_f16_e32 v0, v0, v2
+; GFX11-DENORM-NEXT: v_add_f16_e32 v0, v4, v0
+; GFX11-DENORM-NEXT: s_setpc_b64 s[30:31]
.entry:
%x = fmul fast half %c, %d
%y = call fast half @llvm.fmuladd.f16(half %a, half %b, half %x)
; GFX10-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
; GFX10-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-CONTRACT-LABEL: test_double_add_mul:
+; GFX11-CONTRACT: ; %bb.0: ; %.entry
+; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
+; GFX11-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX11-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-DENORM-LABEL: test_double_add_mul:
+; GFX11-DENORM: ; %bb.0: ; %.entry
+; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
+; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX11-DENORM-NEXT: s_setpc_b64 s[30:31]
.entry:
%x = fmul fast double %c, %d
%y = call fast double @llvm.fmuladd.f64(double %a, double %b, double %x)
; GFX10-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
; GFX10-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-CONTRACT-LABEL: test_double_add_mul_rhs:
+; GFX11-CONTRACT: ; %bb.0: ; %.entry
+; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
+; GFX11-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX11-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-DENORM-LABEL: test_double_add_mul_rhs:
+; GFX11-DENORM: ; %bb.0: ; %.entry
+; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
+; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX11-DENORM-NEXT: s_setpc_b64 s[30:31]
.entry:
%x = fmul fast double %c, %d
%y = call fast double @llvm.fmuladd.f64(double %a, double %b, double %x)
; GFX10-DENORM-NEXT: v_mov_b32_e32 v2, v10
; GFX10-DENORM-NEXT: v_mov_b32_e32 v3, v11
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-CONTRACT-LABEL: test_v4f32_add_mul:
+; GFX11-CONTRACT: ; %bb.0: ; %.entry
+; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CONTRACT-NEXT: v_fma_f32 v8, v8, v12, v16
+; GFX11-CONTRACT-NEXT: v_fma_f32 v9, v9, v13, v17
+; GFX11-CONTRACT-NEXT: v_fma_f32 v10, v10, v14, v18
+; GFX11-CONTRACT-NEXT: v_fma_f32 v11, v11, v15, v19
+; GFX11-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-CONTRACT-NEXT: v_dual_fmac_f32 v8, v0, v4 :: v_dual_fmac_f32 v9, v1, v5
+; GFX11-CONTRACT-NEXT: v_dual_fmac_f32 v10, v2, v6 :: v_dual_fmac_f32 v11, v3, v7
+; GFX11-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-CONTRACT-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9
+; GFX11-CONTRACT-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11
+; GFX11-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-DENORM-LABEL: test_v4f32_add_mul:
+; GFX11-DENORM: ; %bb.0: ; %.entry
+; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-DENORM-NEXT: v_fma_f32 v8, v8, v12, v16
+; GFX11-DENORM-NEXT: v_fma_f32 v9, v9, v13, v17
+; GFX11-DENORM-NEXT: v_fma_f32 v10, v10, v14, v18
+; GFX11-DENORM-NEXT: v_fma_f32 v11, v11, v15, v19
+; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DENORM-NEXT: v_dual_fmac_f32 v8, v0, v4 :: v_dual_fmac_f32 v9, v1, v5
+; GFX11-DENORM-NEXT: v_dual_fmac_f32 v10, v2, v6 :: v_dual_fmac_f32 v11, v3, v7
+; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DENORM-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9
+; GFX11-DENORM-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11
+; GFX11-DENORM-NEXT: s_setpc_b64 s[30:31]
.entry:
%x = fmul fast <4 x float> %c, %d
%y = call fast <4 x float> @llvm.fmuladd.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %x)
; GFX10-DENORM-NEXT: v_mov_b32_e32 v2, v10
; GFX10-DENORM-NEXT: v_mov_b32_e32 v3, v11
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-CONTRACT-LABEL: test_v4f32_add_mul_rhs:
+; GFX11-CONTRACT: ; %bb.0: ; %.entry
+; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CONTRACT-NEXT: v_fma_f32 v8, v8, v12, v16
+; GFX11-CONTRACT-NEXT: v_fma_f32 v9, v9, v13, v17
+; GFX11-CONTRACT-NEXT: v_fma_f32 v10, v10, v14, v18
+; GFX11-CONTRACT-NEXT: v_fma_f32 v11, v11, v15, v19
+; GFX11-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-CONTRACT-NEXT: v_dual_fmac_f32 v8, v0, v4 :: v_dual_fmac_f32 v9, v1, v5
+; GFX11-CONTRACT-NEXT: v_dual_fmac_f32 v10, v2, v6 :: v_dual_fmac_f32 v11, v3, v7
+; GFX11-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-CONTRACT-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9
+; GFX11-CONTRACT-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11
+; GFX11-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-DENORM-LABEL: test_v4f32_add_mul_rhs:
+; GFX11-DENORM: ; %bb.0: ; %.entry
+; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-DENORM-NEXT: v_fma_f32 v8, v8, v12, v16
+; GFX11-DENORM-NEXT: v_fma_f32 v9, v9, v13, v17
+; GFX11-DENORM-NEXT: v_fma_f32 v10, v10, v14, v18
+; GFX11-DENORM-NEXT: v_fma_f32 v11, v11, v15, v19
+; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DENORM-NEXT: v_dual_fmac_f32 v8, v0, v4 :: v_dual_fmac_f32 v9, v1, v5
+; GFX11-DENORM-NEXT: v_dual_fmac_f32 v10, v2, v6 :: v_dual_fmac_f32 v11, v3, v7
+; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DENORM-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9
+; GFX11-DENORM-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11
+; GFX11-DENORM-NEXT: s_setpc_b64 s[30:31]
.entry:
%x = fmul fast <4 x float> %c, %d
%y = call fast <4 x float> @llvm.fmuladd.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %x)
; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v0, v8
; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v1, v9
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-CONTRACT-LABEL: test_f16_add_mul:
+; GFX11-CONTRACT: ; %bb.0: ; %.entry
+; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CONTRACT-NEXT: v_pk_fma_f16 v4, v4, v6, v8
+; GFX11-CONTRACT-NEXT: v_pk_fma_f16 v5, v5, v7, v9
+; GFX11-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX11-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX11-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-DENORM-LABEL: test_f16_add_mul:
+; GFX11-DENORM: ; %bb.0: ; %.entry
+; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-DENORM-NEXT: v_pk_mul_f16 v4, v4, v6
+; GFX11-DENORM-NEXT: v_pk_mul_f16 v5, v5, v7
+; GFX11-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
+; GFX11-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
+; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DENORM-NEXT: v_pk_add_f16 v0, v0, v4
+; GFX11-DENORM-NEXT: v_pk_add_f16 v1, v1, v5
+; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DENORM-NEXT: v_pk_add_f16 v0, v0, v8
+; GFX11-DENORM-NEXT: v_pk_add_f16 v1, v1, v9
+; GFX11-DENORM-NEXT: s_setpc_b64 s[30:31]
.entry:
%x = fmul fast <4 x half> %c, %d
%y = call fast <4 x half> @llvm.fmuladd.v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %x)
; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v8, v0
; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v9, v1
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-CONTRACT-LABEL: test_f16_add_mul_rhs:
+; GFX11-CONTRACT: ; %bb.0: ; %.entry
+; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CONTRACT-NEXT: v_pk_fma_f16 v4, v4, v6, v8
+; GFX11-CONTRACT-NEXT: v_pk_fma_f16 v5, v5, v7, v9
+; GFX11-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX11-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX11-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-DENORM-LABEL: test_f16_add_mul_rhs:
+; GFX11-DENORM: ; %bb.0: ; %.entry
+; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-DENORM-NEXT: v_pk_mul_f16 v4, v4, v6
+; GFX11-DENORM-NEXT: v_pk_mul_f16 v5, v5, v7
+; GFX11-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
+; GFX11-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
+; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DENORM-NEXT: v_pk_add_f16 v0, v0, v4
+; GFX11-DENORM-NEXT: v_pk_add_f16 v1, v1, v5
+; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DENORM-NEXT: v_pk_add_f16 v0, v8, v0
+; GFX11-DENORM-NEXT: v_pk_add_f16 v1, v9, v1
+; GFX11-DENORM-NEXT: s_setpc_b64 s[30:31]
.entry:
%x = fmul fast <4 x half> %c, %d
%y = call fast <4 x half> @llvm.fmuladd.v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %x)
; GFX10-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
; GFX10-DENORM-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-CONTRACT-LABEL: test_f64_add_mul:
+; GFX11-CONTRACT: ; %bb.0: ; %.entry
+; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CONTRACT-NEXT: s_clause 0x8
+; GFX11-CONTRACT-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-CONTRACT-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-CONTRACT-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-CONTRACT-NEXT: scratch_load_b32 v34, off, s32 offset:12
+; GFX11-CONTRACT-NEXT: scratch_load_b32 v35, off, s32 offset:16
+; GFX11-CONTRACT-NEXT: scratch_load_b32 v36, off, s32 offset:20
+; GFX11-CONTRACT-NEXT: scratch_load_b32 v37, off, s32 offset:24
+; GFX11-CONTRACT-NEXT: scratch_load_b32 v38, off, s32 offset:28
+; GFX11-CONTRACT-NEXT: scratch_load_b32 v39, off, s32 offset:32
+; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(6)
+; GFX11-CONTRACT-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33]
+; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(4)
+; GFX11-CONTRACT-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35]
+; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(2)
+; GFX11-CONTRACT-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37]
+; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CONTRACT-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39]
+; GFX11-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
+; GFX11-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
+; GFX11-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
+; GFX11-CONTRACT-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX11-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-DENORM-LABEL: test_f64_add_mul:
+; GFX11-DENORM: ; %bb.0: ; %.entry
+; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-DENORM-NEXT: s_clause 0x8
+; GFX11-DENORM-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-DENORM-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-DENORM-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-DENORM-NEXT: scratch_load_b32 v34, off, s32 offset:12
+; GFX11-DENORM-NEXT: scratch_load_b32 v35, off, s32 offset:16
+; GFX11-DENORM-NEXT: scratch_load_b32 v36, off, s32 offset:20
+; GFX11-DENORM-NEXT: scratch_load_b32 v37, off, s32 offset:24
+; GFX11-DENORM-NEXT: scratch_load_b32 v38, off, s32 offset:28
+; GFX11-DENORM-NEXT: scratch_load_b32 v39, off, s32 offset:32
+; GFX11-DENORM-NEXT: s_waitcnt vmcnt(6)
+; GFX11-DENORM-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33]
+; GFX11-DENORM-NEXT: s_waitcnt vmcnt(4)
+; GFX11-DENORM-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35]
+; GFX11-DENORM-NEXT: s_waitcnt vmcnt(2)
+; GFX11-DENORM-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37]
+; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39]
+; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
+; GFX11-DENORM-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
+; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
+; GFX11-DENORM-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX11-DENORM-NEXT: s_setpc_b64 s[30:31]
.entry:
%x = fmul fast <4 x double> %c, %d
%y = call fast <4 x double> @llvm.fmuladd.v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %x)
; GFX10-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
; GFX10-DENORM-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-CONTRACT-LABEL: test_f64_add_mul_rhs:
+; GFX11-CONTRACT: ; %bb.0: ; %.entry
+; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CONTRACT-NEXT: s_clause 0x8
+; GFX11-CONTRACT-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-CONTRACT-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-CONTRACT-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-CONTRACT-NEXT: scratch_load_b32 v34, off, s32 offset:12
+; GFX11-CONTRACT-NEXT: scratch_load_b32 v35, off, s32 offset:16
+; GFX11-CONTRACT-NEXT: scratch_load_b32 v36, off, s32 offset:20
+; GFX11-CONTRACT-NEXT: scratch_load_b32 v37, off, s32 offset:24
+; GFX11-CONTRACT-NEXT: scratch_load_b32 v38, off, s32 offset:28
+; GFX11-CONTRACT-NEXT: scratch_load_b32 v39, off, s32 offset:32
+; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(6)
+; GFX11-CONTRACT-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33]
+; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(4)
+; GFX11-CONTRACT-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35]
+; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(2)
+; GFX11-CONTRACT-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37]
+; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CONTRACT-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39]
+; GFX11-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
+; GFX11-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
+; GFX11-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
+; GFX11-CONTRACT-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX11-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-DENORM-LABEL: test_f64_add_mul_rhs:
+; GFX11-DENORM: ; %bb.0: ; %.entry
+; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-DENORM-NEXT: s_clause 0x8
+; GFX11-DENORM-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-DENORM-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-DENORM-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-DENORM-NEXT: scratch_load_b32 v34, off, s32 offset:12
+; GFX11-DENORM-NEXT: scratch_load_b32 v35, off, s32 offset:16
+; GFX11-DENORM-NEXT: scratch_load_b32 v36, off, s32 offset:20
+; GFX11-DENORM-NEXT: scratch_load_b32 v37, off, s32 offset:24
+; GFX11-DENORM-NEXT: scratch_load_b32 v38, off, s32 offset:28
+; GFX11-DENORM-NEXT: scratch_load_b32 v39, off, s32 offset:32
+; GFX11-DENORM-NEXT: s_waitcnt vmcnt(6)
+; GFX11-DENORM-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33]
+; GFX11-DENORM-NEXT: s_waitcnt vmcnt(4)
+; GFX11-DENORM-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35]
+; GFX11-DENORM-NEXT: s_waitcnt vmcnt(2)
+; GFX11-DENORM-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37]
+; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39]
+; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
+; GFX11-DENORM-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
+; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
+; GFX11-DENORM-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX11-DENORM-NEXT: s_setpc_b64 s[30:31]
.entry:
%x = fmul fast <4 x double> %c, %d
%y = call fast <4 x double> @llvm.fmuladd.v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %x)
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefix=SI-FMA -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-FMA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-NOFMA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=ieee -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FMA %s
; FIXME: Remove -enable-unsafe-fp-math from the RUN lines and add the needed fast-math flags to the IR instructions instead.
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #0
; (fadd (fmul x, y), z) -> (fma x, y, z)
-; FUNC-LABEL: {{^}}combine_to_fma_f64_0:
-; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
-; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
-; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
-; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
-; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @combine_to_fma_f64_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
+; SI-LABEL: combine_to_fma_f64_0:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7]
+; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_endpgm
+;
+; GFX11-LABEL: combine_to_fma_f64_0:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX11-NEXT: global_store_b64 v6, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
; Per-work-item addressing: %tid is the work-item id in x, %gep.0 is the
; address of double number %tid in %in, and %gep.1 is the double directly
; after it.
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
}
; (fadd (fmul x, y), z) -> (fma x, y, z), with the fmul result feeding two fadds
-; FUNC-LABEL: {{^}}combine_to_fma_f64_0_2use:
-; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
-; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
-; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
-; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}}
-; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
-; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[D]]
-; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; SI: s_endpgm
define amdgpu_kernel void @combine_to_fma_f64_0_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
+; SI-LABEL: combine_to_fma_f64_0_2use:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT: v_fma_f64 v[6:7], v[2:3], v[4:5], v[6:7]
+; SI-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; SI-NEXT: buffer_store_dwordx2 v[6:7], v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 offset:8
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_endpgm
+;
+; GFX11-LABEL: combine_to_fma_f64_0_2use:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_fma_f64 v[4:5], v[0:1], v[2:3], v[4:5]
+; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[6:7]
+; GFX11-NEXT: global_store_b64 v8, v[4:5], s[0:1] dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
; Per-work-item addressing: %tid is the work-item id in x, %gep.0 is the
; address of double number %tid in %in, and %gep.1 is the double directly
; after it.
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
}
; (fadd x, (fmul y, z)) -> (fma y, z, x)
-; FUNC-LABEL: {{^}}combine_to_fma_f64_1:
-; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
-; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
-; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
-; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
-; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @combine_to_fma_f64_1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
+; SI-LABEL: combine_to_fma_f64_1:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7]
+; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_endpgm
+;
+; GFX11-LABEL: combine_to_fma_f64_1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX11-NEXT: global_store_b64 v6, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
; Per-work-item addressing: %tid is the work-item id in x, %gep.0 is the
; address of double number %tid in %in, and %gep.1 is the double directly
; after it.
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
}
; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
-; FUNC-LABEL: {{^}}combine_to_fma_fsub_0_f64:
-; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
-; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
-; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
-; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
-; SI: buffer_store_dwordx2 [[RESULT]]
; Expected code on both targets: three glc loads at byte offsets 0, 8 and 16,
; a single v_fma_f64 computing fma(load0, load1, -load2) with the negation
; folded into the third source operand, and one store of the result.
define amdgpu_kernel void @combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
+; SI-LABEL: combine_to_fma_fsub_0_f64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], -v[6:7]
+; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_endpgm
+;
+; GFX11-LABEL: combine_to_fma_fsub_0_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5]
+; GFX11-NEXT: global_store_b64 v6, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
; %gep.0 addresses the double in %in indexed by the work-item id; %gep.1 is
; the double immediately after it.
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
}
; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
-; FUNC-LABEL: {{^}}combine_to_fma_fsub_f64_0_2use:
-; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
-; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
-; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
-; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}}
-; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
-; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
-; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; SI: s_endpgm
; Expected code on both targets: four glc loads at byte offsets 0, 8, 16 and
; 24, then two v_fma_f64 instructions that share the same multiplicands:
;   fma(load0, load1, -load2)  and  fma(load0, load1, -load3).
; Each result is stored (offsets 0 and 8) and each store is followed by a
; wait (s_waitcnt vmcnt(0) on the older target, s_waitcnt_vscnt on gfx11).
define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
+; SI-LABEL: combine_to_fma_fsub_f64_0_2use:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT: v_fma_f64 v[6:7], v[2:3], v[4:5], -v[6:7]
+; SI-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], -v[8:9]
+; SI-NEXT: buffer_store_dwordx2 v[6:7], v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 offset:8
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_endpgm
+;
+; GFX11-LABEL: combine_to_fma_fsub_f64_0_2use:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_fma_f64 v[4:5], v[0:1], v[2:3], -v[4:5]
+; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[6:7]
+; GFX11-NEXT: global_store_b64 v8, v[4:5], s[0:1] dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
; %gep.0 addresses the double in %in indexed by the work-item id; %gep.1 is
; the double immediately after it.
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
}
; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
-; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64:
-; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
-; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
-; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
-; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
-; SI: buffer_store_dwordx2 [[RESULT]]
; Expected code on both targets: three glc loads at byte offsets 0, 8 and 16,
; a single v_fma_f64 computing fma(-load0, load1, load2) with the negation
; folded into the first source operand, and one store of the result.
define amdgpu_kernel void @combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
+; SI-LABEL: combine_to_fma_fsub_1_f64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[4:5], v[6:7]
+; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_endpgm
+;
+; GFX11-LABEL: combine_to_fma_fsub_1_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], v[4:5]
+; GFX11-NEXT: global_store_b64 v6, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
; %gep.0 addresses the double in %in indexed by the work-item id; %gep.1 is
; the double immediately after it.
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
}
; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
-; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64_2use:
-; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
-; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
-; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
-; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}}
-; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
-; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[D]]
-; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; SI: s_endpgm
; Expected code on both targets: four glc loads at byte offsets 0, 8, 16 and
; 24, then two v_fma_f64 instructions that share the same multiplicands:
;   fma(-load0, load1, load2)  and  fma(-load0, load1, load3).
; Each result is stored (offsets 0 and 8) and each store is followed by a
; wait (s_waitcnt vmcnt(0) on the older target, s_waitcnt_vscnt on gfx11).
define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
+; SI-LABEL: combine_to_fma_fsub_1_f64_2use:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], v[6:7]
+; SI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[4:5], v[8:9]
+; SI-NEXT: buffer_store_dwordx2 v[6:7], v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 offset:8
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_endpgm
+;
+; GFX11-LABEL: combine_to_fma_fsub_1_f64_2use:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], v[4:5]
+; GFX11-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], v[6:7]
+; GFX11-NEXT: global_store_b64 v8, v[4:5], s[0:1] dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
; %gep.0 addresses the double in %in indexed by the work-item id; %gep.1 is
; the double immediately after it.
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
}
; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
-; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64:
-; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
-; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
-; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
-; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
-; SI: buffer_store_dwordx2 [[RESULT]]
; Expected code on both targets: three glc loads at byte offsets 0, 8 and 16,
; a single v_fma_f64 computing fma(-load0, load1, -load2) with both negations
; folded into source operands, and one store of the result.
define amdgpu_kernel void @combine_to_fma_fsub_2_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
+; SI-LABEL: combine_to_fma_fsub_2_f64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[4:5], -v[6:7]
+; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_endpgm
+;
+; GFX11-LABEL: combine_to_fma_fsub_2_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], -v[4:5]
+; GFX11-NEXT: global_store_b64 v6, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
; %gep.0 addresses the double in %in indexed by the work-item id; %gep.1 is
; the double immediately after it.
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
}
; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
-; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_neg:
-; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
-; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
-; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
-; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}}
-; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
-; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[D]]
-; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; SI: s_endpgm
; Expected code on both targets: four glc loads at byte offsets 0, 8, 16 and
; 24, then two v_fma_f64 instructions that both negate the first multiplicand:
;   fma(-load0, load1, -load2)  and  fma(-load0, load1, -load3).
; Each result is stored (offsets 0 and 8) and each store is followed by a
; wait (s_waitcnt vmcnt(0) on the older target, s_waitcnt_vscnt on gfx11).
define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
+; SI-LABEL: combine_to_fma_fsub_2_f64_2uses_neg:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], -v[6:7]
+; SI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[4:5], -v[8:9]
+; SI-NEXT: buffer_store_dwordx2 v[6:7], v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 offset:8
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_endpgm
+;
+; GFX11-LABEL: combine_to_fma_fsub_2_f64_2uses_neg:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], -v[4:5]
+; GFX11-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], -v[6:7]
+; GFX11-NEXT: global_store_b64 v8, v[4:5], s[0:1] dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
; %gep.0 addresses the double in %in indexed by the work-item id; %gep.1 is
; the double immediately after it.
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
}
; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
-; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_mul:
-; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
-; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
-; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
-; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}}
-; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
-; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
-; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; SI: s_endpgm
; Expected code on both targets: four glc loads at byte offsets 0, 8, 16 and
; 24, then two v_fma_f64 instructions on the same multiplicands, of which only
; the first negates its first source operand:
;   fma(-load0, load1, -load2)  and  fma(load0, load1, -load3).
; Each result is stored (offsets 0 and 8) and each store is followed by a
; wait (s_waitcnt vmcnt(0) on the older target, s_waitcnt_vscnt on gfx11).
define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
+; SI-LABEL: combine_to_fma_fsub_2_f64_2uses_mul:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], -v[6:7]
+; SI-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], -v[8:9]
+; SI-NEXT: buffer_store_dwordx2 v[6:7], v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 offset:8
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_endpgm
+;
+; GFX11-LABEL: combine_to_fma_fsub_2_f64_2uses_mul:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], -v[4:5]
+; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[6:7]
+; GFX11-NEXT: global_store_b64 v8, v[4:5], s[0:1] dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
; %gep.0 addresses the double in %in indexed by the work-item id; %gep.1 is
; the double immediately after it.
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
}
; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, (fneg z)))
-
-; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_0_f64:
-; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
-; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
-; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
-; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}}
-; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32 glc{{$}}
-
-; SI-SAFE: v_mul_f64 [[TMP0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]]
-; SI-SAFE: v_fma_f64 [[TMP1:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[TMP0]]
-; SI-SAFE: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[TMP1]], -[[Z]]
-
-; SI-UNSAFE: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]], -[[Z]]
-; SI-UNSAFE: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[FMA0]]
-
-; SI: buffer_store_dwordx2 [[RESULT]]
; Expected code comes in two shapes, selected by check prefix. Both shapes
; start with five glc loads at byte offsets 0, 8, 16, 24 and 32 and end with
; a single store of the result.
;   *-NOFMA prefixes: v_mul_f64(load3, load4), then
;     v_fma_f64(load0, load1, product), then v_add_f64(fma, -load2).
;   *-FMA prefixes: v_fma_f64(load3, load4, -load2), then
;     v_fma_f64(load0, load1, inner fma); no v_mul_f64 or v_add_f64.
; NOTE(review): which llc options select each prefix is decided by the RUN
; lines at the top of the file, which are not visible here - confirm there.
define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
+; SI-NOFMA-LABEL: aggressive_combine_to_fma_fsub_0_f64:
+; SI-NOFMA: ; %bb.0:
+; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000
+; SI-NOFMA-NEXT: s_mov_b32 s6, 0
+; SI-NOFMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-NOFMA-NEXT: v_mov_b32_e32 v1, 0
+; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NOFMA-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NOFMA-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-NOFMA-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-NOFMA-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-NOFMA-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-NOFMA-NEXT: buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-NOFMA-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-NOFMA-NEXT: v_mul_f64 v[8:9], v[8:9], v[10:11]
+; SI-NOFMA-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; SI-NOFMA-NEXT: v_add_f64 v[2:3], v[2:3], -v[6:7]
+; SI-NOFMA-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; SI-NOFMA-NEXT: s_endpgm
+;
+; SI-FMA-LABEL: aggressive_combine_to_fma_fsub_0_f64:
+; SI-FMA: ; %bb.0:
+; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-FMA-NEXT: s_mov_b32 s7, 0xf000
+; SI-FMA-NEXT: s_mov_b32 s6, 0
+; SI-FMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-FMA-NEXT: v_mov_b32_e32 v1, 0
+; SI-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-FMA-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-FMA-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; SI-FMA-NEXT: s_waitcnt vmcnt(0)
+; SI-FMA-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-FMA-NEXT: s_waitcnt vmcnt(0)
+; SI-FMA-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-FMA-NEXT: s_waitcnt vmcnt(0)
+; SI-FMA-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
+; SI-FMA-NEXT: s_waitcnt vmcnt(0)
+; SI-FMA-NEXT: buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-FMA-NEXT: s_waitcnt vmcnt(0)
+; SI-FMA-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-FMA-NEXT: v_fma_f64 v[6:7], v[8:9], v[10:11], -v[6:7]
+; SI-FMA-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7]
+; SI-FMA-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; SI-FMA-NEXT: s_endpgm
+;
+; GFX11-NOFMA-LABEL: aggressive_combine_to_fma_fsub_0_f64:
+; GFX11-NOFMA: ; %bb.0:
+; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NOFMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0
+; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: v_mul_f64 v[6:7], v[6:7], v[8:9]
+; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NOFMA-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[6:7]
+; GFX11-NOFMA-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5]
+; GFX11-NOFMA-NEXT: global_store_b64 v10, v[0:1], s[0:1]
+; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NOFMA-NEXT: s_endpgm
+;
+; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_0_f64:
+; GFX11-FMA: ; %bb.0:
+; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0
+; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT: v_fma_f64 v[4:5], v[6:7], v[8:9], -v[4:5]
+; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FMA-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX11-FMA-NEXT: global_store_b64 v10, v[0:1], s[0:1]
+; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FMA-NEXT: s_endpgm
; %gep.0 addresses the double in %in indexed by the work-item id; %gep.1 is
; the double immediately after it.
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
; fold (fsub x, (fma y, z, (fmul u, v)))
; -> (fma (fneg y), z, (fma (fneg u), v, x))
-
-; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_1_f64:
-; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
-; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
-; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
-; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}}
-; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32 glc{{$}}
-
-; SI-SAFE: v_mul_f64 [[TMP0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]]
-; SI-SAFE: v_fma_f64 [[TMP1:v\[[0-9]+:[0-9]+\]]], [[Y]], [[Z]], [[TMP0]]
-; SI-SAFE: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], -[[TMP1]]
-
-; SI-UNSAFE: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[U]], [[V]], [[X]]
-; SI-UNSAFE: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[Y]], [[Z]], [[FMA0]]
-
-; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
+; SI-NOFMA-LABEL: aggressive_combine_to_fma_fsub_1_f64:
+; SI-NOFMA: ; %bb.0:
+; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000
+; SI-NOFMA-NEXT: s_mov_b32 s6, 0
+; SI-NOFMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-NOFMA-NEXT: v_mov_b32_e32 v1, 0
+; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NOFMA-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NOFMA-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-NOFMA-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-NOFMA-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-NOFMA-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-NOFMA-NEXT: buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-NOFMA-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-NOFMA-NEXT: v_mul_f64 v[8:9], v[8:9], v[10:11]
+; SI-NOFMA-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
+; SI-NOFMA-NEXT: v_add_f64 v[2:3], v[2:3], -v[4:5]
+; SI-NOFMA-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; SI-NOFMA-NEXT: s_endpgm
+;
+; SI-FMA-LABEL: aggressive_combine_to_fma_fsub_1_f64:
+; SI-FMA: ; %bb.0:
+; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-FMA-NEXT: s_mov_b32 s7, 0xf000
+; SI-FMA-NEXT: s_mov_b32 s6, 0
+; SI-FMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-FMA-NEXT: v_mov_b32_e32 v1, 0
+; SI-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-FMA-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-FMA-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; SI-FMA-NEXT: s_waitcnt vmcnt(0)
+; SI-FMA-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-FMA-NEXT: s_waitcnt vmcnt(0)
+; SI-FMA-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-FMA-NEXT: s_waitcnt vmcnt(0)
+; SI-FMA-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
+; SI-FMA-NEXT: s_waitcnt vmcnt(0)
+; SI-FMA-NEXT: buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-FMA-NEXT: s_waitcnt vmcnt(0)
+; SI-FMA-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-FMA-NEXT: v_fma_f64 v[2:3], -v[8:9], v[10:11], v[2:3]
+; SI-FMA-NEXT: v_fma_f64 v[2:3], -v[4:5], v[6:7], v[2:3]
+; SI-FMA-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; SI-FMA-NEXT: s_endpgm
+;
+; GFX11-NOFMA-LABEL: aggressive_combine_to_fma_fsub_1_f64:
+; GFX11-NOFMA: ; %bb.0:
+; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NOFMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0
+; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: v_mul_f64 v[6:7], v[6:7], v[8:9]
+; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NOFMA-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7]
+; GFX11-NOFMA-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
+; GFX11-NOFMA-NEXT: global_store_b64 v10, v[0:1], s[0:1]
+; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NOFMA-NEXT: s_endpgm
+;
+; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_1_f64:
+; GFX11-FMA: ; %bb.0:
+; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0
+; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT: v_fma_f64 v[0:1], -v[6:7], v[8:9], v[0:1]
+; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FMA-NEXT: v_fma_f64 v[0:1], -v[2:3], v[4:5], v[0:1]
+; GFX11-FMA-NEXT: global_store_b64 v10, v[0:1], s[0:1]
+; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FMA-NEXT: s_endpgm
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
; Patterns (+ fneg variants): mul(add(1.0,x),y), mul(sub(1.0,x),y), mul(sub(x,1.0),y)
;
-; FUNC-LABEL: {{^}}test_f32_mul_add_x_one_y:
-; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
-;
-; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
; Covers mul(add(x, 1.0), y), going by the function name; most of the IR body
; is outside this view.
; The autogenerated checks below expect, for the NOFMA prefixes, a v_add_f32
; with 1.0 followed by a separate v_mul_f32, and for the FMA prefixes a single
; fused op that uses the second loaded value as both multiplicand and addend
; (v_fma_f32 v0, v0, v1, v1 on SI; v_fmac_f32_e32 v2, v1, v2 on GFX11).
; NOTE(review): the visible load of %x is volatile, which presumably explains
; the glc loads with a wait after each one -- confirm against the full body.
define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out,
+; SI-NOFMA-LABEL: test_f32_mul_add_x_one_y:
+; SI-NOFMA: ; %bb.0:
+; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-NOFMA-NEXT: s_mov_b32 s2, -1
+; SI-NOFMA-NEXT: s_mov_b32 s14, s2
+; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NOFMA-NEXT: s_mov_b32 s12, s6
+; SI-NOFMA-NEXT: s_mov_b32 s13, s7
+; SI-NOFMA-NEXT: s_mov_b32 s15, s3
+; SI-NOFMA-NEXT: s_mov_b32 s10, s2
+; SI-NOFMA-NEXT: s_mov_b32 s11, s3
+; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-NOFMA-NEXT: s_mov_b32 s0, s4
+; SI-NOFMA-NEXT: s_mov_b32 s1, s5
+; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0
+; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NOFMA-NEXT: s_endpgm
+;
+; SI-FMA-LABEL: test_f32_mul_add_x_one_y:
+; SI-FMA: ; %bb.0:
+; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-FMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-FMA-NEXT: s_mov_b32 s2, -1
+; SI-FMA-NEXT: s_mov_b32 s14, s2
+; SI-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-FMA-NEXT: s_mov_b32 s12, s6
+; SI-FMA-NEXT: s_mov_b32 s13, s7
+; SI-FMA-NEXT: s_mov_b32 s15, s3
+; SI-FMA-NEXT: s_mov_b32 s10, s2
+; SI-FMA-NEXT: s_mov_b32 s11, s3
+; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
+; SI-FMA-NEXT: s_waitcnt vmcnt(0)
+; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc
+; SI-FMA-NEXT: s_waitcnt vmcnt(0)
+; SI-FMA-NEXT: s_mov_b32 s0, s4
+; SI-FMA-NEXT: s_mov_b32 s1, s5
+; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, v1
+; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-FMA-NEXT: s_endpgm
+;
+; GFX11-NOFMA-LABEL: test_f32_mul_add_x_one_y:
+; GFX11-NOFMA: ; %bb.0:
+; GFX11-NOFMA-NEXT: s_clause 0x1
+; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NOFMA-NEXT: s_endpgm
+;
+; GFX11-FMA-LABEL: test_f32_mul_add_x_one_y:
+; GFX11-FMA: ; %bb.0:
+; GFX11-FMA-NEXT: s_clause 0x1
+; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2
+; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[4:5]
+; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FMA-NEXT: s_endpgm
ptr addrspace(1) %in1,
ptr addrspace(1) %in2) {
%x = load volatile float, ptr addrspace(1) %in1
ret void
}
-; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_one:
-; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
-;
-; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
; Covers mul(y, add(x, 1.0)) -- the multiply operands are swapped relative to
; the mul_add_x_one_y form, going by the function name; most of the IR body is
; outside this view.
; The autogenerated checks below expect, for the NOFMA prefixes, a v_add_f32
; with 1.0 followed by a v_mul_f32 whose first source is the other loaded
; value (v_mul_f32_e32 v0, v1, v0 on SI), and for the FMA prefixes a single
; fused op that uses the second loaded value as both multiplicand and addend
; (v_fma_f32 v0, v0, v1, v1 on SI; v_fmac_f32_e32 v2, v1, v2 on GFX11).
; NOTE(review): the visible load of %x is volatile, which presumably explains
; the glc loads with a wait after each one -- confirm against the full body.
define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out,
+; SI-NOFMA-LABEL: test_f32_mul_y_add_x_one:
+; SI-NOFMA: ; %bb.0:
+; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-NOFMA-NEXT: s_mov_b32 s2, -1
+; SI-NOFMA-NEXT: s_mov_b32 s14, s2
+; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NOFMA-NEXT: s_mov_b32 s12, s6
+; SI-NOFMA-NEXT: s_mov_b32 s13, s7
+; SI-NOFMA-NEXT: s_mov_b32 s15, s3
+; SI-NOFMA-NEXT: s_mov_b32 s10, s2
+; SI-NOFMA-NEXT: s_mov_b32 s11, s3
+; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-NOFMA-NEXT: s_mov_b32 s0, s4
+; SI-NOFMA-NEXT: s_mov_b32 s1, s5
+; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0
+; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0
+; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NOFMA-NEXT: s_endpgm
+;
+; SI-FMA-LABEL: test_f32_mul_y_add_x_one:
+; SI-FMA: ; %bb.0:
+; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-FMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-FMA-NEXT: s_mov_b32 s2, -1
+; SI-FMA-NEXT: s_mov_b32 s14, s2
+; SI-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-FMA-NEXT: s_mov_b32 s12, s6
+; SI-FMA-NEXT: s_mov_b32 s13, s7
+; SI-FMA-NEXT: s_mov_b32 s15, s3
+; SI-FMA-NEXT: s_mov_b32 s10, s2
+; SI-FMA-NEXT: s_mov_b32 s11, s3
+; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
+; SI-FMA-NEXT: s_waitcnt vmcnt(0)
+; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc
+; SI-FMA-NEXT: s_waitcnt vmcnt(0)
+; SI-FMA-NEXT: s_mov_b32 s0, s4
+; SI-FMA-NEXT: s_mov_b32 s1, s5
+; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, v1
+; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-FMA-NEXT: s_endpgm
+;
+; GFX11-NOFMA-LABEL: test_f32_mul_y_add_x_one:
+; GFX11-NOFMA: ; %bb.0:
+; GFX11-NOFMA-NEXT: s_clause 0x1
+; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NOFMA-NEXT: s_endpgm
+;
+; GFX11-FMA-LABEL: test_f32_mul_y_add_x_one:
+; GFX11-FMA: ; %bb.0:
+; GFX11-FMA-NEXT: s_clause 0x1
+; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2
+; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[4:5]
+; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FMA-NEXT: s_endpgm
ptr addrspace(1) %in1,
ptr addrspace(1) %in2) {
%x = load volatile float, ptr addrspace(1) %in1
ret void
}
-; FUNC-LABEL: {{^}}test_f32_mul_add_x_negone_y:
-; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
-;
-; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
; Covers mul(add(x, -1.0), y), going by the function name; most of the IR body
; is outside this view.
; The autogenerated checks below expect, for the NOFMA prefixes, a v_add_f32
; with -1.0 followed by a separate v_mul_f32, and for the FMA prefixes a single
; v_fma_f32 whose addend is the negated second multiplicand
; (v_fma_f32 v0, v0, v1, -v1 on SI; v_fma_f32 v1, v1, v2, -v2 on GFX11).
; The visible load of %x is a plain (non-volatile) load; the checks show both
; loads issued back to back before any vmcnt wait.
define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out,
+; SI-NOFMA-LABEL: test_f32_mul_add_x_negone_y:
+; SI-NOFMA: ; %bb.0:
+; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-NOFMA-NEXT: s_mov_b32 s2, -1
+; SI-NOFMA-NEXT: s_mov_b32 s14, s2
+; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NOFMA-NEXT: s_mov_b32 s12, s6
+; SI-NOFMA-NEXT: s_mov_b32 s13, s7
+; SI-NOFMA-NEXT: s_mov_b32 s15, s3
+; SI-NOFMA-NEXT: s_mov_b32 s10, s2
+; SI-NOFMA-NEXT: s_mov_b32 s11, s3
+; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
+; SI-NOFMA-NEXT: s_mov_b32 s0, s4
+; SI-NOFMA-NEXT: s_mov_b32 s1, s5
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(1)
+; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NOFMA-NEXT: s_endpgm
+;
+; SI-FMA-LABEL: test_f32_mul_add_x_negone_y:
+; SI-FMA: ; %bb.0:
+; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-FMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-FMA-NEXT: s_mov_b32 s2, -1
+; SI-FMA-NEXT: s_mov_b32 s14, s2
+; SI-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-FMA-NEXT: s_mov_b32 s12, s6
+; SI-FMA-NEXT: s_mov_b32 s13, s7
+; SI-FMA-NEXT: s_mov_b32 s15, s3
+; SI-FMA-NEXT: s_mov_b32 s10, s2
+; SI-FMA-NEXT: s_mov_b32 s11, s3
+; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
+; SI-FMA-NEXT: s_mov_b32 s0, s4
+; SI-FMA-NEXT: s_mov_b32 s1, s5
+; SI-FMA-NEXT: s_waitcnt vmcnt(0)
+; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1
+; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-FMA-NEXT: s_endpgm
+;
+; GFX11-NOFMA-LABEL: test_f32_mul_add_x_negone_y:
+; GFX11-NOFMA: ; %bb.0:
+; GFX11-NOFMA-NEXT: s_clause 0x1
+; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NOFMA-NEXT: s_clause 0x1
+; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, -1.0, v1
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NOFMA-NEXT: s_endpgm
+;
+; GFX11-FMA-LABEL: test_f32_mul_add_x_negone_y:
+; GFX11-FMA: ; %bb.0:
+; GFX11-FMA-NEXT: s_clause 0x1
+; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT: s_clause 0x1
+; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2
+; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FMA-NEXT: s_endpgm
ptr addrspace(1) %in1,
ptr addrspace(1) %in2) {
%x = load float, ptr addrspace(1) %in1
ret void
}
-; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_negone:
-; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
-;
-; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
; Covers mul(y, add(x, -1.0)) -- the multiply operands are swapped relative to
; the mul_add_x_negone_y form, going by the function name; most of the IR body
; is outside this view.
; The autogenerated checks below expect, for the NOFMA prefixes, a v_add_f32
; with -1.0 followed by a v_mul_f32 whose first source is the other loaded
; value (v_mul_f32_e32 v0, v1, v0 on SI), and for the FMA prefixes a single
; v_fma_f32 whose addend is the negated second multiplicand
; (v_fma_f32 v0, v0, v1, -v1 on SI; v_fma_f32 v1, v1, v2, -v2 on GFX11).
; The visible load of %x is a plain (non-volatile) load; the checks show both
; loads issued back to back before any vmcnt wait.
define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out,
+; SI-NOFMA-LABEL: test_f32_mul_y_add_x_negone:
+; SI-NOFMA: ; %bb.0:
+; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-NOFMA-NEXT: s_mov_b32 s2, -1
+; SI-NOFMA-NEXT: s_mov_b32 s14, s2
+; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NOFMA-NEXT: s_mov_b32 s12, s6
+; SI-NOFMA-NEXT: s_mov_b32 s13, s7
+; SI-NOFMA-NEXT: s_mov_b32 s15, s3
+; SI-NOFMA-NEXT: s_mov_b32 s10, s2
+; SI-NOFMA-NEXT: s_mov_b32 s11, s3
+; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
+; SI-NOFMA-NEXT: s_mov_b32 s0, s4
+; SI-NOFMA-NEXT: s_mov_b32 s1, s5
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(1)
+; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0
+; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NOFMA-NEXT: s_endpgm
+;
+; SI-FMA-LABEL: test_f32_mul_y_add_x_negone:
+; SI-FMA: ; %bb.0:
+; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-FMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-FMA-NEXT: s_mov_b32 s2, -1
+; SI-FMA-NEXT: s_mov_b32 s14, s2
+; SI-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-FMA-NEXT: s_mov_b32 s12, s6
+; SI-FMA-NEXT: s_mov_b32 s13, s7
+; SI-FMA-NEXT: s_mov_b32 s15, s3
+; SI-FMA-NEXT: s_mov_b32 s10, s2
+; SI-FMA-NEXT: s_mov_b32 s11, s3
+; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
+; SI-FMA-NEXT: s_mov_b32 s0, s4
+; SI-FMA-NEXT: s_mov_b32 s1, s5
+; SI-FMA-NEXT: s_waitcnt vmcnt(0)
+; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1
+; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-FMA-NEXT: s_endpgm
+;
+; GFX11-NOFMA-LABEL: test_f32_mul_y_add_x_negone:
+; GFX11-NOFMA: ; %bb.0:
+; GFX11-NOFMA-NEXT: s_clause 0x1
+; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NOFMA-NEXT: s_clause 0x1
+; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, -1.0, v1
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NOFMA-NEXT: s_endpgm
+;
+; GFX11-FMA-LABEL: test_f32_mul_y_add_x_negone:
+; GFX11-FMA: ; %bb.0:
+; GFX11-FMA-NEXT: s_clause 0x1
+; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT: s_clause 0x1
+; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2
+; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FMA-NEXT: s_endpgm
ptr addrspace(1) %in1,
ptr addrspace(1) %in2) {
%x = load float, ptr addrspace(1) %in1
ret void
}
-; FUNC-LABEL: {{^}}test_f32_mul_sub_one_x_y:
-; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
-;
-; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
; Covers mul(sub(1.0, x), y), going by the function name; most of the IR body
; is outside this view.
; The autogenerated checks below expect, for the NOFMA prefixes, a v_sub_f32
; from 1.0 followed by a separate v_mul_f32, and for the FMA prefixes a single
; v_fma_f32 with the first multiplicand negated and the second multiplicand
; reused as the addend
; (v_fma_f32 v0, -v0, v1, v1 on SI; v_fma_f32 v1, -v1, v2, v2 on GFX11).
; The visible load of %x is a plain (non-volatile) load; the checks show both
; loads issued back to back before any vmcnt wait.
define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out,
+; SI-NOFMA-LABEL: test_f32_mul_sub_one_x_y:
+; SI-NOFMA: ; %bb.0:
+; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-NOFMA-NEXT: s_mov_b32 s2, -1
+; SI-NOFMA-NEXT: s_mov_b32 s14, s2
+; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NOFMA-NEXT: s_mov_b32 s12, s6
+; SI-NOFMA-NEXT: s_mov_b32 s13, s7
+; SI-NOFMA-NEXT: s_mov_b32 s15, s3
+; SI-NOFMA-NEXT: s_mov_b32 s10, s2
+; SI-NOFMA-NEXT: s_mov_b32 s11, s3
+; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
+; SI-NOFMA-NEXT: s_mov_b32 s0, s4
+; SI-NOFMA-NEXT: s_mov_b32 s1, s5
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(1)
+; SI-NOFMA-NEXT: v_sub_f32_e32 v0, 1.0, v0
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NOFMA-NEXT: s_endpgm
+;
+; SI-FMA-LABEL: test_f32_mul_sub_one_x_y:
+; SI-FMA: ; %bb.0:
+; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-FMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-FMA-NEXT: s_mov_b32 s2, -1
+; SI-FMA-NEXT: s_mov_b32 s14, s2
+; SI-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-FMA-NEXT: s_mov_b32 s12, s6
+; SI-FMA-NEXT: s_mov_b32 s13, s7
+; SI-FMA-NEXT: s_mov_b32 s15, s3
+; SI-FMA-NEXT: s_mov_b32 s10, s2
+; SI-FMA-NEXT: s_mov_b32 s11, s3
+; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
+; SI-FMA-NEXT: s_mov_b32 s0, s4
+; SI-FMA-NEXT: s_mov_b32 s1, s5
+; SI-FMA-NEXT: s_waitcnt vmcnt(0)
+; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, v1
+; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-FMA-NEXT: s_endpgm
+;
+; GFX11-NOFMA-LABEL: test_f32_mul_sub_one_x_y:
+; GFX11-NOFMA: ; %bb.0:
+; GFX11-NOFMA-NEXT: s_clause 0x1
+; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NOFMA-NEXT: s_clause 0x1
+; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NOFMA-NEXT: v_sub_f32_e32 v1, 1.0, v1
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NOFMA-NEXT: s_endpgm
+;
+; GFX11-FMA-LABEL: test_f32_mul_sub_one_x_y:
+; GFX11-FMA: ; %bb.0:
+; GFX11-FMA-NEXT: s_clause 0x1
+; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT: s_clause 0x1
+; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, v2
+; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FMA-NEXT: s_endpgm
ptr addrspace(1) %in1,
ptr addrspace(1) %in2) {
%x = load float, ptr addrspace(1) %in1
ret void
}
-; FUNC-LABEL: {{^}}test_f32_mul_y_sub_one_x:
-; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
-;
-; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
; Covers mul(y, sub(1.0, x)) -- the multiply operands are swapped relative to
; the mul_sub_one_x_y form, going by the function name; most of the IR body is
; outside this view.
; The autogenerated checks below expect, for the NOFMA prefixes, a v_sub_f32
; from 1.0 followed by a v_mul_f32 whose first source is the other loaded
; value (v_mul_f32_e32 v0, v1, v0 on SI), and for the FMA prefixes a single
; v_fma_f32 with the first multiplicand negated and the second multiplicand
; reused as the addend
; (v_fma_f32 v0, -v0, v1, v1 on SI; v_fma_f32 v1, -v1, v2, v2 on GFX11).
; The visible load of %x is a plain (non-volatile) load; the checks show both
; loads issued back to back before any vmcnt wait.
define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out,
+; SI-NOFMA-LABEL: test_f32_mul_y_sub_one_x:
+; SI-NOFMA: ; %bb.0:
+; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-NOFMA-NEXT: s_mov_b32 s2, -1
+; SI-NOFMA-NEXT: s_mov_b32 s14, s2
+; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NOFMA-NEXT: s_mov_b32 s12, s6
+; SI-NOFMA-NEXT: s_mov_b32 s13, s7
+; SI-NOFMA-NEXT: s_mov_b32 s15, s3
+; SI-NOFMA-NEXT: s_mov_b32 s10, s2
+; SI-NOFMA-NEXT: s_mov_b32 s11, s3
+; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
+; SI-NOFMA-NEXT: s_mov_b32 s0, s4
+; SI-NOFMA-NEXT: s_mov_b32 s1, s5
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(1)
+; SI-NOFMA-NEXT: v_sub_f32_e32 v0, 1.0, v0
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0
+; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NOFMA-NEXT: s_endpgm
+;
+; SI-FMA-LABEL: test_f32_mul_y_sub_one_x:
+; SI-FMA: ; %bb.0:
+; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-FMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-FMA-NEXT: s_mov_b32 s2, -1
+; SI-FMA-NEXT: s_mov_b32 s14, s2
+; SI-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-FMA-NEXT: s_mov_b32 s12, s6
+; SI-FMA-NEXT: s_mov_b32 s13, s7
+; SI-FMA-NEXT: s_mov_b32 s15, s3
+; SI-FMA-NEXT: s_mov_b32 s10, s2
+; SI-FMA-NEXT: s_mov_b32 s11, s3
+; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
+; SI-FMA-NEXT: s_mov_b32 s0, s4
+; SI-FMA-NEXT: s_mov_b32 s1, s5
+; SI-FMA-NEXT: s_waitcnt vmcnt(0)
+; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, v1
+; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-FMA-NEXT: s_endpgm
+;
+; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_one_x:
+; GFX11-NOFMA: ; %bb.0:
+; GFX11-NOFMA-NEXT: s_clause 0x1
+; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NOFMA-NEXT: s_clause 0x1
+; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NOFMA-NEXT: v_sub_f32_e32 v1, 1.0, v1
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NOFMA-NEXT: s_endpgm
+;
+; GFX11-FMA-LABEL: test_f32_mul_y_sub_one_x:
+; GFX11-FMA: ; %bb.0:
+; GFX11-FMA-NEXT: s_clause 0x1
+; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT: s_clause 0x1
+; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, v2
+; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FMA-NEXT: s_endpgm
ptr addrspace(1) %in1,
ptr addrspace(1) %in2) {
%x = load float, ptr addrspace(1) %in1
ret void
}
-; FUNC-LABEL: {{^}}test_f32_mul_sub_negone_x_y:
-; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
-;
-; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
; Covers mul(sub(-1.0, x), y), going by the function name; most of the IR body
; is outside this view.
; The autogenerated checks below expect, for the NOFMA prefixes, a v_sub_f32
; from -1.0 followed by a separate v_mul_f32, and for the FMA prefixes a single
; v_fma_f32 with both the first multiplicand and the addend negated
; (v_fma_f32 v0, -v0, v1, -v1 on SI; v_fma_f32 v1, -v1, v2, -v2 on GFX11).
; The visible load of %x is a plain (non-volatile) load; the checks show both
; loads issued back to back before any vmcnt wait.
define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out,
+; SI-NOFMA-LABEL: test_f32_mul_sub_negone_x_y:
+; SI-NOFMA: ; %bb.0:
+; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-NOFMA-NEXT: s_mov_b32 s2, -1
+; SI-NOFMA-NEXT: s_mov_b32 s14, s2
+; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NOFMA-NEXT: s_mov_b32 s12, s6
+; SI-NOFMA-NEXT: s_mov_b32 s13, s7
+; SI-NOFMA-NEXT: s_mov_b32 s15, s3
+; SI-NOFMA-NEXT: s_mov_b32 s10, s2
+; SI-NOFMA-NEXT: s_mov_b32 s11, s3
+; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
+; SI-NOFMA-NEXT: s_mov_b32 s0, s4
+; SI-NOFMA-NEXT: s_mov_b32 s1, s5
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(1)
+; SI-NOFMA-NEXT: v_sub_f32_e32 v0, -1.0, v0
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NOFMA-NEXT: s_endpgm
+;
+; SI-FMA-LABEL: test_f32_mul_sub_negone_x_y:
+; SI-FMA: ; %bb.0:
+; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-FMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-FMA-NEXT: s_mov_b32 s2, -1
+; SI-FMA-NEXT: s_mov_b32 s14, s2
+; SI-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-FMA-NEXT: s_mov_b32 s12, s6
+; SI-FMA-NEXT: s_mov_b32 s13, s7
+; SI-FMA-NEXT: s_mov_b32 s15, s3
+; SI-FMA-NEXT: s_mov_b32 s10, s2
+; SI-FMA-NEXT: s_mov_b32 s11, s3
+; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
+; SI-FMA-NEXT: s_mov_b32 s0, s4
+; SI-FMA-NEXT: s_mov_b32 s1, s5
+; SI-FMA-NEXT: s_waitcnt vmcnt(0)
+; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, -v1
+; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-FMA-NEXT: s_endpgm
+;
+; GFX11-NOFMA-LABEL: test_f32_mul_sub_negone_x_y:
+; GFX11-NOFMA: ; %bb.0:
+; GFX11-NOFMA-NEXT: s_clause 0x1
+; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NOFMA-NEXT: s_clause 0x1
+; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NOFMA-NEXT: v_sub_f32_e32 v1, -1.0, v1
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NOFMA-NEXT: s_endpgm
+;
+; GFX11-FMA-LABEL: test_f32_mul_sub_negone_x_y:
+; GFX11-FMA: ; %bb.0:
+; GFX11-FMA-NEXT: s_clause 0x1
+; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT: s_clause 0x1
+; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, -v2
+; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FMA-NEXT: s_endpgm
ptr addrspace(1) %in1,
ptr addrspace(1) %in2) {
%x = load float, ptr addrspace(1) %in1
ret void
}
-; FUNC-LABEL: {{^}}test_f32_mul_y_sub_negone_x:
-; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
-;
-; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
; Covers mul(y, sub(-1.0, x)) -- the multiply operands are swapped relative to
; the mul_sub_negone_x_y form, going by the function name; most of the IR body
; is outside this view.
; The autogenerated checks below expect, for the NOFMA prefixes, a v_sub_f32
; from -1.0 followed by a v_mul_f32 whose first source is the other loaded
; value (v_mul_f32_e32 v0, v1, v0 on SI), and for the FMA prefixes a single
; v_fma_f32 with both the first multiplicand and the addend negated
; (v_fma_f32 v0, -v0, v1, -v1 on SI; v_fma_f32 v1, -v1, v2, -v2 on GFX11).
; The visible load of %x is a plain (non-volatile) load; the checks show both
; loads issued back to back before any vmcnt wait.
define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out,
+; SI-NOFMA-LABEL: test_f32_mul_y_sub_negone_x:
+; SI-NOFMA: ; %bb.0:
+; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-NOFMA-NEXT: s_mov_b32 s2, -1
+; SI-NOFMA-NEXT: s_mov_b32 s14, s2
+; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NOFMA-NEXT: s_mov_b32 s12, s6
+; SI-NOFMA-NEXT: s_mov_b32 s13, s7
+; SI-NOFMA-NEXT: s_mov_b32 s15, s3
+; SI-NOFMA-NEXT: s_mov_b32 s10, s2
+; SI-NOFMA-NEXT: s_mov_b32 s11, s3
+; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
+; SI-NOFMA-NEXT: s_mov_b32 s0, s4
+; SI-NOFMA-NEXT: s_mov_b32 s1, s5
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(1)
+; SI-NOFMA-NEXT: v_sub_f32_e32 v0, -1.0, v0
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0
+; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NOFMA-NEXT: s_endpgm
+;
+; SI-FMA-LABEL: test_f32_mul_y_sub_negone_x:
+; SI-FMA: ; %bb.0:
+; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-FMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-FMA-NEXT: s_mov_b32 s2, -1
+; SI-FMA-NEXT: s_mov_b32 s14, s2
+; SI-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-FMA-NEXT: s_mov_b32 s12, s6
+; SI-FMA-NEXT: s_mov_b32 s13, s7
+; SI-FMA-NEXT: s_mov_b32 s15, s3
+; SI-FMA-NEXT: s_mov_b32 s10, s2
+; SI-FMA-NEXT: s_mov_b32 s11, s3
+; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
+; SI-FMA-NEXT: s_mov_b32 s0, s4
+; SI-FMA-NEXT: s_mov_b32 s1, s5
+; SI-FMA-NEXT: s_waitcnt vmcnt(0)
+; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, -v1
+; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-FMA-NEXT: s_endpgm
+;
+; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_negone_x:
+; GFX11-NOFMA: ; %bb.0:
+; GFX11-NOFMA-NEXT: s_clause 0x1
+; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NOFMA-NEXT: s_clause 0x1
+; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NOFMA-NEXT: v_sub_f32_e32 v1, -1.0, v1
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NOFMA-NEXT: s_endpgm
+;
+; GFX11-FMA-LABEL: test_f32_mul_y_sub_negone_x:
+; GFX11-FMA: ; %bb.0:
+; GFX11-FMA-NEXT: s_clause 0x1
+; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT: s_clause 0x1
+; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, -v2
+; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FMA-NEXT: s_endpgm
ptr addrspace(1) %in1,
ptr addrspace(1) %in2) {
%x = load float, ptr addrspace(1) %in1
ret void
}
-; FUNC-LABEL: {{^}}test_f32_mul_sub_x_one_y:
-; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
-;
-; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
; (x - 1.0) * y. Per the assertions below, the *-NOFMA prefixes expect an add
; of -1.0 to x followed by a multiply by y, while the *-FMA prefixes expect a
; single fma x, y, -y.
define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out,
+; SI-NOFMA-LABEL: test_f32_mul_sub_x_one_y:
+; SI-NOFMA: ; %bb.0:
+; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-NOFMA-NEXT: s_mov_b32 s2, -1
+; SI-NOFMA-NEXT: s_mov_b32 s14, s2
+; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NOFMA-NEXT: s_mov_b32 s12, s6
+; SI-NOFMA-NEXT: s_mov_b32 s13, s7
+; SI-NOFMA-NEXT: s_mov_b32 s15, s3
+; SI-NOFMA-NEXT: s_mov_b32 s10, s2
+; SI-NOFMA-NEXT: s_mov_b32 s11, s3
+; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
+; SI-NOFMA-NEXT: s_mov_b32 s0, s4
+; SI-NOFMA-NEXT: s_mov_b32 s1, s5
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(1)
+; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NOFMA-NEXT: s_endpgm
+;
+; SI-FMA-LABEL: test_f32_mul_sub_x_one_y:
+; SI-FMA: ; %bb.0:
+; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-FMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-FMA-NEXT: s_mov_b32 s2, -1
+; SI-FMA-NEXT: s_mov_b32 s14, s2
+; SI-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-FMA-NEXT: s_mov_b32 s12, s6
+; SI-FMA-NEXT: s_mov_b32 s13, s7
+; SI-FMA-NEXT: s_mov_b32 s15, s3
+; SI-FMA-NEXT: s_mov_b32 s10, s2
+; SI-FMA-NEXT: s_mov_b32 s11, s3
+; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
+; SI-FMA-NEXT: s_mov_b32 s0, s4
+; SI-FMA-NEXT: s_mov_b32 s1, s5
+; SI-FMA-NEXT: s_waitcnt vmcnt(0)
+; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1
+; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-FMA-NEXT: s_endpgm
+;
+; GFX11-NOFMA-LABEL: test_f32_mul_sub_x_one_y:
+; GFX11-NOFMA: ; %bb.0:
+; GFX11-NOFMA-NEXT: s_clause 0x1
+; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NOFMA-NEXT: s_clause 0x1
+; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, -1.0, v1
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NOFMA-NEXT: s_endpgm
+;
+; GFX11-FMA-LABEL: test_f32_mul_sub_x_one_y:
+; GFX11-FMA: ; %bb.0:
+; GFX11-FMA-NEXT: s_clause 0x1
+; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT: s_clause 0x1
+; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2
+; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FMA-NEXT: s_endpgm
ptr addrspace(1) %in1,
ptr addrspace(1) %in2) {
%x = load float, ptr addrspace(1) %in1
ret void
}
-; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_one:
-; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
-;
-; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
; y * (x - 1.0), i.e. the previous pattern with the multiply operands swapped.
; Per the assertions below, the *-NOFMA prefixes expect an add of -1.0 to x and
; then a multiply with y as the first source operand; the *-FMA prefixes expect
; a single fma x, y, -y.
define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out,
+; SI-NOFMA-LABEL: test_f32_mul_y_sub_x_one:
+; SI-NOFMA: ; %bb.0:
+; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-NOFMA-NEXT: s_mov_b32 s2, -1
+; SI-NOFMA-NEXT: s_mov_b32 s14, s2
+; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NOFMA-NEXT: s_mov_b32 s12, s6
+; SI-NOFMA-NEXT: s_mov_b32 s13, s7
+; SI-NOFMA-NEXT: s_mov_b32 s15, s3
+; SI-NOFMA-NEXT: s_mov_b32 s10, s2
+; SI-NOFMA-NEXT: s_mov_b32 s11, s3
+; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
+; SI-NOFMA-NEXT: s_mov_b32 s0, s4
+; SI-NOFMA-NEXT: s_mov_b32 s1, s5
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(1)
+; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0
+; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NOFMA-NEXT: s_endpgm
+;
+; SI-FMA-LABEL: test_f32_mul_y_sub_x_one:
+; SI-FMA: ; %bb.0:
+; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-FMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-FMA-NEXT: s_mov_b32 s2, -1
+; SI-FMA-NEXT: s_mov_b32 s14, s2
+; SI-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-FMA-NEXT: s_mov_b32 s12, s6
+; SI-FMA-NEXT: s_mov_b32 s13, s7
+; SI-FMA-NEXT: s_mov_b32 s15, s3
+; SI-FMA-NEXT: s_mov_b32 s10, s2
+; SI-FMA-NEXT: s_mov_b32 s11, s3
+; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
+; SI-FMA-NEXT: s_mov_b32 s0, s4
+; SI-FMA-NEXT: s_mov_b32 s1, s5
+; SI-FMA-NEXT: s_waitcnt vmcnt(0)
+; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1
+; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-FMA-NEXT: s_endpgm
+;
+; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_x_one:
+; GFX11-NOFMA: ; %bb.0:
+; GFX11-NOFMA-NEXT: s_clause 0x1
+; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NOFMA-NEXT: s_clause 0x1
+; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, -1.0, v1
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NOFMA-NEXT: s_endpgm
+;
+; GFX11-FMA-LABEL: test_f32_mul_y_sub_x_one:
+; GFX11-FMA: ; %bb.0:
+; GFX11-FMA-NEXT: s_clause 0x1
+; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT: s_clause 0x1
+; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2
+; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FMA-NEXT: s_endpgm
ptr addrspace(1) %in1,
ptr addrspace(1) %in2) {
%x = load float, ptr addrspace(1) %in1
ret void
}
-; FUNC-LABEL: {{^}}test_f32_mul_sub_x_negone_y:
-; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
-;
-; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
; (x - (-1.0)) * y. Per the assertions below, the *-NOFMA prefixes expect an
; add of 1.0 to x followed by a multiply by y; the *-FMA prefixes expect a
; single fused x * y + y (v_fma_f32 on SI, v_fmac_f32 accumulating into y's
; register on GFX11).
define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out,
+; SI-NOFMA-LABEL: test_f32_mul_sub_x_negone_y:
+; SI-NOFMA: ; %bb.0:
+; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-NOFMA-NEXT: s_mov_b32 s2, -1
+; SI-NOFMA-NEXT: s_mov_b32 s14, s2
+; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NOFMA-NEXT: s_mov_b32 s12, s6
+; SI-NOFMA-NEXT: s_mov_b32 s13, s7
+; SI-NOFMA-NEXT: s_mov_b32 s15, s3
+; SI-NOFMA-NEXT: s_mov_b32 s10, s2
+; SI-NOFMA-NEXT: s_mov_b32 s11, s3
+; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
+; SI-NOFMA-NEXT: s_mov_b32 s0, s4
+; SI-NOFMA-NEXT: s_mov_b32 s1, s5
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(1)
+; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NOFMA-NEXT: s_endpgm
+;
+; SI-FMA-LABEL: test_f32_mul_sub_x_negone_y:
+; SI-FMA: ; %bb.0:
+; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-FMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-FMA-NEXT: s_mov_b32 s2, -1
+; SI-FMA-NEXT: s_mov_b32 s14, s2
+; SI-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-FMA-NEXT: s_mov_b32 s12, s6
+; SI-FMA-NEXT: s_mov_b32 s13, s7
+; SI-FMA-NEXT: s_mov_b32 s15, s3
+; SI-FMA-NEXT: s_mov_b32 s10, s2
+; SI-FMA-NEXT: s_mov_b32 s11, s3
+; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
+; SI-FMA-NEXT: s_mov_b32 s0, s4
+; SI-FMA-NEXT: s_mov_b32 s1, s5
+; SI-FMA-NEXT: s_waitcnt vmcnt(0)
+; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, v1
+; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-FMA-NEXT: s_endpgm
+;
+; GFX11-NOFMA-LABEL: test_f32_mul_sub_x_negone_y:
+; GFX11-NOFMA: ; %bb.0:
+; GFX11-NOFMA-NEXT: s_clause 0x1
+; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NOFMA-NEXT: s_clause 0x1
+; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NOFMA-NEXT: s_endpgm
+;
+; GFX11-FMA-LABEL: test_f32_mul_sub_x_negone_y:
+; GFX11-FMA: ; %bb.0:
+; GFX11-FMA-NEXT: s_clause 0x1
+; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT: s_clause 0x1
+; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2
+; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[4:5]
+; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FMA-NEXT: s_endpgm
ptr addrspace(1) %in1,
ptr addrspace(1) %in2) {
%x = load float, ptr addrspace(1) %in1
ret void
}
-; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_negone:
-; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
-;
-; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
; y * (x - (-1.0)), i.e. the previous pattern with the multiply operands
; swapped. Per the assertions below, the *-NOFMA prefixes expect an add of 1.0
; to x and then a multiply with y as the first source operand; the *-FMA
; prefixes expect a single fused x * y + y.
define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out,
+; SI-NOFMA-LABEL: test_f32_mul_y_sub_x_negone:
+; SI-NOFMA: ; %bb.0:
+; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-NOFMA-NEXT: s_mov_b32 s2, -1
+; SI-NOFMA-NEXT: s_mov_b32 s14, s2
+; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NOFMA-NEXT: s_mov_b32 s12, s6
+; SI-NOFMA-NEXT: s_mov_b32 s13, s7
+; SI-NOFMA-NEXT: s_mov_b32 s15, s3
+; SI-NOFMA-NEXT: s_mov_b32 s10, s2
+; SI-NOFMA-NEXT: s_mov_b32 s11, s3
+; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
+; SI-NOFMA-NEXT: s_mov_b32 s0, s4
+; SI-NOFMA-NEXT: s_mov_b32 s1, s5
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(1)
+; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0
+; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NOFMA-NEXT: s_endpgm
+;
+; SI-FMA-LABEL: test_f32_mul_y_sub_x_negone:
+; SI-FMA: ; %bb.0:
+; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-FMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-FMA-NEXT: s_mov_b32 s2, -1
+; SI-FMA-NEXT: s_mov_b32 s14, s2
+; SI-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-FMA-NEXT: s_mov_b32 s12, s6
+; SI-FMA-NEXT: s_mov_b32 s13, s7
+; SI-FMA-NEXT: s_mov_b32 s15, s3
+; SI-FMA-NEXT: s_mov_b32 s10, s2
+; SI-FMA-NEXT: s_mov_b32 s11, s3
+; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0
+; SI-FMA-NEXT: s_mov_b32 s0, s4
+; SI-FMA-NEXT: s_mov_b32 s1, s5
+; SI-FMA-NEXT: s_waitcnt vmcnt(0)
+; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, v1
+; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-FMA-NEXT: s_endpgm
+;
+; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_x_negone:
+; GFX11-NOFMA: ; %bb.0:
+; GFX11-NOFMA-NEXT: s_clause 0x1
+; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NOFMA-NEXT: s_clause 0x1
+; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NOFMA-NEXT: s_endpgm
+;
+; GFX11-FMA-LABEL: test_f32_mul_y_sub_x_negone:
+; GFX11-FMA: ; %bb.0:
+; GFX11-FMA-NEXT: s_clause 0x1
+; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT: s_clause 0x1
+; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2
+; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[4:5]
+; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FMA-NEXT: s_endpgm
ptr addrspace(1) %in1,
ptr addrspace(1) %in2) {
%x = load float, ptr addrspace(1) %in1
; Interpolation Patterns: add(mul(x,t),mul(sub(1.0,t),y))
;
-; FUNC-LABEL: {{^}}test_f32_interp:
-; SI-NOFMA: v_sub_f32_e32 [[VT1:v[0-9]]], 1.0, [[VT:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 [[VTY:v[0-9]]], [[VY:v[0-9]]], [[VT1]]
-; SI-NOFMA: v_mac_f32_e32 [[VTY]], [[VX:v[0-9]]], [[VT]]
-;
-; SI-FMA: v_fma_f32 [[VR:v[0-9]]], -[[VT:v[0-9]]], [[VY:v[0-9]]], [[VY]]
-; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VT]], [[VR]]
; f32 interpolation x*t + (1.0 - t)*y. Per the assertions below, the *-NOFMA
; prefixes expect a subtract from 1.0, a multiply, and then a multiply-
; accumulate (v_mac_f32 on SI, v_fmac_f32 on GFX11); the *-FMA prefixes expect
; fma(-t, y, y) followed by a second fused multiply-add of x*t onto that
; result.
define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out,
+; SI-NOFMA-LABEL: test_f32_interp:
+; SI-NOFMA: ; %bb.0:
+; SI-NOFMA-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000
+; SI-NOFMA-NEXT: s_mov_b32 s10, -1
+; SI-NOFMA-NEXT: s_mov_b32 s14, s10
+; SI-NOFMA-NEXT: s_mov_b32 s15, s11
+; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NOFMA-NEXT: s_mov_b32 s16, s4
+; SI-NOFMA-NEXT: s_mov_b32 s17, s5
+; SI-NOFMA-NEXT: s_mov_b32 s4, s6
+; SI-NOFMA-NEXT: s_mov_b32 s5, s7
+; SI-NOFMA-NEXT: s_mov_b32 s6, s10
+; SI-NOFMA-NEXT: s_mov_b32 s7, s11
+; SI-NOFMA-NEXT: s_mov_b32 s12, s2
+; SI-NOFMA-NEXT: s_mov_b32 s13, s3
+; SI-NOFMA-NEXT: s_mov_b32 s18, s10
+; SI-NOFMA-NEXT: s_mov_b32 s19, s11
+; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[16:19], 0
+; SI-NOFMA-NEXT: buffer_load_dword v2, off, s[12:15], 0
+; SI-NOFMA-NEXT: s_mov_b32 s8, s0
+; SI-NOFMA-NEXT: s_mov_b32 s9, s1
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(2)
+; SI-NOFMA-NEXT: v_sub_f32_e32 v3, 1.0, v0
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(1)
+; SI-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v3
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-NOFMA-NEXT: v_mac_f32_e32 v1, v2, v0
+; SI-NOFMA-NEXT: buffer_store_dword v1, off, s[8:11], 0
+; SI-NOFMA-NEXT: s_endpgm
+;
+; SI-FMA-LABEL: test_f32_interp:
+; SI-FMA: ; %bb.0:
+; SI-FMA-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-FMA-NEXT: s_mov_b32 s11, 0xf000
+; SI-FMA-NEXT: s_mov_b32 s10, -1
+; SI-FMA-NEXT: s_mov_b32 s18, s10
+; SI-FMA-NEXT: s_mov_b32 s19, s11
+; SI-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-FMA-NEXT: s_mov_b32 s16, s4
+; SI-FMA-NEXT: s_mov_b32 s17, s5
+; SI-FMA-NEXT: s_mov_b32 s14, s10
+; SI-FMA-NEXT: s_mov_b32 s12, s2
+; SI-FMA-NEXT: s_mov_b32 s13, s3
+; SI-FMA-NEXT: s_mov_b32 s15, s11
+; SI-FMA-NEXT: s_mov_b32 s4, s6
+; SI-FMA-NEXT: s_mov_b32 s5, s7
+; SI-FMA-NEXT: s_mov_b32 s6, s10
+; SI-FMA-NEXT: s_mov_b32 s7, s11
+; SI-FMA-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; SI-FMA-NEXT: buffer_load_dword v1, off, s[4:7], 0
+; SI-FMA-NEXT: buffer_load_dword v2, off, s[12:15], 0
+; SI-FMA-NEXT: s_mov_b32 s8, s0
+; SI-FMA-NEXT: s_mov_b32 s9, s1
+; SI-FMA-NEXT: s_waitcnt vmcnt(1)
+; SI-FMA-NEXT: v_fma_f32 v0, -v1, v0, v0
+; SI-FMA-NEXT: s_waitcnt vmcnt(0)
+; SI-FMA-NEXT: v_fma_f32 v0, v2, v1, v0
+; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-FMA-NEXT: s_endpgm
+;
+; GFX11-NOFMA-LABEL: test_f32_interp:
+; GFX11-NOFMA: ; %bb.0:
+; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NOFMA-NEXT: s_clause 0x2
+; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5]
+; GFX11-NOFMA-NEXT: global_load_b32 v3, v0, s[2:3]
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(2)
+; GFX11-NOFMA-NEXT: v_sub_f32_e32 v4, 1.0, v1
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NOFMA-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: v_fmac_f32_e32 v2, v3, v1
+; GFX11-NOFMA-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NOFMA-NEXT: s_endpgm
+;
+; GFX11-FMA-LABEL: test_f32_interp:
+; GFX11-FMA: ; %bb.0:
+; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT: s_clause 0x2
+; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[4:5]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[6:7]
+; GFX11-FMA-NEXT: global_load_b32 v3, v0, s[2:3]
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(1)
+; GFX11-FMA-NEXT: v_fma_f32 v1, -v2, v1, v1
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FMA-NEXT: v_fmac_f32_e32 v1, v3, v2
+; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FMA-NEXT: s_endpgm
ptr addrspace(1) %in1,
ptr addrspace(1) %in2,
ptr addrspace(1) %in3) {
ret void
}
-; FUNC-LABEL: {{^}}test_f64_interp:
-; SI-NOFMA: v_add_f64 [[VT1:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], 1.0
-; SI-NOFMA: v_mul_f64 [[VTY:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VT1]]
-; SI-NOFMA: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VTY]]
-;
-; SI-FMA: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VY]]
-; SI-FMA: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VR]]
; f64 version of the interpolation pattern x*t + (1.0 - t)*y. Per the
; assertions below, the *-FMA prefixes expect fma(-t, y, y) followed by a
; second v_fma_f64 adding x*t; the GFX11 no-FMA variant expects v_add_f64 with
; a negated t, v_mul_f64, and then still a v_fma_f64 for the final sum.
; NOTE(review): unlike test_f32_interp, no assertions for the SI-NOFMA prefix
; appear for this function -- presumably the check generator dropped them;
; confirm that the f64 no-FMA path on SI is still covered.
define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out,
+; SI-FMA-LABEL: test_f64_interp:
+; SI-FMA: ; %bb.0:
+; SI-FMA-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-FMA-NEXT: s_mov_b32 s11, 0xf000
+; SI-FMA-NEXT: s_mov_b32 s10, -1
+; SI-FMA-NEXT: s_mov_b32 s18, s10
+; SI-FMA-NEXT: s_mov_b32 s19, s11
+; SI-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-FMA-NEXT: s_mov_b32 s16, s4
+; SI-FMA-NEXT: s_mov_b32 s17, s5
+; SI-FMA-NEXT: s_mov_b32 s4, s6
+; SI-FMA-NEXT: s_mov_b32 s5, s7
+; SI-FMA-NEXT: s_mov_b32 s6, s10
+; SI-FMA-NEXT: s_mov_b32 s7, s11
+; SI-FMA-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; SI-FMA-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
+; SI-FMA-NEXT: s_mov_b32 s14, s10
+; SI-FMA-NEXT: s_mov_b32 s12, s2
+; SI-FMA-NEXT: s_mov_b32 s13, s3
+; SI-FMA-NEXT: s_mov_b32 s15, s11
+; SI-FMA-NEXT: buffer_load_dwordx2 v[4:5], off, s[12:15], 0
+; SI-FMA-NEXT: s_mov_b32 s8, s0
+; SI-FMA-NEXT: s_mov_b32 s9, s1
+; SI-FMA-NEXT: s_waitcnt vmcnt(1)
+; SI-FMA-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], v[0:1]
+; SI-FMA-NEXT: s_waitcnt vmcnt(0)
+; SI-FMA-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; SI-FMA-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SI-FMA-NEXT: s_endpgm
+;
+; GFX11-NOFMA-LABEL: test_f64_interp:
+; GFX11-NOFMA: ; %bb.0:
+; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NOFMA-NEXT: v_mov_b32_e32 v8, 0
+; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NOFMA-NEXT: s_clause 0x2
+; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v8, s[6:7]
+; GFX11-NOFMA-NEXT: global_load_b64 v[2:3], v8, s[4:5]
+; GFX11-NOFMA-NEXT: global_load_b64 v[4:5], v8, s[2:3]
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(2)
+; GFX11-NOFMA-NEXT: v_add_f64 v[6:7], -v[0:1], 1.0
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NOFMA-NEXT: v_mul_f64 v[2:3], v[2:3], v[6:7]
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3]
+; GFX11-NOFMA-NEXT: global_store_b64 v8, v[0:1], s[0:1]
+; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NOFMA-NEXT: s_endpgm
+;
+; GFX11-FMA-LABEL: test_f64_interp:
+; GFX11-FMA: ; %bb.0:
+; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-FMA-NEXT: v_mov_b32_e32 v6, 0
+; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT: s_clause 0x2
+; GFX11-FMA-NEXT: global_load_b64 v[0:1], v6, s[4:5]
+; GFX11-FMA-NEXT: global_load_b64 v[2:3], v6, s[6:7]
+; GFX11-FMA-NEXT: global_load_b64 v[4:5], v6, s[2:3]
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(1)
+; GFX11-FMA-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], v[0:1]
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FMA-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; GFX11-FMA-NEXT: global_store_b64 v6, v[0:1], s[0:1]
+; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FMA-NEXT: s_endpgm
ptr addrspace(1) %in1,
ptr addrspace(1) %in2,
ptr addrspace(1) %in3) {
}
; Make sure negative constant cancels out fneg
-; SI-LABEL: {{^}}fma_neg_2.0_neg_a_b_f32:
-; SI: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
-; SI: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
-; SI-NOT: [[A]]
-; SI-NOT: [[B]]
-; SI: v_fma_f32 v{{[0-9]+}}, [[A]], 2.0, [[B]]
; fma(-2.0, -a, b), going by the function name. The assertions below expect
; both negations to fold away: the fused multiply-add uses a and a positive
; 2.0 constant with no source negate modifiers (v_fma_f32 on SI, v_fmac_f32
; accumulating into b's register on GFX11).
define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-LABEL: fma_neg_2.0_neg_a_b_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_fma_f32 v2, v2, 2.0, v3
+; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fma_neg_2.0_neg_a_b_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:4 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_fmac_f32_e32 v2, 2.0, v1
+; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid
%gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
ret void
}
-; SI-LABEL: {{^}}fma_2.0_neg_a_b_f32:
-; SI: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
-; SI: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
-; SI-NOT: [[A]]
-; SI-NOT: [[B]]
-; SI: v_fma_f32 v{{[0-9]+}}, [[A]], -2.0, [[B]]
; fma(2.0, -a, b), going by the function name. The assertions below expect the
; negation of a to be folded into the constant: the fused multiply-add uses a
; unmodified together with a -2.0 constant (v_fma_f32 on SI, v_fmac_f32
; accumulating into b's register on GFX11).
define amdgpu_kernel void @fma_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-LABEL: fma_2.0_neg_a_b_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_fma_f32 v2, v2, -2.0, v3
+; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fma_2.0_neg_a_b_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:4 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_fmac_f32_e32 v2, -2.0, v1
+; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid
%gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
ret void
}
-; SI-LABEL: {{^}}fma_neg_b_c_v4f32:
-; SI: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
-; SI: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
-; SI: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
-; SI: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
define amdgpu_kernel void @fma_neg_b_c_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #2 {
+; SI-LABEL: fma_neg_b_c_v4f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: v_lshlrev_b32_e32 v12, 4, v0
+; SI-NEXT: v_mov_b32_e32 v13, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: buffer_load_dwordx4 v[0:3], v[12:13], s[4:7], 0 addr64
+; SI-NEXT: buffer_load_dwordx4 v[4:7], v[12:13], s[4:7], 0 addr64 offset:16
+; SI-NEXT: buffer_load_dwordx4 v[8:11], v[12:13], s[4:7], 0 addr64 offset:48
+; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_fma_f32 v3, v11, -v3, -v7
+; SI-NEXT: v_fma_f32 v2, v10, -v2, -v6
+; SI-NEXT: v_fma_f32 v1, v9, -v1, -v5
+; SI-NEXT: v_fma_f32 v0, v8, -v0, -v4
+; SI-NEXT: buffer_store_dwordx4 v[0:3], v[12:13], s[0:3], 0 addr64
+; SI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fma_neg_b_c_v4f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 4, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: global_load_b128 v[0:3], v12, s[2:3] offset:16
+; GFX11-NEXT: global_load_b128 v[4:7], v12, s[2:3]
+; GFX11-NEXT: global_load_b128 v[8:11], v12, s[2:3] offset:48
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_fma_f32 v3, v11, -v7, -v3
+; GFX11-NEXT: v_fma_f32 v2, v10, -v6, -v2
+; GFX11-NEXT: v_fma_f32 v1, v9, -v5, -v1
+; GFX11-NEXT: v_fma_f32 v0, v8, -v4, -v0
+; GFX11-NEXT: global_store_b128 v12, v[0:3], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr <4 x float>, ptr addrspace(1) %in, i32 %tid
%gep.1 = getelementptr <4 x float>, ptr addrspace(1) %gep.0, i32 1