ret void
}
+; Scalar half test: an fmul whose result feeds an fadd, with no fast-math
+; flags on either instruction. The directives below name the instruction(s)
+; each prefix expects in the generated code for this kernel.
+; GCN-LABEL: {{^}}fmul_fadd_f16:
+; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+
+; VI-DENORM-CONTRACT: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+
+; GFX10-FLUSH: v_mul_f16_e32
+; GFX10-FLUSH: v_add_f16_e32
+; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+
+define amdgpu_kernel void @fmul_fadd_f16(
+    half addrspace(1)* %out,
+    half addrspace(1)* %in1,
+    half addrspace(1)* %in2,
+    half addrspace(1)* %in3) #0 {
+  ; Fetch the two multiplicands and the addend from addrspace(1).
+  %a = load half, half addrspace(1)* %in1
+  %b = load half, half addrspace(1)* %in2
+  %c = load half, half addrspace(1)* %in3
+  ; Separate multiply and add; neither carries a fast-math flag.
+  %prod = fmul half %a, %b
+  %sum = fadd half %prod, %c
+  store half %sum, half addrspace(1)* %out
+  ret void
+}
+
+; Scalar half test: an fmul whose result feeds an fadd, where only the fadd
+; carries the `contract` fast-math flag. The directives below name the
+; instruction(s) each prefix expects in the generated code for this kernel.
+; GCN-LABEL: {{^}}fmul_fadd_contract_f16:
+; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+
+; VI-DENORM: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+
+; GFX10-FLUSH: v_mul_f16_e32
+; GFX10-FLUSH: v_add_f16_e32
+; GFX10-DENORM: v_fmac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+
+define amdgpu_kernel void @fmul_fadd_contract_f16(
+    half addrspace(1)* %out,
+    half addrspace(1)* %in1,
+    half addrspace(1)* %in2,
+    half addrspace(1)* %in3) #0 {
+  ; Fetch the two multiplicands and the addend from addrspace(1).
+  %a = load half, half addrspace(1)* %in1
+  %b = load half, half addrspace(1)* %in2
+  %c = load half, half addrspace(1)* %in3
+  ; The multiply is unflagged; `contract` appears on the add only.
+  %prod = fmul half %a, %b
+  %sum = fadd contract half %prod, %c
+  store half %sum, half addrspace(1)* %out
+  ret void
+}
+
; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f16
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
ret void
}
+; Scalar float test: an fmul whose result feeds an fadd, where only the fadd
+; carries the `contract` fast-math flag. All three inputs are read with
+; volatile loads. The directives below name the instruction(s) each prefix
+; expects in the generated code for this kernel.
+; GCN-LABEL: {{^}}fmul_fadd_contract_f32:
+; GCN-FLUSH-FMAC: v_fmac_f32_e32
+
+; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32
+; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32
+
+; GCN-DENORM-FASTFMA: v_fma_f32
+define amdgpu_kernel void @fmul_fadd_contract_f32(
+    float addrspace(1)* %out,
+    float addrspace(1)* %in1,
+    float addrspace(1)* %in2,
+    float addrspace(1)* %in3) #0 {
+  ; Volatile loads, kept in their original order.
+  %a = load volatile float, float addrspace(1)* %in1
+  %b = load volatile float, float addrspace(1)* %in2
+  %c = load volatile float, float addrspace(1)* %in3
+  ; The multiply is unflagged; `contract` appears on the add only.
+  %prod = fmul float %a, %b
+  %sum = fadd contract float %prod, %c
+  store float %sum, float addrspace(1)* %out
+  ret void
+}
+
; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f32
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICTSI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s
ret void
}
+; Scalar double test: an fmul whose result feeds an fadd, where only the fadd
+; carries the `contract` fast-math flag. The single directive below expects a
+; v_fma_f64 with four 64-bit register-pair operands under the common prefix.
+; GCN-LABEL: {{^}}fmul_fadd_contract_f64:
+; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+
+define amdgpu_kernel void @fmul_fadd_contract_f64(
+    double addrspace(1)* %out,
+    double addrspace(1)* %in1,
+    double addrspace(1)* %in2,
+    double addrspace(1)* %in3) #0 {
+  ; Fetch the two multiplicands and the addend from addrspace(1).
+  %a = load double, double addrspace(1)* %in1
+  %b = load double, double addrspace(1)* %in2
+  %c = load double, double addrspace(1)* %in3
+  ; The multiply is unflagged; `contract` appears on the add only.
+  %prod = fmul double %a, %b
+  %sum = fadd contract double %prod, %c
+  store double %sum, double addrspace(1)* %out
+  ret void
+}
+
; GCN-LABEL: {{^}}fadd_a_a_b_f64:
; GCN: {{buffer|flat}}_load_dwordx2 [[R1:v\[[0-9]+:[0-9]+\]]],
; GCN: {{buffer|flat}}_load_dwordx2 [[R2:v\[[0-9]+:[0-9]+\]]],
ret void
}
+; Packed <2 x half> test: an fmul whose result feeds an fadd, with no
+; fast-math flags on either instruction. The directives below name the
+; packed instruction(s) each prefix expects for this kernel.
+; GCN-LABEL: {{^}}fmul_fadd_v2f16:
+; GFX9-DENORM-STRICT: v_pk_mul_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+; GFX9-DENORM-STRICT: v_pk_add_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+
+; GFX9-DENORM-CONTRACT: v_pk_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+define amdgpu_kernel void @fmul_fadd_v2f16(
+    <2 x half> addrspace(1)* %out,
+    <2 x half> addrspace(1)* %in1,
+    <2 x half> addrspace(1)* %in2,
+    <2 x half> addrspace(1)* %in3) #0 {
+  ; Fetch the two multiplicand vectors and the addend vector.
+  %a = load <2 x half>, <2 x half> addrspace(1)* %in1
+  %b = load <2 x half>, <2 x half> addrspace(1)* %in2
+  %c = load <2 x half>, <2 x half> addrspace(1)* %in3
+  ; Separate multiply and add; neither carries a fast-math flag.
+  %prod = fmul <2 x half> %a, %b
+  %sum = fadd <2 x half> %prod, %c
+  store <2 x half> %sum, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; Packed <2 x half> test: an fmul whose result feeds an fadd, where only the
+; fadd carries the `contract` fast-math flag. The directives below name the
+; packed instruction(s) each prefix expects for this kernel.
+; GCN-LABEL: {{^}}fmul_fadd_contract_v2f16:
+; GFX9-FLUSH: v_pk_mul_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+; GFX9-FLUSH: v_pk_add_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+
+; GFX9-DENORM: v_pk_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+define amdgpu_kernel void @fmul_fadd_contract_v2f16(
+    <2 x half> addrspace(1)* %out,
+    <2 x half> addrspace(1)* %in1,
+    <2 x half> addrspace(1)* %in2,
+    <2 x half> addrspace(1)* %in3) #0 {
+  ; Fetch the two multiplicand vectors and the addend vector.
+  %a = load <2 x half>, <2 x half> addrspace(1)* %in1
+  %b = load <2 x half>, <2 x half> addrspace(1)* %in2
+  %c = load <2 x half>, <2 x half> addrspace(1)* %in3
+  ; The multiply is unflagged; `contract` appears on the add only.
+  %prod = fmul <2 x half> %a, %b
+  %sum = fadd contract <2 x half> %prod, %c
+  store <2 x half> %sum, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+
; GCN-LABEL: {{^}}fmuladd_2.0_a_b_v2f16:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],