loadv4f32, loadv8f32, X86any_Fmadd, v4f32, v8f32,
SchedWriteFMA>;
+// Packed single-precision VFMSUB forms (v4f32/v8f32), selected from X86any_Fmsub.
+// NOTE(review): the X86any_ prefix presumably covers both strict and
+// non-strict nodes -- confirm against the fragment definition.
defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", "PS",
- loadv4f32, loadv8f32, X86Fmsub, v4f32, v8f32,
+ loadv4f32, loadv8f32, X86any_Fmsub, v4f32, v8f32,
SchedWriteFMA>;
defm VFMADDSUB : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps", "PS",
loadv4f32, loadv8f32, X86Fmaddsub, v4f32, v8f32,
loadv2f64, loadv4f64, X86any_Fmadd, v2f64,
v4f64, SchedWriteFMA>, VEX_W;
+// Packed double-precision VFMSUB forms (v2f64/v4f64), selected from X86any_Fmsub.
defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", "PD",
- loadv2f64, loadv4f64, X86Fmsub, v2f64,
+ loadv2f64, loadv4f64, X86any_Fmsub, v2f64,
v4f64, SchedWriteFMA>, VEX_W;
defm VFMADDSUB : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd", "PD",
loadv2f64, loadv4f64, X86Fmaddsub,
// Fused Negative Multiply-Add
+// Packed single-precision VFNMADD and VFNMSUB forms over v4f32/v8f32,
+// selected from X86any_Fnmadd and X86any_Fnmsub respectively.
let ExeDomain = SSEPackedSingle in {
defm VFNMADD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps", "PS", loadv4f32,
- loadv8f32, X86Fnmadd, v4f32, v8f32, SchedWriteFMA>;
+ loadv8f32, X86any_Fnmadd, v4f32, v8f32, SchedWriteFMA>;
defm VFNMSUB : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps", "PS", loadv4f32,
- loadv8f32, X86Fnmsub, v4f32, v8f32, SchedWriteFMA>;
+ loadv8f32, X86any_Fnmsub, v4f32, v8f32, SchedWriteFMA>;
}
+// Packed double-precision VFNMADD and VFNMSUB forms over v2f64/v4f64,
+// selected from X86any_Fnmadd and X86any_Fnmsub respectively.
let ExeDomain = SSEPackedDouble in {
defm VFNMADD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", "PD", loadv2f64,
- loadv4f64, X86Fnmadd, v2f64, v4f64, SchedWriteFMA>, VEX_W;
+ loadv4f64, X86any_Fnmadd, v2f64, v4f64, SchedWriteFMA>, VEX_W;
defm VFNMSUB : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd", "PD", loadv2f64,
- loadv4f64, X86Fnmsub, v2f64, v4f64, SchedWriteFMA>, VEX_W;
+ loadv4f64, X86any_Fnmsub, v2f64, v4f64, SchedWriteFMA>, VEX_W;
}
// All source register operands of FMA opcodes defined in fma3s_rm multiclass
+// Scalar FMA3 definitions. Each instantiates fma3s with its own selection
+// node: X86any_Fmadd, X86any_Fmsub, X86any_Fnmadd or X86any_Fnmsub.
defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", X86any_Fmadd,
SchedWriteFMA.Scl>, VEX_LIG;
-defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", X86Fmsub,
+defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", X86any_Fmsub,
SchedWriteFMA.Scl>, VEX_LIG;
-defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", X86Fnmadd,
+defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", X86any_Fnmadd,
SchedWriteFMA.Scl>, VEX_LIG;
-defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", X86Fnmsub,
+defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", X86any_Fnmsub,
SchedWriteFMA.Scl>, VEX_LIG;
multiclass scalar_fma_patterns<SDNode Op, string Prefix, string Suffix,
}
+// Instantiate scalar_fma_patterns for each scalar FMA3 operation: first the
+// f32 set ("SS", X86Movss, v4f32, FR32), then the f64 set ("SD", X86Movsd,
+// v2f64, FR64).
defm : scalar_fma_patterns<X86any_Fmadd, "VFMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
-defm : scalar_fma_patterns<X86Fmsub, "VFMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
-defm : scalar_fma_patterns<X86Fnmadd, "VFNMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
-defm : scalar_fma_patterns<X86Fnmsub, "VFNMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
+defm : scalar_fma_patterns<X86any_Fmsub, "VFMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
+defm : scalar_fma_patterns<X86any_Fnmadd, "VFNMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
+defm : scalar_fma_patterns<X86any_Fnmsub, "VFNMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
defm : scalar_fma_patterns<X86any_Fmadd, "VFMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
-defm : scalar_fma_patterns<X86Fmsub, "VFMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
-defm : scalar_fma_patterns<X86Fnmadd, "VFNMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
-defm : scalar_fma_patterns<X86Fnmsub, "VFNMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
+defm : scalar_fma_patterns<X86any_Fmsub, "VFMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
+defm : scalar_fma_patterns<X86any_Fnmadd, "VFNMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
+defm : scalar_fma_patterns<X86any_Fnmsub, "VFNMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
//===----------------------------------------------------------------------===//
// FMA4 - AMD 4 operand Fused Multiply-Add instructions
SchedWriteFMA.Scl>,
fma4s_int<0x6A, "vfmaddss", ssmem, v4f32,
SchedWriteFMA.Scl>;
+ // Scalar single-precision FMA4 subtract / negated-multiply operations.
+ // Each combines an fma4s instantiation (FR32, selection node given) with an
+ // fma4s_int instantiation (ssmem, v4f32) that shares the same opcode.
- defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32,
+ defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86any_Fmsub, loadf32,
SchedWriteFMA.Scl>,
fma4s_int<0x6E, "vfmsubss", ssmem, v4f32,
SchedWriteFMA.Scl>;
defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32,
- X86Fnmadd, loadf32, SchedWriteFMA.Scl>,
+ X86any_Fnmadd, loadf32, SchedWriteFMA.Scl>,
fma4s_int<0x7A, "vfnmaddss", ssmem, v4f32,
SchedWriteFMA.Scl>;
defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32,
- X86Fnmsub, loadf32, SchedWriteFMA.Scl>,
+ X86any_Fnmsub, loadf32, SchedWriteFMA.Scl>,
fma4s_int<0x7E, "vfnmsubss", ssmem, v4f32,
SchedWriteFMA.Scl>;
// Packed Instructions
+ // Packed single-precision FMA4 operations over v4f32/v8f32. VFMADDSUBPS4
+ // is selected from X86Fmaddsub; the others use the X86any_* nodes.
defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86any_Fmadd, v4f32, v8f32,
loadv4f32, loadv8f32, SchedWriteFMA>;
- defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32,
+ defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86any_Fmsub, v4f32, v8f32,
loadv4f32, loadv8f32, SchedWriteFMA>;
- defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", X86Fnmadd, v4f32, v8f32,
+ defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", X86any_Fnmadd, v4f32, v8f32,
loadv4f32, loadv8f32, SchedWriteFMA>;
- defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", X86Fnmsub, v4f32, v8f32,
+ defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", X86any_Fnmsub, v4f32, v8f32,
loadv4f32, loadv8f32, SchedWriteFMA>;
defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", X86Fmaddsub, v4f32, v8f32,
loadv4f32, loadv8f32, SchedWriteFMA>;
SchedWriteFMA.Scl>,
fma4s_int<0x6B, "vfmaddsd", sdmem, v2f64,
SchedWriteFMA.Scl>;
+ // Scalar double-precision FMA4 subtract / negated-multiply operations.
+ // Each combines an fma4s instantiation (FR64, selection node given) with an
+ // fma4s_int instantiation (sdmem, v2f64) that shares the same opcode.
- defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64,
+ defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86any_Fmsub, loadf64,
SchedWriteFMA.Scl>,
fma4s_int<0x6F, "vfmsubsd", sdmem, v2f64,
SchedWriteFMA.Scl>;
defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64,
- X86Fnmadd, loadf64, SchedWriteFMA.Scl>,
+ X86any_Fnmadd, loadf64, SchedWriteFMA.Scl>,
fma4s_int<0x7B, "vfnmaddsd", sdmem, v2f64,
SchedWriteFMA.Scl>;
defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64,
- X86Fnmsub, loadf64, SchedWriteFMA.Scl>,
+ X86any_Fnmsub, loadf64, SchedWriteFMA.Scl>,
fma4s_int<0x7F, "vfnmsubsd", sdmem, v2f64,
SchedWriteFMA.Scl>;
// Packed Instructions
+ // Packed double-precision FMA4 operations over v2f64/v4f64. VFMADDSUBPD4
+ // is selected from X86Fmaddsub; the others use the X86any_* nodes.
defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86any_Fmadd, v2f64, v4f64,
loadv2f64, loadv4f64, SchedWriteFMA>;
- defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64,
+ defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86any_Fmsub, v2f64, v4f64,
loadv2f64, loadv4f64, SchedWriteFMA>;
- defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", X86Fnmadd, v2f64, v4f64,
+ defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", X86any_Fnmadd, v2f64, v4f64,
loadv2f64, loadv4f64, SchedWriteFMA>;
- defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", X86Fnmsub, v2f64, v4f64,
+ defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", X86any_Fnmsub, v2f64, v4f64,
loadv2f64, loadv4f64, SchedWriteFMA>;
defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", X86Fmaddsub, v2f64, v4f64,
loadv2f64, loadv4f64, SchedWriteFMA>;
}
+// Instantiate scalar_fma4_patterns for each scalar FMA4 instruction: first the
+// f32 set (*SS4, v4f32, FR32), then the f64 set (*SD4, v2f64, FR64).
defm : scalar_fma4_patterns<X86any_Fmadd, "VFMADDSS4", v4f32, f32, FR32, loadf32>;
-defm : scalar_fma4_patterns<X86Fmsub, "VFMSUBSS4", v4f32, f32, FR32, loadf32>;
-defm : scalar_fma4_patterns<X86Fnmadd, "VFNMADDSS4", v4f32, f32, FR32, loadf32>;
-defm : scalar_fma4_patterns<X86Fnmsub, "VFNMSUBSS4", v4f32, f32, FR32, loadf32>;
+defm : scalar_fma4_patterns<X86any_Fmsub, "VFMSUBSS4", v4f32, f32, FR32, loadf32>;
+defm : scalar_fma4_patterns<X86any_Fnmadd, "VFNMADDSS4", v4f32, f32, FR32, loadf32>;
+defm : scalar_fma4_patterns<X86any_Fnmsub, "VFNMSUBSS4", v4f32, f32, FR32, loadf32>;
defm : scalar_fma4_patterns<X86any_Fmadd, "VFMADDSD4", v2f64, f64, FR64, loadf64>;
-defm : scalar_fma4_patterns<X86Fmsub, "VFMSUBSD4", v2f64, f64, FR64, loadf64>;
-defm : scalar_fma4_patterns<X86Fnmadd, "VFNMADDSD4", v2f64, f64, FR64, loadf64>;
-defm : scalar_fma4_patterns<X86Fnmsub, "VFNMSUBSD4", v2f64, f64, FR64, loadf64>;
+defm : scalar_fma4_patterns<X86any_Fmsub, "VFMSUBSD4", v2f64, f64, FR64, loadf64>;
+defm : scalar_fma4_patterns<X86any_Fnmadd, "VFNMADDSD4", v2f64, f64, FR64, loadf64>;
+defm : scalar_fma4_patterns<X86any_Fnmsub, "VFNMSUBSD4", v2f64, f64, FR64, loadf64>;
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -mtriple=x86_64-pc-linux < %s | FileCheck %s --check-prefixes=COMMON,NOFMA
-; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+fma < %s | FileCheck %s --check-prefixes=COMMON,FMA
-; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+avx512f < %s | FileCheck %s --check-prefixes=COMMON,FMA
+; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+fma < %s | FileCheck %s --check-prefixes=COMMON,FMA,FMA-AVX1
+; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+fma4 < %s | FileCheck %s --check-prefixes=COMMON,FMA4
+; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+avx512f < %s | FileCheck %s --check-prefixes=COMMON,FMA,FMA-AVX512
+
+; Verify that a strict f32 fma with a negated first multiplicand selects
+; FNMADD; without FMA the sign is flipped with xorps before the fmaf libcall.
+define float @f1(float %0, float %1, float %2) #0 {
+; NOFMA-LABEL: f1:
+; NOFMA: # %bb.0: # %entry
+; NOFMA-NEXT: pushq %rax
+; NOFMA-NEXT: .cfi_def_cfa_offset 16
+; NOFMA-NEXT: xorps {{.*}}(%rip), %xmm0
+; NOFMA-NEXT: callq fmaf
+; NOFMA-NEXT: popq %rax
+; NOFMA-NEXT: .cfi_def_cfa_offset 8
+; NOFMA-NEXT: retq
+;
+; FMA-LABEL: f1:
+; FMA: # %bb.0: # %entry
+; FMA-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
+; FMA-NEXT: retq
+;
+; FMA4-LABEL: f1:
+; FMA4: # %bb.0: # %entry
+; FMA4-NEXT: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
+; FMA4-NEXT: retq
+entry:
+ %3 = fneg float %0
+ %result = call float @llvm.experimental.constrained.fma.f32(float %3, float %1, float %2,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret float %result
+}
+
+; Verify that a strict f64 fma with a negated first multiplicand selects
+; FNMADD; without FMA the sign is flipped with xorps before the fma libcall.
+define double @f2(double %0, double %1, double %2) #0 {
+; NOFMA-LABEL: f2:
+; NOFMA: # %bb.0: # %entry
+; NOFMA-NEXT: pushq %rax
+; NOFMA-NEXT: .cfi_def_cfa_offset 16
+; NOFMA-NEXT: xorps {{.*}}(%rip), %xmm0
+; NOFMA-NEXT: callq fma
+; NOFMA-NEXT: popq %rax
+; NOFMA-NEXT: .cfi_def_cfa_offset 8
+; NOFMA-NEXT: retq
+;
+; FMA-LABEL: f2:
+; FMA: # %bb.0: # %entry
+; FMA-NEXT: vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
+; FMA-NEXT: retq
+;
+; FMA4-LABEL: f2:
+; FMA4: # %bb.0: # %entry
+; FMA4-NEXT: vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0
+; FMA4-NEXT: retq
+entry:
+ %3 = fneg double %0
+ %result = call double @llvm.experimental.constrained.fma.f64(double %3, double %1, double %2,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret double %result
+}
+
+; Verify that a strict f32 fma with a negated addend selects FMSUB; without
+; FMA the addend's sign is flipped with xorps before the fmaf libcall.
+define float @f3(float %0, float %1, float %2) #0 {
+; NOFMA-LABEL: f3:
+; NOFMA: # %bb.0: # %entry
+; NOFMA-NEXT: pushq %rax
+; NOFMA-NEXT: .cfi_def_cfa_offset 16
+; NOFMA-NEXT: xorps {{.*}}(%rip), %xmm2
+; NOFMA-NEXT: callq fmaf
+; NOFMA-NEXT: popq %rax
+; NOFMA-NEXT: .cfi_def_cfa_offset 8
+; NOFMA-NEXT: retq
+;
+; FMA-LABEL: f3:
+; FMA: # %bb.0: # %entry
+; FMA-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
+; FMA-NEXT: retq
+;
+; FMA4-LABEL: f3:
+; FMA4: # %bb.0: # %entry
+; FMA4-NEXT: vfmsubss %xmm2, %xmm1, %xmm0, %xmm0
+; FMA4-NEXT: retq
+entry:
+ %3 = fneg float %2
+ %result = call float @llvm.experimental.constrained.fma.f32(float %0, float %1, float %3,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret float %result
+}
+
+; Verify that a strict f64 fma with a negated addend selects FMSUB; without
+; FMA the addend's sign is flipped with xorps before the fma libcall.
+define double @f4(double %0, double %1, double %2) #0 {
+; NOFMA-LABEL: f4:
+; NOFMA: # %bb.0: # %entry
+; NOFMA-NEXT: pushq %rax
+; NOFMA-NEXT: .cfi_def_cfa_offset 16
+; NOFMA-NEXT: xorps {{.*}}(%rip), %xmm2
+; NOFMA-NEXT: callq fma
+; NOFMA-NEXT: popq %rax
+; NOFMA-NEXT: .cfi_def_cfa_offset 8
+; NOFMA-NEXT: retq
+;
+; FMA-LABEL: f4:
+; FMA: # %bb.0: # %entry
+; FMA-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
+; FMA-NEXT: retq
+;
+; FMA4-LABEL: f4:
+; FMA4: # %bb.0: # %entry
+; FMA4-NEXT: vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0
+; FMA4-NEXT: retq
+entry:
+ %3 = fneg double %2
+ %result = call double @llvm.experimental.constrained.fma.f64(double %0, double %1, double %3,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret double %result
+}
+
+; Verify that a strict f32 fma with both the first multiplicand and the addend
+; negated selects FNMSUB.
+define float @f5(float %0, float %1, float %2) #0 {
+; NOFMA-LABEL: f5:
+; NOFMA: # %bb.0: # %entry
+; NOFMA-NEXT: pushq %rax
+; NOFMA-NEXT: .cfi_def_cfa_offset 16
+; NOFMA-NEXT: movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; NOFMA-NEXT: xorps %xmm3, %xmm0
+; NOFMA-NEXT: xorps %xmm3, %xmm2
+; NOFMA-NEXT: callq fmaf
+; NOFMA-NEXT: popq %rax
+; NOFMA-NEXT: .cfi_def_cfa_offset 8
+; NOFMA-NEXT: retq
+;
+; FMA-LABEL: f5:
+; FMA: # %bb.0: # %entry
+; FMA-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
+; FMA-NEXT: retq
+;
+; FMA4-LABEL: f5:
+; FMA4: # %bb.0: # %entry
+; FMA4-NEXT: vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0
+; FMA4-NEXT: retq
+entry:
+ %3 = fneg float %0
+ %4 = fneg float %2
+ %result = call float @llvm.experimental.constrained.fma.f32(float %3, float %1, float %4,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret float %result
+}
+
+; Verify that a strict f64 fma with both the first multiplicand and the addend
+; negated selects FNMSUB.
+define double @f6(double %0, double %1, double %2) #0 {
+; NOFMA-LABEL: f6:
+; NOFMA: # %bb.0: # %entry
+; NOFMA-NEXT: pushq %rax
+; NOFMA-NEXT: .cfi_def_cfa_offset 16
+; NOFMA-NEXT: movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0]
+; NOFMA-NEXT: xorps %xmm3, %xmm0
+; NOFMA-NEXT: xorps %xmm3, %xmm2
+; NOFMA-NEXT: callq fma
+; NOFMA-NEXT: popq %rax
+; NOFMA-NEXT: .cfi_def_cfa_offset 8
+; NOFMA-NEXT: retq
+;
+; FMA-LABEL: f6:
+; FMA: # %bb.0: # %entry
+; FMA-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
+; FMA-NEXT: retq
+;
+; FMA4-LABEL: f6:
+; FMA4: # %bb.0: # %entry
+; FMA4-NEXT: vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0
+; FMA4-NEXT: retq
+entry:
+ %3 = fneg double %0
+ %4 = fneg double %2
+ %result = call double @llvm.experimental.constrained.fma.f64(double %3, double %1, double %4,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret double %result
+}
+
+; Verify that negating the result of a strict f32 fma is not folded into the
+; FMA opcode: a plain FMADD is emitted, followed by a separate xor sign flip.
+define float @f7(float %0, float %1, float %2) #0 {
+; NOFMA-LABEL: f7:
+; NOFMA: # %bb.0: # %entry
+; NOFMA-NEXT: pushq %rax
+; NOFMA-NEXT: .cfi_def_cfa_offset 16
+; NOFMA-NEXT: callq fmaf
+; NOFMA-NEXT: xorps {{.*}}(%rip), %xmm0
+; NOFMA-NEXT: popq %rax
+; NOFMA-NEXT: .cfi_def_cfa_offset 8
+; NOFMA-NEXT: retq
+;
+; FMA-AVX1-LABEL: f7:
+; FMA-AVX1: # %bb.0: # %entry
+; FMA-AVX1-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
+; FMA-AVX1-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
+; FMA-AVX1-NEXT: retq
+;
+; FMA4-LABEL: f7:
+; FMA4: # %bb.0: # %entry
+; FMA4-NEXT: vfmaddss %xmm2, %xmm1, %xmm0, %xmm0
+; FMA4-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
+; FMA4-NEXT: retq
+;
+; FMA-AVX512-LABEL: f7:
+; FMA-AVX512: # %bb.0: # %entry
+; FMA-AVX512-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
+; FMA-AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; FMA-AVX512-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; FMA-AVX512-NEXT: retq
+entry:
+ %3 = call float @llvm.experimental.constrained.fma.f32(float %0, float %1, float %2,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ %result = fneg float %3
+ ret float %result
+}
+
+; Verify that negating the result of a strict f64 fma is not folded into the
+; FMA opcode: a plain FMADD is emitted, followed by a separate xor sign flip.
+define double @f8(double %0, double %1, double %2) #0 {
+; NOFMA-LABEL: f8:
+; NOFMA: # %bb.0: # %entry
+; NOFMA-NEXT: pushq %rax
+; NOFMA-NEXT: .cfi_def_cfa_offset 16
+; NOFMA-NEXT: callq fma
+; NOFMA-NEXT: xorps {{.*}}(%rip), %xmm0
+; NOFMA-NEXT: popq %rax
+; NOFMA-NEXT: .cfi_def_cfa_offset 8
+; NOFMA-NEXT: retq
+;
+; FMA-LABEL: f8:
+; FMA: # %bb.0: # %entry
+; FMA-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
+; FMA-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0
+; FMA-NEXT: retq
+;
+; FMA4-LABEL: f8:
+; FMA4: # %bb.0: # %entry
+; FMA4-NEXT: vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0
+; FMA4-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0
+; FMA4-NEXT: retq
+entry:
+ %3 = call double @llvm.experimental.constrained.fma.f64(double %0, double %1, double %2,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ %result = fneg double %3
+ ret double %result
+}
+
+; Verify that, for a strict f32 fma, the operand negations fold into FNMSUB
+; while the negation of the result stays a separate xor sign flip.
+define float @f9(float %0, float %1, float %2) #0 {
+; NOFMA-LABEL: f9:
+; NOFMA: # %bb.0: # %entry
+; NOFMA-NEXT: pushq %rax
+; NOFMA-NEXT: .cfi_def_cfa_offset 16
+; NOFMA-NEXT: movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; NOFMA-NEXT: xorps %xmm3, %xmm0
+; NOFMA-NEXT: xorps %xmm3, %xmm2
+; NOFMA-NEXT: callq fmaf
+; NOFMA-NEXT: xorps {{.*}}(%rip), %xmm0
+; NOFMA-NEXT: popq %rax
+; NOFMA-NEXT: .cfi_def_cfa_offset 8
+; NOFMA-NEXT: retq
+;
+; FMA-AVX1-LABEL: f9:
+; FMA-AVX1: # %bb.0: # %entry
+; FMA-AVX1-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
+; FMA-AVX1-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
+; FMA-AVX1-NEXT: retq
+;
+; FMA4-LABEL: f9:
+; FMA4: # %bb.0: # %entry
+; FMA4-NEXT: vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0
+; FMA4-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
+; FMA4-NEXT: retq
+;
+; FMA-AVX512-LABEL: f9:
+; FMA-AVX512: # %bb.0: # %entry
+; FMA-AVX512-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
+; FMA-AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; FMA-AVX512-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; FMA-AVX512-NEXT: retq
+entry:
+ %3 = fneg float %0
+ %4 = fneg float %2
+ %5 = call float @llvm.experimental.constrained.fma.f32(float %3, float %1, float %4,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ %result = fneg float %5
+ ret float %result
+}
+
+; Verify that, for a strict f64 fma, the operand negations fold into FNMSUB
+; while the negation of the result stays a separate xor sign flip.
+define double @f10(double %0, double %1, double %2) #0 {
+; NOFMA-LABEL: f10:
+; NOFMA: # %bb.0: # %entry
+; NOFMA-NEXT: pushq %rax
+; NOFMA-NEXT: .cfi_def_cfa_offset 16
+; NOFMA-NEXT: movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0]
+; NOFMA-NEXT: xorps %xmm3, %xmm0
+; NOFMA-NEXT: xorps %xmm3, %xmm2
+; NOFMA-NEXT: callq fma
+; NOFMA-NEXT: xorps {{.*}}(%rip), %xmm0
+; NOFMA-NEXT: popq %rax
+; NOFMA-NEXT: .cfi_def_cfa_offset 8
+; NOFMA-NEXT: retq
+;
+; FMA-LABEL: f10:
+; FMA: # %bb.0: # %entry
+; FMA-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
+; FMA-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0
+; FMA-NEXT: retq
+;
+; FMA4-LABEL: f10:
+; FMA4: # %bb.0: # %entry
+; FMA4-NEXT: vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0
+; FMA4-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0
+; FMA4-NEXT: retq
+entry:
+ %3 = fneg double %0
+ %4 = fneg double %2
+ %5 = call double @llvm.experimental.constrained.fma.f64(double %3, double %1, double %4,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ %result = fneg double %5
+ ret double %result
+}
; Verify that fma(3.5) isn't simplified when the rounding mode is
; unknown.
; FMA-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; FMA-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
; FMA-NEXT: retq
+;
+; FMA4-LABEL: f17:
+; FMA4: # %bb.0: # %entry
+; FMA4-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; FMA4-NEXT: vfmaddss %xmm0, %xmm0, %xmm0, %xmm0
+; FMA4-NEXT: retq
entry:
%result = call float @llvm.experimental.constrained.fma.f32(
float 3.5,
; FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; FMA-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
; FMA-NEXT: retq
+;
+; FMA4-LABEL: f18:
+; FMA4: # %bb.0: # %entry
+; FMA4-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; FMA4-NEXT: vfmaddsd %xmm0, %xmm0, %xmm0, %xmm0
+; FMA4-NEXT: retq
entry:
%result = call double @llvm.experimental.constrained.fma.f64(
double 42.1,
ret double %result
}
+; Verify that a strict v4f32 fma with a negated first multiplicand selects
+; packed FNMADD; without FMA the checks expect four scalar fmaf libcalls.
+define <4 x float> @f19(<4 x float> %0, <4 x float> %1, <4 x float> %2) #0 {
+; NOFMA-LABEL: f19:
+; NOFMA: # %bb.0: # %entry
+; NOFMA-NEXT: subq $88, %rsp
+; NOFMA-NEXT: .cfi_def_cfa_offset 96
+; NOFMA-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NOFMA-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NOFMA-NEXT: pxor {{.*}}(%rip), %xmm0
+; NOFMA-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NOFMA-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; NOFMA-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; NOFMA-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; NOFMA-NEXT: callq fmaf
+; NOFMA-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; NOFMA-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
+; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; NOFMA-NEXT: # xmm0 = mem[2,3,2,3]
+; NOFMA-NEXT: callq fmaf
+; NOFMA-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload
+; NOFMA-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; NOFMA-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; NOFMA-NEXT: callq fmaf
+; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NOFMA-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; NOFMA-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; NOFMA-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; NOFMA-NEXT: # xmm0 = mem[1,1,2,3]
+; NOFMA-NEXT: callq fmaf
+; NOFMA-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NOFMA-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; NOFMA-NEXT: punpcklqdq (%rsp), %xmm1 # 16-byte Folded Reload
+; NOFMA-NEXT: # xmm1 = xmm1[0],mem[0]
+; NOFMA-NEXT: movdqa %xmm1, %xmm0
+; NOFMA-NEXT: addq $88, %rsp
+; NOFMA-NEXT: .cfi_def_cfa_offset 8
+; NOFMA-NEXT: retq
+;
+; FMA-LABEL: f19:
+; FMA: # %bb.0: # %entry
+; FMA-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
+; FMA-NEXT: retq
+;
+; FMA4-LABEL: f19:
+; FMA4: # %bb.0: # %entry
+; FMA4-NEXT: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0
+; FMA4-NEXT: retq
+entry:
+ %3 = fneg <4 x float> %0
+ %result = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %3, <4 x float> %1, <4 x float> %2,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret <4 x float> %result
+}
+
+; Verify that a strict v2f64 fma with a negated first multiplicand selects
+; packed FNMADD; without FMA the checks expect two scalar fma libcalls.
+define <2 x double> @f20(<2 x double> %0, <2 x double> %1, <2 x double> %2) #0 {
+; NOFMA-LABEL: f20:
+; NOFMA: # %bb.0: # %entry
+; NOFMA-NEXT: subq $72, %rsp
+; NOFMA-NEXT: .cfi_def_cfa_offset 80
+; NOFMA-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NOFMA-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
+; NOFMA-NEXT: xorps {{.*}}(%rip), %xmm0
+; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NOFMA-NEXT: callq fma
+; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NOFMA-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
+; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; NOFMA-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
+; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; NOFMA-NEXT: # xmm0 = mem[2,3,2,3]
+; NOFMA-NEXT: callq fma
+; NOFMA-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NOFMA-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; NOFMA-NEXT: movdqa %xmm1, %xmm0
+; NOFMA-NEXT: addq $72, %rsp
+; NOFMA-NEXT: .cfi_def_cfa_offset 8
+; NOFMA-NEXT: retq
+;
+; FMA-LABEL: f20:
+; FMA: # %bb.0: # %entry
+; FMA-NEXT: vfnmadd213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
+; FMA-NEXT: retq
+;
+; FMA4-LABEL: f20:
+; FMA4: # %bb.0: # %entry
+; FMA4-NEXT: vfnmaddpd %xmm2, %xmm1, %xmm0, %xmm0
+; FMA4-NEXT: retq
+entry:
+ %3 = fneg <2 x double> %0
+ %result = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %3, <2 x double> %1, <2 x double> %2,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret <2 x double> %result
+}
+
+; Verify that a strict v4f32 fma with a negated addend selects packed FMSUB;
+; without FMA the checks expect four scalar fmaf libcalls.
+define <4 x float> @f21(<4 x float> %0, <4 x float> %1, <4 x float> %2) #0 {
+; NOFMA-LABEL: f21:
+; NOFMA: # %bb.0: # %entry
+; NOFMA-NEXT: subq $88, %rsp
+; NOFMA-NEXT: .cfi_def_cfa_offset 96
+; NOFMA-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NOFMA-NEXT: pxor {{.*}}(%rip), %xmm2
+; NOFMA-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NOFMA-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; NOFMA-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; NOFMA-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; NOFMA-NEXT: callq fmaf
+; NOFMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; NOFMA-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; NOFMA-NEXT: # xmm2 = mem[2,3,2,3]
+; NOFMA-NEXT: callq fmaf
+; NOFMA-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
+; NOFMA-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; NOFMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; NOFMA-NEXT: callq fmaf
+; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; NOFMA-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NOFMA-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; NOFMA-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; NOFMA-NEXT: # xmm2 = mem[1,1,2,3]
+; NOFMA-NEXT: callq fmaf
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NOFMA-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; NOFMA-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload
+; NOFMA-NEXT: # xmm1 = xmm1[0],mem[0]
+; NOFMA-NEXT: movaps %xmm1, %xmm0
+; NOFMA-NEXT: addq $88, %rsp
+; NOFMA-NEXT: .cfi_def_cfa_offset 8
+; NOFMA-NEXT: retq
+;
+; FMA-LABEL: f21:
+; FMA: # %bb.0: # %entry
+; FMA-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
+; FMA-NEXT: retq
+;
+; FMA4-LABEL: f21:
+; FMA4: # %bb.0: # %entry
+; FMA4-NEXT: vfmsubps %xmm2, %xmm1, %xmm0, %xmm0
+; FMA4-NEXT: retq
+entry:
+ %3 = fneg <4 x float> %2
+ %result = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %3,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret <4 x float> %result
+}
+
+; Verify that a strict v2f64 fma with a negated addend selects packed FMSUB;
+; without FMA the checks expect two scalar fma libcalls.
+define <2 x double> @f22(<2 x double> %0, <2 x double> %1, <2 x double> %2) #0 {
+; NOFMA-LABEL: f22:
+; NOFMA: # %bb.0: # %entry
+; NOFMA-NEXT: subq $72, %rsp
+; NOFMA-NEXT: .cfi_def_cfa_offset 80
+; NOFMA-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NOFMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; NOFMA-NEXT: xorps {{.*}}(%rip), %xmm2
+; NOFMA-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NOFMA-NEXT: callq fma
+; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NOFMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; NOFMA-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; NOFMA-NEXT: # xmm2 = mem[2,3,2,3]
+; NOFMA-NEXT: callq fma
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NOFMA-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; NOFMA-NEXT: movaps %xmm1, %xmm0
+; NOFMA-NEXT: addq $72, %rsp
+; NOFMA-NEXT: .cfi_def_cfa_offset 8
+; NOFMA-NEXT: retq
+;
+; FMA-LABEL: f22:
+; FMA: # %bb.0: # %entry
+; FMA-NEXT: vfmsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
+; FMA-NEXT: retq
+;
+; FMA4-LABEL: f22:
+; FMA4: # %bb.0: # %entry
+; FMA4-NEXT: vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0
+; FMA4-NEXT: retq
+entry:
+ %3 = fneg <2 x double> %2
+ %result = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %0, <2 x double> %1, <2 x double> %3,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret <2 x double> %result
+}
+
+; Verify that a strict v4f32 fma with both the first multiplicand and the
+; addend negated selects packed FNMSUB; without FMA the checks expect four
+; scalar fmaf libcalls.
+define <4 x float> @f23(<4 x float> %0, <4 x float> %1, <4 x float> %2) #0 {
+; NOFMA-LABEL: f23:
+; NOFMA: # %bb.0: # %entry
+; NOFMA-NEXT: subq $88, %rsp
+; NOFMA-NEXT: .cfi_def_cfa_offset 96
+; NOFMA-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NOFMA-NEXT: movdqa {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; NOFMA-NEXT: pxor %xmm3, %xmm0
+; NOFMA-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NOFMA-NEXT: pxor %xmm3, %xmm2
+; NOFMA-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NOFMA-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; NOFMA-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; NOFMA-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; NOFMA-NEXT: callq fmaf
+; NOFMA-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; NOFMA-NEXT: # xmm0 = mem[2,3,2,3]
+; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; NOFMA-NEXT: # xmm2 = mem[2,3,2,3]
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; NOFMA-NEXT: callq fmaf
+; NOFMA-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload
+; NOFMA-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; NOFMA-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; NOFMA-NEXT: callq fmaf
+; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NOFMA-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; NOFMA-NEXT: # xmm0 = mem[1,1,2,3]
+; NOFMA-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; NOFMA-NEXT: # xmm2 = mem[1,1,2,3]
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NOFMA-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; NOFMA-NEXT: callq fmaf
+; NOFMA-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NOFMA-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; NOFMA-NEXT: punpcklqdq (%rsp), %xmm1 # 16-byte Folded Reload
+; NOFMA-NEXT: # xmm1 = xmm1[0],mem[0]
+; NOFMA-NEXT: movdqa %xmm1, %xmm0
+; NOFMA-NEXT: addq $88, %rsp
+; NOFMA-NEXT: .cfi_def_cfa_offset 8
+; NOFMA-NEXT: retq
+;
+; FMA-LABEL: f23:
+; FMA: # %bb.0: # %entry
+; FMA-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
+; FMA-NEXT: retq
+;
+; FMA4-LABEL: f23:
+; FMA4: # %bb.0: # %entry
+; FMA4-NEXT: vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0
+; FMA4-NEXT: retq
+entry:
+ %3 = fneg <4 x float> %0
+ %4 = fneg <4 x float> %2
+ %result = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %3, <4 x float> %1, <4 x float> %4,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret <4 x float> %result
+}
+
+; Verify that a strict v2f64 fma with both the first multiplicand and the
+; addend negated selects packed FNMSUB; without FMA the checks expect two
+; scalar fma libcalls.
+define <2 x double> @f24(<2 x double> %0, <2 x double> %1, <2 x double> %2) #0 {
+; NOFMA-LABEL: f24:
+; NOFMA: # %bb.0: # %entry
+; NOFMA-NEXT: subq $72, %rsp
+; NOFMA-NEXT: .cfi_def_cfa_offset 80
+; NOFMA-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NOFMA-NEXT: movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0]
+; NOFMA-NEXT: xorps %xmm3, %xmm0
+; NOFMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; NOFMA-NEXT: xorps %xmm3, %xmm2
+; NOFMA-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NOFMA-NEXT: callq fma
+; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NOFMA-NEXT: pshufd $238, (%rsp), %xmm0 # 16-byte Folded Reload
+; NOFMA-NEXT: # xmm0 = mem[2,3,2,3]
+; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; NOFMA-NEXT: # xmm2 = mem[2,3,2,3]
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; NOFMA-NEXT: callq fma
+; NOFMA-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NOFMA-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; NOFMA-NEXT: movdqa %xmm1, %xmm0
+; NOFMA-NEXT: addq $72, %rsp
+; NOFMA-NEXT: .cfi_def_cfa_offset 8
+; NOFMA-NEXT: retq
+;
+; FMA-LABEL: f24:
+; FMA: # %bb.0: # %entry
+; FMA-NEXT: vfnmsub213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
+; FMA-NEXT: retq
+;
+; FMA4-LABEL: f24:
+; FMA4: # %bb.0: # %entry
+; FMA4-NEXT: vfnmsubpd %xmm2, %xmm1, %xmm0, %xmm0
+; FMA4-NEXT: retq
+entry:
+ %3 = fneg <2 x double> %0
+ %4 = fneg <2 x double> %2
+ %result = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %3, <2 x double> %1, <2 x double> %4,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret <2 x double> %result
+}
+
+define <4 x float> @f25(<4 x float> %0, <4 x float> %1, <4 x float> %2) #0 { ; strict fma(%0, %1, %2) on <4 x float> whose result is then negated; expected asm is a plain fused multiply-add (or four fmaf calls) followed by a separate xor
+; NOFMA-LABEL: f25:
+; NOFMA: # %bb.0: # %entry
+; NOFMA-NEXT: subq $88, %rsp
+; NOFMA-NEXT: .cfi_def_cfa_offset 96
+; NOFMA-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NOFMA-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NOFMA-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; NOFMA-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; NOFMA-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; NOFMA-NEXT: callq fmaf
+; NOFMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; NOFMA-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; NOFMA-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
+; NOFMA-NEXT: callq fmaf
+; NOFMA-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
+; NOFMA-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; NOFMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; NOFMA-NEXT: callq fmaf
+; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; NOFMA-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NOFMA-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; NOFMA-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; NOFMA-NEXT: callq fmaf
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NOFMA-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; NOFMA-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload
+; NOFMA-NEXT: # xmm1 = xmm1[0],mem[0]
+; NOFMA-NEXT: xorps {{.*}}(%rip), %xmm1
+; NOFMA-NEXT: movaps %xmm1, %xmm0
+; NOFMA-NEXT: addq $88, %rsp
+; NOFMA-NEXT: .cfi_def_cfa_offset 8
+; NOFMA-NEXT: retq
+;
+; FMA-AVX1-LABEL: f25:
+; FMA-AVX1: # %bb.0: # %entry
+; FMA-AVX1-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
+; FMA-AVX1-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
+; FMA-AVX1-NEXT: retq
+;
+; FMA4-LABEL: f25:
+; FMA4: # %bb.0: # %entry
+; FMA4-NEXT: vfmaddps %xmm2, %xmm1, %xmm0, %xmm0
+; FMA4-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
+; FMA4-NEXT: retq
+;
+; FMA-AVX512-LABEL: f25:
+; FMA-AVX512: # %bb.0: # %entry
+; FMA-AVX512-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
+; FMA-AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; FMA-AVX512-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; FMA-AVX512-NEXT: retq
+entry:
+ %3 = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, ; (%0 * %1) + %2 with no operand negated
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ %result = fneg <4 x float> %3 ; negation of the fma result; the expected asm keeps it as a separate xor (presumably because folding it would change rounding under round.dynamic -- TODO confirm)
+ ret <4 x float> %result
+}
+
+define <2 x double> @f26(<2 x double> %0, <2 x double> %1, <2 x double> %2) #0 { ; strict fma(%0, %1, %2) on <2 x double> whose result is then negated; expected asm is a plain fused multiply-add (or two fma calls) followed by a separate xor
+; NOFMA-LABEL: f26:
+; NOFMA: # %bb.0: # %entry
+; NOFMA-NEXT: subq $72, %rsp
+; NOFMA-NEXT: .cfi_def_cfa_offset 80
+; NOFMA-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NOFMA-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NOFMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; NOFMA-NEXT: callq fma
+; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NOFMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; NOFMA-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; NOFMA-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
+; NOFMA-NEXT: callq fma
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NOFMA-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; NOFMA-NEXT: xorps {{.*}}(%rip), %xmm1
+; NOFMA-NEXT: movaps %xmm1, %xmm0
+; NOFMA-NEXT: addq $72, %rsp
+; NOFMA-NEXT: .cfi_def_cfa_offset 8
+; NOFMA-NEXT: retq
+;
+; FMA-LABEL: f26:
+; FMA: # %bb.0: # %entry
+; FMA-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
+; FMA-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0
+; FMA-NEXT: retq
+;
+; FMA4-LABEL: f26:
+; FMA4: # %bb.0: # %entry
+; FMA4-NEXT: vfmaddpd %xmm2, %xmm1, %xmm0, %xmm0
+; FMA4-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0
+; FMA4-NEXT: retq
+entry:
+ %3 = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %0, <2 x double> %1, <2 x double> %2, ; (%0 * %1) + %2 with no operand negated
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ %result = fneg <2 x double> %3 ; negation of the fma result; the expected asm keeps it as a separate xor (presumably because folding it would change rounding under round.dynamic -- TODO confirm)
+ ret <2 x double> %result
+}
+
+define <4 x float> @f27(<4 x float> %0, <4 x float> %1, <4 x float> %2) #0 { ; strict fma(-%0, %1, -%2) on <4 x float> whose result is then negated; expected asm folds the operand negations into vfnmsub (or four fmaf calls on pre-negated inputs) and keeps the result negation as a separate xor
+; NOFMA-LABEL: f27:
+; NOFMA: # %bb.0: # %entry
+; NOFMA-NEXT: subq $88, %rsp
+; NOFMA-NEXT: .cfi_def_cfa_offset 96
+; NOFMA-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NOFMA-NEXT: movdqa {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; NOFMA-NEXT: pxor %xmm3, %xmm0
+; NOFMA-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NOFMA-NEXT: pxor %xmm3, %xmm2
+; NOFMA-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NOFMA-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; NOFMA-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; NOFMA-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; NOFMA-NEXT: callq fmaf
+; NOFMA-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; NOFMA-NEXT: # xmm0 = mem[2,3,2,3]
+; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; NOFMA-NEXT: # xmm2 = mem[2,3,2,3]
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; NOFMA-NEXT: callq fmaf
+; NOFMA-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload
+; NOFMA-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; NOFMA-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; NOFMA-NEXT: callq fmaf
+; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NOFMA-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; NOFMA-NEXT: # xmm0 = mem[1,1,2,3]
+; NOFMA-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; NOFMA-NEXT: # xmm2 = mem[1,1,2,3]
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NOFMA-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; NOFMA-NEXT: callq fmaf
+; NOFMA-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NOFMA-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; NOFMA-NEXT: punpcklqdq (%rsp), %xmm1 # 16-byte Folded Reload
+; NOFMA-NEXT: # xmm1 = xmm1[0],mem[0]
+; NOFMA-NEXT: pxor {{.*}}(%rip), %xmm1
+; NOFMA-NEXT: movdqa %xmm1, %xmm0
+; NOFMA-NEXT: addq $88, %rsp
+; NOFMA-NEXT: .cfi_def_cfa_offset 8
+; NOFMA-NEXT: retq
+;
+; FMA-AVX1-LABEL: f27:
+; FMA-AVX1: # %bb.0: # %entry
+; FMA-AVX1-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
+; FMA-AVX1-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
+; FMA-AVX1-NEXT: retq
+;
+; FMA4-LABEL: f27:
+; FMA4: # %bb.0: # %entry
+; FMA4-NEXT: vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0
+; FMA4-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
+; FMA4-NEXT: retq
+;
+; FMA-AVX512-LABEL: f27:
+; FMA-AVX512: # %bb.0: # %entry
+; FMA-AVX512-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
+; FMA-AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; FMA-AVX512-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; FMA-AVX512-NEXT: retq
+entry:
+ %3 = fneg <4 x float> %0 ; negated first multiplicand
+ %4 = fneg <4 x float> %2 ; negated addend
+ %5 = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %3, <4 x float> %1, <4 x float> %4, ; (-%0 * %1) + (-%2)
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ %result = fneg <4 x float> %5 ; negation of the fma result; the expected asm keeps it as a separate xor after the vfnmsub
+ ret <4 x float> %result
+}
+
+define <2 x double> @f28(<2 x double> %0, <2 x double> %1, <2 x double> %2) #0 { ; strict fma(-%0, %1, -%2) on <2 x double> whose result is then negated; expected asm folds the operand negations into vfnmsub (or two fma calls on pre-negated inputs) and keeps the result negation as a separate xor
+; NOFMA-LABEL: f28:
+; NOFMA: # %bb.0: # %entry
+; NOFMA-NEXT: subq $72, %rsp
+; NOFMA-NEXT: .cfi_def_cfa_offset 80
+; NOFMA-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NOFMA-NEXT: movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0]
+; NOFMA-NEXT: xorps %xmm3, %xmm0
+; NOFMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; NOFMA-NEXT: xorps %xmm3, %xmm2
+; NOFMA-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NOFMA-NEXT: callq fma
+; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NOFMA-NEXT: pshufd $238, (%rsp), %xmm0 # 16-byte Folded Reload
+; NOFMA-NEXT: # xmm0 = mem[2,3,2,3]
+; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; NOFMA-NEXT: # xmm2 = mem[2,3,2,3]
+; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; NOFMA-NEXT: callq fma
+; NOFMA-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; NOFMA-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; NOFMA-NEXT: pxor {{.*}}(%rip), %xmm1
+; NOFMA-NEXT: movdqa %xmm1, %xmm0
+; NOFMA-NEXT: addq $72, %rsp
+; NOFMA-NEXT: .cfi_def_cfa_offset 8
+; NOFMA-NEXT: retq
+;
+; FMA-LABEL: f28:
+; FMA: # %bb.0: # %entry
+; FMA-NEXT: vfnmsub213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
+; FMA-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0
+; FMA-NEXT: retq
+;
+; FMA4-LABEL: f28:
+; FMA4: # %bb.0: # %entry
+; FMA4-NEXT: vfnmsubpd %xmm2, %xmm1, %xmm0, %xmm0
+; FMA4-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0
+; FMA4-NEXT: retq
+entry:
+ %3 = fneg <2 x double> %0 ; negated first multiplicand
+ %4 = fneg <2 x double> %2 ; negated addend
+ %5 = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %3, <2 x double> %1, <2 x double> %4, ; (-%0 * %1) + (-%2)
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ %result = fneg <2 x double> %5 ; negation of the fma result; the expected asm keeps it as a separate xor after the vfnmsub
+ ret <2 x double> %result
+}
+
attributes #0 = { strictfp }
declare float @llvm.experimental.constrained.fma.f32(float, float, float, metadata, metadata)
declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata)
+declare <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, metadata, metadata)
+declare <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double>, <2 x double>, <2 x double>, metadata, metadata)