From fdf3f1ff82501bd6d0dd99a72f406cbc195909db Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Sun, 8 Jul 2018 01:10:43 +0000
Subject: [PATCH] [X86] Add new scalar fma intrinsics with rounding mode that use f32/f64 types.

This allows us to handle masking in a very similar way to the default
rounding version that uses llvm.fma.

I had to add new rounding-mode CodeGenOnly instructions to support isel
when we can't find a movss from which to grab the upper bits for the
b_Int instruction.

Fast-isel tests have been updated to match the new clang codegen.

We are currently having trouble folding fneg into the new intrinsic.
I'm going to correct that in a follow-up patch to keep the size of this
one down.

A future patch will also remove the old intrinsics.

llvm-svn: 336506
---
 llvm/include/llvm/IR/IntrinsicsX86.td              |  39 ++-
 llvm/lib/Target/X86/X86InstrAVX512.td              | 221 ++++++++++---
 llvm/lib/Target/X86/X86InstrFMA3Info.cpp           |  16 +-
 llvm/lib/Target/X86/X86IntrinsicsInfo.h            |   8 +-
 .../CodeGen/X86/avx512-intrinsics-fast-isel.ll     | 364 +++++++++++++++------
 5 files changed, 480 insertions(+), 168 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index aa25693..b3b0945 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -1924,63 +1924,62 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
                     [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i32_ty],
                     [IntrNoMem]>;
+  def int_x86_avx512_vfmadd_f64 :
+          Intrinsic<[llvm_double_ty],
+                    [llvm_double_ty, llvm_double_ty, llvm_double_ty, llvm_i32_ty],
+                    [IntrNoMem]>;
+  def int_x86_avx512_vfmadd_f32 :
+          Intrinsic<[llvm_float_ty],
+                    [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i32_ty],
+                    [IntrNoMem]>;
+
-  def int_x86_avx512_mask_vfmadd_sd :
-          GCCBuiltin<"__builtin_ia32_vfmaddsd3_mask">,
+  def int_x86_avx512_mask_vfmadd_sd : // FIXME: Remove
           Intrinsic<[llvm_v2f64_ty],
                     [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty],
[IntrNoMem]>; - def int_x86_avx512_mask3_vfmsub_ss : - GCCBuiltin<"__builtin_ia32_vfmsubss3_mask3">, + def int_x86_avx512_mask3_vfmsub_ss : // FIXME: Remove Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask3_vfnmsub_sd : - GCCBuiltin<"__builtin_ia32_vfnmsubsd3_mask3">, + def int_x86_avx512_mask3_vfnmsub_sd : // FIXME: Remove Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask3_vfnmsub_ss : - GCCBuiltin<"__builtin_ia32_vfnmsubss3_mask3">, + def int_x86_avx512_mask3_vfnmsub_ss : // FIXME: Remove Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index e8301b9..ff5bcef 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -6751,7 +6751,7 @@ defm VFNMSUB132 : avx512_fma3p_132_f<0x9E, "vfnmsub132", X86Fnmsub, X86FnmsubR // Scalar FMA multiclass avx512_fma3s_common opc, string OpcodeStr, X86VectorVTInfo _, dag RHS_VEC_r, dag RHS_VEC_m, dag RHS_VEC_rb, - dag RHS_r, dag RHS_m, bit MaskOnlyReg> { + dag RHS_r, dag RHS_m, dag RHS_b, bit MaskOnlyReg> { let Constraints = "$src1 = $dst", hasSideEffects = 0 in { defm r_Int: AVX512_maskable_3src_scalar, Sched<[SchedWriteFMA.Scl.Folded, ReadAfterLd]>; + + def rb : AVX512FMA3S, EVEX_B, EVEX_RC, + Sched<[SchedWriteFMA.Scl]>; }// isCodeGenOnly = 1 }// Constraints = "$src1 = $dst" } multiclass avx512_fma3s_all opc213, bits<8> opc231, bits<8> opc132, - string OpcodeStr, SDNode OpNode, SDNode OpNodes1, - SDNode OpNodeRnds1, SDNode OpNodes3, + string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd, + SDNode OpNodes1, SDNode OpNodeRnds1, SDNode OpNodes3, SDNode OpNodeRnds3, X86VectorVTInfo _, string SUFF> { let ExeDomain = _.ExeDomain in { @@ -6800,7 +6807,9 @@ multiclass avx512_fma3s_all opc213, bits<8> opc231, bits<8> opc132, (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1, _.FRC:$src3))), (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1, - (_.ScalarLdFrag addr:$src3)))), 0>; + (_.ScalarLdFrag addr:$src3)))), + (set _.FRC:$dst, (_.EltVT (OpNodeRnd _.FRC:$src2, _.FRC:$src1, + _.FRC:$src3, (i32 imm:$rc)))), 0>; defm NAME#231#SUFF#Z: avx512_fma3s_common opc213, bits<8> opc231, bits<8> opc132, (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src3, _.FRC:$src1))), (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, - (_.ScalarLdFrag addr:$src3), _.FRC:$src1))), 1>; + (_.ScalarLdFrag addr:$src3), _.FRC:$src1))), + (set _.FRC:$dst, (_.EltVT (OpNodeRnd _.FRC:$src2, _.FRC:$src3, + _.FRC:$src1, (i32 imm:$rc)))), 1>; // One pattern is 312 order so that the load is in a different place from the // 213 and 231 patterns this helps tablegen's duplicate pattern detection. 
@@ -6823,38 +6834,44 @@ multiclass avx512_fma3s_all opc213, bits<8> opc231, bits<8> opc132, (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1, _.FRC:$src3, _.FRC:$src2))), (set _.FRC:$dst, (_.EltVT (OpNode (_.ScalarLdFrag addr:$src3), - _.FRC:$src1, _.FRC:$src2))), 1>; + _.FRC:$src1, _.FRC:$src2))), + (set _.FRC:$dst, (_.EltVT (OpNodeRnd _.FRC:$src1, _.FRC:$src3, + _.FRC:$src2, (i32 imm:$rc)))), 1>; } } multiclass avx512_fma3s opc213, bits<8> opc231, bits<8> opc132, - string OpcodeStr, SDNode OpNode, SDNode OpNodes1, - SDNode OpNodeRnds1, SDNode OpNodes3, + string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd, + SDNode OpNodes1, SDNode OpNodeRnds1, SDNode OpNodes3, SDNode OpNodeRnds3> { let Predicates = [HasAVX512] in { defm NAME : avx512_fma3s_all, + OpNodeRnd, OpNodes1, OpNodeRnds1, OpNodes3, + OpNodeRnds3, f32x_info, "SS">, EVEX_CD8<32, CD8VT1>, VEX_LIG; defm NAME : avx512_fma3s_all, + OpNodeRnd, OpNodes1, OpNodeRnds1, OpNodes3, + OpNodeRnds3, f64x_info, "SD">, EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W; } } -defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86Fmadds1, - X86FmaddRnds1, X86Fmadds3, X86FmaddRnds3>; -defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86Fmsubs1, - X86FmsubRnds1, X86Fmsubs3, X86FmsubRnds3>; -defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd, X86Fnmadds1, - X86FnmaddRnds1, X86Fnmadds3, X86FnmaddRnds3>; -defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, X86Fnmsubs1, - X86FnmsubRnds1, X86Fnmsubs3, X86FnmsubRnds3>; +defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86FmaddRnd, + X86Fmadds1, X86FmaddRnds1, X86Fmadds3, + X86FmaddRnds3>; +defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86FmsubRnd, + X86Fmsubs1, X86FmsubRnds1, X86Fmsubs3, + X86FmsubRnds3>; +defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd, X86FnmaddRnd, + X86Fnmadds1, X86FnmaddRnds1, X86Fnmadds3, + X86FnmaddRnds3>; +defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, X86FnmsubRnd, + X86Fnmsubs1, X86FnmsubRnds1, X86Fnmsubs3, + X86FnmsubRnds3>; -multiclass avx512_scalar_fma_patterns { +multiclass avx512_scalar_fma_patterns { let Predicates = [HasAVX512] in { def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector (Op _.FRC:$src2, @@ -6879,7 +6896,6 @@ multiclass avx512_scalar_fma_patterns; - // TODO: Add memory patterns. 
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector (X86selects VK1WM:$mask, (Op _.FRC:$src2, @@ -6893,6 +6909,25 @@ multiclass avx512_scalar_fma_patterns(Prefix#"213"#Suffix#"Zm_Intk") + VR128X:$src1, VK1WM:$mask, + (COPY_TO_REGCLASS _.FRC:$src2, VR128X), addr:$src3)>; + + def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector + (X86selects VK1WM:$mask, + (Op (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), + (_.ScalarLdFrag addr:$src3), _.FRC:$src2), + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))), + (!cast(Prefix#"132"#Suffix#"Zm_Intk") + VR128X:$src1, VK1WM:$mask, + (COPY_TO_REGCLASS _.FRC:$src2, VR128X), addr:$src3)>; + + def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector + (X86selects VK1WM:$mask, (Op _.FRC:$src2, _.FRC:$src3, (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))), (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))), @@ -6903,6 +6938,15 @@ multiclass avx512_scalar_fma_patterns(Prefix#"231"#Suffix#"Zm_Intk") + VR128X:$src1, VK1WM:$mask, + (COPY_TO_REGCLASS _.FRC:$src2, VR128X), addr:$src3)>; + + def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector + (X86selects VK1WM:$mask, (Op _.FRC:$src2, (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), _.FRC:$src3), @@ -6911,26 +6955,117 @@ multiclass avx512_scalar_fma_patterns; - } -} -defm : avx512_scalar_fma_patterns; -defm : avx512_scalar_fma_patterns; -defm : avx512_scalar_fma_patterns; -defm : avx512_scalar_fma_patterns; - -defm : avx512_scalar_fma_patterns; -defm : avx512_scalar_fma_patterns; -defm : avx512_scalar_fma_patterns; -defm : avx512_scalar_fma_patterns; + def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector + (X86selects VK1WM:$mask, + (Op _.FRC:$src2, _.FRC:$src3, + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))), + (_.EltVT ZeroFP)))))), + (!cast(Prefix#"231"#Suffix#"Zr_Intkz") + VR128X:$src1, VK1WM:$mask, + (COPY_TO_REGCLASS _.FRC:$src2, VR128X), + (COPY_TO_REGCLASS _.FRC:$src3, VR128X))>; + + def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector + (X86selects VK1WM:$mask, + (Op _.FRC:$src2, + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), + (_.ScalarLdFrag addr:$src3)), + (_.EltVT ZeroFP)))))), + (!cast(Prefix#"213"#Suffix#"Zm_Intkz") + VR128X:$src1, VK1WM:$mask, + (COPY_TO_REGCLASS _.FRC:$src2, VR128X), addr:$src3)>; + + def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector + (X86selects VK1WM:$mask, + (Op (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), + _.FRC:$src2, (_.ScalarLdFrag addr:$src3)), + (_.EltVT ZeroFP)))))), + (!cast(Prefix#"132"#Suffix#"Zm_Intkz") + VR128X:$src1, VK1WM:$mask, + (COPY_TO_REGCLASS _.FRC:$src2, VR128X), addr:$src3)>; + + def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector + (X86selects VK1WM:$mask, + (Op _.FRC:$src2, (_.ScalarLdFrag addr:$src3), + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))), + (_.EltVT ZeroFP)))))), + (!cast(Prefix#"231"#Suffix#"Zm_Intkz") + VR128X:$src1, VK1WM:$mask, + (COPY_TO_REGCLASS _.FRC:$src2, VR128X), addr:$src3)>; + + // Patterns with rounding mode. 
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector + (RndOp _.FRC:$src2, + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), + _.FRC:$src3, (i32 imm:$rc)))))), + (!cast(Prefix#"213"#Suffix#"Zrb_Int") + VR128X:$src1, (COPY_TO_REGCLASS _.FRC:$src2, VR128X), + (COPY_TO_REGCLASS _.FRC:$src3, VR128X), imm:$rc)>; + + def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector + (X86selects VK1WM:$mask, + (RndOp _.FRC:$src2, + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), + _.FRC:$src3, (i32 imm:$rc)), + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))), + (!cast(Prefix#"213"#Suffix#"Zrb_Intk") + VR128X:$src1, VK1WM:$mask, + (COPY_TO_REGCLASS _.FRC:$src2, VR128X), + (COPY_TO_REGCLASS _.FRC:$src3, VR128X), imm:$rc)>; + + def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector + (X86selects VK1WM:$mask, + (RndOp _.FRC:$src2, _.FRC:$src3, + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), + (i32 imm:$rc)), + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))), + (!cast(Prefix#"231"#Suffix#"Zrb_Intk") + VR128X:$src1, VK1WM:$mask, + (COPY_TO_REGCLASS _.FRC:$src2, VR128X), + (COPY_TO_REGCLASS _.FRC:$src3, VR128X), imm:$rc)>; + + def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector + (X86selects VK1WM:$mask, + (RndOp _.FRC:$src2, + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), + _.FRC:$src3, (i32 imm:$rc)), + (_.EltVT ZeroFP)))))), + (!cast(Prefix#"213"#Suffix#"Zrb_Intkz") + VR128X:$src1, VK1WM:$mask, + (COPY_TO_REGCLASS _.FRC:$src2, VR128X), + (COPY_TO_REGCLASS _.FRC:$src3, VR128X), imm:$rc)>; + + def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector + (X86selects VK1WM:$mask, + (RndOp _.FRC:$src2, _.FRC:$src3, + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), + (i32 imm:$rc)), + (_.EltVT ZeroFP)))))), + (!cast(Prefix#"231"#Suffix#"Zrb_Intkz") + VR128X:$src1, VK1WM:$mask, + (COPY_TO_REGCLASS _.FRC:$src2, VR128X), + (COPY_TO_REGCLASS _.FRC:$src3, VR128X), imm:$rc)>; + } +} + +defm : avx512_scalar_fma_patterns; +defm : avx512_scalar_fma_patterns; +defm : avx512_scalar_fma_patterns; +defm : avx512_scalar_fma_patterns; + +defm : avx512_scalar_fma_patterns; +defm : avx512_scalar_fma_patterns; +defm : avx512_scalar_fma_patterns; +defm : avx512_scalar_fma_patterns; //===----------------------------------------------------------------------===// // AVX-512 Packed Multiply of Unsigned 52-bit Integers and Add the Low 52-bit IFMA diff --git a/llvm/lib/Target/X86/X86InstrFMA3Info.cpp b/llvm/lib/Target/X86/X86InstrFMA3Info.cpp index 663b132..def732a 100644 --- a/llvm/lib/Target/X86/X86InstrFMA3Info.cpp +++ b/llvm/lib/Target/X86/X86InstrFMA3Info.cpp @@ -85,9 +85,11 @@ static const X86InstrFMA3Group Groups[] = { FMA3GROUP_MASKED(Name, PDZ##Suf, Attrs) \ FMA3GROUP_MASKED(Name, PSZ##Suf, Attrs) -#define FMA3GROUP_SCALAR_AVX512(Name, Suf, Attrs) \ - FMA3GROUP_MASKED(Name, SDZ##Suf, Attrs) \ - FMA3GROUP_MASKED(Name, SSZ##Suf, Attrs) +#define FMA3GROUP_SCALAR_AVX512_ROUND(Name, Suf, Attrs) \ + FMA3GROUP(Name, SDZ##Suf, Attrs) \ + FMA3GROUP_MASKED(Name, SDZ##Suf##_Int, Attrs) \ + FMA3GROUP(Name, SSZ##Suf, Attrs) \ + FMA3GROUP_MASKED(Name, SSZ##Suf##_Int, Attrs) static const X86InstrFMA3Group BroadcastGroups[] = { FMA3GROUP_PACKED_AVX512(VFMADD, mb, 0) @@ -100,15 +102,15 @@ static const X86InstrFMA3Group BroadcastGroups[] = { static const X86InstrFMA3Group RoundGroups[] = { FMA3GROUP_PACKED_AVX512_ROUND(VFMADD, rb, 0) - FMA3GROUP_SCALAR_AVX512(VFMADD, rb_Int, X86InstrFMA3Group::Intrinsic) + 
FMA3GROUP_SCALAR_AVX512_ROUND(VFMADD, rb, X86InstrFMA3Group::Intrinsic) FMA3GROUP_PACKED_AVX512_ROUND(VFMADDSUB, rb, 0) FMA3GROUP_PACKED_AVX512_ROUND(VFMSUB, rb, 0) - FMA3GROUP_SCALAR_AVX512(VFMSUB, rb_Int, X86InstrFMA3Group::Intrinsic) + FMA3GROUP_SCALAR_AVX512_ROUND(VFMSUB, rb, X86InstrFMA3Group::Intrinsic) FMA3GROUP_PACKED_AVX512_ROUND(VFMSUBADD, rb, 0) FMA3GROUP_PACKED_AVX512_ROUND(VFNMADD, rb, 0) - FMA3GROUP_SCALAR_AVX512(VFNMADD, rb_Int, X86InstrFMA3Group::Intrinsic) + FMA3GROUP_SCALAR_AVX512_ROUND(VFNMADD, rb, X86InstrFMA3Group::Intrinsic) FMA3GROUP_PACKED_AVX512_ROUND(VFNMSUB, rb, 0) - FMA3GROUP_SCALAR_AVX512(VFNMSUB, rb_Int, X86InstrFMA3Group::Intrinsic) + FMA3GROUP_SCALAR_AVX512_ROUND(VFNMSUB, rb, X86InstrFMA3Group::Intrinsic) }; static void verifyTables() { diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h index 6303d99..00129ed 100644 --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -879,9 +879,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_512, INTR_TYPE_2OP_MASK, X86ISD::CVTPS2PH, 0), - X86_INTRINSIC_DATA(avx512_mask_vfmadd_sd, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1, X86ISD::FMADDS1_RND), - X86_INTRINSIC_DATA(avx512_mask_vfmadd_ss, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1, X86ISD::FMADDS1_RND), - X86_INTRINSIC_DATA(avx512_mask_vpshldv_d_128, FMA_OP_MASK, X86ISD::VSHLDV, 0), X86_INTRINSIC_DATA(avx512_mask_vpshldv_d_256, FMA_OP_MASK, X86ISD::VSHLDV, 0), X86_INTRINSIC_DATA(avx512_mask_vpshldv_d_512, FMA_OP_MASK, X86ISD::VSHLDV, 0), @@ -933,9 +930,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ss, FIXUPIMMS_MASKZ, X86ISD::VFIXUPIMMS, 0), - X86_INTRINSIC_DATA(avx512_maskz_vfmadd_sd, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1, X86ISD::FMADDS1_RND), - X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ss, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1, X86ISD::FMADDS1_RND), - X86_INTRINSIC_DATA(avx512_maskz_vpshldv_d_128, FMA_OP_MASKZ, X86ISD::VSHLDV, 0), X86_INTRINSIC_DATA(avx512_maskz_vpshldv_d_256, FMA_OP_MASKZ, X86ISD::VSHLDV, 0), X86_INTRINSIC_DATA(avx512_maskz_vpshldv_d_512, FMA_OP_MASKZ, X86ISD::VSHLDV, 0), @@ -1091,6 +1085,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_vcvtss2si64, INTR_TYPE_2OP, X86ISD::CVTS2SI_RND, 0), X86_INTRINSIC_DATA(avx512_vcvtss2usi32, INTR_TYPE_2OP, X86ISD::CVTS2UI_RND, 0), X86_INTRINSIC_DATA(avx512_vcvtss2usi64, INTR_TYPE_2OP, X86ISD::CVTS2UI_RND, 0), + X86_INTRINSIC_DATA(avx512_vfmadd_f32, INTR_TYPE_3OP, ISD::FMA, X86ISD::FMADD_RND), + X86_INTRINSIC_DATA(avx512_vfmadd_f64, INTR_TYPE_3OP, ISD::FMA, X86ISD::FMADD_RND), X86_INTRINSIC_DATA(avx512_vfmadd_pd_512, INTR_TYPE_3OP, ISD::FMA, X86ISD::FMADD_RND), X86_INTRINSIC_DATA(avx512_vfmadd_ps_512, INTR_TYPE_3OP, ISD::FMA, X86ISD::FMADD_RND), X86_INTRINSIC_DATA(avx512_vfmaddsub_pd_512, INTR_TYPE_3OP, X86ISD::FMADDSUB, diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll index ae45e55..1c7f633 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -4899,11 +4899,18 @@ define <4 x float> @test_mm_mask_fmadd_round_ss(<4 x float> %__W, i8 zeroext %__ ; X64-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} ; X64-NEXT: retq entry: - %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %__W, <4 x float> %__A, <4 x float> %__B, i8 %__U, i32 8) - ret 
<4 x float> %0 + %0 = extractelement <4 x float> %__W, i64 0 + %1 = extractelement <4 x float> %__A, i64 0 + %2 = extractelement <4 x float> %__B, i64 0 + %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) + %4 = bitcast i8 %__U to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, float %3, float %0 + %7 = insertelement <4 x float> %__W, float %6, i64 0 + ret <4 x float> %7 } -declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) #1 +declare float @llvm.x86.avx512.vfmadd.f32(float, float, float, i32) #1 define <4 x float> @test_mm_maskz_fmadd_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { ; X86-LABEL: test_mm_maskz_fmadd_ss: @@ -4944,12 +4951,17 @@ define <4 x float> @test_mm_maskz_fmadd_round_ss(i8 zeroext %__U, <4 x float> %_ ; X64-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} ; X64-NEXT: retq entry: - %0 = tail call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 %__U, i32 8) - ret <4 x float> %0 + %0 = extractelement <4 x float> %__A, i64 0 + %1 = extractelement <4 x float> %__B, i64 0 + %2 = extractelement <4 x float> %__C, i64 0 + %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) + %4 = bitcast i8 %__U to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, float %3, float 0.000000e+00 + %7 = insertelement <4 x float> %__A, float %6, i64 0 + ret <4 x float> %7 } -declare <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) #1 - define <4 x float> @test_mm_mask3_fmadd_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) { ; X86-LABEL: test_mm_mask3_fmadd_ss: ; X86: # %bb.0: # %entry @@ -4994,12 +5006,17 @@ define <4 x float> @test_mm_mask3_fmadd_round_ss(<4 x float> %__W, <4 x float> % ; X64-NEXT: vmovaps %xmm2, %xmm0 ; X64-NEXT: retq entry: - %0 = tail call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 %__U, i32 8) - ret <4 x float> %0 + %0 = extractelement <4 x float> %__W, i64 0 + %1 = extractelement <4 x float> %__X, i64 0 + %2 = extractelement <4 x float> %__Y, i64 0 + %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) + %4 = bitcast i8 %__U to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, float %3, float %2 + %7 = insertelement <4 x float> %__Y, float %6, i64 0 + ret <4 x float> %7 } -declare <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) #1 - define <4 x float> @test_mm_mask_fmsub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { ; X86-LABEL: test_mm_mask_fmsub_ss: ; X86: # %bb.0: # %entry @@ -5045,9 +5062,16 @@ define <4 x float> @test_mm_mask_fmsub_round_ss(<4 x float> %__W, i8 zeroext %__ ; X64-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} ; X64-NEXT: retq entry: - %sub = fsub <4 x float> , %__B - %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %__W, <4 x float> %__A, <4 x float> %sub, i8 %__U, i32 8) - ret <4 x float> %0 + %0 = extractelement <4 x float> %__W, i64 0 + %1 = extractelement <4 x float> %__A, i64 0 + %.rhs = extractelement <4 x float> %__B, i64 0 + %2 = fsub float -0.000000e+00, %.rhs + %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) + %4 = bitcast i8 %__U to <8 x i1> + %5 = extractelement <8 x i1> %4, 
i64 0 + %6 = select i1 %5, float %3, float %0 + %7 = insertelement <4 x float> %__W, float %6, i64 0 + ret <4 x float> %7 } define <4 x float> @test_mm_maskz_fmsub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { @@ -5094,9 +5118,16 @@ define <4 x float> @test_mm_maskz_fmsub_round_ss(i8 zeroext %__U, <4 x float> %_ ; X64-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} ; X64-NEXT: retq entry: - %sub = fsub <4 x float> , %__C - %0 = tail call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub, i8 %__U, i32 8) - ret <4 x float> %0 + %0 = extractelement <4 x float> %__A, i64 0 + %1 = extractelement <4 x float> %__B, i64 0 + %.rhs = extractelement <4 x float> %__C, i64 0 + %2 = fsub float -0.000000e+00, %.rhs + %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) + %4 = bitcast i8 %__U to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, float %3, float 0.000000e+00 + %7 = insertelement <4 x float> %__A, float %6, i64 0 + ret <4 x float> %7 } define <4 x float> @test_mm_mask3_fmsub_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) { @@ -5132,24 +5163,36 @@ define <4 x float> @test_mm_mask3_fmsub_round_ss(<4 x float> %__W, <4 x float> % ; X86-LABEL: test_mm_mask3_fmsub_round_ss: ; X86: # %bb.0: # %entry ; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0,-0,-0,-0] +; X86-NEXT: vxorps %xmm3, %xmm2, %xmm3 +; X86-NEXT: vfmadd213ss %xmm3, %xmm0, %xmm1 ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vfmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; X86-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} ; X86-NEXT: vmovaps %xmm2, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: test_mm_mask3_fmsub_round_ss: ; X64: # %bb.0: # %entry +; X64-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0,-0,-0,-0] +; X64-NEXT: vxorps %xmm3, %xmm2, %xmm3 +; X64-NEXT: vfmadd213ss %xmm3, %xmm0, %xmm1 ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vfmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; X64-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} ; X64-NEXT: vmovaps %xmm2, %xmm0 ; X64-NEXT: retq entry: - %0 = tail call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 %__U, i32 8) - ret <4 x float> %0 + %0 = extractelement <4 x float> %__W, i64 0 + %1 = extractelement <4 x float> %__X, i64 0 + %.rhs = extractelement <4 x float> %__Y, i64 0 + %2 = fsub float -0.000000e+00, %.rhs + %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) + %4 = bitcast i8 %__U to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, float %3, float %.rhs + %7 = insertelement <4 x float> %__Y, float %6, i64 0 + ret <4 x float> %7 } -declare <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) #1 - define <4 x float> @test_mm_mask_fnmadd_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { ; X86-LABEL: test_mm_mask_fnmadd_ss: ; X86: # %bb.0: # %entry @@ -5195,9 +5238,16 @@ define <4 x float> @test_mm_mask_fnmadd_round_ss(<4 x float> %__W, i8 zeroext %_ ; X64-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} ; X64-NEXT: retq entry: - %sub = fsub <4 x float> , %__A - %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %__W, <4 x float> %sub, <4 x float> %__B, i8 %__U, i32 8) - ret <4 x float> %0 + %0 = extractelement <4 x float> %__W, i64 0 + %.rhs = extractelement <4 x float> %__A, i64 0 + %1 = fsub float -0.000000e+00, 
%.rhs + %2 = extractelement <4 x float> %__B, i64 0 + %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) + %4 = bitcast i8 %__U to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, float %3, float %0 + %7 = insertelement <4 x float> %__W, float %6, i64 0 + ret <4 x float> %7 } define <4 x float> @test_mm_maskz_fnmadd_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { @@ -5244,9 +5294,16 @@ define <4 x float> @test_mm_maskz_fnmadd_round_ss(i8 zeroext %__U, <4 x float> % ; X64-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} ; X64-NEXT: retq entry: - %sub = fsub <4 x float> , %__B - %0 = tail call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %__A, <4 x float> %sub, <4 x float> %__C, i8 %__U, i32 8) - ret <4 x float> %0 + %0 = extractelement <4 x float> %__A, i64 0 + %.rhs = extractelement <4 x float> %__B, i64 0 + %1 = fsub float -0.000000e+00, %.rhs + %2 = extractelement <4 x float> %__C, i64 0 + %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) + %4 = bitcast i8 %__U to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, float %3, float 0.000000e+00 + %7 = insertelement <4 x float> %__A, float %6, i64 0 + ret <4 x float> %7 } define <4 x float> @test_mm_mask3_fnmadd_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) { @@ -5298,9 +5355,16 @@ define <4 x float> @test_mm_mask3_fnmadd_round_ss(<4 x float> %__W, <4 x float> ; X64-NEXT: vmovaps %xmm2, %xmm0 ; X64-NEXT: retq entry: - %sub = fsub <4 x float> , %__X - %0 = tail call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %__W, <4 x float> %sub, <4 x float> %__Y, i8 %__U, i32 8) - ret <4 x float> %0 + %0 = extractelement <4 x float> %__W, i64 0 + %.rhs = extractelement <4 x float> %__X, i64 0 + %1 = fsub float -0.000000e+00, %.rhs + %2 = extractelement <4 x float> %__Y, i64 0 + %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) + %4 = bitcast i8 %__U to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, float %3, float %2 + %7 = insertelement <4 x float> %__Y, float %6, i64 0 + ret <4 x float> %7 } define <4 x float> @test_mm_mask_fnmsub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { @@ -5351,10 +5415,17 @@ define <4 x float> @test_mm_mask_fnmsub_round_ss(<4 x float> %__W, i8 zeroext %_ ; X64-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} ; X64-NEXT: retq entry: - %sub = fsub <4 x float> , %__A - %sub1 = fsub <4 x float> , %__B - %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %__W, <4 x float> %sub, <4 x float> %sub1, i8 %__U, i32 8) - ret <4 x float> %0 + %0 = extractelement <4 x float> %__W, i64 0 + %.rhs = extractelement <4 x float> %__A, i64 0 + %1 = fsub float -0.000000e+00, %.rhs + %.rhs2 = extractelement <4 x float> %__B, i64 0 + %2 = fsub float -0.000000e+00, %.rhs2 + %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) + %4 = bitcast i8 %__U to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, float %3, float %0 + %7 = insertelement <4 x float> %__W, float %6, i64 0 + ret <4 x float> %7 } define <4 x float> @test_mm_maskz_fnmsub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { @@ -5404,10 +5475,17 @@ define <4 x float> @test_mm_maskz_fnmsub_round_ss(i8 zeroext %__U, <4 x float> % ; X64-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} ; X64-NEXT: 
retq entry: - %sub = fsub <4 x float> , %__B - %sub1 = fsub <4 x float> , %__C - %0 = tail call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %__A, <4 x float> %sub, <4 x float> %sub1, i8 %__U, i32 8) - ret <4 x float> %0 + %0 = extractelement <4 x float> %__A, i64 0 + %.rhs = extractelement <4 x float> %__B, i64 0 + %1 = fsub float -0.000000e+00, %.rhs + %.rhs2 = extractelement <4 x float> %__C, i64 0 + %2 = fsub float -0.000000e+00, %.rhs2 + %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) + %4 = bitcast i8 %__U to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, float %3, float 0.000000e+00 + %7 = insertelement <4 x float> %__A, float %6, i64 0 + ret <4 x float> %7 } define <4 x float> @test_mm_mask3_fnmsub_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) { @@ -5446,8 +5524,10 @@ define <4 x float> @test_mm_mask3_fnmsub_round_ss(<4 x float> %__W, <4 x float> ; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0,-0,-0,-0] ; X86-NEXT: vxorps %xmm3, %xmm1, %xmm1 +; X86-NEXT: vxorps %xmm3, %xmm2, %xmm3 +; X86-NEXT: vfmadd213ss %xmm3, %xmm0, %xmm1 ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vfmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; X86-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} ; X86-NEXT: vmovaps %xmm2, %xmm0 ; X86-NEXT: retl ; @@ -5455,14 +5535,24 @@ define <4 x float> @test_mm_mask3_fnmsub_round_ss(<4 x float> %__W, <4 x float> ; X64: # %bb.0: # %entry ; X64-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0,-0,-0,-0] ; X64-NEXT: vxorps %xmm3, %xmm1, %xmm1 +; X64-NEXT: vxorps %xmm3, %xmm2, %xmm3 +; X64-NEXT: vfmadd213ss %xmm3, %xmm0, %xmm1 ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vfmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; X64-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} ; X64-NEXT: vmovaps %xmm2, %xmm0 ; X64-NEXT: retq entry: - %sub = fsub <4 x float> , %__X - %0 = tail call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %__W, <4 x float> %sub, <4 x float> %__Y, i8 %__U, i32 8) - ret <4 x float> %0 + %0 = extractelement <4 x float> %__W, i64 0 + %.rhs = extractelement <4 x float> %__X, i64 0 + %1 = fsub float -0.000000e+00, %.rhs + %.rhs1 = extractelement <4 x float> %__Y, i64 0 + %2 = fsub float -0.000000e+00, %.rhs1 + %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) + %4 = bitcast i8 %__U to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, float %3, float %.rhs1 + %7 = insertelement <4 x float> %__Y, float %6, i64 0 + ret <4 x float> %7 } define <2 x double> @test_mm_mask_fmadd_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { @@ -5505,11 +5595,18 @@ define <2 x double> @test_mm_mask_fmadd_round_sd(<2 x double> %__W, i8 zeroext % ; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} ; X64-NEXT: retq entry: - %0 = tail call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %__W, <2 x double> %__A, <2 x double> %__B, i8 %__U, i32 8) - ret <2 x double> %0 + %0 = extractelement <2 x double> %__W, i64 0 + %1 = extractelement <2 x double> %__A, i64 0 + %2 = extractelement <2 x double> %__B, i64 0 + %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) + %4 = bitcast i8 %__U to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, double %3, double %0 + %7 = insertelement <2 x double> %__W, double %6, i64 0 + ret <2 x double> %7 } -declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double>, <2 x double>, <2 x 
double>, i8, i32) #1 +declare double @llvm.x86.avx512.vfmadd.f64(double, double, double, i32) #1 define <2 x double> @test_mm_maskz_fmadd_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { ; X86-LABEL: test_mm_maskz_fmadd_sd: @@ -5550,12 +5647,17 @@ define <2 x double> @test_mm_maskz_fmadd_round_sd(i8 zeroext %__U, <2 x double> ; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} ; X64-NEXT: retq entry: - %0 = tail call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 %__U, i32 8) - ret <2 x double> %0 + %0 = extractelement <2 x double> %__A, i64 0 + %1 = extractelement <2 x double> %__B, i64 0 + %2 = extractelement <2 x double> %__C, i64 0 + %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) + %4 = bitcast i8 %__U to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, double %3, double 0.000000e+00 + %7 = insertelement <2 x double> %__A, double %6, i64 0 + ret <2 x double> %7 } -declare <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) #1 - define <2 x double> @test_mm_mask3_fmadd_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) { ; X86-LABEL: test_mm_mask3_fmadd_sd: ; X86: # %bb.0: # %entry @@ -5600,12 +5702,17 @@ define <2 x double> @test_mm_mask3_fmadd_round_sd(<2 x double> %__W, <2 x double ; X64-NEXT: vmovapd %xmm2, %xmm0 ; X64-NEXT: retq entry: - %0 = tail call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 %__U, i32 8) - ret <2 x double> %0 + %0 = extractelement <2 x double> %__W, i64 0 + %1 = extractelement <2 x double> %__X, i64 0 + %2 = extractelement <2 x double> %__Y, i64 0 + %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) + %4 = bitcast i8 %__U to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, double %3, double %2 + %7 = insertelement <2 x double> %__Y, double %6, i64 0 + ret <2 x double> %7 } -declare <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) #1 - define <2 x double> @test_mm_mask_fmsub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { ; X86-LABEL: test_mm_mask_fmsub_sd: ; X86: # %bb.0: # %entry @@ -5649,9 +5756,16 @@ define <2 x double> @test_mm_mask_fmsub_round_sd(<2 x double> %__W, i8 zeroext % ; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} ; X64-NEXT: retq entry: - %sub = fsub <2 x double> , %__B - %0 = tail call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %__W, <2 x double> %__A, <2 x double> %sub, i8 %__U, i32 8) - ret <2 x double> %0 + %0 = extractelement <2 x double> %__W, i64 0 + %1 = extractelement <2 x double> %__A, i64 0 + %.rhs = extractelement <2 x double> %__B, i64 0 + %2 = fsub double -0.000000e+00, %.rhs + %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) + %4 = bitcast i8 %__U to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, double %3, double %0 + %7 = insertelement <2 x double> %__W, double %6, i64 0 + ret <2 x double> %7 } define <2 x double> @test_mm_maskz_fmsub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { @@ -5696,9 +5810,16 @@ define <2 x double> @test_mm_maskz_fmsub_round_sd(i8 zeroext %__U, <2 x double> ; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} ; X64-NEXT: retq entry: - %sub 
= fsub <2 x double> , %__C - %0 = tail call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub, i8 %__U, i32 8) - ret <2 x double> %0 + %0 = extractelement <2 x double> %__A, i64 0 + %1 = extractelement <2 x double> %__B, i64 0 + %.rhs = extractelement <2 x double> %__C, i64 0 + %2 = fsub double -0.000000e+00, %.rhs + %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) + %4 = bitcast i8 %__U to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, double %3, double 0.000000e+00 + %7 = insertelement <2 x double> %__A, double %6, i64 0 + ret <2 x double> %7 } define <2 x double> @test_mm_mask3_fmsub_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) { @@ -5734,24 +5855,34 @@ define <2 x double> @test_mm_mask3_fmsub_round_sd(<2 x double> %__W, <2 x double ; X86-LABEL: test_mm_mask3_fmsub_round_sd: ; X86: # %bb.0: # %entry ; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: vxorpd {{\.LCPI.*}}, %xmm2, %xmm3 +; X86-NEXT: vfmadd213sd %xmm3, %xmm0, %xmm1 ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vfmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; X86-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1} ; X86-NEXT: vmovapd %xmm2, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: test_mm_mask3_fmsub_round_sd: ; X64: # %bb.0: # %entry +; X64-NEXT: vxorpd {{.*}}(%rip), %xmm2, %xmm3 +; X64-NEXT: vfmadd213sd %xmm3, %xmm0, %xmm1 ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vfmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; X64-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1} ; X64-NEXT: vmovapd %xmm2, %xmm0 ; X64-NEXT: retq entry: - %0 = tail call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 %__U, i32 8) - ret <2 x double> %0 + %0 = extractelement <2 x double> %__W, i64 0 + %1 = extractelement <2 x double> %__X, i64 0 + %.rhs = extractelement <2 x double> %__Y, i64 0 + %2 = fsub double -0.000000e+00, %.rhs + %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) + %4 = bitcast i8 %__U to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, double %3, double %.rhs + %7 = insertelement <2 x double> %__Y, double %6, i64 0 + ret <2 x double> %7 } -declare <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) #1 - define <2 x double> @test_mm_mask_fnmadd_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { ; X86-LABEL: test_mm_mask_fnmadd_sd: ; X86: # %bb.0: # %entry @@ -5795,9 +5926,16 @@ define <2 x double> @test_mm_mask_fnmadd_round_sd(<2 x double> %__W, i8 zeroext ; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} ; X64-NEXT: retq entry: - %sub = fsub <2 x double> , %__A - %0 = tail call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %__W, <2 x double> %sub, <2 x double> %__B, i8 %__U, i32 8) - ret <2 x double> %0 + %0 = extractelement <2 x double> %__W, i64 0 + %.rhs = extractelement <2 x double> %__A, i64 0 + %1 = fsub double -0.000000e+00, %.rhs + %2 = extractelement <2 x double> %__B, i64 0 + %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) + %4 = bitcast i8 %__U to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, double %3, double %0 + %7 = insertelement <2 x double> %__W, double %6, i64 0 + ret <2 x double> %7 } define <2 x double> @test_mm_maskz_fnmadd_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { @@ -5842,9 
+5980,16 @@ define <2 x double> @test_mm_maskz_fnmadd_round_sd(i8 zeroext %__U, <2 x double> ; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} ; X64-NEXT: retq entry: - %sub = fsub <2 x double> , %__B - %0 = tail call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %__A, <2 x double> %sub, <2 x double> %__C, i8 %__U, i32 8) - ret <2 x double> %0 + %0 = extractelement <2 x double> %__A, i64 0 + %.rhs = extractelement <2 x double> %__B, i64 0 + %1 = fsub double -0.000000e+00, %.rhs + %2 = extractelement <2 x double> %__C, i64 0 + %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) + %4 = bitcast i8 %__U to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, double %3, double 0.000000e+00 + %7 = insertelement <2 x double> %__A, double %6, i64 0 + ret <2 x double> %7 } define <2 x double> @test_mm_mask3_fnmadd_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) { @@ -5894,9 +6039,16 @@ define <2 x double> @test_mm_mask3_fnmadd_round_sd(<2 x double> %__W, <2 x doubl ; X64-NEXT: vmovapd %xmm2, %xmm0 ; X64-NEXT: retq entry: - %sub = fsub <2 x double> , %__X - %0 = tail call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %__W, <2 x double> %sub, <2 x double> %__Y, i8 %__U, i32 8) - ret <2 x double> %0 + %0 = extractelement <2 x double> %__W, i64 0 + %.rhs = extractelement <2 x double> %__X, i64 0 + %1 = fsub double -0.000000e+00, %.rhs + %2 = extractelement <2 x double> %__Y, i64 0 + %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) + %4 = bitcast i8 %__U to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, double %3, double %2 + %7 = insertelement <2 x double> %__Y, double %6, i64 0 + ret <2 x double> %7 } define <2 x double> @test_mm_mask_fnmsub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { @@ -5947,10 +6099,17 @@ define <2 x double> @test_mm_mask_fnmsub_round_sd(<2 x double> %__W, i8 zeroext ; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} ; X64-NEXT: retq entry: - %sub = fsub <2 x double> , %__A - %sub1 = fsub <2 x double> , %__B - %0 = tail call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %__W, <2 x double> %sub, <2 x double> %sub1, i8 %__U, i32 8) - ret <2 x double> %0 + %0 = extractelement <2 x double> %__W, i64 0 + %.rhs = extractelement <2 x double> %__A, i64 0 + %1 = fsub double -0.000000e+00, %.rhs + %.rhs2 = extractelement <2 x double> %__B, i64 0 + %2 = fsub double -0.000000e+00, %.rhs2 + %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) + %4 = bitcast i8 %__U to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, double %3, double %0 + %7 = insertelement <2 x double> %__W, double %6, i64 0 + ret <2 x double> %7 } define <2 x double> @test_mm_maskz_fnmsub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { @@ -6000,10 +6159,17 @@ define <2 x double> @test_mm_maskz_fnmsub_round_sd(i8 zeroext %__U, <2 x double> ; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} ; X64-NEXT: retq entry: - %sub = fsub <2 x double> , %__B - %sub1 = fsub <2 x double> , %__C - %0 = tail call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %__A, <2 x double> %sub, <2 x double> %sub1, i8 %__U, i32 8) - ret <2 x double> %0 + %0 = extractelement <2 x double> %__A, i64 0 + %.rhs = extractelement <2 x double> %__B, i64 0 + %1 = fsub double -0.000000e+00, %.rhs + %.rhs2 = 
extractelement <2 x double> %__C, i64 0 + %2 = fsub double -0.000000e+00, %.rhs2 + %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) + %4 = bitcast i8 %__U to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, double %3, double 0.000000e+00 + %7 = insertelement <2 x double> %__A, double %6, i64 0 + ret <2 x double> %7 } define <2 x double> @test_mm_mask3_fnmsub_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) { @@ -6040,23 +6206,37 @@ define <2 x double> @test_mm_mask3_fnmsub_round_sd(<2 x double> %__W, <2 x doubl ; X86-LABEL: test_mm_mask3_fnmsub_round_sd: ; X86: # %bb.0: # %entry ; X86-NEXT: movb {{[0-9]+}}(%esp), %al -; X86-NEXT: vxorpd {{\.LCPI.*}}, %xmm1, %xmm1 +; X86-NEXT: vmovapd {{.*#+}} xmm3 = [-0.000000e+00,-0.000000e+00] +; X86-NEXT: vxorpd %xmm3, %xmm1, %xmm1 +; X86-NEXT: vxorpd %xmm3, %xmm2, %xmm3 +; X86-NEXT: vfmadd213sd %xmm3, %xmm0, %xmm1 ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vfmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; X86-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1} ; X86-NEXT: vmovapd %xmm2, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: test_mm_mask3_fnmsub_round_sd: ; X64: # %bb.0: # %entry -; X64-NEXT: vxorpd {{.*}}(%rip), %xmm1, %xmm1 +; X64-NEXT: vmovapd {{.*#+}} xmm3 = [-0.000000e+00,-0.000000e+00] +; X64-NEXT: vxorpd %xmm3, %xmm1, %xmm1 +; X64-NEXT: vxorpd %xmm3, %xmm2, %xmm3 +; X64-NEXT: vfmadd213sd %xmm3, %xmm0, %xmm1 ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vfmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; X64-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1} ; X64-NEXT: vmovapd %xmm2, %xmm0 ; X64-NEXT: retq entry: - %sub = fsub <2 x double> , %__X - %0 = tail call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %__W, <2 x double> %sub, <2 x double> %__Y, i8 %__U, i32 8) - ret <2 x double> %0 + %0 = extractelement <2 x double> %__W, i64 0 + %.rhs = extractelement <2 x double> %__X, i64 0 + %1 = fsub double -0.000000e+00, %.rhs + %.rhs1 = extractelement <2 x double> %__Y, i64 0 + %2 = fsub double -0.000000e+00, %.rhs1 + %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) + %4 = bitcast i8 %__U to <8 x i1> + %5 = extractelement <8 x i1> %4, i64 0 + %6 = select i1 %5, double %3, double %.rhs1 + %7 = insertelement <2 x double> %__Y, double %6, i64 0 + ret <2 x double> %7 } define <8 x i64> @test_mm512_mask_expandloadu_epi64(<8 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) { -- 2.7.4
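
For reference, the masked rounding pattern the new scalar intrinsic enables, the same
shape the updated fast-isel tests check for test_mm_mask_fmadd_round_ss, looks roughly
like the IR below. The declaration is the one added by this patch; the function and
value names are illustrative only, and i32 8 is the rounding argument that prints as
{rn-sae} in the CHECK lines.

declare float @llvm.x86.avx512.vfmadd.f32(float, float, float, i32)

define <4 x float> @sketch_mask_fmadd_round_ss(<4 x float> %__W, i8 %__U,
                                               <4 x float> %__A, <4 x float> %__B) {
entry:
  ; Operate on the low elements as plain scalars.
  %w = extractelement <4 x float> %__W, i64 0
  %a = extractelement <4 x float> %__A, i64 0
  %b = extractelement <4 x float> %__B, i64 0
  ; Scalar FMA with an explicit rounding mode (8 prints as {rn-sae}).
  %fma = tail call float @llvm.x86.avx512.vfmadd.f32(float %w, float %a, float %b, i32 8)
  ; Masking is an ordinary select on bit 0 of the mask, just like the llvm.fma path.
  %mvec = bitcast i8 %__U to <8 x i1>
  %m = extractelement <8 x i1> %mvec, i64 0
  %res = select i1 %m, float %fma, float %w
  ; Reinsert the result into the passthrough vector.
  %vec = insertelement <4 x float> %__W, float %res, i64 0
  ret <4 x float> %vec
}

The maskz and mask3 flavours in the tests differ only in what the select falls back to:
0.0 for maskz, and the low element of the third source (which is also the destination
vector) for mask3.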
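
The fneg-folding issue mentioned in the commit message is visible in the fmsub and
fnmsub round tests: the negation reaches the backend as an fsub from -0.0, and with a
rounding argument it is currently not folded, so the checks now show a vxorps, a
vfmadd213ss and a masked vmovss instead of the single vfmsub231ss {rn-sae} emitted
before. A sketch of that input, under the same naming assumptions as above (the
declaration is repeated so the snippet stands alone):

declare float @llvm.x86.avx512.vfmadd.f32(float, float, float, i32)

define <4 x float> @sketch_mask3_fmsub_round_ss(<4 x float> %__W, <4 x float> %__X,
                                                <4 x float> %__Y, i8 %__U) {
entry:
  %w = extractelement <4 x float> %__W, i64 0
  %x = extractelement <4 x float> %__X, i64 0
  %y = extractelement <4 x float> %__Y, i64 0
  ; The subtrahend is negated up front; folding this negation into a rounded
  ; vfmsub is the follow-up work the commit message refers to.
  %neg = fsub float -0.000000e+00, %y
  %fma = tail call float @llvm.x86.avx512.vfmadd.f32(float %w, float %x, float %neg, i32 8)
  %mvec = bitcast i8 %__U to <8 x i1>
  %m = extractelement <8 x i1> %mvec, i64 0
  ; mask3 keeps the low element of the third source when the mask bit is clear.
  %res = select i1 %m, float %fma, float %y
  %vec = insertelement <4 x float> %__Y, float %res, i64 0
  ret <4 x float> %vec
}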