From: Craig Topper Date: Fri, 9 Dec 2016 06:42:28 +0000 (+0000) Subject: [AVX-512] Correctly preserve the passthru semantics of the FMA scalar intrinsics X-Git-Tag: llvmorg-4.0.0-rc1~2539 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=a55b483bb5d8f807ead39191d34b30b18a78e43d;p=platform%2Fupstream%2Fllvm.git [AVX-512] Correctly preserve the passthru semantics of the FMA scalar intrinsics Summary: Scalar intrinsics have specific semantics about which input's upper bits are passed through to the output. The same input is also supposed to be the input we use for the lower element when the mask bit is 0 in a masked operation. We aren't currently keeping these semantics with instruction selection. This patch corrects this by introducing new scalar FMA ISD nodes that indicate whether operand 1 (one of the multiply inputs) or operand 3 (the addition/subtraction input) should pass through its upper bits. We use this information to select 213/132 form for the operand 1 version and the 231 form for the operand 3 version. We also use this information to suppress combining FNEG operations on the passthru input since semantically the passthru bits aren't negated. This is stronger than the earlier check added for a user being SELECTS so we can remove that. This fixes PR30913. 
Reviewers: delena, zvi, v_klochkov Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D27144 llvm-svn: 289190 --- diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 75398c7ba7e4..8db8ed8f2bcf 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -23435,6 +23435,14 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND"; case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND"; case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND"; + case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND"; + case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND"; + case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND"; + case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND"; + case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND"; + case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND"; + case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND"; + case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND"; case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H"; case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L"; case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE"; @@ -31709,14 +31717,17 @@ static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, unsigned NewOpcode = 0; if (Arg.hasOneUse()) { switch (Arg.getOpcode()) { - case X86ISD::FMADD: NewOpcode = X86ISD::FNMSUB; break; - case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break; - case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break; - case X86ISD::FNMSUB: NewOpcode = X86ISD::FMADD; break; - case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break; - case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break; - case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break; - case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break; + case X86ISD::FMADD: NewOpcode = X86ISD::FNMSUB; break; + case X86ISD::FMSUB: NewOpcode = 
X86ISD::FNMADD; break; + case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break; + case X86ISD::FNMSUB: NewOpcode = X86ISD::FMADD; break; + case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break; + case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break; + case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break; + case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break; + // We can't handle scalar intrinsic node here because it would only + // invert one element and not the whole vector. But we could try to handle + // a negation of the lower element only. } } if (NewOpcode) @@ -32250,15 +32261,6 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, SDValue B = N->getOperand(1); SDValue C = N->getOperand(2); - auto isScalarMaskedNode = [&](SDValue &V) { - if (V.hasOneUse()) - return false; - for (auto User : V.getNode()->uses()) - if (User->getOpcode() == X86ISD::SELECTS && N->isOperandOf(User)) - return true; - return false; - }; - auto invertIfNegative = [](SDValue &V) { if (SDValue NegVal = isFNEG(V.getNode())) { V = NegVal; @@ -32267,10 +32269,11 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, return false; }; - // Do not convert scalar masked operations. - bool NegA = !isScalarMaskedNode(A) && invertIfNegative(A); - bool NegB = !isScalarMaskedNode(B) && invertIfNegative(B); - bool NegC = !isScalarMaskedNode(C) && invertIfNegative(C); + // Do not convert the passthru input of scalar intrinsics. + // FIXME: We could allow negations of the lower element only. + bool NegA = N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A); + bool NegB = invertIfNegative(B); + bool NegC = N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C); // Negative multiplication when NegA xor NegB bool NegMul = (NegA != NegB); @@ -32281,16 +32284,35 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, else NewOpcode = (!NegC) ? 
X86ISD::FNMADD : X86ISD::FNMSUB; + if (N->getOpcode() == X86ISD::FMADD_RND) { switch (NewOpcode) { - case X86ISD::FMADD: NewOpcode = X86ISD::FMADD_RND; break; - case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB_RND; break; - case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break; - case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break; + case X86ISD::FMADD: NewOpcode = X86ISD::FMADD_RND; break; + case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB_RND; break; + case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break; + case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break; } - return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3)); + } else if (N->getOpcode() == X86ISD::FMADDS1_RND) { + switch (NewOpcode) { + case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS1_RND; break; + case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1_RND; break; + case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break; + case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break; + } + } else if (N->getOpcode() == X86ISD::FMADDS3_RND) { + switch (NewOpcode) { + case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS3_RND; break; + case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3_RND; break; + case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break; + case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break; + } + } else { + assert((N->getOpcode() == X86ISD::FMADD || N->getOpcode() == ISD::FMA) && + "Unexpected opcode!"); + return DAG.getNode(NewOpcode, dl, VT, A, B, C); } - return DAG.getNode(NewOpcode, dl, VT, A, B, C); + + return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3)); } static SDValue combineZext(SDNode *N, SelectionDAG &DAG, @@ -33057,6 +33079,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget); case X86ISD::FMADD: case X86ISD::FMADD_RND: + case X86ISD::FMADDS1_RND: + case X86ISD::FMADDS3_RND: case ISD::FMA: return combineFMA(N, DAG, Subtarget); case ISD::MGATHER: case 
ISD::MSCATTER: return combineGatherScatter(N, DAG); diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 728ebd5e23f1..74bbe513a107 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -488,6 +488,13 @@ namespace llvm { FMADDSUB_RND, FMSUBADD_RND, + // Scalar intrinsic FMA with rounding mode. + // Two versions, passthru bits on op1 or op3. + FMADDS1_RND, FMADDS3_RND, + FNMADDS1_RND, FNMADDS3_RND, + FMSUBS1_RND, FMSUBS3_RND, + FNMSUBS1_RND, FNMSUBS3_RND, + // Compress and expand. COMPRESS, EXPAND, diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index a9bfdc7bf751..fa6eba1169cc 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -5580,14 +5580,16 @@ multiclass avx512_fma3s_common opc, string OpcodeStr, X86VectorVTInfo _, }// Constraints = "$src1 = $dst" multiclass avx512_fma3s_all opc213, bits<8> opc231, bits<8> opc132, - string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd, X86VectorVTInfo _ , - string SUFF> { + string OpcodeStr, SDNode OpNode, SDNode OpNodeRnds1, + SDNode OpNodeRnds3, X86VectorVTInfo _ , string SUFF> { defm NAME#213#SUFF#Z: avx512_fma3s_common opc213, bits<8> opc231, bits<8> opc132, (_.ScalarLdFrag addr:$src3))))>; defm NAME#231#SUFF#Z: avx512_fma3s_common opc213, bits<8> opc231, bits<8> opc132, (_.ScalarLdFrag addr:$src3), _.FRC:$src1)))>; defm NAME#132#SUFF#Z: avx512_fma3s_common opc213, bits<8> opc231, bits<8> opc132, } multiclass avx512_fma3s opc213, bits<8> opc231, bits<8> opc132, - string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd>{ + string OpcodeStr, SDNode OpNode, SDNode OpNodeRnds1, + SDNode OpNodeRnds3> { let Predicates = [HasAVX512] in { defm NAME : avx512_fma3s_all, - EVEX_CD8<32, CD8VT1>, VEX_LIG; + OpNodeRnds1, OpNodeRnds3, f32x_info, "SS">, + EVEX_CD8<32, CD8VT1>, VEX_LIG; defm NAME : avx512_fma3s_all, - EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W; + OpNodeRnds1, 
OpNodeRnds3, f64x_info, "SD">, + EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W; } } -defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86FmaddRnd>; -defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86FmsubRnd>; -defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd, X86FnmaddRnd>; -defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, X86FnmsubRnd>; +defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86FmaddRnds1, + X86FmaddRnds3>; +defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86FmsubRnds1, + X86FmsubRnds3>; +defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd, + X86FnmaddRnds1, X86FnmaddRnds3>; +defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, + X86FnmsubRnds1, X86FnmsubRnds3>; //===----------------------------------------------------------------------===// // AVX-512 Packed Multiply of Unsigned 52-bit Integers and Add the Low 52-bit IFMA diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index f261ad62ee2b..1973684d2ab0 100644 --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -466,6 +466,18 @@ def X86FnmsubRnd : SDNode<"X86ISD::FNMSUB_RND", SDTFmaRound>; def X86FmaddsubRnd : SDNode<"X86ISD::FMADDSUB_RND", SDTFmaRound>; def X86FmsubaddRnd : SDNode<"X86ISD::FMSUBADD_RND", SDTFmaRound>; +// Scalar FMA intrinsics with passthru bits in operand 1. +def X86FmaddRnds1 : SDNode<"X86ISD::FMADDS1_RND", SDTFmaRound>; +def X86FnmaddRnds1 : SDNode<"X86ISD::FNMADDS1_RND", SDTFmaRound>; +def X86FmsubRnds1 : SDNode<"X86ISD::FMSUBS1_RND", SDTFmaRound>; +def X86FnmsubRnds1 : SDNode<"X86ISD::FNMSUBS1_RND", SDTFmaRound>; + +// Scalar FMA intrinsics with passthru bits in operand 3. 
+def X86FmaddRnds3 : SDNode<"X86ISD::FMADDS3_RND", SDTFmaRound>; +def X86FnmaddRnds3 : SDNode<"X86ISD::FNMADDS3_RND", SDTFmaRound>; +def X86FmsubRnds3 : SDNode<"X86ISD::FMSUBS3_RND", SDTFmaRound>; +def X86FnmsubRnds3 : SDNode<"X86ISD::FNMSUBS3_RND", SDTFmaRound>; + def x86vpmadd52l : SDNode<"X86ISD::VPMADD52L", SDTFma>; def x86vpmadd52h : SDNode<"X86ISD::VPMADD52H", SDTFma>; diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h index 2dcb8ce96044..5e5369ea6721 100644 --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -1192,8 +1192,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_512, FMA_OP_MASK, X86ISD::FMADD, X86ISD::FMADD_RND), - X86_INTRINSIC_DATA(avx512_mask_vfmadd_sd, FMA_OP_SCALAR_MASK, X86ISD::FMADD_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_vfmadd_ss, FMA_OP_SCALAR_MASK, X86ISD::FMADD_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_vfmadd_sd, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_vfmadd_ss, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1_RND, 0), X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0), X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0), X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_512, FMA_OP_MASK, X86ISD::FMADDSUB, @@ -1326,8 +1326,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_512, FMA_OP_MASK3, X86ISD::FMADD, X86ISD::FMADD_RND), - X86_INTRINSIC_DATA(avx512_mask3_vfmadd_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMADD_RND, 0), - X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMADD_RND, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmadd_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3_RND, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3_RND, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_128, FMA_OP_MASK3, X86ISD::FMADDSUB, 0), 
X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_256, FMA_OP_MASK3, X86ISD::FMADDSUB, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_512, FMA_OP_MASK3, X86ISD::FMADDSUB, @@ -1345,8 +1345,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_256, FMA_OP_MASK3, X86ISD::FMSUB, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_512, FMA_OP_MASK3, X86ISD::FMSUB, X86ISD::FMSUB_RND), - X86_INTRINSIC_DATA(avx512_mask3_vfmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMSUB_RND, 0), - X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMSUB_RND, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3_RND, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3_RND, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_128, FMA_OP_MASK3, X86ISD::FMSUBADD, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_256, FMA_OP_MASK3, X86ISD::FMSUBADD, 0), @@ -1365,8 +1365,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_256, FMA_OP_MASK3, X86ISD::FNMSUB, 0), X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_512, FMA_OP_MASK3, X86ISD::FNMSUB, X86ISD::FNMSUB_RND), - X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUB_RND, 0), - X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUB_RND, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3_RND, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3_RND, 0), X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_128, FIXUPIMM_MASKZ, X86ISD::VFIXUPIMM, 0), X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_256, FIXUPIMM_MASKZ, @@ -1404,8 +1404,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_512, FMA_OP_MASKZ, X86ISD::FMADD, X86ISD::FMADD_RND), - X86_INTRINSIC_DATA(avx512_maskz_vfmadd_sd, FMA_OP_SCALAR_MASKZ, X86ISD::FMADD_RND, 0), - 
X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ss, FMA_OP_SCALAR_MASKZ, X86ISD::FMADD_RND, 0), + X86_INTRINSIC_DATA(avx512_maskz_vfmadd_sd, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1_RND, 0), + X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ss, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1_RND, 0), X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_128, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0), X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_256, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0), X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_512, FMA_OP_MASKZ, X86ISD::FMADDSUB, diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll index ddd059ecf55d..c134fc386b40 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll @@ -4707,15 +4707,15 @@ define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x do ; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovaps %xmm0, %xmm3 -; CHECK-NEXT: vfmadd132sd %xmm1, %xmm2, %xmm3 {%k1} -; CHECK-NEXT: vmovaps %xmm1, %xmm4 -; CHECK-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm4 +; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm3 {%k1} +; CHECK-NEXT: vmovaps %xmm0, %xmm4 +; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm4 ; CHECK-NEXT: vmovaps %xmm0, %xmm5 -; CHECK-NEXT: vfmadd132sd {rz-sae}, %xmm1, %xmm2, %xmm5 {%k1} -; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm0, %xmm1 -; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0 -; CHECK-NEXT: vaddpd %xmm5, %xmm1, %xmm1 -; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm5 {%k1} +; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm1 +; CHECK-NEXT: vaddpd %xmm5, %xmm0, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4) %res1 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 
x double> %x2, i8 %x3, i32 4) @@ -4735,15 +4735,15 @@ define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x floa ; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovaps %xmm0, %xmm3 -; CHECK-NEXT: vfmadd132ss %xmm1, %xmm2, %xmm3 {%k1} -; CHECK-NEXT: vmovaps %xmm1, %xmm4 -; CHECK-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm4 +; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm3 {%k1} +; CHECK-NEXT: vmovaps %xmm0, %xmm4 +; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm4 ; CHECK-NEXT: vmovaps %xmm0, %xmm5 -; CHECK-NEXT: vfmadd132ss {rz-sae}, %xmm1, %xmm2, %xmm5 {%k1} -; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm0, %xmm1 -; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0 -; CHECK-NEXT: vaddps %xmm5, %xmm1, %xmm1 -; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm5 {%k1} +; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm1 +; CHECK-NEXT: vaddps %xmm5, %xmm0, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4) %res1 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) @@ -4762,10 +4762,10 @@ define <2 x double>@test_int_x86_avx512_maskz_vfmadd_sd(<2 x double> %x0, <2 x d ; CHECK: ## BB#0: ; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps %xmm1, %xmm3 -; CHECK-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm3 {%k1} {z} -; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm0, %xmm1 {%k1} {z} -; CHECK-NEXT: vaddpd %xmm1, %xmm3, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, %xmm3 +; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm3 {%k1} {z} +; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x 
double> %x2, i8 %x3, i32 4) %res1 = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3) @@ -4780,8 +4780,7 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss(<4 x float> %x0, <4 x flo ; CHECK: ## BB#0: ; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1 {%k1} {z} -; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z} ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) %res1 = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3) @@ -4797,13 +4796,13 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x d ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovaps %xmm2, %xmm3 ; CHECK-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm3 {%k1} -; CHECK-NEXT: vmovaps %xmm1, %xmm4 -; CHECK-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm4 +; CHECK-NEXT: vmovaps %xmm2, %xmm4 +; CHECK-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm4 ; CHECK-NEXT: vmovaps %xmm2, %xmm5 ; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1} -; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm0, %xmm1 +; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm2 ; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0 -; CHECK-NEXT: vaddpd %xmm5, %xmm1, %xmm1 +; CHECK-NEXT: vaddpd %xmm5, %xmm2, %xmm1 ; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4) @@ -4825,13 +4824,13 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x flo ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovaps %xmm2, %xmm3 ; CHECK-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm3 {%k1} -; CHECK-NEXT: vmovaps %xmm1, %xmm4 -; CHECK-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm4 +; CHECK-NEXT: vmovaps %xmm2, 
%xmm4 +; CHECK-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm4 ; CHECK-NEXT: vmovaps %xmm2, %xmm5 ; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1} -; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm0, %xmm1 +; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm2 ; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0 -; CHECK-NEXT: vaddps %xmm5, %xmm1, %xmm1 +; CHECK-NEXT: vaddps %xmm5, %xmm2, %xmm1 ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4) @@ -4853,13 +4852,13 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x d ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovaps %xmm2, %xmm3 ; CHECK-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm3 {%k1} -; CHECK-NEXT: vmovaps %xmm1, %xmm4 -; CHECK-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm4 +; CHECK-NEXT: vmovaps %xmm2, %xmm4 +; CHECK-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm4 ; CHECK-NEXT: vmovaps %xmm2, %xmm5 ; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1} -; CHECK-NEXT: vfmsub213sd {rz-sae}, %xmm2, %xmm0, %xmm1 +; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 ; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0 -; CHECK-NEXT: vaddpd %xmm5, %xmm1, %xmm1 +; CHECK-NEXT: vaddpd %xmm5, %xmm2, %xmm1 ; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4) @@ -4881,13 +4880,13 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x flo ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovaps %xmm2, %xmm3 ; CHECK-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm3 {%k1} -; CHECK-NEXT: vmovaps %xmm1, %xmm4 -; CHECK-NEXT: vfmsub213ss %xmm2, %xmm0, %xmm4 +; CHECK-NEXT: vmovaps %xmm2, %xmm4 +; CHECK-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm4 ; CHECK-NEXT: vmovaps %xmm2, %xmm5 ; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1} -; CHECK-NEXT: vfmsub213ss 
{rz-sae}, %xmm2, %xmm0, %xmm1 +; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 ; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0 -; CHECK-NEXT: vaddps %xmm5, %xmm1, %xmm1 +; CHECK-NEXT: vaddps %xmm5, %xmm2, %xmm1 ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4) @@ -4909,13 +4908,13 @@ define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovaps %xmm2, %xmm3 ; CHECK-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm3 {%k1} -; CHECK-NEXT: vmovaps %xmm1, %xmm4 -; CHECK-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm4 +; CHECK-NEXT: vmovaps %xmm2, %xmm4 +; CHECK-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm4 ; CHECK-NEXT: vmovaps %xmm2, %xmm5 ; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1} -; CHECK-NEXT: vfnmsub213sd {rz-sae}, %xmm2, %xmm0, %xmm1 +; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 ; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0 -; CHECK-NEXT: vaddpd %xmm5, %xmm1, %xmm1 +; CHECK-NEXT: vaddpd %xmm5, %xmm2, %xmm1 ; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4) @@ -4937,13 +4936,13 @@ define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x fl ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovaps %xmm2, %xmm3 ; CHECK-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm3 {%k1} -; CHECK-NEXT: vmovaps %xmm1, %xmm4 -; CHECK-NEXT: vfnmsub213ss %xmm2, %xmm0, %xmm4 +; CHECK-NEXT: vmovaps %xmm2, %xmm4 +; CHECK-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm4 ; CHECK-NEXT: vmovaps %xmm2, %xmm5 ; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1} -; CHECK-NEXT: vfnmsub213ss {rz-sae}, %xmm2, %xmm0, %xmm1 +; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 ; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0 -; CHECK-NEXT: vaddps %xmm5, %xmm1, %xmm1 
+; CHECK-NEXT: vaddps %xmm5, %xmm2, %xmm1 ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4) @@ -4988,8 +4987,7 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_rm: ; CHECK: ## BB#0: ; CHECK-NEXT: kxorw %k0, %k0, %k1 -; CHECK-NEXT: vfmadd213ss (%rdi), %xmm0, %xmm1 {%k1} {z} -; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: vfmadd213ss (%rdi), %xmm1, %xmm0 {%k1} {z} ; CHECK-NEXT: retq %q = load float, float* %ptr_b %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 diff --git a/llvm/test/CodeGen/X86/fma-fneg-combine.ll b/llvm/test/CodeGen/X86/fma-fneg-combine.ll index 76d8cb5a644f..5636a5bcd73e 100644 --- a/llvm/test/CodeGen/X86/fma-fneg-combine.ll +++ b/llvm/test/CodeGen/X86/fma-fneg-combine.ll @@ -126,8 +126,8 @@ declare <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a, <8 x d define <2 x double> @test10(<2 x double> %a, <2 x double> %b, <2 x double> %c) { ; CHECK-LABEL: test10: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1 -; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-NEXT: retq entry: %0 = tail call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 -1, i32 4) #2 @@ -188,7 +188,7 @@ define <2 x double> @test13(<2 x double> %a, <2 x double> %b, <2 x double> %c, i ; CHECK-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vfmadd132sd %xmm1, %xmm2, %xmm0 {%k1} +; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: retq entry: %sub.i = fsub <2 x double> , %a