case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
+ case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND";
+ case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND";
+ case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND";
+ case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND";
+ case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND";
+ case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
+ case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
+ case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND";
case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
unsigned NewOpcode = 0;
if (Arg.hasOneUse()) {
switch (Arg.getOpcode()) {
- case X86ISD::FMADD: NewOpcode = X86ISD::FNMSUB; break;
- case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break;
- case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break;
- case X86ISD::FNMSUB: NewOpcode = X86ISD::FMADD; break;
- case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break;
- case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break;
- case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break;
- case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break;
+ case X86ISD::FMADD: NewOpcode = X86ISD::FNMSUB; break;
+ case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break;
+ case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break;
+ case X86ISD::FNMSUB: NewOpcode = X86ISD::FMADD; break;
+ case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break;
+ case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break;
+ case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break;
+ case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break;
+ // We can't handle a scalar intrinsic node here because it would only
+ // invert one element and not the whole vector. But we could try to handle
+ // a negation of the lower element only.
}
}
if (NewOpcode)
SDValue B = N->getOperand(1);
SDValue C = N->getOperand(2);
- auto isScalarMaskedNode = [&](SDValue &V) {
- if (V.hasOneUse())
- return false;
- for (auto User : V.getNode()->uses())
- if (User->getOpcode() == X86ISD::SELECTS && N->isOperandOf(User))
- return true;
- return false;
- };
-
auto invertIfNegative = [](SDValue &V) {
  if (SDValue NegVal = isFNEG(V.getNode())) {
    V = NegVal;
    return true;
  }
  return false;
};
- // Do not convert scalar masked operations.
- bool NegA = !isScalarMaskedNode(A) && invertIfNegative(A);
- bool NegB = !isScalarMaskedNode(B) && invertIfNegative(B);
- bool NegC = !isScalarMaskedNode(C) && invertIfNegative(C);
+ // Do not convert the passthru input of scalar intrinsics.
+ // FIXME: We could allow negations of the lower element only.
+ bool NegA = N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
+ bool NegB = invertIfNegative(B);
+ bool NegC = N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);
// Negative multiplication when NegA xor NegB
bool NegMul = (NegA != NegB);
else
NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
+
if (N->getOpcode() == X86ISD::FMADD_RND) {
switch (NewOpcode) {
- case X86ISD::FMADD: NewOpcode = X86ISD::FMADD_RND; break;
- case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB_RND; break;
- case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
- case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
+ case X86ISD::FMADD: NewOpcode = X86ISD::FMADD_RND; break;
+ case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB_RND; break;
+ case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
+ case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
}
- return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
+ } else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
+ switch (NewOpcode) {
+ case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS1_RND; break;
+ case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1_RND; break;
+ case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break;
+ case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break;
+ }
+ } else if (N->getOpcode() == X86ISD::FMADDS3_RND) {
+ switch (NewOpcode) {
+ case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS3_RND; break;
+ case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3_RND; break;
+ case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
+ case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
+ }
+ } else {
+ assert((N->getOpcode() == X86ISD::FMADD || N->getOpcode() == ISD::FMA) &&
+ "Unexpected opcode!");
+ return DAG.getNode(NewOpcode, dl, VT, A, B, C);
}
- return DAG.getNode(NewOpcode, dl, VT, A, B, C);
+
+ return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
}
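The combine above deliberately leaves the passthru operand of the new scalar nodes alone, and the FNEG handling earlier refuses to touch a scalar intrinsic node at all, since the fneg negates the whole vector while rewriting the node would flip only the low element. As a purely illustrative sketch (not part of the patch), the IR below exercises that case with the mask.vfmadd.sd intrinsic already used in the test updates; per test10 further down, it should now lower to vfmadd213sd followed by vxorps rather than a single vfnmsub.

; Illustrative only; the declaration matches the one used in the tests.
declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)

define <2 x double> @neg_of_scalar_fma(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
entry:
  %fma = tail call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 -1, i32 4)
  ; Negating the whole vector cannot be folded into the scalar FMA node,
  ; because an FNMSUBS1 node would negate only the low element.
  %neg = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %fma
  ret <2 x double> %neg
}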
static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
case X86ISD::FMADD:
case X86ISD::FMADD_RND:
+ case X86ISD::FMADDS1_RND:
+ case X86ISD::FMADDS3_RND:
case ISD::FMA: return combineFMA(N, DAG, Subtarget);
case ISD::MGATHER:
case ISD::MSCATTER: return combineGatherScatter(N, DAG);
FMADDSUB_RND,
FMSUBADD_RND,
+ // Scalar intrinsic FMA with rounding mode.
+ // Two versions, passthru bits on op1 or op3.
+ FMADDS1_RND, FMADDS3_RND,
+ FNMADDS1_RND, FNMADDS3_RND,
+ FMSUBS1_RND, FMSUBS3_RND,
+ FNMSUBS1_RND, FNMSUBS3_RND,
+
// Compress and expand.
COMPRESS,
EXPAND,
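The S1/S3 names record where the passthru element comes from. As an illustrative sketch (not part of the patch), the IR below uses the scalar FMA intrinsics that appear in the test updates later in this diff; per the intrinsic-table changes below, the mask form keeps the masked-off element from operand 1 and maps to FMADDS1_RND, while the mask3 form keeps it from operand 3 and maps to FMADDS3_RND.

; Illustrative only; declarations match those used in the tests.
declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
declare <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)

define <2 x double> @passthru_positions(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %k) {
entry:
  ; Masked-off element falls back to %a (operand 1) -> X86ISD::FMADDS1_RND.
  %s1 = tail call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %k, i32 4)
  ; Masked-off element falls back to %c (operand 3) -> X86ISD::FMADDS3_RND.
  %s3 = tail call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %k, i32 4)
  %sum = fadd <2 x double> %s1, %s3
  ret <2 x double> %sum
}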
} // Constraints = "$src1 = $dst"
multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
- string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd, X86VectorVTInfo _ ,
- string SUFF> {
+ string OpcodeStr, SDNode OpNode, SDNode OpNodeRnds1,
+ SDNode OpNodeRnds3, X86VectorVTInfo _ , string SUFF> {
defm NAME#213#SUFF#Z: avx512_fma3s_common<opc213, OpcodeStr#"213"#_.Suffix , _ ,
- (_.VT (OpNodeRnd _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 FROUND_CURRENT))),
- (_.VT (OpNodeRnd _.RC:$src2, _.RC:$src1,
+ // Operands for intrinsic are in 123 order to preserve passthru
+ // semantics.
+ (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2, _.RC:$src3, (i32 FROUND_CURRENT))),
+ (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2,
(_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))), (i32 FROUND_CURRENT))),
- (_.VT ( OpNodeRnd _.RC:$src2, _.RC:$src1, _.RC:$src3,
+ (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2, _.RC:$src3,
(i32 imm:$rc))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1,
_.FRC:$src3))),
(_.ScalarLdFrag addr:$src3))))>;
defm NAME#231#SUFF#Z: avx512_fma3s_common<opc231, OpcodeStr#"231"#_.Suffix , _ ,
- (_.VT (OpNodeRnd _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 FROUND_CURRENT))),
- (_.VT (OpNodeRnd _.RC:$src2,
+ (_.VT (OpNodeRnds3 _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 FROUND_CURRENT))),
+ (_.VT (OpNodeRnds3 _.RC:$src2,
(_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))),
_.RC:$src1, (i32 FROUND_CURRENT))),
- (_.VT ( OpNodeRnd _.RC:$src2, _.RC:$src3, _.RC:$src1,
+ (_.VT (OpNodeRnds3 _.RC:$src2, _.RC:$src3, _.RC:$src1,
(i32 imm:$rc))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src3,
_.FRC:$src1))),
(_.ScalarLdFrag addr:$src3), _.FRC:$src1)))>;
defm NAME#132#SUFF#Z: avx512_fma3s_common<opc132, OpcodeStr#"132"#_.Suffix , _ ,
- (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 FROUND_CURRENT))),
- (_.VT (OpNodeRnd _.RC:$src1,
+ (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 FROUND_CURRENT))),
+ (_.VT (OpNodeRnds1 _.RC:$src1,
(_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))),
_.RC:$src2, (i32 FROUND_CURRENT))),
- (_.VT ( OpNodeRnd _.RC:$src1, _.RC:$src3, _.RC:$src2,
+ (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src3, _.RC:$src2,
(i32 imm:$rc))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1, _.FRC:$src3,
_.FRC:$src2))),
}
multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132,
- string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd>{
+ string OpcodeStr, SDNode OpNode, SDNode OpNodeRnds1,
+ SDNode OpNodeRnds3> {
let Predicates = [HasAVX512] in {
defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
- OpNodeRnd, f32x_info, "SS">,
- EVEX_CD8<32, CD8VT1>, VEX_LIG;
+ OpNodeRnds1, OpNodeRnds3, f32x_info, "SS">,
+ EVEX_CD8<32, CD8VT1>, VEX_LIG;
defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
- OpNodeRnd, f64x_info, "SD">,
- EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W;
+ OpNodeRnds1, OpNodeRnds3, f64x_info, "SD">,
+ EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W;
}
}
-defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86FmaddRnd>;
-defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86FmsubRnd>;
-defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd, X86FnmaddRnd>;
-defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, X86FnmsubRnd>;
+defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86FmaddRnds1,
+ X86FmaddRnds3>;
+defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86FmsubRnds1,
+ X86FmsubRnds3>;
+defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd,
+ X86FnmaddRnds1, X86FnmaddRnds3>;
+defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub,
+ X86FnmsubRnds1, X86FnmsubRnds3>;
//===----------------------------------------------------------------------===//
// AVX-512 Packed Multiply of Unsigned 52-bit Integers and Add the Low 52-bit IFMA
def X86FmaddsubRnd : SDNode<"X86ISD::FMADDSUB_RND", SDTFmaRound>;
def X86FmsubaddRnd : SDNode<"X86ISD::FMSUBADD_RND", SDTFmaRound>;
+// Scalar FMA intrinsics with passthru bits in operand 1.
+def X86FmaddRnds1 : SDNode<"X86ISD::FMADDS1_RND", SDTFmaRound>;
+def X86FnmaddRnds1 : SDNode<"X86ISD::FNMADDS1_RND", SDTFmaRound>;
+def X86FmsubRnds1 : SDNode<"X86ISD::FMSUBS1_RND", SDTFmaRound>;
+def X86FnmsubRnds1 : SDNode<"X86ISD::FNMSUBS1_RND", SDTFmaRound>;
+
+// Scalar FMA intrinsics with passthru bits in operand 3.
+def X86FmaddRnds3 : SDNode<"X86ISD::FMADDS3_RND", SDTFmaRound>;
+def X86FnmaddRnds3 : SDNode<"X86ISD::FNMADDS3_RND", SDTFmaRound>;
+def X86FmsubRnds3 : SDNode<"X86ISD::FMSUBS3_RND", SDTFmaRound>;
+def X86FnmsubRnds3 : SDNode<"X86ISD::FNMSUBS3_RND", SDTFmaRound>;
+
def x86vpmadd52l : SDNode<"X86ISD::VPMADD52L", SDTFma>;
def x86vpmadd52h : SDNode<"X86ISD::VPMADD52H", SDTFma>;
X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_512, FMA_OP_MASK, X86ISD::FMADD,
X86ISD::FMADD_RND),
- X86_INTRINSIC_DATA(avx512_mask_vfmadd_sd, FMA_OP_SCALAR_MASK, X86ISD::FMADD_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfmadd_ss, FMA_OP_SCALAR_MASK, X86ISD::FMADD_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfmadd_sd, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfmadd_ss, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_512, FMA_OP_MASK, X86ISD::FMADDSUB,
X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_512, FMA_OP_MASK3, X86ISD::FMADD,
X86ISD::FMADD_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfmadd_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMADD_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMADD_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmadd_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3_RND, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_128, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_256, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_512, FMA_OP_MASK3, X86ISD::FMADDSUB,
X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_256, FMA_OP_MASK3, X86ISD::FMSUB, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_512, FMA_OP_MASK3, X86ISD::FMSUB,
X86ISD::FMSUB_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMSUB_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMSUB_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3_RND, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_128, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_256, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_256, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_512, FMA_OP_MASK3, X86ISD::FNMSUB,
X86ISD::FNMSUB_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUB_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUB_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3_RND, 0),
X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_128, FIXUPIMM_MASKZ,
X86ISD::VFIXUPIMM, 0),
X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_256, FIXUPIMM_MASKZ,
X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_512, FMA_OP_MASKZ, X86ISD::FMADD,
X86ISD::FMADD_RND),
- X86_INTRINSIC_DATA(avx512_maskz_vfmadd_sd, FMA_OP_SCALAR_MASKZ, X86ISD::FMADD_RND, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ss, FMA_OP_SCALAR_MASKZ, X86ISD::FMADD_RND, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmadd_sd, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1_RND, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ss, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1_RND, 0),
X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_128, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_256, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_512, FMA_OP_MASKZ, X86ISD::FMADDSUB,
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %xmm0, %xmm3
-; CHECK-NEXT: vfmadd132sd %xmm1, %xmm2, %xmm3 {%k1}
-; CHECK-NEXT: vmovaps %xmm1, %xmm4
-; CHECK-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm4
+; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm3 {%k1}
+; CHECK-NEXT: vmovaps %xmm0, %xmm4
+; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm4
; CHECK-NEXT: vmovaps %xmm0, %xmm5
-; CHECK-NEXT: vfmadd132sd {rz-sae}, %xmm1, %xmm2, %xmm5 {%k1}
-; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm0, %xmm1
-; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0
-; CHECK-NEXT: vaddpd %xmm5, %xmm1, %xmm1
-; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm5 {%k1}
+; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm1
+; CHECK-NEXT: vaddpd %xmm5, %xmm0, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %xmm0, %xmm3
-; CHECK-NEXT: vfmadd132ss %xmm1, %xmm2, %xmm3 {%k1}
-; CHECK-NEXT: vmovaps %xmm1, %xmm4
-; CHECK-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm4
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm3 {%k1}
+; CHECK-NEXT: vmovaps %xmm0, %xmm4
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm4
; CHECK-NEXT: vmovaps %xmm0, %xmm5
-; CHECK-NEXT: vfmadd132ss {rz-sae}, %xmm1, %xmm2, %xmm5 {%k1}
-; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm0, %xmm1
-; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0
-; CHECK-NEXT: vaddps %xmm5, %xmm1, %xmm1
-; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm5 {%k1}
+; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm1
+; CHECK-NEXT: vaddps %xmm5, %xmm0, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
; CHECK: ## BB#0:
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %xmm1, %xmm3
-; CHECK-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm0, %xmm1 {%k1} {z}
-; CHECK-NEXT: vaddpd %xmm1, %xmm3, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, %xmm3
+; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm3 {%k1} {z}
+; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3)
; CHECK: ## BB#0:
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1 {%k1} {z}
-; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3)
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %xmm2, %xmm3
; CHECK-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vmovaps %xmm1, %xmm4
-; CHECK-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm4
+; CHECK-NEXT: vmovaps %xmm2, %xmm4
+; CHECK-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm4
; CHECK-NEXT: vmovaps %xmm2, %xmm5
; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
-; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm2
; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0
-; CHECK-NEXT: vaddpd %xmm5, %xmm1, %xmm1
+; CHECK-NEXT: vaddpd %xmm5, %xmm2, %xmm1
; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %xmm2, %xmm3
; CHECK-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vmovaps %xmm1, %xmm4
-; CHECK-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm4
+; CHECK-NEXT: vmovaps %xmm2, %xmm4
+; CHECK-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm4
; CHECK-NEXT: vmovaps %xmm2, %xmm5
; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
-; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm2
; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0
-; CHECK-NEXT: vaddps %xmm5, %xmm1, %xmm1
+; CHECK-NEXT: vaddps %xmm5, %xmm2, %xmm1
; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %xmm2, %xmm3
; CHECK-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vmovaps %xmm1, %xmm4
-; CHECK-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm4
+; CHECK-NEXT: vmovaps %xmm2, %xmm4
+; CHECK-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm4
; CHECK-NEXT: vmovaps %xmm2, %xmm5
; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
-; CHECK-NEXT: vfmsub213sd {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2
; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0
-; CHECK-NEXT: vaddpd %xmm5, %xmm1, %xmm1
+; CHECK-NEXT: vaddpd %xmm5, %xmm2, %xmm1
; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %xmm2, %xmm3
; CHECK-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vmovaps %xmm1, %xmm4
-; CHECK-NEXT: vfmsub213ss %xmm2, %xmm0, %xmm4
+; CHECK-NEXT: vmovaps %xmm2, %xmm4
+; CHECK-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm4
; CHECK-NEXT: vmovaps %xmm2, %xmm5
; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
-; CHECK-NEXT: vfmsub213ss {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2
; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0
-; CHECK-NEXT: vaddps %xmm5, %xmm1, %xmm1
+; CHECK-NEXT: vaddps %xmm5, %xmm2, %xmm1
; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %xmm2, %xmm3
; CHECK-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vmovaps %xmm1, %xmm4
-; CHECK-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm4
+; CHECK-NEXT: vmovaps %xmm2, %xmm4
+; CHECK-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm4
; CHECK-NEXT: vmovaps %xmm2, %xmm5
; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
-; CHECK-NEXT: vfnmsub213sd {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2
; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0
-; CHECK-NEXT: vaddpd %xmm5, %xmm1, %xmm1
+; CHECK-NEXT: vaddpd %xmm5, %xmm2, %xmm1
; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %xmm2, %xmm3
; CHECK-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vmovaps %xmm1, %xmm4
-; CHECK-NEXT: vfnmsub213ss %xmm2, %xmm0, %xmm4
+; CHECK-NEXT: vmovaps %xmm2, %xmm4
+; CHECK-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm4
; CHECK-NEXT: vmovaps %xmm2, %xmm5
; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
-; CHECK-NEXT: vfnmsub213ss {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2
; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0
-; CHECK-NEXT: vaddps %xmm5, %xmm1, %xmm1
+; CHECK-NEXT: vaddps %xmm5, %xmm2, %xmm1
; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_rm:
; CHECK: ## BB#0:
; CHECK-NEXT: kxorw %k0, %k0, %k1
-; CHECK-NEXT: vfmadd213ss (%rdi), %xmm0, %xmm1 {%k1} {z}
-; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: vfmadd213ss (%rdi), %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%q = load float, float* %ptr_b
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
define <2 x double> @test10(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: test10:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1
-; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: retq
entry:
%0 = tail call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 -1, i32 4) #2
; CHECK-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vfmadd132sd %xmm1, %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
entry:
%sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a