From a406796f5f690da895015c4106a9289d88408c93 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 8 Mar 2018 08:02:52 +0000 Subject: [PATCH] [X86] Change X86::PMULDQ/PMULUDQ opcodes to take vXi64 type as input instead of vXi32. This instruction can be thought of as reading either the even elements of a vXi32 input or the lower half of each element of a vXi64 input. We currently use the vXi32 interpretation, but vXi64 matches better with its broadcast behavior in EVEX. I'm looking at moving MULDQ/MULUDQ creation to a DAG combine so we can do it when AVX512DQ is enabled without having to go through Custom lowering. But in some of the test cases we failed to use a broadcast load due to the size difference. This should help with that. I'm also wondering if we can model these instructions in native IR and remove the intrinsics and I think using a vXi64 type will work better with that. llvm-svn: 326991 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 52 ++++++++++++------- llvm/lib/Target/X86/X86InstrAVX512.td | 10 ++-- llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 6 +-- llvm/lib/Target/X86/X86InstrSSE.td | 56 ++++----------------- llvm/lib/Target/X86/X86InstrXOP.td | 6 +-- llvm/lib/Target/X86/X86IntrinsicsInfo.h | 6 --- llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll | 60 +++++++++++----------- llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll | 72 ++++++++++++--------------- llvm/test/CodeGen/X86/vector-idiv.ll | 26 +++++----- llvm/test/CodeGen/X86/vector-mul.ll | 68 +++++++++++++------------ llvm/test/CodeGen/X86/vselect-avx.ll | 13 +++-- 11 files changed, 168 insertions(+), 207 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 651977f..2cc4a7f 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -20702,6 +20702,24 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. + case Intrinsic::x86_sse41_pmuldq: + case Intrinsic::x86_avx2_pmul_dq: + case Intrinsic::x86_avx512_pmul_dq_512: { + MVT OpVT = Op.getSimpleValueType(); + return DAG.getNode(X86ISD::PMULDQ, dl, OpVT, + DAG.getBitcast(OpVT, Op.getOperand(1)), + DAG.getBitcast(OpVT, Op.getOperand(2))); + } + + case Intrinsic::x86_sse2_pmulu_dq: + case Intrinsic::x86_avx2_pmulu_dq: + case Intrinsic::x86_avx512_pmulu_dq_512: { + MVT OpVT = Op.getSimpleValueType(); + return DAG.getNode(X86ISD::PMULUDQ, dl, OpVT, + DAG.getBitcast(OpVT, Op.getOperand(1)), + DAG.getBitcast(OpVT, Op.getOperand(2))); + } + case Intrinsic::x86_avx2_permd: case Intrinsic::x86_avx2_permps: // Operands intentionally swapped. Mask is last operand to intrinsic, @@ -22350,9 +22368,13 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget, SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask); // Multiply the even parts. - SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B); + SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, + DAG.getBitcast(MVT::v2i64, A), + DAG.getBitcast(MVT::v2i64, B)); // Now multiply odd parts. 
- SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds); + SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, + DAG.getBitcast(MVT::v2i64, Aodds), + DAG.getBitcast(MVT::v2i64, Bodds)); Evens = DAG.getBitcast(VT, Evens); Odds = DAG.getBitcast(VT, Odds); @@ -22366,15 +22388,11 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget, assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) && "Only know how to lower V2I64/V4I64/V8I64 multiply"); - // 32-bit vector types used for MULDQ/MULUDQ. - MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32); - // MULDQ returns the 64-bit result of the signed multiplication of the lower // 32-bits. We can lower with this if the sign bits stretch that far. if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 && DAG.ComputeNumSignBits(B) > 32) { - return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A), - DAG.getBitcast(MulVT, B)); + return DAG.getNode(X86ISD::PMULDQ, dl, VT, A, B); } // Ahi = psrlqi(a, 32); @@ -22399,29 +22417,23 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget, if (Subtarget.hasDQI() && (!AHiIsZero || !BHiIsZero)) return Op; - // Bit cast to 32-bit vectors for MULUDQ. - SDValue Alo = DAG.getBitcast(MulVT, A); - SDValue Blo = DAG.getBitcast(MulVT, B); - SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl); // Only multiply lo/hi halves that aren't known to be zero. SDValue AloBlo = Zero; if (!ALoIsZero && !BLoIsZero) - AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo); + AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B); SDValue AloBhi = Zero; if (!ALoIsZero && !BHiIsZero) { SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG); - Bhi = DAG.getBitcast(MulVT, Bhi); - AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi); + AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi); } SDValue AhiBlo = Zero; if (!AHiIsZero && !BLoIsZero) { SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG); - Ahi = DAG.getBitcast(MulVT, Ahi); - AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo); + AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B); } SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo); @@ -22679,10 +22691,14 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget, (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ; // PMULUDQ <4 x i32> , <4 x i32> // => <2 x i64> - SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1)); + SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, + DAG.getBitcast(MulVT, Op0), + DAG.getBitcast(MulVT, Op1))); // PMULUDQ <4 x i32> , <4 x i32> // => <2 x i64> - SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1)); + SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, + DAG.getBitcast(MulVT, Odd0), + DAG.getBitcast(MulVT, Odd1))); // Shuffle it back into the right order. 
SmallVector HighMask(NumElts); diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index e338f6c..42da15e 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -4526,6 +4526,10 @@ defm VPMULHRSW : avx512_binop_rm_vl_w<0x0B, "vpmulhrsw", X86mulhrs, SSE_INTMUL_I HasBWI, 1>, T8PD; defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", X86avg, SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPMULDQ : avx512_binop_rm_vl_q<0x28, "vpmuldq", X86pmuldq, + SSE_INTMUL_ITINS_P, HasAVX512, 1>, T8PD; +defm VPMULUDQ : avx512_binop_rm_vl_q<0xF4, "vpmuludq", X86pmuludq, + SSE_INTMUL_ITINS_P, HasAVX512, 1>; multiclass avx512_binop_all opc, string OpcodeStr, OpndItins itins, AVX512VLVectorVTInfo _SrcVTInfo, AVX512VLVectorVTInfo _DstVTInfo, @@ -4547,12 +4551,6 @@ multiclass avx512_binop_all opc, string OpcodeStr, OpndItins itins, } } -defm VPMULDQ : avx512_binop_all<0x28, "vpmuldq", SSE_INTMUL_ITINS_P, - avx512vl_i32_info, avx512vl_i64_info, - X86pmuldq, HasAVX512, 1>,T8PD; -defm VPMULUDQ : avx512_binop_all<0xF4, "vpmuludq", SSE_INTMUL_ITINS_P, - avx512vl_i32_info, avx512vl_i64_info, - X86pmuludq, HasAVX512, 1>; defm VPMULTISHIFTQB : avx512_binop_all<0x83, "vpmultishiftqb", SSE_INTALU_ITINS_P, avx512vl_i8_info, avx512vl_i8_info, X86multishift, HasVBMI, 0>, T8PD; diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index 36e8073..7005139 100644 --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -259,14 +259,12 @@ def X86selects : SDNode<"X86ISD::SELECTS", def X86pmuludq : SDNode<"X86ISD::PMULUDQ", SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>, - SDTCVecEltisVT<1, i32>, - SDTCisSameSizeAs<0,1>, + SDTCisSameAs<0,1>, SDTCisSameAs<1,2>]>, [SDNPCommutative]>; def X86pmuldq : SDNode<"X86ISD::PMULDQ", SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>, - SDTCVecEltisVT<1, i32>, - SDTCisSameSizeAs<0,1>, + SDTCisSameAs<0,1>, SDTCisSameAs<1,2>]>, [SDNPCommutative]>; diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index a7c7a4f..51fbdf8 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -3796,6 +3796,8 @@ defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8, SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16, SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; +defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64, + SSE_INTMUL_ITINS_P, 1, NoVLX>; let Predicates = [HasAVX, NoVLX_Or_NoBWI] in defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128, @@ -3821,18 +3823,6 @@ let Constraints = "$src1 = $dst" in defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128, memopv2i64, i128mem, SSE_INTALU_ITINS_P>; -let Predicates = [HasAVX, NoVLX] in -defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128, - loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 0>, - VEX_4V, VEX_WIG; -let Predicates = [HasAVX2, NoVLX] in -defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32, - VR256, loadv4i64, i256mem, - SSE_INTMUL_ITINS_P, 0>, VEX_4V, VEX_L, VEX_WIG; -let Constraints = "$src1 = $dst" in -defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128, - memopv2i64, i128mem, SSE_INTMUL_ITINS_P>; - //===---------------------------------------------------------------------===// // SSE2 - Packed Integer Logical Instructions 
//===---------------------------------------------------------------------===// @@ -6324,31 +6314,6 @@ multiclass SS48I_binop_rm opc, string OpcodeStr, SDNode OpNode, itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } -/// SS48I_binop_rm2 - Simple SSE41 binary operator with different src and dst -/// types. -multiclass SS48I_binop_rm2 opc, string OpcodeStr, SDNode OpNode, - ValueType DstVT, ValueType SrcVT, RegisterClass RC, - PatFrag memop_frag, X86MemOperand x86memop, - OpndItins itins, - bit IsCommutable = 0, bit Is2Addr = 1> { - let isCommutable = IsCommutable in - def rr : SS48I, - Sched<[itins.Sched]>; - def rm : SS48I, Sched<[itins.Sched.Folded, ReadAfterLd]>; -} - let Predicates = [HasAVX, NoVLX] in { defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128, loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, @@ -6362,9 +6327,9 @@ let Predicates = [HasAVX, NoVLX] in { defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128, loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, VEX_4V, VEX_WIG; - defm VPMULDQ : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v2i64, v4i32, - VR128, loadv2i64, i128mem, - SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_WIG; + defm VPMULDQ : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128, + loadv2i64, i128mem, 0, SSE_INTMUL_ITINS_P>, + VEX_4V, VEX_WIG; } let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128, @@ -6394,9 +6359,9 @@ let Predicates = [HasAVX2, NoVLX] in { defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256, loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, VEX_4V, VEX_L, VEX_WIG; - defm VPMULDQY : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v4i64, v8i32, - VR256, loadv4i64, i256mem, - SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L, VEX_WIG; + defm VPMULDQY : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256, + loadv4i64, i256mem, 0, SSE_INTMUL_ITINS_P>, + VEX_4V, VEX_L, VEX_WIG; } let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256, @@ -6430,9 +6395,8 @@ let Constraints = "$src1 = $dst" in { memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128, memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; - defm PMULDQ : SS48I_binop_rm2<0x28, "pmuldq", X86pmuldq, v2i64, v4i32, - VR128, memopv2i64, i128mem, - SSE_INTMUL_ITINS_P, 1>; + defm PMULDQ : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128, + memopv2i64, i128mem, 1, SSE_INTMUL_ITINS_P>; } let Predicates = [HasAVX, NoVLX] in diff --git a/llvm/lib/Target/X86/X86InstrXOP.td b/llvm/lib/Target/X86/X86InstrXOP.td index 7f776a6..528d3eb 100644 --- a/llvm/lib/Target/X86/X86InstrXOP.td +++ b/llvm/lib/Target/X86/X86InstrXOP.td @@ -199,11 +199,11 @@ let Predicates = [HasXOP] in { def : Pat<(v4i32 (add (mul (v4i32 VR128:$src1), (v4i32 VR128:$src2)), (v4i32 VR128:$src3))), (VPMACSDDrr VR128:$src1, VR128:$src2, VR128:$src3)>; - def : Pat<(v2i64 (add (X86pmuldq (X86PShufd (v4i32 VR128:$src1), (i8 -11)), - (X86PShufd (v4i32 VR128:$src2), (i8 -11))), + def : Pat<(v2i64 (add (X86pmuldq (bc_v2i64 (X86PShufd (v4i32 VR128:$src1), (i8 -11))), + (bc_v2i64 (X86PShufd (v4i32 VR128:$src2), (i8 -11)))), (v2i64 VR128:$src3))), (VPMACSDQHrr VR128:$src1, VR128:$src2, VR128:$src3)>; - def : Pat<(v2i64 (add (X86pmuldq (v4i32 VR128:$src1), (v4i32 VR128:$src2)), + def : Pat<(v2i64 (add (X86pmuldq (v2i64 VR128:$src1), (v2i64 VR128:$src2)), (v2i64 VR128:$src3))), (VPMACSDQLrr VR128:$src1, VR128:$src2, VR128:$src3)>; def : Pat<(v4i32 (add 
(X86vpmaddwd (v8i16 VR128:$src1), (v8i16 VR128:$src2)), diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h index 3e3a863..e72ba60 100644 --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -413,11 +413,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_pmadd_ub_sw, INTR_TYPE_2OP, X86ISD::VPMADDUBSW, 0), X86_INTRINSIC_DATA(avx2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0), X86_INTRINSIC_DATA(avx2_pmovmskb, INTR_TYPE_1OP, X86ISD::MOVMSK, 0), - X86_INTRINSIC_DATA(avx2_pmul_dq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0), X86_INTRINSIC_DATA(avx2_pmul_hr_sw, INTR_TYPE_2OP, X86ISD::MULHRS, 0), X86_INTRINSIC_DATA(avx2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), X86_INTRINSIC_DATA(avx2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0), - X86_INTRINSIC_DATA(avx2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0), X86_INTRINSIC_DATA(avx2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0), X86_INTRINSIC_DATA(avx2_pshuf_b, INTR_TYPE_2OP, X86ISD::PSHUFB, 0), X86_INTRINSIC_DATA(avx2_psll_d, INTR_TYPE_2OP, X86ISD::VSHL, 0), @@ -1456,11 +1454,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_packsswb_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(avx512_packusdw_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0), X86_INTRINSIC_DATA(avx512_packuswb_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0), - X86_INTRINSIC_DATA(avx512_pmul_dq_512, INTR_TYPE_2OP, X86ISD::PMULDQ, 0), X86_INTRINSIC_DATA(avx512_pmul_hr_sw_512, INTR_TYPE_2OP, X86ISD::MULHRS, 0), X86_INTRINSIC_DATA(avx512_pmulh_w_512, INTR_TYPE_2OP, ISD::MULHS, 0), X86_INTRINSIC_DATA(avx512_pmulhu_w_512, INTR_TYPE_2OP, ISD::MULHU, 0), - X86_INTRINSIC_DATA(avx512_pmulu_dq_512, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0), X86_INTRINSIC_DATA(avx512_psad_bw_512, INTR_TYPE_2OP, X86ISD::PSADBW, 0), X86_INTRINSIC_DATA(avx512_pshuf_b_512, INTR_TYPE_2OP, X86ISD::PSHUFB, 0), X86_INTRINSIC_DATA(avx512_psll_d_512, INTR_TYPE_2OP, X86ISD::VSHL, 0), @@ -1622,7 +1618,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0), X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), X86_INTRINSIC_DATA(sse2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0), - X86_INTRINSIC_DATA(sse2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0), X86_INTRINSIC_DATA(sse2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0), X86_INTRINSIC_DATA(sse2_psll_d, INTR_TYPE_2OP, X86ISD::VSHL, 0), X86_INTRINSIC_DATA(sse2_psll_q, INTR_TYPE_2OP, X86ISD::VSHL, 0), @@ -1660,7 +1655,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse41_insertps, INTR_TYPE_3OP, X86ISD::INSERTPS, 0), X86_INTRINSIC_DATA(sse41_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0), X86_INTRINSIC_DATA(sse41_phminposuw, INTR_TYPE_1OP, X86ISD::PHMINPOS, 0), - X86_INTRINSIC_DATA(sse41_pmuldq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0), X86_INTRINSIC_DATA(sse41_round_pd, ROUNDP, X86ISD::VRNDSCALE, 0), X86_INTRINSIC_DATA(sse41_round_ps, ROUNDP, X86ISD::VRNDSCALE, 0), X86_INTRINSIC_DATA(sse41_round_sd, ROUNDS, X86ISD::VRNDSCALES, 0), diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll index 76005a0..efe6939 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll @@ -104,13 +104,12 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind { ; ; SSE41-LABEL: test_div7_4i32: ; SSE41: # %bb.0: +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE41-NEXT: 
movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE41-NEXT: pmuldq %xmm2, %xmm3 +; SSE41-NEXT: pmuldq %xmm1, %xmm2 ; SSE41-NEXT: pmuldq %xmm0, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] ; SSE41-NEXT: paddd %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: psrld $31, %xmm0 @@ -121,13 +120,12 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind { ; ; AVX1-LABEL: test_div7_4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; AVX1-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrld $31, %xmm0, %xmm1 ; AVX1-NEXT: vpsrad $2, %xmm0, %xmm0 @@ -417,31 +415,29 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind { ; ; SSE41-LABEL: test_rem7_4i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE41-NEXT: pmuldq %xmm2, %xmm3 -; SSE41-NEXT: pmuldq %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; SSE41-NEXT: paddd %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psrld $31, %xmm2 -; SSE41-NEXT: psrad $2, %xmm1 -; SSE41-NEXT: paddd %xmm2, %xmm1 -; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 -; SSE41-NEXT: psubd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; SSE41-NEXT: pmuldq %xmm2, %xmm1 +; SSE41-NEXT: pmuldq %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; SSE41-NEXT: paddd %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psrld $31, %xmm1 +; SSE41-NEXT: psrad $2, %xmm2 +; SSE41-NEXT: paddd %xmm1, %xmm2 +; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 +; SSE41-NEXT: psubd %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: test_rem7_4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; AVX1-NEXT: vpmuldq 
%xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1 ; AVX1-NEXT: vpsrld $31, %xmm1, %xmm2 ; AVX1-NEXT: vpsrad $2, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll index c851288..a419446 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll @@ -88,7 +88,6 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind { ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pmuludq %xmm1, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm1, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] @@ -101,28 +100,26 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind { ; ; SSE41-LABEL: test_div7_4i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE41-NEXT: pmuludq %xmm2, %xmm3 -; SSE41-NEXT: pmuludq %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; SSE41-NEXT: psubd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757] +; SSE41-NEXT: pmuludq %xmm2, %xmm1 +; SSE41-NEXT: pmuludq %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; SSE41-NEXT: psubd %xmm2, %xmm0 ; SSE41-NEXT: psrld $1, %xmm0 -; SSE41-NEXT: paddd %xmm1, %xmm0 +; SSE41-NEXT: paddd %xmm2, %xmm0 ; SSE41-NEXT: psrld $2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: test_div7_4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757] +; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -371,7 +368,6 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind { ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pmuludq %xmm1, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm1, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] @@ -393,31 +389,29 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind { ; ; SSE41-LABEL: test_rem7_4i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE41-NEXT: pmuludq %xmm2, %xmm3 -; SSE41-NEXT: pmuludq %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psubd %xmm1, %xmm2 -; SSE41-NEXT: psrld $1, %xmm2 -; SSE41-NEXT: paddd %xmm1, %xmm2 -; SSE41-NEXT: psrld $2, %xmm2 -; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; SSE41-NEXT: psubd %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757] +; SSE41-NEXT: pmuludq %xmm2, %xmm1 +; SSE41-NEXT: pmuludq %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psubd %xmm2, %xmm1 +; SSE41-NEXT: psrld $1, %xmm1 +; SSE41-NEXT: paddd %xmm2, %xmm1 +; SSE41-NEXT: psrld $2, %xmm1 +; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 +; SSE41-NEXT: psubd %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: test_rem7_4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757] +; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2 ; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-idiv.ll b/llvm/test/CodeGen/X86/vector-idiv.ll index e2f7697..205cb2d 100644 --- a/llvm/test/CodeGen/X86/vector-idiv.ll +++ b/llvm/test/CodeGen/X86/vector-idiv.ll @@ -46,27 +46,25 @@ define <4 x i32> @PR20355(<4 x i32> %a) nounwind { ; ; SSE41-LABEL: PR20355: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766] -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE41-NEXT: pmuldq %xmm2, %xmm3 -; SSE41-NEXT: pmuldq %xmm1, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1431655766,1431655766,1431655766,1431655766] +; SSE41-NEXT: pmuldq %xmm2, %xmm1 +; SSE41-NEXT: pmuldq %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: psrld $31, %xmm0 -; SSE41-NEXT: paddd %xmm1, %xmm0 +; SSE41-NEXT: paddd %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: PR20355: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} 
xmm1 = xmm0[1,1,3,3] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1431655766,1431655766,1431655766,1431655766] +; AVX1-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; AVX1-NEXT: vpsrld $31, %xmm0, %xmm1 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll index 811084e..9c41126 100644 --- a/llvm/test/CodeGen/X86/vector-mul.ll +++ b/llvm/test/CodeGen/X86/vector-mul.ll @@ -695,16 +695,17 @@ define <2 x i64> @mul_v2i64_15_63(<2 x i64> %a0) nounwind { define <2 x i64> @mul_v2i64_neg_15_63(<2 x i64> %a0) nounwind { ; X86-LABEL: mul_v2i64_neg_15_63: ; X86: # %bb.0: -; X86-NEXT: movdqa {{.*#+}} xmm1 = [4294967281,4294967295,4294967233,4294967295] -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: pmuludq %xmm1, %xmm2 -; X86-NEXT: movdqa %xmm0, %xmm3 +; X86-NEXT: movdqa %xmm0, %xmm1 +; X86-NEXT: psrlq $32, %xmm1 +; X86-NEXT: movdqa {{.*#+}} xmm2 = [4294967281,4294967295,4294967233,4294967295] +; X86-NEXT: pmuludq %xmm2, %xmm1 +; X86-NEXT: movdqa %xmm2, %xmm3 ; X86-NEXT: psrlq $32, %xmm3 -; X86-NEXT: pmuludq %xmm1, %xmm3 -; X86-NEXT: pmuludq {{\.LCPI.*}}, %xmm0 +; X86-NEXT: pmuludq %xmm0, %xmm3 +; X86-NEXT: paddq %xmm1, %xmm3 +; X86-NEXT: psllq $32, %xmm3 +; X86-NEXT: pmuludq %xmm2, %xmm0 ; X86-NEXT: paddq %xmm3, %xmm0 -; X86-NEXT: psllq $32, %xmm0 -; X86-NEXT: paddq %xmm2, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: mul_v2i64_neg_15_63: @@ -739,16 +740,17 @@ define <2 x i64> @mul_v2i64_neg_15_63(<2 x i64> %a0) nounwind { define <2 x i64> @mul_v2i64_neg_17_65(<2 x i64> %a0) nounwind { ; X86-LABEL: mul_v2i64_neg_17_65: ; X86: # %bb.0: -; X86-NEXT: movdqa {{.*#+}} xmm1 = [4294967279,4294967295,4294967231,4294967295] -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: pmuludq %xmm1, %xmm2 -; X86-NEXT: movdqa %xmm0, %xmm3 +; X86-NEXT: movdqa %xmm0, %xmm1 +; X86-NEXT: psrlq $32, %xmm1 +; X86-NEXT: movdqa {{.*#+}} xmm2 = [4294967279,4294967295,4294967231,4294967295] +; X86-NEXT: pmuludq %xmm2, %xmm1 +; X86-NEXT: movdqa %xmm2, %xmm3 ; X86-NEXT: psrlq $32, %xmm3 -; X86-NEXT: pmuludq %xmm1, %xmm3 -; X86-NEXT: pmuludq {{\.LCPI.*}}, %xmm0 +; X86-NEXT: pmuludq %xmm0, %xmm3 +; X86-NEXT: paddq %xmm1, %xmm3 +; X86-NEXT: psllq $32, %xmm3 +; X86-NEXT: pmuludq %xmm2, %xmm0 ; X86-NEXT: paddq %xmm3, %xmm0 -; X86-NEXT: psllq $32, %xmm0 -; X86-NEXT: paddq %xmm2, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: mul_v2i64_neg_17_65: @@ -823,16 +825,17 @@ define <2 x i64> @mul_v2i64_0_1(<2 x i64> %a0) nounwind { define <2 x i64> @mul_v2i64_neg_0_1(<2 x i64> %a0) nounwind { ; X86-LABEL: mul_v2i64_neg_0_1: ; X86: # %bb.0: -; X86-NEXT: movdqa {{.*#+}} xmm1 = [0,0,4294967295,4294967295] -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: pmuludq %xmm1, %xmm2 -; X86-NEXT: movdqa %xmm0, %xmm3 +; X86-NEXT: movdqa %xmm0, %xmm1 +; X86-NEXT: psrlq $32, %xmm1 +; X86-NEXT: movdqa {{.*#+}} xmm2 = [0,0,4294967295,4294967295] +; X86-NEXT: pmuludq %xmm2, %xmm1 +; X86-NEXT: movdqa %xmm2, %xmm3 ; X86-NEXT: psrlq $32, %xmm3 -; X86-NEXT: pmuludq %xmm1, %xmm3 -; X86-NEXT: pmuludq {{\.LCPI.*}}, %xmm0 +; X86-NEXT: pmuludq %xmm0, %xmm3 +; X86-NEXT: paddq %xmm1, %xmm3 +; X86-NEXT: psllq $32, %xmm3 +; X86-NEXT: pmuludq %xmm2, %xmm0 ; X86-NEXT: paddq %xmm3, %xmm0 -; X86-NEXT: psllq $32, %xmm0 -; X86-NEXT: paddq %xmm2, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: 
mul_v2i64_neg_0_1: @@ -876,16 +879,17 @@ define <2 x i64> @mul_v2i64_neg_0_1(<2 x i64> %a0) nounwind { define <2 x i64> @mul_v2i64_15_neg_63(<2 x i64> %a0) nounwind { ; X86-LABEL: mul_v2i64_15_neg_63: ; X86: # %bb.0: -; X86-NEXT: movdqa {{.*#+}} xmm1 = [15,0,4294967233,4294967295] -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: pmuludq %xmm1, %xmm2 -; X86-NEXT: movdqa %xmm0, %xmm3 +; X86-NEXT: movdqa %xmm0, %xmm1 +; X86-NEXT: psrlq $32, %xmm1 +; X86-NEXT: movdqa {{.*#+}} xmm2 = [15,0,4294967233,4294967295] +; X86-NEXT: pmuludq %xmm2, %xmm1 +; X86-NEXT: movdqa %xmm2, %xmm3 ; X86-NEXT: psrlq $32, %xmm3 -; X86-NEXT: pmuludq %xmm1, %xmm3 -; X86-NEXT: pmuludq {{\.LCPI.*}}, %xmm0 +; X86-NEXT: pmuludq %xmm0, %xmm3 +; X86-NEXT: paddq %xmm1, %xmm3 +; X86-NEXT: psllq $32, %xmm3 +; X86-NEXT: pmuludq %xmm2, %xmm0 ; X86-NEXT: paddq %xmm3, %xmm0 -; X86-NEXT: psllq $32, %xmm0 -; X86-NEXT: paddq %xmm2, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: mul_v2i64_15_neg_63: diff --git a/llvm/test/CodeGen/X86/vselect-avx.ll b/llvm/test/CodeGen/X86/vselect-avx.ll index 9c2ae11..be23d4b 100644 --- a/llvm/test/CodeGen/X86/vselect-avx.ll +++ b/llvm/test/CodeGen/X86/vselect-avx.ll @@ -85,13 +85,12 @@ bb: define void @test3(<4 x i32> %induction30, <4 x i16>* %tmp16, <4 x i16>* %tmp17, <4 x i16> %tmp3, <4 x i16> %tmp12) { ; AVX1-LABEL: test3: ; AVX1: ## %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1431655766,1431655766,1431655766,1431655766] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuldq %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpmuldq %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1431655766,1431655766,1431655766,1431655766] +; AVX1-NEXT: vpmuldq %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpmuldq %xmm4, %xmm0, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] ; AVX1-NEXT: vpsrld $31, %xmm3, %xmm4 ; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3 -- 2.7.4
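
Editor's note (not part of the patch): the commit message suggests these instructions could eventually be modeled in native IR once the node takes vXi64 operands. A minimal sketch of what that model could look like is below; it assumes the usual PMULUDQ/PMULDQ semantics (multiply the low 32 bits of each 64-bit lane, zero- or sign-extended, into a full 64-bit product) and the function names are made up for illustration only.

    ; Hypothetical native-IR equivalents for the v2i64 forms (illustration,
    ; not part of this change).

    define <2 x i64> @pmuludq_model(<2 x i64> %a, <2 x i64> %b) {
      ; Zero the upper 32 bits of each 64-bit lane, then do a 64-bit multiply.
      %al = and <2 x i64> %a, <i64 4294967295, i64 4294967295>
      %bl = and <2 x i64> %b, <i64 4294967295, i64 4294967295>
      %r  = mul <2 x i64> %al, %bl
      ret <2 x i64> %r
    }

    define <2 x i64> @pmuldq_model(<2 x i64> %a, <2 x i64> %b) {
      ; Sign-extend the low 32 bits of each 64-bit lane, then multiply.
      %as = shl  <2 x i64> %a, <i64 32, i64 32>
      %ae = ashr <2 x i64> %as, <i64 32, i64 32>
      %bs = shl  <2 x i64> %b, <i64 32, i64 32>
      %be = ashr <2 x i64> %bs, <i64 32, i64 32>
      %r  = mul  <2 x i64> %ae, %be
      ret <2 x i64> %r
    }

Matching either pattern directly to X86ISD::PMULUDQ/PMULDQ is simpler once the node's operands are already vXi64, since no bitcast to a vXi32 type is needed; whether such a combine is added is left open by the commit message above.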