PMULDQ/PMULUDQ can be thought of as reading either the even elements of a vXi32 input or the lower half of each element of a vXi64 input. We currently use the vXi32 interpretation, but vXi64 matches better with the instructions' 64-bit broadcast behavior in their EVEX encodings.
I'm looking at moving MULDQ/MULUDQ creation to a DAG combine so we can do it when AVX512DQ is enabled without having to go through Custom lowering. But in some of the test cases we failed to use a broadcast load because of the size difference between the node's vXi32 operand type and the 64-bit broadcast element. This should help with that.
I'm also wondering if we can model these instructions in native IR and remove the intrinsics; I think using a vXi64 type will work better for that.
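As a sketch of that native-IR idea (illustrative only, not part of this patch; the function name is hypothetical), PMULUDQ on vXi64 inputs is expressible as a mask of the high halves followed by an ordinary 64-bit multiply:

; Hypothetical native-IR form of PMULUDQ: clear the high 32 bits of each
; i64 lane, then do a plain 64-bit multiply. A DAG combine could match this
; pattern to X86ISD::PMULUDQ now that the node takes vXi64 operands.
define <2 x i64> @pmuludq_native_sketch(<2 x i64> %a, <2 x i64> %b) {
  %a.lo = and <2 x i64> %a, <i64 4294967295, i64 4294967295>
  %b.lo = and <2 x i64> %b, <i64 4294967295, i64 4294967295>
  %mul = mul <2 x i64> %a.lo, %b.lo
  ret <2 x i64> %mul
}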
llvm-svn: 326991
switch (IntNo) {
default: return SDValue(); // Don't custom lower most intrinsics.
+ case Intrinsic::x86_sse41_pmuldq:
+ case Intrinsic::x86_avx2_pmul_dq:
+ case Intrinsic::x86_avx512_pmul_dq_512: {
+ MVT OpVT = Op.getSimpleValueType();
+ return DAG.getNode(X86ISD::PMULDQ, dl, OpVT,
+ DAG.getBitcast(OpVT, Op.getOperand(1)),
+ DAG.getBitcast(OpVT, Op.getOperand(2)));
+ }
+
+ case Intrinsic::x86_sse2_pmulu_dq:
+ case Intrinsic::x86_avx2_pmulu_dq:
+ case Intrinsic::x86_avx512_pmulu_dq_512: {
+ MVT OpVT = Op.getSimpleValueType();
+ return DAG.getNode(X86ISD::PMULUDQ, dl, OpVT,
+ DAG.getBitcast(OpVT, Op.getOperand(1)),
+ DAG.getBitcast(OpVT, Op.getOperand(2)));
+ }
+
case Intrinsic::x86_avx2_permd:
case Intrinsic::x86_avx2_permps:
// Operands intentionally swapped. Mask is last operand to intrinsic,
// but second operand for node/instruction.
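The intrinsics keep their vXi32 operand types in IR; only the X86ISD node type changed, which is why the cases added above wrap each operand in DAG.getBitcast. For reference, the (unchanged) IR shape of one affected intrinsic, with a hypothetical caller:

declare <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32>, <4 x i32>)

; The <4 x i32> operands are bitcast to <2 x i64> during lowering before
; the X86ISD::PMULUDQ node is built.
define <2 x i64> @use_pmuludq(<4 x i32> %a, <4 x i32> %b) {
  %r = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %a, <4 x i32> %b)
  ret <2 x i64> %r
}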
SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
// Multiply the even parts.
- SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
+ SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
+ DAG.getBitcast(MVT::v2i64, A),
+ DAG.getBitcast(MVT::v2i64, B));
// Now multiply odd parts.
- SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
+ SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
+ DAG.getBitcast(MVT::v2i64, Aodds),
+ DAG.getBitcast(MVT::v2i64, Bodds));
Evens = DAG.getBitcast(VT, Evens);
Odds = DAG.getBitcast(VT, Odds);
assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
"Only know how to lower V2I64/V4I64/V8I64 multiply");
- // 32-bit vector types used for MULDQ/MULUDQ.
- MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
-
// MULDQ returns the 64-bit result of the signed multiplication of the lower
// 32-bits. We can lower with this if the sign bits stretch that far.
if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
DAG.ComputeNumSignBits(B) > 32) {
- return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A),
- DAG.getBitcast(MulVT, B));
+ return DAG.getNode(X86ISD::PMULDQ, dl, VT, A, B);
}
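In IR terms, the ComputeNumSignBits check corresponds to both multiplicands being sign extensions of at-most-32-bit values, e.g. (illustrative function, assuming SSE4.1 so pmuldq is available):

; Both operands have more than 32 sign bits, so the low 32 bits of each
; lane carry the full value and a single pmuldq computes the exact
; 64-bit product.
define <2 x i64> @mul_of_sext(<2 x i32> %a, <2 x i32> %b) {
  %sa = sext <2 x i32> %a to <2 x i64>
  %sb = sext <2 x i32> %b to <2 x i64>
  %r = mul <2 x i64> %sa, %sb
  ret <2 x i64> %r
}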
// Ahi = psrlqi(a, 32);
if (Subtarget.hasDQI() && (!AHiIsZero || !BHiIsZero))
return Op;
- // Bit cast to 32-bit vectors for MULUDQ.
- SDValue Alo = DAG.getBitcast(MulVT, A);
- SDValue Blo = DAG.getBitcast(MulVT, B);
-
SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
// Only multiply lo/hi halves that aren't known to be zero.
SDValue AloBlo = Zero;
if (!ALoIsZero && !BLoIsZero)
- AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo);
+ AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
SDValue AloBhi = Zero;
if (!ALoIsZero && !BHiIsZero) {
SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
- Bhi = DAG.getBitcast(MulVT, Bhi);
- AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi);
+ AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
}
SDValue AhiBlo = Zero;
if (!AHiIsZero && !BLoIsZero) {
SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
- Ahi = DAG.getBitcast(MulVT, Ahi);
- AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo);
+ AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
}
SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
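For reference, the AloBlo/AloBhi/AhiBlo structure above is the standard three-partial-product expansion of a 64-bit multiply; a scalar sketch (hypothetical function name):

; A*B mod 2^64 = Alo*Blo + ((Alo*Bhi + Ahi*Blo) << 32), where each partial
; product is a 32x32->64 multiply, i.e. exactly what PMULUDQ provides.
define i64 @mul64_partial_products(i64 %a, i64 %b) {
  %alo = and i64 %a, 4294967295
  %ahi = lshr i64 %a, 32
  %blo = and i64 %b, 4294967295
  %bhi = lshr i64 %b, 32
  %lolo = mul i64 %alo, %blo   ; PMULUDQ(A, B)
  %lohi = mul i64 %alo, %bhi   ; PMULUDQ(A, B >> 32)
  %hilo = mul i64 %ahi, %blo   ; PMULUDQ(A >> 32, B)
  %cross = add i64 %lohi, %hilo
  %hi = shl i64 %cross, 32
  %r = add i64 %lolo, %hi
  ret i64 %r
}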
(!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
// PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
// => <2 x i64> <ae|cg>
- SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
+ SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
+ DAG.getBitcast(MulVT, Op0),
+ DAG.getBitcast(MulVT, Op1)));
// PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
// => <2 x i64> <bf|dh>
- SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
+ SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
+ DAG.getBitcast(MulVT, Odd0),
+ DAG.getBitcast(MulVT, Odd1)));
// Shuffle it back into the right order.
SmallVector<int, 16> HighMask(NumElts);
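The even/odd PMULUDQ pair built above implements a widening multiply whose high halves are then re-interleaved; semantically, for the unsigned path it computes (sketch with a hypothetical function name):

; Per-lane high half of the 32x32->64 unsigned product; the lowering above
; gets the same result from two pmuludqs plus a shuffle.
define <4 x i32> @mulhu_v4i32(<4 x i32> %a, <4 x i32> %b) {
  %za = zext <4 x i32> %a to <4 x i64>
  %zb = zext <4 x i32> %b to <4 x i64>
  %m = mul <4 x i64> %za, %zb
  %hi = lshr <4 x i64> %m, <i64 32, i64 32, i64 32, i64 32>
  %r = trunc <4 x i64> %hi to <4 x i32>
  ret <4 x i32> %r
}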
HasBWI, 1>, T8PD;
defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", X86avg,
SSE_INTALU_ITINS_P, HasBWI, 1>;
+defm VPMULDQ : avx512_binop_rm_vl_q<0x28, "vpmuldq", X86pmuldq,
+ SSE_INTMUL_ITINS_P, HasAVX512, 1>, T8PD;
+defm VPMULUDQ : avx512_binop_rm_vl_q<0xF4, "vpmuludq", X86pmuludq,
+ SSE_INTMUL_ITINS_P, HasAVX512, 1>;
multiclass avx512_binop_all<bits<8> opc, string OpcodeStr, OpndItins itins,
AVX512VLVectorVTInfo _SrcVTInfo, AVX512VLVectorVTInfo _DstVTInfo,
}
}
-defm VPMULDQ : avx512_binop_all<0x28, "vpmuldq", SSE_INTMUL_ITINS_P,
- avx512vl_i32_info, avx512vl_i64_info,
- X86pmuldq, HasAVX512, 1>,T8PD;
-defm VPMULUDQ : avx512_binop_all<0xF4, "vpmuludq", SSE_INTMUL_ITINS_P,
- avx512vl_i32_info, avx512vl_i64_info,
- X86pmuludq, HasAVX512, 1>;
defm VPMULTISHIFTQB : avx512_binop_all<0x83, "vpmultishiftqb", SSE_INTALU_ITINS_P,
avx512vl_i8_info, avx512vl_i8_info,
X86multishift, HasVBMI, 0>, T8PD;
def X86pmuludq : SDNode<"X86ISD::PMULUDQ",
SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>,
- SDTCVecEltisVT<1, i32>,
- SDTCisSameSizeAs<0,1>,
+ SDTCisSameAs<0,1>,
SDTCisSameAs<1,2>]>,
[SDNPCommutative]>;
def X86pmuldq : SDNode<"X86ISD::PMULDQ",
SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>,
- SDTCVecEltisVT<1, i32>,
- SDTCisSameSizeAs<0,1>,
+ SDTCisSameAs<0,1>,
SDTCisSameAs<1,2>]>,
[SDNPCommutative]>;
SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16,
SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64,
+ SSE_INTMUL_ITINS_P, 1, NoVLX>;
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
memopv2i64, i128mem, SSE_INTALU_ITINS_P>;
-let Predicates = [HasAVX, NoVLX] in
-defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128,
- loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 0>,
- VEX_4V, VEX_WIG;
-let Predicates = [HasAVX2, NoVLX] in
-defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32,
- VR256, loadv4i64, i256mem,
- SSE_INTMUL_ITINS_P, 0>, VEX_4V, VEX_L, VEX_WIG;
-let Constraints = "$src1 = $dst" in
-defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128,
- memopv2i64, i128mem, SSE_INTMUL_ITINS_P>;
-
//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Logical Instructions
//===---------------------------------------------------------------------===//
itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
-/// SS48I_binop_rm2 - Simple SSE41 binary operator with different src and dst
-/// types.
-multiclass SS48I_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
- ValueType DstVT, ValueType SrcVT, RegisterClass RC,
- PatFrag memop_frag, X86MemOperand x86memop,
- OpndItins itins,
- bit IsCommutable = 0, bit Is2Addr = 1> {
- let isCommutable = IsCommutable in
- def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
- (ins RC:$src1, RC:$src2),
- !if(Is2Addr,
- !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))], itins.rr>,
- Sched<[itins.Sched]>;
- def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
- (ins RC:$src1, x86memop:$src2),
- !if(Is2Addr,
- !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
- (bitconvert (memop_frag addr:$src2)))))],
- itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
-}
-
let Predicates = [HasAVX, NoVLX] in {
defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V, VEX_WIG;
- defm VPMULDQ : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v2i64, v4i32,
- VR128, loadv2i64, i128mem,
- SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_WIG;
+ defm VPMULDQ : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128,
+ loadv2i64, i128mem, 0, SSE_INTMUL_ITINS_P>,
+ VEX_4V, VEX_WIG;
}
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V, VEX_L, VEX_WIG;
- defm VPMULDQY : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v4i64, v8i32,
- VR256, loadv4i64, i256mem,
- SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L, VEX_WIG;
+ defm VPMULDQY : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256,
+ loadv4i64, i256mem, 0, SSE_INTMUL_ITINS_P>,
+ VEX_4V, VEX_L, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128,
memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
- defm PMULDQ : SS48I_binop_rm2<0x28, "pmuldq", X86pmuldq, v2i64, v4i32,
- VR128, memopv2i64, i128mem,
- SSE_INTMUL_ITINS_P, 1>;
+ defm PMULDQ : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128,
+ memopv2i64, i128mem, 1, SSE_INTMUL_ITINS_P>;
}
let Predicates = [HasAVX, NoVLX] in
def : Pat<(v4i32 (add (mul (v4i32 VR128:$src1), (v4i32 VR128:$src2)),
(v4i32 VR128:$src3))),
(VPMACSDDrr VR128:$src1, VR128:$src2, VR128:$src3)>;
- def : Pat<(v2i64 (add (X86pmuldq (X86PShufd (v4i32 VR128:$src1), (i8 -11)),
- (X86PShufd (v4i32 VR128:$src2), (i8 -11))),
+ def : Pat<(v2i64 (add (X86pmuldq (bc_v2i64 (X86PShufd (v4i32 VR128:$src1), (i8 -11))),
+ (bc_v2i64 (X86PShufd (v4i32 VR128:$src2), (i8 -11)))),
(v2i64 VR128:$src3))),
(VPMACSDQHrr VR128:$src1, VR128:$src2, VR128:$src3)>;
- def : Pat<(v2i64 (add (X86pmuldq (v4i32 VR128:$src1), (v4i32 VR128:$src2)),
+ def : Pat<(v2i64 (add (X86pmuldq (v2i64 VR128:$src1), (v2i64 VR128:$src2)),
(v2i64 VR128:$src3))),
(VPMACSDQLrr VR128:$src1, VR128:$src2, VR128:$src3)>;
def : Pat<(v4i32 (add (X86vpmaddwd (v8i16 VR128:$src1), (v8i16 VR128:$src2)),
X86_INTRINSIC_DATA(avx2_pmadd_ub_sw, INTR_TYPE_2OP, X86ISD::VPMADDUBSW, 0),
X86_INTRINSIC_DATA(avx2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0),
X86_INTRINSIC_DATA(avx2_pmovmskb, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
- X86_INTRINSIC_DATA(avx2_pmul_dq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
X86_INTRINSIC_DATA(avx2_pmul_hr_sw, INTR_TYPE_2OP, X86ISD::MULHRS, 0),
X86_INTRINSIC_DATA(avx2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
X86_INTRINSIC_DATA(avx2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0),
- X86_INTRINSIC_DATA(avx2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
X86_INTRINSIC_DATA(avx2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
X86_INTRINSIC_DATA(avx2_pshuf_b, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
X86_INTRINSIC_DATA(avx2_psll_d, INTR_TYPE_2OP, X86ISD::VSHL, 0),
X86_INTRINSIC_DATA(avx512_packsswb_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(avx512_packusdw_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
X86_INTRINSIC_DATA(avx512_packuswb_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
- X86_INTRINSIC_DATA(avx512_pmul_dq_512, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
X86_INTRINSIC_DATA(avx512_pmul_hr_sw_512, INTR_TYPE_2OP, X86ISD::MULHRS, 0),
X86_INTRINSIC_DATA(avx512_pmulh_w_512, INTR_TYPE_2OP, ISD::MULHS, 0),
X86_INTRINSIC_DATA(avx512_pmulhu_w_512, INTR_TYPE_2OP, ISD::MULHU, 0),
- X86_INTRINSIC_DATA(avx512_pmulu_dq_512, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
X86_INTRINSIC_DATA(avx512_psad_bw_512, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
X86_INTRINSIC_DATA(avx512_pshuf_b_512, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
X86_INTRINSIC_DATA(avx512_psll_d_512, INTR_TYPE_2OP, X86ISD::VSHL, 0),
X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
X86_INTRINSIC_DATA(sse2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0),
- X86_INTRINSIC_DATA(sse2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
X86_INTRINSIC_DATA(sse2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
X86_INTRINSIC_DATA(sse2_psll_d, INTR_TYPE_2OP, X86ISD::VSHL, 0),
X86_INTRINSIC_DATA(sse2_psll_q, INTR_TYPE_2OP, X86ISD::VSHL, 0),
X86_INTRINSIC_DATA(sse41_insertps, INTR_TYPE_3OP, X86ISD::INSERTPS, 0),
X86_INTRINSIC_DATA(sse41_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
X86_INTRINSIC_DATA(sse41_phminposuw, INTR_TYPE_1OP, X86ISD::PHMINPOS, 0),
- X86_INTRINSIC_DATA(sse41_pmuldq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
X86_INTRINSIC_DATA(sse41_round_pd, ROUNDP, X86ISD::VRNDSCALE, 0),
X86_INTRINSIC_DATA(sse41_round_ps, ROUNDP, X86ISD::VRNDSCALE, 0),
X86_INTRINSIC_DATA(sse41_round_sd, ROUNDS, X86ISD::VRNDSCALES, 0),
;
; SSE41-LABEL: test_div7_4i32:
; SSE41: # %bb.0:
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pmuldq %xmm2, %xmm3
+; SSE41-NEXT: pmuldq %xmm1, %xmm2
; SSE41-NEXT: pmuldq %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; SSE41-NEXT: paddd %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psrld $31, %xmm0
;
; AVX1-LABEL: test_div7_4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
+; AVX1-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrld $31, %xmm0, %xmm1
; AVX1-NEXT: vpsrad $2, %xmm0, %xmm0
;
; SSE41-LABEL: test_rem7_4i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pmuldq %xmm2, %xmm3
-; SSE41-NEXT: pmuldq %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; SSE41-NEXT: paddd %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrld $31, %xmm2
-; SSE41-NEXT: psrad $2, %xmm1
-; SSE41-NEXT: paddd %xmm2, %xmm1
-; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
-; SSE41-NEXT: psubd %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
+; SSE41-NEXT: pmuldq %xmm2, %xmm1
+; SSE41-NEXT: pmuldq %xmm0, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; SSE41-NEXT: paddd %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psrld $31, %xmm1
+; SSE41-NEXT: psrad $2, %xmm2
+; SSE41-NEXT: paddd %xmm1, %xmm2
+; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
+; SSE41-NEXT: psubd %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_rem7_4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
+; AVX1-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vpsrld $31, %xmm1, %xmm2
; AVX1-NEXT: vpsrad $2, %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
;
; SSE41-LABEL: test_div7_4i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pmuludq %xmm2, %xmm3
-; SSE41-NEXT: pmuludq %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; SSE41-NEXT: psubd %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
+; SSE41-NEXT: pmuludq %xmm2, %xmm1
+; SSE41-NEXT: pmuludq %xmm0, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; SSE41-NEXT: psubd %xmm2, %xmm0
; SSE41-NEXT: psrld $1, %xmm0
-; SSE41-NEXT: paddd %xmm1, %xmm0
+; SSE41-NEXT: paddd %xmm2, %xmm0
; SSE41-NEXT: psrld $2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_div7_4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
+; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
;
; SSE41-LABEL: test_rem7_4i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pmuludq %xmm2, %xmm3
-; SSE41-NEXT: pmuludq %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psubd %xmm1, %xmm2
-; SSE41-NEXT: psrld $1, %xmm2
-; SSE41-NEXT: paddd %xmm1, %xmm2
-; SSE41-NEXT: psrld $2, %xmm2
-; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; SSE41-NEXT: psubd %xmm2, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
+; SSE41-NEXT: pmuludq %xmm2, %xmm1
+; SSE41-NEXT: pmuludq %xmm0, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psubd %xmm2, %xmm1
+; SSE41-NEXT: psrld $1, %xmm1
+; SSE41-NEXT: paddd %xmm2, %xmm1
+; SSE41-NEXT: psrld $2, %xmm1
+; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
+; SSE41-NEXT: psubd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_rem7_4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
+; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
;
; SSE41-LABEL: PR20355:
; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766]
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pmuldq %xmm2, %xmm3
-; SSE41-NEXT: pmuldq %xmm1, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1431655766,1431655766,1431655766,1431655766]
+; SSE41-NEXT: pmuldq %xmm2, %xmm1
+; SSE41-NEXT: pmuldq %xmm2, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: psrld $31, %xmm0
-; SSE41-NEXT: paddd %xmm1, %xmm0
+; SSE41-NEXT: paddd %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: PR20355:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1431655766,1431655766,1431655766,1431655766]
+; AVX1-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT: vpsrld $31, %xmm0, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
define <2 x i64> @mul_v2i64_neg_15_63(<2 x i64> %a0) nounwind {
; X86-LABEL: mul_v2i64_neg_15_63:
; X86: # %bb.0:
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [4294967281,4294967295,4294967233,4294967295]
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: pmuludq %xmm1, %xmm2
-; X86-NEXT: movdqa %xmm0, %xmm3
+; X86-NEXT: movdqa %xmm0, %xmm1
+; X86-NEXT: psrlq $32, %xmm1
+; X86-NEXT: movdqa {{.*#+}} xmm2 = [4294967281,4294967295,4294967233,4294967295]
+; X86-NEXT: pmuludq %xmm2, %xmm1
+; X86-NEXT: movdqa %xmm2, %xmm3
; X86-NEXT: psrlq $32, %xmm3
-; X86-NEXT: pmuludq %xmm1, %xmm3
-; X86-NEXT: pmuludq {{\.LCPI.*}}, %xmm0
+; X86-NEXT: pmuludq %xmm0, %xmm3
+; X86-NEXT: paddq %xmm1, %xmm3
+; X86-NEXT: psllq $32, %xmm3
+; X86-NEXT: pmuludq %xmm2, %xmm0
; X86-NEXT: paddq %xmm3, %xmm0
-; X86-NEXT: psllq $32, %xmm0
-; X86-NEXT: paddq %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: mul_v2i64_neg_15_63:
define <2 x i64> @mul_v2i64_neg_17_65(<2 x i64> %a0) nounwind {
; X86-LABEL: mul_v2i64_neg_17_65:
; X86: # %bb.0:
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [4294967279,4294967295,4294967231,4294967295]
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: pmuludq %xmm1, %xmm2
-; X86-NEXT: movdqa %xmm0, %xmm3
+; X86-NEXT: movdqa %xmm0, %xmm1
+; X86-NEXT: psrlq $32, %xmm1
+; X86-NEXT: movdqa {{.*#+}} xmm2 = [4294967279,4294967295,4294967231,4294967295]
+; X86-NEXT: pmuludq %xmm2, %xmm1
+; X86-NEXT: movdqa %xmm2, %xmm3
; X86-NEXT: psrlq $32, %xmm3
-; X86-NEXT: pmuludq %xmm1, %xmm3
-; X86-NEXT: pmuludq {{\.LCPI.*}}, %xmm0
+; X86-NEXT: pmuludq %xmm0, %xmm3
+; X86-NEXT: paddq %xmm1, %xmm3
+; X86-NEXT: psllq $32, %xmm3
+; X86-NEXT: pmuludq %xmm2, %xmm0
; X86-NEXT: paddq %xmm3, %xmm0
-; X86-NEXT: psllq $32, %xmm0
-; X86-NEXT: paddq %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: mul_v2i64_neg_17_65:
define <2 x i64> @mul_v2i64_neg_0_1(<2 x i64> %a0) nounwind {
; X86-LABEL: mul_v2i64_neg_0_1:
; X86: # %bb.0:
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [0,0,4294967295,4294967295]
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: pmuludq %xmm1, %xmm2
-; X86-NEXT: movdqa %xmm0, %xmm3
+; X86-NEXT: movdqa %xmm0, %xmm1
+; X86-NEXT: psrlq $32, %xmm1
+; X86-NEXT: movdqa {{.*#+}} xmm2 = [0,0,4294967295,4294967295]
+; X86-NEXT: pmuludq %xmm2, %xmm1
+; X86-NEXT: movdqa %xmm2, %xmm3
; X86-NEXT: psrlq $32, %xmm3
-; X86-NEXT: pmuludq %xmm1, %xmm3
-; X86-NEXT: pmuludq {{\.LCPI.*}}, %xmm0
+; X86-NEXT: pmuludq %xmm0, %xmm3
+; X86-NEXT: paddq %xmm1, %xmm3
+; X86-NEXT: psllq $32, %xmm3
+; X86-NEXT: pmuludq %xmm2, %xmm0
; X86-NEXT: paddq %xmm3, %xmm0
-; X86-NEXT: psllq $32, %xmm0
-; X86-NEXT: paddq %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: mul_v2i64_neg_0_1:
define <2 x i64> @mul_v2i64_15_neg_63(<2 x i64> %a0) nounwind {
; X86-LABEL: mul_v2i64_15_neg_63:
; X86: # %bb.0:
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [15,0,4294967233,4294967295]
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: pmuludq %xmm1, %xmm2
-; X86-NEXT: movdqa %xmm0, %xmm3
+; X86-NEXT: movdqa %xmm0, %xmm1
+; X86-NEXT: psrlq $32, %xmm1
+; X86-NEXT: movdqa {{.*#+}} xmm2 = [15,0,4294967233,4294967295]
+; X86-NEXT: pmuludq %xmm2, %xmm1
+; X86-NEXT: movdqa %xmm2, %xmm3
; X86-NEXT: psrlq $32, %xmm3
-; X86-NEXT: pmuludq %xmm1, %xmm3
-; X86-NEXT: pmuludq {{\.LCPI.*}}, %xmm0
+; X86-NEXT: pmuludq %xmm0, %xmm3
+; X86-NEXT: paddq %xmm1, %xmm3
+; X86-NEXT: psllq $32, %xmm3
+; X86-NEXT: pmuludq %xmm2, %xmm0
; X86-NEXT: paddq %xmm3, %xmm0
-; X86-NEXT: psllq $32, %xmm0
-; X86-NEXT: paddq %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: mul_v2i64_15_neg_63:
define void @test3(<4 x i32> %induction30, <4 x i16>* %tmp16, <4 x i16>* %tmp17, <4 x i16> %tmp3, <4 x i16> %tmp12) {
; AVX1-LABEL: test3:
; AVX1: ## %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1431655766,1431655766,1431655766,1431655766]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuldq %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpmuldq %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1431655766,1431655766,1431655766,1431655766]
+; AVX1-NEXT: vpmuldq %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpmuldq %xmm4, %xmm0, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
; AVX1-NEXT: vpsrld $31, %xmm3, %xmm4
; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3