This helper provides a more complete approach for lowering to X86ISD::PACKSS/PACKUS nodes, checking whether the operands already have suitable sign/zero extension before recreating it.
It can also optionally pack the upper half of each element instead of the lower half.
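As a rough illustration (a hypothetical caller, not part of this patch): packing the low 16 bits of two v4i32 values into one v8i16 reduces to a single call to the new helper. With SSE4.1 and provably zero upper bits it folds to PACKUSDW, with sufficient sign bits it folds to PACKSSDW, and otherwise the pre-SSE4.1 fallback shifts each operand (VSHLI then VSRAI) and emits PACKSSDW, which is what produces the pslld/psrad/packssdw sequences in the test diffs below.

  // Illustrative sketch only; the function name and operand names are made up.
  static SDValue packLoI32ToI16(SelectionDAG &DAG, const X86Subtarget &Subtarget,
                                const SDLoc &dl, SDValue Lo, SDValue Hi) {
    // Lo/Hi are v4i32; keep the low 16 bits of each element and pack to v8i16.
    return getPack(DAG, Subtarget, dl, MVT::v8i16, Lo, Hi);
  }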
return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}
+/// Returns a node that packs the LHS + RHS nodes together at half width.
+/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
+/// TODO: Add vXi64 -> vXi32 pack support with vector_shuffle node.
+/// TODO: Add subvector splitting if/when we have a need for it.
+static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
+ const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
+ bool PackHiHalf = false) {
+ MVT OpVT = LHS.getSimpleValueType();
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
+ assert(OpVT == RHS.getSimpleValueType() &&
+ VT.getSizeInBits() == OpVT.getSizeInBits() &&
+ (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
+ "Unexpected PACK operand types");
+ assert((EltSizeInBits == 8 || EltSizeInBits == 16) &&
+ "Unexpected PACK result type");
+
+ // See if we already have sufficient leading bits for PACKSS/PACKUS.
+ if (!PackHiHalf) {
+ if (UsePackUS &&
+ DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
+ DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
+ return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
+
+ if (DAG.ComputeMinSignedBits(LHS) <= EltSizeInBits &&
+ DAG.ComputeMinSignedBits(RHS) <= EltSizeInBits)
+ return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
+ }
+
+ // Fall back to sign/zero-extending the requested half and packing.
+ SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
+ if (UsePackUS) {
+ if (PackHiHalf) {
+ LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
+ RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
+ } else {
+ SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
+ LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
+ RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
+ }
+ return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
+ }
+
+ if (!PackHiHalf) {
+ LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
+ RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
+ }
+ LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
+ RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
+ return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
+}
+
/// Return a vector_shuffle of the specified vector of zero or undef vector.
/// This produces a shuffle where the low element of V2 is swizzled into the
/// zero/undef vector, landing at element Idx.
// Multiply, mask the lower 8bits of the lo/hi results and pack.
SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
- RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
- RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
- return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
+ return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
}
// Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
- if (Low) {
- // Mask the lower bits and pack the results to rejoin the halves.
- SDValue Mask = DAG.getConstant(255, dl, ExVT);
- SDValue LLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, Mask);
- SDValue LHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, Mask);
- *Low = DAG.getNode(X86ISD::PACKUS, dl, VT, LLo, LHi);
- }
+ if (Low)
+ *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
- RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RLo, 8, DAG);
- RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RHi, 8, DAG);
-
- // Bitcast back to VT and then pack all the even elements from Lo and Hi.
- return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
+ return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
}
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
if (Subtarget.hasSSE41())
return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
-
- return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, Lo),
- DAG.getBitcast(VT, Hi),
- {0, 2, 4, 6, 8, 10, 12, 14});
+ return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
}
return SDValue();
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
; SSE2-NEXT: paddd %xmm5, %xmm4
; SSE2-NEXT: cvttps2dq %xmm4, %xmm4
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE2-NEXT: pslld $16, %xmm4
+; SSE2-NEXT: psrad $16, %xmm4
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pslld $23, %xmm2
; SSE2-NEXT: paddd %xmm5, %xmm2
; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; SSE2-NEXT: pslld $16, %xmm2
+; SSE2-NEXT: psrad $16, %xmm2
+; SSE2-NEXT: packssdw %xmm4, %xmm2
; SSE2-NEXT: pmullw %xmm2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
; X86-SSE2-NEXT: paddd %xmm5, %xmm4
; X86-SSE2-NEXT: cvttps2dq %xmm4, %xmm4
-; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
-; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; X86-SSE2-NEXT: pslld $16, %xmm4
+; X86-SSE2-NEXT: psrad $16, %xmm4
; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; X86-SSE2-NEXT: pslld $23, %xmm2
; X86-SSE2-NEXT: paddd %xmm5, %xmm2
; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm2
-; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; X86-SSE2-NEXT: pslld $16, %xmm2
+; X86-SSE2-NEXT: psrad $16, %xmm2
+; X86-SSE2-NEXT: packssdw %xmm4, %xmm2
; X86-SSE2-NEXT: pmullw %xmm2, %xmm0
; X86-SSE2-NEXT: por %xmm1, %xmm0
; X86-SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; SSE2-NEXT: paddd %xmm3, %xmm2
; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: pslld $16, %xmm2
+; SSE2-NEXT: psrad $16, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pslld $23, %xmm1
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: packssdw %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pmulhuw %xmm1, %xmm2
; SSE2-NEXT: pmullw %xmm1, %xmm0
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; X86-SSE2-NEXT: paddd %xmm3, %xmm2
; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm2
-; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X86-SSE2-NEXT: pslld $16, %xmm2
+; X86-SSE2-NEXT: psrad $16, %xmm2
; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; X86-SSE2-NEXT: pslld $23, %xmm1
; X86-SSE2-NEXT: paddd %xmm3, %xmm1
; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X86-SSE2-NEXT: pslld $16, %xmm1
+; X86-SSE2-NEXT: psrad $16, %xmm1
+; X86-SSE2-NEXT: packssdw %xmm2, %xmm1
; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
; X86-SSE2-NEXT: pmulhuw %xmm1, %xmm2
; X86-SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
; SSE2-NEXT: paddd %xmm5, %xmm1
; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pslld $23, %xmm2
; SSE2-NEXT: paddd %xmm5, %xmm2
; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE2-NEXT: pslld $16, %xmm2
+; SSE2-NEXT: psrad $16, %xmm2
+; SSE2-NEXT: packssdw %xmm1, %xmm2
; SSE2-NEXT: psllw $1, %xmm0
; SSE2-NEXT: pmullw %xmm2, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
; X86-SSE2-NEXT: paddd %xmm5, %xmm1
; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X86-SSE2-NEXT: pslld $16, %xmm1
+; X86-SSE2-NEXT: psrad $16, %xmm1
; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; X86-SSE2-NEXT: pslld $23, %xmm2
; X86-SSE2-NEXT: paddd %xmm5, %xmm2
; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm2
-; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; X86-SSE2-NEXT: pslld $16, %xmm2
+; X86-SSE2-NEXT: psrad $16, %xmm2
+; X86-SSE2-NEXT: packssdw %xmm1, %xmm2
; X86-SSE2-NEXT: psllw $1, %xmm0
; X86-SSE2-NEXT: pmullw %xmm2, %xmm0
; X86-SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pslld $23, %xmm2
; SSE2-NEXT: paddd %xmm3, %xmm2
; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE2-NEXT: pslld $16, %xmm2
+; SSE2-NEXT: psrad $16, %xmm2
+; SSE2-NEXT: packssdw %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pmulhuw %xmm2, %xmm1
; SSE2-NEXT: pmullw %xmm2, %xmm0
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; X86-SSE2-NEXT: paddd %xmm3, %xmm1
; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X86-SSE2-NEXT: pslld $16, %xmm1
+; X86-SSE2-NEXT: psrad $16, %xmm1
; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; X86-SSE2-NEXT: pslld $23, %xmm2
; X86-SSE2-NEXT: paddd %xmm3, %xmm2
; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm2
-; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; X86-SSE2-NEXT: pslld $16, %xmm2
+; X86-SSE2-NEXT: psrad $16, %xmm2
+; X86-SSE2-NEXT: packssdw %xmm1, %xmm2
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: pmulhuw %xmm2, %xmm1
; X86-SSE2-NEXT: pmullw %xmm2, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; SSE2-NEXT: paddd %xmm3, %xmm2
; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: pslld $16, %xmm2
+; SSE2-NEXT: psrad $16, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pslld $23, %xmm1
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: packssdw %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pmulhuw %xmm1, %xmm2
; SSE2-NEXT: pmullw %xmm1, %xmm0
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; X86-SSE2-NEXT: paddd %xmm3, %xmm2
; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm2
-; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X86-SSE2-NEXT: pslld $16, %xmm2
+; X86-SSE2-NEXT: psrad $16, %xmm2
; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; X86-SSE2-NEXT: pslld $23, %xmm1
; X86-SSE2-NEXT: paddd %xmm3, %xmm1
; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X86-SSE2-NEXT: pslld $16, %xmm1
+; X86-SSE2-NEXT: psrad $16, %xmm1
+; X86-SSE2-NEXT: packssdw %xmm2, %xmm1
; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
; X86-SSE2-NEXT: pmulhuw %xmm1, %xmm2
; X86-SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; SSE2-NEXT: paddd %xmm3, %xmm2
; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: pslld $16, %xmm2
+; SSE2-NEXT: psrad $16, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pslld $23, %xmm1
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: packssdw %xmm2, %xmm1
; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; X86-SSE-NEXT: paddd %xmm3, %xmm2
; X86-SSE-NEXT: cvttps2dq %xmm2, %xmm2
-; X86-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; X86-SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X86-SSE-NEXT: pslld $16, %xmm2
+; X86-SSE-NEXT: psrad $16, %xmm2
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; X86-SSE-NEXT: pslld $23, %xmm1
; X86-SSE-NEXT: paddd %xmm3, %xmm1
; X86-SSE-NEXT: cvttps2dq %xmm1, %xmm1
-; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; X86-SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X86-SSE-NEXT: pslld $16, %xmm1
+; X86-SSE-NEXT: psrad $16, %xmm1
+; X86-SSE-NEXT: packssdw %xmm2, %xmm1
; X86-SSE-NEXT: pmullw %xmm1, %xmm0
; X86-SSE-NEXT: retl
%shift = shl <8 x i16> %a, %b
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; SSE2-NEXT: paddd %xmm3, %xmm2
; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: pslld $16, %xmm2
+; SSE2-NEXT: psrad $16, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pslld $23, %xmm1
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: packssdw %xmm2, %xmm1
; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; X86-SSE-NEXT: paddd %xmm3, %xmm2
; X86-SSE-NEXT: cvttps2dq %xmm2, %xmm2
-; X86-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; X86-SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X86-SSE-NEXT: pslld $16, %xmm2
+; X86-SSE-NEXT: psrad $16, %xmm2
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; X86-SSE-NEXT: pslld $23, %xmm1
; X86-SSE-NEXT: paddd %xmm3, %xmm1
; X86-SSE-NEXT: cvttps2dq %xmm1, %xmm1
-; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; X86-SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X86-SSE-NEXT: pslld $16, %xmm1
+; X86-SSE-NEXT: psrad $16, %xmm1
+; X86-SSE-NEXT: packssdw %xmm2, %xmm1
; X86-SSE-NEXT: pmullw %xmm1, %xmm0
; X86-SSE-NEXT: retl
%shift = shl <4 x i16> %a, %b
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; SSE2-NEXT: paddd %xmm3, %xmm2
; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: pslld $16, %xmm2
+; SSE2-NEXT: psrad $16, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pslld $23, %xmm1
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: packssdw %xmm2, %xmm1
; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; X86-SSE-NEXT: paddd %xmm3, %xmm2
; X86-SSE-NEXT: cvttps2dq %xmm2, %xmm2
-; X86-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; X86-SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X86-SSE-NEXT: pslld $16, %xmm2
+; X86-SSE-NEXT: psrad $16, %xmm2
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; X86-SSE-NEXT: pslld $23, %xmm1
; X86-SSE-NEXT: paddd %xmm3, %xmm1
; X86-SSE-NEXT: cvttps2dq %xmm1, %xmm1
-; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; X86-SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X86-SSE-NEXT: pslld $16, %xmm1
+; X86-SSE-NEXT: psrad $16, %xmm1
+; X86-SSE-NEXT: packssdw %xmm2, %xmm1
; X86-SSE-NEXT: pmullw %xmm1, %xmm0
; X86-SSE-NEXT: retl
%shift = shl <2 x i16> %a, %b