From c3a772fdf51b3bde8def94610eeae6f0978dd77f Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sun, 14 Nov 2021 21:27:15 +0000
Subject: [PATCH] [X86] Add getPack helper

This helper provides a more complete approach for lowering to
X86ISD::PACKSS/PACKUS nodes - testing for existing suitable sign/zero
extension before recreating it.

It also optionally packs the upper half instead of the lower half.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp          | 76 ++++++++++++++++++------
 llvm/test/CodeGen/X86/vector-fshl-128.ll         | 24 ++++----
 llvm/test/CodeGen/X86/vector-fshl-rot-128.ll     | 24 ++++----
 llvm/test/CodeGen/X86/vector-fshr-128.ll         | 24 ++++----
 llvm/test/CodeGen/X86/vector-fshr-rot-128.ll     | 24 ++++----
 llvm/test/CodeGen/X86/vector-rotate-128.ll       | 24 ++++----
 llvm/test/CodeGen/X86/vector-shift-shl-128.ll    | 24 ++++----
 llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll | 48 +++++++--------
 8 files changed, 137 insertions(+), 131 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 3c45a60..62a3329 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -6750,6 +6750,58 @@ static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
   return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
 }
 
+/// Returns a node that packs the LHS + RHS nodes together at half width.
+/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
+/// TODO: Add vXi64 -> vXi32 pack support with vector_shuffle node.
+/// TODO: Add subvector splitting if/when we have a need for it.
+static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
+                       const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
+                       bool PackHiHalf = false) {
+  MVT OpVT = LHS.getSimpleValueType();
+  unsigned EltSizeInBits = VT.getScalarSizeInBits();
+  bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
+  assert(OpVT == RHS.getSimpleValueType() &&
+         VT.getSizeInBits() == OpVT.getSizeInBits() &&
+         (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
+         "Unexpected PACK operand types");
+  assert((EltSizeInBits == 8 || EltSizeInBits == 16) &&
+         "Unexpected PACK result type");
+
+  // See if we already have sufficient leading bits for PACKSS/PACKUS.
+  if (!PackHiHalf) {
+    if (UsePackUS &&
+        DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
+        DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
+      return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
+
+    if (DAG.ComputeMinSignedBits(LHS) <= EltSizeInBits &&
+        DAG.ComputeMinSignedBits(RHS) <= EltSizeInBits)
+      return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
+  }
+
+  // Fallback to sign/zero extending the requested half and pack.
+  SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
+  if (UsePackUS) {
+    if (PackHiHalf) {
+      LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
+      RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
+    } else {
+      SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
+      LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
+      RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
+    }
+    return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
+  }
+
+  if (!PackHiHalf) {
+    LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
+    RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
+  }
+  LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
+  RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
+  return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
+}
+
 /// Return a vector_shuffle of the specified vector of zero or undef vector.
 /// This produces a shuffle where the low element of V2 is swizzled into the
 /// zero/undef vector, landing at element Idx.
@@ -28249,9 +28301,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
     // Multiply, mask the lower 8bits of the lo/hi results and pack.
     SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
     SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
-    RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
-    RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
-    return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
+    return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
   }
 
   // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
@@ -28404,19 +28454,10 @@ static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
   SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
   SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
 
-  if (Low) {
-    // Mask the lower bits and pack the results to rejoin the halves.
-    SDValue Mask = DAG.getConstant(255, dl, ExVT);
-    SDValue LLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, Mask);
-    SDValue LHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, Mask);
-    *Low = DAG.getNode(X86ISD::PACKUS, dl, VT, LLo, LHi);
-  }
+  if (Low)
+    *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
 
-  RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RLo, 8, DAG);
-  RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RHi, 8, DAG);
-
-  // Bitcast back to VT and then pack all the even elements from Lo and Hi.
-  return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
+  return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
 }
 
 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
@@ -29108,10 +29149,7 @@ static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
     Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
     if (Subtarget.hasSSE41())
       return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
-
-    return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, Lo),
-                                DAG.getBitcast(VT, Hi),
-                                {0, 2, 4, 6, 8, 10, 12, 14});
+    return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
   }
 
   return SDValue();
diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll
index 40976e8..5eeca65 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll
@@ -464,17 +464,15 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
 ; SSE2-NEXT:    paddd %xmm5, %xmm4
 ; SSE2-NEXT:    cvttps2dq %xmm4, %xmm4
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE2-NEXT:    pslld $16, %xmm4
+; SSE2-NEXT:    psrad $16, %xmm4
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
 ; SSE2-NEXT:    pslld $23, %xmm2
 ; SSE2-NEXT:    paddd %xmm5, %xmm2
 ; SSE2-NEXT:    cvttps2dq %xmm2, %xmm2
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; SSE2-NEXT:    pslld $16, %xmm2
+; SSE2-NEXT:    psrad $16, %xmm2
+; SSE2-NEXT:    packssdw %xmm4, %xmm2
 ; SSE2-NEXT:    pmullw %xmm2, %xmm0
 ; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    por %xmm3, %xmm0
@@ -708,17 +706,15 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
 ; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
 ; X86-SSE2-NEXT:    paddd %xmm5, %xmm4
 ; X86-SSE2-NEXT:    cvttps2dq %xmm4, %xmm4
-; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
-; X86-SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; X86-SSE2-NEXT:    pslld $16, %xmm4
+; X86-SSE2-NEXT:    psrad $16, %xmm4
 ; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
 ; X86-SSE2-NEXT:    pslld $23, %xmm2
 ; X86-SSE2-NEXT:    paddd %xmm5, %xmm2
 ; X86-SSE2-NEXT:    cvttps2dq %xmm2, %xmm2
-; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; X86-SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; X86-SSE2-NEXT:    pslld $16, %xmm2
+; X86-SSE2-NEXT:    psrad $16, %xmm2
+; X86-SSE2-NEXT:    packssdw %xmm4, %xmm2
 ; X86-SSE2-NEXT:    pmullw %xmm2, %xmm0
 ; X86-SSE2-NEXT:    por %xmm1, %xmm0
 ; X86-SSE2-NEXT:    por %xmm3, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
index 6d0c60c..9a3e607 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
@@ -312,17 +312,15 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind {
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
 ; SSE2-NEXT:    paddd %xmm3, %xmm2
 ; SSE2-NEXT:    cvttps2dq %xmm2, %xmm2
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT:    pslld $16, %xmm2
+; SSE2-NEXT:    psrad $16, %xmm2
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
 ; SSE2-NEXT:    pslld $23, %xmm1
 ; SSE2-NEXT:    paddd %xmm3, %xmm1
 ; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT:    pslld $16, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    packssdw %xmm2, %xmm1
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    pmulhuw %xmm1, %xmm2
 ; SSE2-NEXT:    pmullw %xmm1, %xmm0
@@ -465,17 +463,15 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind {
 ; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
 ; X86-SSE2-NEXT:    paddd %xmm3, %xmm2
 ; X86-SSE2-NEXT:    cvttps2dq %xmm2, %xmm2
-; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; X86-SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X86-SSE2-NEXT:    pslld $16, %xmm2
+; X86-SSE2-NEXT:    psrad $16, %xmm2
 ; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
 ; X86-SSE2-NEXT:    pslld $23, %xmm1
 ; X86-SSE2-NEXT:    paddd %xmm3, %xmm1
 ; X86-SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
-; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; X86-SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X86-SSE2-NEXT:    pslld $16, %xmm1
+; X86-SSE2-NEXT:    psrad $16, %xmm1
+; X86-SSE2-NEXT:    packssdw %xmm2, %xmm1
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; X86-SSE2-NEXT:    pmulhuw %xmm1, %xmm2
 ; X86-SSE2-NEXT:    pmullw %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index ba78dd9..9868f3f 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -464,17 +464,15 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
 ; SSE2-NEXT:    paddd %xmm5, %xmm1
 ; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    pslld $16, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm1
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
 ; SSE2-NEXT:    pslld $23, %xmm2
 ; SSE2-NEXT:    paddd %xmm5, %xmm2
 ; SSE2-NEXT:    cvttps2dq %xmm2, %xmm2
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE2-NEXT:    pslld $16, %xmm2
+; SSE2-NEXT:    psrad $16, %xmm2
+; SSE2-NEXT:    packssdw %xmm1, %xmm2
 ; SSE2-NEXT:    psllw $1, %xmm0
 ; SSE2-NEXT:    pmullw %xmm2, %xmm0
 ; SSE2-NEXT:    por %xmm4, %xmm0
@@ -709,17 +707,15 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
 ; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
 ; X86-SSE2-NEXT:    paddd %xmm5, %xmm1
 ; X86-SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
-; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; X86-SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X86-SSE2-NEXT:    pslld $16, %xmm1
+; X86-SSE2-NEXT:    psrad $16, %xmm1
 ; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
 ; X86-SSE2-NEXT:    pslld $23, %xmm2
 ; X86-SSE2-NEXT:    paddd %xmm5, %xmm2
 ; X86-SSE2-NEXT:    cvttps2dq %xmm2, %xmm2
-; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; X86-SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; X86-SSE2-NEXT:    pslld $16, %xmm2
+; X86-SSE2-NEXT:    psrad $16, %xmm2
+; X86-SSE2-NEXT:    packssdw %xmm1, %xmm2
 ; X86-SSE2-NEXT:    psllw $1, %xmm0
 ; X86-SSE2-NEXT:    pmullw %xmm2, %xmm0
 ; X86-SSE2-NEXT:    por %xmm4, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
index 41e8254..6e977d6 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
@@ -328,17 +328,15 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind {
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
 ; SSE2-NEXT:    paddd %xmm3, %xmm1
 ; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    pslld $16, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm1
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
 ; SSE2-NEXT:    pslld $23, %xmm2
 ; SSE2-NEXT:    paddd %xmm3, %xmm2
 ; SSE2-NEXT:    cvttps2dq %xmm2, %xmm2
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE2-NEXT:    pslld $16, %xmm2
+; SSE2-NEXT:    psrad $16, %xmm2
+; SSE2-NEXT:    packssdw %xmm1, %xmm2
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    pmulhuw %xmm2, %xmm1
 ; SSE2-NEXT:    pmullw %xmm2, %xmm0
@@ -499,17 +497,15 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind {
 ; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
 ; X86-SSE2-NEXT:    paddd %xmm3, %xmm1
 ; X86-SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
-; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; X86-SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X86-SSE2-NEXT:    pslld $16, %xmm1
+; X86-SSE2-NEXT:    psrad $16, %xmm1
 ; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
 ; X86-SSE2-NEXT:    pslld $23, %xmm2
 ; X86-SSE2-NEXT:    paddd %xmm3, %xmm2
 ; X86-SSE2-NEXT:    cvttps2dq %xmm2, %xmm2
-; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; X86-SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; X86-SSE2-NEXT:    pslld $16, %xmm2
+; X86-SSE2-NEXT:    psrad $16, %xmm2
+; X86-SSE2-NEXT:    packssdw %xmm1, %xmm2
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; X86-SSE2-NEXT:    pmulhuw %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pmullw %xmm2, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll
index 4e47cb1..76ca5b9 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-128.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll
@@ -298,17 +298,15 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
 ; SSE2-NEXT:    paddd %xmm3, %xmm2
 ; SSE2-NEXT:    cvttps2dq %xmm2, %xmm2
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT:    pslld $16, %xmm2
+; SSE2-NEXT:    psrad $16, %xmm2
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
 ; SSE2-NEXT:    pslld $23, %xmm1
 ; SSE2-NEXT:    paddd %xmm3, %xmm1
 ; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT:    pslld $16, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    packssdw %xmm2, %xmm1
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    pmulhuw %xmm1, %xmm2
 ; SSE2-NEXT:    pmullw %xmm1, %xmm0
@@ -451,17 +449,15 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
 ; X86-SSE2-NEXT:    paddd %xmm3, %xmm2
 ; X86-SSE2-NEXT:    cvttps2dq %xmm2, %xmm2
-; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; X86-SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X86-SSE2-NEXT:    pslld $16, %xmm2
+; X86-SSE2-NEXT:    psrad $16, %xmm2
 ; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
 ; X86-SSE2-NEXT:    pslld $23, %xmm1
 ; X86-SSE2-NEXT:    paddd %xmm3, %xmm1
 ; X86-SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
-; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; X86-SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X86-SSE2-NEXT:    pslld $16, %xmm1
+; X86-SSE2-NEXT:    psrad $16, %xmm1
+; X86-SSE2-NEXT:    packssdw %xmm2, %xmm1
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; X86-SSE2-NEXT:    pmulhuw %xmm1, %xmm2
 ; X86-SSE2-NEXT:    pmullw %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
index 084d012..7626f24 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -163,17 +163,15 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
 ; SSE2-NEXT:    paddd %xmm3, %xmm2
 ; SSE2-NEXT:    cvttps2dq %xmm2, %xmm2
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT:    pslld $16, %xmm2
+; SSE2-NEXT:    psrad $16, %xmm2
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
 ; SSE2-NEXT:    pslld $23, %xmm1
 ; SSE2-NEXT:    paddd %xmm3, %xmm1
 ; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT:    pslld $16, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    packssdw %xmm2, %xmm1
 ; SSE2-NEXT:    pmullw %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -264,17 +262,15 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; X86-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
 ; X86-SSE-NEXT:    paddd %xmm3, %xmm2
 ; X86-SSE-NEXT:    cvttps2dq %xmm2, %xmm2
-; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; X86-SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X86-SSE-NEXT:    pslld $16, %xmm2
+; X86-SSE-NEXT:    psrad $16, %xmm2
 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
 ; X86-SSE-NEXT:    pslld $23, %xmm1
 ; X86-SSE-NEXT:    paddd %xmm3, %xmm1
 ; X86-SSE-NEXT:    cvttps2dq %xmm1, %xmm1
-; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; X86-SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X86-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X86-SSE-NEXT:    pslld $16, %xmm1
+; X86-SSE-NEXT:    psrad $16, %xmm1
+; X86-SSE-NEXT:    packssdw %xmm2, %xmm1
 ; X86-SSE-NEXT:    pmullw %xmm1, %xmm0
 ; X86-SSE-NEXT:    retl
   %shift = shl <8 x i16> %a, %b
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
index f2fd914..c989549 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
@@ -99,17 +99,15 @@ define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
 ; SSE2-NEXT:    paddd %xmm3, %xmm2
 ; SSE2-NEXT:    cvttps2dq %xmm2, %xmm2
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT:    pslld $16, %xmm2
+; SSE2-NEXT:    psrad $16, %xmm2
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
 ; SSE2-NEXT:    pslld $23, %xmm1
 ; SSE2-NEXT:    paddd %xmm3, %xmm1
 ; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT:    pslld $16, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    packssdw %xmm2, %xmm1
 ; SSE2-NEXT:    pmullw %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -200,17 +198,15 @@ define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
 ; X86-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
 ; X86-SSE-NEXT:    paddd %xmm3, %xmm2
 ; X86-SSE-NEXT:    cvttps2dq %xmm2, %xmm2
-; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; X86-SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X86-SSE-NEXT:    pslld $16, %xmm2
+; X86-SSE-NEXT:    psrad $16, %xmm2
 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
 ; X86-SSE-NEXT:    pslld $23, %xmm1
 ; X86-SSE-NEXT:    paddd %xmm3, %xmm1
 ; X86-SSE-NEXT:    cvttps2dq %xmm1, %xmm1
-; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; X86-SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X86-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X86-SSE-NEXT:    pslld $16, %xmm1
+; X86-SSE-NEXT:    psrad $16, %xmm1
+; X86-SSE-NEXT:    packssdw %xmm2, %xmm1
 ; X86-SSE-NEXT:    pmullw %xmm1, %xmm0
 ; X86-SSE-NEXT:    retl
   %shift = shl <4 x i16> %a, %b
@@ -226,17 +222,15 @@ define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
 ; SSE2-NEXT:    paddd %xmm3, %xmm2
 ; SSE2-NEXT:    cvttps2dq %xmm2, %xmm2
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT:    pslld $16, %xmm2
+; SSE2-NEXT:    psrad $16, %xmm2
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
 ; SSE2-NEXT:    pslld $23, %xmm1
 ; SSE2-NEXT:    paddd %xmm3, %xmm1
 ; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT:    pslld $16, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    packssdw %xmm2, %xmm1
 ; SSE2-NEXT:    pmullw %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -327,17 +321,15 @@ define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
 ; X86-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
 ; X86-SSE-NEXT:    paddd %xmm3, %xmm2
 ; X86-SSE-NEXT:    cvttps2dq %xmm2, %xmm2
-; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; X86-SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X86-SSE-NEXT:    pslld $16, %xmm2
+; X86-SSE-NEXT:    psrad $16, %xmm2
 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
 ; X86-SSE-NEXT:    pslld $23, %xmm1
 ; X86-SSE-NEXT:    paddd %xmm3, %xmm1
 ; X86-SSE-NEXT:    cvttps2dq %xmm1, %xmm1
-; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; X86-SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X86-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X86-SSE-NEXT:    pslld $16, %xmm1
+; X86-SSE-NEXT:    psrad $16, %xmm1
+; X86-SSE-NEXT:    packssdw %xmm2, %xmm1
 ; X86-SSE-NEXT:    pmullw %xmm1, %xmm0
 ; X86-SSE-NEXT:    retl
   %shift = shl <2 x i16> %a, %b
-- 
2.7.4
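
Note on the updated SSE2 test expectations: the old pshuflw/pshufhw/pshufd/punpcklqdq
sequences are replaced by pslld $16 + psrad $16 + packssdw. The two shifts sign-extend
the low 16 bits of every 32-bit lane, so the signed-saturating pack never has to clamp
and simply keeps those low 16 bits. The standalone C++ sketch below (illustrative only,
not part of the patch or of LLVM; helper names PackSSLane/SignExtendLow16 are made up
for the demo) emulates that per-lane behaviour.

// pack_demo.cpp - emulate the per-lane semantics behind the SSE2 sequence
// "pslld $16; psrad $16; packssdw" that the updated tests now expect.
#include <cstdint>
#include <cstdio>

// packssdw narrows a signed 32-bit lane to 16 bits with signed saturation.
static int16_t PackSSLane(int32_t V) {
  if (V > INT16_MAX)
    return INT16_MAX;
  if (V < INT16_MIN)
    return INT16_MIN;
  return static_cast<int16_t>(V);
}

// "pslld $16; psrad $16" sign-extends the low 16 bits of each 32-bit lane.
static int32_t SignExtendLow16(int32_t V) {
  return static_cast<int16_t>(static_cast<uint32_t>(V) & 0xFFFF);
}

int main() {
  const int32_t Lanes[] = {3, -5, 70000, -70000, 32768, 65535};
  for (int32_t V : Lanes) {
    int16_t Packed = PackSSLane(SignExtendLow16(V));
    int16_t Truncated = static_cast<int16_t>(static_cast<uint32_t>(V) & 0xFFFF);
    // Packed always equals Truncated: the pre-shift makes the pack lossless.
    std::printf("lane %7d -> packed %6d, plain truncation %6d\n", V, Packed,
                Truncated);
  }
  return 0;
}

With SSE4.1 (or for vXi16 -> vXi8 packs) the new getPack helper prefers the unsigned
variant instead, masking with AND before PACKUS, which is lossless for the same reason
once the upper bits of each lane are known to be zero.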