From 35c47c494d58af1ed41934f49ea40484238f73eb Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 29 Nov 2016 14:18:51 +0000
Subject: [PATCH] [X86][SSE] Add initial support for combining target shuffles
 to (V)PMOVZX.

We can only handle 128-bit vectors until we support target shuffle inputs of
different size to the output.

llvm-svn: 288140
---
 llvm/lib/Target/X86/X86ISelLowering.cpp            |  52 ++++++---
 llvm/test/CodeGen/X86/vector-popcnt-128.ll         |  34 +++---
 llvm/test/CodeGen/X86/vector-popcnt-256.ll         |   4 +-
 .../CodeGen/X86/vector-shuffle-combining-ssse3.ll  |  15 ++-
 llvm/test/CodeGen/X86/vector-tzcnt-128.ll          | 118 ++++++++++-----------
 llvm/test/CodeGen/X86/vector-tzcnt-256.ll          |   8 +-
 llvm/test/CodeGen/X86/vector-zext.ll               |  30 ++++--
 7 files changed, 147 insertions(+), 114 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 02e0657..c902cb2 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -25445,7 +25445,7 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N,
 // TODO: Investigate sharing more of this with shuffle lowering.
 static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
                                     const X86Subtarget &Subtarget,
-                                    unsigned &Shuffle, MVT &ShuffleVT) {
+                                    unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
   unsigned NumMaskElts = Mask.size();
   unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
   bool FloatDomain = MaskVT.isFloatingPoint() ||
@@ -25456,27 +25456,48 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
       isUndefOrEqual(Mask[0], 0) &&
       isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
     Shuffle = X86ISD::VZEXT_MOVL;
-    ShuffleVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
+    SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
     return true;
   }
 
+  // Match against a VZEXT instruction.
+  // TODO: Add 256/512-bit vector support.
+  if (!FloatDomain && MaskVT.is128BitVector() && Subtarget.hasSSE41()) {
+    unsigned MaxScale = 64 / MaskEltSize;
+    for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
+      bool Match = true;
+      unsigned NumDstElts = NumMaskElts / Scale;
+      for (unsigned i = 0; i != NumDstElts && Match; ++i) {
+        Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
+        Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
+      }
+      if (Match) {
+        SrcVT = MaskVT;
+        DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
+        DstVT = MVT::getVectorVT(DstVT, NumDstElts);
+        Shuffle = X86ISD::VZEXT;
+        return true;
+      }
+    }
+  }
+
   // Check if we have SSE3 which will let us use MOVDDUP etc. The
   // instructions are no slower than UNPCKLPD but has the option to
   // fold the input operand into even an unaligned memory load.
   if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && FloatDomain) {
     if (isTargetShuffleEquivalent(Mask, {0, 0})) {
       Shuffle = X86ISD::MOVDDUP;
-      ShuffleVT = MVT::v2f64;
+      SrcVT = DstVT = MVT::v2f64;
       return true;
     }
     if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
       Shuffle = X86ISD::MOVSLDUP;
-      ShuffleVT = MVT::v4f32;
+      SrcVT = DstVT = MVT::v4f32;
       return true;
     }
     if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
       Shuffle = X86ISD::MOVSHDUP;
-      ShuffleVT = MVT::v4f32;
+      SrcVT = DstVT = MVT::v4f32;
       return true;
     }
   }
@@ -25485,17 +25506,17 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
     assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
     if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
       Shuffle = X86ISD::MOVDDUP;
-      ShuffleVT = MVT::v4f64;
+      SrcVT = DstVT = MVT::v4f64;
       return true;
     }
     if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
       Shuffle = X86ISD::MOVSLDUP;
-      ShuffleVT = MVT::v8f32;
+      SrcVT = DstVT = MVT::v8f32;
       return true;
     }
     if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
       Shuffle = X86ISD::MOVSHDUP;
-      ShuffleVT = MVT::v8f32;
+      SrcVT = DstVT = MVT::v8f32;
       return true;
     }
   }
@@ -25505,19 +25526,19 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
            "AVX512 required for 512-bit vector shuffles");
     if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
       Shuffle = X86ISD::MOVDDUP;
-      ShuffleVT = MVT::v8f64;
+      SrcVT = DstVT = MVT::v8f64;
       return true;
     }
     if (isTargetShuffleEquivalent(
             Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
       Shuffle = X86ISD::MOVSLDUP;
-      ShuffleVT = MVT::v16f32;
+      SrcVT = DstVT = MVT::v16f32;
       return true;
     }
     if (isTargetShuffleEquivalent(
             Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
       Shuffle = X86ISD::MOVSHDUP;
-      ShuffleVT = MVT::v16f32;
+      SrcVT = DstVT = MVT::v16f32;
       return true;
     }
   }
@@ -25526,7 +25547,7 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
   if (Subtarget.hasAVX2()) {
     SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
     if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
-      ShuffleVT = MaskVT;
+      SrcVT = DstVT = MaskVT;
       Shuffle = X86ISD::VBROADCAST;
       return true;
     }
@@ -25954,7 +25975,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
   MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
 
   // Attempt to match the mask against known shuffle patterns.
-  MVT ShuffleVT;
+  MVT ShuffleSrcVT, ShuffleVT;
   unsigned Shuffle, PermuteImm;
 
   if (UnaryShuffle) {
@@ -25973,12 +25994,13 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
       }
     }
 
-    if (matchUnaryVectorShuffle(MaskVT, Mask, Subtarget, Shuffle, ShuffleVT)) {
+    if (matchUnaryVectorShuffle(MaskVT, Mask, Subtarget, Shuffle, ShuffleSrcVT,
+                                ShuffleVT)) {
       if (Depth == 1 && Root.getOpcode() == Shuffle)
         return false; // Nothing to do!
       if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
        return false; // AVX512 Writemask clash.
- Res = DAG.getBitcast(ShuffleVT, V1); + Res = DAG.getBitcast(ShuffleSrcVT, V1); DCI.AddToWorklist(Res.getNode()); Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res); DCI.AddToWorklist(Res.getNode()); diff --git a/llvm/test/CodeGen/X86/vector-popcnt-128.ll b/llvm/test/CodeGen/X86/vector-popcnt-128.ll index 358bd40..27909c6 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-128.ll @@ -175,24 +175,22 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; ; SSE41-LABEL: testv4i32: ; SSE41: # BB#0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pand %xmm2, %xmm3 -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: pshufb %xmm3, %xmm4 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pand %xmm1, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pshufb %xmm2, %xmm4 ; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm1 -; SSE41-NEXT: paddb %xmm4, %xmm1 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE41-NEXT: psadbw %xmm0, %xmm2 -; SSE41-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE41-NEXT: psadbw %xmm0, %xmm1 -; SSE41-NEXT: packuswb %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: pshufb %xmm0, %xmm3 +; SSE41-NEXT: paddb %xmm4, %xmm3 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero +; SSE41-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE41-NEXT: psadbw %xmm1, %xmm3 +; SSE41-NEXT: psadbw %xmm1, %xmm0 +; SSE41-NEXT: packuswb %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: testv4i32: @@ -208,7 +206,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-popcnt-256.ll b/llvm/test/CodeGen/X86/vector-popcnt-256.ll index b0e39bd..8bbfea9 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-256.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-256.ll @@ -58,7 +58,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm5 @@ -69,7 +69,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX1-NEXT: vpmovzxdq 
{{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll index 3e2a8c4..2ae0fae 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll @@ -106,14 +106,19 @@ define <4 x float> @combine_pshufb_as_movss(<4 x float> %a0, <4 x float> %a1) { } define <4 x i32> @combine_pshufb_as_zext(<16 x i8> %a0) { -; SSE-LABEL: combine_pshufb_as_zext: -; SSE: # BB#0: -; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE-NEXT: retq +; SSSE3-LABEL: combine_pshufb_as_zext: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_pshufb_as_zext: +; SSE41: # BB#0: +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: retq ; ; AVX-LABEL: combine_pshufb_as_zext: ; AVX: # BB#0: -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX-NEXT: retq %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> ) %2 = bitcast <16 x i8> %1 to <4 x i32> diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll index aa5abab..5c92e62 100644 --- a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll @@ -448,22 +448,21 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; SSE41-NEXT: psubd %xmm0, %xmm2 ; SSE41-NEXT: pand %xmm0, %xmm2 ; SSE41-NEXT: psubd {{.*}}(%rip), %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pand %xmm3, %xmm4 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pshufb %xmm4, %xmm5 +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: pand %xmm0, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pshufb %xmm3, %xmm5 ; SSE41-NEXT: psrlw $4, %xmm2 -; SSE41-NEXT: pand %xmm3, %xmm2 -; SSE41-NEXT: pshufb %xmm2, %xmm0 -; SSE41-NEXT: paddb %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE41-NEXT: psadbw %xmm1, %xmm2 -; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE41-NEXT: pand %xmm0, %xmm2 +; SSE41-NEXT: pshufb %xmm2, %xmm4 +; SSE41-NEXT: paddb %xmm5, %xmm4 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero +; SSE41-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE41-NEXT: psadbw %xmm1, %xmm4 ; SSE41-NEXT: psadbw %xmm1, %xmm0 -; SSE41-NEXT: packuswb %xmm2, %xmm0 +; SSE41-NEXT: packuswb %xmm4, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: testv4i32: @@ -482,7 +481,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; AVX1-NEXT: 
vpaddb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -504,7 +503,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; AVX2-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -547,7 +546,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; AVX512CD-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX512CD-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512CD-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX512CD-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512CD-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX512CD-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX512CD-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX512CD-NEXT: retq @@ -559,22 +558,21 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; X32-SSE-NEXT: psubd %xmm0, %xmm2 ; X32-SSE-NEXT: pand %xmm0, %xmm2 ; X32-SSE-NEXT: psubd {{\.LCPI.*}}, %xmm2 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-SSE-NEXT: movdqa %xmm2, %xmm4 -; X32-SSE-NEXT: pand %xmm3, %xmm4 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; X32-SSE-NEXT: movdqa %xmm0, %xmm5 -; X32-SSE-NEXT: pshufb %xmm4, %xmm5 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-SSE-NEXT: movdqa %xmm2, %xmm3 +; X32-SSE-NEXT: pand %xmm0, %xmm3 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X32-SSE-NEXT: movdqa %xmm4, %xmm5 +; X32-SSE-NEXT: pshufb %xmm3, %xmm5 ; X32-SSE-NEXT: psrlw $4, %xmm2 -; X32-SSE-NEXT: pand %xmm3, %xmm2 -; X32-SSE-NEXT: pshufb %xmm2, %xmm0 -; X32-SSE-NEXT: paddb %xmm5, %xmm0 -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; X32-SSE-NEXT: psadbw %xmm1, %xmm2 -; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE-NEXT: pand %xmm0, %xmm2 +; X32-SSE-NEXT: pshufb %xmm2, %xmm4 +; X32-SSE-NEXT: paddb %xmm5, %xmm4 +; X32-SSE-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero +; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; X32-SSE-NEXT: psadbw %xmm1, %xmm4 ; X32-SSE-NEXT: psadbw %xmm1, %xmm0 -; X32-SSE-NEXT: packuswb %xmm2, %xmm0 +; X32-SSE-NEXT: packuswb %xmm4, %xmm0 ; X32-SSE-NEXT: retl %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 0) ret <4 x i32> %out @@ -671,22 +669,21 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; SSE41-NEXT: psubd %xmm0, %xmm2 ; SSE41-NEXT: pand %xmm0, %xmm2 ; SSE41-NEXT: psubd {{.*}}(%rip), %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pand %xmm3, %xmm4 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pshufb %xmm4, %xmm5 +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: pand %xmm0, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pshufb %xmm3, %xmm5 ; SSE41-NEXT: psrlw $4, %xmm2 -; SSE41-NEXT: pand %xmm3, %xmm2 -; SSE41-NEXT: pshufb %xmm2, %xmm0 -; SSE41-NEXT: paddb %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE41-NEXT: psadbw %xmm1, %xmm2 -; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE41-NEXT: pand %xmm0, %xmm2 +; SSE41-NEXT: pshufb %xmm2, %xmm4 +; SSE41-NEXT: paddb %xmm5, %xmm4 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero +; SSE41-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE41-NEXT: psadbw %xmm1, %xmm4 ; SSE41-NEXT: psadbw %xmm1, %xmm0 -; SSE41-NEXT: packuswb %xmm2, %xmm0 +; SSE41-NEXT: packuswb %xmm4, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: testv4i32u: @@ -705,7 +702,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -727,7 +724,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; AVX2-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -759,22 +756,21 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; X32-SSE-NEXT: psubd %xmm0, %xmm2 ; X32-SSE-NEXT: pand %xmm0, %xmm2 ; X32-SSE-NEXT: psubd {{\.LCPI.*}}, %xmm2 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-SSE-NEXT: movdqa %xmm2, %xmm4 -; X32-SSE-NEXT: pand %xmm3, %xmm4 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; X32-SSE-NEXT: movdqa %xmm0, %xmm5 -; X32-SSE-NEXT: pshufb %xmm4, %xmm5 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-SSE-NEXT: movdqa %xmm2, %xmm3 +; X32-SSE-NEXT: pand %xmm0, %xmm3 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X32-SSE-NEXT: movdqa %xmm4, %xmm5 +; X32-SSE-NEXT: pshufb %xmm3, %xmm5 ; X32-SSE-NEXT: psrlw $4, %xmm2 -; X32-SSE-NEXT: pand %xmm3, %xmm2 -; X32-SSE-NEXT: pshufb %xmm2, %xmm0 -; X32-SSE-NEXT: paddb %xmm5, %xmm0 -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; X32-SSE-NEXT: psadbw %xmm1, %xmm2 -; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE-NEXT: pand %xmm0, %xmm2 +; X32-SSE-NEXT: pshufb %xmm2, %xmm4 +; X32-SSE-NEXT: paddb %xmm5, %xmm4 +; X32-SSE-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero +; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; X32-SSE-NEXT: psadbw %xmm1, %xmm4 ; X32-SSE-NEXT: psadbw %xmm1, %xmm0 -; X32-SSE-NEXT: packuswb %xmm2, %xmm0 +; X32-SSE-NEXT: packuswb %xmm4, %xmm0 ; 
X32-SSE-NEXT: retl %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 -1) ret <4 x i32> %out diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-256.ll b/llvm/test/CodeGen/X86/vector-tzcnt-256.ll index db9081d..5e9a0ff 100644 --- a/llvm/test/CodeGen/X86/vector-tzcnt-256.ll +++ b/llvm/test/CodeGen/X86/vector-tzcnt-256.ll @@ -224,7 +224,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-NEXT: vpsadbw %xmm1, %xmm5, %xmm5 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero ; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0 @@ -236,7 +236,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: vpsadbw %xmm1, %xmm3, %xmm3 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -354,7 +354,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-NEXT: vpsadbw %xmm1, %xmm5, %xmm5 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero ; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0 @@ -366,7 +366,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: vpsadbw %xmm1, %xmm3, %xmm3 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll index e679cc9..8d8d7aa 100644 --- a/llvm/test/CodeGen/X86/vector-zext.ll +++ b/llvm/test/CodeGen/X86/vector-zext.ll @@ -1636,7 +1636,7 @@ define <4 x i64> @shuf_zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ; ; AVX1-LABEL: shuf_zext_4i32_to_4i64: ; AVX1: # BB#0: # %entry -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,0] @@ -1892,18 +1892,30 @@ entry: } define <4 x i32> @shuf_zext_8i16_to_4i32_offset1(<8 x i16> %A) nounwind uwtable readnone ssp { -; SSE-LABEL: shuf_zext_8i16_to_4i32_offset1: -; SSE: # BB#0: # %entry -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: retq +; SSE2-LABEL: shuf_zext_8i16_to_4i32_offset1: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 
+; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuf_zext_8i16_to_4i32_offset1: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuf_zext_8i16_to_4i32_offset1: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: retq ; ; AVX-LABEL: shuf_zext_8i16_to_4i32_offset1: ; AVX: # BB#0: # %entry ; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX-NEXT: retq entry: %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <8 x i32> -- 2.7.4
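For reference, the matching loop added to matchUnaryVectorShuffle accepts a
shuffle mask as a zero-extension when, for some power-of-two Scale, element
i*Scale reads source element i and the Scale-1 elements after it are zero
(undef lanes are allowed anywhere). The standalone C++ sketch below re-creates
that test outside LLVM so it can be compiled and experimented with directly;
the helper name matchesZExt and the sentinel constants are inventions of this
example (they mirror LLVM's SM_SentinelUndef/SM_SentinelZero convention), and
a real match in the patch additionally requires an integer domain, a 128-bit
mask type, and SSE4.1.

// Standalone illustration (not LLVM code) of the zero-extension mask test.
#include <cstdio>
#include <vector>

constexpr int SentinelUndef = -1; // lane value is don't-care
constexpr int SentinelZero = -2;  // lane must be zero

// True if Mask describes a zero-extension by 'Scale': lane i*Scale reads
// source lane i (or is undef) and the next Scale-1 lanes are zero/undef.
static bool matchesZExt(const std::vector<int> &Mask, unsigned Scale) {
  unsigned NumDstElts = Mask.size() / Scale;
  for (unsigned i = 0; i != NumDstElts; ++i) {
    int Base = Mask[i * Scale];
    if (Base != SentinelUndef && Base != (int)i)
      return false;
    for (unsigned j = 1; j != Scale; ++j) {
      int M = Mask[i * Scale + j];
      if (M != SentinelUndef && M != SentinelZero)
        return false;
    }
  }
  return true;
}

int main() {
  // A v4i32 mask <0, zero, 1, zero>: the low two i32 lanes widened to i64
  // with zeroed high halves -- the pattern PMOVZXDQ produces, and the shape
  // the new combine turns into X86ISD::VZEXT (pmovzxdq in the tests above).
  std::vector<int> Mask = {0, SentinelZero, 1, SentinelZero};
  for (unsigned Scale = 2; Scale <= 4; Scale *= 2)
    std::printf("scale %u: %s\n", Scale,
                matchesZExt(Mask, Scale) ? "zero-extension" : "no match");
  return 0;
}

With a successful match at scale S, the patch selects a destination type of
NumMaskElts/S elements of S*MaskEltSize bits (v2i64 for the example above),
which is why the SSE41/AVX test expectations switch from
punpckldq/punpcklwd/pshufb-with-zero sequences to pmovzxdq, pmovzxwd and
pmovzxbd.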