From a34a8e230d05250871047c4ab4de50a92587a35c Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Thu, 10 Jul 2014 09:16:40 +0000
Subject: [PATCH] [x86] Tweak the v16i8 single input special case lowering for
 shuffles that splat i8s into i16s.

Previously, we would try much too hard to arrange a sequence of i8s in
one half of the input such that we could unpack them into i16s and
shuffle those into place. This isn't always going to be a cheaper i8
shuffle than our other strategies. The case where it is always going to
be cheaper is when we can arrange all the necessary inputs into one
half using just i16 shuffles. It happens that viewing the problem this
way also makes it much easier to produce an efficient set of shuffles
to move the inputs into one half and then unpack them.

With this, our splat code gets one step closer to being not terrible
with the new experimental lowering strategy. It also exposes two
missing combines, which I will add next.

llvm-svn: 212692
---
 llvm/lib/Target/X86/X86ISelLowering.cpp         | 78 ++++++++++++++-----------
 llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll | 18 +++---
 2 files changed, 51 insertions(+), 45 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 57b21cc..38551db 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7694,6 +7694,9 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   if (isSingleInputShuffleMask(Mask)) {
     // Check whether we can widen this to an i16 shuffle by duplicating bytes.
     // Notably, this handles splat and partial-splat shuffles more efficiently.
+    // However, it only makes sense if the pre-duplication shuffle simplifies
+    // things significantly. Currently, this means we need to be able to
+    // express the pre-duplication shuffle as an i16 shuffle.
     //
     // FIXME: We should check for other patterns which can be widened into an
     // i16 shuffle as well.
@@ -7704,7 +7707,9 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
       }
       return true;
     };
-    if (canWidenViaDuplication(Mask)) {
+    auto tryToWidenViaDuplication = [&]() -> SDValue {
+      if (!canWidenViaDuplication(Mask))
+        return SDValue();
       SmallVector<int, 4> LoInputs;
       std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
                    [](int M) { return M >= 0 && M < 8; });
@@ -7722,52 +7727,57 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
       ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
       ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
 
-      int ByteMask[16];
+      int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
       SmallDenseMap<int, int, 8> LaneMap;
-      for (int i = 0; i < 16; ++i)
-        ByteMask[i] = -1;
       for (int I : InPlaceInputs) {
-        ByteMask[I] = I;
+        PreDupI16Shuffle[I/2] = I/2;
         LaneMap[I] = I;
       }
-      int FreeByteIdx = 0;
-      int TargetOffset = TargetLo ? 0 : 8;
-      for (int I : MovingInputs) {
-        // Walk the free index into the byte mask until we find an unoccupied
-        // spot. We bound this to 8 steps to catch bugs, the pigeonhole
-        // principle indicates that there *must* be a spot as we can only have
-        // 8 duplicated inputs. We have to walk the index using modular
-        // arithmetic to wrap around as necessary.
-        // FIXME: We could do a much better job of picking an inexpensive slot
-        // so this doesn't go through the worst case for the byte shuffle.
-        for (int j = 0; j < 8 && ByteMask[FreeByteIdx + TargetOffset] != -1;
-             ++j, FreeByteIdx = (FreeByteIdx + 1) % 8)
-          ;
-        assert(ByteMask[FreeByteIdx + TargetOffset] == -1 &&
-               "Failed to find a free byte!");
-        ByteMask[FreeByteIdx + TargetOffset] = I;
-        LaneMap[I] = FreeByteIdx + TargetOffset;
+      int j = TargetLo ? 0 : 4, je = j + 4;
+      for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
+        // Check if j is already a shuffle of this input. This happens when
+        // there are two adjacent bytes after we move the low one.
+        if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
+          // If we haven't yet mapped the input, search for a slot into which
+          // we can map it.
+          while (j < je && PreDupI16Shuffle[j] != -1)
+            ++j;
+
+          if (j == je)
+            // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
+            return SDValue();
+
+          // Map this input with the i16 shuffle.
+          PreDupI16Shuffle[j] = MovingInputs[i] / 2;
+        }
+
+        // Update the lane map based on the mapping we ended up with.
+        LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
       }
-      V1 = DAG.getVectorShuffle(MVT::v16i8, DL, V1, DAG.getUNDEF(MVT::v16i8),
-                                ByteMask);
-      for (int &M : Mask)
-        if (M != -1)
-          M = LaneMap[M];
+      V1 = DAG.getNode(
+          ISD::BITCAST, DL, MVT::v16i8,
+          DAG.getVectorShuffle(MVT::v8i16, DL,
+                               DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
+                               DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
 
       // Unpack the bytes to form the i16s that will be shuffled into place.
       V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
                        MVT::v16i8, V1, V1);
 
-      int I16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+      int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
       for (int i = 0; i < 16; i += 2) {
         if (Mask[i] != -1)
-          I16Shuffle[i / 2] = Mask[i] - (TargetLo ? 0 : 8);
-        assert(I16Shuffle[i / 2] < 8 && "Invalid v8 shuffle mask!");
+          PostDupI16Shuffle[i / 2] = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
+        assert(PostDupI16Shuffle[i / 2] < 8 && "Invalid v8 shuffle mask!");
       }
-      return DAG.getVectorShuffle(MVT::v8i16, DL,
-                                  DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
-                                  DAG.getUNDEF(MVT::v8i16), I16Shuffle);
-    }
+      return DAG.getNode(
+          ISD::BITCAST, DL, MVT::v16i8,
+          DAG.getVectorShuffle(MVT::v8i16, DL,
+                               DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
+                               DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
+    };
+    if (SDValue V = tryToWidenViaDuplication())
+      return V;
   }
 
   // Check whether an interleaving lowering is likely to be more efficient.
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
index 85d2bb5..fa6bdf8 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -30,14 +30,12 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01(
 define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08
 ; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pand
 ; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
 ; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; CHECK-SSE2-NEXT: packuswb %xmm0, %xmm0
 ; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,2,1]
 ; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,5,5,5]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,6,6,6]
 ; CHECK-SSE2-NEXT: retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
   ret <16 x i8> %shuffle
@@ -58,16 +56,14 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03(
 define <16 x i8> @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12
 ; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pand
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,1]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,3,3,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,5,6,7]
-; CHECK-SSE2-NEXT: packuswb %xmm0, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7]
 ; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,0,1]
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,1,3]
 ; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,2,2,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,5,6,6]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,6,6]
 ; CHECK-SSE2-NEXT: retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
   ret <16 x i8> %shuffle
-- 
2.7.4
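A standalone model may help make the three steps concrete. The C++ sketch below is not part of the patch and uses no LLVM APIs; all names in it (V16, shuffleI16, unpcklbw, and the hard-coded masks) are invented for illustration. It applies the pre-duplication i16 shuffle, the punpcklbw duplication, and the post-duplication i16 shuffle to the mask from the first test case and checks that the composition reproduces the original v16i8 shuffle:

// Standalone model of the widen-via-duplication lowering. A byte vector is
// represented as 16 ints naming which original byte occupies each position
// (-1 = undef), so shuffles compose the same way the DAG nodes above do.
#include <array>
#include <cassert>
#include <cstdio>

using V16 = std::array<int, 16>;

// Apply a v8i16 shuffle mask M to a 16-byte vector: i16 lane i of the result
// is i16 lane M[i] of the input (each i16 lane covers two adjacent bytes).
static V16 shuffleI16(const V16 &In, const std::array<int, 8> &M) {
  V16 Out;
  for (int i = 0; i < 8; ++i)
    for (int b = 0; b < 2; ++b)
      Out[2 * i + b] = M[i] < 0 ? -1 : In[2 * M[i] + b];
  return Out;
}

// Model of `punpcklbw V, V`: byte p of the low half is duplicated into both
// bytes of i16 lane p. This is why LaneMap can record a pre-dup byte
// position and reuse it directly as a post-dup i16 lane index.
static V16 unpcklbw(const V16 &In) {
  V16 Out;
  for (int i = 0; i < 8; ++i)
    Out[2 * i] = Out[2 * i + 1] = In[i];
  return Out;
}

int main() {
  // The splat-into-i16s mask from the first test case above:
  // <0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8>. The needed inputs are bytes 0 and 8.
  const V16 Mask = {0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};

  V16 V;
  for (int i = 0; i < 16; ++i)
    V[i] = i; // identity input: result bytes must equal the mask entries

  // Pre-dup i16 shuffle: byte 0 is already in place (i16 lane 0 stays put);
  // byte 8 lives in i16 lane 4, which moves into the free slot 1.
  V = shuffleI16(V, {0, 4, -1, -1, -1, -1, -1, -1});

  // Duplicate each low byte across a full i16 lane.
  V = unpcklbw(V);

  // Post-dup i16 shuffle: byte 0 now fills i16 lane 0, and byte 8 fills i16
  // lane 2 (it sat at pre-dup byte position 2, i.e. LaneMap[8] == 2).
  V = shuffleI16(V, {0, 0, 0, 0, 2, 2, 2, 2});

  for (int i = 0; i < 16; ++i)
    assert(V[i] == Mask[i] && "model disagrees with the original byte mask");
  std::puts("pre-dup + unpcklbw + post-dup reproduces the v16i8 splat mask");
}

In the updated CHECK lines, the pshufd/pshuflw pair before punpcklbw performs this pre-duplication move into the low half, and the pshufd/pshuflw/pshufhw sequence after it is the post-duplication i16 shuffle.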