From 16d11785a5fbd982cf81b1585603d576d743f576 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 10 Mar 2016 11:23:51 +0000 Subject: [PATCH] [X86][SSE] Basic combining of unary target shuffles of binary target shuffles. This patch reorders the combining of target shuffle masks so that when a unary shuffle takes a binary shuffle as its input but only references one of its inputs it can correctly combine into a unary shuffle mask. This is starting to encroach on the purpose of resolveTargetShuffleInputs, but I don't want to remove it until we definitely know we won't need it for full binary shuffle combining. There is a lot more work before we can properly support binary target shuffle masks but this was an easy case to add support for. Differential Revision: http://reviews.llvm.org/D17858 llvm-svn: 263102 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 30 +++++++++++++--------- llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll | 12 +++------ .../CodeGen/X86/vector-shuffle-combining-ssse3.ll | 9 +++---- 3 files changed, 25 insertions(+), 26 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index b482371..3ae5136 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -5266,8 +5266,7 @@ static bool setTargetShuffleZeroElements(SDValue N, /// remaining input indices in case we now have a unary shuffle and adjust the /// Op0/Op1 inputs accordingly. /// Returns true if the target shuffle mask was decoded. -static bool resolveTargetShuffleInputs(SDValue Op, bool &IsUnary, SDValue &Op0, - SDValue &Op1, +static bool resolveTargetShuffleInputs(SDValue Op, SDValue &Op0, SDValue &Op1, SmallVectorImpl<int> &Mask) { SmallVector<SDValue, 2> Ops; if (!setTargetShuffleZeroElements(Op, Mask, Ops)) @@ -5282,10 +5281,6 @@ static bool resolveTargetShuffleInputs(SDValue Op, bool &IsUnary, SDValue &Op0, Op0 = Op0InUse ? 
Ops[1] : SDValue(); - IsUnary = !(Op0InUse && Op1InUse); - - if (!IsUnary) - return true; // We're only using Op1 - commute the mask and inputs. if (!Op0InUse && Op1InUse) { @@ -24036,14 +24031,9 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, "Can only combine shuffles of the same vector register size."); // Extract target shuffle mask and resolve sentinels and inputs. - bool IsUnary; SDValue Input0, Input1; SmallVector<int, 16> OpMask; - if (!resolveTargetShuffleInputs(Op, IsUnary, Input0, Input1, OpMask)) - return false; - - // At the moment we can only combine target shuffle unary cases. - if (!IsUnary) + if (!resolveTargetShuffleInputs(Op, Input0, Input1, OpMask)) return false; assert(VT.getVectorNumElements() == OpMask.size() && @@ -24103,8 +24093,24 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, Subtarget, DAG, SDLoc(Root))); return true; } + + int MaskSize = Mask.size(); + bool UseInput0 = std::any_of(Mask.begin(), Mask.end(), + [MaskSize](int Idx) { return 0 <= Idx && Idx < MaskSize; }); + bool UseInput1 = std::any_of(Mask.begin(), Mask.end(), + [MaskSize](int Idx) { return MaskSize <= Idx; }); + + // At the moment we can only combine unary shuffle mask cases. + if (UseInput0 && UseInput1) + return false; + else if (UseInput1) { + std::swap(Input0, Input1); + ShuffleVectorSDNode::commuteMask(Mask); + } + assert(Input0 && "Shuffle with no inputs detected"); + // TODO - generalize this to support any variable mask shuffle. HasPSHUFB |= (Op.getOpcode() == X86ISD::PSHUFB); // See if we can recurse into Input0 (if it's a target shuffle). 
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll index 5d970e1..0397366 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll @@ -929,18 +929,14 @@ define <2 x i64> @shuffle_v2i64_bitcast_z123(<2 x i64> %x) { ; ; SSE41-LABEL: shuffle_v2i64_bitcast_z123: ; SSE41: # BB#0: -; SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE41-NEXT: xorps %xmm1, %xmm1 -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: shuffle_v2i64_bitcast_z123: ; AVX1: # BB#0: -; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v2i64_bitcast_z123: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll index aecd45a..f66f0d4 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll @@ -63,13 +63,11 @@ define <4 x float> @combine_pshufb_movsldup(<4 x float> %a0) { define <16 x i8> @combine_unpckl_arg0_pshufb(<16 x i8> %a0, <16 x i8> %a1) { ; SSE-LABEL: combine_unpckl_arg0_pshufb: ; SSE: # BB#0: -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero ; SSE-NEXT: retq ; ; AVX-LABEL: 
combine_unpckl_arg0_pshufb: ; AVX: # BB#0: -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero ; AVX-NEXT: retq %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23> @@ -80,14 +78,13 @@ define <16 x i8> @combine_unpckl_arg0_pshufb(<16 x i8> %a0, <16 x i8> %a1) { define <16 x i8> @combine_unpckl_arg1_pshufb(<16 x i8> %a0, <16 x i8> %a1) { ; SSE-LABEL: combine_unpckl_arg1_pshufb: ; SSE: # BB#0: -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_unpckl_arg1_pshufb: ; AVX: # BB#0: -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero ; AVX-NEXT: retq %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23> %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1>) -- 2.7.4