From 219f32f4b68679563443cdaae7b8174c9976409a Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 3 Aug 2020 17:53:32 +0100 Subject: [PATCH] [X86][SSE] Shuffle combine blends to OR(X,Y) if the relevant elements are known zero. This allows us to remove the (depth violating) code in getFauxShuffleMask where we were combining the OR(SHUFFLE,SHUFFLE) shuffle inputs as well, and not just the OR(). This is a minor step toward being able to shuffle combine from/to SELECT/BLENDV as a faux shuffle. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 65 ++++++++++++++++++------- llvm/test/CodeGen/X86/insertelement-ones.ll | 12 ++--- llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll | 8 +-- llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll | 5 +- llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll | 6 +-- 5 files changed, 61 insertions(+), 35 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index e9bb50a..b2bfcc2 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -7401,8 +7401,8 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other // is a valid shuffle index. - SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0)); - SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1)); + SDValue N0 = peekThroughBitcasts(N.getOperand(0)); + SDValue N1 = peekThroughBitcasts(N.getOperand(1)); if (!N0.getValueType().isVector() || !N1.getValueType().isVector()) return false; SmallVector SrcMask0, SrcMask1; @@ -7413,34 +7413,24 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, true)) return false; - // Shuffle inputs must be the same size as the result. - if (llvm::any_of(SrcInputs0, [VT](SDValue Op) { - return VT.getSizeInBits() != Op.getValueSizeInBits(); - })) - return false; - if (llvm::any_of(SrcInputs1, [VT](SDValue Op) { - return VT.getSizeInBits() != Op.getValueSizeInBits(); - })) - return false; - size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size()); SmallVector Mask0, Mask1; narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0); narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1); - for (size_t i = 0; i != MaskSize; ++i) { + for (int i = 0; i != (int)MaskSize; ++i) { if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef) Mask.push_back(SM_SentinelUndef); else if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero) Mask.push_back(SM_SentinelZero); else if (Mask1[i] == SM_SentinelZero) - Mask.push_back(Mask0[i]); + Mask.push_back(i); else if (Mask0[i] == SM_SentinelZero) - Mask.push_back(Mask1[i] + (int)(MaskSize * SrcInputs0.size())); + Mask.push_back(i + MaskSize); else return false; } - Ops.append(SrcInputs0.begin(), SrcInputs0.end()); - Ops.append(SrcInputs1.begin(), SrcInputs1.end()); + Ops.push_back(N0); + Ops.push_back(N1); return true; } case ISD::INSERT_SUBVECTOR: { @@ -34219,6 +34209,7 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT, bool IsUnary) { + unsigned NumMaskElts = Mask.size(); unsigned EltSizeInBits = MaskVT.getScalarSizeInBits(); if (MaskVT.is128BitVector()) { @@ -34276,6 +34267,46 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef Mask, } } + // Attempt to match against a OR if we're performing a blend shuffle and the + // non-blended source element is zero in each case. + if ((EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 && + (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) { + bool IsBlend = true; + unsigned NumV1Elts = V1.getValueType().getVectorNumElements(); + unsigned NumV2Elts = V2.getValueType().getVectorNumElements(); + unsigned Scale1 = NumV1Elts / NumMaskElts; + unsigned Scale2 = NumV2Elts / NumMaskElts; + APInt DemandedZeroV1 = APInt::getNullValue(NumV1Elts); + APInt DemandedZeroV2 = APInt::getNullValue(NumV2Elts); + for (unsigned i = 0; i != NumMaskElts; ++i) { + int M = Mask[i]; + if (M == SM_SentinelUndef) + continue; + if (M == SM_SentinelZero) { + DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1); + DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2); + continue; + } + if (M == i) { + DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2); + continue; + } + if (M == (i + NumMaskElts)) { + DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1); + continue; + } + IsBlend = false; + break; + } + if (IsBlend && + DAG.computeKnownBits(V1, DemandedZeroV1).isZero() && + DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) { + Shuffle = ISD::OR; + SrcVT = DstVT = EVT(MaskVT).changeTypeToInteger().getSimpleVT(); + return true; + } + } + return false; } diff --git a/llvm/test/CodeGen/X86/insertelement-ones.ll b/llvm/test/CodeGen/X86/insertelement-ones.ll index 3d8e42b..6a9a401 100644 --- a/llvm/test/CodeGen/X86/insertelement-ones.ll +++ b/llvm/test/CodeGen/X86/insertelement-ones.ll @@ -389,11 +389,9 @@ define <32 x i8> @insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx(<32 x i8> %a) { ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0] ; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255] -; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1] -; SSE2-NEXT: pandn %xmm3, %xmm5 -; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: por %xmm4, %xmm1 ; SSE2-NEXT: retq @@ -411,11 +409,9 @@ define <32 x i8> @insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx(<32 x i8> %a) { ; SSE3-NEXT: movdqa %xmm3, %xmm4 ; SSE3-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0] ; SSE3-NEXT: por %xmm4, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255] -; SSE3-NEXT: pand %xmm5, %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE3-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1] -; SSE3-NEXT: pandn %xmm3, %xmm5 -; SSE3-NEXT: por %xmm5, %xmm1 +; SSE3-NEXT: por %xmm3, %xmm1 ; SSE3-NEXT: pand %xmm2, %xmm1 ; SSE3-NEXT: por %xmm4, %xmm1 ; SSE3-NEXT: retq diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll index 6b49f22..9256a43 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll @@ -1314,10 +1314,10 @@ define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind { define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind { ; AVX1-LABEL: negative: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u],zero,zero,zero,zero,zero,zero,zero,xmm0[0,2,4,6,8,10,12,14] -; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll index f448f41..86423ce 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -1713,9 +1713,8 @@ define <8 x i16> @shuffle_v8i16_XX4X8acX(<8 x i16> %a, <8 x i16> %b) { ; ; SSSE3-LABEL: shuffle_v8i16_XX4X8acX: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,8,9,u,u],zero,zero,zero,zero,zero,zero,xmm0[u,u] -; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[u,u,0,1,4,5,8,9,u,u] -; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,0,1,4,5,8,9,u,u] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: shuffle_v8i16_XX4X8acX: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll index 82df05e..e5285ae 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -3358,9 +3358,9 @@ define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_ ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[u,u],zero,zero,xmm2[12],zero,xmm2[u,u,u],zero,zero,xmm2[u,0,3] ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,13,u,u,3,3],zero,xmm1[8,u,u,u,12,1,u],zero,zero ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u],zero,zero,xmm4[u,u,u,u,1,6,13,u,u],zero,xmm4[u,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13,u,u,u,u],zero,zero,zero,xmm0[u,u,12,u,u] -; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5],xmm0[6,7] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,0,0,255,255,255,255,0,0,0,255,255,0,255,255] ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -- 2.7.4