From: Roman Lebedev Date: Sun, 19 Sep 2021 14:21:17 +0000 (+0300) Subject: [X86] combineX86ShufflesRecursively(): call SimplifyMultipleUseDemandedVectorElts... X-Git-Tag: upstream/15.0.7~31147 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=1e72ca94e5796a744d0e1a8871c33b1b4edb0acb;p=platform%2Fupstream%2Fllvm.git [X86] combineX86ShufflesRecursively(): call SimplifyMultipleUseDemandedVectorElts() on after finishing recursing This was suggested in https://reviews.llvm.org/D108382#inline-1039018, and it avoids regressions in that patch. Reviewed By: RKSimon Differential Revision: https://reviews.llvm.org/D109065 --- diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 82b6a20..92dfbc1 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -37905,6 +37905,48 @@ static SDValue combineX86ShufflesRecursively( Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget)) return DAG.getBitcast(Root.getValueType(), HOp); + // Try to refine our inputs given our knowledge of target shuffle mask. + for (auto I : enumerate(Ops)) { + int OpIdx = I.index(); + SDValue &Op = I.value(); + + // What range of shuffle mask element values results in picking from Op? + int lo = OpIdx * Mask.size(); + int hi = lo + Mask.size(); + + // Which elements of Op do we demand, given the mask's granularity? + APInt OpDemandedElts(Mask.size(), 0); + for (int MaskElt : Mask) { + if (isInRange(MaskElt, lo, hi)) { // Picks from Op? + int OpEltIdx = MaskElt - lo; + OpDemandedElts.setBit(OpEltIdx); + } + } + + // Is the shuffle result smaller than the root? + if (Op.getValueSizeInBits() < RootSizeInBits) { + // We padded the mask with undefs. But we now need to undo that. 
+ unsigned NumExpectedVectorElts = Mask.size(); + unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts; + unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits; + assert(!OpDemandedElts.extractBits( + NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) && + "Demanding the virtual undef widening padding?"); + OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW + } + + // The Op itself may be of different VT, so we need to scale the mask. + unsigned NumOpElts = Op.getValueType().getVectorNumElements(); + APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts); + + // Can this operand be simplified any further, given it's demanded elements? + if (SDValue NewOp = + DAG.getTargetLoweringInfo().SimplifyMultipleUseDemandedVectorElts( + Op, OpScaledDemandedElts, DAG)) + Op = NewOp; + } + // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now? + // Widen any subvector shuffle inputs we've collected. if (any_of(Ops, [RootSizeInBits](SDValue Op) { return Op.getValueSizeInBits() < RootSizeInBits; diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll index a6d31bc..0a3ddb8 100644 --- a/llvm/test/CodeGen/X86/oddshuffles.ll +++ b/llvm/test/CodeGen/X86/oddshuffles.ll @@ -2261,12 +2261,13 @@ define <16 x i32> @splat_v3i32(<3 x i32>* %ptr) { ; ; AVX1-LABEL: splat_v3i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpinsrd $2, 8(%rdi), %xmm0, %xmm1 -; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6,7] +; AVX1-NEXT: movq (%rdi), %rax +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7] +; AVX1-NEXT: vmovd %eax, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6,7] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: splat_v3i32: @@ -2288,12 +2289,13 @@ define <16 x i32> @splat_v3i32(<3 x i32>* %ptr) { ; ; XOP-LABEL: splat_v3i32: ; XOP: # %bb.0: -; XOP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; XOP-NEXT: vpinsrd $2, 8(%rdi), %xmm0, %xmm1 -; XOP-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4,5,6,7] -; XOP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6,7] +; XOP-NEXT: movq (%rdi), %rax +; XOP-NEXT: vmovq %rax, %xmm0 +; XOP-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7] +; XOP-NEXT: vmovd %eax, %xmm2 +; XOP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6,7] ; XOP-NEXT: retq %1 = load <3 x i32>, <3 x i32>* %ptr, align 1 %2 = shufflevector <3 x i32> %1, <3 x i32> undef, <16 x i32> diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll index f9469d6..0a8283f 100644 --- a/llvm/test/CodeGen/X86/vselect.ll +++ b/llvm/test/CodeGen/X86/vselect.ll @@ -568,12 +568,13 @@ define <2 x i32> @simplify_select(i32 %x, <2 x i1> %z) { ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: movd %edi, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1] -; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: por %xmm1, %xmm3 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1,3] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[1,1] -; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm3 ; SSE2-NEXT: pandn %xmm1, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: simplify_select: