From ee8b96f25345c19a1f7705acd93b66c74f9119d7 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 20 Nov 2018 13:23:37 +0000 Subject: [PATCH] [X86][SSE] Add computeKnownBits/ComputeNumSignBits DemandedElts support for PACKSS/PACKUS instructions. Pull out getPackDemandedElts demanded elts remapping helper from SimplifyDemandedVectorEltsForTargetNode and use it in computeKnownBitsForTargetNode/ComputeNumSignBitsForTargetNode. llvm-svn: 347303 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 79 ++++++++++++++++++++++----------- llvm/test/CodeGen/X86/combine-srl.ll | 20 +-------- llvm/test/CodeGen/X86/psubus.ll | 39 ++++++++-------- 3 files changed, 72 insertions(+), 66 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index e72cc5e..0a8a5e7 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -5932,6 +5932,31 @@ static void createPackShuffleMask(MVT VT, SmallVectorImpl &Mask, } } +// Split the demanded elts of a PACKSS/PACKUS node between its operands. +static void getPackDemandedElts(EVT VT, const APInt &DemandedElts, + APInt &DemandedLHS, APInt &DemandedRHS) { + int NumLanes = VT.getSizeInBits() / 128; + int NumElts = DemandedElts.getBitWidth(); + int NumInnerElts = NumElts / 2; + int NumEltsPerLane = NumElts / NumLanes; + int NumInnerEltsPerLane = NumInnerElts / NumLanes; + + DemandedLHS = APInt::getNullValue(NumInnerElts); + DemandedRHS = APInt::getNullValue(NumInnerElts); + + // Map DemandedElts to the packed operands. + for (int Lane = 0; Lane != NumLanes; ++Lane) { + for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) { + int OuterIdx = (Lane * NumEltsPerLane) + Elt; + int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt; + if (DemandedElts[OuterIdx]) + DemandedLHS.setBit(InnerIdx); + if (DemandedElts[OuterIdx + NumInnerEltsPerLane]) + DemandedRHS.setBit(InnerIdx); + } + } +} + /// Calculates the shuffle mask corresponding to the target-specific opcode. 
/// If the mask could be calculated, returns it in \p Mask, returns the shuffle /// operands in \p Ops, and returns true. @@ -29938,12 +29963,24 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, } case X86ISD::PACKUS: { // PACKUS is just a truncation if the upper half is zero. - // TODO: Add DemandedElts support. + APInt DemandedLHS, DemandedRHS; + getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS); + + Known.One = APInt::getAllOnesValue(BitWidth * 2); + Known.Zero = APInt::getAllOnesValue(BitWidth * 2); + KnownBits Known2; - DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1); - DAG.computeKnownBits(Op.getOperand(1), Known2, Depth + 1); - Known.One &= Known2.One; - Known.Zero &= Known2.Zero; + if (!!DemandedLHS) { + DAG.computeKnownBits(Op.getOperand(0), Known2, DemandedLHS, Depth + 1); + Known.One &= Known2.One; + Known.Zero &= Known2.Zero; + } + if (!!DemandedRHS) { + DAG.computeKnownBits(Op.getOperand(1), Known2, DemandedRHS, Depth + 1); + Known.One &= Known2.One; + Known.Zero &= Known2.Zero; + } + if (Known.countMinLeadingZeros() < BitWidth) Known.resetAll(); Known = Known.trunc(BitWidth); @@ -30039,10 +30076,16 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( case X86ISD::PACKSS: { // PACKSS is just a truncation if the sign bits extend to the packed size. - // TODO: Add DemandedElts support. 
+ APInt DemandedLHS, DemandedRHS; + getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS, + DemandedRHS); + unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits(); - unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1); - unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1); + unsigned Tmp0 = SrcBits, Tmp1 = SrcBits; + if (!!DemandedLHS) + Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1); + if (!!DemandedRHS) + Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1); unsigned Tmp = std::min(Tmp0, Tmp1); if (Tmp > (SrcBits - VTBits)) return Tmp - (SrcBits - VTBits); @@ -32226,24 +32269,8 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( } case X86ISD::PACKSS: case X86ISD::PACKUS: { - int NumLanes = VT.getSizeInBits() / 128; - int NumInnerElts = NumElts / 2; - int NumEltsPerLane = NumElts / NumLanes; - int NumInnerEltsPerLane = NumInnerElts / NumLanes; - - // Map DemandedElts to the packed operands. 
- APInt DemandedLHS = APInt::getNullValue(NumInnerElts); - APInt DemandedRHS = APInt::getNullValue(NumInnerElts); - for (int Lane = 0; Lane != NumLanes; ++Lane) { - for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) { - int OuterIdx = (Lane * NumEltsPerLane) + Elt; - int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt; - if (DemandedElts[OuterIdx]) - DemandedLHS.setBit(InnerIdx); - if (DemandedElts[OuterIdx + NumInnerEltsPerLane]) - DemandedRHS.setBit(InnerIdx); - } - } + APInt DemandedLHS, DemandedRHS; + getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS); APInt SrcUndef, SrcZero; if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, SrcUndef, diff --git a/llvm/test/CodeGen/X86/combine-srl.ll b/llvm/test/CodeGen/X86/combine-srl.ll index dbc56f9..1cecc68 100644 --- a/llvm/test/CodeGen/X86/combine-srl.ll +++ b/llvm/test/CodeGen/X86/combine-srl.ll @@ -237,25 +237,7 @@ define <4 x i32> @combine_vec_lshr_trunc_lshr_zero0(<4 x i64> %x) { define <4 x i32> @combine_vec_lshr_trunc_lshr_zero1(<4 x i64> %x) { ; SSE-LABEL: combine_vec_lshr_trunc_lshr_zero1: ; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: psrlq $51, %xmm2 -; SSE-NEXT: psrlq $50, %xmm1 -; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrlq $49, %xmm2 -; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; SSE-NEXT: packusdw %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $27, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrld $25, %xmm2 -; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $26, %xmm1 -; SSE-NEXT: psrld $24, %xmm0 -; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_lshr_trunc_lshr_zero1: diff --git 
a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll index 6e3c137..53d4ccc 100644 --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -1681,17 +1681,17 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind { ; ; SSE41-LABEL: psubus_8i64_max: ; SSE41: # %bb.0: # %vector.ph -; SSE41-NEXT: movdqa %xmm0, %xmm10 +; SSE41-NEXT: movdqa %xmm0, %xmm8 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002324991,9223372039002324991] -; SSE41-NEXT: movdqa %xmm8, %xmm7 +; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002324991,9223372039002324991] +; SSE41-NEXT: movdqa %xmm9, %xmm7 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm8, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm9, %xmm5 +; SSE41-NEXT: pand %xmm10, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: movapd {{.*#+}} xmm7 = [65535,65535] @@ -1699,12 +1699,12 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind { ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm11 ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm8, %xmm4 +; SSE41-NEXT: movdqa %xmm9, %xmm4 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm8, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm9, %xmm5 +; SSE41-NEXT: pand %xmm10, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: movapd %xmm7, %xmm4 @@ -1712,21 +1712,21 @@ define 
<8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind { ; SSE41-NEXT: packusdw %xmm11, %xmm4 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm8, %xmm3 +; SSE41-NEXT: movdqa %xmm9, %xmm3 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm3[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm8, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm9, %xmm5 +; SSE41-NEXT: pand %xmm10, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] ; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: movapd %xmm7, %xmm3 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 ; SSE41-NEXT: pxor %xmm1, %xmm6 -; SSE41-NEXT: movdqa %xmm8, %xmm0 +; SSE41-NEXT: movdqa %xmm9, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm6, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm8, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE41-NEXT: pand %xmm2, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] @@ -1734,11 +1734,8 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind { ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 ; SSE41-NEXT: packusdw %xmm3, %xmm7 ; SSE41-NEXT: packusdw %xmm4, %xmm7 -; SSE41-NEXT: psubusw %xmm7, %xmm10 -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero -; SSE41-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] -; SSE41-NEXT: packusdw %xmm10, %xmm0 +; SSE41-NEXT: psubusw %xmm7, %xmm8 +; SSE41-NEXT: movdqa %xmm8, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: psubus_8i64_max: -- 2.7.4