[DAGCombiner] Add vector demanded elements support to computeKnownBits

author Simon Pilgrim <llvm-dev@redking.me.uk>

Thu, 27 Oct 2016 14:29:28 +0000 (14:29 +0000)

committer Simon Pilgrim <llvm-dev@redking.me.uk>

Thu, 27 Oct 2016 14:29:28 +0000 (14:29 +0000)
author Simon Pilgrim <llvm-dev@redking.me.uk>
Thu, 27 Oct 2016 14:29:28 +0000 (14:29 +0000)
committer Simon Pilgrim <llvm-dev@redking.me.uk>
Thu, 27 Oct 2016 14:29:28 +0000 (14:29 +0000)
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h

index 6cc1f50..5b53554 100644 (file)
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1256,12 +1256,22 @@ public:
      const;
  
    /// Determine which bits of Op are known to be either zero or one and return
-  /// them in the KnownZero/KnownOne bitsets.  Targets can implement the
-  /// computeKnownBitsForTargetNode method in the TargetLowering class to allow
-  /// target nodes to be understood.
+  /// them in the KnownZero/KnownOne bitsets. For vectors, the known bits are
+  /// those that are shared by every vector element.
+  /// Targets can implement the computeKnownBitsForTargetNode method in the
+  /// TargetLowering class to allow target nodes to be understood.
    void computeKnownBits(SDValue Op, APInt &KnownZero, APInt &KnownOne,
                          unsigned Depth = 0) const;
  
+  /// Determine which bits of Op are known to be either zero or one and return
+  /// them in the KnownZero/KnownOne bitsets. The DemandedElts argument allows
+  /// us to only collect the known bits that are shared by the requested vector
+  /// elements.
+  /// Targets can implement the computeKnownBitsForTargetNode method in the
+  /// TargetLowering class to allow target nodes to be understood.
+  void computeKnownBits(SDValue Op, APInt &KnownZero, APInt &KnownOne,
+                        const APInt &DemandedElts, unsigned Depth = 0) const;
+
    /// Test if the given value is known to have exactly one bit set. This differs
    /// from computeKnownBits in that it doesn't necessarily determine which bit
    /// is set.
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

index c2c7bb0..cb37583 100644 (file)
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2007,9 +2007,26 @@ static const APInt *getValidShiftAmountConstant(SDValue V) {
  }
  
  /// Determine which bits of Op are known to be either zero or one and return
-/// them in the KnownZero/KnownOne bitsets.
+/// them in the KnownZero/KnownOne bitsets. For vectors, the known bits are
+/// those that are shared by every vector element.
  void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
                                      APInt &KnownOne, unsigned Depth) const {
+  EVT VT = Op.getValueType();
+  APInt DemandedElts = VT.isVector()
+                           ? APInt::getAllOnesValue(VT.getVectorNumElements())
+                           : APInt(1, 1);
+  computeKnownBits(Op, KnownZero, KnownOne, DemandedElts, Depth);
+}
+
+/// Determine which bits of Op are known to be either zero or one and return
+/// them in the KnownZero/KnownOne bitsets. The DemandedElts argument allows
+/// us to only collect the known bits that are shared by the requested vector
+/// elements.
+/// TODO: We only support DemandedElts on a few opcodes so far, the remainder
+/// should be added when they become necessary.
+void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
+                                    APInt &KnownOne, const APInt &DemandedElts,
+                                    unsigned Depth) const {
    unsigned BitWidth = Op.getScalarValueSizeInBits();
  
    KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
@@ -2017,6 +2034,10 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
      return;  // Limit search depth.
  
    APInt KnownZero2, KnownOne2;
+  unsigned NumElts = DemandedElts.getBitWidth();
+
+  if (DemandedElts == APInt(NumElts, 0))
+    return;  // No demanded elts, better to assume we don't know anything.
  
    switch (Op.getOpcode()) {
    case ISD::Constant:
@@ -2025,9 +2046,15 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
      KnownZero = ~KnownOne;
      break;
    case ISD::BUILD_VECTOR:
-    // Collect the known bits that are shared by every vector element.
+    // Collect the known bits that are shared by every demanded vector element.
+    assert(NumElts == Op.getValueType().getVectorNumElements() &&
+           "Unexpected vector size");
      KnownZero = KnownOne = APInt::getAllOnesValue(BitWidth);
-    for (SDValue SrcOp : Op->ops()) {
+    for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
+      if (!DemandedElts[i])
+        continue;
+
+      SDValue SrcOp = Op.getOperand(i);
        computeKnownBits(SrcOp, KnownZero2, KnownOne2, Depth + 1);
  
        // BUILD_VECTOR can implicitly truncate sources, we must handle this.
@@ -2038,16 +2065,72 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
          KnownZero2 = KnownZero2.trunc(BitWidth);
        }
  
-      // Known bits are the values that are shared by every element.
-      // TODO: support per-element known bits.
+      // Known bits are the values that are shared by every demanded element.
+      KnownOne &= KnownOne2;
+      KnownZero &= KnownZero2;
+    }
+    break;
+  case ISD::VECTOR_SHUFFLE: {
+    // Collect the known bits that are shared by every vector element referenced
+    // by the shuffle.
+    APInt DemandedLHS(NumElts, 0), DemandedRHS(NumElts, 0);
+    KnownZero = KnownOne = APInt::getAllOnesValue(BitWidth);
+    const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
+    assert(NumElts == SVN->getMask().size() && "Unexpected vector size");
+    for (unsigned i = 0; i != NumElts; ++i) {
+      int M = SVN->getMaskElt(i);
+      if (M < 0) {
+        // For UNDEF elements, we don't know anything about the common state of
+        // the shuffle result.
+        // FIXME: Is this too pessimistic?
+        KnownZero = KnownOne = APInt(BitWidth, 0);
+        break;
+      }
+      if (!DemandedElts[i])
+        continue;
+
+      if ((unsigned)M < NumElts)
+        DemandedLHS.setBit((unsigned)M % NumElts);
+      else
+        DemandedRHS.setBit((unsigned)M % NumElts);
+    }
+    // Known bits are the values that are shared by every demanded element.
+    if (DemandedLHS != APInt(NumElts, 0)) {
+      SDValue LHS = Op.getOperand(0);
+      computeKnownBits(LHS, KnownZero2, KnownOne2, DemandedLHS, Depth + 1);
        KnownOne &= KnownOne2;
        KnownZero &= KnownZero2;
      }
+    if (DemandedRHS != APInt(NumElts, 0)) {
+      SDValue RHS = Op.getOperand(1);
+      computeKnownBits(RHS, KnownZero2, KnownOne2, DemandedRHS, Depth + 1);
+      KnownOne &= KnownOne2;
+      KnownZero &= KnownZero2;
+    }
+    break;
+  }
+  case ISD::EXTRACT_SUBVECTOR: {
+    // If we know the element index, just demand that subvector elements,
+    // otherwise demand them all.
+    SDValue Src = Op.getOperand(0);
+    ConstantSDNode *SubIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+    unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
+    if (SubIdx && SubIdx->getAPIntValue().ule(NumSrcElts - NumElts)) {
+      // Offset the demanded elts by the subvector index.
+      uint64_t Idx = SubIdx->getZExtValue();
+      APInt DemandedSrc = DemandedElts.zext(NumSrcElts).shl(Idx);
+      computeKnownBits(Src, KnownZero, KnownOne, DemandedSrc, Depth + 1);
+    } else {
+      computeKnownBits(Src, KnownZero, KnownOne, Depth + 1);
+    }
      break;
+  }
    case ISD::AND:
      // If either the LHS or the RHS are Zero, the result is zero.
-    computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1);
-    computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1);
+    computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, DemandedElts,
+                     Depth + 1);
+    computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts,
+                     Depth + 1);
  
      // Output known-1 bits are only known if set in both the LHS & RHS.
      KnownOne &= KnownOne2;
@@ -2206,7 +2289,8 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
      if (NewBits.getBoolValue())
        InputDemandedBits |= InSignBit;
  
-    computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
+    computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts,
+                     Depth + 1);
      KnownOne &= InputDemandedBits;
      KnownZero &= InputDemandedBits;
  
@@ -2253,7 +2337,8 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
      APInt NewBits   = APInt::getHighBitsSet(BitWidth, BitWidth - InBits);
      KnownZero = KnownZero.trunc(InBits);
      KnownOne = KnownOne.trunc(InBits);
-    computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
+    computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts,
+                     Depth + 1);
      KnownZero = KnownZero.zext(BitWidth);
      KnownOne = KnownOne.zext(BitWidth);
      KnownZero |= NewBits;
@@ -2266,7 +2351,8 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
  
      KnownZero = KnownZero.trunc(InBits);
      KnownOne = KnownOne.trunc(InBits);
-    computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
+    computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts,
+                     Depth + 1);
  
      // Note if the sign bit is known to be zero or one.
      bool SignBitKnownZero = KnownZero.isNegative();
@@ -2439,16 +2525,28 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
      // At the moment we keep this simple and skip tracking the specific
      // element. This way we get the lowest common denominator for all elements
      // of the vector.
-    // TODO: get information for given vector element
+    SDValue InVec = Op.getOperand(0);
+    SDValue EltNo = Op.getOperand(1);
+    EVT VecVT = InVec.getValueType();
      const unsigned BitWidth = Op.getValueSizeInBits();
-    const unsigned EltBitWidth = Op.getOperand(0).getScalarValueSizeInBits();
+    const unsigned EltBitWidth = VecVT.getScalarSizeInBits();
+    const unsigned NumSrcElts = VecVT.getVectorNumElements();
      // If BitWidth > EltBitWidth the value is anyext:ed. So we do not know
      // anything about the extended bits.
      if (BitWidth > EltBitWidth) {
        KnownZero = KnownZero.trunc(EltBitWidth);
        KnownOne = KnownOne.trunc(EltBitWidth);
      }
-    computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
+    ConstantSDNode *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo);
+    if (ConstEltNo && ConstEltNo->getAPIntValue().ult(NumSrcElts)) {
+      // If we know the element index, just demand that vector element.
+      unsigned Idx = ConstEltNo->getZExtValue();
+      APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
+      computeKnownBits(InVec, KnownZero, KnownOne, DemandedElt, Depth + 1);
+    } else {
+      // Unknown element index, so ignore DemandedElts and demand them all.
+      computeKnownBits(InVec, KnownZero, KnownOne, Depth + 1);
+    }
      if (BitWidth > EltBitWidth) {
        KnownZero = KnownZero.zext(BitWidth);
        KnownOne = KnownOne.zext(BitWidth);
diff --git a/llvm/test/CodeGen/X86/avx-vperm2x128.ll b/llvm/test/CodeGen/X86/avx-vperm2x128.ll

index 740fd77..f7f54a0 100644 (file)
--- a/llvm/test/CodeGen/X86/avx-vperm2x128.ll
+++ b/llvm/test/CodeGen/X86/avx-vperm2x128.ll
@@ -465,11 +465,9 @@ define <4 x i64> @shuffle_v4i64_67zz(<4 x i64> %a, <4 x i64> %b) {
  ; AVX1-LABEL: shuffle_v4i64_67zz:
  ; AVX1:       ## BB#0:
  ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
  ; AVX1-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
  ; AVX1-NEXT:    retq
  ;
  ; AVX2-LABEL: shuffle_v4i64_67zz:
diff --git a/llvm/test/CodeGen/X86/known-bits-vector.ll b/llvm/test/CodeGen/X86/known-bits-vector.ll

index e124eda..142fbed 100644 (file)
--- a/llvm/test/CodeGen/X86/known-bits-vector.ll
+++ b/llvm/test/CodeGen/X86/known-bits-vector.ll
@@ -5,16 +5,14 @@
  define i32 @knownbits_mask_extract_sext(<8 x i16> %a0) nounwind {
  ; X32-LABEL: knownbits_mask_extract_sext:
  ; X32:       # BB#0:
-; X32-NEXT:    vandps {{\.LCPI.*}}, %xmm0, %xmm0
-; X32-NEXT:    vmovd %xmm0, %eax
-; X32-NEXT:    cwtl
+; X32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
+; X32-NEXT:    vpextrw $0, %xmm0, %eax
  ; X32-NEXT:    retl
  ;
  ; X64-LABEL: knownbits_mask_extract_sext:
  ; X64:       # BB#0:
-; X64-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
-; X64-NEXT:    vmovd %xmm0, %eax
-; X64-NEXT:    cwtl
+; X64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    vpextrw $0, %xmm0, %eax
  ; X64-NEXT:    retq
    %1 = and <8 x i16> %a0, <i16 15, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
    %2 = extractelement <8 x i16> %1, i32 0
@@ -31,17 +29,10 @@ define float @knownbits_mask_extract_uitofp(<2 x i64> %a0) nounwind {
  ; X32-NEXT:    subl $16, %esp
  ; X32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
  ; X32-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7]
-; X32-NEXT:    vpextrd $1, %xmm0, %eax
  ; X32-NEXT:    vmovq %xmm0, {{[0-9]+}}(%esp)
-; X32-NEXT:    xorl %ecx, %ecx
-; X32-NEXT:    testl %eax, %eax
-; X32-NEXT:    setns %cl
  ; X32-NEXT:    fildll {{[0-9]+}}(%esp)
-; X32-NEXT:    fadds {{\.LCPI.*}}(,%ecx,4)
  ; X32-NEXT:    fstps {{[0-9]+}}(%esp)
-; X32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT:    vmovss %xmm0, (%esp)
-; X32-NEXT:    flds (%esp)
+; X32-NEXT:    flds {{[0-9]+}}(%esp)
  ; X32-NEXT:    movl %ebp, %esp
  ; X32-NEXT:    popl %ebp
  ; X32-NEXT:    retl
@@ -51,18 +42,7 @@ define float @knownbits_mask_extract_uitofp(<2 x i64> %a0) nounwind {
  ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
  ; X64-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7]
  ; X64-NEXT:    vmovq %xmm0, %rax
-; X64-NEXT:    testq %rax, %rax
-; X64-NEXT:    js .LBB1_1
-; X64-NEXT:  # BB#2:
-; X64-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm0
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB1_1:
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    shrq %rcx
-; X64-NEXT:    andl $1, %eax
-; X64-NEXT:    orq %rcx, %rax
  ; X64-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm0
-; X64-NEXT:    vaddss %xmm0, %xmm0, %xmm0
  ; X64-NEXT:    retq
    %1 = and <2 x i64> %a0, <i64 65535, i64 -1>
    %2 = extractelement <2 x i64> %1, i32 0
@@ -74,15 +54,15 @@ define <4 x i32> @knownbits_mask_shuffle_sext(<8 x i16> %a0) nounwind {
  ; X32-LABEL: knownbits_mask_shuffle_sext:
  ; X32:       # BB#0:
  ; X32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
-; X32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X32-NEXT:    vpmovsxwd %xmm0, %xmm0
+; X32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; X32-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
  ; X32-NEXT:    retl
  ;
  ; X64-LABEL: knownbits_mask_shuffle_sext:
  ; X64:       # BB#0:
  ; X64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
-; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X64-NEXT:    vpmovsxwd %xmm0, %xmm0
+; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; X64-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
  ; X64-NEXT:    retq
    %1 = and <8 x i16> %a0, <i16 -1, i16 -1, i16 -1, i16 -1, i16 15, i16 15, i16 15, i16 15>
    %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -95,22 +75,14 @@ define <4 x float> @knownbits_mask_shuffle_uitofp(<4 x i32> %a0) nounwind {
  ; X32:       # BB#0:
  ; X32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
  ; X32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; X32-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
-; X32-NEXT:    vpsrld $16, %xmm0, %xmm0
-; X32-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
-; X32-NEXT:    vaddps {{\.LCPI.*}}, %xmm0, %xmm0
-; X32-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; X32-NEXT:    vcvtdq2ps %xmm0, %xmm0
  ; X32-NEXT:    retl
  ;
  ; X64-LABEL: knownbits_mask_shuffle_uitofp:
  ; X64:       # BB#0:
  ; X64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
  ; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; X64-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
-; X64-NEXT:    vpsrld $16, %xmm0, %xmm0
-; X64-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
-; X64-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
-; X64-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; X64-NEXT:    vcvtdq2ps %xmm0, %xmm0
  ; X64-NEXT:    retq
    %1 = and <4 x i32> %a0, <i32 -1, i32 -1, i32 255, i32 4085>
    %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
author	Simon Pilgrim <llvm-dev@redking.me.uk>
	Thu, 27 Oct 2016 14:29:28 +0000 (14:29 +0000)
committer	Simon Pilgrim <llvm-dev@redking.me.uk>
	Thu, 27 Oct 2016 14:29:28 +0000 (14:29 +0000)
llvm/include/llvm/CodeGen/SelectionDAG.h		patch \| blob \| history
llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp		patch \| blob \| history
llvm/test/CodeGen/X86/avx-vperm2x128.ll		patch \| blob \| history
llvm/test/CodeGen/X86/known-bits-vector.ll		patch \| blob \| history