return DAG.getBuildVector(VT, SDLoc(SVN), Ops);
}
-// Match shuffles that can be converted to any_vector_extend_in_reg.
+// Match shuffles that can be converted to *_vector_extend_in_reg.
// This is often generated during legalization.
// e.g. v4i32 <0,u,1,u> -> (v2i64 any_vector_extend_in_reg(v4i32 src)),
// and returns the EVT to which the extension should be performed.
+// NOTE: this assumes that the src is the first operand of the shuffle.
static std::optional<EVT> canCombineShuffleToExtendVectorInreg(
unsigned Opcode, EVT VT, std::function<bool(unsigned)> Match,
SelectionDAG &DAG, const TargetLowering &TLI, bool LegalTypes,
// Attempt to match a '*_extend_vector_inreg' shuffle, we just search for
// power-of-2 extensions as they are the most likely.
+ // FIXME: should try Scale == NumElts case too,
for (unsigned Scale = 2; Scale < NumElts; Scale *= 2) {
- // Check for non power of 2 vector sizes
+ // The vector width must be a multiple of Scale.
if (NumElts % Scale != 0)
continue;
return DAG.getBitcast(VT, DAG.getNode(Opcode, SDLoc(SVN), *OutVT, N0));
}
+// Match shuffles that can be converted to zero_extend_vector_inreg.
+// This is often generated during legalization.
+// e.g. v4i32 <0,z,1,u> -> (v2i64 zero_extend_vector_inreg(v4i32 src))
+// where 'z' denotes an element known to be zero and 'u' denotes undef.
+// NOTE: only operand 0 of the shuffle is considered as the extension source
+// (see the FIXME about commuted operands near the end).
+static SDValue combineShuffleToZeroExtendVectorInReg(ShuffleVectorSDNode *SVN,
+                                                     SelectionDAG &DAG,
+                                                     const TargetLowering &TLI,
+                                                     bool LegalOperations) {
+  // NOTE(review): only legal result types are ever requested from the
+  // matcher, even though the call site runs before type legalization is
+  // complete -- presumably to avoid forming types the type legalizer would
+  // have to split; confirm against the Level guard at the call site.
+  bool LegalTypes = true;
+  EVT VT = SVN->getValueType(0);
+  assert(!VT.isScalableVector() && "Encountered scalable shuffle?");
+  unsigned NumElts = VT.getVectorNumElements();
+
+  // TODO: add support for big-endian when we have a test case.
+  bool IsBigEndian = DAG.getDataLayout().isBigEndian();
+  if (!VT.isInteger() || IsBigEndian)
+    return SDValue();
+
+  // Take a writable copy of the mask; zeroable-element knowledge will be
+  // manifested into it (as the local -2 sentinel) further below.
+  SmallVector<int, 16> Mask(SVN->getMask().begin(), SVN->getMask().end());
+  // Invoke Fn(Indice, OpIdx, OpEltIdx) for each non-undef mask indice,
+  // decomposed into which shuffle operand (0 or 1) and which element of that
+  // operand it refers to. Indice is passed by reference so Fn may rewrite it
+  // in the Mask copy above.
+  auto ForEachDecomposedIndice = [NumElts, &Mask](auto Fn) {
+    for (int &Indice : Mask) {
+      if (Indice < 0)
+        continue;
+      int OpIdx = (unsigned)Indice < NumElts ? 0 : 1;
+      int OpEltIdx = (unsigned)Indice < NumElts ? Indice : Indice - NumElts;
+      Fn(Indice, OpIdx, OpEltIdx);
+    }
+  };
+
+  // Which elements of which operand does this shuffle demand?
+  std::array<APInt, 2> OpsDemandedElts;
+  for (APInt &OpDemandedElts : OpsDemandedElts)
+    OpDemandedElts = APInt::getZero(NumElts);
+  ForEachDecomposedIndice(
+      [&OpsDemandedElts](int &Indice, int OpIdx, int OpEltIdx) {
+        OpsDemandedElts[OpIdx].setBit(OpEltIdx);
+      });
+
+  // Element-wise(!), which of these demanded elements are known to be zero?
+  std::array<APInt, 2> OpsKnownZeroElts;
+  for (auto I : zip(SVN->ops(), OpsDemandedElts, OpsKnownZeroElts))
+    std::get<2>(I) =
+        DAG.computeVectorKnownZeroElements(std::get<0>(I), std::get<1>(I));
+
+  // Manifest zeroable element knowledge in the shuffle mask.
+  // NOTE: we don't have 'zeroable' sentinel value in generic DAG,
+  // this is a local invention, but it won't leak into DAG.
+  // FIXME: should we not manifest them, but just check when matching?
+  bool HadZeroableElts = false;
+  ForEachDecomposedIndice([&OpsKnownZeroElts, &HadZeroableElts](
+                              int &Indice, int OpIdx, int OpEltIdx) {
+    if (OpsKnownZeroElts[OpIdx][OpEltIdx]) {
+      Indice = -2; // Zeroable element.
+      HadZeroableElts = true;
+    }
+  });
+
+  // Don't proceed unless we've refined at least one zeroable mask indice.
+  // If we didn't, then we are still trying to match the same shuffle mask
+  // we previously tried to match as ISD::ANY_EXTEND_VECTOR_INREG,
+  // and evidently failed. Proceeding will lead to endless combine loops.
+  if (!HadZeroableElts)
+    return SDValue();
+
+  // FIXME: the shuffle may be more fine-grained than we want.
+
+  // Predicate handed to the matcher: does the (zeroable-refined) mask
+  // describe a zero-extension with the given per-element scaling factor?
+  // For example,
+  // shuffle<0,z,1,-1> == (v2i64 zero_extend_vector_inreg(v4i32))
+  // But not shuffle<z,z,1,-1> and not shuffle<0,z,z,-1> ! (for same types)
+  auto isZeroExtend = [NumElts, SrcMask = Mask](unsigned Scale) {
+    assert(Scale >= 2 && Scale <= NumElts && NumElts % Scale == 0 &&
+           "Unexpected mask scaling factor.");
+    ArrayRef<int> Mask = SrcMask;
+    for (unsigned SrcElt = 0, NumSrcElts = NumElts / Scale;
+         SrcElt != NumSrcElts; ++SrcElt) {
+      // Analyze the shuffle mask in Scale-sized chunks.
+      ArrayRef<int> MaskChunk = Mask.take_front(Scale);
+      assert(MaskChunk.size() == Scale && "Unexpected mask size.");
+      Mask = Mask.drop_front(MaskChunk.size());
+      // The first indice in this chunk must be SrcElt, but not zero!
+      // FIXME: undef should be fine, but that results in more-defined result.
+      if (int FirstIndice = MaskChunk[0]; (unsigned)FirstIndice != SrcElt)
+        return false;
+      // The rest of the indices in this chunk must be zeros.
+      // FIXME: undef should be fine, but that results in more-defined result.
+      if (!all_of(MaskChunk.drop_front(1),
+                  [](int Indice) { return Indice == -2; }))
+        return false;
+    }
+    assert(Mask.empty() && "Did not process the whole mask?");
+    return true;
+  };
+
+  unsigned Opcode = ISD::ZERO_EXTEND_VECTOR_INREG;
+  SDValue Op = SVN->getOperand(0);
+  // FIXME: try to also match with commuted operands.
+  std::optional<EVT> OutVT = canCombineShuffleToExtendVectorInreg(
+      Opcode, VT, isZeroExtend, DAG, TLI, LegalTypes, LegalOperations);
+  if (!OutVT)
+    return SDValue();
+  return DAG.getBitcast(VT, DAG.getNode(Opcode, SDLoc(SVN), *OutVT, Op));
+}
+
// Detect 'truncate_vector_inreg' style shuffles that pack the lower parts of
// each source element of a large type into the lowest elements of a smaller
// destination type. This is often generated during legalization.
if (SDValue V = foldShuffleOfConcatUndefs(SVN, DAG))
return V;
+ // Match shuffles that can be converted to ISD::ZERO_EXTEND_VECTOR_INREG.
+ // Perform this really late, because it could eliminate knowledge
+ // of undef elements created by this shuffle.
+ if (Level < AfterLegalizeTypes)
+ if (SDValue V = combineShuffleToZeroExtendVectorInReg(SVN, DAG, TLI,
+ LegalOperations))
+ return V;
+
return SDValue();
}