From 43bf9917cc54be406c0efb811635407dbb99c984 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Mon, 8 Oct 2018 18:08:02 +0000
Subject: [PATCH] [x86] make horizontal binop matching clearer; NFCI

The instructions are complicated, so this code will probably never
be very obvious, but hopefully this makes it better.

As shown in PR39195:
https://bugs.llvm.org/show_bug.cgi?id=39195
...we need to improve the matching to not miss cases where we're
h-opping on 1 source vector, and that should be a small patch after
this rearranging.

llvm-svn: 343989
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 78 ++++++++++++++++-----------------
 1 file changed, 37 insertions(+), 41 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 08a9294..ef8bf06 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -36927,10 +36927,12 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
 /// A horizontal-op B, for some already available A and B, and if so then LHS is
 /// set to A, RHS to B, and the routine returns 'true'.
-/// Note that the binary operation should have the property that if one of the
-/// operands is UNDEF then the result is UNDEF.
 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
-  // Look for the following pattern: if
+  // If either operand is undef, bail out. The binop should be simplified.
+  if (LHS.isUndef() || RHS.isUndef())
+    return false;
+
+  // Look for the following pattern:
   //   A = < float a0, float a1, float a2, float a3 >
   //   B = < float b0, float b1, float b2, float b3 >
   // and
@@ -36945,25 +36947,15 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
     return false;
 
   MVT VT = LHS.getSimpleValueType();
-
   assert((VT.is128BitVector() || VT.is256BitVector()) &&
          "Unsupported vector type for horizontal add/sub");
 
-  // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
-  // operate independently on 128-bit lanes.
-  unsigned NumElts = VT.getVectorNumElements();
-  unsigned NumLanes = VT.getSizeInBits()/128;
-  unsigned NumLaneElts = NumElts / NumLanes;
-  assert((NumLaneElts % 2 == 0) &&
-         "Vector type should have an even number of elements in each lane");
-  unsigned HalfLaneElts = NumLaneElts/2;
-
   // View LHS in the form
   //   LHS = VECTOR_SHUFFLE A, B, LMask
-  // If LHS is not a shuffle then pretend it is the shuffle
+  // If LHS is not a shuffle, then pretend it is the identity shuffle:
   //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
-  // NOTE: in what follows a default initialized SDValue represents an UNDEF of
-  // type VT.
+  // NOTE: A default initialized SDValue represents an UNDEF of type VT.
+  unsigned NumElts = VT.getVectorNumElements();
   SDValue A, B;
   SmallVector<int, 16> LMask(NumElts);
   if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
@@ -36974,8 +36966,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
     std::copy(Mask.begin(), Mask.end(), LMask.begin());
   } else {
-    if (!LHS.isUndef())
-      A = LHS;
+    A = LHS;
     for (unsigned i = 0; i != NumElts; ++i)
       LMask[i] = i;
   }
@@ -36992,43 +36983,48 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
     std::copy(Mask.begin(), Mask.end(), RMask.begin());
   } else {
-    if (!RHS.isUndef())
-      C = RHS;
+    C = RHS;
     for (unsigned i = 0; i != NumElts; ++i)
       RMask[i] = i;
   }
 
+  // If A and B occur in reverse order in RHS, then canonicalize by commuting
+  // RHS operands and shuffle mask.
+  if (A != C) {
+    std::swap(C, D);
+    ShuffleVectorSDNode::commuteMask(RMask);
+  }
   // Check that the shuffles are both shuffling the same vectors.
-  if (!(A == C && B == D) && !(A == D && B == C))
+  if (!(A == C && B == D))
     return false;
 
-  // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
-  if (!A.getNode() && !B.getNode())
-    return false;
-
-  // If A and B occur in reverse order in RHS, then "swap" them (which means
-  // rewriting the mask).
-  if (A != C)
-    ShuffleVectorSDNode::commuteMask(RMask);
-
-  // At this point LHS and RHS are equivalent to
-  //   LHS = VECTOR_SHUFFLE A, B, LMask
-  //   RHS = VECTOR_SHUFFLE A, B, RMask
+  // LHS and RHS are now:
+  //   LHS = shuffle A, B, LMask
+  //   RHS = shuffle A, B, RMask
   // Check that the masks correspond to performing a horizontal operation.
-  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
-    for (unsigned i = 0; i != NumLaneElts; ++i) {
-      int LIdx = LMask[i+l], RIdx = RMask[i+l];
-
-      // Ignore any UNDEF components.
+  // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
+  // so we just repeat the inner loop if this is a 256-bit op.
+  unsigned Num128BitChunks = VT.getSizeInBits() / 128;
+  unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
+  assert((NumEltsPer128BitChunk % 2 == 0) &&
+         "Vector type should have an even number of elements in each lane");
+  for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
+    for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
+      // Ignore undefined components.
+      int LIdx = LMask[i + j], RIdx = RMask[i + j];
       if (LIdx < 0 || RIdx < 0 ||
           (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
          (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
        continue;
 
-      // Check that successive elements are being operated on.  If not, this is
+      // The low half of the 128-bit result must choose from A.
+      // The high half of the 128-bit result must choose from B.
+      unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
+      unsigned Src = i >= NumEltsPer64BitChunk;
+
+      // Check that successive elements are being operated on. If not, this is
       // not a horizontal operation.
-      unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
-      int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
+      int Index = 2 * (i % NumEltsPer64BitChunk) + NumElts * Src + j;
       if (!(LIdx == Index && RIdx == Index + 1) &&
           !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
         return false;
-- 
2.7.4
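Note, appended after the patch trailer so it stays out of git-am's way: the
rewritten inner loop accepts exactly one index pair per output element, and
the Index formula is easier to sanity-check outside the DAG. Below is a
minimal standalone C++ sketch of that index math. It mirrors the variable
names from the patched code, but the program itself is a hypothetical
illustration, not LLVM API, and the v8f32 shape is an assumption.

// hadd_mask_sketch.cpp - standalone model of the expected-mask math in
// isHorizontalBinOp above. Hypothetical example; not part of the commit.
#include <cstdio>

int main() {
  // Assume a 256-bit vector of 8 floats (v8f32), as consumed by vhaddps.
  const unsigned NumElts = 8;
  const unsigned Num128BitChunks = 256 / 128;
  const unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
  const unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
  for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
    for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
      // Within each 128-bit chunk, the low half must draw adjacent pairs
      // from A (Src = 0), the high half from B (Src = 1, offset by NumElts).
      unsigned Src = i >= NumEltsPer64BitChunk;
      int Index = 2 * (i % NumEltsPer64BitChunk) + NumElts * Src + j;
      std::printf("elt %u: LMask = %2d, RMask = %2d (from %c)\n",
                  j + i, Index, Index + 1, Src ? 'B' : 'A');
    }
  }
  return 0;
}

For v8f32 this prints the pairs (0,1) (2,3) (8,9) (10,11) for the low
128-bit lane and (4,5) (6,7) (12,13) (14,15) for the high lane, matching
vhaddps's per-lane behavior; any LMask/RMask deviating from these pairs
(modulo the IsCommutative swap and undef holes) makes the routine return
false.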