From 6f7b0cd530de71e5c47ae237346a1d7182b332ea Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 5 Aug 2016 11:16:53 +0000 Subject: [PATCH] [X86][SSE] Added target shuffle combine binary compute matching function. NFCI. Added matchBinaryPermuteVectorShuffle and moved the blend+zero and insertps matching code into it. llvm-svn: 277808 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 152 +++++++++++++++++--------------- 1 file changed, 80 insertions(+), 72 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 7fa7c02..f52a1ec 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -25005,6 +25005,71 @@ static bool matchBinaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask, return false; } +static bool matchBinaryPermuteVectorShuffle(MVT SrcVT, ArrayRef<int> Mask, + SDValue &V1, SDValue &V2, + SDLoc &DL, SelectionDAG &DAG, + const X86Subtarget &Subtarget, + unsigned &Shuffle, MVT &ShuffleVT, + unsigned &PermuteImm) { + unsigned NumMaskElts = Mask.size(); + + // Attempt to blend with zero. + if (NumMaskElts <= 8 && ((Subtarget.hasSSE41() && SrcVT.is128BitVector()) || + (Subtarget.hasAVX() && SrcVT.is256BitVector()))) { + // Determine a type compatible with X86ISD::BLENDI. + // TODO - add 16i16 support (requires lane duplication). 
+ MVT BlendVT = SrcVT; + if (Subtarget.hasAVX2()) { + if (BlendVT == MVT::v4i64) + BlendVT = MVT::v8i32; + else if (BlendVT == MVT::v2i64) + BlendVT = MVT::v4i32; + } else { + if (BlendVT == MVT::v2i64 || BlendVT == MVT::v4i32) + BlendVT = MVT::v8i16; + else if (BlendVT == MVT::v4i64) + BlendVT = MVT::v4f64; + else if (BlendVT == MVT::v8i32) + BlendVT = MVT::v8f32; + } + + if (isSequentialOrUndefOrZeroInRange(Mask, /*Pos*/ 0, /*Size*/ NumMaskElts, + /*Low*/ 0) && + NumMaskElts <= BlendVT.getVectorNumElements()) { + unsigned BlendSize = BlendVT.getVectorNumElements(); + unsigned MaskRatio = BlendSize / NumMaskElts; + + PermuteImm = 0; + for (unsigned i = 0; i != BlendSize; ++i) + if (Mask[i / MaskRatio] < 0) + PermuteImm |= 1u << i; + + V2 = getZeroVector(BlendVT, Subtarget, DAG, DL); + Shuffle = X86ISD::BLENDI; + ShuffleVT = BlendVT; + return true; + } + } + + // Attempt to combine to INSERTPS. + if (Subtarget.hasSSE41() && NumMaskElts == 4 && + (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32)) { + SmallBitVector Zeroable(4, false); + for (unsigned i = 0; i != NumMaskElts; ++i) + if (Mask[i] < 0) + Zeroable[i] = true; + + if (Zeroable.any()) + if (matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) { + Shuffle = X86ISD::INSERTPS; + ShuffleVT = MVT::v4f32; + return true; + } + } + + return false; +} + /// \brief Combine an arbitrary chain of shuffles into a single instruction if /// possible. /// @@ -25142,78 +25207,21 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root, return true; } - // Attempt to blend with zero. - if (NumMaskElts <= 8 && - ((Subtarget.hasSSE41() && VT.is128BitVector()) || - (Subtarget.hasAVX() && VT.is256BitVector()))) { - // Convert VT to a type compatible with X86ISD::BLENDI. - // TODO - add 16i16 support (requires lane duplication). 
- MVT ShuffleVT = MaskVT; - if (Subtarget.hasAVX2()) { - if (ShuffleVT == MVT::v4i64) - ShuffleVT = MVT::v8i32; - else if (ShuffleVT == MVT::v2i64) - ShuffleVT = MVT::v4i32; - } else { - if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32) - ShuffleVT = MVT::v8i16; - else if (ShuffleVT == MVT::v4i64) - ShuffleVT = MVT::v4f64; - else if (ShuffleVT == MVT::v8i32) - ShuffleVT = MVT::v8f32; - } - - if (isSequentialOrUndefOrZeroInRange(Mask, /*Pos*/ 0, /*Size*/ NumMaskElts, - /*Low*/ 0) && - NumMaskElts <= ShuffleVT.getVectorNumElements()) { - unsigned BlendMask = 0; - unsigned ShuffleSize = ShuffleVT.getVectorNumElements(); - unsigned MaskRatio = ShuffleSize / NumMaskElts; - - if (Depth == 1 && Root.getOpcode() == X86ISD::BLENDI) - return false; - - for (unsigned i = 0; i != ShuffleSize; ++i) - if (Mask[i / MaskRatio] < 0) - BlendMask |= 1u << i; - - SDValue Zero = getZeroVector(ShuffleVT, Subtarget, DAG, DL); - Res = DAG.getBitcast(ShuffleVT, Input); - DCI.AddToWorklist(Res.getNode()); - Res = DAG.getNode(X86ISD::BLENDI, DL, ShuffleVT, Res, Zero, - DAG.getConstant(BlendMask, DL, MVT::i8)); - DCI.AddToWorklist(Res.getNode()); - DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), - /*AddTo*/ true); - return true; - } - } - - // Attempt to combine to INSERTPS. - if (Subtarget.hasSSE41() && NumMaskElts == 4 && - (VT == MVT::v2f64 || VT == MVT::v4f32)) { - SmallBitVector Zeroable(4, false); - for (unsigned i = 0; i != NumMaskElts; ++i) - if (Mask[i] < 0) - Zeroable[i] = true; - - unsigned InsertPSMask; - SDValue V1 = Input, V2 = Input; - if (Zeroable.any() && matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, - Zeroable, Mask, DAG)) { - if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTPS) - return false; // Nothing to do! 
- V1 = DAG.getBitcast(MVT::v4f32, V1); - DCI.AddToWorklist(V1.getNode()); - V2 = DAG.getBitcast(MVT::v4f32, V2); - DCI.AddToWorklist(V2.getNode()); - Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, - DAG.getConstant(InsertPSMask, DL, MVT::i8)); - DCI.AddToWorklist(Res.getNode()); - DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), - /*AddTo*/ true); - return true; - } + SDValue V1 = Input, V2 = Input; + if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, V1, V2, DL, DAG, Subtarget, + Shuffle, ShuffleVT, PermuteImm)) { + if (Depth == 1 && Root.getOpcode() == Shuffle) + return false; // Nothing to do! + V1 = DAG.getBitcast(ShuffleVT, V1); + DCI.AddToWorklist(V1.getNode()); + V2 = DAG.getBitcast(ShuffleVT, V2); + DCI.AddToWorklist(V2.getNode()); + Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2, + DAG.getConstant(PermuteImm, DL, MVT::i8)); + DCI.AddToWorklist(Res.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), + /*AddTo*/ true); + return true; } // Don't try to re-form single instruction chains under any circumstances now -- 2.7.4