From 073d5e5945c428e20db0884943e6dcb7ff2158df Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek
Date: Thu, 24 Nov 2022 07:05:10 -0800
Subject: [PATCH] [Hexagon] Further improve code generation for shuffles

* Concatenate partial shuffles into longer ones whenever possible:
  In selection DAG, shuffle's operands and return type must all agree.
  This is not the case in LLVM IR, and non-conforming IR-level shuffles
  will be rewritten to match DAG's requirements. This can also turn a
  shuffle that could be matched to a single HVX instruction into
  shuffles that require more complex handling. Example: anything that
  takes two single vectors and returns a pair (e.g. V6_vshuffvdd).
  This is avoided by concatenating such shuffles into ones that take a
  vector pair and an undef pair, and produce a vector pair.
* Recognize perfect shuffles when masks contain `undef` values.
* Use funnel shifts for contracting shuffles.
* Recognize rotations as a separate step.

These changes go into a single commit because each one on its own
introduced some regressions.
---
 llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp    |  21 +-
 llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.h      |  12 +-
 llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp | 580 ++++++++++++++++-----
 llvm/lib/Target/Hexagon/HexagonISelLowering.h      |   9 +-
 llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp | 144 +++--
 llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll     | 466 +++++++++--------
 llvm/test/CodeGen/Hexagon/autohvx/mulh.ll          | 115 +---
 llvm/test/CodeGen/Hexagon/autohvx/qmul.ll          |  23 +-
 .../CodeGen/Hexagon/autohvx/shuffle-half-128b.ll   |   8 +-
 .../CodeGen/Hexagon/autohvx/shuffle-half-64b.ll    |   8 +-
 10 files changed, 830 insertions(+), 556 deletions(-)

diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index c7c72b9..c235357 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -682,6 +682,7 @@ void HexagonDAGToDAGISel::SelectIntrinsicWOChain(SDNode *N) {
   SDValue V = N->getOperand(1);
   SDValue U;
 
+  // Splat intrinsics.
   if (keepsLowBits(V, Bits, U)) {
     SDValue R = CurDAG->getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
                                 N->getOperand(0), U);
@@ -697,14 +698,14 @@ void HexagonDAGToDAGISel::SelectExtractSubvector(SDNode *N) {
   MVT ResTy = N->getValueType(0).getSimpleVT();
   auto IdxN = cast<ConstantSDNode>(N->getOperand(1));
   unsigned Idx = IdxN->getZExtValue();
-#ifndef NDEBUG
-  MVT InpTy = Inp.getValueType().getSimpleVT();
+
+  [[maybe_unused]] MVT InpTy = Inp.getValueType().getSimpleVT();
+  [[maybe_unused]] unsigned ResLen = ResTy.getVectorNumElements();
   assert(InpTy.getVectorElementType() == ResTy.getVectorElementType());
-  unsigned ResLen = ResTy.getVectorNumElements();
   assert(2 * ResLen == InpTy.getVectorNumElements());
   assert(ResTy.getSizeInBits() == 32);
   assert(Idx == 0 || Idx == ResLen);
-#endif
+
   unsigned SubReg = Idx == 0 ? Hexagon::isub_lo : Hexagon::isub_hi;
   SDValue Ext = CurDAG->getTargetExtractSubreg(SubReg, SDLoc(N), ResTy, Inp);
@@ -904,13 +905,12 @@ void HexagonDAGToDAGISel::Select(SDNode *N) {
     return N->setNodeId(-1);  // Already selected.
   auto isHvxOp = [this](SDNode *N) {
-    auto &HST = MF->getSubtarget<HexagonSubtarget>();
     for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
-      if (HST.isHVXVectorType(N->getValueType(i), true))
+      if (HST->isHVXVectorType(N->getValueType(i), true))
         return true;
     }
     for (SDValue I : N->ops()) {
-      if (HST.isHVXVectorType(I.getValueType(), true))
+      if (HST->isHVXVectorType(I.getValueType(), true))
         return true;
     }
     return false;
   };
@@ -1258,14 +1258,17 @@ void HexagonDAGToDAGISel::ppHoistZextI1(std::vector<SDNode *> &&Nodes) {
 void HexagonDAGToDAGISel::PreprocessISelDAG() {
   // Repack all nodes before calling each preprocessing function,
   // because each of them can modify the set of nodes.
-  auto getNodes = [this] () -> std::vector<SDNode*> {
-    std::vector<SDNode*> T;
+  auto getNodes = [this]() -> std::vector<SDNode *> {
+    std::vector<SDNode *> T;
     T.reserve(CurDAG->allnodes_size());
     for (SDNode &N : CurDAG->allnodes())
       T.push_back(&N);
     return T;
   };
 
+  if (HST->useHVXOps())
+    PreprocessHvxISelDAG();
+
   // Transform: (or (select c x 0) z)  ->  (select c (or x z) z)
   //            (or (select c 0 y) z)  ->  (select c z (or y z))
   ppSimplifyOrSelect0(getNodes());

diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.h b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.h
index 5060537..c42bef3 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.h
+++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.h
@@ -127,11 +127,6 @@ private:
     return SDValue(U, 0);
   }
 
-  void SelectHvxExtractSubvector(SDNode *N);
-  void SelectHvxShuffle(SDNode *N);
-  void SelectHvxRor(SDNode *N);
-  void SelectHvxVAlign(SDNode *N);
-
   bool keepsLowBits(const SDValue &Val, unsigned NumBits, SDValue &Src);
   bool isAlignedMemNode(const MemSDNode *N) const;
   bool isSmallStackStore(const StoreSDNode *N) const;
@@ -139,10 +134,17 @@ private:
   bool hasOneUse(const SDNode *N) const;
 
   // DAG preprocessing functions.
+  void PreprocessHvxISelDAG();
   void ppSimplifyOrSelect0(std::vector<SDNode *> &&Nodes);
   void ppAddrReorderAddShl(std::vector<SDNode *> &&Nodes);
   void ppAddrRewriteAndSrl(std::vector<SDNode *> &&Nodes);
   void ppHoistZextI1(std::vector<SDNode *> &&Nodes);
+  void ppHvxShuffleOfShuffle(std::vector<SDNode *> &&Nodes);
+
+  void SelectHvxExtractSubvector(SDNode *N);
+  void SelectHvxShuffle(SDNode *N);
+  void SelectHvxRor(SDNode *N);
+  void SelectHvxVAlign(SDNode *N);
 
   // Function postprocessing.
void updateAligna(); diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp index 65f050a..ccfc2f3 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp @@ -10,6 +10,7 @@ #include "HexagonISelDAGToDAG.h" #include "HexagonISelLowering.h" #include "HexagonTargetMachine.h" +#include "llvm/ADT/BitVector.h" #include "llvm/ADT/SetVector.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/SelectionDAGISel.h" @@ -17,15 +18,20 @@ #include "llvm/IR/IntrinsicsHexagon.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" +#include +#include #include +#include #include +#include #include +#include #include #include #define DEBUG_TYPE "hexagon-isel" - using namespace llvm; namespace { @@ -617,6 +623,7 @@ struct OpRef { OpRef(SDValue V) : OpV(V) {} bool isValue() const { return OpV.getNode() != nullptr; } bool isValid() const { return isValue() || !(OpN & Invalid); } + bool isUndef() const { return OpN & Undef; } static OpRef res(int N) { return OpRef(Whole | (N & Index)); } static OpRef fail() { return OpRef(Invalid); } @@ -921,18 +928,18 @@ namespace llvm { const unsigned HwLen; HvxSelector(HexagonDAGToDAGISel &HS, SelectionDAG &G) - : Lower(getHexagonLowering(G)), ISel(HS), DAG(G), - HST(getHexagonSubtarget(G)), HwLen(HST.getVectorLength()) {} + : Lower(getHexagonLowering(G)), ISel(HS), DAG(G), + HST(getHexagonSubtarget(G)), HwLen(HST.getVectorLength()) {} MVT getSingleVT(MVT ElemTy) const { assert(ElemTy != MVT::i1 && "Use getBoolVT for predicates"); - unsigned NumElems = HwLen / (ElemTy.getSizeInBits()/8); + unsigned NumElems = HwLen / (ElemTy.getSizeInBits() / 8); return MVT::getVectorVT(ElemTy, NumElems); } MVT getPairVT(MVT ElemTy) const { assert(ElemTy != MVT::i1); // Suspicious: there are no predicate pairs. - unsigned NumElems = (2*HwLen) / (ElemTy.getSizeInBits()/8); + unsigned NumElems = (2 * HwLen) / (ElemTy.getSizeInBits() / 8); return MVT::getVectorVT(ElemTy, NumElems); } @@ -946,6 +953,12 @@ namespace llvm { void selectRor(SDNode *N); void selectVAlign(SDNode *N); + static SmallVector getPerfectCompletions(ShuffleMask SM, + unsigned Width); + static SmallVector completeToPerfect( + ArrayRef Completions, unsigned Width); + static std::optional rotationDistance(ShuffleMask SM, unsigned WrapAt); + private: void select(SDNode *ISelN); void materialize(const ResultStack &Results); @@ -958,8 +971,10 @@ namespace llvm { PackMux, }; OpRef concats(OpRef Va, OpRef Vb, ResultStack &Results); + OpRef funnels(OpRef Va, OpRef Vb, int Amount, ResultStack &Results); + OpRef packs(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results, - MutableArrayRef NewMask, unsigned Options = None); + MutableArrayRef NewMask, unsigned Options = None); OpRef packp(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results, MutableArrayRef NewMask); OpRef vmuxs(ArrayRef Bytes, OpRef Va, OpRef Vb, @@ -980,9 +995,8 @@ namespace llvm { bool selectVectorConstants(SDNode *N); bool scalarizeShuffle(ArrayRef Mask, const SDLoc &dl, MVT ResTy, SDValue Va, SDValue Vb, SDNode *N); - }; -} +} // namespace llvm static void splitMask(ArrayRef Mask, MutableArrayRef MaskL, MutableArrayRef MaskR) { @@ -1031,6 +1045,13 @@ static bool isIdentity(ArrayRef Mask) { return true; } +static bool isLowHalfOnly(ArrayRef Mask) { + int L = Mask.size(); + assert(L % 2 == 0); + // Check if the second half of the mask is all-undef. 
+ return llvm::all_of(Mask.drop_front(L / 2), [](int M) { return M < 0; }); +} + static SmallVector getInputSegmentList(ShuffleMask SM, unsigned SegLen) { assert(isPowerOf2_32(SegLen)); @@ -1109,19 +1130,6 @@ static void packSegmentMask(ArrayRef Mask, ArrayRef OutSegMap, } } -static bool isPermutation(ArrayRef Mask) { - // Check by adding all numbers only works if there is no overflow. - assert(Mask.size() < 0x00007FFF && "Overflow failure"); - int Sum = 0; - for (int Idx : Mask) { - if (Idx == -1) - return false; - Sum += Idx; - } - int N = Mask.size(); - return 2*Sum == N*(N-1); -} - bool HvxSelector::selectVectorConstants(SDNode *N) { // Constant vectors are generated as loads from constant pools or as // splats of a constant value. Since they are generated during the @@ -1220,6 +1228,42 @@ OpRef HvxSelector::concats(OpRef Lo, OpRef Hi, ResultStack &Results) { return OpRef::res(Results.top()); } +OpRef HvxSelector::funnels(OpRef Va, OpRef Vb, int Amount, + ResultStack &Results) { + // Do a funnel shift towards the low end (shift right) by Amount bytes. + // If Amount < 0, treat it as shift left, i.e. do a shift right by + // Amount + HwLen. + auto VecLen = static_cast(HwLen); + + if (Amount == 0) + return Va; + if (Amount == VecLen) + return Vb; + + MVT Ty = getSingleVT(MVT::i8); + const SDLoc &dl(Results.InpNode); + + if (Amount < 0) + Amount += VecLen; + if (Amount > VecLen) { + Amount -= VecLen; + std::swap(Va, Vb); + } + + if (isUInt<3>(Amount)) { + SDValue A = getConst32(Amount, dl); + Results.push(Hexagon::V6_valignbi, Ty, {Vb, Va, A}); + } else if (isUInt<3>(VecLen - Amount)) { + SDValue A = getConst32(VecLen - Amount, dl); + Results.push(Hexagon::V6_vlalignbi, Ty, {Vb, Va, A}); + } else { + SDValue A = getConst32(Amount, dl); + Results.push(Hexagon::A2_tfrsi, Ty, {A}); + Results.push(Hexagon::V6_valignb, Ty, {Vb, Va, OpRef::res(-1)}); + } + return OpRef::res(Results.top()); +} + // Va, Vb are single vectors. If SM only uses two vector halves from Va/Vb, // pack these halves into a single vector, and remap SM into NewMask to use // the new vector instead. @@ -1230,6 +1274,16 @@ OpRef HvxSelector::packs(ShuffleMask SM, OpRef Va, OpRef Vb, if (!Va.isValid() || !Vb.isValid()) return OpRef::fail(); + if (Vb.isUndef()) { + std::copy(SM.Mask.begin(), SM.Mask.end(), NewMask.begin()); + return Va; + } + if (Va.isUndef()) { + std::copy(SM.Mask.begin(), SM.Mask.end(), NewMask.begin()); + ShuffleVectorSDNode::commuteMask(NewMask); + return Vb; + } + MVT Ty = getSingleVT(MVT::i8); MVT PairTy = getPairVT(MVT::i8); OpRef Inp[2] = {Va, Vb}; @@ -1377,7 +1431,7 @@ OpRef HvxSelector::packs(ShuffleMask SM, OpRef Va, OpRef Vb, } // Check if the arguments can be packed by valign(Va,Vb) or valign(Vb,Va). - + // FIXME: maybe remove this? ShuffleMask SMH(MaskH); assert(SMH.Mask.size() == VecLen); shuffles::MaskT MaskA(SMH.Mask); @@ -1518,6 +1572,12 @@ OpRef HvxSelector::shuffs1(ShuffleMask SM, OpRef Va, ResultStack &Results) { if (isUndef(SM.Mask)) return OpRef::undef(getSingleVT(MVT::i8)); + // First, check for rotations. + if (auto Dist = rotationDistance(SM, VecLen)) { + OpRef Rotate = funnels(Va, Va, *Dist, Results); + if (Rotate.isValid()) + return Rotate; + } unsigned HalfLen = HwLen / 2; assert(isPowerOf2_32(HalfLen)); @@ -1560,8 +1620,7 @@ OpRef HvxSelector::shuffs2(ShuffleMask SM, OpRef Va, OpRef Vb, return shuffs1(ShuffleMask(PackedMask), P, Results); // TODO: Before we split the mask, try perfect shuffle on concatenated - // operands. 
This won't work now, because the perfect code does not
-  // tolerate undefs in the mask.
+  // operands.
 
   shuffles::MaskT MaskL(VecLen), MaskR(VecLen);
   splitMask(SM.Mask, MaskL, MaskR);
@@ -1602,10 +1661,17 @@ OpRef HvxSelector::shuffp1(ShuffleMask SM, OpRef Va, ResultStack &Results) {
     return concats(L, H, Results);
   }
 
-  OpRef R = perfect(SM, Va, Results);
-  if (R.isValid())
-    return R;
-  // TODO commute the mask and try the opposite order of the halves.
+  if (!isLowHalfOnly(SM.Mask)) {
+    // Doing a perfect shuffle on a low-half mask (i.e. where the upper
+    // half is all-undef) may produce a perfect shuffle that generates a
+    // legitimate upper half. This isn't wrong, but if the perfect shuffle
+    // was possible, then there is a good chance that a shorter
+    // (contracting) code may be used as well (e.g. V6_vshuffeb, etc).
+    OpRef R = perfect(SM, Va, Results);
+    if (R.isValid())
+      return R;
+    // TODO commute the mask and try the opposite order of the halves.
+  }
 
   OpRef L = shuffs2(SM.lo(), OpRef::lo(Va), OpRef::hi(Va), Results);
   OpRef H = shuffs2(SM.hi(), OpRef::lo(Va), OpRef::hi(Va), Results);
@@ -1824,21 +1890,146 @@ bool HvxSelector::scalarizeShuffle(ArrayRef<int> Mask, const SDLoc &dl,
   return true;
 }
 
+SmallVector<uint32_t, 8> HvxSelector::getPerfectCompletions(ShuffleMask SM,
+                                                            unsigned Width) {
+  auto possibilities = [](ArrayRef<uint8_t> Bs, unsigned Width) -> uint32_t {
+    unsigned Impossible = ~(1u << Width) + 1;
+    for (unsigned I = 0, E = Bs.size(); I != E; ++I) {
+      uint8_t B = Bs[I];
+      if (B == 0xff)
+        continue;
+      if (~Impossible == 0)
+        break;
+      for (unsigned Log = 0; Log != Width; ++Log) {
+        if (Impossible & (1u << Log))
+          continue;
+        unsigned Expected = (I >> Log) % 2;
+        if (B != Expected)
+          Impossible |= (1u << Log);
+      }
+    }
+    return ~Impossible;
+  };
+
+  SmallVector<uint32_t, 8> Worklist(Width);
+
+  for (unsigned BitIdx = 0; BitIdx != Width; ++BitIdx) {
+    SmallVector<uint8_t> BitValues(SM.Mask.size());
+    for (int i = 0, e = SM.Mask.size(); i != e; ++i) {
+      int M = SM.Mask[i];
+      if (M < 0)
+        BitValues[i] = 0xff;
+      else
+        BitValues[i] = (M & (1u << BitIdx)) != 0;
+    }
+    Worklist[BitIdx] = possibilities(BitValues, Width);
+  }
+
+  // If there is a word P in Worklist that matches multiple possibilities,
+  // then if any other word Q matches any of the possibilities matched by P,
+  // then Q matches all the possibilities matched by P. In fact, P == Q.
+  // In other words, for any two words P and Q, the sets of possibilities
+  // matched by P and Q are either equal or disjoint (no partial overlap).
+  //
+  // Illustration: For 4-bit values there are 4 complete sequences:
+  //     a:  0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1
+  //     b:  0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1
+  //     c:  0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
+  //     d:  0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1
+  //
+  // Words containing unknown bits that match two of the complete
+  // sequences:
+  //    ab:  0 u u 1 0 u u 1 0 u u 1 0 u u 1
+  //    ac:  0 u 0 u u 1 u 1 0 u 0 u u 1 u 1
+  //    ad:  0 u 0 u 0 u 0 u u 1 u 1 u 1 u 1
+  //    bc:  0 0 u u u u 1 1 0 0 u u u u 1 1
+  //    bd:  0 0 u u 0 0 u u u u 1 1 u u 1 1
+  //    cd:  0 0 0 0 u u u u u u u u 1 1 1 1
+  //
+  // Proof of the claim above:
+  // Let P be a word that matches s0 and s1. For that to happen, all known
+  // bits in P must match s0 and s1 exactly.
+  // Assume there is Q that matches s1. Note that since P and Q came from
+  // the same shuffle mask, the positions of unknown bits in P and Q match
+  // exactly, which makes the indices of known bits be exactly the same
+  // between P and Q. Since P matches s0 and s1, the known bits of P must
+  // match both s0 and s1. Also, since Q matches s1, the known bits in Q
+  // are exactly the same as in s1, which means that they are exactly the
+  // same as in P. This implies that P == Q.
+
+  return Worklist;
+}
+
+SmallVector<uint32_t, 8>
+HvxSelector::completeToPerfect(ArrayRef<uint32_t> Completions, unsigned Width) {
+  // Pick a completion if there are multiple possibilities. For now just
+  // select any valid completion.
+  SmallVector<uint32_t, 8> Comps(Completions.begin(), Completions.end());
+
+  for (unsigned I = 0; I != Width; ++I) {
+    uint32_t P = Comps[I];
+    assert(P != 0);
+    if (isPowerOf2_32(P))
+      continue;
+    // T = least significant bit of P.
+    uint32_t T = P ^ ((P - 1) & P);
+    // Clear T in all remaining words matching P.
+    for (unsigned J = I + 1; J != Width; ++J) {
+      if (Comps[J] == P)
+        Comps[J] ^= T;
+    }
+    Comps[I] = T;
+  }
+
+  return Comps;
+}
+
+std::optional<int> HvxSelector::rotationDistance(ShuffleMask SM,
+                                                 unsigned WrapAt) {
+  std::optional<int> Dist;
+  for (int I = 0, E = SM.Mask.size(); I != E; ++I) {
+    int M = SM.Mask[I];
+    if (M < 0)
+      continue;
+    if (Dist) {
+      if ((I + *Dist) % static_cast<int>(WrapAt) != M)
+        return std::nullopt;
+    } else {
+      // The integer % operator rounds the quotient towards zero, so the
+      // remainder "misbehaves" when the dividend crosses 0 (it changes
+      // sign). Add WrapAt in an attempt to keep I + Dist non-negative.
+      Dist = M - I;
+      if (Dist < 0)
+        Dist = *Dist + WrapAt;
+    }
+  }
+  return Dist;
+}
+
 OpRef HvxSelector::contracting(ShuffleMask SM, OpRef Va, OpRef Vb,
                                ResultStack &Results) {
-  DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
+  DEBUG_WITH_TYPE("isel", { dbgs() << __func__ << '\n'; });
   if (!Va.isValid() || !Vb.isValid())
     return OpRef::fail();
 
   // Contracting shuffles, i.e. instructions that always discard some bytes
   // from the operand vectors.
   //
+  // Funnel shifts
   // V6_vshuff{e,o}b
+  // V6_vshuf{e,o}h
   // V6_vdealb4w
   // V6_vpack{e,o}{b,h}
 
   int VecLen = SM.Mask.size();
 
+  // First, check for funnel shifts.
+  if (auto Dist = rotationDistance(SM, 2 * VecLen)) {
+    OpRef Funnel = funnels(Va, Vb, *Dist, Results);
+    if (Funnel.isValid())
+      return Funnel;
+  }
+
   MVT SingleTy = getSingleVT(MVT::i8);
   MVT PairTy = getPairVT(MVT::i8);
 
@@ -1916,48 +2107,6 @@ OpRef HvxSelector::contracting(ShuffleMask SM, OpRef Va, OpRef Vb,
     return OpRef::res(Results.top());
   }
 
-  // Still need special treatment of vdealvdd
-  std::pair<int, unsigned> Strip = findStrip(SM.Mask, 1, VecLen);
-  if (Strip.second != 4)
-    return OpRef::fail();
-
-  int NextInMask = SM.Mask[Strip.second];
-  if (NextInMask < VecLen) {
-    // Check if this is vpack{e,o}.
-    int N = VecLen;
-    int L = Strip.second;
-    // Check if the first strip starts at 0 or at L.
-    if (Strip.first != 0 && Strip.first != L)
-      return OpRef::fail();
-    // Examine the rest of the mask.
-    for (int I = L; I < N; I += L) {
-      auto S = findStrip(SM.Mask.drop_front(I), 1, N-I);
-      // Check whether the mask element at the beginning of each strip
-      // increases by 2L each time.
-      if (S.first - Strip.first != 2*I)
-        return OpRef::fail();
-      // Check whether each strip is of the same length.
-      if (S.second != unsigned(L))
-        return OpRef::fail();
-    }
-
-    // Strip.first == 0  =>  vpacke
-    // Strip.first == L  =>  vpacko
-    assert(Strip.first == 0 || Strip.first == L);
-    NodeTemplate Res;
-    // FIXME: remove L=4 case after adding perfect mask completion.
- if (L == 4) { - const SDLoc &dl(Results.InpNode); - Results.push(Hexagon::A2_tfrsi, MVT::i32, {getConst32(-L, dl)}); - OpRef C = OpRef::res(Results.top()); - MVT JoinTy = MVT::getVectorVT(SingleTy.getVectorElementType(), - 2 * SingleTy.getVectorNumElements()); - Results.push(Hexagon::V6_vdealvdd, JoinTy, {Vb, Va, C}); - return Strip.first == 0 ? OpRef::lo(OpRef::res(Results.top())) - : OpRef::hi(OpRef::res(Results.top())); - } - } - return OpRef::fail(); } @@ -2016,7 +2165,7 @@ OpRef HvxSelector::expanding(ShuffleMask SM, OpRef Va, ResultStack &Results) { } OpRef HvxSelector::perfect(ShuffleMask SM, OpRef Va, ResultStack &Results) { - DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';}); + DEBUG_WITH_TYPE("isel", { dbgs() << __func__ << '\n'; }); // V6_vdeal{b,h} // V6_vshuff{b,h} @@ -2030,13 +2179,10 @@ OpRef HvxSelector::perfect(ShuffleMask SM, OpRef Va, ResultStack &Results) { unsigned HwLog = Log2_32(HwLen); // The result length must be the same as the length of a single vector, // or a vector pair. - assert(LogLen == HwLog || LogLen == HwLog+1); - bool HavePairs = LogLen == HwLog+1; + assert(LogLen == HwLog || LogLen == HwLog + 1); + bool HavePairs = LogLen == HwLog + 1; - if (!isPermutation(SM.Mask)) - return OpRef::fail(); - - SmallVector Perm(LogLen); + SmallVector Perm(LogLen); // Check if this could be a perfect shuffle, or a combination of perfect // shuffles. @@ -2115,51 +2261,28 @@ OpRef HvxSelector::perfect(ShuffleMask SM, OpRef Va, ResultStack &Results) { // E 1 1 1 0 7 0 1 1 1 7 0 1 1 1 7 0 1 1 1 // F 1 1 1 1 F 1 1 1 1 F 1 1 1 1 F 1 1 1 1 - // There is one special case that is not a perfect shuffle, but - // can be turned into one easily: when the shuffle operates on - // a vector pair, but the two vectors in the pair are swapped. - // The code below that identifies perfect shuffles will reject - // it, unless the order is reversed. + // There is one special case that is not a perfect shuffle, but can be + // turned into one easily: when the shuffle operates on a vector pair, + // but the two vectors in the pair are swapped. The code that identifies + // perfect shuffles will reject it, unless the order is reversed. shuffles::MaskT MaskStorage(SM.Mask); bool InvertedPair = false; if (HavePairs && SM.Mask[0] >= int(HwLen)) { for (int i = 0, e = SM.Mask.size(); i != e; ++i) { int M = SM.Mask[i]; - MaskStorage[i] = M >= int(HwLen) ? M-HwLen : M+HwLen; + MaskStorage[i] = M >= int(HwLen) ? M - HwLen : M + HwLen; } InvertedPair = true; + SM = ShuffleMask(MaskStorage); } - ArrayRef LocalMask(MaskStorage); - auto XorPow2 = [] (ArrayRef Mask, unsigned Num) { - unsigned X = Mask[0] ^ Mask[Num/2]; - // Check that the first half has the X's bits clear. - if ((Mask[0] & X) != 0) - return 0u; - for (unsigned I = 1; I != Num/2; ++I) { - if (unsigned(Mask[I] ^ Mask[I+Num/2]) != X) - return 0u; - if ((Mask[I] & X) != 0) - return 0u; - } - return X; - }; + auto Comps = getPerfectCompletions(SM, LogLen); + if (llvm::any_of(Comps, [](uint32_t P) { return P == 0; })) + return OpRef::fail(); - // Create a vector of log2's for each column: Perm[i] corresponds to - // the i-th bit (lsb is 0). - assert(VecLen > 2); - for (unsigned I = VecLen; I >= 2; I >>= 1) { - // Examine the initial segment of Mask of size I. - unsigned X = XorPow2(LocalMask, I); - if (!isPowerOf2_32(X)) - return OpRef::fail(); - // Check the other segments of Mask. 
- for (int J = I; J < VecLen; J += I) { - if (XorPow2(LocalMask.slice(J, I), I) != X) - return OpRef::fail(); - } - Perm[Log2_32(X)] = Log2_32(I)-1; - } + auto Pick = completeToPerfect(Comps, LogLen); + for (unsigned I = 0; I != LogLen; ++I) + Perm[I] = Log2_32(Pick[I]); // Once we have Perm, represent it as cycles. Denote the maximum log2 // (equal to log2(VecLen)-1) as M. The cycle containing M can then be @@ -2186,7 +2309,7 @@ OpRef HvxSelector::perfect(ShuffleMask SM, OpRef Va, ResultStack &Results) { // (4 0 1)(4 0 2 3)(4 2), // which can be implemented as 3 vshufvdd instructions. - using CycleType = SmallVector; + using CycleType = SmallVector; std::set Cycles; std::set All; @@ -2198,13 +2321,13 @@ OpRef HvxSelector::perfect(ShuffleMask SM, OpRef Va, ResultStack &Results) { auto canonicalize = [LogLen](const CycleType &C) -> CycleType { unsigned LogPos, N = C.size(); for (LogPos = 0; LogPos != N; ++LogPos) - if (C[LogPos] == LogLen-1) + if (C[LogPos] == LogLen - 1) break; if (LogPos == N) return C; - CycleType NewC(C.begin()+LogPos, C.end()); - NewC.append(C.begin(), C.begin()+LogPos); + CycleType NewC(C.begin() + LogPos, C.end()); + NewC.append(C.begin(), C.begin() + LogPos); return NewC; }; @@ -2214,23 +2337,23 @@ OpRef HvxSelector::perfect(ShuffleMask SM, OpRef Va, ResultStack &Results) { if (Cs.size() != 1) return 0u; const CycleType &C = *Cs.begin(); - if (C[0] != Len-1) + if (C[0] != Len - 1) return 0u; int D = Len - C.size(); if (D != 0 && D != 1) return 0u; bool IsDeal = true, IsShuff = true; - for (unsigned I = 1; I != Len-D; ++I) { - if (C[I] != Len-1-I) + for (unsigned I = 1; I != Len - D; ++I) { + if (C[I] != Len - 1 - I) IsDeal = false; - if (C[I] != I-(1-D)) // I-1, I + if (C[I] != I - (1 - D)) // I-1, I IsShuff = false; } // At most one, IsDeal or IsShuff, can be non-zero. assert(!(IsDeal || IsShuff) || IsDeal != IsShuff); - static unsigned Deals[] = { Hexagon::V6_vdealb, Hexagon::V6_vdealh }; - static unsigned Shufs[] = { Hexagon::V6_vshuffb, Hexagon::V6_vshuffh }; + static unsigned Deals[] = {Hexagon::V6_vdealb, Hexagon::V6_vdealh}; + static unsigned Shufs[] = {Hexagon::V6_vshuffb, Hexagon::V6_vshuffh}; return IsDeal ? Deals[D] : (IsShuff ? Shufs[D] : 0); }; @@ -2265,7 +2388,7 @@ OpRef HvxSelector::perfect(ShuffleMask SM, OpRef Va, ResultStack &Results) { // This essentially strips the M value from the cycles where // it's present, and performs the insertion of M (then stripping) // for cycles without M (as described in an earlier comment). - SmallVector SwapElems; + SmallVector SwapElems; // When the input is extended (i.e. single vector becomes a pair), // this is done by using an "undef" vector as the second input. // However, then we get @@ -2277,28 +2400,27 @@ OpRef HvxSelector::perfect(ShuffleMask SM, OpRef Va, ResultStack &Results) { // Then at the end, this needs to be undone. To accomplish this, // artificially add "LogLen-1" at both ends of the sequence. if (!HavePairs) - SwapElems.push_back(LogLen-1); + SwapElems.push_back(LogLen - 1); for (const CycleType &C : Cycles) { // Do the transformation: (a1..an) -> (M a1..an)(M a1). - unsigned First = (C[0] == LogLen-1) ? 1 : 0; - SwapElems.append(C.begin()+First, C.end()); + unsigned First = (C[0] == LogLen - 1) ? 1 : 0; + SwapElems.append(C.begin() + First, C.end()); if (First == 0) SwapElems.push_back(C[0]); } if (!HavePairs) - SwapElems.push_back(LogLen-1); + SwapElems.push_back(LogLen - 1); const SDLoc &dl(Results.InpNode); - OpRef Arg = HavePairs ? 
Va
-                        : concats(Va, OpRef::undef(SingleTy), Results);
+  OpRef Arg = HavePairs ? Va : concats(Va, OpRef::undef(SingleTy), Results);
   if (InvertedPair)
     Arg = concats(OpRef::hi(Arg), OpRef::lo(Arg), Results);
 
-  for (unsigned I = 0, E = SwapElems.size(); I != E; ) {
-    bool IsInc = I == E-1 || SwapElems[I] < SwapElems[I+1];
+  for (unsigned I = 0, E = SwapElems.size(); I != E;) {
+    bool IsInc = I == E - 1 || SwapElems[I] < SwapElems[I + 1];
     unsigned S = (1u << SwapElems[I]);
-    if (I < E-1) {
-      while (++I < E-1 && IsInc == (SwapElems[I] < SwapElems[I+1]))
+    if (I < E - 1) {
+      while (++I < E - 1 && IsInc == (SwapElems[I] < SwapElems[I + 1]))
         S |= 1u << SwapElems[I];
       // The above loop will not add a bit for the final SwapElems[I+1],
       // so add it here.
@@ -2310,7 +2432,7 @@
     Results.push(Hexagon::A2_tfrsi, MVT::i32, {getConst32(S, dl)});
     Res.Opc = IsInc ? Hexagon::V6_vshuffvdd : Hexagon::V6_vdealvdd;
     Res.Ty = PairTy;
-    Res.Ops = { OpRef::hi(Arg), OpRef::lo(Arg), OpRef::res(-1) };
+    Res.Ops = {OpRef::hi(Arg), OpRef::lo(Arg), OpRef::res(-1)};
     Results.push(Res);
     Arg = OpRef::res(Results.top());
   }
@@ -2391,13 +2513,13 @@ void HvxSelector::selectExtractSubvector(SDNode *N) {
   MVT ResTy = N->getValueType(0).getSimpleVT();
   auto IdxN = cast<ConstantSDNode>(N->getOperand(1));
   unsigned Idx = IdxN->getZExtValue();
-#ifndef NDEBUG
-  MVT InpTy = Inp.getValueType().getSimpleVT();
+
+  [[maybe_unused]] MVT InpTy = Inp.getValueType().getSimpleVT();
+  [[maybe_unused]] unsigned ResLen = ResTy.getVectorNumElements();
   assert(InpTy.getVectorElementType() == ResTy.getVectorElementType());
-  unsigned ResLen = ResTy.getVectorNumElements();
   assert(2 * ResLen == InpTy.getVectorNumElements());
   assert(Idx == 0 || Idx == ResLen);
-#endif
+
   unsigned SubReg = Idx == 0 ? Hexagon::vsub_lo : Hexagon::vsub_hi;
   SDValue Ext = DAG.getTargetExtractSubreg(SubReg, SDLoc(N), ResTy, Inp);
@@ -2452,11 +2574,20 @@ void HvxSelector::selectShuffle(SDNode *N) {
   SDValue Vec0 = N->getOperand(0);
   SDValue Vec1 = N->getOperand(1);
+  assert(Vec0.getValueType() == ResTy && Vec1.getValueType() == ResTy);
+
   ResultStack Results(SN);
-  Results.push(TargetOpcode::COPY, ResTy, {Vec0});
-  Results.push(TargetOpcode::COPY, ResTy, {Vec1});
-  OpRef Va = OpRef::res(Results.top()-1);
-  OpRef Vb = OpRef::res(Results.top());
+  OpRef Va = OpRef::undef(ResTy);
+  OpRef Vb = OpRef::undef(ResTy);
+
+  if (!Vec0.isUndef()) {
+    Results.push(TargetOpcode::COPY, ResTy, {Vec0});
+    Va = OpRef::res(Results.top());
+  }
+  if (!Vec1.isUndef()) {
+    Results.push(TargetOpcode::COPY, ResTy, {Vec1});
+    Vb = OpRef::res(Results.top());
+  }
 
   OpRef Res = !HavePairs ? shuffs2(ShuffleMask(Mask), Va, Vb, Results)
                          : shuffp2(ShuffleMask(Mask), Va, Vb, Results);
@@ -2513,6 +2644,169 @@ void HvxSelector::selectVAlign(SDNode *N) {
   DAG.RemoveDeadNode(N);
 }
 
+void HexagonDAGToDAGISel::PreprocessHvxISelDAG() {
+  auto getNodes = [this]() -> std::vector<SDNode *> {
+    std::vector<SDNode *> T;
+    T.reserve(CurDAG->allnodes_size());
+    for (SDNode &N : CurDAG->allnodes())
+      T.push_back(&N);
+    return T;
+  };
+
+  ppHvxShuffleOfShuffle(getNodes());
+}
+
+template <> struct std::hash<SDValue> {
+  std::size_t operator()(SDValue V) const {
+    return std::hash<void *>()(V.getNode()) +
+           std::hash<unsigned>()(V.getResNo());
+  };
+};
+
+void HexagonDAGToDAGISel::ppHvxShuffleOfShuffle(std::vector<SDNode *> &&Nodes) {
+  // Motivating case:
+  //   t10: v64i32 = ...
+  //   t46: v128i8 = vector_shuffle<...> t44, t45
+  //   t48: v128i8 = vector_shuffle<...> t44, t45
+  //   t42: v128i8 = vector_shuffle<...> t46, t48
+  //   t12: v32i32 = extract_subvector t10, Constant:i32<0>
+  //   t44: v128i8 = bitcast t12
+  //   t15: v32i32 = extract_subvector t10, Constant:i32<32>
+  //   t45: v128i8 = bitcast t15
+  SelectionDAG &DAG = *CurDAG;
+  unsigned HwLen = HST->getVectorLength();
+
+  struct SubVectorInfo {
+    SubVectorInfo(SDValue S, unsigned H) : Src(S), HalfIdx(H) {}
+    SDValue Src;
+    unsigned HalfIdx;
+  };
+
+  using MapType = std::unordered_map<SDValue, unsigned>;
+
+  auto getMaskElt = [&](unsigned Idx, ShuffleVectorSDNode *Shuff0,
+                        ShuffleVectorSDNode *Shuff1,
+                        const MapType &OpMap) -> int {
+    // Treat Shuff0 and Shuff1 as operands to another vector shuffle, and
+    // Idx as a (non-undef) element of the top level shuffle's mask, that
+    // is, index into concat(Shuff0, Shuff1).
+    // Assuming that Shuff0 and Shuff1 both operate on subvectors of the
+    // same source vector (as described by OpMap), return the index of
+    // that source vector corresponding to Idx.
+    ShuffleVectorSDNode *OpShuff = Idx < HwLen ? Shuff0 : Shuff1;
+    if (Idx >= HwLen)
+      Idx -= HwLen;
+
+    // Get the mask index that M points at in the corresponding operand.
+    int MaybeN = OpShuff->getMaskElt(Idx);
+    if (MaybeN < 0)
+      return -1;
+
+    auto N = static_cast<unsigned>(MaybeN);
+    unsigned SrcBase = N < HwLen ? OpMap.at(OpShuff->getOperand(0))
+                                 : OpMap.at(OpShuff->getOperand(1));
+    if (N >= HwLen)
+      N -= HwLen;
+
+    return N + SrcBase;
+  };
+
+  auto fold3 = [&](SDValue TopShuff, SDValue Inp, MapType &&OpMap) -> SDValue {
+    // Fold all 3 shuffles into a single one.
+    auto *This = cast<ShuffleVectorSDNode>(TopShuff);
+    auto *S0 = cast<ShuffleVectorSDNode>(TopShuff.getOperand(0));
+    auto *S1 = cast<ShuffleVectorSDNode>(TopShuff.getOperand(1));
+    ArrayRef<int> TopMask = This->getMask();
+    // This should be guaranteed by type checks in the caller, and the fact
+    // that all shuffles should have been promoted to operate on MVT::i8.
+    assert(TopMask.size() == S0->getMask().size() &&
+           TopMask.size() == S1->getMask().size());
+    assert(TopMask.size() == HwLen);
+
+    SmallVector<int> FoldedMask(2 * HwLen);
+    for (unsigned I = 0; I != HwLen; ++I) {
+      int MaybeM = TopMask[I];
+      if (MaybeM >= 0) {
+        FoldedMask[I] =
+            getMaskElt(static_cast<unsigned>(MaybeM), S0, S1, OpMap);
+      } else {
+        FoldedMask[I] = -1;
+      }
+    }
+    // The second half of the result will be all-undef.
+    std::fill(FoldedMask.begin() + HwLen, FoldedMask.end(), -1);
+
+    // Return
+    //   FoldedShuffle = (Shuffle Inp, undef, FoldedMask)
+    //   (LoHalf FoldedShuffle)
+    const SDLoc &dl(TopShuff);
+    MVT SingleTy = MVT::getVectorVT(MVT::i8, HwLen);
+    MVT PairTy = MVT::getVectorVT(MVT::i8, 2 * HwLen);
+    SDValue FoldedShuff =
+        DAG.getVectorShuffle(PairTy, dl, DAG.getBitcast(PairTy, Inp),
+                             DAG.getUNDEF(PairTy), FoldedMask);
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SingleTy, FoldedShuff,
+                       DAG.getConstant(0, dl, MVT::i32));
+  };
+
+  auto getSourceInfo = [](SDValue V) -> std::optional<SubVectorInfo> {
+    while (V.getOpcode() == ISD::BITCAST)
+      V = V.getOperand(0);
+    if (V.getOpcode() != ISD::EXTRACT_SUBVECTOR)
+      return std::nullopt;
+    return SubVectorInfo(V.getOperand(0),
+                         !cast<ConstantSDNode>(V.getOperand(1))->isZero());
+  };
+
+  for (SDNode *N : Nodes) {
+    if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
+      continue;
+    EVT ResTy = N->getValueType(0);
+    if (ResTy.getVectorElementType() != MVT::i8)
+      continue;
+    if (ResTy.getVectorNumElements() != HwLen)
+      continue;
+
+    SDValue V0 = N->getOperand(0);
+    SDValue V1 = N->getOperand(1);
+    if (V0.getOpcode() != ISD::VECTOR_SHUFFLE)
+      continue;
+    if (V1.getOpcode() != ISD::VECTOR_SHUFFLE)
+      continue;
+    if (V0.getValueType() != ResTy || V1.getValueType() != ResTy)
+      continue;
+
+    // Check if all operands of the two operand shuffles are
+    // extract_subvectors from the same vector pair.
+    auto V0A = getSourceInfo(V0.getOperand(0));
+    if (!V0A.has_value())
+      continue;
+    auto V0B = getSourceInfo(V0.getOperand(1));
+    if (!V0B.has_value() || V0B->Src != V0A->Src)
+      continue;
+    auto V1A = getSourceInfo(V1.getOperand(0));
+    if (!V1A.has_value() || V1A->Src != V0A->Src)
+      continue;
+    auto V1B = getSourceInfo(V1.getOperand(1));
+    if (!V1B.has_value() || V1B->Src != V0A->Src)
+      continue;
+
+    // The source must be a pair. This should be guaranteed here,
+    // but check just in case.
+    EVT SrcTy = V0A->Src.getValueType();
+    assert(SrcTy.getSizeInBits() == 16 * HwLen);
+
+    MapType OpMap = {
+        {V0.getOperand(0), V0A->HalfIdx * HwLen},
+        {V0.getOperand(1), V0B->HalfIdx * HwLen},
+        {V1.getOperand(0), V1A->HalfIdx * HwLen},
+        {V1.getOperand(1), V1B->HalfIdx * HwLen},
+    };
+    SDValue NewS = fold3(SDValue(N, 0), V0A->Src, std::move(OpMap));
+    ReplaceNode(N, NewS.getNode());
+  }
+}
+
 void HexagonDAGToDAGISel::SelectHvxExtractSubvector(SDNode *N) {
   HvxSelector(*this, *CurDAG).selectExtractSubvector(N);
 }

diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/llvm/lib/Target/Hexagon/HexagonISelLowering.h
index afa4f66..3905a73 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.h
@@ -582,7 +582,14 @@ private:
                               SelectionDAG &DAG) const;
   void ReplaceHvxNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
                              SelectionDAG &DAG) const;
-  SDValue PerformHvxDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+  SDValue combineTruncateBeforeLegal(SDValue Op, DAGCombinerInfo &DCI) const;
+  SDValue combineConcatVectorsBeforeLegal(SDValue Op,
+                                          DAGCombinerInfo &DCI) const;
+  SDValue combineVectorShuffleBeforeLegal(SDValue Op,
+                                          DAGCombinerInfo &DCI) const;
+
+  SDValue PerformHvxDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 };
 } // end namespace llvm

diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
index 8fd9ab4f..b306e79 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
@@ -9,6 +9,8 @@
 #include "HexagonISelLowering.h"
 #include "HexagonRegisterInfo.h"
 #include "HexagonSubtarget.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -19,6 +21,10 @@
 #include "llvm/IR/IntrinsicsHexagon.h"
 #include "llvm/Support/CommandLine.h"
 
+#include
+#include
+#include
+
 using namespace llvm;
 
 static cl::opt<unsigned> HvxWidenThreshold("hexagon-hvx-widen",
@@ -429,7 +435,7 @@ HexagonTargetLowering::initializeHVXLowering() {
     }
   }
 
-  setTargetDAGCombine({ISD::SPLAT_VECTOR, ISD::VSELECT, ISD::TRUNCATE});
+  setTargetDAGCombine({ISD::CONCAT_VECTORS, ISD::TRUNCATE, ISD::VSELECT});
 }
 
 unsigned
@@ -3472,6 +3478,109 @@ HexagonTargetLowering::ReplaceHvxNodeResults(SDNode *N,
 }
 
 SDValue
+HexagonTargetLowering::combineTruncateBeforeLegal(SDValue Op,
+                                                  DAGCombinerInfo &DCI) const {
+  // Simplify V:v2NiB --(bitcast)--> vNi2B --(truncate)--> vNiB
+  // to extract-subvector (shuffle V, pick even, pick odd)
+
+  assert(Op.getOpcode() == ISD::TRUNCATE);
+  SelectionDAG &DAG = DCI.DAG;
+  const SDLoc &dl(Op);
+
+  if (Op.getOperand(0).getOpcode() != ISD::BITCAST)
+    return SDValue();
+  SDValue Cast = Op.getOperand(0);
+  SDValue Src = Cast.getOperand(0);
+
+  EVT TruncTy = Op.getValueType();
+  EVT CastTy = Cast.getValueType();
+  EVT SrcTy = Src.getValueType();
+  if (SrcTy.isSimple())
+    return SDValue();
+  if (SrcTy.getVectorElementType() != TruncTy.getVectorElementType())
+    return SDValue();
+  unsigned SrcLen = SrcTy.getVectorNumElements();
+  unsigned CastLen = CastTy.getVectorNumElements();
+  if (2 * CastLen != SrcLen)
+    return SDValue();
+
+  SmallVector<int> Mask(SrcLen);
+  for (int i = 0; i != static_cast<int>(CastLen); ++i) {
+    Mask[i] = 2 * i;
+    Mask[i + CastLen] = 2 * i + 1;
+  }
+  SDValue Deal =
+      DAG.getVectorShuffle(SrcTy, dl, Src, DAG.getUNDEF(SrcTy), Mask);
+  return opSplit(Deal, dl, DAG).first;
+}
+
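+// A worked example of the fold below (hypothetical inputs, added here for
+// illustration): with single-vector inputs of two elements (InpLen = 2)
+// and Order = {x, y},
+//   concat (shuffle x, y, <0,2>), (shuffle y, x, <0,2>)
+// becomes
+//   shuffle (concat x, y), undef, <0,2,2,0>
+// Each operand mask is appended to LongMask, with every index remapped to
+// the position its source element occupies in (concat x, y).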
+SDValue
+HexagonTargetLowering::combineConcatVectorsBeforeLegal(
+    SDValue Op, DAGCombinerInfo &DCI) const {
+  // Fold
+  //   concat (shuffle x, y, m1), (shuffle x, y, m2)
+  // into
+  //   shuffle (concat x, y), undef, m3
+  if (Op.getNumOperands() != 2)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  const SDLoc &dl(Op);
+  SDValue V0 = Op.getOperand(0);
+  SDValue V1 = Op.getOperand(1);
+
+  if (V0.getOpcode() != ISD::VECTOR_SHUFFLE)
+    return SDValue();
+  if (V1.getOpcode() != ISD::VECTOR_SHUFFLE)
+    return SDValue();
+
+  SetVector<SDValue> Order;
+  Order.insert(V0.getOperand(0));
+  Order.insert(V0.getOperand(1));
+  Order.insert(V1.getOperand(0));
+  Order.insert(V1.getOperand(1));
+
+  if (Order.size() > 2)
+    return SDValue();
+
+  // In ISD::VECTOR_SHUFFLE, the types of each input and the type of the
+  // result must be the same.
+  EVT InpTy = V0.getValueType();
+  assert(InpTy.isVector());
+  unsigned InpLen = InpTy.getVectorNumElements();
+
+  SmallVector<int> LongMask;
+  auto AppendToMask = [&](SDValue Shuffle) {
+    auto *SV = cast<ShuffleVectorSDNode>(Shuffle.getNode());
+    ArrayRef<int> Mask = SV->getMask();
+    SDValue X = Shuffle.getOperand(0);
+    SDValue Y = Shuffle.getOperand(1);
+    for (int M : Mask) {
+      if (M == -1) {
+        LongMask.push_back(M);
+        continue;
+      }
+      SDValue Src = static_cast<unsigned>(M) < InpLen ? X : Y;
+      if (static_cast<unsigned>(M) >= InpLen)
+        M -= InpLen;
+
+      int OutOffset = Order[0] == Src ? 0 : InpLen;
+      LongMask.push_back(M + OutOffset);
+    }
+  };
+
+  AppendToMask(V0);
+  AppendToMask(V1);
+
+  SDValue C0 = Order.front();
+  SDValue C1 = Order.back(); // Can be same as front
+  EVT LongTy = InpTy.getDoubleNumVectorElementsVT(*DAG.getContext());
+
+  SDValue Cat = DAG.getNode(ISD::CONCAT_VECTORS, dl, LongTy, {C0, C1});
+  return DAG.getVectorShuffle(LongTy, dl, Cat, DAG.getUNDEF(LongTy), LongMask);
+}
+
+SDValue
 HexagonTargetLowering::PerformHvxDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
     const {
   const SDLoc &dl(N);
@@ -3481,35 +3590,10 @@ HexagonTargetLowering::PerformHvxDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
 
   SmallVector<SDValue, 4> Ops(N->ops().begin(), N->ops().end());
 
-  if (Opc == ISD::TRUNCATE) {
-    // Simplify V:v2NiB --(bitcast)--> vNi2B --(truncate)--> vNiB
-    // to extract-subvector (shuffle V, pick even, pick odd)
-    if (Ops[0].getOpcode() != ISD::BITCAST)
-      return SDValue();
-    SDValue Cast = Ops[0];
-    SDValue Src = Cast.getOperand(0);
-
-    EVT TruncTy = Op.getValueType();
-    EVT CastTy = Cast.getValueType();
-    EVT SrcTy = Src.getValueType();
-    if (SrcTy.isSimple())
-      return SDValue();
-    if (SrcTy.getVectorElementType() != TruncTy.getVectorElementType())
-      return SDValue();
-    unsigned SrcLen = SrcTy.getVectorNumElements();
-    unsigned CastLen = CastTy.getVectorNumElements();
-    if (2 * CastLen != SrcLen)
-      return SDValue();
-
-    SmallVector<int> Mask(SrcLen);
-    for (int i = 0; i != static_cast<int>(CastLen); ++i) {
-      Mask[i] = 2 * i;
-      Mask[i + CastLen] = 2 * i + 1;
-    }
-    SDValue Deal =
-        DAG.getVectorShuffle(SrcTy, dl, Src, DAG.getUNDEF(SrcTy), Mask);
-    return opSplit(Deal, dl, DAG).first;
-  }
+  if (Opc == ISD::TRUNCATE)
+    return combineTruncateBeforeLegal(Op, DCI);
+  if (Opc == ISD::CONCAT_VECTORS)
+    return combineConcatVectorsBeforeLegal(Op, DCI);
 
   if (DCI.isBeforeLegalizeOps())
     return SDValue();

diff --git a/llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll b/llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll
index 3f754aa..369e9a9 100644
--- a/llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll
+++ b/llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll
@@ -219,209 +219,212 @@ define void @s8f32_0(ptr %a0, ptr %a1) #0 {
 ; CHECK:         .cfi_startproc
 ; CHECK-NEXT:  //
%bb.0: ; CHECK-NEXT: { -; CHECK-NEXT: r3:2 = combine(##.LCPI2_0,#8) -; CHECK-NEXT: v3:2.h = vunpack(v1.b) -; CHECK-NEXT: v1.cur = vmem(r0+#0) +; CHECK-NEXT: r7 = #64 +; CHECK-NEXT: r0 = #1 +; CHECK-NEXT: v0 = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r0 = #1 -; CHECK-NEXT: r7 = #512 -; CHECK-NEXT: r4 = #255 -; CHECK-NEXT: v3 = vmem(r3+#0) +; CHECK-NEXT: v2 = vsplat(r0) +; CHECK-NEXT: r3:2 = combine(##255,#8) +; CHECK-NEXT: v1 = valign(v0,v0,r7) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1 = vsplat(r0) -; CHECK-NEXT: v13 = vsplat(r7) -; CHECK-NEXT: v4 = vdelta(v1,v3) -; CHECK-NEXT: v0 = vxor(v0,v0) +; CHECK-NEXT: v3 = vsplat(r3) +; CHECK-NEXT: r7 = #512 +; CHECK-NEXT: v9:8.h = vunpack(v0.b) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v10 = vsplat(r4) +; CHECK-NEXT: v4 = vsplat(r7) ; CHECK-NEXT: r6 = ##-2147483648 -; CHECK-NEXT: v3:2.w = vunpack(v2.h) +; CHECK-NEXT: v15 = vxor(v15,v15) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v15 = vsplat(r6) +; CHECK-NEXT: v16 = vsplat(r6) ; CHECK-NEXT: r5 = #159 -; CHECK-NEXT: v5:4.h = vunpack(v4.b) -; CHECK-NEXT: v6.w = vabs(v3.w) +; CHECK-NEXT: r4 = #23 +; CHECK-NEXT: v7:6.h = vunpack(v1.b) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v17 = vsplat(r5) -; CHECK-NEXT: r4 = #23 -; CHECK-NEXT: v8.w = vabs(v2.w) +; CHECK-NEXT: v18 = vsplat(r5) +; CHECK-NEXT: v1:0.w = vunpack(v8.h) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v5:4.w = vunpack(v4.h) +; CHECK-NEXT: v7:6.w = vunpack(v6.h) +; CHECK-NEXT: v8.w = vabs(v1.w) +; CHECK-NEXT: v5.w = vabs(v0.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v9.w = vabs(v6.w) +; CHECK-NEXT: v11.w = vabs(v7.w) +; CHECK-NEXT: q0 = vcmp.gt(v15.w,v6.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v9.uw = vcl0(v6.uw) -; CHECK-NEXT: v7.w = vabs(v4.w) -; CHECK-NEXT: v11.w = vabs(v5.w) -; CHECK-NEXT: q0 = vcmp.gt(v0.w,v4.w) +; CHECK-NEXT: v12.uw = vcl0(v8.uw) +; CHECK-NEXT: v17 = vmux(q0,v16,v15) +; CHECK-NEXT: q0 = vcmp.gt(v15.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v14.uw = vcl0(v8.uw) -; CHECK-NEXT: v9.w = vadd(v9.w,v1.w) -; CHECK-NEXT: v18 = vmux(q0,v15,v0) -; CHECK-NEXT: q1 = vcmp.gt(v0.w,v5.w) +; CHECK-NEXT: v13.uw = vcl0(v9.uw) +; CHECK-NEXT: v12.w = vadd(v12.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v12.uw = vcl0(v7.uw) -; CHECK-NEXT: v14.w = vadd(v14.w,v1.w) +; CHECK-NEXT: v14.uw = vcl0(v11.uw) +; CHECK-NEXT: v13.w = vadd(v13.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v16.uw = vcl0(v11.uw) -; CHECK-NEXT: v12.w = vadd(v12.w,v1.w) +; CHECK-NEXT: v10.uw = vcl0(v5.uw) +; CHECK-NEXT: v14.w = vadd(v14.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v6.w = vasl(v6.w,v9.w) -; CHECK-NEXT: v16.w = vadd(v16.w,v1.w) +; CHECK-NEXT: v9.w = vasl(v9.w,v13.w) +; CHECK-NEXT: v10.w = vadd(v10.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v7.w = vasl(v7.w,v12.w) -; CHECK-NEXT: v19 = vand(v6,v13) +; CHECK-NEXT: v11.w = vasl(v11.w,v14.w) +; CHECK-NEXT: v20 = vand(v9,v4) +; CHECK-NEXT: v19.w = vadd(v9.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v11.w = vasl(v11.w,v16.w) -; CHECK-NEXT: v21 = vand(v7,v13) -; CHECK-NEXT: v31.w = vadd(v7.w,v10.w) -; CHECK-NEXT: q0 = vcmp.eq(v19.w,v0.w) +; CHECK-NEXT: v8.w = vasl(v8.w,v12.w) +; CHECK-NEXT: v23.w = vadd(v11.w,v3.w) +; CHECK-NEXT: q3 = vcmp.eq(v20.w,v15.w) +; CHECK-NEXT: v28 = vand(v11,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v8.w = vasl(v8.w,v14.w) -; CHECK-NEXT: v22.w = vadd(v11.w,v10.w) -; CHECK-NEXT: q3 = vcmp.eq(v21.w,v0.w) -; CHECK-NEXT: v24 = vand(v11,v13) +; CHECK-NEXT: v30 = 
vmux(q3,v15,v2) +; CHECK-NEXT: q3 = vcmp.eq(v28.w,v15.w) +; CHECK-NEXT: v22 = vand(v8,v4) +; CHECK-NEXT: q2 = vcmp.gt(v9.uw,v19.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v23.uw = vlsr(v31.uw,r2) -; CHECK-NEXT: v29 = vmux(q3,v0,v1) -; CHECK-NEXT: q3 = vcmp.eq(v24.w,v0.w) -; CHECK-NEXT: q2 = vcmp.gt(v7.uw,v31.uw) +; CHECK-NEXT: v21.uw = vlsr(v9.uw,r2) +; CHECK-NEXT: v27 = vmux(q3,v15,v2) +; CHECK-NEXT: q1 = vcmp.eq(v22.w,v15.w) +; CHECK-NEXT: v24 = vmux(q2,v2,v15) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v28.uw = vlsr(v11.uw,r2) -; CHECK-NEXT: v27 = vmux(q3,v0,v1) -; CHECK-NEXT: v19.w = vadd(v23.w,v29.w) -; CHECK-NEXT: v31 = vmux(q2,v1,v0) +; CHECK-NEXT: v9.uw = vlsr(v19.uw,r2) +; CHECK-NEXT: v26 = vmux(q1,v15,v2) +; CHECK-NEXT: v13.w = vsub(v24.w,v13.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v30.uw = vlsr(v22.uw,r2) -; CHECK-NEXT: v13 = vand(v8,v13) -; CHECK-NEXT: v26 = vmux(q0,v0,v1) -; CHECK-NEXT: v12.w = vsub(v31.w,v12.w) +; CHECK-NEXT: v31.uw = vlsr(v23.uw,r2) +; CHECK-NEXT: v22.w = vadd(v9.w,v30.w) +; CHECK-NEXT: v30.w = vadd(v8.w,v3.w) +; CHECK-NEXT: q2 = vcmp.eq(v21.w,v9.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v20.uw = vlsr(v7.uw,r2) -; CHECK-NEXT: q3 = vcmp.eq(v28.w,v30.w) -; CHECK-NEXT: v28.w = vadd(v30.w,v27.w) -; CHECK-NEXT: v31 = vmux(q1,v15,v0) +; CHECK-NEXT: v5.w = vasl(v5.w,v10.w) +; CHECK-NEXT: v28.w = vadd(v31.w,v27.w) +; CHECK-NEXT: v13.w = vadd(v13.w,v18.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v29.uw = vlsr(v30.uw,r0) -; CHECK-NEXT: v30.w = vadd(v6.w,v10.w) -; CHECK-NEXT: q2 = vcmp.eq(v20.w,v23.w) -; CHECK-NEXT: v10.w = vadd(v8.w,v10.w) +; CHECK-NEXT: v29.uw = vlsr(v11.uw,r2) +; CHECK-NEXT: v3.w = vadd(v5.w,v3.w) +; CHECK-NEXT: v4 = vand(v5,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v7.uw = vlsr(v28.uw,r0) -; CHECK-NEXT: q0 = vcmp.gt(v8.uw,v10.uw) -; CHECK-NEXT: v12.w = vadd(v12.w,v17.w) +; CHECK-NEXT: v19.uw = vlsr(v31.uw,r0) +; CHECK-NEXT: q3 = vcmp.eq(v29.w,v31.w) +; CHECK-NEXT: v31 = vmux(q0,v16,v15) +; CHECK-NEXT: q0 = vcmp.gt(v5.uw,v3.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v23.uw = vlsr(v23.uw,r0) -; CHECK-NEXT: v7 = vmux(q3,v7,v29) -; CHECK-NEXT: q3 = vcmp.eq(v13.w,v0.w) +; CHECK-NEXT: v20.uw = vlsr(v28.uw,r0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v19.uw = vlsr(v19.uw,r0) -; CHECK-NEXT: v29 = vmux(q3,v0,v1) -; CHECK-NEXT: v7 = vor(v31,v7) +; CHECK-NEXT: v9.uw = vlsr(v9.uw,r0) +; CHECK-NEXT: v19 = vmux(q3,v20,v19) +; CHECK-NEXT: q3 = vcmp.eq(v4.w,v15.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v29.uw = vlsr(v22.uw,r0) +; CHECK-NEXT: v19 = vor(v31,v19) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v25.uw = vlsr(v30.uw,r2) -; CHECK-NEXT: v19 = vmux(q2,v19,v23) -; CHECK-NEXT: q2 = vcmp.gt(v11.uw,v22.uw) +; CHECK-NEXT: v9 = vmux(q2,v29,v9) +; CHECK-NEXT: q2 = vcmp.gt(v11.uw,v23.uw) +; CHECK-NEXT: v29 = vmux(q3,v15,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v10.uw = vlsr(v10.uw,r2) -; CHECK-NEXT: v27 = vmux(q2,v1,v0) -; CHECK-NEXT: q2 = vcmp.gt(v6.uw,v30.uw) +; CHECK-NEXT: v3.uw = vlsr(v3.uw,r2) +; CHECK-NEXT: v27 = vmux(q2,v2,v15) +; CHECK-NEXT: q2 = vcmp.gt(v8.uw,v30.uw) ; CHECK-NEXT: v28.w = vadd(v25.w,v26.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v6.uw = vlsr(v6.uw,r2) -; CHECK-NEXT: v31 = vmux(q2,v1,v0) -; CHECK-NEXT: v1 = vmux(q0,v1,v0) -; CHECK-NEXT: v30.w = vadd(v10.w,v29.w) +; CHECK-NEXT: v8.uw = vlsr(v8.uw,r2) +; CHECK-NEXT: v31 = vmux(q2,v2,v15) +; CHECK-NEXT: v2 = vmux(q0,v2,v15) +; CHECK-NEXT: v30.w = vadd(v3.w,v29.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: 
v24.uw = vlsr(v8.uw,r2) -; CHECK-NEXT: v1.w = vsub(v1.w,v14.w) -; CHECK-NEXT: q3 = vcmp.eq(v6.w,v25.w) -; CHECK-NEXT: v21.w = vsub(v31.w,v9.w) +; CHECK-NEXT: v24.uw = vlsr(v5.uw,r2) +; CHECK-NEXT: v2.w = vsub(v2.w,v10.w) +; CHECK-NEXT: q3 = vcmp.eq(v8.w,v25.w) +; CHECK-NEXT: v22.w = vsub(v31.w,v12.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v8.uw = vlsr(v28.uw,r0) -; CHECK-NEXT: v6.w = vadd(v21.w,v17.w) -; CHECK-NEXT: v1.w = vadd(v1.w,v17.w) -; CHECK-NEXT: q0 = vcmp.eq(v24.w,v10.w) +; CHECK-NEXT: v5.uw = vlsr(v28.uw,r0) +; CHECK-NEXT: v4.w = vsub(v27.w,v14.w) +; CHECK-NEXT: v8.w = vadd(v22.w,v18.w) +; CHECK-NEXT: v2.w = vadd(v2.w,v18.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v22.uw = vlsr(v25.uw,r0) -; CHECK-NEXT: v13.w = vsub(v27.w,v16.w) -; CHECK-NEXT: q2 = vcmp.gt(v0.w,v3.w) -; CHECK-NEXT: v18 = vor(v18,v19) +; CHECK-NEXT: v11.uw = vlsr(v25.uw,r0) +; CHECK-NEXT: q0 = vcmp.eq(v24.w,v3.w) +; CHECK-NEXT: q2 = vcmp.gt(v15.w,v1.w) +; CHECK-NEXT: v4.w = vadd(v4.w,v18.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v23.uw = vlsr(v30.uw,r0) -; CHECK-NEXT: v8 = vmux(q3,v8,v22) -; CHECK-NEXT: q3 = vcmp.gt(v0.w,v2.w) -; CHECK-NEXT: v26 = vmux(q2,v15,v0) +; CHECK-NEXT: v5 = vmux(q3,v5,v11) +; CHECK-NEXT: q3 = vcmp.gt(v15.w,v0.w) +; CHECK-NEXT: v24 = vmux(q2,v16,v15) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v24.uw = vlsr(v10.uw,r0) -; CHECK-NEXT: v25.w = vadd(v13.w,v17.w) -; CHECK-NEXT: v27 = vmux(q3,v15,v0) -; CHECK-NEXT: v8 = vor(v26,v8) +; CHECK-NEXT: v3.uw = vlsr(v3.uw,r0) +; CHECK-NEXT: v25 = vmux(q3,v16,v15) +; CHECK-NEXT: v5 = vor(v24,v5) +; CHECK-NEXT: v9 = vor(v17,v9) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v6.w = vasl(v6.w,r4) -; CHECK-NEXT: v9 = vmux(q0,v23,v24) -; CHECK-NEXT: q2 = vcmp.eq(v3.w,v0.w) -; CHECK-NEXT: q3 = vcmp.eq(v2.w,v0.w) +; CHECK-NEXT: v8.w = vasl(v8.w,r4) +; CHECK-NEXT: v3 = vmux(q0,v23,v3) +; CHECK-NEXT: q2 = vcmp.eq(v1.w,v15.w) +; CHECK-NEXT: q3 = vcmp.eq(v0.w,v15.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1.w = vasl(v1.w,r4) -; CHECK-NEXT: v9 = vor(v27,v9) -; CHECK-NEXT: v6 = vor(v8,v6) +; CHECK-NEXT: v2.w = vasl(v2.w,r4) +; CHECK-NEXT: v3 = vor(v25,v3) +; CHECK-NEXT: v5 = vor(v5,v8) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v12.w = vasl(v12.w,r4) -; CHECK-NEXT: v1 = vor(v9,v1) -; CHECK-NEXT: v29 = vmux(q2,v0,v6) -; CHECK-NEXT: vmem(r1+#1) = v29.new +; CHECK-NEXT: v13.w = vasl(v13.w,r4) +; CHECK-NEXT: v2 = vor(v3,v2) +; CHECK-NEXT: v27 = vmux(q2,v15,v5) +; CHECK-NEXT: vmem(r1+#1) = v27.new ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v28.w = vasl(v25.w,r4) -; CHECK-NEXT: v1 = vmux(q3,v0,v1) -; CHECK-NEXT: q2 = vcmp.eq(v5.w,v0.w) -; CHECK-NEXT: vmem(r1+#0) = v1.new +; CHECK-NEXT: v26.w = vasl(v4.w,r4) +; CHECK-NEXT: v29 = vmux(q3,v15,v2) +; CHECK-NEXT: q2 = vcmp.eq(v7.w,v15.w) +; CHECK-NEXT: vmem(r1+#0) = v29.new ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v30 = vor(v7,v28) -; CHECK-NEXT: v31 = vor(v18,v12) -; CHECK-NEXT: q3 = vcmp.eq(v4.w,v0.w) +; CHECK-NEXT: v28 = vor(v19,v26) +; CHECK-NEXT: v30 = vor(v9,v13) +; CHECK-NEXT: q3 = vcmp.eq(v6.w,v15.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2 = vmux(q2,v0,v30) -; CHECK-NEXT: v0 = vmux(q3,v0,v31) -; CHECK-NEXT: vmem(r1+#3) = v2.new +; CHECK-NEXT: v0 = vmux(q2,v15,v28) +; CHECK-NEXT: v31 = vmux(q3,v15,v30) +; CHECK-NEXT: vmem(r1+#3) = v0.new ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: jumpr r31 -; CHECK-NEXT: vmem(r1+#2) = v0 +; CHECK-NEXT: vmem(r1+#2) = v31 ; CHECK-NEXT: } %v0 = load <128 x i8>, ptr %a0, align 128 %v1 = sitofp <128 x i8> %v0 to <128 x float> @@ 
-1630,189 +1633,190 @@ define void @u8f32_0(ptr %a0, ptr %a1) #0 { ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { -; CHECK-NEXT: r3:2 = combine(##.LCPI15_0,#8) -; CHECK-NEXT: v3:2.uh = vunpack(v1.ub) -; CHECK-NEXT: v1.cur = vmem(r0+#0) +; CHECK-NEXT: r7 = #64 +; CHECK-NEXT: r0 = #1 +; CHECK-NEXT: v0 = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r0 = #1 -; CHECK-NEXT: r6 = #512 -; CHECK-NEXT: r7 = #255 -; CHECK-NEXT: v3 = vmem(r3+#0) +; CHECK-NEXT: v4 = vsplat(r0) +; CHECK-NEXT: r3:2 = combine(##255,#8) +; CHECK-NEXT: v1 = valign(v0,v0,r7) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1 = vsplat(r0) -; CHECK-NEXT: v16 = vsplat(r6) -; CHECK-NEXT: v3 = vdelta(v1,v3) -; CHECK-NEXT: v0 = vxor(v0,v0) +; CHECK-NEXT: v6 = vsplat(r3) +; CHECK-NEXT: r6 = #512 +; CHECK-NEXT: v3:2.uh = vunpack(v0.ub) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v10 = vsplat(r7) +; CHECK-NEXT: v13 = vsplat(r6) ; CHECK-NEXT: r5 = #159 -; CHECK-NEXT: v5:4.uw = vunpack(v2.uh) +; CHECK-NEXT: v31:30.uh = vunpack(v1.ub) +; CHECK-NEXT: v15 = vxor(v15,v15) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v19 = vsplat(r5) +; CHECK-NEXT: v16 = vsplat(r5) ; CHECK-NEXT: r4 = #23 -; CHECK-NEXT: v31:30.uh = vunpack(v3.ub) +; CHECK-NEXT: v3:2.uw = vunpack(v2.uh) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v6.uw = vcl0(v4.uw) +; CHECK-NEXT: v1:0.uw = vunpack(v30.uh) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3:2.uw = vunpack(v30.uh) -; CHECK-NEXT: v6.w = vadd(v6.w,v1.w) +; CHECK-NEXT: v5.uw = vcl0(v2.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v7.uw = vcl0(v5.uw) +; CHECK-NEXT: v7.uw = vcl0(v0.uw) +; CHECK-NEXT: v5.w = vadd(v5.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v11.uw = vcl0(v2.uw) -; CHECK-NEXT: v7.w = vadd(v7.w,v1.w) +; CHECK-NEXT: v8.uw = vcl0(v3.uw) +; CHECK-NEXT: v7.w = vadd(v7.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v12.uw = vcl0(v3.uw) -; CHECK-NEXT: v11.w = vadd(v11.w,v1.w) +; CHECK-NEXT: v9.uw = vcl0(v1.uw) +; CHECK-NEXT: v8.w = vadd(v8.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v8.w = vasl(v4.w,v6.w) -; CHECK-NEXT: v12.w = vadd(v12.w,v1.w) +; CHECK-NEXT: v10.w = vasl(v2.w,v5.w) +; CHECK-NEXT: v9.w = vadd(v9.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v9.w = vasl(v5.w,v7.w) -; CHECK-NEXT: v20 = vand(v8,v16) -; CHECK-NEXT: v17.w = vadd(v8.w,v10.w) +; CHECK-NEXT: v12.w = vasl(v0.w,v7.w) +; CHECK-NEXT: v19 = vand(v10,v13) +; CHECK-NEXT: v18.w = vadd(v10.w,v6.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v18.w = vasl(v2.w,v11.w) -; CHECK-NEXT: v22 = vand(v9,v16) -; CHECK-NEXT: q1 = vcmp.eq(v20.w,v0.w) -; CHECK-NEXT: v13.w = vadd(v9.w,v10.w) +; CHECK-NEXT: v11.w = vasl(v3.w,v8.w) +; CHECK-NEXT: v24 = vand(v12,v13) +; CHECK-NEXT: q2 = vcmp.eq(v19.w,v15.w) +; CHECK-NEXT: v20.w = vadd(v12.w,v6.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v21.w = vasl(v3.w,v12.w) -; CHECK-NEXT: v28.w = vadd(v18.w,v10.w) -; CHECK-NEXT: q2 = vcmp.eq(v22.w,v0.w) -; CHECK-NEXT: v25 = vand(v18,v16) +; CHECK-NEXT: v14.w = vasl(v1.w,v9.w) +; CHECK-NEXT: v23 = vand(v11,v13) +; CHECK-NEXT: v22.w = vadd(v11.w,v6.w) +; CHECK-NEXT: q3 = vcmp.eq(v24.w,v15.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v29 = vmux(q1,v0,v1) -; CHECK-NEXT: v24 = vmux(q2,v0,v1) -; CHECK-NEXT: v16 = vand(v21,v16) -; CHECK-NEXT: q1 = vcmp.eq(v25.w,v0.w) +; CHECK-NEXT: v19.uw = vlsr(v18.uw,r2) +; CHECK-NEXT: v6.w = vadd(v14.w,v6.w) +; CHECK-NEXT: v13 = vand(v14,v13) +; CHECK-NEXT: v31 = vmux(q3,v15,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v26.uw = 
vlsr(v28.uw,r2) -; CHECK-NEXT: v10.w = vadd(v21.w,v10.w) -; CHECK-NEXT: q2 = vcmp.gt(v18.uw,v28.uw) -; CHECK-NEXT: q3 = vcmp.eq(v16.w,v0.w) +; CHECK-NEXT: v21.uw = vlsr(v12.uw,r2) +; CHECK-NEXT: q3 = vcmp.eq(v13.w,v15.w) +; CHECK-NEXT: v28 = vmux(q2,v15,v4) +; CHECK-NEXT: q0 = vcmp.gt(v12.uw,v20.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v20.uw = vlsr(v18.uw,r2) -; CHECK-NEXT: q0 = vcmp.gt(v9.uw,v13.uw) -; CHECK-NEXT: v18 = vmux(q2,v1,v0) -; CHECK-NEXT: v30 = vmux(q1,v0,v1) +; CHECK-NEXT: v12.uw = vlsr(v20.uw,r2) +; CHECK-NEXT: q1 = vcmp.eq(v23.w,v15.w) +; CHECK-NEXT: v26 = vmux(q3,v15,v4) +; CHECK-NEXT: v23.w = vadd(v19.w,v28.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v15.uw = vlsr(v13.uw,r2) -; CHECK-NEXT: q2 = vcmp.gt(v8.uw,v17.uw) -; CHECK-NEXT: v13.w = vadd(v26.w,v30.w) -; CHECK-NEXT: v27 = vmux(q3,v0,v1) +; CHECK-NEXT: v13.uw = vlsr(v6.uw,r2) +; CHECK-NEXT: v20.w = vadd(v12.w,v31.w) +; CHECK-NEXT: q3 = vcmp.gt(v11.uw,v22.uw) +; CHECK-NEXT: v31 = vmux(q1,v15,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v23.uw = vlsr(v17.uw,r2) -; CHECK-NEXT: v30 = vmux(q0,v1,v0) -; CHECK-NEXT: q3 = vcmp.gt(v21.uw,v10.uw) -; CHECK-NEXT: v11.w = vsub(v18.w,v11.w) +; CHECK-NEXT: v28.uw = vlsr(v22.uw,r2) +; CHECK-NEXT: v30.w = vadd(v13.w,v26.w) +; CHECK-NEXT: q1 = vcmp.gt(v10.uw,v18.uw) +; CHECK-NEXT: v29 = vmux(q0,v4,v15) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v25.uw = vlsr(v10.uw,r2) -; CHECK-NEXT: v7.w = vsub(v30.w,v7.w) -; CHECK-NEXT: v22.w = vadd(v23.w,v29.w) -; CHECK-NEXT: v29.w = vadd(v15.w,v24.w) +; CHECK-NEXT: v21.uw = vlsr(v23.uw,r0) +; CHECK-NEXT: q0 = vcmp.eq(v21.w,v12.w) +; CHECK-NEXT: v22.w = vadd(v28.w,v31.w) +; CHECK-NEXT: v23 = vmux(q3,v4,v15) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v16.uw = vlsr(v21.uw,r2) -; CHECK-NEXT: v21 = vmux(q2,v1,v0) -; CHECK-NEXT: v31.w = vadd(v25.w,v27.w) -; CHECK-NEXT: v1 = vmux(q3,v1,v0) +; CHECK-NEXT: v24.uw = vlsr(v30.uw,r0) +; CHECK-NEXT: v31 = vmux(q1,v4,v15) +; CHECK-NEXT: q3 = vcmp.gt(v14.uw,v6.uw) +; CHECK-NEXT: v30.w = vsub(v23.w,v8.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v14.uw = vlsr(v8.uw,r2) -; CHECK-NEXT: v6.w = vsub(v21.w,v6.w) -; CHECK-NEXT: v7.w = vadd(v7.w,v19.w) -; CHECK-NEXT: v1.w = vsub(v1.w,v12.w) +; CHECK-NEXT: v5.w = vsub(v31.w,v5.w) +; CHECK-NEXT: v4 = vmux(q3,v4,v15) +; CHECK-NEXT: v7.w = vsub(v29.w,v7.w) +; CHECK-NEXT: v6.w = vadd(v30.w,v16.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v9.uw = vlsr(v9.uw,r2) -; CHECK-NEXT: v6.w = vadd(v6.w,v19.w) -; CHECK-NEXT: v11.w = vadd(v11.w,v19.w) -; CHECK-NEXT: v1.w = vadd(v1.w,v19.w) +; CHECK-NEXT: v17.uw = vlsr(v10.uw,r2) +; CHECK-NEXT: v4.w = vsub(v4.w,v9.w) +; CHECK-NEXT: v5.w = vadd(v5.w,v16.w) +; CHECK-NEXT: v7.w = vadd(v7.w,v16.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v18.uw = vlsr(v31.uw,r0) -; CHECK-NEXT: q1 = vcmp.eq(v20.w,v26.w) -; CHECK-NEXT: q0 = vcmp.eq(v16.w,v25.w) -; CHECK-NEXT: q2 = vcmp.eq(v14.w,v23.w) +; CHECK-NEXT: v11.uw = vlsr(v11.uw,r2) +; CHECK-NEXT: q2 = vcmp.eq(v17.w,v19.w) +; CHECK-NEXT: v4.w = vadd(v4.w,v16.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v27.uw = vlsr(v25.uw,r0) -; CHECK-NEXT: q3 = vcmp.eq(v9.w,v15.w) +; CHECK-NEXT: v25.uw = vlsr(v14.uw,r2) +; CHECK-NEXT: q3 = vcmp.eq(v11.w,v28.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v20.uw = vlsr(v22.uw,r0) +; CHECK-NEXT: v27.uw = vlsr(v12.uw,r0) +; CHECK-NEXT: q1 = vcmp.eq(v25.w,v13.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v31.uw = vlsr(v23.uw,r0) +; CHECK-NEXT: v29.uw = vlsr(v19.uw,r0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: 
v8.uw = vlsr(v29.uw,r0) +; CHECK-NEXT: v12.uw = vlsr(v22.uw,r0) +; CHECK-NEXT: v23 = vmux(q2,v21,v29) +; CHECK-NEXT: q2 = vcmp.eq(v3.w,v15.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v24.uw = vlsr(v15.uw,r0) +; CHECK-NEXT: v14.uw = vlsr(v28.uw,r0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v28.uw = vlsr(v26.uw,r0) -; CHECK-NEXT: v26 = vmux(q0,v18,v27) -; CHECK-NEXT: v8 = vmux(q3,v8,v24) -; CHECK-NEXT: v27 = vmux(q2,v20,v31) +; CHECK-NEXT: v20.uw = vlsr(v20.uw,r0) +; CHECK-NEXT: v8 = vmux(q3,v12,v14) +; CHECK-NEXT: q3 = vcmp.eq(v2.w,v15.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v7.w = vasl(v7.w,r4) -; CHECK-NEXT: q2 = vcmp.eq(v5.w,v0.w) -; CHECK-NEXT: q3 = vcmp.eq(v4.w,v0.w) +; CHECK-NEXT: v6.w = vasl(v6.w,r4) +; CHECK-NEXT: v20 = vmux(q0,v20,v27) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v13.uw = vlsr(v13.uw,r0) -; CHECK-NEXT: v7 = vor(v8,v7) +; CHECK-NEXT: v26.uw = vlsr(v13.uw,r0) +; CHECK-NEXT: v6 = vor(v8,v6) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v6.w = vasl(v6.w,r4) -; CHECK-NEXT: v25 = vmux(q1,v13,v28) -; CHECK-NEXT: v29 = vmux(q2,v0,v7) -; CHECK-NEXT: vmem(r1+#1) = v29.new +; CHECK-NEXT: v5.w = vasl(v5.w,r4) +; CHECK-NEXT: v22 = vmux(q1,v24,v26) +; CHECK-NEXT: v26 = vmux(q2,v15,v6) +; CHECK-NEXT: vmem(r1+#1) = v26.new ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1.w = vasl(v1.w,r4) -; CHECK-NEXT: v28 = vor(v27,v6) -; CHECK-NEXT: q2 = vcmp.eq(v3.w,v0.w) +; CHECK-NEXT: v7.w = vasl(v7.w,r4) +; CHECK-NEXT: v25 = vor(v23,v5) +; CHECK-NEXT: q2 = vcmp.eq(v1.w,v15.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v24.w = vasl(v4.w,r4) +; CHECK-NEXT: v28 = vmux(q3,v15,v25) +; CHECK-NEXT: v29 = vor(v20,v7) +; CHECK-NEXT: vmem(r1+#0) = v28.new ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v11.w = vasl(v11.w,r4) -; CHECK-NEXT: v1 = vor(v26,v1) -; CHECK-NEXT: v30 = vmux(q3,v0,v28) -; CHECK-NEXT: vmem(r1+#0) = v30.new +; CHECK-NEXT: v27 = vor(v22,v24) +; CHECK-NEXT: q3 = vcmp.eq(v0.w,v15.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v31 = vor(v25,v11) -; CHECK-NEXT: q3 = vcmp.eq(v2.w,v0.w) -; CHECK-NEXT: v1 = vmux(q2,v0,v1) -; CHECK-NEXT: vmem(r1+#3) = v1.new +; CHECK-NEXT: v30 = vmux(q2,v15,v27) +; CHECK-NEXT: v31 = vmux(q3,v15,v29) +; CHECK-NEXT: vmem(r1+#3) = v30.new ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v0 = vmux(q3,v0,v31) ; CHECK-NEXT: jumpr r31 -; CHECK-NEXT: vmem(r1+#2) = v0.new +; CHECK-NEXT: vmem(r1+#2) = v31 ; CHECK-NEXT: } %v0 = load <128 x i8>, ptr %a0, align 128 %v1 = uitofp <128 x i8> %v0 to <128 x float> diff --git a/llvm/test/CodeGen/Hexagon/autohvx/mulh.ll b/llvm/test/CodeGen/Hexagon/autohvx/mulh.ll index 5009adf8..8cf7cb2 100644 --- a/llvm/test/CodeGen/Hexagon/autohvx/mulh.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/mulh.ll @@ -10,28 +10,7 @@ define <64 x i16> @mulhs16(<64 x i16> %a0, <64 x i16> %a1) #0 { ; V60-NEXT: v1:0.w = vmpy(v1.h,v0.h) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: r7:6 = combine(#64,#68) -; V60-NEXT: } -; V60-NEXT: { -; V60-NEXT: r5 = #120 -; V60-NEXT: } -; V60-NEXT: { -; V60-NEXT: v1:0 = vshuff(v1,v0,r7) -; V60-NEXT: } -; V60-NEXT: { -; V60-NEXT: v3:2 = vdeal(v0,v0,r6) -; V60-NEXT: } -; V60-NEXT: { -; V60-NEXT: v31:30 = vdeal(v0,v1,r6) -; V60-NEXT: } -; V60-NEXT: { -; V60-NEXT: v3:2 = vshuff(v3,v2,r5) -; V60-NEXT: } -; V60-NEXT: { -; V60-NEXT: v1:0 = vshuff(v31,v30,r5) -; V60-NEXT: } -; V60-NEXT: { -; V60-NEXT: v0.h = vpacko(v0.w,v2.w) +; V60-NEXT: v0.h = vshuffo(v1.h,v0.h) ; V60-NEXT: } ; V60-NEXT: { ; V60-NEXT: jumpr r31 @@ -43,28 +22,7 @@ define <64 x i16> @mulhs16(<64 x i16> %a0, <64 x i16> %a1) #0 { ; 
V65-NEXT: v1:0.w = vmpy(v1.h,v0.h) ; V65-NEXT: } ; V65-NEXT: { -; V65-NEXT: r7:6 = combine(#64,#68) -; V65-NEXT: } -; V65-NEXT: { -; V65-NEXT: r5 = #120 -; V65-NEXT: } -; V65-NEXT: { -; V65-NEXT: v1:0 = vshuff(v1,v0,r7) -; V65-NEXT: } -; V65-NEXT: { -; V65-NEXT: v3:2 = vdeal(v0,v0,r6) -; V65-NEXT: } -; V65-NEXT: { -; V65-NEXT: v31:30 = vdeal(v0,v1,r6) -; V65-NEXT: } -; V65-NEXT: { -; V65-NEXT: v3:2 = vshuff(v3,v2,r5) -; V65-NEXT: } -; V65-NEXT: { -; V65-NEXT: v1:0 = vshuff(v31,v30,r5) -; V65-NEXT: } -; V65-NEXT: { -; V65-NEXT: v0.h = vpacko(v0.w,v2.w) +; V65-NEXT: v0.h = vshuffo(v1.h,v0.h) ; V65-NEXT: } ; V65-NEXT: { ; V65-NEXT: jumpr r31 @@ -76,28 +34,7 @@ define <64 x i16> @mulhs16(<64 x i16> %a0, <64 x i16> %a1) #0 { ; V69-NEXT: v1:0.w = vmpy(v1.h,v0.h) ; V69-NEXT: } ; V69-NEXT: { -; V69-NEXT: r7:6 = combine(#64,#68) -; V69-NEXT: } -; V69-NEXT: { -; V69-NEXT: r5 = #120 -; V69-NEXT: } -; V69-NEXT: { -; V69-NEXT: v1:0 = vshuff(v1,v0,r7) -; V69-NEXT: } -; V69-NEXT: { -; V69-NEXT: v3:2 = vdeal(v0,v0,r6) -; V69-NEXT: } -; V69-NEXT: { -; V69-NEXT: v31:30 = vdeal(v0,v1,r6) -; V69-NEXT: } -; V69-NEXT: { -; V69-NEXT: v3:2 = vshuff(v3,v2,r5) -; V69-NEXT: } -; V69-NEXT: { -; V69-NEXT: v1:0 = vshuff(v31,v30,r5) -; V69-NEXT: } -; V69-NEXT: { -; V69-NEXT: v0.h = vpacko(v0.w,v2.w) +; V69-NEXT: v0.h = vshuffo(v1.h,v0.h) ; V69-NEXT: } ; V69-NEXT: { ; V69-NEXT: jumpr r31 @@ -117,28 +54,7 @@ define <64 x i16> @mulhu16(<64 x i16> %a0, <64 x i16> %a1) #0 { ; V60-NEXT: v1:0.uw = vmpy(v1.uh,v0.uh) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: r7:6 = combine(#64,#68) -; V60-NEXT: } -; V60-NEXT: { -; V60-NEXT: r5 = #120 -; V60-NEXT: } -; V60-NEXT: { -; V60-NEXT: v1:0 = vshuff(v1,v0,r7) -; V60-NEXT: } -; V60-NEXT: { -; V60-NEXT: v3:2 = vdeal(v0,v0,r6) -; V60-NEXT: } -; V60-NEXT: { -; V60-NEXT: v31:30 = vdeal(v0,v1,r6) -; V60-NEXT: } -; V60-NEXT: { -; V60-NEXT: v3:2 = vshuff(v3,v2,r5) -; V60-NEXT: } -; V60-NEXT: { -; V60-NEXT: v1:0 = vshuff(v31,v30,r5) -; V60-NEXT: } -; V60-NEXT: { -; V60-NEXT: v0.h = vpacko(v0.w,v2.w) +; V60-NEXT: v0.h = vshuffo(v1.h,v0.h) ; V60-NEXT: } ; V60-NEXT: { ; V60-NEXT: jumpr r31 @@ -150,28 +66,7 @@ define <64 x i16> @mulhu16(<64 x i16> %a0, <64 x i16> %a1) #0 { ; V65-NEXT: v1:0.uw = vmpy(v1.uh,v0.uh) ; V65-NEXT: } ; V65-NEXT: { -; V65-NEXT: r7:6 = combine(#64,#68) -; V65-NEXT: } -; V65-NEXT: { -; V65-NEXT: r5 = #120 -; V65-NEXT: } -; V65-NEXT: { -; V65-NEXT: v1:0 = vshuff(v1,v0,r7) -; V65-NEXT: } -; V65-NEXT: { -; V65-NEXT: v3:2 = vdeal(v0,v0,r6) -; V65-NEXT: } -; V65-NEXT: { -; V65-NEXT: v31:30 = vdeal(v0,v1,r6) -; V65-NEXT: } -; V65-NEXT: { -; V65-NEXT: v3:2 = vshuff(v3,v2,r5) -; V65-NEXT: } -; V65-NEXT: { -; V65-NEXT: v1:0 = vshuff(v31,v30,r5) -; V65-NEXT: } -; V65-NEXT: { -; V65-NEXT: v0.h = vpacko(v0.w,v2.w) +; V65-NEXT: v0.h = vshuffo(v1.h,v0.h) ; V65-NEXT: } ; V65-NEXT: { ; V65-NEXT: jumpr r31 diff --git a/llvm/test/CodeGen/Hexagon/autohvx/qmul.ll b/llvm/test/CodeGen/Hexagon/autohvx/qmul.ll index 13634ed4..866bb28 100644 --- a/llvm/test/CodeGen/Hexagon/autohvx/qmul.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/qmul.ll @@ -75,10 +75,7 @@ define void @f2(ptr %a0, ptr %a1, ptr %a2) #0 { ; CHECK-NEXT: v0 = vmem(r1+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r7 = #64 -; CHECK-NEXT: } -; CHECK-NEXT: { -; CHECK-NEXT: r5:4 = combine(#68,#120) +; CHECK-NEXT: r7 = #124 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r3 = #15 @@ -93,25 +90,13 @@ define void @f2(ptr %a0, ptr %a1, ptr %a2) #0 { ; CHECK-NEXT: v1:0 = vshuff(v1,v0,r7) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3:2 = 
vdeal(v0,v0,r5) -; CHECK-NEXT: } -; CHECK-NEXT: { -; CHECK-NEXT: v31:30 = vdeal(v0,v1,r5) -; CHECK-NEXT: } -; CHECK-NEXT: { -; CHECK-NEXT: v3:2 = vshuff(v3,v2,r4) -; CHECK-NEXT: } -; CHECK-NEXT: { -; CHECK-NEXT: v1:0 = vshuff(v31,v30,r4) -; CHECK-NEXT: } -; CHECK-NEXT: { -; CHECK-NEXT: v1.uw = vlsr(v2.uw,r3) +; CHECK-NEXT: v0.uw = vlsr(v0.uw,r3) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v0.uw = vlsr(v0.uw,r3) +; CHECK-NEXT: v1.uw = vlsr(v1.uw,r3) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v0.h = vpacke(v0.w,v1.w) +; CHECK-NEXT: v0.h = vpacke(v1.w,v0.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: vmem(r2+#0) = v0 diff --git a/llvm/test/CodeGen/Hexagon/autohvx/shuffle-half-128b.ll b/llvm/test/CodeGen/Hexagon/autohvx/shuffle-half-128b.ll index 41e195d..b10ca68 100644 --- a/llvm/test/CodeGen/Hexagon/autohvx/shuffle-half-128b.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/shuffle-half-128b.ll @@ -60,10 +60,10 @@ define <128 x i8> @test_04(<128 x i8> %a0, <128 x i8> %a1) #0 { ; CHECK-LABEL: test_04: ; CHECK: // %bb.0: ; CHECK-NEXT: { -; CHECK-NEXT: r0 = #64 +; CHECK-NEXT: r7 = #64 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v0 = vror(v0,r0) +; CHECK-NEXT: v0 = valign(v0,v0,r7) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: } %v0 = shufflevector <128 x i8> %a0, <128 x i8> %a1, <128 x i32> @@ -209,10 +209,10 @@ define <128 x i8> @test_14(<128 x i8> %a0, <128 x i8> %a1) #0 { ; CHECK-LABEL: test_14: ; CHECK: // %bb.0: ; CHECK-NEXT: { -; CHECK-NEXT: r0 = #64 +; CHECK-NEXT: r7 = #64 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v0 = vror(v1,r0) +; CHECK-NEXT: v0 = valign(v1,v1,r7) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: } %v0 = shufflevector <128 x i8> %a0, <128 x i8> %a1, <128 x i32> diff --git a/llvm/test/CodeGen/Hexagon/autohvx/shuffle-half-64b.ll b/llvm/test/CodeGen/Hexagon/autohvx/shuffle-half-64b.ll index 4d87673..18a9c02 100644 --- a/llvm/test/CodeGen/Hexagon/autohvx/shuffle-half-64b.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/shuffle-half-64b.ll @@ -60,10 +60,10 @@ define <64 x i8> @test_04(<64 x i8> %a0, <64 x i8> %a1) #0 { ; CHECK-LABEL: test_04: ; CHECK: // %bb.0: ; CHECK-NEXT: { -; CHECK-NEXT: r0 = #32 +; CHECK-NEXT: r7 = #32 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v0 = vror(v0,r0) +; CHECK-NEXT: v0 = valign(v0,v0,r7) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: } %v0 = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> @@ -209,10 +209,10 @@ define <64 x i8> @test_14(<64 x i8> %a0, <64 x i8> %a1) #0 { ; CHECK-LABEL: test_14: ; CHECK: // %bb.0: ; CHECK-NEXT: { -; CHECK-NEXT: r0 = #32 +; CHECK-NEXT: r7 = #32 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v0 = vror(v1,r0) +; CHECK-NEXT: v0 = valign(v1,v1,r7) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: } %v0 = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> -- 2.7.4
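
Editorial note, for reference: below is a minimal LLVM IR sketch of the shuffle
shape exercised by the mulh.ll checks above, where a multi-packet
vshuff/vdeal/vpacko sequence is replaced by a single vshuffo. It is illustrative
only; the function name, the RUN and CHECK lines, and the attribute string are
assumptions made for this note, not part of the patch or its tests.

; RUN: llc -march=hexagon < %s | FileCheck %s
; Take the high halfword of each 32-bit product. The shuffle's operands
; form a 128B HVX vector pair while its result is a single vector, so the
; operand and result types disagree (legal in IR, but not in the DAG).
define <64 x i16> @high_halfwords(<64 x i32> %prod) #0 {
  ; View the pair of widened products as 128 halfwords.
  %w = bitcast <64 x i32> %prod to <128 x i16>
  ; Odd lanes hold the high halfwords (Hexagon is little-endian); with
  ; this patch such a contracting shuffle should match one instruction.
  ; CHECK: vshuffo
  %hi = shufflevector <128 x i16> %w, <128 x i16> undef, <64 x i32>
      <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15,
       i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31,
       i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47,
       i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63,
       i32 65, i32 67, i32 69, i32 71, i32 73, i32 75, i32 77, i32 79,
       i32 81, i32 83, i32 85, i32 87, i32 89, i32 91, i32 93, i32 95,
       i32 97, i32 99, i32 101, i32 103, i32 105, i32 107, i32 109, i32 111,
       i32 113, i32 115, i32 117, i32 119, i32 121, i32 123, i32 125, i32 127>
  ret <64 x i16> %hi
}

attributes #0 = { nounwind "target-features"="+hvx,+hvx-length128b" }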