}
static bool isZeroVector(SDValue N) {
+ if (N->getOpcode() == ISD::BITCAST)
+ N = N->getOperand(0);
if (N->getOpcode() == ISD::SPLAT_VECTOR)
if (auto *Op = dyn_cast<ConstantSDNode>(N->getOperand(0)))
return Op->getZExtValue() == 0;
return ISD::isBuildVectorAllZeros(N.getNode());
}
+// Return the index of the first zero vector in Ops, or UINT32_MAX if
+// there is none.
+static uint32_t findZeroVectorIdx(SDValue *Ops, unsigned Num) {
+ for (unsigned I = 0; I < Num; I++)
+ if (isZeroVector(Ops[I]))
+ return I;
+ return UINT32_MAX;
+}
+
// Bytes is a VPERM-like permute vector, except that -1 is used for
// undefined bytes. Implement it on operands Ops[0] and Ops[1] using
// VSLDB or VPERM.
// Fall back on VPERM. Construct an SDNode for the permute vector. Try to
// eliminate a zero vector by reusing any zero index in the permute vector.
- unsigned ZeroVecIdx =
- isZeroVector(Ops[0]) ? 0 : (isZeroVector(Ops[1]) ? 1 : UINT_MAX);
- if (ZeroVecIdx != UINT_MAX) {
+ unsigned ZeroVecIdx = findZeroVectorIdx(&Ops[0], 2);
+ if (ZeroVecIdx != UINT32_MAX) {
bool MaskFirst = true;
int ZeroIdx = -1;
for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) {
namespace {
// Describes a general N-operand vector shuffle.
struct GeneralShuffle {
- GeneralShuffle(EVT vt) : VT(vt) {}
+ GeneralShuffle(EVT vt) : VT(vt), UnpackFromEltSize(UINT_MAX) {}
void addUndef();
bool add(SDValue, unsigned);
SDValue getNode(SelectionDAG &, const SDLoc &);
+ void tryPrepareForUnpack();
+ bool unpackWasPrepared() { return UnpackFromEltSize <= 4; }
+ SDValue insertUnpackIfPrepared(SelectionDAG &DAG, const SDLoc &DL,
+ SDValue Op);
// The operands of the shuffle.
SmallVector<SDValue, SystemZ::VectorBytes> Ops;
// The type of the shuffle result.
EVT VT;
+
+ // Holds a value of 1, 2 or 4 if a final unpack has been prepared, or
+ // UINT_MAX otherwise.
+ unsigned UnpackFromEltSize;
};
}
if (Ops.size() == 0)
return DAG.getUNDEF(VT);
+ // Use a single unpack as the last operation, if possible.
+ tryPrepareForUnpack();
+
// Make sure that there are at least two shuffle operands.
if (Ops.size() == 1)
Ops.push_back(DAG.getUNDEF(MVT::v16i8));
// to VPERM.
unsigned OpNo0, OpNo1;
SDValue Op;
- if (const Permute *P = matchPermute(Bytes, OpNo0, OpNo1))
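+ // The zero vector was removed by tryPrepareForUnpack and only one source
+ // operand remains, so the final unpack alone performs the whole shuffle.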
+ if (unpackWasPrepared() && Ops[1].isUndef())
+ Op = Ops[0];
+ else if (const Permute *P = matchPermute(Bytes, OpNo0, OpNo1))
Op = getPermuteNode(DAG, DL, *P, Ops[OpNo0], Ops[OpNo1]);
else
Op = getGeneralPermuteNode(DAG, DL, &Ops[0], Bytes);
+
+ Op = insertUnpackIfPrepared(DAG, DL, Op);
+
return DAG.getNode(ISD::BITCAST, DL, VT, Op);
}
+#ifndef NDEBUG
+static void dumpBytes(const SmallVectorImpl<int> &Bytes, StringRef Msg) {
+ dbgs() << Msg << " { ";
+ for (unsigned i = 0; i < Bytes.size(); i++)
+ dbgs() << Bytes[i] << " ";
+ dbgs() << "}\n";
+}
+#endif
+
+// If the Bytes vector matches an unpack operation, prepare to do the unpack
+// as the final step by removing the zero vector and undoing the effect of
+// the unpack on Bytes.
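+// E.g. with UnpackFromEltSize == 1, a Bytes vector of the form
+// <Z b0 Z b1 ... Z b7> (Z = any byte of the zero vector) is the result of
+// a byte-level unpack-high of <b0 b1 ... b7 ...>.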
+void GeneralShuffle::tryPrepareForUnpack() {
+ uint32_t ZeroVecOpNo = findZeroVectorIdx(&Ops[0], Ops.size());
+ if (ZeroVecOpNo == UINT32_MAX || Ops.size() == 1)
+ return;
+
+ // Only do this if removing the zero vector reduces the depth of the VPERM
+ // tree; otherwise the final unpack would only lengthen the critical path.
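+ // (E.g. going from four operands to three still needs two levels of
+ // VPERMs, so the extra unpack would just add a cycle.)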
+ if (Ops.size() > 2 &&
+ Log2_32_Ceil(Ops.size()) == Log2_32_Ceil(Ops.size() - 1))
+ return;
+
+ // Find an unpack that would allow removing the zero vector from Ops.
+ UnpackFromEltSize = 1;
+ for (; UnpackFromEltSize <= 4; UnpackFromEltSize *= 2) {
+ bool MatchUnpack = true;
+ SmallVector<int, SystemZ::VectorBytes> SrcBytes;
+ for (unsigned Elt = 0; Elt < SystemZ::VectorBytes; Elt++) {
+ unsigned ToEltSize = UnpackFromEltSize * 2;
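+ // On big-endian SystemZ the bytes zeroed by the unpack are the first
+ // (most significant) bytes of each widened element.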
+ bool IsZextByte = (Elt % ToEltSize) < UnpackFromEltSize;
+ if (!IsZextByte)
+ SrcBytes.push_back(Bytes[Elt]);
+ if (Bytes[Elt] != -1) {
+ unsigned OpNo = unsigned(Bytes[Elt]) / SystemZ::VectorBytes;
+ if (IsZextByte != (OpNo == ZeroVecOpNo)) {
+ MatchUnpack = false;
+ break;
+ }
+ }
+ }
+ if (MatchUnpack) {
+ if (Ops.size() == 2) {
+ // Don't use unpack if a single source operand needs rearrangement.
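+ // (A single VPERM can then both rearrange the bytes and supply the
+ // zeros by reusing a zero index, so the unpack would not help.)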
+ for (unsigned i = 0; i < SystemZ::VectorBytes / 2; i++)
+ if (SrcBytes[i] != -1 && SrcBytes[i] % 16 != int(i)) {
+ UnpackFromEltSize = UINT_MAX;
+ return;
+ }
+ }
+ break;
+ }
+ }
+ if (UnpackFromEltSize > 4)
+ return;
+
+ LLVM_DEBUG(dbgs() << "Preparing for final unpack of element size "
+ << UnpackFromEltSize << ". Zero vector is Op#" << ZeroVecOpNo
+ << ".\n";
+ dumpBytes(Bytes, "Original Bytes vector:"););
+
+ // Apply the unpack in reverse to the Bytes array.
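+ // E.g. with UnpackFromEltSize == 4, <Z Z Z Z b4 b5 b6 b7 Z Z Z Z b12 b13
+ // b14 b15> becomes <b4 b5 b6 b7 b12 b13 b14 b15 u u u u u u u u>
+ // (u = undef).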
+ unsigned B = 0;
+ for (unsigned Elt = 0; Elt < SystemZ::VectorBytes;) {
+ Elt += UnpackFromEltSize;
+ for (unsigned i = 0; i < UnpackFromEltSize; i++, Elt++, B++)
+ Bytes[B] = Bytes[Elt];
+ }
+ while (B < SystemZ::VectorBytes)
+ Bytes[B++] = -1;
+
+ // Remove the zero vector from Ops and adjust the remaining byte indices.
+ Ops.erase(&Ops[ZeroVecOpNo]);
+ for (unsigned I = 0; I < SystemZ::VectorBytes; ++I)
+ if (Bytes[I] >= 0) {
+ unsigned OpNo = unsigned(Bytes[I]) / SystemZ::VectorBytes;
+ if (OpNo > ZeroVecOpNo)
+ Bytes[I] -= SystemZ::VectorBytes;
+ }
+
+ LLVM_DEBUG(dumpBytes(Bytes, "Resulting Bytes vector, zero vector removed:");
+ dbgs() << "\n";);
+}
+
+SDValue GeneralShuffle::insertUnpackIfPrepared(SelectionDAG &DAG,
+ const SDLoc &DL,
+ SDValue Op) {
+ if (!unpackWasPrepared())
+ return Op;
+ unsigned InBits = UnpackFromEltSize * 8;
+ EVT InVT = MVT::getVectorVT(MVT::getIntegerVT(InBits),
+ SystemZ::VectorBits / InBits);
+ SDValue PackedOp = DAG.getNode(ISD::BITCAST, DL, InVT, Op);
+ unsigned OutBits = InBits * 2;
+ EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(OutBits),
+ SystemZ::VectorBits / OutBits);
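+ // UNPACKL_HIGH zero-extends each element in the high half of PackedOp
+ // into an OutVT element.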
+ return DAG.getNode(SystemZISD::UNPACKL_HIGH, DL, OutVT, PackedOp);
+}
+
// Return true if the given BUILD_VECTOR is a scalar-to-vector conversion.
static bool isScalarToVector(SDValue Op) {
for (unsigned I = 1, E = Op.getNumOperands(); I != E; ++I)
return DAG.getNode(ISD::BITCAST, DL, VT, Res);
}
-SDValue
-SystemZTargetLowering::lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
- unsigned UnpackHigh) const {
+SDValue SystemZTargetLowering::
+lowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const {
SDValue PackedOp = Op.getOperand(0);
EVT OutVT = Op.getValueType();
EVT InVT = PackedOp.getValueType();
FromBits *= 2;
EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(FromBits),
SystemZ::VectorBits / FromBits);
- PackedOp = DAG.getNode(UnpackHigh, SDLoc(PackedOp), OutVT, PackedOp);
+ PackedOp =
+ DAG.getNode(SystemZISD::UNPACK_HIGH, SDLoc(PackedOp), OutVT, PackedOp);
} while (FromBits != ToBits);
return PackedOp;
}
+// Lower a ZERO_EXTEND_VECTOR_INREG to a vector shuffle with a zero vector.
+SDValue SystemZTargetLowering::
+lowerZERO_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const {
+ SDValue PackedOp = Op.getOperand(0);
+ SDLoc DL(Op);
+ EVT OutVT = Op.getValueType();
+ EVT InVT = PackedOp.getValueType();
+ unsigned InNumElts = InVT.getVectorNumElements();
+ unsigned OutNumElts = OutVT.getVectorNumElements();
+ unsigned NumInPerOut = InNumElts / OutNumElts;
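+ // Each output element consists of NumInPerOut input elements; all but the
+ // last of these (the most significant, given big-endian order) are taken
+ // from the zero vector. E.g. for v16i8 -> v4i32 the mask built below is
+ // <Z Z Z 0 Z Z Z 1 Z Z Z 2 Z Z Z 3>, with Z selecting a zero element.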
+
+ SDValue ZeroVec =
+ DAG.getSplatVector(InVT, DL, DAG.getConstant(0, DL, InVT.getScalarType()));
+
+ SmallVector<int, 16> Mask(InNumElts);
+ unsigned ZeroVecElt = InNumElts;
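+ // Mask indices >= InNumElts select from the second shuffle operand
+ // (ZeroVec).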
+ for (unsigned PackedElt = 0; PackedElt < OutNumElts; PackedElt++) {
+ unsigned MaskElt = PackedElt * NumInPerOut;
+ unsigned End = MaskElt + NumInPerOut - 1;
+ for (; MaskElt < End; MaskElt++)
+ Mask[MaskElt] = ZeroVecElt++;
+ Mask[MaskElt] = PackedElt;
+ }
+ SDValue Shuf = DAG.getVectorShuffle(InVT, DL, PackedOp, ZeroVec, Mask);
+ return DAG.getNode(ISD::BITCAST, DL, OutVT, Shuf);
+}
+
SDValue SystemZTargetLowering::lowerShift(SDValue Op, SelectionDAG &DAG,
unsigned ByScalar) const {
// Look for cases where a vector shift can use the *_BY_SCALAR form.
case ISD::EXTRACT_VECTOR_ELT:
return lowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::SIGN_EXTEND_VECTOR_INREG:
- return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACK_HIGH);
+ return lowerSIGN_EXTEND_VECTOR_INREG(Op, DAG);
case ISD::ZERO_EXTEND_VECTOR_INREG:
- return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACKL_HIGH);
+ return lowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
case ISD::SHL:
return lowerShift(Op, DAG, SystemZISD::VSHL_BY_SCALAR);
case ISD::SRL:
SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
- SDValue lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
- unsigned UnpackHigh) const;
+ SDValue lowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerZERO_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const;
bool canTreatAsByteVector(EVT VT) const;
; Test a v4i8->v4i32 extension.
define <4 x i32> @f5(<4 x i8> *%ptr) {
; CHECK-LABEL: f5:
+; CHECK: larl %r1, .LCPI4_0
; CHECK: vlrepf [[REG1:%v[0-9]+]], 0(%r2)
-; CHECK: vuplhb [[REG2:%v[0-9]+]], [[REG1]]
-; CHECK: vuplhh %v24, [[REG2]]
+; CHECK: vl %v1, 0(%r1), 3
+; CHECK: vperm %v24, %v1, [[REG1]], %v1
; CHECK: br %r14
%val = load <4 x i8>, <4 x i8> *%ptr
%ret = zext <4 x i8> %val to <4 x i32>
; Test a v2i8->v2i64 extension.
define <2 x i64> @f8(<2 x i8> *%ptr) {
; CHECK-LABEL: f8:
-; CHECK: vlreph [[REG1:%v[0-9]+]], 0(%r2)
-; CHECK: vuplhb [[REG2:%v[0-9]+]], [[REG1]]
-; CHECK: vuplhh [[REG3:%v[0-9]+]], [[REG2]]
-; CHECK: vuplhf %v24, [[REG3]]
+; CHECK: larl %r1, .LCPI7_0
+; CHECK: vlreph [[REG1:%v[0-9]+]], 0(%r2)
+; CHECK: vl %v1, 0(%r1), 3
+; CHECK: vperm %v24, %v1, [[REG1]], %v1
; CHECK: br %r14
%val = load <2 x i8>, <2 x i8> *%ptr
%ret = zext <2 x i8> %val to <2 x i64>
; Test a v2i16->v2i64 extension.
define <2 x i64> @f9(<2 x i16> *%ptr) {
; CHECK-LABEL: f9:
-; CHECK: vlrepf [[REG1:%v[0-9]+]], 0(%r2)
-; CHECK: vuplhh [[REG2:%v[0-9]+]], [[REG1]]
-; CHECK: vuplhf %v24, [[REG2]]
+; CHECK: larl %r1, .LCPI8_0
+; CHECK: vlrepf [[REG1:%v[0-9]+]], 0(%r2)
+; CHECK: vl %v1, 0(%r1), 3
+; CHECK: vperm %v24, %v1, [[REG1]], %v1
; CHECK: br %r14
%val = load <2 x i16>, <2 x i16> *%ptr
%ret = zext <2 x i16> %val to <2 x i64>
define void @fun4(<2 x i8> %Src, <2 x double>* %Dst) {
; CHECK-LABEL: fun4:
-; CHECK: vuplhb %v0, %v24
-; CHECK-NEXT: vuplhh %v0, %v0
-; CHECK-NEXT: vuplhf %v0, %v0
+; CHECK: larl %r1, .LCPI4_0
+; CHECK-NEXT: vl %v0, 0(%r1), 3
+; CHECK-NEXT: vperm %v0, %v0, %v24, %v0
; CHECK-NEXT: vcdlgb %v0, %v0, 0, 0
; CHECK-NEXT: vst %v0, 0(%r2), 3
; CHECK-NEXT: br %r14
define void @fun5(<2 x i16> %Src, <2 x double>* %Dst) {
; CHECK-LABEL: fun5:
-; CHECK: vuplhh %v0, %v24
-; CHECK-NEXT: vuplhf %v0, %v0
+; CHECK: larl %r1, .LCPI5_0
+; CHECK-NEXT: vl %v0, 0(%r1), 3
+; CHECK-NEXT: vperm %v0, %v0, %v24, %v0
; CHECK-NEXT: vcdlgb %v0, %v0, 0, 0
; CHECK-NEXT: vst %v0, 0(%r2), 3
; CHECK-NEXT: br %r14
--- /dev/null
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+;
+; Test that vperm is not used if a single unpack is enough.
+
+define <4 x i32> @fun0(<4 x i32>* %Src) nounwind {
+; CHECK-LABEL: fun0:
+; CHECK-NOT: vperm
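+; (This pattern should become a single fullword unpack, e.g. vuplhf.)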
+ %tmp = load <4 x i32>, <4 x i32>* %Src
+ %tmp2 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %tmp, <4 x i32> <i32 0, i32 4, i32 2, i32 5>
+ ret <4 x i32> %tmp2
+}
+
+define void @fun1(i8 %Src, <32 x i8>* %Dst) nounwind {
+; CHECK-LABEL: fun1:
+; CHECK-NOT: vperm
+ %I0 = insertelement <16 x i8> undef, i8 %Src, i32 0
+ %I1 = insertelement <16 x i8> %I0, i8 %Src, i32 1
+ %I2 = insertelement <16 x i8> %I1, i8 %Src, i32 2
+ %I3 = insertelement <16 x i8> %I2, i8 %Src, i32 3
+ %I4 = insertelement <16 x i8> %I3, i8 %Src, i32 4
+ %I5 = insertelement <16 x i8> %I4, i8 %Src, i32 5
+ %I6 = insertelement <16 x i8> %I5, i8 %Src, i32 6
+ %I7 = insertelement <16 x i8> %I6, i8 %Src, i32 7
+ %I8 = insertelement <16 x i8> %I7, i8 %Src, i32 8
+ %I9 = insertelement <16 x i8> %I8, i8 %Src, i32 9
+ %I10 = insertelement <16 x i8> %I9, i8 %Src, i32 10
+ %I11 = insertelement <16 x i8> %I10, i8 %Src, i32 11
+ %I12 = insertelement <16 x i8> %I11, i8 %Src, i32 12
+ %I13 = insertelement <16 x i8> %I12, i8 %Src, i32 13
+ %I14 = insertelement <16 x i8> %I13, i8 %Src, i32 14
+ %I15 = insertelement <16 x i8> %I14, i8 %Src, i32 15
+
+ %tmp = shufflevector <16 x i8> zeroinitializer,
+ <16 x i8> %I15,
+ <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+ i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
+ i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23,
+ i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %tmp9 = shufflevector <32 x i8> undef,
+ <32 x i8> %tmp,
+ <32 x i32> <i32 33, i32 32, i32 48, i32 49, i32 1, i32 17, i32 50, i32 51,
+ i32 2, i32 18, i32 52, i32 53, i32 3, i32 19, i32 54, i32 55,
+ i32 4, i32 20, i32 56, i32 57, i32 5, i32 21, i32 58, i32 59,
+ i32 6, i32 22, i32 60, i32 61, i32 7, i32 62, i32 55, i32 63>
+
+ store <32 x i8> %tmp9, <32 x i8>* %Dst
+ ret void
+}
+
-; Test that vector zexts are done efficently with unpack instructions also in
-; case of fewer elements than allowed, e.g. <2 x i32>.
+; Test that vector zexts are done efficiently also in case of fewer elements
+; than allowed, e.g. <2 x i32>.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
define <2 x i32> @fun2(<2 x i8> %val1) {
; CHECK-LABEL: fun2:
-; CHECK: vuplhb %v0, %v24
-; CHECK-NEXT: vuplhh %v24, %v0
+; CHECK: larl %r1, .LCPI1_0
+; CHECK-NEXT: vl %v0, 0(%r1), 3
+; CHECK-NEXT: vperm %v24, %v0, %v24, %v0
; CHECK-NEXT: br %r14
%z = zext <2 x i8> %val1 to <2 x i32>
ret <2 x i32> %z
define <2 x i64> @fun3(<2 x i8> %val1) {
; CHECK-LABEL: fun3:
-; CHECK: vuplhb %v0, %v24
-; CHECK-NEXT: vuplhh %v0, %v0
-; CHECK-NEXT: vuplhf %v24, %v0
+; CHECK: larl %r1, .LCPI2_0
+; CHECK-NEXT: vl %v0, 0(%r1), 3
+; CHECK-NEXT: vperm %v24, %v0, %v24, %v0
; CHECK-NEXT: br %r14
%z = zext <2 x i8> %val1 to <2 x i64>
ret <2 x i64> %z
define <2 x i64> @fun5(<2 x i16> %val1) {
; CHECK-LABEL: fun5:
-; CHECK: vuplhh %v0, %v24
-; CHECK-NEXT: vuplhf %v24, %v0
+; CHECK: larl %r1, .LCPI4_0
+; CHECK-NEXT: vl %v0, 0(%r1), 3
+; CHECK-NEXT: vperm %v24, %v0, %v24, %v0
; CHECK-NEXT: br %r14
%z = zext <2 x i16> %val1 to <2 x i64>
ret <2 x i64> %z
define <4 x i32> @fun8(<4 x i8> %val1) {
; CHECK-LABEL: fun8:
-; CHECK: vuplhb %v0, %v24
-; CHECK-NEXT: vuplhh %v24, %v0
+; CHECK: larl %r1, .LCPI7_0
+; CHECK-NEXT: vl %v0, 0(%r1), 3
+; CHECK-NEXT: vperm %v24, %v0, %v24, %v0
; CHECK-NEXT: br %r14
%z = zext <4 x i8> %val1 to <4 x i32>
ret <4 x i32> %z