return SDValue();
}
+// Called by type legalization to handle splat of i64 on RV32.
+// FIXME: We can optimize this when the type has sign or zero bits in one
+// of the halves.
+static SDValue splatSplitI64WithVL(const SDLoc &DL, MVT VT, SDValue Scalar,
+ SDValue VL, SelectionDAG &DAG) {
+ SDValue ThirtyTwoV = DAG.getConstant(32, DL, VT);
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Scalar,
+ DAG.getConstant(0, DL, MVT::i32));
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Scalar,
+ DAG.getConstant(1, DL, MVT::i32));
+
+ // vmv.v.x vX, hi
+ // vsll.vx vX, vX, /*32*/
+ // vmv.v.x vY, lo
+ // vsll.vx vY, vY, /*32*/
+ // vsrl.vx vY, vY, /*32*/
+ // vor.vv vX, vX, vY
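+ //
+ // For example, splatting the i64 value 0x11111111_F2222222: vmv.v.x
+ // sign-extends the low half to 0xFFFFFFFF_F2222222, the shl/srl pair clears
+ // those upper sign bits, and the final OR merges in the shifted high half.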
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
+ SDValue Mask = DAG.getNode(RISCVISD::VMSET_VL, DL, MaskVT, VL);
+ Lo = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, Lo, VL);
+ Lo = DAG.getNode(RISCVISD::SHL_VL, DL, VT, Lo, ThirtyTwoV, Mask, VL);
+ Lo = DAG.getNode(RISCVISD::SRL_VL, DL, VT, Lo, ThirtyTwoV, Mask, VL);
+
+ Hi = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, Hi, VL);
+ Hi = DAG.getNode(RISCVISD::SHL_VL, DL, VT, Hi, ThirtyTwoV, Mask, VL);
+
+ return DAG.getNode(RISCVISD::OR_VL, DL, VT, Lo, Hi, Mask, VL);
+}
+
+// This function lowers a splat of the scalar operand Scalar with the vector
+// length VL. It ensures the final sequence is type legal, which is useful when
+// lowering a splat after type legalization.
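+// It is used both by VECTOR_SHUFFLE lowering and by the riscv.vmv.v.x
+// intrinsic lowering further down.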
+static SDValue lowerScalarSplat(SDValue Scalar, SDValue VL, MVT VT, SDLoc DL,
+ SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
+ if (VT.isFloatingPoint())
+ return DAG.getNode(RISCVISD::VFMV_V_F_VL, DL, VT, Scalar, VL);
+
+ MVT XLenVT = Subtarget.getXLenVT();
+
+ // The simplest case is when the operand only needs to be promoted to XLenVT.
+ if (Scalar.getValueType().bitsLE(XLenVT)) {
+ // If the operand is a constant, sign extend to increase our chances
+ // of being able to use a .vi instruction. ANY_EXTEND would become a
+ // zero extend and the simm5 check in isel would fail.
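+ // For example, an i8 constant -1 sign-extends to -1 and can still form a
+ // vmv.v.i, whereas a zero extend would produce 255 and fail that check.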
+ // FIXME: Should we ignore the upper bits in isel instead?
+ unsigned ExtOpc =
+ isa<ConstantSDNode>(Scalar) ? ISD::SIGN_EXTEND : ISD::ANY_EXTEND;
+ Scalar = DAG.getNode(ExtOpc, DL, XLenVT, Scalar);
+ return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, Scalar, VL);
+ }
+
+ assert(XLenVT == MVT::i32 && Scalar.getValueType() == MVT::i64 &&
+ "Unexpected scalar for splat lowering!");
+
+ // If this is a sign-extended 32-bit constant, we can truncate it and rely
+ // on the instruction to sign-extend since SEW > XLEN.
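+ // For example, an i64 splat of -3 can be emitted as a vmv.v.x of the i32
+ // value -3, since vmv.v.x sign-extends the 32-bit scalar to SEW=64.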
+ if (auto *CVal = dyn_cast<ConstantSDNode>(Scalar)) {
+ if (isInt<32>(CVal->getSExtValue()))
+ return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT,
+ DAG.getConstant(CVal->getSExtValue(), DL, MVT::i32),
+ VL);
+ }
+
+ // Otherwise use the more complicated splatting algorithm.
+ return splatSplitI64WithVL(DL, VT, Scalar, VL, DAG);
+}
+
static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
const RISCVSubtarget &Subtarget) {
SDValue V1 = Op.getOperand(0);
unsigned NumElts = VT.getVectorNumElements();
ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
+ MVT ContainerVT =
+ RISCVTargetLowering::getContainerForFixedLengthVector(DAG, VT, Subtarget);
+
+ SDValue TrueMask, VL;
+ std::tie(TrueMask, VL) = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
+
if (SVN->isSplat()) {
int Lane = SVN->getSplatIndex();
if (Lane >= 0) {
- MVT ContainerVT = RISCVTargetLowering::getContainerForFixedLengthVector(
- DAG, VT, Subtarget);
-
V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
assert(Lane < (int)NumElts && "Unexpected lane!");
-
- SDValue Mask, VL;
- std::tie(Mask, VL) = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
SDValue Gather =
DAG.getNode(RISCVISD::VRGATHER_VX_VL, DL, ContainerVT, V1,
- DAG.getConstant(Lane, DL, XLenVT), Mask, VL);
+ DAG.getConstant(Lane, DL, XLenVT), TrueMask, VL);
return convertFromScalableVector(VT, Gather, DAG, Subtarget);
}
}
- // Detect shuffles which can be re-expressed as vector selects.
- SmallVector<SDValue> MaskVals;
- // By default we preserve the original operand order, and select LHS as true
- // and RHS as false. However, since RVV vector selects may feature splats but
- // only on the LHS, we may choose to invert our mask and instead select
- // between RHS and LHS.
- bool SwapOps = DAG.isSplatValue(V2) && !DAG.isSplatValue(V1);
-
+ // Detect shuffles which can be re-expressed as vector selects; these are
+ // shuffles in which each element in the destination is taken from an element
+ // at the corresponding index in either source vector.
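+ // For example, the 4-element mask <0, 5, 2, 7> takes lanes 0 and 2 from the
+ // first source and lanes 1 and 3 from the second, so it can be lowered as a
+ // vselect rather than a gather.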
bool IsSelect = all_of(enumerate(SVN->getMask()), [&](const auto &MaskIdx) {
int MaskIndex = MaskIdx.value();
- bool SelectMaskVal = (MaskIndex < (int)NumElts) ^ SwapOps;
- MaskVals.push_back(DAG.getConstant(SelectMaskVal, DL, XLenVT));
return MaskIndex < 0 || MaskIdx.index() == (unsigned)MaskIndex % NumElts;
});
- if (IsSelect) {
- assert(MaskVals.size() == NumElts && "Unexpected select-like shuffle");
- MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
- SDValue SelectMask = DAG.getBuildVector(MaskVT, DL, MaskVals);
- return DAG.getNode(ISD::VSELECT, DL, VT, SelectMask, SwapOps ? V2 : V1,
- SwapOps ? V1 : V2);
+ assert(!V1.isUndef() && "Unexpected shuffle canonicalization");
+
+ SmallVector<SDValue> MaskVals;
+ // As a backup, shuffles can be lowered via a vrgather instruction, possibly
+ // merged with a second vrgather.
+ SmallVector<SDValue> GatherIndicesLHS, GatherIndicesRHS;
+
+ // By default we preserve the original operand order, and use a mask to
+ // select LHS as true and RHS as false. However, since RVV vector selects may
+ // feature splats but only on the LHS, we may choose to invert our mask and
+ // instead select between RHS and LHS.
+ bool SwapOps = DAG.isSplatValue(V2) && !DAG.isSplatValue(V1);
+ bool InvertMask = IsSelect == SwapOps;
+
+ // Now construct the mask that will be used by the vselect or blended
+ // vrgather operation. For vrgathers, construct the appropriate indices into
+ // each vector.
+ for (int MaskIndex : SVN->getMask()) {
+ bool SelectMaskVal = (MaskIndex < (int)NumElts) ^ InvertMask;
+ MaskVals.push_back(DAG.getConstant(SelectMaskVal, DL, XLenVT));
+ if (!IsSelect) {
+ bool IsLHS = MaskIndex < (int)NumElts;
+ // For "undef" elements of -1, shuffle in element 0 instead.
+ GatherIndicesLHS.push_back(
+ DAG.getConstant(IsLHS ? std::max(MaskIndex, 0) : 0, DL, XLenVT));
+ // TODO: If we're masking out unused elements anyway, it might produce
+ // better code if we use the most-common element index instead of 0.
+ GatherIndicesRHS.push_back(
+ DAG.getConstant(IsLHS ? 0 : MaskIndex - NumElts, DL, XLenVT));
+ }
}
- return SDValue();
+ if (SwapOps) {
+ std::swap(V1, V2);
+ std::swap(GatherIndicesLHS, GatherIndicesRHS);
+ }
+
+ assert(MaskVals.size() == NumElts && "Unexpected select-like shuffle");
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
+ SDValue SelectMask = DAG.getBuildVector(MaskVT, DL, MaskVals);
+
+ if (IsSelect)
+ return DAG.getNode(ISD::VSELECT, DL, VT, SelectMask, V1, V2);
+
+ if (VT.getScalarSizeInBits() == 8 && VT.getVectorNumElements() > 256) {
+ // On such a large vector we're unable to use i8 as the index type.
+ // FIXME: We could promote the index to i16 and use vrgatherei16, but that
+ // may involve vector splitting if we're already at LMUL=8, or our
+ // user-supplied maximum fixed-length LMUL.
+ return SDValue();
+ }
+
+ unsigned GatherOpc = RISCVISD::VRGATHER_VV_VL;
+ MVT IndexVT = VT.changeTypeToInteger();
+ // Since we can't introduce illegal index types at this stage, use i16 and
+ // vrgatherei16 if the corresponding index type for plain vrgather is greater
+ // than XLenVT.
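+ // For example, an i64-element gather on RV32 would otherwise need an i64
+ // index vector, which is not legal there, so it uses vrgatherei16.vv with
+ // i16 indices.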
+ if (IndexVT.getScalarType().bitsGT(XLenVT)) {
+ GatherOpc = RISCVISD::VRGATHEREI16_VV_VL;
+ IndexVT = IndexVT.changeVectorElementType(MVT::i16);
+ }
+
+ MVT IndexContainerVT =
+ ContainerVT.changeVectorElementType(IndexVT.getScalarType());
+
+ SDValue Gather;
+ // TODO: This doesn't trigger for i64 vectors on RV32, since there we
+ // encounter a bitcasted BUILD_VECTOR with low/high i32 values.
+ if (SDValue SplatValue = DAG.getSplatValue(V1)) {
+ Gather = lowerScalarSplat(SplatValue, VL, ContainerVT, DL, DAG, Subtarget);
+ } else {
+ SDValue LHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesLHS);
+ LHSIndices =
+ convertToScalableVector(IndexContainerVT, LHSIndices, DAG, Subtarget);
+
+ V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
+ Gather =
+ DAG.getNode(GatherOpc, DL, ContainerVT, V1, LHSIndices, TrueMask, VL);
+ }
+
+ // If a second vector operand is used by this shuffle, blend it in with an
+ // additional vrgather.
+ if (!V2.isUndef()) {
+ MVT MaskContainerVT = ContainerVT.changeVectorElementType(MVT::i1);
+ SelectMask =
+ convertToScalableVector(MaskContainerVT, SelectMask, DAG, Subtarget);
+
+ SDValue RHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesRHS);
+ RHSIndices =
+ convertToScalableVector(IndexContainerVT, RHSIndices, DAG, Subtarget);
+
+ V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget);
+ V2 = DAG.getNode(GatherOpc, DL, ContainerVT, V2, RHSIndices, TrueMask, VL);
+ Gather = DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, SelectMask, V2,
+ Gather, VL);
+ }
+
+ return convertFromScalableVector(VT, Gather, DAG, Subtarget);
}
static SDValue getRVVFPExtendOrRound(SDValue Op, MVT VT, MVT ContainerVT,
return DAG.getNode(ISD::TRUNCATE, DL, EltVT, Elt0);
}
-// Called by type legalization to handle splat of i64 on RV32.
-// FIXME: We can optimize this when the type has sign or zero bits in one
-// of the halves.
-static SDValue splatSplitI64WithVL(const SDLoc &DL, MVT VT, SDValue Scalar,
- SDValue VL, SelectionDAG &DAG) {
- SDValue ThirtyTwoV = DAG.getConstant(32, DL, VT);
- SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Scalar,
- DAG.getConstant(0, DL, MVT::i32));
- SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Scalar,
- DAG.getConstant(1, DL, MVT::i32));
-
- // vmv.v.x vX, hi
- // vsll.vx vX, vX, /*32*/
- // vmv.v.x vY, lo
- // vsll.vx vY, vY, /*32*/
- // vsrl.vx vY, vY, /*32*/
- // vor.vv vX, vX, vY
- MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
- SDValue Mask = DAG.getNode(RISCVISD::VMSET_VL, DL, MaskVT, VL);
- Lo = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, Lo, VL);
- Lo = DAG.getNode(RISCVISD::SHL_VL, DL, VT, Lo, ThirtyTwoV, Mask, VL);
- Lo = DAG.getNode(RISCVISD::SRL_VL, DL, VT, Lo, ThirtyTwoV, Mask, VL);
-
- Hi = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, Hi, VL);
- Hi = DAG.getNode(RISCVISD::SHL_VL, DL, VT, Hi, ThirtyTwoV, Mask, VL);
-
- return DAG.getNode(RISCVISD::OR_VL, DL, VT, Lo, Hi, Mask, VL);
-}
-
// Some RVV intrinsics may claim that they want an integer operand to be
// promoted or expanded.
static SDValue lowerVectorIntrinsicSplats(SDValue Op, SelectionDAG &DAG,
assert(Op.getValueType() == XLenVT && "Unexpected VT!");
return DAG.getNode(RISCVISD::VMV_X_S, DL, Op.getValueType(),
Op.getOperand(1));
- case Intrinsic::riscv_vmv_v_x: {
- SDValue Scalar = Op.getOperand(1);
- if (Scalar.getValueType().bitsLE(XLenVT)) {
- unsigned ExtOpc =
- isa<ConstantSDNode>(Scalar) ? ISD::SIGN_EXTEND : ISD::ANY_EXTEND;
- Scalar = DAG.getNode(ExtOpc, DL, XLenVT, Scalar);
- return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, Op.getValueType(), Scalar,
- Op.getOperand(2));
- }
-
- assert(Scalar.getValueType() == MVT::i64 && "Unexpected scalar VT!");
-
- // If this is a sign-extended 32-bit constant, we can truncate it and rely
- // on the instruction to sign-extend since SEW>XLEN.
- if (auto *CVal = dyn_cast<ConstantSDNode>(Scalar)) {
- if (isInt<32>(CVal->getSExtValue()))
- return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, Op.getValueType(),
- DAG.getConstant(CVal->getSExtValue(), DL, MVT::i32),
- Op.getOperand(2));
- }
-
- // Otherwise use the more complicated splatting algorithm.
- return splatSplitI64WithVL(DL, Op.getSimpleValueType(), Scalar,
- Op.getOperand(2), DAG);
- }
+ case Intrinsic::riscv_vmv_v_x:
+ return lowerScalarSplat(Op.getOperand(1), Op.getOperand(2),
+ Op.getSimpleValueType(), DL, DAG, Subtarget);
case Intrinsic::riscv_vfmv_v_f:
return DAG.getNode(RISCVISD::VFMV_V_F_VL, DL, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
define <4 x double> @vrgather_permute_shuffle_vu_v4f64(<4 x double> %x) {
; RV32-LABEL: vrgather_permute_shuffle_vu_v4f64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -64
-; RV32-NEXT: .cfi_def_cfa_offset 64
-; RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: .cfi_offset s0, -8
-; RV32-NEXT: addi s0, sp, 64
-; RV32-NEXT: .cfi_def_cfa s0, 0
-; RV32-NEXT: andi sp, sp, -32
-; RV32-NEXT: vsetivli a0, 1, e64,m2,ta,mu
-; RV32-NEXT: vslidedown.vi v26, v8, 1
-; RV32-NEXT: vfmv.f.s ft0, v26
-; RV32-NEXT: vsetivli a0, 2, e64,m1,ta,mu
-; RV32-NEXT: vfmv.v.f v25, ft0
-; RV32-NEXT: vsetvli zero, zero, e64,m2,ta,mu
-; RV32-NEXT: vfmv.f.s ft1, v8
-; RV32-NEXT: vsetivli a0, 2, e64,m1,ta,mu
-; RV32-NEXT: vfmv.s.f v25, ft1
-; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vse64.v v25, (a0)
-; RV32-NEXT: vsetivli a0, 1, e64,m2,ta,mu
-; RV32-NEXT: vslidedown.vi v26, v8, 2
-; RV32-NEXT: vfmv.f.s ft1, v26
-; RV32-NEXT: vsetivli a0, 2, e64,m1,ta,mu
-; RV32-NEXT: vfmv.v.f v25, ft1
-; RV32-NEXT: vfmv.s.f v25, ft0
-; RV32-NEXT: vse64.v v25, (sp)
+; RV32-NEXT: lui a0, %hi(.LCPI4_0)
+; RV32-NEXT: addi a0, a0, %lo(.LCPI4_0)
+; RV32-NEXT: vsetivli a1, 4, e16,m1,ta,mu
+; RV32-NEXT: vle16.v v25, (a0)
; RV32-NEXT: vsetivli a0, 4, e64,m2,ta,mu
-; RV32-NEXT: vle64.v v8, (sp)
-; RV32-NEXT: addi sp, s0, -64
-; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 64
+; RV32-NEXT: vrgatherei16.vv v26, v8, v25
+; RV32-NEXT: vmv2r.v v8, v26
; RV32-NEXT: ret
;
; RV64-LABEL: vrgather_permute_shuffle_vu_v4f64:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -64
-; RV64-NEXT: .cfi_def_cfa_offset 64
-; RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset ra, -8
-; RV64-NEXT: .cfi_offset s0, -16
-; RV64-NEXT: addi s0, sp, 64
-; RV64-NEXT: .cfi_def_cfa s0, 0
-; RV64-NEXT: andi sp, sp, -32
-; RV64-NEXT: vsetivli a0, 1, e64,m2,ta,mu
-; RV64-NEXT: vslidedown.vi v26, v8, 1
-; RV64-NEXT: vfmv.f.s ft0, v26
-; RV64-NEXT: vsetivli a0, 2, e64,m1,ta,mu
-; RV64-NEXT: vfmv.v.f v25, ft0
-; RV64-NEXT: vsetvli zero, zero, e64,m2,ta,mu
-; RV64-NEXT: vfmv.f.s ft1, v8
-; RV64-NEXT: vsetivli a0, 2, e64,m1,ta,mu
-; RV64-NEXT: vfmv.s.f v25, ft1
-; RV64-NEXT: addi a0, sp, 16
-; RV64-NEXT: vse64.v v25, (a0)
-; RV64-NEXT: vsetivli a0, 1, e64,m2,ta,mu
-; RV64-NEXT: vslidedown.vi v26, v8, 2
-; RV64-NEXT: vfmv.f.s ft1, v26
-; RV64-NEXT: vsetivli a0, 2, e64,m1,ta,mu
-; RV64-NEXT: vfmv.v.f v25, ft1
-; RV64-NEXT: vfmv.s.f v25, ft0
-; RV64-NEXT: vse64.v v25, (sp)
-; RV64-NEXT: vsetivli a0, 4, e64,m2,ta,mu
-; RV64-NEXT: vle64.v v8, (sp)
-; RV64-NEXT: addi sp, s0, -64
-; RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 64
+; RV64-NEXT: lui a0, %hi(.LCPI4_0)
+; RV64-NEXT: addi a0, a0, %lo(.LCPI4_0)
+; RV64-NEXT: vsetivli a1, 4, e64,m2,ta,mu
+; RV64-NEXT: vle64.v v28, (a0)
+; RV64-NEXT: vrgather.vv v26, v8, v28
+; RV64-NEXT: vmv2r.v v8, v26
; RV64-NEXT: ret
%s = shufflevector <4 x double> %x, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 1>
ret <4 x double> %s
define <4 x double> @vrgather_permute_shuffle_uv_v4f64(<4 x double> %x) {
; RV32-LABEL: vrgather_permute_shuffle_uv_v4f64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -64
-; RV32-NEXT: .cfi_def_cfa_offset 64
-; RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: .cfi_offset s0, -8
-; RV32-NEXT: addi s0, sp, 64
-; RV32-NEXT: .cfi_def_cfa s0, 0
-; RV32-NEXT: andi sp, sp, -32
-; RV32-NEXT: vsetivli a0, 1, e64,m2,ta,mu
-; RV32-NEXT: vslidedown.vi v26, v8, 1
-; RV32-NEXT: vfmv.f.s ft0, v26
-; RV32-NEXT: vsetivli a0, 2, e64,m1,ta,mu
-; RV32-NEXT: vfmv.v.f v25, ft0
-; RV32-NEXT: vsetvli zero, zero, e64,m2,ta,mu
-; RV32-NEXT: vfmv.f.s ft1, v8
-; RV32-NEXT: vsetivli a0, 2, e64,m1,ta,mu
-; RV32-NEXT: vfmv.s.f v25, ft1
-; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vse64.v v25, (a0)
-; RV32-NEXT: vsetivli a0, 1, e64,m2,ta,mu
-; RV32-NEXT: vslidedown.vi v26, v8, 2
-; RV32-NEXT: vfmv.f.s ft1, v26
-; RV32-NEXT: vsetivli a0, 2, e64,m1,ta,mu
-; RV32-NEXT: vfmv.v.f v25, ft1
-; RV32-NEXT: vfmv.s.f v25, ft0
-; RV32-NEXT: vse64.v v25, (sp)
+; RV32-NEXT: lui a0, %hi(.LCPI5_0)
+; RV32-NEXT: addi a0, a0, %lo(.LCPI5_0)
+; RV32-NEXT: vsetivli a1, 4, e16,m1,ta,mu
+; RV32-NEXT: vle16.v v25, (a0)
; RV32-NEXT: vsetivli a0, 4, e64,m2,ta,mu
-; RV32-NEXT: vle64.v v8, (sp)
-; RV32-NEXT: addi sp, s0, -64
-; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 64
+; RV32-NEXT: vrgatherei16.vv v26, v8, v25
+; RV32-NEXT: vmv2r.v v8, v26
; RV32-NEXT: ret
;
; RV64-LABEL: vrgather_permute_shuffle_uv_v4f64:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -64
-; RV64-NEXT: .cfi_def_cfa_offset 64
-; RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset ra, -8
-; RV64-NEXT: .cfi_offset s0, -16
-; RV64-NEXT: addi s0, sp, 64
-; RV64-NEXT: .cfi_def_cfa s0, 0
-; RV64-NEXT: andi sp, sp, -32
-; RV64-NEXT: vsetivli a0, 1, e64,m2,ta,mu
-; RV64-NEXT: vslidedown.vi v26, v8, 1
-; RV64-NEXT: vfmv.f.s ft0, v26
-; RV64-NEXT: vsetivli a0, 2, e64,m1,ta,mu
-; RV64-NEXT: vfmv.v.f v25, ft0
-; RV64-NEXT: vsetvli zero, zero, e64,m2,ta,mu
-; RV64-NEXT: vfmv.f.s ft1, v8
-; RV64-NEXT: vsetivli a0, 2, e64,m1,ta,mu
-; RV64-NEXT: vfmv.s.f v25, ft1
-; RV64-NEXT: addi a0, sp, 16
-; RV64-NEXT: vse64.v v25, (a0)
-; RV64-NEXT: vsetivli a0, 1, e64,m2,ta,mu
-; RV64-NEXT: vslidedown.vi v26, v8, 2
-; RV64-NEXT: vfmv.f.s ft1, v26
-; RV64-NEXT: vsetivli a0, 2, e64,m1,ta,mu
-; RV64-NEXT: vfmv.v.f v25, ft1
-; RV64-NEXT: vfmv.s.f v25, ft0
-; RV64-NEXT: vse64.v v25, (sp)
-; RV64-NEXT: vsetivli a0, 4, e64,m2,ta,mu
-; RV64-NEXT: vle64.v v8, (sp)
-; RV64-NEXT: addi sp, s0, -64
-; RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 64
+; RV64-NEXT: lui a0, %hi(.LCPI5_0)
+; RV64-NEXT: addi a0, a0, %lo(.LCPI5_0)
+; RV64-NEXT: vsetivli a1, 4, e64,m2,ta,mu
+; RV64-NEXT: vle64.v v28, (a0)
+; RV64-NEXT: vrgather.vv v26, v8, v28
+; RV64-NEXT: vmv2r.v v8, v26
; RV64-NEXT: ret
%s = shufflevector <4 x double> undef, <4 x double> %x, <4 x i32> <i32 5, i32 6, i32 4, i32 5>
ret <4 x double> %s
define <4 x double> @vrgather_shuffle_vv_v4f64(<4 x double> %x, <4 x double> %y) {
; RV32-LABEL: vrgather_shuffle_vv_v4f64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -64
-; RV32-NEXT: .cfi_def_cfa_offset 64
-; RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: .cfi_offset s0, -8
-; RV32-NEXT: addi s0, sp, 64
-; RV32-NEXT: .cfi_def_cfa s0, 0
-; RV32-NEXT: andi sp, sp, -32
-; RV32-NEXT: vsetivli a0, 1, e64,m2,ta,mu
-; RV32-NEXT: vslidedown.vi v26, v10, 1
-; RV32-NEXT: vfmv.f.s ft0, v26
-; RV32-NEXT: vsetivli a0, 2, e64,m1,ta,mu
-; RV32-NEXT: vfmv.v.f v25, ft0
-; RV32-NEXT: vsetvli zero, zero, e64,m2,ta,mu
-; RV32-NEXT: vfmv.f.s ft0, v8
-; RV32-NEXT: vsetivli a0, 2, e64,m1,ta,mu
-; RV32-NEXT: vfmv.s.f v25, ft0
-; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vse64.v v25, (a0)
-; RV32-NEXT: vsetivli a0, 1, e64,m2,ta,mu
-; RV32-NEXT: vslidedown.vi v26, v8, 2
-; RV32-NEXT: vfmv.f.s ft0, v26
-; RV32-NEXT: vsetivli a0, 2, e64,m1,ta,mu
-; RV32-NEXT: vfmv.v.f v25, ft0
-; RV32-NEXT: vsetivli a0, 1, e64,m2,ta,mu
-; RV32-NEXT: vslidedown.vi v26, v8, 1
-; RV32-NEXT: vfmv.f.s ft0, v26
-; RV32-NEXT: vsetivli a0, 2, e64,m1,ta,mu
-; RV32-NEXT: vfmv.s.f v25, ft0
-; RV32-NEXT: vse64.v v25, (sp)
+; RV32-NEXT: addi a0, zero, 1
+; RV32-NEXT: addi a1, zero, 8
+; RV32-NEXT: vsetivli a2, 1, e8,m1,ta,mu
+; RV32-NEXT: vmv.s.x v0, a1
+; RV32-NEXT: vsetivli a1, 4, e16,m1,ta,mu
+; RV32-NEXT: vmv.s.x v25, a0
+; RV32-NEXT: vmv.v.i v28, 0
+; RV32-NEXT: vsetivli a0, 4, e16,m1,tu,mu
+; RV32-NEXT: vslideup.vi v28, v25, 3
+; RV32-NEXT: lui a0, %hi(.LCPI6_0)
+; RV32-NEXT: addi a0, a0, %lo(.LCPI6_0)
+; RV32-NEXT: vsetivli a1, 4, e16,m1,ta,mu
+; RV32-NEXT: vle16.v v25, (a0)
; RV32-NEXT: vsetivli a0, 4, e64,m2,ta,mu
-; RV32-NEXT: vle64.v v8, (sp)
-; RV32-NEXT: addi sp, s0, -64
-; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 64
+; RV32-NEXT: vrgatherei16.vv v26, v8, v25
+; RV32-NEXT: vsetivli a0, 4, e64,m2,tu,mu
+; RV32-NEXT: vrgatherei16.vv v26, v10, v28, v0.t
+; RV32-NEXT: vmv2r.v v8, v26
; RV32-NEXT: ret
;
; RV64-LABEL: vrgather_shuffle_vv_v4f64:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -64
-; RV64-NEXT: .cfi_def_cfa_offset 64
-; RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset ra, -8
-; RV64-NEXT: .cfi_offset s0, -16
-; RV64-NEXT: addi s0, sp, 64
-; RV64-NEXT: .cfi_def_cfa s0, 0
-; RV64-NEXT: andi sp, sp, -32
-; RV64-NEXT: vsetivli a0, 1, e64,m2,ta,mu
-; RV64-NEXT: vslidedown.vi v26, v10, 1
-; RV64-NEXT: vfmv.f.s ft0, v26
-; RV64-NEXT: vsetivli a0, 2, e64,m1,ta,mu
-; RV64-NEXT: vfmv.v.f v25, ft0
-; RV64-NEXT: vsetvli zero, zero, e64,m2,ta,mu
-; RV64-NEXT: vfmv.f.s ft0, v8
-; RV64-NEXT: vsetivli a0, 2, e64,m1,ta,mu
-; RV64-NEXT: vfmv.s.f v25, ft0
-; RV64-NEXT: addi a0, sp, 16
-; RV64-NEXT: vse64.v v25, (a0)
-; RV64-NEXT: vsetivli a0, 1, e64,m2,ta,mu
-; RV64-NEXT: vslidedown.vi v26, v8, 2
-; RV64-NEXT: vfmv.f.s ft0, v26
-; RV64-NEXT: vsetivli a0, 2, e64,m1,ta,mu
-; RV64-NEXT: vfmv.v.f v25, ft0
-; RV64-NEXT: vsetivli a0, 1, e64,m2,ta,mu
-; RV64-NEXT: vslidedown.vi v26, v8, 1
-; RV64-NEXT: vfmv.f.s ft0, v26
-; RV64-NEXT: vsetivli a0, 2, e64,m1,ta,mu
-; RV64-NEXT: vfmv.s.f v25, ft0
-; RV64-NEXT: vse64.v v25, (sp)
-; RV64-NEXT: vsetivli a0, 4, e64,m2,ta,mu
-; RV64-NEXT: vle64.v v8, (sp)
-; RV64-NEXT: addi sp, s0, -64
-; RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 64
+; RV64-NEXT: addi a0, zero, 1
+; RV64-NEXT: vsetivli a1, 4, e64,m2,ta,mu
+; RV64-NEXT: vmv.s.x v26, a0
+; RV64-NEXT: vmv.v.i v28, 0
+; RV64-NEXT: vsetivli a0, 4, e64,m2,tu,mu
+; RV64-NEXT: vslideup.vi v28, v26, 3
+; RV64-NEXT: addi a0, zero, 8
+; RV64-NEXT: vsetivli a1, 1, e8,m1,ta,mu
+; RV64-NEXT: vmv.s.x v0, a0
+; RV64-NEXT: lui a0, %hi(.LCPI6_0)
+; RV64-NEXT: addi a0, a0, %lo(.LCPI6_0)
+; RV64-NEXT: vsetivli a1, 4, e64,m2,ta,mu
+; RV64-NEXT: vle64.v v30, (a0)
+; RV64-NEXT: vrgather.vv v26, v8, v30
+; RV64-NEXT: vsetivli a0, 4, e64,m2,tu,mu
+; RV64-NEXT: vrgather.vv v26, v10, v28, v0.t
+; RV64-NEXT: vmv2r.v v8, v26
; RV64-NEXT: ret
%s = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 2, i32 0, i32 5>
ret <4 x double> %s
define <4 x double> @vrgather_shuffle_xv_v4f64(<4 x double> %x) {
; RV32-LABEL: vrgather_shuffle_xv_v4f64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -64
-; RV32-NEXT: .cfi_def_cfa_offset 64
-; RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: .cfi_offset s0, -8
-; RV32-NEXT: addi s0, sp, 64
-; RV32-NEXT: .cfi_def_cfa s0, 0
-; RV32-NEXT: andi sp, sp, -32
-; RV32-NEXT: vsetivli a0, 1, e64,m2,ta,mu
-; RV32-NEXT: vslidedown.vi v26, v8, 1
-; RV32-NEXT: vfmv.f.s ft0, v26
-; RV32-NEXT: vsetivli a0, 2, e64,m1,ta,mu
-; RV32-NEXT: vfmv.v.f v25, ft0
-; RV32-NEXT: vsetivli a0, 1, e64,m2,ta,mu
-; RV32-NEXT: vslidedown.vi v26, v8, 2
-; RV32-NEXT: vfmv.f.s ft0, v26
-; RV32-NEXT: vsetivli a0, 2, e64,m1,ta,mu
-; RV32-NEXT: lui a0, %hi(.LCPI7_0)
-; RV32-NEXT: fld ft1, %lo(.LCPI7_0)(a0)
-; RV32-NEXT: vfmv.s.f v25, ft0
-; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vse64.v v25, (a0)
-; RV32-NEXT: fsd ft1, 8(sp)
-; RV32-NEXT: fsd ft1, 0(sp)
+; RV32-NEXT: addi a0, zero, 12
+; RV32-NEXT: lui a1, %hi(.LCPI7_0)
+; RV32-NEXT: fld ft0, %lo(.LCPI7_0)(a1)
+; RV32-NEXT: vsetivli a1, 1, e8,m1,ta,mu
+; RV32-NEXT: vmv.s.x v0, a0
; RV32-NEXT: vsetivli a0, 4, e64,m2,ta,mu
-; RV32-NEXT: vle64.v v8, (sp)
-; RV32-NEXT: addi sp, s0, -64
-; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 64
+; RV32-NEXT: vfmv.v.f v26, ft0
+; RV32-NEXT: lui a0, %hi(.LCPI7_1)
+; RV32-NEXT: addi a0, a0, %lo(.LCPI7_1)
+; RV32-NEXT: vsetivli a1, 4, e16,m1,ta,mu
+; RV32-NEXT: vle16.v v25, (a0)
+; RV32-NEXT: vsetivli a0, 4, e64,m2,tu,mu
+; RV32-NEXT: vrgatherei16.vv v26, v8, v25, v0.t
+; RV32-NEXT: vmv2r.v v8, v26
; RV32-NEXT: ret
;
; RV64-LABEL: vrgather_shuffle_xv_v4f64:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -64
-; RV64-NEXT: .cfi_def_cfa_offset 64
-; RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset ra, -8
-; RV64-NEXT: .cfi_offset s0, -16
-; RV64-NEXT: addi s0, sp, 64
-; RV64-NEXT: .cfi_def_cfa s0, 0
-; RV64-NEXT: andi sp, sp, -32
-; RV64-NEXT: vsetivli a0, 1, e64,m2,ta,mu
-; RV64-NEXT: vslidedown.vi v26, v8, 1
-; RV64-NEXT: vfmv.f.s ft0, v26
-; RV64-NEXT: vsetivli a0, 2, e64,m1,ta,mu
-; RV64-NEXT: vfmv.v.f v25, ft0
-; RV64-NEXT: vsetivli a0, 1, e64,m2,ta,mu
-; RV64-NEXT: vslidedown.vi v26, v8, 2
-; RV64-NEXT: vfmv.f.s ft0, v26
-; RV64-NEXT: vsetivli a0, 2, e64,m1,ta,mu
+; RV64-NEXT: addi a0, zero, 12
+; RV64-NEXT: vsetivli a1, 1, e8,m1,ta,mu
+; RV64-NEXT: vmv.s.x v0, a0
; RV64-NEXT: lui a0, %hi(.LCPI7_0)
-; RV64-NEXT: fld ft1, %lo(.LCPI7_0)(a0)
-; RV64-NEXT: vfmv.s.f v25, ft0
-; RV64-NEXT: addi a0, sp, 16
-; RV64-NEXT: vse64.v v25, (a0)
-; RV64-NEXT: fsd ft1, 8(sp)
-; RV64-NEXT: fsd ft1, 0(sp)
-; RV64-NEXT: vsetivli a0, 4, e64,m2,ta,mu
-; RV64-NEXT: vle64.v v8, (sp)
-; RV64-NEXT: addi sp, s0, -64
-; RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 64
+; RV64-NEXT: addi a0, a0, %lo(.LCPI7_0)
+; RV64-NEXT: lui a1, %hi(.LCPI7_1)
+; RV64-NEXT: fld ft0, %lo(.LCPI7_1)(a1)
+; RV64-NEXT: vsetivli a1, 4, e64,m2,ta,mu
+; RV64-NEXT: vle64.v v28, (a0)
+; RV64-NEXT: vfmv.v.f v26, ft0
+; RV64-NEXT: vsetivli a0, 4, e64,m2,tu,mu
+; RV64-NEXT: vrgather.vv v26, v8, v28, v0.t
+; RV64-NEXT: vmv2r.v v8, v26
; RV64-NEXT: ret
%s = shufflevector <4 x double> <double 2.0, double 2.0, double 2.0, double 2.0>, <4 x double> %x, <4 x i32> <i32 0, i32 3, i32 6, i32 5>
ret <4 x double> %s
define <4 x double> @vrgather_shuffle_vx_v4f64(<4 x double> %x) {
; RV32-LABEL: vrgather_shuffle_vx_v4f64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -64
-; RV32-NEXT: .cfi_def_cfa_offset 64
-; RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: .cfi_offset s0, -8
-; RV32-NEXT: addi s0, sp, 64
-; RV32-NEXT: .cfi_def_cfa s0, 0
-; RV32-NEXT: andi sp, sp, -32
-; RV32-NEXT: vsetivli a0, 1, e64,m2,ta,mu
-; RV32-NEXT: vslidedown.vi v26, v8, 3
-; RV32-NEXT: vfmv.f.s ft0, v26
-; RV32-NEXT: vsetivli a0, 2, e64,m1,ta,mu
-; RV32-NEXT: vfmv.v.f v25, ft0
-; RV32-NEXT: vsetvli zero, zero, e64,m2,ta,mu
-; RV32-NEXT: vfmv.f.s ft0, v8
+; RV32-NEXT: addi a0, zero, 3
+; RV32-NEXT: vsetivli a1, 1, e8,m1,ta,mu
+; RV32-NEXT: vmv.s.x v0, a0
+; RV32-NEXT: vsetivli a1, 4, e16,m1,ta,mu
+; RV32-NEXT: vmv.s.x v25, a0
+; RV32-NEXT: vmv.v.i v28, 0
; RV32-NEXT: lui a0, %hi(.LCPI8_0)
-; RV32-NEXT: fld ft1, %lo(.LCPI8_0)(a0)
-; RV32-NEXT: vsetivli a0, 2, e64,m1,ta,mu
-; RV32-NEXT: vfmv.s.f v25, ft0
-; RV32-NEXT: vse64.v v25, (sp)
-; RV32-NEXT: fsd ft1, 24(sp)
-; RV32-NEXT: fsd ft1, 16(sp)
+; RV32-NEXT: fld ft0, %lo(.LCPI8_0)(a0)
+; RV32-NEXT: vsetivli a0, 2, e16,m1,tu,mu
+; RV32-NEXT: vslideup.vi v28, v25, 1
; RV32-NEXT: vsetivli a0, 4, e64,m2,ta,mu
-; RV32-NEXT: vle64.v v8, (sp)
-; RV32-NEXT: addi sp, s0, -64
-; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 64
+; RV32-NEXT: vfmv.v.f v26, ft0
+; RV32-NEXT: vsetivli a0, 4, e64,m2,tu,mu
+; RV32-NEXT: vrgatherei16.vv v26, v8, v28, v0.t
+; RV32-NEXT: vmv2r.v v8, v26
; RV32-NEXT: ret
;
; RV64-LABEL: vrgather_shuffle_vx_v4f64:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -64
-; RV64-NEXT: .cfi_def_cfa_offset 64
-; RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset ra, -8
-; RV64-NEXT: .cfi_offset s0, -16
-; RV64-NEXT: addi s0, sp, 64
-; RV64-NEXT: .cfi_def_cfa s0, 0
-; RV64-NEXT: andi sp, sp, -32
-; RV64-NEXT: vsetivli a0, 1, e64,m2,ta,mu
-; RV64-NEXT: vslidedown.vi v26, v8, 3
-; RV64-NEXT: vfmv.f.s ft0, v26
-; RV64-NEXT: vsetivli a0, 2, e64,m1,ta,mu
-; RV64-NEXT: vfmv.v.f v25, ft0
-; RV64-NEXT: vsetvli zero, zero, e64,m2,ta,mu
-; RV64-NEXT: vfmv.f.s ft0, v8
-; RV64-NEXT: lui a0, %hi(.LCPI8_0)
-; RV64-NEXT: fld ft1, %lo(.LCPI8_0)(a0)
-; RV64-NEXT: vsetivli a0, 2, e64,m1,ta,mu
-; RV64-NEXT: vfmv.s.f v25, ft0
-; RV64-NEXT: vse64.v v25, (sp)
-; RV64-NEXT: fsd ft1, 24(sp)
-; RV64-NEXT: fsd ft1, 16(sp)
+; RV64-NEXT: addi a0, zero, 3
+; RV64-NEXT: vsetivli a1, 4, e64,m2,ta,mu
+; RV64-NEXT: vmv.s.x v26, a0
+; RV64-NEXT: vmv.v.i v28, 0
+; RV64-NEXT: vsetivli a1, 2, e64,m2,tu,mu
+; RV64-NEXT: vslideup.vi v28, v26, 1
+; RV64-NEXT: lui a1, %hi(.LCPI8_0)
+; RV64-NEXT: fld ft0, %lo(.LCPI8_0)(a1)
+; RV64-NEXT: vsetivli a1, 1, e8,m1,ta,mu
+; RV64-NEXT: vmv.s.x v0, a0
; RV64-NEXT: vsetivli a0, 4, e64,m2,ta,mu
-; RV64-NEXT: vle64.v v8, (sp)
-; RV64-NEXT: addi sp, s0, -64
-; RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 64
+; RV64-NEXT: vfmv.v.f v26, ft0
+; RV64-NEXT: vsetivli a0, 4, e64,m2,tu,mu
+; RV64-NEXT: vrgather.vv v26, v8, v28, v0.t
+; RV64-NEXT: vmv2r.v v8, v26
; RV64-NEXT: ret
%s = shufflevector <4 x double> %x, <4 x double> <double 2.0, double 2.0, double 2.0, double 2.0>, <4 x i32> <i32 0, i32 3, i32 6, i32 5>
ret <4 x double> %s
define <4 x i16> @vrgather_permute_shuffle_vu_v4i16(<4 x i16> %x) {
; CHECK-LABEL: vrgather_permute_shuffle_vu_v4i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: vsetvli zero, zero, e16,m1,ta,mu
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: sh a0, 12(sp)
-; CHECK-NEXT: vsetivli a0, 1, e16,m1,ta,mu
-; CHECK-NEXT: vslidedown.vi v25, v8, 1
-; CHECK-NEXT: vmv.x.s a0, v25
-; CHECK-NEXT: sh a0, 14(sp)
-; CHECK-NEXT: vslidedown.vi v25, v8, 2
-; CHECK-NEXT: vmv.x.s a1, v25
-; CHECK-NEXT: sh a1, 10(sp)
-; CHECK-NEXT: sh a0, 8(sp)
-; CHECK-NEXT: vsetivli a0, 4, e16,m1,ta,mu
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: lui a0, %hi(.LCPI4_0)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI4_0)
+; CHECK-NEXT: vsetivli a1, 4, e16,m1,ta,mu
+; CHECK-NEXT: vle16.v v26, (a0)
+; CHECK-NEXT: vrgather.vv v25, v8, v26
+; CHECK-NEXT: vmv1r.v v8, v25
; CHECK-NEXT: ret
%s = shufflevector <4 x i16> %x, <4 x i16> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 1>
ret <4 x i16> %s
define <4 x i16> @vrgather_permute_shuffle_uv_v4i16(<4 x i16> %x) {
; CHECK-LABEL: vrgather_permute_shuffle_uv_v4i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: vsetvli zero, zero, e16,m1,ta,mu
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: sh a0, 12(sp)
-; CHECK-NEXT: vsetivli a0, 1, e16,m1,ta,mu
-; CHECK-NEXT: vslidedown.vi v25, v8, 1
-; CHECK-NEXT: vmv.x.s a0, v25
-; CHECK-NEXT: sh a0, 14(sp)
-; CHECK-NEXT: vslidedown.vi v25, v8, 2
-; CHECK-NEXT: vmv.x.s a1, v25
-; CHECK-NEXT: sh a1, 10(sp)
-; CHECK-NEXT: sh a0, 8(sp)
-; CHECK-NEXT: vsetivli a0, 4, e16,m1,ta,mu
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: lui a0, %hi(.LCPI5_0)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI5_0)
+; CHECK-NEXT: vsetivli a1, 4, e16,m1,ta,mu
+; CHECK-NEXT: vle16.v v26, (a0)
+; CHECK-NEXT: vrgather.vv v25, v8, v26
+; CHECK-NEXT: vmv1r.v v8, v25
; CHECK-NEXT: ret
%s = shufflevector <4 x i16> undef, <4 x i16> %x, <4 x i32> <i32 5, i32 6, i32 4, i32 5>
ret <4 x i16> %s
define <4 x i16> @vrgather_shuffle_vv_v4i16(<4 x i16> %x, <4 x i16> %y) {
; CHECK-LABEL: vrgather_shuffle_vv_v4i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: vsetvli zero, zero, e16,m1,ta,mu
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: sh a0, 12(sp)
-; CHECK-NEXT: vsetivli a0, 1, e16,m1,ta,mu
-; CHECK-NEXT: vslidedown.vi v25, v9, 1
-; CHECK-NEXT: vmv.x.s a0, v25
-; CHECK-NEXT: sh a0, 14(sp)
-; CHECK-NEXT: vslidedown.vi v25, v8, 2
-; CHECK-NEXT: vmv.x.s a0, v25
-; CHECK-NEXT: sh a0, 10(sp)
-; CHECK-NEXT: vslidedown.vi v25, v8, 1
-; CHECK-NEXT: vmv.x.s a0, v25
-; CHECK-NEXT: sh a0, 8(sp)
-; CHECK-NEXT: vsetivli a0, 4, e16,m1,ta,mu
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: addi a0, zero, 1
+; CHECK-NEXT: vsetivli a1, 4, e16,m1,ta,mu
+; CHECK-NEXT: vmv.s.x v25, a0
+; CHECK-NEXT: vmv.v.i v26, 0
+; CHECK-NEXT: vsetivli a0, 4, e16,m1,tu,mu
+; CHECK-NEXT: vslideup.vi v26, v25, 3
+; CHECK-NEXT: addi a0, zero, 8
+; CHECK-NEXT: vsetivli a1, 1, e8,m1,ta,mu
+; CHECK-NEXT: vmv.s.x v0, a0
+; CHECK-NEXT: lui a0, %hi(.LCPI6_0)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI6_0)
+; CHECK-NEXT: vsetivli a1, 4, e16,m1,ta,mu
+; CHECK-NEXT: vle16.v v27, (a0)
+; CHECK-NEXT: vrgather.vv v25, v8, v27
+; CHECK-NEXT: vsetivli a0, 4, e16,m1,tu,mu
+; CHECK-NEXT: vrgather.vv v25, v9, v26, v0.t
+; CHECK-NEXT: vmv1r.v v8, v25
; CHECK-NEXT: ret
%s = shufflevector <4 x i16> %x, <4 x i16> %y, <4 x i32> <i32 1, i32 2, i32 0, i32 5>
ret <4 x i16> %s
}
define <4 x i16> @vrgather_shuffle_xv_v4i16(<4 x i16> %x) {
-; RV32-LABEL: vrgather_shuffle_xv_v4i16:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: lui a0, 80
-; RV32-NEXT: addi a0, a0, 5
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: vsetivli a0, 1, e16,m1,ta,mu
-; RV32-NEXT: vslidedown.vi v25, v8, 1
-; RV32-NEXT: vmv.x.s a0, v25
-; RV32-NEXT: sh a0, 14(sp)
-; RV32-NEXT: vslidedown.vi v25, v8, 2
-; RV32-NEXT: vmv.x.s a0, v25
-; RV32-NEXT: sh a0, 12(sp)
-; RV32-NEXT: vsetivli a0, 4, e16,m1,ta,mu
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: vle16.v v8, (a0)
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: ret
-;
-; RV64-LABEL: vrgather_shuffle_xv_v4i16:
-; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -16
-; RV64-NEXT: .cfi_def_cfa_offset 16
-; RV64-NEXT: lui a0, 80
-; RV64-NEXT: addiw a0, a0, 5
-; RV64-NEXT: sw a0, 8(sp)
-; RV64-NEXT: vsetivli a0, 1, e16,m1,ta,mu
-; RV64-NEXT: vslidedown.vi v25, v8, 1
-; RV64-NEXT: vmv.x.s a0, v25
-; RV64-NEXT: sh a0, 14(sp)
-; RV64-NEXT: vslidedown.vi v25, v8, 2
-; RV64-NEXT: vmv.x.s a0, v25
-; RV64-NEXT: sh a0, 12(sp)
-; RV64-NEXT: vsetivli a0, 4, e16,m1,ta,mu
-; RV64-NEXT: addi a0, sp, 8
-; RV64-NEXT: vle16.v v8, (a0)
-; RV64-NEXT: addi sp, sp, 16
-; RV64-NEXT: ret
+; CHECK-LABEL: vrgather_shuffle_xv_v4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, zero, 12
+; CHECK-NEXT: vsetivli a1, 1, e8,m1,ta,mu
+; CHECK-NEXT: vmv.s.x v0, a0
+; CHECK-NEXT: lui a0, %hi(.LCPI7_0)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI7_0)
+; CHECK-NEXT: vsetivli a1, 4, e16,m1,ta,mu
+; CHECK-NEXT: vle16.v v26, (a0)
+; CHECK-NEXT: vmv.v.i v25, 5
+; CHECK-NEXT: vsetivli a0, 4, e16,m1,tu,mu
+; CHECK-NEXT: vrgather.vv v25, v8, v26, v0.t
+; CHECK-NEXT: vmv1r.v v8, v25
+; CHECK-NEXT: ret
%s = shufflevector <4 x i16> <i16 5, i16 5, i16 5, i16 5>, <4 x i16> %x, <4 x i32> <i32 0, i32 3, i32 6, i32 5>
ret <4 x i16> %s
}
define <4 x i16> @vrgather_shuffle_vx_v4i16(<4 x i16> %x) {
-; RV32-LABEL: vrgather_shuffle_vx_v4i16:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: lui a0, 80
-; RV32-NEXT: addi a0, a0, 5
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: vsetvli zero, zero, e16,m1,ta,mu
-; RV32-NEXT: vmv.x.s a0, v8
-; RV32-NEXT: sh a0, 8(sp)
-; RV32-NEXT: vsetivli a0, 1, e16,m1,ta,mu
-; RV32-NEXT: vslidedown.vi v25, v8, 3
-; RV32-NEXT: vmv.x.s a0, v25
-; RV32-NEXT: sh a0, 10(sp)
-; RV32-NEXT: vsetivli a0, 4, e16,m1,ta,mu
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: vle16.v v8, (a0)
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: ret
-;
-; RV64-LABEL: vrgather_shuffle_vx_v4i16:
-; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -16
-; RV64-NEXT: .cfi_def_cfa_offset 16
-; RV64-NEXT: lui a0, 80
-; RV64-NEXT: addiw a0, a0, 5
-; RV64-NEXT: sw a0, 12(sp)
-; RV64-NEXT: vsetvli zero, zero, e16,m1,ta,mu
-; RV64-NEXT: vmv.x.s a0, v8
-; RV64-NEXT: sh a0, 8(sp)
-; RV64-NEXT: vsetivli a0, 1, e16,m1,ta,mu
-; RV64-NEXT: vslidedown.vi v25, v8, 3
-; RV64-NEXT: vmv.x.s a0, v25
-; RV64-NEXT: sh a0, 10(sp)
-; RV64-NEXT: vsetivli a0, 4, e16,m1,ta,mu
-; RV64-NEXT: addi a0, sp, 8
-; RV64-NEXT: vle16.v v8, (a0)
-; RV64-NEXT: addi sp, sp, 16
-; RV64-NEXT: ret
+; CHECK-LABEL: vrgather_shuffle_vx_v4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, zero, 3
+; CHECK-NEXT: vsetivli a1, 4, e16,m1,ta,mu
+; CHECK-NEXT: vmv.s.x v25, a0
+; CHECK-NEXT: vmv.v.i v26, 0
+; CHECK-NEXT: vsetivli a1, 2, e16,m1,tu,mu
+; CHECK-NEXT: vslideup.vi v26, v25, 1
+; CHECK-NEXT: vsetivli a1, 1, e8,m1,ta,mu
+; CHECK-NEXT: vmv.s.x v0, a0
+; CHECK-NEXT: vsetivli a0, 4, e16,m1,ta,mu
+; CHECK-NEXT: vmv.v.i v25, 5
+; CHECK-NEXT: vsetivli a0, 4, e16,m1,tu,mu
+; CHECK-NEXT: vrgather.vv v25, v8, v26, v0.t
+; CHECK-NEXT: vmv1r.v v8, v25
+; CHECK-NEXT: ret
%s = shufflevector <4 x i16> %x, <4 x i16> <i16 5, i16 5, i16 5, i16 5>, <4 x i32> <i32 0, i32 3, i32 6, i32 5>
ret <4 x i16> %s
}
define <8 x i64> @vrgather_permute_shuffle_vu_v8i64(<8 x i64> %x) {
; RV32-LABEL: vrgather_permute_shuffle_vu_v8i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -128
-; RV32-NEXT: .cfi_def_cfa_offset 128
-; RV32-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: .cfi_offset s0, -8
-; RV32-NEXT: addi s0, sp, 128
-; RV32-NEXT: .cfi_def_cfa s0, 0
-; RV32-NEXT: andi sp, sp, -64
-; RV32-NEXT: vsetvli zero, zero, e32,m4,ta,mu
-; RV32-NEXT: vmv.x.s a0, v8
-; RV32-NEXT: sw a0, 48(sp)
-; RV32-NEXT: sw a0, 16(sp)
-; RV32-NEXT: vsetivli a0, 1, e32,m4,ta,mu
-; RV32-NEXT: vslidedown.vi v28, v8, 3
-; RV32-NEXT: vmv.x.s a0, v28
-; RV32-NEXT: sw a0, 60(sp)
-; RV32-NEXT: vslidedown.vi v28, v8, 2
-; RV32-NEXT: vmv.x.s a1, v28
-; RV32-NEXT: sw a1, 56(sp)
-; RV32-NEXT: vslidedown.vi v28, v8, 1
-; RV32-NEXT: vmv.x.s a2, v28
-; RV32-NEXT: sw a2, 52(sp)
-; RV32-NEXT: vslidedown.vi v28, v8, 13
-; RV32-NEXT: vmv.x.s a3, v28
-; RV32-NEXT: sw a3, 44(sp)
-; RV32-NEXT: vslidedown.vi v28, v8, 12
-; RV32-NEXT: vmv.x.s a3, v28
-; RV32-NEXT: sw a3, 40(sp)
-; RV32-NEXT: vslidedown.vi v28, v8, 15
-; RV32-NEXT: vmv.x.s a3, v28
-; RV32-NEXT: sw a3, 36(sp)
-; RV32-NEXT: vslidedown.vi v28, v8, 14
-; RV32-NEXT: vmv.x.s a3, v28
-; RV32-NEXT: sw a3, 32(sp)
-; RV32-NEXT: sw a0, 28(sp)
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a2, 20(sp)
-; RV32-NEXT: vslidedown.vi v28, v8, 5
-; RV32-NEXT: vmv.x.s a2, v28
-; RV32-NEXT: sw a2, 12(sp)
-; RV32-NEXT: vslidedown.vi v28, v8, 4
-; RV32-NEXT: vmv.x.s a2, v28
-; RV32-NEXT: sw a2, 8(sp)
-; RV32-NEXT: sw a0, 4(sp)
-; RV32-NEXT: sw a1, 0(sp)
-; RV32-NEXT: vsetivli a0, 16, e32,m4,ta,mu
-; RV32-NEXT: vle32.v v8, (sp)
-; RV32-NEXT: addi sp, s0, -128
-; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 128
+; RV32-NEXT: lui a0, %hi(.LCPI9_0)
+; RV32-NEXT: addi a0, a0, %lo(.LCPI9_0)
+; RV32-NEXT: vsetivli a1, 8, e16,m1,ta,mu
+; RV32-NEXT: vle16.v v25, (a0)
+; RV32-NEXT: vsetivli a0, 8, e64,m4,ta,mu
+; RV32-NEXT: vrgatherei16.vv v28, v8, v25
+; RV32-NEXT: vmv4r.v v8, v28
; RV32-NEXT: ret
;
; RV64-LABEL: vrgather_permute_shuffle_vu_v8i64:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -128
-; RV64-NEXT: .cfi_def_cfa_offset 128
-; RV64-NEXT: sd ra, 120(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 112(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset ra, -8
-; RV64-NEXT: .cfi_offset s0, -16
-; RV64-NEXT: addi s0, sp, 128
-; RV64-NEXT: .cfi_def_cfa s0, 0
-; RV64-NEXT: andi sp, sp, -64
-; RV64-NEXT: vsetvli zero, zero, e64,m4,ta,mu
-; RV64-NEXT: vmv.x.s a0, v8
-; RV64-NEXT: sd a0, 48(sp)
-; RV64-NEXT: sd a0, 16(sp)
-; RV64-NEXT: vsetivli a0, 1, e64,m4,ta,mu
-; RV64-NEXT: vslidedown.vi v28, v8, 1
-; RV64-NEXT: vmv.x.s a0, v28
-; RV64-NEXT: sd a0, 56(sp)
-; RV64-NEXT: vslidedown.vi v28, v8, 6
-; RV64-NEXT: vmv.x.s a1, v28
-; RV64-NEXT: sd a1, 40(sp)
-; RV64-NEXT: vslidedown.vi v28, v8, 7
-; RV64-NEXT: vmv.x.s a1, v28
-; RV64-NEXT: sd a1, 32(sp)
-; RV64-NEXT: sd a0, 24(sp)
-; RV64-NEXT: vslidedown.vi v28, v8, 2
-; RV64-NEXT: vmv.x.s a1, v28
-; RV64-NEXT: sd a1, 8(sp)
-; RV64-NEXT: sd a0, 0(sp)
-; RV64-NEXT: vsetivli a0, 8, e64,m4,ta,mu
-; RV64-NEXT: vle64.v v8, (sp)
-; RV64-NEXT: addi sp, s0, -128
-; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 128
+; RV64-NEXT: lui a0, %hi(.LCPI9_0)
+; RV64-NEXT: addi a0, a0, %lo(.LCPI9_0)
+; RV64-NEXT: vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT: vle64.v v12, (a0)
+; RV64-NEXT: vrgather.vv v28, v8, v12
+; RV64-NEXT: vmv4r.v v8, v28
; RV64-NEXT: ret
%s = shufflevector <8 x i64> %x, <8 x i64> undef, <8 x i32> <i32 1, i32 2, i32 0, i32 1, i32 7, i32 6, i32 0, i32 1>
ret <8 x i64> %s
define <8 x i64> @vrgather_permute_shuffle_uv_v8i64(<8 x i64> %x) {
; RV32-LABEL: vrgather_permute_shuffle_uv_v8i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -128
-; RV32-NEXT: .cfi_def_cfa_offset 128
-; RV32-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: .cfi_offset s0, -8
-; RV32-NEXT: addi s0, sp, 128
-; RV32-NEXT: .cfi_def_cfa s0, 0
-; RV32-NEXT: andi sp, sp, -64
-; RV32-NEXT: vsetvli zero, zero, e32,m4,ta,mu
-; RV32-NEXT: vmv.x.s a0, v8
-; RV32-NEXT: sw a0, 48(sp)
-; RV32-NEXT: sw a0, 40(sp)
-; RV32-NEXT: sw a0, 16(sp)
-; RV32-NEXT: vsetivli a0, 1, e32,m4,ta,mu
-; RV32-NEXT: vslidedown.vi v28, v8, 7
-; RV32-NEXT: vmv.x.s a0, v28
-; RV32-NEXT: sw a0, 60(sp)
-; RV32-NEXT: vslidedown.vi v28, v8, 6
-; RV32-NEXT: vmv.x.s a0, v28
-; RV32-NEXT: sw a0, 56(sp)
-; RV32-NEXT: vslidedown.vi v28, v8, 1
-; RV32-NEXT: vmv.x.s a0, v28
-; RV32-NEXT: sw a0, 52(sp)
-; RV32-NEXT: sw a0, 44(sp)
-; RV32-NEXT: vslidedown.vi v28, v8, 15
-; RV32-NEXT: vmv.x.s a1, v28
-; RV32-NEXT: sw a1, 36(sp)
-; RV32-NEXT: vslidedown.vi v28, v8, 14
-; RV32-NEXT: vmv.x.s a1, v28
-; RV32-NEXT: sw a1, 32(sp)
-; RV32-NEXT: vslidedown.vi v28, v8, 3
-; RV32-NEXT: vmv.x.s a1, v28
-; RV32-NEXT: sw a1, 28(sp)
-; RV32-NEXT: vslidedown.vi v28, v8, 2
-; RV32-NEXT: vmv.x.s a2, v28
-; RV32-NEXT: sw a2, 24(sp)
-; RV32-NEXT: sw a0, 20(sp)
-; RV32-NEXT: vslidedown.vi v28, v8, 5
-; RV32-NEXT: vmv.x.s a0, v28
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: vslidedown.vi v28, v8, 4
-; RV32-NEXT: vmv.x.s a0, v28
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: sw a1, 4(sp)
-; RV32-NEXT: sw a2, 0(sp)
-; RV32-NEXT: vsetivli a0, 16, e32,m4,ta,mu
-; RV32-NEXT: vle32.v v8, (sp)
-; RV32-NEXT: addi sp, s0, -128
-; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 128
+; RV32-NEXT: lui a0, %hi(.LCPI10_0)
+; RV32-NEXT: addi a0, a0, %lo(.LCPI10_0)
+; RV32-NEXT: vsetivli a1, 8, e16,m1,ta,mu
+; RV32-NEXT: vle16.v v25, (a0)
+; RV32-NEXT: vsetivli a0, 8, e64,m4,ta,mu
+; RV32-NEXT: vrgatherei16.vv v28, v8, v25
+; RV32-NEXT: vmv4r.v v8, v28
; RV32-NEXT: ret
;
; RV64-LABEL: vrgather_permute_shuffle_uv_v8i64:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -128
-; RV64-NEXT: .cfi_def_cfa_offset 128
-; RV64-NEXT: sd ra, 120(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 112(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset ra, -8
-; RV64-NEXT: .cfi_offset s0, -16
-; RV64-NEXT: addi s0, sp, 128
-; RV64-NEXT: .cfi_def_cfa s0, 0
-; RV64-NEXT: andi sp, sp, -64
-; RV64-NEXT: vsetvli zero, zero, e64,m4,ta,mu
-; RV64-NEXT: vmv.x.s a0, v8
-; RV64-NEXT: sd a0, 48(sp)
-; RV64-NEXT: sd a0, 40(sp)
-; RV64-NEXT: sd a0, 16(sp)
-; RV64-NEXT: vsetivli a0, 1, e64,m4,ta,mu
-; RV64-NEXT: vslidedown.vi v28, v8, 3
-; RV64-NEXT: vmv.x.s a0, v28
-; RV64-NEXT: sd a0, 56(sp)
-; RV64-NEXT: vslidedown.vi v28, v8, 7
-; RV64-NEXT: vmv.x.s a0, v28
-; RV64-NEXT: sd a0, 32(sp)
-; RV64-NEXT: vslidedown.vi v28, v8, 1
-; RV64-NEXT: vmv.x.s a0, v28
-; RV64-NEXT: sd a0, 24(sp)
-; RV64-NEXT: vslidedown.vi v28, v8, 2
-; RV64-NEXT: vmv.x.s a1, v28
-; RV64-NEXT: sd a1, 8(sp)
-; RV64-NEXT: sd a0, 0(sp)
-; RV64-NEXT: vsetivli a0, 8, e64,m4,ta,mu
-; RV64-NEXT: vle64.v v8, (sp)
-; RV64-NEXT: addi sp, s0, -128
-; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 128
+; RV64-NEXT: lui a0, %hi(.LCPI10_0)
+; RV64-NEXT: addi a0, a0, %lo(.LCPI10_0)
+; RV64-NEXT: vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT: vle64.v v12, (a0)
+; RV64-NEXT: vrgather.vv v28, v8, v12
+; RV64-NEXT: vmv4r.v v8, v28
; RV64-NEXT: ret
%s = shufflevector <8 x i64> undef, <8 x i64> %x, <8 x i32> <i32 9, i32 10, i32 8, i32 9, i32 15, i32 8, i32 8, i32 11>
ret <8 x i64> %s
define <8 x i64> @vrgather_shuffle_vv_v8i64(<8 x i64> %x, <8 x i64> %y) {
; RV32-LABEL: vrgather_shuffle_vv_v8i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -128
-; RV32-NEXT: .cfi_def_cfa_offset 128
-; RV32-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: .cfi_offset s0, -8
-; RV32-NEXT: addi s0, sp, 128
-; RV32-NEXT: .cfi_def_cfa s0, 0
-; RV32-NEXT: andi sp, sp, -64
-; RV32-NEXT: vsetivli a0, 1, e32,m4,ta,mu
-; RV32-NEXT: vslidedown.vi v28, v12, 11
-; RV32-NEXT: vmv.x.s a0, v28
-; RV32-NEXT: sw a0, 60(sp)
-; RV32-NEXT: vslidedown.vi v28, v12, 10
-; RV32-NEXT: vmv.x.s a0, v28
-; RV32-NEXT: sw a0, 56(sp)
-; RV32-NEXT: vslidedown.vi v28, v8, 7
-; RV32-NEXT: vmv.x.s a0, v28
-; RV32-NEXT: sw a0, 52(sp)
-; RV32-NEXT: vslidedown.vi v28, v8, 6
-; RV32-NEXT: vmv.x.s a0, v28
-; RV32-NEXT: sw a0, 48(sp)
-; RV32-NEXT: vslidedown.vi v28, v12, 5
-; RV32-NEXT: vmv.x.s a0, v28
-; RV32-NEXT: sw a0, 44(sp)
-; RV32-NEXT: vslidedown.vi v28, v12, 4
-; RV32-NEXT: vmv.x.s a1, v28
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: vslidedown.vi v28, v8, 3
-; RV32-NEXT: vmv.x.s a2, v28
-; RV32-NEXT: sw a2, 36(sp)
-; RV32-NEXT: vslidedown.vi v28, v8, 2
-; RV32-NEXT: vmv.x.s a3, v28
-; RV32-NEXT: sw a3, 32(sp)
-; RV32-NEXT: vslidedown.vi v28, v8, 11
-; RV32-NEXT: vmv.x.s a4, v28
-; RV32-NEXT: sw a4, 28(sp)
-; RV32-NEXT: vslidedown.vi v28, v8, 10
-; RV32-NEXT: vmv.x.s a4, v28
-; RV32-NEXT: sw a4, 24(sp)
-; RV32-NEXT: sw a0, 20(sp)
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: vslidedown.vi v28, v8, 5
-; RV32-NEXT: vmv.x.s a0, v28
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: vslidedown.vi v28, v8, 4
-; RV32-NEXT: vmv.x.s a0, v28
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: sw a2, 4(sp)
-; RV32-NEXT: sw a3, 0(sp)
-; RV32-NEXT: vsetivli a0, 16, e32,m4,ta,mu
-; RV32-NEXT: vle32.v v8, (sp)
-; RV32-NEXT: addi sp, s0, -128
-; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 128
+; RV32-NEXT: addi a0, zero, 5
+; RV32-NEXT: vsetivli a1, 8, e16,m1,ta,mu
+; RV32-NEXT: vmv.s.x v25, a0
+; RV32-NEXT: addi a0, zero, 36
+; RV32-NEXT: vsetivli a1, 1, e8,m1,ta,mu
+; RV32-NEXT: vmv.s.x v0, a0
+; RV32-NEXT: vsetivli a0, 8, e16,m1,ta,mu
+; RV32-NEXT: vmv.v.i v26, 0
+; RV32-NEXT: vmerge.vim v26, v26, 2, v0
+; RV32-NEXT: vsetivli a0, 8, e16,m1,tu,mu
+; RV32-NEXT: vslideup.vi v26, v25, 7
+; RV32-NEXT: addi a0, zero, 164
+; RV32-NEXT: vsetivli a1, 1, e8,m1,ta,mu
+; RV32-NEXT: vmv.s.x v0, a0
+; RV32-NEXT: lui a0, %hi(.LCPI11_0)
+; RV32-NEXT: addi a0, a0, %lo(.LCPI11_0)
+; RV32-NEXT: vsetivli a1, 8, e16,m1,ta,mu
+; RV32-NEXT: vle16.v v25, (a0)
+; RV32-NEXT: vsetivli a0, 8, e64,m4,ta,mu
+; RV32-NEXT: vrgatherei16.vv v28, v8, v25
+; RV32-NEXT: vsetivli a0, 8, e64,m4,tu,mu
+; RV32-NEXT: vrgatherei16.vv v28, v12, v26, v0.t
+; RV32-NEXT: vmv4r.v v8, v28
; RV32-NEXT: ret
;
; RV64-LABEL: vrgather_shuffle_vv_v8i64:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -128
-; RV64-NEXT: .cfi_def_cfa_offset 128
-; RV64-NEXT: sd ra, 120(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 112(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset ra, -8
-; RV64-NEXT: .cfi_offset s0, -16
-; RV64-NEXT: addi s0, sp, 128
-; RV64-NEXT: .cfi_def_cfa s0, 0
-; RV64-NEXT: andi sp, sp, -64
-; RV64-NEXT: vsetivli a0, 1, e64,m4,ta,mu
-; RV64-NEXT: vslidedown.vi v28, v12, 5
-; RV64-NEXT: vmv.x.s a0, v28
-; RV64-NEXT: sd a0, 56(sp)
-; RV64-NEXT: vslidedown.vi v28, v8, 3
-; RV64-NEXT: vmv.x.s a0, v28
-; RV64-NEXT: sd a0, 48(sp)
-; RV64-NEXT: vslidedown.vi v28, v12, 2
-; RV64-NEXT: vmv.x.s a0, v28
-; RV64-NEXT: sd a0, 40(sp)
-; RV64-NEXT: vslidedown.vi v28, v8, 1
-; RV64-NEXT: vmv.x.s a1, v28
-; RV64-NEXT: sd a1, 32(sp)
-; RV64-NEXT: vslidedown.vi v28, v8, 5
-; RV64-NEXT: vmv.x.s a2, v28
-; RV64-NEXT: sd a2, 24(sp)
-; RV64-NEXT: sd a0, 16(sp)
-; RV64-NEXT: vslidedown.vi v28, v8, 2
-; RV64-NEXT: vmv.x.s a0, v28
-; RV64-NEXT: sd a0, 8(sp)
-; RV64-NEXT: sd a1, 0(sp)
+; RV64-NEXT: addi a0, zero, 5
+; RV64-NEXT: vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT: vmv.s.x v28, a0
+; RV64-NEXT: addi a0, zero, 36
+; RV64-NEXT: vsetivli a1, 1, e8,m1,ta,mu
+; RV64-NEXT: vmv.s.x v0, a0
; RV64-NEXT: vsetivli a0, 8, e64,m4,ta,mu
-; RV64-NEXT: vle64.v v8, (sp)
-; RV64-NEXT: addi sp, s0, -128
-; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 128
+; RV64-NEXT: vmv.v.i v16, 0
+; RV64-NEXT: vmerge.vim v16, v16, 2, v0
+; RV64-NEXT: vsetivli a0, 8, e64,m4,tu,mu
+; RV64-NEXT: vslideup.vi v16, v28, 7
+; RV64-NEXT: addi a0, zero, 164
+; RV64-NEXT: vsetivli a1, 1, e8,m1,ta,mu
+; RV64-NEXT: vmv.s.x v0, a0
+; RV64-NEXT: lui a0, %hi(.LCPI11_0)
+; RV64-NEXT: addi a0, a0, %lo(.LCPI11_0)
+; RV64-NEXT: vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT: vle64.v v20, (a0)
+; RV64-NEXT: vrgather.vv v28, v8, v20
+; RV64-NEXT: vsetivli a0, 8, e64,m4,tu,mu
+; RV64-NEXT: vrgather.vv v28, v12, v16, v0.t
+; RV64-NEXT: vmv4r.v v8, v28
; RV64-NEXT: ret
%s = shufflevector <8 x i64> %x, <8 x i64> %y, <8 x i32> <i32 1, i32 2, i32 10, i32 5, i32 1, i32 10, i32 3, i32 13>
ret <8 x i64> %s
define <8 x i64> @vrgather_shuffle_xv_v8i64(<8 x i64> %x) {
; RV32-LABEL: vrgather_shuffle_xv_v8i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -128
-; RV32-NEXT: .cfi_def_cfa_offset 128
-; RV32-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: .cfi_offset s0, -8
-; RV32-NEXT: addi s0, sp, 128
-; RV32-NEXT: .cfi_def_cfa s0, 0
-; RV32-NEXT: andi sp, sp, -64
-; RV32-NEXT: addi a0, zero, -1
-; RV32-NEXT: sw a0, 60(sp)
-; RV32-NEXT: sw a0, 56(sp)
-; RV32-NEXT: sw a0, 28(sp)
-; RV32-NEXT: sw a0, 24(sp)
-; RV32-NEXT: sw a0, 20(sp)
-; RV32-NEXT: sw a0, 16(sp)
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: vsetvli zero, zero, e32,m4,ta,mu
-; RV32-NEXT: vmv.x.s a0, v8
-; RV32-NEXT: sw a0, 32(sp)
-; RV32-NEXT: sw a0, 0(sp)
-; RV32-NEXT: vsetivli a0, 1, e32,m4,ta,mu
-; RV32-NEXT: vslidedown.vi v28, v8, 13
-; RV32-NEXT: vmv.x.s a0, v28
-; RV32-NEXT: sw a0, 52(sp)
-; RV32-NEXT: vslidedown.vi v28, v8, 12
-; RV32-NEXT: vmv.x.s a0, v28
-; RV32-NEXT: sw a0, 48(sp)
-; RV32-NEXT: vslidedown.vi v28, v8, 9
-; RV32-NEXT: vmv.x.s a0, v28
-; RV32-NEXT: sw a0, 44(sp)
-; RV32-NEXT: vslidedown.vi v28, v8, 8
-; RV32-NEXT: vmv.x.s a0, v28
-; RV32-NEXT: sw a0, 40(sp)
-; RV32-NEXT: vslidedown.vi v28, v8, 1
-; RV32-NEXT: vmv.x.s a0, v28
-; RV32-NEXT: sw a0, 36(sp)
-; RV32-NEXT: sw a0, 4(sp)
+; RV32-NEXT: addi a0, zero, 6
+; RV32-NEXT: vsetivli a1, 8, e16,m1,ta,mu
+; RV32-NEXT: vmv.s.x v25, a0
+; RV32-NEXT: addi a0, zero, 4
+; RV32-NEXT: vmv.s.x v26, a0
+; RV32-NEXT: vmv.v.i v27, 0
+; RV32-NEXT: vsetivli a0, 6, e16,m1,tu,mu
+; RV32-NEXT: vslideup.vi v27, v26, 5
+; RV32-NEXT: vsetivli a0, 7, e16,m1,tu,mu
+; RV32-NEXT: vslideup.vi v27, v25, 6
+; RV32-NEXT: addi a0, zero, 113
+; RV32-NEXT: vsetivli a1, 1, e8,m1,ta,mu
+; RV32-NEXT: vmv.s.x v0, a0
+; RV32-NEXT: lui a0, %hi(.LCPI12_0)
+; RV32-NEXT: addi a0, a0, %lo(.LCPI12_0)
+; RV32-NEXT: vsetivli a1, 8, e16,m1,ta,mu
+; RV32-NEXT: vle16.v v25, (a0)
; RV32-NEXT: vsetivli a0, 16, e32,m4,ta,mu
-; RV32-NEXT: vle32.v v8, (sp)
-; RV32-NEXT: addi sp, s0, -128
-; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 128
+; RV32-NEXT: vmv.v.i v12, -1
+; RV32-NEXT: vsetivli a0, 8, e64,m4,ta,mu
+; RV32-NEXT: vrgatherei16.vv v28, v12, v25
+; RV32-NEXT: vsetivli a0, 8, e64,m4,tu,mu
+; RV32-NEXT: vrgatherei16.vv v28, v8, v27, v0.t
+; RV32-NEXT: vmv4r.v v8, v28
; RV32-NEXT: ret
;
; RV64-LABEL: vrgather_shuffle_xv_v8i64:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -128
-; RV64-NEXT: .cfi_def_cfa_offset 128
-; RV64-NEXT: sd ra, 120(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 112(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset ra, -8
-; RV64-NEXT: .cfi_offset s0, -16
-; RV64-NEXT: addi s0, sp, 128
-; RV64-NEXT: .cfi_def_cfa s0, 0
-; RV64-NEXT: andi sp, sp, -64
-; RV64-NEXT: addi a0, zero, -1
-; RV64-NEXT: sd a0, 56(sp)
-; RV64-NEXT: sd a0, 24(sp)
-; RV64-NEXT: sd a0, 16(sp)
-; RV64-NEXT: sd a0, 8(sp)
-; RV64-NEXT: vsetvli zero, zero, e64,m4,ta,mu
-; RV64-NEXT: vmv.x.s a0, v8
-; RV64-NEXT: sd a0, 32(sp)
-; RV64-NEXT: sd a0, 0(sp)
-; RV64-NEXT: vsetivli a0, 1, e64,m4,ta,mu
-; RV64-NEXT: vslidedown.vi v28, v8, 6
-; RV64-NEXT: vmv.x.s a0, v28
-; RV64-NEXT: sd a0, 48(sp)
-; RV64-NEXT: vslidedown.vi v28, v8, 4
-; RV64-NEXT: vmv.x.s a0, v28
-; RV64-NEXT: sd a0, 40(sp)
+; RV64-NEXT: addi a0, zero, 6
+; RV64-NEXT: vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT: vmv.s.x v28, a0
+; RV64-NEXT: addi a0, zero, 4
+; RV64-NEXT: vmv.s.x v12, a0
+; RV64-NEXT: vmv.v.i v16, 0
+; RV64-NEXT: vsetivli a0, 6, e64,m4,tu,mu
+; RV64-NEXT: vslideup.vi v16, v12, 5
+; RV64-NEXT: vsetivli a0, 7, e64,m4,tu,mu
+; RV64-NEXT: vslideup.vi v16, v28, 6
+; RV64-NEXT: addi a0, zero, 113
+; RV64-NEXT: vsetivli a1, 1, e8,m1,ta,mu
+; RV64-NEXT: vmv.s.x v0, a0
; RV64-NEXT: vsetivli a0, 8, e64,m4,ta,mu
-; RV64-NEXT: vle64.v v8, (sp)
-; RV64-NEXT: addi sp, s0, -128
-; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 128
+; RV64-NEXT: vmv.v.i v28, -1
+; RV64-NEXT: vsetivli a0, 8, e64,m4,tu,mu
+; RV64-NEXT: vrgather.vv v28, v8, v16, v0.t
+; RV64-NEXT: vmv4r.v v8, v28
; RV64-NEXT: ret
%s = shufflevector <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, <8 x i64> %x, <8 x i32> <i32 8, i32 3, i32 6, i32 5, i32 8, i32 12, i32 14, i32 3>
ret <8 x i64> %s
define <8 x i64> @vrgather_shuffle_vx_v8i64(<8 x i64> %x) {
; RV32-LABEL: vrgather_shuffle_vx_v8i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -128
-; RV32-NEXT: .cfi_def_cfa_offset 128
-; RV32-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: .cfi_offset s0, -8
-; RV32-NEXT: addi s0, sp, 128
-; RV32-NEXT: .cfi_def_cfa s0, 0
-; RV32-NEXT: andi sp, sp, -64
-; RV32-NEXT: sw zero, 60(sp)
-; RV32-NEXT: addi a0, zero, 5
-; RV32-NEXT: sw a0, 56(sp)
-; RV32-NEXT: sw zero, 28(sp)
-; RV32-NEXT: sw a0, 24(sp)
-; RV32-NEXT: sw zero, 20(sp)
-; RV32-NEXT: sw a0, 16(sp)
-; RV32-NEXT: vsetvli zero, zero, e32,m4,ta,mu
-; RV32-NEXT: vmv.x.s a0, v8
-; RV32-NEXT: sw a0, 0(sp)
-; RV32-NEXT: vsetivli a0, 1, e32,m4,ta,mu
-; RV32-NEXT: vslidedown.vi v28, v8, 15
-; RV32-NEXT: vmv.x.s a0, v28
-; RV32-NEXT: sw a0, 52(sp)
-; RV32-NEXT: vslidedown.vi v28, v8, 14
-; RV32-NEXT: vmv.x.s a0, v28
-; RV32-NEXT: sw a0, 48(sp)
-; RV32-NEXT: vslidedown.vi v28, v8, 3
-; RV32-NEXT: vmv.x.s a0, v28
-; RV32-NEXT: sw a0, 44(sp)
-; RV32-NEXT: vslidedown.vi v28, v8, 2
-; RV32-NEXT: vmv.x.s a0, v28
-; RV32-NEXT: sw a0, 40(sp)
-; RV32-NEXT: vslidedown.vi v28, v8, 9
-; RV32-NEXT: vmv.x.s a0, v28
-; RV32-NEXT: sw a0, 36(sp)
-; RV32-NEXT: vslidedown.vi v28, v8, 8
-; RV32-NEXT: vmv.x.s a0, v28
-; RV32-NEXT: sw a0, 32(sp)
-; RV32-NEXT: vslidedown.vi v28, v8, 7
-; RV32-NEXT: vmv.x.s a0, v28
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: vslidedown.vi v28, v8, 6
-; RV32-NEXT: vmv.x.s a0, v28
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: vslidedown.vi v28, v8, 1
-; RV32-NEXT: vmv.x.s a0, v28
-; RV32-NEXT: sw a0, 4(sp)
-; RV32-NEXT: vsetivli a0, 16, e32,m4,ta,mu
-; RV32-NEXT: vle32.v v8, (sp)
-; RV32-NEXT: addi sp, s0, -128
-; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 128
+; RV32-NEXT: addi a0, zero, 140
+; RV32-NEXT: vsetivli a1, 1, e8,m1,ta,mu
+; RV32-NEXT: vmv.s.x v0, a0
+; RV32-NEXT: lui a0, %hi(.LCPI13_0)
+; RV32-NEXT: addi a0, a0, %lo(.LCPI13_0)
+; RV32-NEXT: vsetivli a1, 8, e16,m1,ta,mu
+; RV32-NEXT: vle16.v v25, (a0)
+; RV32-NEXT: vsetivli a0, 8, e64,m4,ta,mu
+; RV32-NEXT: vrgatherei16.vv v28, v8, v25
+; RV32-NEXT: lui a0, %hi(.LCPI13_1)
+; RV32-NEXT: addi a0, a0, %lo(.LCPI13_1)
+; RV32-NEXT: vsetivli a1, 8, e16,m1,ta,mu
+; RV32-NEXT: vle16.v v25, (a0)
+; RV32-NEXT: vsetivli a0, 8, e64,m4,ta,mu
+; RV32-NEXT: vmv.v.i v8, 5
+; RV32-NEXT: vsetivli a0, 8, e64,m4,tu,mu
+; RV32-NEXT: vrgatherei16.vv v28, v8, v25, v0.t
+; RV32-NEXT: vmv4r.v v8, v28
; RV32-NEXT: ret
;
; RV64-LABEL: vrgather_shuffle_vx_v8i64:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -128
-; RV64-NEXT: .cfi_def_cfa_offset 128
-; RV64-NEXT: sd ra, 120(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 112(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset ra, -8
-; RV64-NEXT: .cfi_offset s0, -16
-; RV64-NEXT: addi s0, sp, 128
-; RV64-NEXT: .cfi_def_cfa s0, 0
-; RV64-NEXT: andi sp, sp, -64
-; RV64-NEXT: addi a0, zero, 5
-; RV64-NEXT: sd a0, 56(sp)
-; RV64-NEXT: sd a0, 24(sp)
-; RV64-NEXT: sd a0, 16(sp)
-; RV64-NEXT: vsetvli zero, zero, e64,m4,ta,mu
-; RV64-NEXT: vmv.x.s a0, v8
-; RV64-NEXT: sd a0, 0(sp)
-; RV64-NEXT: vsetivli a0, 1, e64,m4,ta,mu
-; RV64-NEXT: vslidedown.vi v28, v8, 7
-; RV64-NEXT: vmv.x.s a0, v28
-; RV64-NEXT: sd a0, 48(sp)
-; RV64-NEXT: vslidedown.vi v28, v8, 1
-; RV64-NEXT: vmv.x.s a0, v28
-; RV64-NEXT: sd a0, 40(sp)
-; RV64-NEXT: vslidedown.vi v28, v8, 4
-; RV64-NEXT: vmv.x.s a0, v28
-; RV64-NEXT: sd a0, 32(sp)
-; RV64-NEXT: vslidedown.vi v28, v8, 3
-; RV64-NEXT: vmv.x.s a0, v28
-; RV64-NEXT: sd a0, 8(sp)
-; RV64-NEXT: vsetivli a0, 8, e64,m4,ta,mu
-; RV64-NEXT: vle64.v v8, (sp)
-; RV64-NEXT: addi sp, s0, -128
-; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 128
+; RV64-NEXT: addi a0, zero, 115
+; RV64-NEXT: vsetivli a1, 1, e8,m1,ta,mu
+; RV64-NEXT: vmv.s.x v0, a0
+; RV64-NEXT: lui a0, %hi(.LCPI13_0)
+; RV64-NEXT: addi a0, a0, %lo(.LCPI13_0)
+; RV64-NEXT: vsetivli a1, 8, e64,m4,ta,mu
+; RV64-NEXT: vle64.v v12, (a0)
+; RV64-NEXT: vmv.v.i v28, 5
+; RV64-NEXT: vsetivli a0, 8, e64,m4,tu,mu
+; RV64-NEXT: vrgather.vv v28, v8, v12, v0.t
+; RV64-NEXT: vmv4r.v v8, v28
; RV64-NEXT: ret
%s = shufflevector <8 x i64> %x, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>, <8 x i32> <i32 0, i32 3, i32 10, i32 9, i32 4, i32 1, i32 7, i32 14>
ret <8 x i64> %s