break;
case ISD::VECREDUCE_SEQ_FADD:
case ISD::VECREDUCE_SEQ_FMUL:
+ case ISD::VP_REDUCE_FADD:
+ case ISD::VP_REDUCE_FMUL:
+ case ISD::VP_REDUCE_ADD:
+ case ISD::VP_REDUCE_MUL:
+ case ISD::VP_REDUCE_AND:
+ case ISD::VP_REDUCE_OR:
+ case ISD::VP_REDUCE_XOR:
+ case ISD::VP_REDUCE_SMAX:
+ case ISD::VP_REDUCE_SMIN:
+ case ISD::VP_REDUCE_UMAX:
+ case ISD::VP_REDUCE_UMIN:
+ case ISD::VP_REDUCE_FMAX:
+ case ISD::VP_REDUCE_FMIN:
+ case ISD::VP_REDUCE_SEQ_FADD:
+ case ISD::VP_REDUCE_SEQ_FMUL:
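+    // These reductions carry their scalar start value in operand 0, so the
+    // legality query is keyed off the vector operand's type rather than the
+    // scalar result type.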
Action = TLI.getOperationAction(
Node->getOpcode(), Node->getOperand(1).getValueType());
break;
Res = PromoteIntRes_VECREDUCE(N);
break;
+ case ISD::VP_REDUCE_ADD:
+ case ISD::VP_REDUCE_MUL:
+ case ISD::VP_REDUCE_AND:
+ case ISD::VP_REDUCE_OR:
+ case ISD::VP_REDUCE_XOR:
+ case ISD::VP_REDUCE_SMAX:
+ case ISD::VP_REDUCE_SMIN:
+ case ISD::VP_REDUCE_UMAX:
+ case ISD::VP_REDUCE_UMIN:
+ Res = PromoteIntRes_VP_REDUCE(N);
+ break;
+
case ISD::FREEZE:
Res = PromoteIntRes_FREEZE(N);
break;
case ISD::VECREDUCE_SMIN:
case ISD::VECREDUCE_UMAX:
case ISD::VECREDUCE_UMIN: Res = PromoteIntOp_VECREDUCE(N); break;
+ case ISD::VP_REDUCE_ADD:
+ case ISD::VP_REDUCE_MUL:
+ case ISD::VP_REDUCE_AND:
+ case ISD::VP_REDUCE_OR:
+ case ISD::VP_REDUCE_XOR:
+ case ISD::VP_REDUCE_SMAX:
+ case ISD::VP_REDUCE_SMIN:
+ case ISD::VP_REDUCE_UMAX:
+ case ISD::VP_REDUCE_UMIN:
+ Res = PromoteIntOp_VP_REDUCE(N, OpNo);
+ break;
case ISD::SET_ROUNDING: Res = PromoteIntOp_SET_ROUNDING(N); break;
}
return SDValue();
}
-SDValue DAGTypeLegalizer::PromoteIntOp_VECREDUCE(SDNode *N) {
- SDLoc dl(N);
- SDValue Op;
+static unsigned getExtendForIntVecReduction(SDNode *N) {
switch (N->getOpcode()) {
- default: llvm_unreachable("Expected integer vector reduction");
+ default:
+ llvm_unreachable("Expected integer vector reduction");
case ISD::VECREDUCE_ADD:
case ISD::VECREDUCE_MUL:
case ISD::VECREDUCE_AND:
case ISD::VECREDUCE_OR:
case ISD::VECREDUCE_XOR:
- Op = GetPromotedInteger(N->getOperand(0));
- break;
+ case ISD::VP_REDUCE_ADD:
+ case ISD::VP_REDUCE_MUL:
+ case ISD::VP_REDUCE_AND:
+ case ISD::VP_REDUCE_OR:
+ case ISD::VP_REDUCE_XOR:
+ return ISD::ANY_EXTEND;
case ISD::VECREDUCE_SMAX:
case ISD::VECREDUCE_SMIN:
- Op = SExtPromotedInteger(N->getOperand(0));
- break;
+ case ISD::VP_REDUCE_SMAX:
+ case ISD::VP_REDUCE_SMIN:
+ return ISD::SIGN_EXTEND;
case ISD::VECREDUCE_UMAX:
case ISD::VECREDUCE_UMIN:
- Op = ZExtPromotedInteger(N->getOperand(0));
- break;
+ case ISD::VP_REDUCE_UMAX:
+ case ISD::VP_REDUCE_UMIN:
+ return ISD::ZERO_EXTEND;
}
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOpVectorReduction(SDNode *N, SDValue V) {
+ switch (getExtendForIntVecReduction(N)) {
+ default:
+ llvm_unreachable("Impossible extension kind for integer reduction");
+ case ISD::ANY_EXTEND:
+ return GetPromotedInteger(V);
+ case ISD::SIGN_EXTEND:
+ return SExtPromotedInteger(V);
+ case ISD::ZERO_EXTEND:
+ return ZExtPromotedInteger(V);
+ }
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_VECREDUCE(SDNode *N) {
+ SDLoc dl(N);
+ SDValue Op = PromoteIntOpVectorReduction(N, N->getOperand(0));
EVT EltVT = Op.getValueType().getVectorElementType();
EVT VT = N->getValueType(0);
+
if (VT.bitsGE(EltVT))
return DAG.getNode(N->getOpcode(), SDLoc(N), VT, Op);

  // Result size must be >= element size. If this is not the case after
  // promotion, also promote the result type and then truncate.
  SDValue Reduce = DAG.getNode(N->getOpcode(), dl, EltVT, Op);
  return DAG.getNode(ISD::TRUNCATE, dl, VT, Reduce);
}
+SDValue DAGTypeLegalizer::PromoteIntOp_VP_REDUCE(SDNode *N, unsigned OpNo) {
+ SDLoc DL(N);
+ SDValue Op = N->getOperand(OpNo);
+ SmallVector<SDValue, 4> NewOps(N->op_begin(), N->op_end());
+
+ if (OpNo == 2) { // Mask
+ // Update in place.
+ NewOps[2] = PromoteTargetBoolean(Op, N->getOperand(1).getValueType());
+ return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+ }
+
+ assert(OpNo == 1 && "Unexpected operand for promotion");
+
+ Op = PromoteIntOpVectorReduction(N, Op);
+
+ NewOps[OpNo] = Op;
+
+ EVT VT = N->getValueType(0);
+ EVT EltVT = Op.getValueType().getScalarType();
+
+ if (VT.bitsGE(EltVT))
+ return DAG.getNode(N->getOpcode(), SDLoc(N), VT, NewOps);
+
+ // Result size must be >= element/start-value size. If this is not the case
+ // after promotion, also promote both the start value and result type and
+ // then truncate.
+ NewOps[0] =
+ DAG.getNode(getExtendForIntVecReduction(N), DL, EltVT, N->getOperand(0));
+ SDValue Reduce = DAG.getNode(N->getOpcode(), DL, EltVT, NewOps);
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, Reduce);
+}
+
SDValue DAGTypeLegalizer::PromoteIntOp_SET_ROUNDING(SDNode *N) {
SDValue Op = ZExtPromotedInteger(N->getOperand(1));
return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Op), 0);
}

SDValue DAGTypeLegalizer::PromoteIntRes_VECREDUCE(SDNode *N) {
  // The VECREDUCE result size may be larger than the element size, so we can
  // simply change the result type.
SDLoc dl(N);
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
- return DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0));
+ return DAG.getNode(N->getOpcode(), dl, NVT, N->ops());
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_VP_REDUCE(SDNode *N) {
+ // The VP_REDUCE result size may be larger than the element size, so we can
+  // simply change the result type. However, the start value and the result
+  // must have the same type.
+ SDLoc DL(N);
+ SDValue Start = PromoteIntOpVectorReduction(N, N->getOperand(0));
+ return DAG.getNode(N->getOpcode(), DL, Start.getValueType(), Start,
+ N->getOperand(1), N->getOperand(2), N->getOperand(3));
}
SDValue DAGTypeLegalizer::PromoteIntOp_EXTRACT_VECTOR_ELT(SDNode *N) {
return DAG.getZeroExtendInReg(Op, DL, OldVT);
}
+  // Promote the given operand V (vector or scalar) according to N's specific
+  // reduction kind. N must be an integer VECREDUCE_* or VP_REDUCE_*. The
+  // operand is widened with the nominal extension for that reduction kind
+  // (ISD::(ANY|ZERO|SIGN)_EXTEND), and the promoted value is returned.
+ SDValue PromoteIntOpVectorReduction(SDNode *N, SDValue V);
+
// Integer Result Promotion.
void PromoteIntegerResult(SDNode *N, unsigned ResNo);
SDValue PromoteIntRes_MERGE_VALUES(SDNode *N, unsigned ResNo);
SDValue PromoteIntRes_DIVFIX(SDNode *N);
SDValue PromoteIntRes_FLT_ROUNDS(SDNode *N);
SDValue PromoteIntRes_VECREDUCE(SDNode *N);
+ SDValue PromoteIntRes_VP_REDUCE(SDNode *N);
SDValue PromoteIntRes_ABS(SDNode *N);
SDValue PromoteIntRes_Rotate(SDNode *N);
SDValue PromoteIntRes_FunnelShift(SDNode *N);
SDValue PromoteIntOp_FIX(SDNode *N);
SDValue PromoteIntOp_FPOWI(SDNode *N);
SDValue PromoteIntOp_VECREDUCE(SDNode *N);
+ SDValue PromoteIntOp_VP_REDUCE(SDNode *N, unsigned OpNo);
SDValue PromoteIntOp_SET_ROUNDING(SDNode *N);
void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
static const unsigned IntegerVPOps[] = {
- ISD::VP_ADD, ISD::VP_SUB, ISD::VP_MUL, ISD::VP_SDIV, ISD::VP_UDIV,
- ISD::VP_SREM, ISD::VP_UREM, ISD::VP_AND, ISD::VP_OR, ISD::VP_XOR,
- ISD::VP_ASHR, ISD::VP_LSHR, ISD::VP_SHL};
-
- static const unsigned FloatingPointVPOps[] = {ISD::VP_FADD, ISD::VP_FSUB,
- ISD::VP_FMUL, ISD::VP_FDIV};
+ ISD::VP_ADD, ISD::VP_SUB, ISD::VP_MUL,
+ ISD::VP_SDIV, ISD::VP_UDIV, ISD::VP_SREM,
+ ISD::VP_UREM, ISD::VP_AND, ISD::VP_OR,
+ ISD::VP_XOR, ISD::VP_ASHR, ISD::VP_LSHR,
+ ISD::VP_SHL, ISD::VP_REDUCE_ADD, ISD::VP_REDUCE_AND,
+ ISD::VP_REDUCE_OR, ISD::VP_REDUCE_XOR, ISD::VP_REDUCE_SMAX,
+ ISD::VP_REDUCE_SMIN, ISD::VP_REDUCE_UMAX, ISD::VP_REDUCE_UMIN};
+
+ static const unsigned FloatingPointVPOps[] = {
+ ISD::VP_FADD, ISD::VP_FSUB, ISD::VP_FMUL,
+ ISD::VP_FDIV, ISD::VP_REDUCE_FADD, ISD::VP_REDUCE_SEQ_FADD,
+ ISD::VP_REDUCE_FMIN, ISD::VP_REDUCE_FMAX};
if (!Subtarget.is64Bit()) {
// We must custom-lower certain vXi64 operations on RV32 due to the vector
setOperationAction(ISD::VECREDUCE_SMIN, MVT::i64, Custom);
setOperationAction(ISD::VECREDUCE_UMAX, MVT::i64, Custom);
setOperationAction(ISD::VECREDUCE_UMIN, MVT::i64, Custom);
+
+ setOperationAction(ISD::VP_REDUCE_ADD, MVT::i64, Custom);
+ setOperationAction(ISD::VP_REDUCE_AND, MVT::i64, Custom);
+ setOperationAction(ISD::VP_REDUCE_OR, MVT::i64, Custom);
+ setOperationAction(ISD::VP_REDUCE_XOR, MVT::i64, Custom);
+ setOperationAction(ISD::VP_REDUCE_SMAX, MVT::i64, Custom);
+ setOperationAction(ISD::VP_REDUCE_SMIN, MVT::i64, Custom);
+ setOperationAction(ISD::VP_REDUCE_UMAX, MVT::i64, Custom);
+ setOperationAction(ISD::VP_REDUCE_UMIN, MVT::i64, Custom);
}
for (MVT VT : BoolVecVTs) {
setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
+ setOperationAction(ISD::VP_REDUCE_AND, VT, Custom);
+ setOperationAction(ISD::VP_REDUCE_OR, VT, Custom);
+ setOperationAction(ISD::VP_REDUCE_XOR, VT, Custom);
+
// RVV has native int->float & float->int conversions where the
// element type sizes are within one power-of-two of each other. Any
// wider distances between type sizes have to be lowered as sequences
setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
+
setOperationAction(ISD::FCOPYSIGN, VT, Legal);
setOperationAction(ISD::LOAD, VT, Custom);
setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
+ setOperationAction(ISD::VP_REDUCE_AND, VT, Custom);
+ setOperationAction(ISD::VP_REDUCE_OR, VT, Custom);
+ setOperationAction(ISD::VP_REDUCE_XOR, VT, Custom);
+
setOperationAction(ISD::SINT_TO_FP, VT, Custom);
setOperationAction(ISD::UINT_TO_FP, VT, Custom);
setOperationAction(ISD::FP_TO_SINT, VT, Custom);
case ISD::VECREDUCE_OR:
case ISD::VECREDUCE_XOR:
if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
- return lowerVectorMaskVECREDUCE(Op, DAG);
+ return lowerVectorMaskVecReduction(Op, DAG, /*IsVP*/ false);
return lowerVECREDUCE(Op, DAG);
case ISD::VECREDUCE_FADD:
case ISD::VECREDUCE_SEQ_FADD:
case ISD::VECREDUCE_FMIN:
case ISD::VECREDUCE_FMAX:
return lowerFPVECREDUCE(Op, DAG);
+ case ISD::VP_REDUCE_ADD:
+ case ISD::VP_REDUCE_UMAX:
+ case ISD::VP_REDUCE_SMAX:
+ case ISD::VP_REDUCE_UMIN:
+ case ISD::VP_REDUCE_SMIN:
+ case ISD::VP_REDUCE_FADD:
+ case ISD::VP_REDUCE_SEQ_FADD:
+ case ISD::VP_REDUCE_FMIN:
+ case ISD::VP_REDUCE_FMAX:
+ return lowerVPREDUCE(Op, DAG);
+ case ISD::VP_REDUCE_AND:
+ case ISD::VP_REDUCE_OR:
+ case ISD::VP_REDUCE_XOR:
+ if (Op.getOperand(1).getValueType().getVectorElementType() == MVT::i1)
+ return lowerVectorMaskVecReduction(Op, DAG, /*IsVP*/ true);
+ return lowerVPREDUCE(Op, DAG);
case ISD::INSERT_SUBVECTOR:
return lowerINSERT_SUBVECTOR(Op, DAG);
case ISD::EXTRACT_SUBVECTOR:
}
}
-SDValue RISCVTargetLowering::lowerVectorMaskVECREDUCE(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue RISCVTargetLowering::lowerVectorMaskVecReduction(SDValue Op,
+ SelectionDAG &DAG,
+ bool IsVP) const {
SDLoc DL(Op);
- SDValue Vec = Op.getOperand(0);
+ SDValue Vec = Op.getOperand(IsVP ? 1 : 0);
MVT VecVT = Vec.getSimpleValueType();
assert((Op.getOpcode() == ISD::VECREDUCE_AND ||
Op.getOpcode() == ISD::VECREDUCE_OR ||
- Op.getOpcode() == ISD::VECREDUCE_XOR) &&
+ Op.getOpcode() == ISD::VECREDUCE_XOR ||
+ Op.getOpcode() == ISD::VP_REDUCE_AND ||
+ Op.getOpcode() == ISD::VP_REDUCE_OR ||
+ Op.getOpcode() == ISD::VP_REDUCE_XOR) &&
"Unexpected reduction lowering");
MVT XLenVT = Subtarget.getXLenVT();

  MVT ContainerVT = VecVT;
  if (VecVT.isFixedLengthVector()) {
    ContainerVT = getContainerForFixedLengthVector(VecVT);
    Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
  }

SDValue Mask, VL;
- std::tie(Mask, VL) = getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget);
+ if (IsVP) {
+ Mask = Op.getOperand(2);
+ VL = Op.getOperand(3);
+ } else {
+ std::tie(Mask, VL) =
+ getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget);
+ }
+
+ unsigned BaseOpc;
+ ISD::CondCode CC;
SDValue Zero = DAG.getConstant(0, DL, XLenVT);
switch (Op.getOpcode()) {
default:
llvm_unreachable("Unhandled reduction");
case ISD::VECREDUCE_AND:
+ case ISD::VP_REDUCE_AND: {
// vpopc ~x == 0
- Vec = DAG.getNode(RISCVISD::VMXOR_VL, DL, ContainerVT, Vec, Mask, VL);
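+    // Invert the i1 source vector with an unmasked xor against all-ones; the
+    // VP mask is applied by the masked vpopc below, so only active lanes are
+    // counted.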
+ SDValue TrueMask = DAG.getNode(RISCVISD::VMSET_VL, DL, ContainerVT, VL);
+ Vec = DAG.getNode(RISCVISD::VMXOR_VL, DL, ContainerVT, Vec, TrueMask, VL);
Vec = DAG.getNode(RISCVISD::VPOPC_VL, DL, XLenVT, Vec, Mask, VL);
- return DAG.getSetCC(DL, XLenVT, Vec, Zero, ISD::SETEQ);
+ CC = ISD::SETEQ;
+ BaseOpc = ISD::AND;
+ break;
+ }
case ISD::VECREDUCE_OR:
+ case ISD::VP_REDUCE_OR:
// vpopc x != 0
Vec = DAG.getNode(RISCVISD::VPOPC_VL, DL, XLenVT, Vec, Mask, VL);
- return DAG.getSetCC(DL, XLenVT, Vec, Zero, ISD::SETNE);
- case ISD::VECREDUCE_XOR: {
+ CC = ISD::SETNE;
+ BaseOpc = ISD::OR;
+ break;
+ case ISD::VECREDUCE_XOR:
+ case ISD::VP_REDUCE_XOR: {
// ((vpopc x) & 1) != 0
SDValue One = DAG.getConstant(1, DL, XLenVT);
Vec = DAG.getNode(RISCVISD::VPOPC_VL, DL, XLenVT, Vec, Mask, VL);
Vec = DAG.getNode(ISD::AND, DL, XLenVT, Vec, One);
- return DAG.getSetCC(DL, XLenVT, Vec, Zero, ISD::SETNE);
+ CC = ISD::SETNE;
+ BaseOpc = ISD::XOR;
+ break;
}
}
+
+ SDValue SetCC = DAG.getSetCC(DL, XLenVT, Vec, Zero, CC);
+
+ if (!IsVP)
+ return SetCC;
+
+ // Now include the start value in the operation.
+ // Note that we must return the start value when no elements are operated
+ // upon. The vpopc instructions we've emitted in each case above will return
+ // 0 for an inactive vector, and so we've already received the neutral value:
+ // AND gives us (0 == 0) -> 1 and OR/XOR give us (0 != 0) -> 0. Therefore we
+ // can simply include the start value.
+ return DAG.getNode(BaseOpc, DL, XLenVT, SetCC, Op.getOperand(0));
}
SDValue RISCVTargetLowering::lowerVECREDUCE(SDValue Op,
SDValue NeutralElem =
DAG.getNeutralElement(BaseOpc, DL, VecEltVT, SDNodeFlags());
SDValue IdentitySplat = DAG.getSplatVector(M1VT, DL, NeutralElem);
- SDValue Reduction =
- DAG.getNode(RVVOpcode, DL, M1VT, Vec, IdentitySplat, Mask, VL);
+ SDValue Reduction = DAG.getNode(RVVOpcode, DL, M1VT, DAG.getUNDEF(M1VT), Vec,
+ IdentitySplat, Mask, VL);
SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VecEltVT, Reduction,
DAG.getConstant(0, DL, Subtarget.getXLenVT()));
return DAG.getSExtOrTrunc(Elt0, DL, Op.getValueType());
// FIXME: This is a VLMAX splat which might be too large and can prevent
// vsetvli removal.
SDValue ScalarSplat = DAG.getSplatVector(M1VT, DL, ScalarVal);
- SDValue Reduction =
- DAG.getNode(RVVOpcode, DL, M1VT, VectorVal, ScalarSplat, Mask, VL);
+ SDValue Reduction = DAG.getNode(RVVOpcode, DL, M1VT, DAG.getUNDEF(M1VT),
+ VectorVal, ScalarSplat, Mask, VL);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VecEltVT, Reduction,
DAG.getConstant(0, DL, Subtarget.getXLenVT()));
}
+static unsigned getRVVVPReductionOp(unsigned ISDOpcode) {
+ switch (ISDOpcode) {
+ default:
+ llvm_unreachable("Unhandled reduction");
+ case ISD::VP_REDUCE_ADD:
+ return RISCVISD::VECREDUCE_ADD_VL;
+ case ISD::VP_REDUCE_UMAX:
+ return RISCVISD::VECREDUCE_UMAX_VL;
+ case ISD::VP_REDUCE_SMAX:
+ return RISCVISD::VECREDUCE_SMAX_VL;
+ case ISD::VP_REDUCE_UMIN:
+ return RISCVISD::VECREDUCE_UMIN_VL;
+ case ISD::VP_REDUCE_SMIN:
+ return RISCVISD::VECREDUCE_SMIN_VL;
+ case ISD::VP_REDUCE_AND:
+ return RISCVISD::VECREDUCE_AND_VL;
+ case ISD::VP_REDUCE_OR:
+ return RISCVISD::VECREDUCE_OR_VL;
+ case ISD::VP_REDUCE_XOR:
+ return RISCVISD::VECREDUCE_XOR_VL;
+ case ISD::VP_REDUCE_FADD:
+ return RISCVISD::VECREDUCE_FADD_VL;
+ case ISD::VP_REDUCE_SEQ_FADD:
+ return RISCVISD::VECREDUCE_SEQ_FADD_VL;
+ case ISD::VP_REDUCE_FMAX:
+ return RISCVISD::VECREDUCE_FMAX_VL;
+ case ISD::VP_REDUCE_FMIN:
+ return RISCVISD::VECREDUCE_FMIN_VL;
+ }
+}
+
+SDValue RISCVTargetLowering::lowerVPREDUCE(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ SDValue Vec = Op.getOperand(1);
+ EVT VecEVT = Vec.getValueType();
+
+ // TODO: The type may need to be widened rather than split. Or widened before
+ // it can be split.
+ if (!isTypeLegal(VecEVT))
+ return SDValue();
+
+ MVT VecVT = VecEVT.getSimpleVT();
+ MVT VecEltVT = VecVT.getVectorElementType();
+ unsigned RVVOpcode = getRVVVPReductionOp(Op.getOpcode());
+
+ MVT ContainerVT = VecVT;
+ if (VecVT.isFixedLengthVector()) {
+ ContainerVT = getContainerForFixedLengthVector(VecVT);
+ Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
+ }
+
+ SDValue VL = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(2);
+
+ MVT M1VT = getLMUL1VT(ContainerVT);
+ MVT XLenVT = Subtarget.getXLenVT();
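+  // Integer reductions narrower than XLEN are read back as an XLenVT scalar
+  // and sign-extended/truncated to the requested type below.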
+ MVT ResVT = !VecVT.isInteger() || VecEltVT.bitsGE(XLenVT) ? VecEltVT : XLenVT;
+
+ // FIXME: This is a VLMAX splat which might be too large and can prevent
+ // vsetvli removal.
+ SDValue StartSplat = DAG.getSplatVector(M1VT, DL, Op.getOperand(0));
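+  // The start value is splatted into both the scalar operand and the merge
+  // (pass-thru) operand: if VL is zero the reduction performs no update, and
+  // element 0 of the merge value still yields the start value.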
+ SDValue Reduction =
+ DAG.getNode(RVVOpcode, DL, M1VT, StartSplat, Vec, StartSplat, Mask, VL);
+ SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Reduction,
+ DAG.getConstant(0, DL, Subtarget.getXLenVT()));
+ if (!VecVT.isInteger())
+ return Elt0;
+ return DAG.getSExtOrTrunc(Elt0, DL, Op.getValueType());
+}
+
SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
SelectionDAG &DAG) const {
SDValue Vec = Op.getOperand(0);
if (SDValue V = lowerVECREDUCE(SDValue(N, 0), DAG))
Results.push_back(V);
break;
+ case ISD::VP_REDUCE_ADD:
+ case ISD::VP_REDUCE_AND:
+ case ISD::VP_REDUCE_OR:
+ case ISD::VP_REDUCE_XOR:
+ case ISD::VP_REDUCE_SMAX:
+ case ISD::VP_REDUCE_UMAX:
+ case ISD::VP_REDUCE_SMIN:
+ case ISD::VP_REDUCE_UMIN:
+ if (SDValue V = lowerVPREDUCE(SDValue(N, 0), DAG))
+ Results.push_back(V);
+ break;
case ISD::FLT_ROUNDS_: {
SDVTList VTs = DAG.getVTList(Subtarget.getXLenVT(), MVT::Other);
SDValue Res = DAG.getNode(ISD::FLT_ROUNDS_, DL, VTs, N->getOperand(0));
VFNCVT_ROD_VL,
// These nodes match the semantics of the corresponding RVV vector reduction
// instructions. They produce a vector result which is the reduction
- // performed over the first vector operand plus the first element of the
- // second vector operand. The first operand is an unconstrained vector type,
- // and the result and second operand's types are expected to be the
- // corresponding full-width LMUL=1 type for the first operand:
- // nxv8i8 = vecreduce_add nxv32i8, nxv8i8
- // nxv2i32 = vecreduce_add nxv8i32, nxv2i32
+ // performed over the second vector operand plus the first element of the
+ // third vector operand. The first operand is the pass-thru operand. The
+ // second operand is an unconstrained vector type, and the result, first, and
+ // third operand's types are expected to be the corresponding full-width
+ // LMUL=1 type for the second operand:
+ // nxv8i8 = vecreduce_add nxv8i8, nxv32i8, nxv8i8
+ // nxv2i32 = vecreduce_add nxv2i32, nxv8i32, nxv2i32
  // The difference in types does introduce extra vsetvli instructions, but it
  // also reduces the number of registers consumed per reduction.
// Also has a mask and VL operand.
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerVPREDUCE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
- SDValue lowerVectorMaskVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerVectorMaskVecReduction(SDValue Op, SelectionDAG &DAG,
+ bool IsVP) const;
SDValue lowerFPVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
def riscv_vwmul_vl : SDNode<"RISCVISD::VWMUL_VL", SDT_RISCVVWMUL_VL, [SDNPCommutative]>;
def riscv_vwmulu_vl : SDNode<"RISCVISD::VWMULU_VL", SDT_RISCVVWMUL_VL, [SDNPCommutative]>;
-def SDTRVVVecReduce : SDTypeProfile<1, 4, [
- SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<0, 2>, SDTCVecEltisVT<3, i1>,
- SDTCisSameNumEltsAs<1, 3>, SDTCisVT<4, XLenVT>
+def SDTRVVVecReduce : SDTypeProfile<1, 5, [
+ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisSameAs<0, 3>,
+ SDTCVecEltisVT<4, i1>, SDTCisSameNumEltsAs<2, 4>, SDTCisVT<5, XLenVT>
]>;
def riscv_mul_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C, node:$D),
multiclass VPatReductionVL<SDNode vop, string instruction_name, bit is_float> {
foreach vti = !if(is_float, AllFloatVectors, AllIntegerVectors) in {
defvar vti_m1 = !cast<VTypeInfo>(!if(is_float, "VF", "VI") # vti.SEW # "M1");
- def: Pat<(vti_m1.Vector (vop (vti.Vector vti.RegClass:$rs1), VR:$rs2,
+ def: Pat<(vti_m1.Vector (vop (vti_m1.Vector VR:$merge), (vti.Vector vti.RegClass:$rs1), VR:$rs2,
(vti.Mask true_mask),
VLOpFrag)),
(!cast<Instruction>(instruction_name#"_VS_"#vti.LMul.MX)
- (vti_m1.Vector (IMPLICIT_DEF)),
+ (vti_m1.Vector VR:$merge),
(vti.Vector vti.RegClass:$rs1),
(vti_m1.Vector VR:$rs2),
GPR:$vl, vti.Log2SEW)>;
+
+ def: Pat<(vti_m1.Vector (vop (vti_m1.Vector VR:$merge), (vti.Vector vti.RegClass:$rs1), VR:$rs2,
+ (vti.Mask V0), VLOpFrag)),
+ (!cast<Instruction>(instruction_name#"_VS_"#vti.LMul.MX#"_MASK")
+ (vti_m1.Vector VR:$merge),
+ (vti.Vector vti.RegClass:$rs1),
+ (vti_m1.Vector VR:$rs2),
+ (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
}
}
VLOpFrag)),
(!cast<Instruction>("PseudoVPOPC_M_" # mti.BX)
VR:$rs2, GPR:$vl, mti.Log2SEW)>;
+ def : Pat<(XLenVT (riscv_vpopc_vl (mti.Mask VR:$rs2), (mti.Mask V0),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVPOPC_M_" # mti.BX # "_MASK")
+ VR:$rs2, (mti.Mask V0), GPR:$vl, mti.Log2SEW)>;
}
} // Predicates = [HasStdExtV]
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+d,+experimental-zfh,+experimental-v -target-abi=ilp32d -riscv-v-vector-bits-min=128 \
+; RUN: -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+d,+experimental-zfh,+experimental-v -target-abi=lp64d -riscv-v-vector-bits-min=128 \
+; RUN: -verify-machineinstrs < %s | FileCheck %s
+
+declare half @llvm.vp.reduce.fadd.v2f16(half, <2 x half>, <2 x i1>, i32)
+
+define half @vpreduce_fadd_v2f16(half %s, <2 x half> %v, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_fadd_v2f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu
+; CHECK-NEXT: vfmv.v.f v25, fa0
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu
+; CHECK-NEXT: vfredsum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vfmv.f.s fa0, v25
+; CHECK-NEXT: ret
+ %r = call reassoc half @llvm.vp.reduce.fadd.v2f16(half %s, <2 x half> %v, <2 x i1> %m, i32 %evl)
+ ret half %r
+}
+
+define half @vpreduce_ord_fadd_v2f16(half %s, <2 x half> %v, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_ord_fadd_v2f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu
+; CHECK-NEXT: vfmv.v.f v25, fa0
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu
+; CHECK-NEXT: vfredosum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vfmv.f.s fa0, v25
+; CHECK-NEXT: ret
+ %r = call half @llvm.vp.reduce.fadd.v2f16(half %s, <2 x half> %v, <2 x i1> %m, i32 %evl)
+ ret half %r
+}
+
+declare half @llvm.vp.reduce.fadd.v4f16(half, <4 x half>, <4 x i1>, i32)
+
+define half @vpreduce_fadd_v4f16(half %s, <4 x half> %v, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_fadd_v4f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu
+; CHECK-NEXT: vfmv.v.f v25, fa0
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, mu
+; CHECK-NEXT: vfredsum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vfmv.f.s fa0, v25
+; CHECK-NEXT: ret
+ %r = call reassoc half @llvm.vp.reduce.fadd.v4f16(half %s, <4 x half> %v, <4 x i1> %m, i32 %evl)
+ ret half %r
+}
+
+define half @vpreduce_ord_fadd_v4f16(half %s, <4 x half> %v, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_ord_fadd_v4f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu
+; CHECK-NEXT: vfmv.v.f v25, fa0
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, mu
+; CHECK-NEXT: vfredosum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vfmv.f.s fa0, v25
+; CHECK-NEXT: ret
+ %r = call half @llvm.vp.reduce.fadd.v4f16(half %s, <4 x half> %v, <4 x i1> %m, i32 %evl)
+ ret half %r
+}
+
+declare float @llvm.vp.reduce.fadd.v2f32(float, <2 x float>, <2 x i1>, i32)
+
+define float @vpreduce_fadd_v2f32(float %s, <2 x float> %v, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_fadd_v2f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, mu
+; CHECK-NEXT: vfmv.v.f v25, fa0
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, mu
+; CHECK-NEXT: vfredsum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vfmv.f.s fa0, v25
+; CHECK-NEXT: ret
+ %r = call reassoc float @llvm.vp.reduce.fadd.v2f32(float %s, <2 x float> %v, <2 x i1> %m, i32 %evl)
+ ret float %r
+}
+
+define float @vpreduce_ord_fadd_v2f32(float %s, <2 x float> %v, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_ord_fadd_v2f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, mu
+; CHECK-NEXT: vfmv.v.f v25, fa0
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, mu
+; CHECK-NEXT: vfredosum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vfmv.f.s fa0, v25
+; CHECK-NEXT: ret
+ %r = call float @llvm.vp.reduce.fadd.v2f32(float %s, <2 x float> %v, <2 x i1> %m, i32 %evl)
+ ret float %r
+}
+
+declare float @llvm.vp.reduce.fadd.v4f32(float, <4 x float>, <4 x i1>, i32)
+
+define float @vpreduce_fadd_v4f32(float %s, <4 x float> %v, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_fadd_v4f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, mu
+; CHECK-NEXT: vfmv.v.f v25, fa0
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu
+; CHECK-NEXT: vfredsum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vfmv.f.s fa0, v25
+; CHECK-NEXT: ret
+ %r = call reassoc float @llvm.vp.reduce.fadd.v4f32(float %s, <4 x float> %v, <4 x i1> %m, i32 %evl)
+ ret float %r
+}
+
+define float @vpreduce_ord_fadd_v4f32(float %s, <4 x float> %v, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_ord_fadd_v4f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, mu
+; CHECK-NEXT: vfmv.v.f v25, fa0
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu
+; CHECK-NEXT: vfredosum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vfmv.f.s fa0, v25
+; CHECK-NEXT: ret
+ %r = call float @llvm.vp.reduce.fadd.v4f32(float %s, <4 x float> %v, <4 x i1> %m, i32 %evl)
+ ret float %r
+}
+
+declare double @llvm.vp.reduce.fadd.v2f64(double, <2 x double>, <2 x i1>, i32)
+
+define double @vpreduce_fadd_v2f64(double %s, <2 x double> %v, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_fadd_v2f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, mu
+; CHECK-NEXT: vfmv.v.f v25, fa0
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, mu
+; CHECK-NEXT: vfredsum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vfmv.f.s fa0, v25
+; CHECK-NEXT: ret
+ %r = call reassoc double @llvm.vp.reduce.fadd.v2f64(double %s, <2 x double> %v, <2 x i1> %m, i32 %evl)
+ ret double %r
+}
+
+define double @vpreduce_ord_fadd_v2f64(double %s, <2 x double> %v, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_ord_fadd_v2f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, mu
+; CHECK-NEXT: vfmv.v.f v25, fa0
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, mu
+; CHECK-NEXT: vfredosum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vfmv.f.s fa0, v25
+; CHECK-NEXT: ret
+ %r = call double @llvm.vp.reduce.fadd.v2f64(double %s, <2 x double> %v, <2 x i1> %m, i32 %evl)
+ ret double %r
+}
+
+declare double @llvm.vp.reduce.fadd.v4f64(double, <4 x double>, <4 x i1>, i32)
+
+define double @vpreduce_fadd_v4f64(double %s, <4 x double> %v, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_fadd_v4f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, mu
+; CHECK-NEXT: vfmv.v.f v25, fa0
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, mu
+; CHECK-NEXT: vfredsum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vfmv.f.s fa0, v25
+; CHECK-NEXT: ret
+ %r = call reassoc double @llvm.vp.reduce.fadd.v4f64(double %s, <4 x double> %v, <4 x i1> %m, i32 %evl)
+ ret double %r
+}
+
+define double @vpreduce_ord_fadd_v4f64(double %s, <4 x double> %v, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_ord_fadd_v4f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, mu
+; CHECK-NEXT: vfmv.v.f v25, fa0
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, mu
+; CHECK-NEXT: vfredosum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vfmv.f.s fa0, v25
+; CHECK-NEXT: ret
+ %r = call double @llvm.vp.reduce.fadd.v4f64(double %s, <4 x double> %v, <4 x i1> %m, i32 %evl)
+ ret double %r
+}
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s \
+; RUN: | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s \
+; RUN: | FileCheck %s --check-prefixes=CHECK,RV64
+
+declare i8 @llvm.vp.reduce.add.v2i8(i8, <2 x i8>, <2 x i1>, i32)
+
+define signext i8 @vpreduce_add_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_add_v2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu
+; CHECK-NEXT: vredsum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.add.v2i8(i8 %s, <2 x i8> %v, <2 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.umax.v2i8(i8, <2 x i8>, <2 x i1>, i32)
+
+define signext i8 @vpreduce_umax_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_umax_v2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andi a0, a0, 255
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu
+; CHECK-NEXT: vredmaxu.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.umax.v2i8(i8 %s, <2 x i8> %v, <2 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.smax.v2i8(i8, <2 x i8>, <2 x i1>, i32)
+
+define signext i8 @vpreduce_smax_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_smax_v2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu
+; CHECK-NEXT: vredmax.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.smax.v2i8(i8 %s, <2 x i8> %v, <2 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.umin.v2i8(i8, <2 x i8>, <2 x i1>, i32)
+
+define signext i8 @vpreduce_umin_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_umin_v2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andi a0, a0, 255
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu
+; CHECK-NEXT: vredminu.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.umin.v2i8(i8 %s, <2 x i8> %v, <2 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.smin.v2i8(i8, <2 x i8>, <2 x i1>, i32)
+
+define signext i8 @vpreduce_smin_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_smin_v2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu
+; CHECK-NEXT: vredmin.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.smin.v2i8(i8 %s, <2 x i8> %v, <2 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.and.v2i8(i8, <2 x i8>, <2 x i1>, i32)
+
+define signext i8 @vpreduce_and_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_and_v2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu
+; CHECK-NEXT: vredand.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.and.v2i8(i8 %s, <2 x i8> %v, <2 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.or.v2i8(i8, <2 x i8>, <2 x i1>, i32)
+
+define signext i8 @vpreduce_or_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_or_v2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu
+; CHECK-NEXT: vredor.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.or.v2i8(i8 %s, <2 x i8> %v, <2 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.xor.v2i8(i8, <2 x i8>, <2 x i1>, i32)
+
+define signext i8 @vpreduce_xor_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_xor_v2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu
+; CHECK-NEXT: vredxor.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.xor.v2i8(i8 %s, <2 x i8> %v, <2 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.add.v4i8(i8, <4 x i8>, <4 x i1>, i32)
+
+define signext i8 @vpreduce_add_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_add_v4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu
+; CHECK-NEXT: vredsum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.add.v4i8(i8 %s, <4 x i8> %v, <4 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.umax.v4i8(i8, <4 x i8>, <4 x i1>, i32)
+
+define signext i8 @vpreduce_umax_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_umax_v4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andi a0, a0, 255
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu
+; CHECK-NEXT: vredmaxu.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.umax.v4i8(i8 %s, <4 x i8> %v, <4 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.smax.v4i8(i8, <4 x i8>, <4 x i1>, i32)
+
+define signext i8 @vpreduce_smax_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_smax_v4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu
+; CHECK-NEXT: vredmax.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.smax.v4i8(i8 %s, <4 x i8> %v, <4 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.umin.v4i8(i8, <4 x i8>, <4 x i1>, i32)
+
+define signext i8 @vpreduce_umin_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_umin_v4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andi a0, a0, 255
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu
+; CHECK-NEXT: vredminu.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.umin.v4i8(i8 %s, <4 x i8> %v, <4 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.smin.v4i8(i8, <4 x i8>, <4 x i1>, i32)
+
+define signext i8 @vpreduce_smin_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_smin_v4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu
+; CHECK-NEXT: vredmin.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.smin.v4i8(i8 %s, <4 x i8> %v, <4 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.and.v4i8(i8, <4 x i8>, <4 x i1>, i32)
+
+define signext i8 @vpreduce_and_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_and_v4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu
+; CHECK-NEXT: vredand.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.and.v4i8(i8 %s, <4 x i8> %v, <4 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.or.v4i8(i8, <4 x i8>, <4 x i1>, i32)
+
+define signext i8 @vpreduce_or_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_or_v4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu
+; CHECK-NEXT: vredor.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.or.v4i8(i8 %s, <4 x i8> %v, <4 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.xor.v4i8(i8, <4 x i8>, <4 x i1>, i32)
+
+define signext i8 @vpreduce_xor_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_xor_v4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu
+; CHECK-NEXT: vredxor.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.xor.v4i8(i8 %s, <4 x i8> %v, <4 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i16 @llvm.vp.reduce.add.v2i16(i16, <2 x i16>, <2 x i1>, i32)
+
+define signext i16 @vpreduce_add_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_add_v2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu
+; CHECK-NEXT: vredsum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.add.v2i16(i16 %s, <2 x i16> %v, <2 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i16 @llvm.vp.reduce.umax.v2i16(i16, <2 x i16>, <2 x i1>, i32)
+
+define signext i16 @vpreduce_umax_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_umax_v2i16:
+; RV32: # %bb.0:
+; RV32-NEXT: lui a2, 16
+; RV32-NEXT: addi a2, a2, -1
+; RV32-NEXT: and a0, a0, a2
+; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; RV32-NEXT: vmv.v.x v25, a0
+; RV32-NEXT: vsetvli zero, a1, e16, mf4, tu, mu
+; RV32-NEXT: vredmaxu.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umax_v2i16:
+; RV64: # %bb.0:
+; RV64-NEXT: lui a2, 16
+; RV64-NEXT: addiw a2, a2, -1
+; RV64-NEXT: and a0, a0, a2
+; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e16, mf4, tu, mu
+; RV64-NEXT: vredmaxu.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.umax.v2i16(i16 %s, <2 x i16> %v, <2 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i16 @llvm.vp.reduce.smax.v2i16(i16, <2 x i16>, <2 x i1>, i32)
+
+define signext i16 @vpreduce_smax_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_smax_v2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu
+; CHECK-NEXT: vredmax.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.smax.v2i16(i16 %s, <2 x i16> %v, <2 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i16 @llvm.vp.reduce.umin.v2i16(i16, <2 x i16>, <2 x i1>, i32)
+
+define signext i16 @vpreduce_umin_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_umin_v2i16:
+; RV32: # %bb.0:
+; RV32-NEXT: lui a2, 16
+; RV32-NEXT: addi a2, a2, -1
+; RV32-NEXT: and a0, a0, a2
+; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; RV32-NEXT: vmv.v.x v25, a0
+; RV32-NEXT: vsetvli zero, a1, e16, mf4, tu, mu
+; RV32-NEXT: vredminu.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umin_v2i16:
+; RV64: # %bb.0:
+; RV64-NEXT: lui a2, 16
+; RV64-NEXT: addiw a2, a2, -1
+; RV64-NEXT: and a0, a0, a2
+; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e16, mf4, tu, mu
+; RV64-NEXT: vredminu.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.umin.v2i16(i16 %s, <2 x i16> %v, <2 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i16 @llvm.vp.reduce.smin.v2i16(i16, <2 x i16>, <2 x i1>, i32)
+
+define signext i16 @vpreduce_smin_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_smin_v2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu
+; CHECK-NEXT: vredmin.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.smin.v2i16(i16 %s, <2 x i16> %v, <2 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i16 @llvm.vp.reduce.and.v2i16(i16, <2 x i16>, <2 x i1>, i32)
+
+define signext i16 @vpreduce_and_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_and_v2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu
+; CHECK-NEXT: vredand.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.and.v2i16(i16 %s, <2 x i16> %v, <2 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i16 @llvm.vp.reduce.or.v2i16(i16, <2 x i16>, <2 x i1>, i32)
+
+define signext i16 @vpreduce_or_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_or_v2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu
+; CHECK-NEXT: vredor.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.or.v2i16(i16 %s, <2 x i16> %v, <2 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i16 @llvm.vp.reduce.xor.v2i16(i16, <2 x i16>, <2 x i1>, i32)
+
+define signext i16 @vpreduce_xor_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_xor_v2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu
+; CHECK-NEXT: vredxor.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.xor.v2i16(i16 %s, <2 x i16> %v, <2 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i16 @llvm.vp.reduce.add.v4i16(i16, <4 x i16>, <4 x i1>, i32)
+
+define signext i16 @vpreduce_add_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_add_v4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu
+; CHECK-NEXT: vredsum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.add.v4i16(i16 %s, <4 x i16> %v, <4 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i16 @llvm.vp.reduce.umax.v4i16(i16, <4 x i16>, <4 x i1>, i32)
+
+define signext i16 @vpreduce_umax_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_umax_v4i16:
+; RV32: # %bb.0:
+; RV32-NEXT: lui a2, 16
+; RV32-NEXT: addi a2, a2, -1
+; RV32-NEXT: and a0, a0, a2
+; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; RV32-NEXT: vmv.v.x v25, a0
+; RV32-NEXT: vsetvli zero, a1, e16, mf2, tu, mu
+; RV32-NEXT: vredmaxu.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umax_v4i16:
+; RV64: # %bb.0:
+; RV64-NEXT: lui a2, 16
+; RV64-NEXT: addiw a2, a2, -1
+; RV64-NEXT: and a0, a0, a2
+; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e16, mf2, tu, mu
+; RV64-NEXT: vredmaxu.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.umax.v4i16(i16 %s, <4 x i16> %v, <4 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i16 @llvm.vp.reduce.smax.v4i16(i16, <4 x i16>, <4 x i1>, i32)
+
+define signext i16 @vpreduce_smax_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_smax_v4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu
+; CHECK-NEXT: vredmax.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.smax.v4i16(i16 %s, <4 x i16> %v, <4 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i16 @llvm.vp.reduce.umin.v4i16(i16, <4 x i16>, <4 x i1>, i32)
+
+define signext i16 @vpreduce_umin_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_umin_v4i16:
+; RV32: # %bb.0:
+; RV32-NEXT: lui a2, 16
+; RV32-NEXT: addi a2, a2, -1
+; RV32-NEXT: and a0, a0, a2
+; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; RV32-NEXT: vmv.v.x v25, a0
+; RV32-NEXT: vsetvli zero, a1, e16, mf2, tu, mu
+; RV32-NEXT: vredminu.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umin_v4i16:
+; RV64: # %bb.0:
+; RV64-NEXT: lui a2, 16
+; RV64-NEXT: addiw a2, a2, -1
+; RV64-NEXT: and a0, a0, a2
+; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e16, mf2, tu, mu
+; RV64-NEXT: vredminu.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.umin.v4i16(i16 %s, <4 x i16> %v, <4 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i16 @llvm.vp.reduce.smin.v4i16(i16, <4 x i16>, <4 x i1>, i32)
+
+define signext i16 @vpreduce_smin_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_smin_v4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu
+; CHECK-NEXT: vredmin.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.smin.v4i16(i16 %s, <4 x i16> %v, <4 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i16 @llvm.vp.reduce.and.v4i16(i16, <4 x i16>, <4 x i1>, i32)
+
+define signext i16 @vpreduce_and_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_and_v4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu
+; CHECK-NEXT: vredand.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.and.v4i16(i16 %s, <4 x i16> %v, <4 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i16 @llvm.vp.reduce.or.v4i16(i16, <4 x i16>, <4 x i1>, i32)
+
+define signext i16 @vpreduce_or_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_or_v4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu
+; CHECK-NEXT: vredor.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.or.v4i16(i16 %s, <4 x i16> %v, <4 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i16 @llvm.vp.reduce.xor.v4i16(i16, <4 x i16>, <4 x i1>, i32)
+
+define signext i16 @vpreduce_xor_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_xor_v4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu
+; CHECK-NEXT: vredxor.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.xor.v4i16(i16 %s, <4 x i16> %v, <4 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i32 @llvm.vp.reduce.add.v2i32(i32, <2 x i32>, <2 x i1>, i32)
+
+define signext i32 @vpreduce_add_v2i32(i32 signext %s, <2 x i32> %v, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_add_v2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu
+; CHECK-NEXT: vredsum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.add.v2i32(i32 %s, <2 x i32> %v, <2 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
+declare i32 @llvm.vp.reduce.umax.v2i32(i32, <2 x i32>, <2 x i1>, i32)
+
+define signext i32 @vpreduce_umax_v2i32(i32 signext %s, <2 x i32> %v, <2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_umax_v2i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; RV32-NEXT: vmv.v.x v25, a0
+; RV32-NEXT: vsetvli zero, a1, e32, mf2, tu, mu
+; RV32-NEXT: vredmaxu.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umax_v2i32:
+; RV64: # %bb.0:
+; RV64-NEXT: slli a0, a0, 32
+; RV64-NEXT: srli a0, a0, 32
+; RV64-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e32, mf2, tu, mu
+; RV64-NEXT: vredmaxu.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.umax.v2i32(i32 %s, <2 x i32> %v, <2 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
+declare i32 @llvm.vp.reduce.smax.v2i32(i32, <2 x i32>, <2 x i1>, i32)
+
+define signext i32 @vpreduce_smax_v2i32(i32 signext %s, <2 x i32> %v, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_smax_v2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu
+; CHECK-NEXT: vredmax.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.smax.v2i32(i32 %s, <2 x i32> %v, <2 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
+declare i32 @llvm.vp.reduce.umin.v2i32(i32, <2 x i32>, <2 x i1>, i32)
+
+define signext i32 @vpreduce_umin_v2i32(i32 signext %s, <2 x i32> %v, <2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_umin_v2i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; RV32-NEXT: vmv.v.x v25, a0
+; RV32-NEXT: vsetvli zero, a1, e32, mf2, tu, mu
+; RV32-NEXT: vredminu.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umin_v2i32:
+; RV64: # %bb.0:
+; RV64-NEXT: slli a0, a0, 32
+; RV64-NEXT: srli a0, a0, 32
+; RV64-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e32, mf2, tu, mu
+; RV64-NEXT: vredminu.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.umin.v2i32(i32 %s, <2 x i32> %v, <2 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
+declare i32 @llvm.vp.reduce.smin.v2i32(i32, <2 x i32>, <2 x i1>, i32)
+
+define signext i32 @vpreduce_smin_v2i32(i32 signext %s, <2 x i32> %v, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_smin_v2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu
+; CHECK-NEXT: vredmin.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.smin.v2i32(i32 %s, <2 x i32> %v, <2 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
+declare i32 @llvm.vp.reduce.and.v2i32(i32, <2 x i32>, <2 x i1>, i32)
+
+define signext i32 @vpreduce_and_v2i32(i32 signext %s, <2 x i32> %v, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_and_v2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu
+; CHECK-NEXT: vredand.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.and.v2i32(i32 %s, <2 x i32> %v, <2 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
+declare i32 @llvm.vp.reduce.or.v2i32(i32, <2 x i32>, <2 x i1>, i32)
+
+define signext i32 @vpreduce_or_v2i32(i32 signext %s, <2 x i32> %v, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_or_v2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu
+; CHECK-NEXT: vredor.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.or.v2i32(i32 %s, <2 x i32> %v, <2 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
+declare i32 @llvm.vp.reduce.xor.v2i32(i32, <2 x i32>, <2 x i1>, i32)
+
+define signext i32 @vpreduce_xor_v2i32(i32 signext %s, <2 x i32> %v, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_xor_v2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu
+; CHECK-NEXT: vredxor.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.xor.v2i32(i32 %s, <2 x i32> %v, <2 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
+declare i32 @llvm.vp.reduce.add.v4i32(i32, <4 x i32>, <4 x i1>, i32)
+
+define signext i32 @vpreduce_add_v4i32(i32 signext %s, <4 x i32> %v, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_add_v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu
+; CHECK-NEXT: vredsum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.add.v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
+declare i32 @llvm.vp.reduce.umax.v4i32(i32, <4 x i32>, <4 x i1>, i32)
+
+define signext i32 @vpreduce_umax_v4i32(i32 signext %s, <4 x i32> %v, <4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_umax_v4i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; RV32-NEXT: vmv.v.x v25, a0
+; RV32-NEXT: vsetvli zero, a1, e32, m1, tu, mu
+; RV32-NEXT: vredmaxu.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umax_v4i32:
+; RV64: # %bb.0:
+; RV64-NEXT: slli a0, a0, 32
+; RV64-NEXT: srli a0, a0, 32
+; RV64-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e32, m1, tu, mu
+; RV64-NEXT: vredmaxu.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.umax.v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
+declare i32 @llvm.vp.reduce.smax.v4i32(i32, <4 x i32>, <4 x i1>, i32)
+
+define signext i32 @vpreduce_smax_v4i32(i32 signext %s, <4 x i32> %v, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_smax_v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu
+; CHECK-NEXT: vredmax.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.smax.v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
+declare i32 @llvm.vp.reduce.umin.v4i32(i32, <4 x i32>, <4 x i1>, i32)
+
+define signext i32 @vpreduce_umin_v4i32(i32 signext %s, <4 x i32> %v, <4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_umin_v4i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; RV32-NEXT: vmv.v.x v25, a0
+; RV32-NEXT: vsetvli zero, a1, e32, m1, tu, mu
+; RV32-NEXT: vredminu.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umin_v4i32:
+; RV64: # %bb.0:
+; RV64-NEXT: slli a0, a0, 32
+; RV64-NEXT: srli a0, a0, 32
+; RV64-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e32, m1, tu, mu
+; RV64-NEXT: vredminu.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.umin.v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
+declare i32 @llvm.vp.reduce.smin.v4i32(i32, <4 x i32>, <4 x i1>, i32)
+
+define signext i32 @vpreduce_smin_v4i32(i32 signext %s, <4 x i32> %v, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_smin_v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu
+; CHECK-NEXT: vredmin.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.smin.v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
+declare i32 @llvm.vp.reduce.and.v4i32(i32, <4 x i32>, <4 x i1>, i32)
+
+define signext i32 @vpreduce_and_v4i32(i32 signext %s, <4 x i32> %v, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_and_v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu
+; CHECK-NEXT: vredand.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.and.v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
+declare i32 @llvm.vp.reduce.or.v4i32(i32, <4 x i32>, <4 x i1>, i32)
+
+define signext i32 @vpreduce_or_v4i32(i32 signext %s, <4 x i32> %v, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_or_v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu
+; CHECK-NEXT: vredor.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.or.v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
+declare i32 @llvm.vp.reduce.xor.v4i32(i32, <4 x i32>, <4 x i1>, i32)
+
+define signext i32 @vpreduce_xor_v4i32(i32 signext %s, <4 x i32> %v, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_xor_v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu
+; CHECK-NEXT: vredxor.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.xor.v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
+declare i64 @llvm.vp.reduce.add.v2i64(i64, <2 x i64>, <2 x i1>, i32)
+
+define signext i64 @vpreduce_add_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_add_v2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m1, tu, mu
+; RV32-NEXT: vredsum.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_add_v2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu
+; RV64-NEXT: vredsum.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.add.v2i64(i64 %s, <2 x i64> %v, <2 x i1> %m, i32 %evl)
+ ret i64 %r
+}
+
+declare i64 @llvm.vp.reduce.umax.v2i64(i64, <2 x i64>, <2 x i1>, i32)
+
+define signext i64 @vpreduce_umax_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_umax_v2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m1, tu, mu
+; RV32-NEXT: vredmaxu.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umax_v2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu
+; RV64-NEXT: vredmaxu.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.umax.v2i64(i64 %s, <2 x i64> %v, <2 x i1> %m, i32 %evl)
+ ret i64 %r
+}
+
+declare i64 @llvm.vp.reduce.smax.v2i64(i64, <2 x i64>, <2 x i1>, i32)
+
+define signext i64 @vpreduce_smax_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_smax_v2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m1, tu, mu
+; RV32-NEXT: vredmax.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_smax_v2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu
+; RV64-NEXT: vredmax.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.smax.v2i64(i64 %s, <2 x i64> %v, <2 x i1> %m, i32 %evl)
+ ret i64 %r
+}
+
+declare i64 @llvm.vp.reduce.umin.v2i64(i64, <2 x i64>, <2 x i1>, i32)
+
+define signext i64 @vpreduce_umin_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_umin_v2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m1, tu, mu
+; RV32-NEXT: vredminu.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umin_v2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu
+; RV64-NEXT: vredminu.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.umin.v2i64(i64 %s, <2 x i64> %v, <2 x i1> %m, i32 %evl)
+ ret i64 %r
+}
+
+declare i64 @llvm.vp.reduce.smin.v2i64(i64, <2 x i64>, <2 x i1>, i32)
+
+define signext i64 @vpreduce_smin_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_smin_v2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m1, tu, mu
+; RV32-NEXT: vredmin.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_smin_v2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu
+; RV64-NEXT: vredmin.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.smin.v2i64(i64 %s, <2 x i64> %v, <2 x i1> %m, i32 %evl)
+ ret i64 %r
+}
+
+declare i64 @llvm.vp.reduce.and.v2i64(i64, <2 x i64>, <2 x i1>, i32)
+
+define signext i64 @vpreduce_and_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_and_v2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m1, tu, mu
+; RV32-NEXT: vredand.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_and_v2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu
+; RV64-NEXT: vredand.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.and.v2i64(i64 %s, <2 x i64> %v, <2 x i1> %m, i32 %evl)
+ ret i64 %r
+}
+
+declare i64 @llvm.vp.reduce.or.v2i64(i64, <2 x i64>, <2 x i1>, i32)
+
+define signext i64 @vpreduce_or_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_or_v2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m1, tu, mu
+; RV32-NEXT: vredor.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_or_v2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu
+; RV64-NEXT: vredor.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.or.v2i64(i64 %s, <2 x i64> %v, <2 x i1> %m, i32 %evl)
+ ret i64 %r
+}
+
+declare i64 @llvm.vp.reduce.xor.v2i64(i64, <2 x i64>, <2 x i1>, i32)
+
+define signext i64 @vpreduce_xor_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_xor_v2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m1, tu, mu
+; RV32-NEXT: vredxor.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_xor_v2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu
+; RV64-NEXT: vredxor.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.xor.v2i64(i64 %s, <2 x i64> %v, <2 x i1> %m, i32 %evl)
+ ret i64 %r
+}
+
+declare i64 @llvm.vp.reduce.add.v4i64(i64, <4 x i64>, <4 x i1>, i32)
+
+define signext i64 @vpreduce_add_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_add_v4i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m2, tu, mu
+; RV32-NEXT: vredsum.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_add_v4i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu
+; RV64-NEXT: vredsum.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.add.v4i64(i64 %s, <4 x i64> %v, <4 x i1> %m, i32 %evl)
+ ret i64 %r
+}
+
+declare i64 @llvm.vp.reduce.umax.v4i64(i64, <4 x i64>, <4 x i1>, i32)
+
+define signext i64 @vpreduce_umax_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_umax_v4i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m2, tu, mu
+; RV32-NEXT: vredmaxu.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umax_v4i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu
+; RV64-NEXT: vredmaxu.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.umax.v4i64(i64 %s, <4 x i64> %v, <4 x i1> %m, i32 %evl)
+ ret i64 %r
+}
+
+declare i64 @llvm.vp.reduce.smax.v4i64(i64, <4 x i64>, <4 x i1>, i32)
+
+define signext i64 @vpreduce_smax_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_smax_v4i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m2, tu, mu
+; RV32-NEXT: vredmax.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_smax_v4i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu
+; RV64-NEXT: vredmax.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.smax.v4i64(i64 %s, <4 x i64> %v, <4 x i1> %m, i32 %evl)
+ ret i64 %r
+}
+
+declare i64 @llvm.vp.reduce.umin.v4i64(i64, <4 x i64>, <4 x i1>, i32)
+
+define signext i64 @vpreduce_umin_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_umin_v4i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m2, tu, mu
+; RV32-NEXT: vredminu.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umin_v4i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu
+; RV64-NEXT: vredminu.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.umin.v4i64(i64 %s, <4 x i64> %v, <4 x i1> %m, i32 %evl)
+ ret i64 %r
+}
+
+declare i64 @llvm.vp.reduce.smin.v4i64(i64, <4 x i64>, <4 x i1>, i32)
+
+define signext i64 @vpreduce_smin_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_smin_v4i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m2, tu, mu
+; RV32-NEXT: vredmin.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_smin_v4i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu
+; RV64-NEXT: vredmin.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.smin.v4i64(i64 %s, <4 x i64> %v, <4 x i1> %m, i32 %evl)
+ ret i64 %r
+}
+
+declare i64 @llvm.vp.reduce.and.v4i64(i64, <4 x i64>, <4 x i1>, i32)
+
+define signext i64 @vpreduce_and_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_and_v4i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m2, tu, mu
+; RV32-NEXT: vredand.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_and_v4i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu
+; RV64-NEXT: vredand.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.and.v4i64(i64 %s, <4 x i64> %v, <4 x i1> %m, i32 %evl)
+ ret i64 %r
+}
+
+declare i64 @llvm.vp.reduce.or.v4i64(i64, <4 x i64>, <4 x i1>, i32)
+
+define signext i64 @vpreduce_or_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_or_v4i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m2, tu, mu
+; RV32-NEXT: vredor.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_or_v4i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu
+; RV64-NEXT: vredor.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.or.v4i64(i64 %s, <4 x i64> %v, <4 x i1> %m, i32 %evl)
+ ret i64 %r
+}
+
+declare i64 @llvm.vp.reduce.xor.v4i64(i64, <4 x i64>, <4 x i1>, i32)
+
+define signext i64 @vpreduce_xor_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_xor_v4i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m2, tu, mu
+; RV32-NEXT: vredxor.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_xor_v4i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu
+; RV64-NEXT: vredxor.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.xor.v4i64(i64 %s, <4 x i64> %v, <4 x i1> %m, i32 %evl)
+ ret i64 %r
+}
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s \
+; RUN: | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s \
+; RUN: | FileCheck %s
+
+declare i1 @llvm.vp.reduce.and.v1i1(i1, <1 x i1>, <1 x i1>, i32)
+
+define signext i1 @vpreduce_and_v1i1(i1 signext %s, <1 x i1> %v, <1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_and_v1i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu
+; CHECK-NEXT: vmnand.mm v25, v0, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vpopc.m a1, v25, v0.t
+; CHECK-NEXT: seqz a1, a1
+; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: ret
+ %r = call i1 @llvm.vp.reduce.and.v1i1(i1 %s, <1 x i1> %v, <1 x i1> %m, i32 %evl)
+ ret i1 %r
+}
+
+declare i1 @llvm.vp.reduce.or.v1i1(i1, <1 x i1>, <1 x i1>, i32)
+
+define signext i1 @vpreduce_or_v1i1(i1 signext %s, <1 x i1> %v, <1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_or_v1i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v25, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vpopc.m a1, v25, v0.t
+; CHECK-NEXT: snez a1, a1
+; CHECK-NEXT: or a0, a1, a0
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: ret
+ %r = call i1 @llvm.vp.reduce.or.v1i1(i1 %s, <1 x i1> %v, <1 x i1> %m, i32 %evl)
+ ret i1 %r
+}
+
+declare i1 @llvm.vp.reduce.xor.v1i1(i1, <1 x i1>, <1 x i1>, i32)
+
+define signext i1 @vpreduce_xor_v1i1(i1 signext %s, <1 x i1> %v, <1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_xor_v1i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v25, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vpopc.m a1, v25, v0.t
+; CHECK-NEXT: xor a0, a1, a0
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: ret
+ %r = call i1 @llvm.vp.reduce.xor.v1i1(i1 %s, <1 x i1> %v, <1 x i1> %m, i32 %evl)
+ ret i1 %r
+}
+
+declare i1 @llvm.vp.reduce.and.v2i1(i1, <2 x i1>, <2 x i1>, i32)
+
+define signext i1 @vpreduce_and_v2i1(i1 signext %s, <2 x i1> %v, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_and_v2i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu
+; CHECK-NEXT: vmnand.mm v25, v0, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vpopc.m a1, v25, v0.t
+; CHECK-NEXT: seqz a1, a1
+; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: ret
+ %r = call i1 @llvm.vp.reduce.and.v2i1(i1 %s, <2 x i1> %v, <2 x i1> %m, i32 %evl)
+ ret i1 %r
+}
+
+declare i1 @llvm.vp.reduce.or.v2i1(i1, <2 x i1>, <2 x i1>, i32)
+
+define signext i1 @vpreduce_or_v2i1(i1 signext %s, <2 x i1> %v, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_or_v2i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v25, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vpopc.m a1, v25, v0.t
+; CHECK-NEXT: snez a1, a1
+; CHECK-NEXT: or a0, a1, a0
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: ret
+ %r = call i1 @llvm.vp.reduce.or.v2i1(i1 %s, <2 x i1> %v, <2 x i1> %m, i32 %evl)
+ ret i1 %r
+}
+
+declare i1 @llvm.vp.reduce.xor.v2i1(i1, <2 x i1>, <2 x i1>, i32)
+
+define signext i1 @vpreduce_xor_v2i1(i1 signext %s, <2 x i1> %v, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_xor_v2i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v25, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vpopc.m a1, v25, v0.t
+; CHECK-NEXT: xor a0, a1, a0
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: ret
+ %r = call i1 @llvm.vp.reduce.xor.v2i1(i1 %s, <2 x i1> %v, <2 x i1> %m, i32 %evl)
+ ret i1 %r
+}
+
+declare i1 @llvm.vp.reduce.and.v4i1(i1, <4 x i1>, <4 x i1>, i32)
+
+define signext i1 @vpreduce_and_v4i1(i1 signext %s, <4 x i1> %v, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_and_v4i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu
+; CHECK-NEXT: vmnand.mm v25, v0, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vpopc.m a1, v25, v0.t
+; CHECK-NEXT: seqz a1, a1
+; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: ret
+ %r = call i1 @llvm.vp.reduce.and.v4i1(i1 %s, <4 x i1> %v, <4 x i1> %m, i32 %evl)
+ ret i1 %r
+}
+
+declare i1 @llvm.vp.reduce.or.v4i1(i1, <4 x i1>, <4 x i1>, i32)
+
+define signext i1 @vpreduce_or_v4i1(i1 signext %s, <4 x i1> %v, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_or_v4i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v25, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vpopc.m a1, v25, v0.t
+; CHECK-NEXT: snez a1, a1
+; CHECK-NEXT: or a0, a1, a0
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: ret
+ %r = call i1 @llvm.vp.reduce.or.v4i1(i1 %s, <4 x i1> %v, <4 x i1> %m, i32 %evl)
+ ret i1 %r
+}
+
+declare i1 @llvm.vp.reduce.xor.v4i1(i1, <4 x i1>, <4 x i1>, i32)
+
+define signext i1 @vpreduce_xor_v4i1(i1 signext %s, <4 x i1> %v, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_xor_v4i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v25, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vpopc.m a1, v25, v0.t
+; CHECK-NEXT: xor a0, a1, a0
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: ret
+ %r = call i1 @llvm.vp.reduce.xor.v4i1(i1 %s, <4 x i1> %v, <4 x i1> %m, i32 %evl)
+ ret i1 %r
+}
+
+declare i1 @llvm.vp.reduce.and.v8i1(i1, <8 x i1>, <8 x i1>, i32)
+
+define signext i1 @vpreduce_and_v8i1(i1 signext %s, <8 x i1> %v, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_and_v8i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu
+; CHECK-NEXT: vmnand.mm v25, v0, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vpopc.m a1, v25, v0.t
+; CHECK-NEXT: seqz a1, a1
+; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: ret
+ %r = call i1 @llvm.vp.reduce.and.v8i1(i1 %s, <8 x i1> %v, <8 x i1> %m, i32 %evl)
+ ret i1 %r
+}
+
+declare i1 @llvm.vp.reduce.or.v8i1(i1, <8 x i1>, <8 x i1>, i32)
+
+define signext i1 @vpreduce_or_v8i1(i1 signext %s, <8 x i1> %v, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_or_v8i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v25, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vpopc.m a1, v25, v0.t
+; CHECK-NEXT: snez a1, a1
+; CHECK-NEXT: or a0, a1, a0
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: ret
+ %r = call i1 @llvm.vp.reduce.or.v8i1(i1 %s, <8 x i1> %v, <8 x i1> %m, i32 %evl)
+ ret i1 %r
+}
+
+declare i1 @llvm.vp.reduce.xor.v8i1(i1, <8 x i1>, <8 x i1>, i32)
+
+define signext i1 @vpreduce_xor_v8i1(i1 signext %s, <8 x i1> %v, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_xor_v8i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v25, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vpopc.m a1, v25, v0.t
+; CHECK-NEXT: xor a0, a1, a0
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: ret
+ %r = call i1 @llvm.vp.reduce.xor.v8i1(i1 %s, <8 x i1> %v, <8 x i1> %m, i32 %evl)
+ ret i1 %r
+}
+
+declare i1 @llvm.vp.reduce.and.v16i1(i1, <16 x i1>, <16 x i1>, i32)
+
+define signext i1 @vpreduce_and_v16i1(i1 signext %s, <16 x i1> %v, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_and_v16i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu
+; CHECK-NEXT: vmnand.mm v25, v0, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vpopc.m a1, v25, v0.t
+; CHECK-NEXT: seqz a1, a1
+; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: ret
+ %r = call i1 @llvm.vp.reduce.and.v16i1(i1 %s, <16 x i1> %v, <16 x i1> %m, i32 %evl)
+ ret i1 %r
+}
+
+declare i1 @llvm.vp.reduce.or.v16i1(i1, <16 x i1>, <16 x i1>, i32)
+
+define signext i1 @vpreduce_or_v16i1(i1 signext %s, <16 x i1> %v, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_or_v16i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v25, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vpopc.m a1, v25, v0.t
+; CHECK-NEXT: snez a1, a1
+; CHECK-NEXT: or a0, a1, a0
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: ret
+ %r = call i1 @llvm.vp.reduce.or.v16i1(i1 %s, <16 x i1> %v, <16 x i1> %m, i32 %evl)
+ ret i1 %r
+}
+
+declare i1 @llvm.vp.reduce.xor.v16i1(i1, <16 x i1>, <16 x i1>, i32)
+
+define signext i1 @vpreduce_xor_v16i1(i1 signext %s, <16 x i1> %v, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_xor_v16i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v25, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vpopc.m a1, v25, v0.t
+; CHECK-NEXT: xor a0, a1, a0
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: ret
+ %r = call i1 @llvm.vp.reduce.xor.v16i1(i1 %s, <16 x i1> %v, <16 x i1> %m, i32 %evl)
+ ret i1 %r
+}
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+d,+experimental-zfh,+experimental-v -target-abi=ilp32d \
+; RUN: -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+d,+experimental-zfh,+experimental-v -target-abi=lp64d \
+; RUN: -verify-machineinstrs < %s | FileCheck %s
+
+declare half @llvm.vp.reduce.fadd.nxv1f16(half, <vscale x 1 x half>, <vscale x 1 x i1>, i32)
+
+define half @vpreduce_fadd_nxv1f16(half %s, <vscale x 1 x half> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_fadd_nxv1f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu
+; CHECK-NEXT: vfmv.v.f v25, fa0
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu
+; CHECK-NEXT: vfredsum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vfmv.f.s fa0, v25
+; CHECK-NEXT: ret
+ %r = call reassoc half @llvm.vp.reduce.fadd.nxv1f16(half %s, <vscale x 1 x half> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret half %r
+}
+
+define half @vpreduce_ord_fadd_nxv1f16(half %s, <vscale x 1 x half> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_ord_fadd_nxv1f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu
+; CHECK-NEXT: vfmv.v.f v25, fa0
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu
+; CHECK-NEXT: vfredosum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vfmv.f.s fa0, v25
+; CHECK-NEXT: ret
+ %r = call half @llvm.vp.reduce.fadd.nxv1f16(half %s, <vscale x 1 x half> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret half %r
+}
+
+declare half @llvm.vp.reduce.fadd.nxv2f16(half, <vscale x 2 x half>, <vscale x 2 x i1>, i32)
+
+define half @vpreduce_fadd_nxv2f16(half %s, <vscale x 2 x half> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_fadd_nxv2f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu
+; CHECK-NEXT: vfmv.v.f v25, fa0
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, mu
+; CHECK-NEXT: vfredsum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vfmv.f.s fa0, v25
+; CHECK-NEXT: ret
+ %r = call reassoc half @llvm.vp.reduce.fadd.nxv2f16(half %s, <vscale x 2 x half> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret half %r
+}
+
+define half @vpreduce_ord_fadd_nxv2f16(half %s, <vscale x 2 x half> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_ord_fadd_nxv2f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu
+; CHECK-NEXT: vfmv.v.f v25, fa0
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, mu
+; CHECK-NEXT: vfredosum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vfmv.f.s fa0, v25
+; CHECK-NEXT: ret
+ %r = call half @llvm.vp.reduce.fadd.nxv2f16(half %s, <vscale x 2 x half> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret half %r
+}
+
+declare half @llvm.vp.reduce.fadd.nxv4f16(half, <vscale x 4 x half>, <vscale x 4 x i1>, i32)
+
+define half @vpreduce_fadd_nxv4f16(half %s, <vscale x 4 x half> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_fadd_nxv4f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu
+; CHECK-NEXT: vfmv.v.f v25, fa0
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, mu
+; CHECK-NEXT: vfredsum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vfmv.f.s fa0, v25
+; CHECK-NEXT: ret
+ %r = call reassoc half @llvm.vp.reduce.fadd.nxv4f16(half %s, <vscale x 4 x half> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret half %r
+}
+
+define half @vpreduce_ord_fadd_nxv4f16(half %s, <vscale x 4 x half> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_ord_fadd_nxv4f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu
+; CHECK-NEXT: vfmv.v.f v25, fa0
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, mu
+; CHECK-NEXT: vfredosum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vfmv.f.s fa0, v25
+; CHECK-NEXT: ret
+ %r = call half @llvm.vp.reduce.fadd.nxv4f16(half %s, <vscale x 4 x half> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret half %r
+}
+
+declare float @llvm.vp.reduce.fadd.nxv1f32(float, <vscale x 1 x float>, <vscale x 1 x i1>, i32)
+
+define float @vpreduce_fadd_nxv1f32(float %s, <vscale x 1 x float> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_fadd_nxv1f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, mu
+; CHECK-NEXT: vfmv.v.f v25, fa0
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, mu
+; CHECK-NEXT: vfredsum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vfmv.f.s fa0, v25
+; CHECK-NEXT: ret
+ %r = call reassoc float @llvm.vp.reduce.fadd.nxv1f32(float %s, <vscale x 1 x float> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret float %r
+}
+
+define float @vpreduce_ord_fadd_nxv1f32(float %s, <vscale x 1 x float> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_ord_fadd_nxv1f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, mu
+; CHECK-NEXT: vfmv.v.f v25, fa0
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, mu
+; CHECK-NEXT: vfredosum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vfmv.f.s fa0, v25
+; CHECK-NEXT: ret
+ %r = call float @llvm.vp.reduce.fadd.nxv1f32(float %s, <vscale x 1 x float> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret float %r
+}
+
+declare float @llvm.vp.reduce.fadd.nxv2f32(float, <vscale x 2 x float>, <vscale x 2 x i1>, i32)
+
+define float @vpreduce_fadd_nxv2f32(float %s, <vscale x 2 x float> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_fadd_nxv2f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, mu
+; CHECK-NEXT: vfmv.v.f v25, fa0
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu
+; CHECK-NEXT: vfredsum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vfmv.f.s fa0, v25
+; CHECK-NEXT: ret
+ %r = call reassoc float @llvm.vp.reduce.fadd.nxv2f32(float %s, <vscale x 2 x float> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret float %r
+}
+
+define float @vpreduce_ord_fadd_nxv2f32(float %s, <vscale x 2 x float> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_ord_fadd_nxv2f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, mu
+; CHECK-NEXT: vfmv.v.f v25, fa0
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu
+; CHECK-NEXT: vfredosum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vfmv.f.s fa0, v25
+; CHECK-NEXT: ret
+ %r = call float @llvm.vp.reduce.fadd.nxv2f32(float %s, <vscale x 2 x float> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret float %r
+}
+
+declare float @llvm.vp.reduce.fadd.nxv4f32(float, <vscale x 4 x float>, <vscale x 4 x i1>, i32)
+
+define float @vpreduce_fadd_nxv4f32(float %s, <vscale x 4 x float> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_fadd_nxv4f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, mu
+; CHECK-NEXT: vfmv.v.f v25, fa0
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, mu
+; CHECK-NEXT: vfredsum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vfmv.f.s fa0, v25
+; CHECK-NEXT: ret
+ %r = call reassoc float @llvm.vp.reduce.fadd.nxv4f32(float %s, <vscale x 4 x float> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret float %r
+}
+
+define float @vpreduce_ord_fadd_nxv4f32(float %s, <vscale x 4 x float> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_ord_fadd_nxv4f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, mu
+; CHECK-NEXT: vfmv.v.f v25, fa0
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, mu
+; CHECK-NEXT: vfredosum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vfmv.f.s fa0, v25
+; CHECK-NEXT: ret
+ %r = call float @llvm.vp.reduce.fadd.nxv4f32(float %s, <vscale x 4 x float> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret float %r
+}
+
+declare double @llvm.vp.reduce.fadd.nxv1f64(double, <vscale x 1 x double>, <vscale x 1 x i1>, i32)
+
+define double @vpreduce_fadd_nxv1f64(double %s, <vscale x 1 x double> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_fadd_nxv1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, mu
+; CHECK-NEXT: vfmv.v.f v25, fa0
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, mu
+; CHECK-NEXT: vfredsum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vfmv.f.s fa0, v25
+; CHECK-NEXT: ret
+ %r = call reassoc double @llvm.vp.reduce.fadd.nxv1f64(double %s, <vscale x 1 x double> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret double %r
+}
+
+define double @vpreduce_ord_fadd_nxv1f64(double %s, <vscale x 1 x double> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_ord_fadd_nxv1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, mu
+; CHECK-NEXT: vfmv.v.f v25, fa0
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, mu
+; CHECK-NEXT: vfredosum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vfmv.f.s fa0, v25
+; CHECK-NEXT: ret
+ %r = call double @llvm.vp.reduce.fadd.nxv1f64(double %s, <vscale x 1 x double> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret double %r
+}
+
+declare double @llvm.vp.reduce.fadd.nxv2f64(double, <vscale x 2 x double>, <vscale x 2 x i1>, i32)
+
+define double @vpreduce_fadd_nxv2f64(double %s, <vscale x 2 x double> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_fadd_nxv2f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, mu
+; CHECK-NEXT: vfmv.v.f v25, fa0
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, mu
+; CHECK-NEXT: vfredsum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vfmv.f.s fa0, v25
+; CHECK-NEXT: ret
+ %r = call reassoc double @llvm.vp.reduce.fadd.nxv2f64(double %s, <vscale x 2 x double> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret double %r
+}
+
+define double @vpreduce_ord_fadd_nxv2f64(double %s, <vscale x 2 x double> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_ord_fadd_nxv2f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, mu
+; CHECK-NEXT: vfmv.v.f v25, fa0
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, mu
+; CHECK-NEXT: vfredosum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vfmv.f.s fa0, v25
+; CHECK-NEXT: ret
+ %r = call double @llvm.vp.reduce.fadd.nxv2f64(double %s, <vscale x 2 x double> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret double %r
+}
+
+declare double @llvm.vp.reduce.fadd.nxv4f64(double, <vscale x 4 x double>, <vscale x 4 x i1>, i32)
+
+define double @vpreduce_fadd_nxv4f64(double %s, <vscale x 4 x double> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_fadd_nxv4f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, mu
+; CHECK-NEXT: vfmv.v.f v25, fa0
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, mu
+; CHECK-NEXT: vfredsum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vfmv.f.s fa0, v25
+; CHECK-NEXT: ret
+ %r = call reassoc double @llvm.vp.reduce.fadd.nxv4f64(double %s, <vscale x 4 x double> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret double %r
+}
+
+define double @vpreduce_ord_fadd_nxv4f64(double %s, <vscale x 4 x double> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_ord_fadd_nxv4f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, mu
+; CHECK-NEXT: vfmv.v.f v25, fa0
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, mu
+; CHECK-NEXT: vfredosum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vfmv.f.s fa0, v25
+; CHECK-NEXT: ret
+ %r = call double @llvm.vp.reduce.fadd.nxv4f64(double %s, <vscale x 4 x double> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret double %r
+}
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -verify-machineinstrs < %s \
+; RUN: | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -verify-machineinstrs < %s \
+; RUN: | FileCheck %s --check-prefixes=CHECK,RV64
+
+declare i8 @llvm.vp.reduce.add.nxv1i8(i8, <vscale x 1 x i8>, <vscale x 1 x i1>, i32)
+
+define signext i8 @vpreduce_add_nxv1i8(i8 signext %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_add_nxv1i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu
+; CHECK-NEXT: vredsum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.add.nxv1i8(i8 %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.umax.nxv1i8(i8, <vscale x 1 x i8>, <vscale x 1 x i1>, i32)
+
+define signext i8 @vpreduce_umax_nxv1i8(i8 signext %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_umax_nxv1i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andi a0, a0, 255
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu
+; CHECK-NEXT: vredmaxu.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.umax.nxv1i8(i8 %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.smax.nxv1i8(i8, <vscale x 1 x i8>, <vscale x 1 x i1>, i32)
+
+define signext i8 @vpreduce_smax_nxv1i8(i8 signext %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_smax_nxv1i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu
+; CHECK-NEXT: vredmax.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.smax.nxv1i8(i8 %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.umin.nxv1i8(i8, <vscale x 1 x i8>, <vscale x 1 x i1>, i32)
+
+define signext i8 @vpreduce_umin_nxv1i8(i8 signext %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_umin_nxv1i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andi a0, a0, 255
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu
+; CHECK-NEXT: vredminu.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.umin.nxv1i8(i8 %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.smin.nxv1i8(i8, <vscale x 1 x i8>, <vscale x 1 x i1>, i32)
+
+define signext i8 @vpreduce_smin_nxv1i8(i8 signext %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_smin_nxv1i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu
+; CHECK-NEXT: vredmin.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.smin.nxv1i8(i8 %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.and.nxv1i8(i8, <vscale x 1 x i8>, <vscale x 1 x i1>, i32)
+
+define signext i8 @vpreduce_and_nxv1i8(i8 signext %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_and_nxv1i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu
+; CHECK-NEXT: vredand.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.and.nxv1i8(i8 %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.or.nxv1i8(i8, <vscale x 1 x i8>, <vscale x 1 x i1>, i32)
+
+define signext i8 @vpreduce_or_nxv1i8(i8 signext %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_or_nxv1i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu
+; CHECK-NEXT: vredor.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.or.nxv1i8(i8 %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.xor.nxv1i8(i8, <vscale x 1 x i8>, <vscale x 1 x i1>, i32)
+
+define signext i8 @vpreduce_xor_nxv1i8(i8 signext %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_xor_nxv1i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu
+; CHECK-NEXT: vredxor.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.xor.nxv1i8(i8 %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.add.nxv2i8(i8, <vscale x 2 x i8>, <vscale x 2 x i1>, i32)
+
+define signext i8 @vpreduce_add_nxv2i8(i8 signext %s, <vscale x 2 x i8> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_add_nxv2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu
+; CHECK-NEXT: vredsum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.add.nxv2i8(i8 %s, <vscale x 2 x i8> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.umax.nxv2i8(i8, <vscale x 2 x i8>, <vscale x 2 x i1>, i32)
+
+define signext i8 @vpreduce_umax_nxv2i8(i8 signext %s, <vscale x 2 x i8> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_umax_nxv2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andi a0, a0, 255
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu
+; CHECK-NEXT: vredmaxu.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.umax.nxv2i8(i8 %s, <vscale x 2 x i8> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.smax.nxv2i8(i8, <vscale x 2 x i8>, <vscale x 2 x i1>, i32)
+
+define signext i8 @vpreduce_smax_nxv2i8(i8 signext %s, <vscale x 2 x i8> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_smax_nxv2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu
+; CHECK-NEXT: vredmax.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.smax.nxv2i8(i8 %s, <vscale x 2 x i8> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.umin.nxv2i8(i8, <vscale x 2 x i8>, <vscale x 2 x i1>, i32)
+
+define signext i8 @vpreduce_umin_nxv2i8(i8 signext %s, <vscale x 2 x i8> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_umin_nxv2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andi a0, a0, 255
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu
+; CHECK-NEXT: vredminu.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.umin.nxv2i8(i8 %s, <vscale x 2 x i8> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.smin.nxv2i8(i8, <vscale x 2 x i8>, <vscale x 2 x i1>, i32)
+
+define signext i8 @vpreduce_smin_nxv2i8(i8 signext %s, <vscale x 2 x i8> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_smin_nxv2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu
+; CHECK-NEXT: vredmin.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.smin.nxv2i8(i8 %s, <vscale x 2 x i8> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.and.nxv2i8(i8, <vscale x 2 x i8>, <vscale x 2 x i1>, i32)
+
+define signext i8 @vpreduce_and_nxv2i8(i8 signext %s, <vscale x 2 x i8> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_and_nxv2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu
+; CHECK-NEXT: vredand.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.and.nxv2i8(i8 %s, <vscale x 2 x i8> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.or.nxv2i8(i8, <vscale x 2 x i8>, <vscale x 2 x i1>, i32)
+
+define signext i8 @vpreduce_or_nxv2i8(i8 signext %s, <vscale x 2 x i8> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_or_nxv2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu
+; CHECK-NEXT: vredor.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.or.nxv2i8(i8 %s, <vscale x 2 x i8> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.xor.nxv2i8(i8, <vscale x 2 x i8>, <vscale x 2 x i1>, i32)
+
+define signext i8 @vpreduce_xor_nxv2i8(i8 signext %s, <vscale x 2 x i8> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_xor_nxv2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu
+; CHECK-NEXT: vredxor.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.xor.nxv2i8(i8 %s, <vscale x 2 x i8> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.add.nxv4i8(i8, <vscale x 4 x i8>, <vscale x 4 x i1>, i32)
+
+define signext i8 @vpreduce_add_nxv4i8(i8 signext %s, <vscale x 4 x i8> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_add_nxv4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, mu
+; CHECK-NEXT: vredsum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.add.nxv4i8(i8 %s, <vscale x 4 x i8> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.umax.nxv4i8(i8, <vscale x 4 x i8>, <vscale x 4 x i1>, i32)
+
+define signext i8 @vpreduce_umax_nxv4i8(i8 signext %s, <vscale x 4 x i8> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_umax_nxv4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andi a0, a0, 255
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, mu
+; CHECK-NEXT: vredmaxu.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.umax.nxv4i8(i8 %s, <vscale x 4 x i8> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.smax.nxv4i8(i8, <vscale x 4 x i8>, <vscale x 4 x i1>, i32)
+
+define signext i8 @vpreduce_smax_nxv4i8(i8 signext %s, <vscale x 4 x i8> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_smax_nxv4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, mu
+; CHECK-NEXT: vredmax.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.smax.nxv4i8(i8 %s, <vscale x 4 x i8> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.umin.nxv4i8(i8, <vscale x 4 x i8>, <vscale x 4 x i1>, i32)
+
+define signext i8 @vpreduce_umin_nxv4i8(i8 signext %s, <vscale x 4 x i8> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_umin_nxv4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andi a0, a0, 255
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, mu
+; CHECK-NEXT: vredminu.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.umin.nxv4i8(i8 %s, <vscale x 4 x i8> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.smin.nxv4i8(i8, <vscale x 4 x i8>, <vscale x 4 x i1>, i32)
+
+define signext i8 @vpreduce_smin_nxv4i8(i8 signext %s, <vscale x 4 x i8> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_smin_nxv4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, mu
+; CHECK-NEXT: vredmin.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.smin.nxv4i8(i8 %s, <vscale x 4 x i8> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.and.nxv4i8(i8, <vscale x 4 x i8>, <vscale x 4 x i1>, i32)
+
+define signext i8 @vpreduce_and_nxv4i8(i8 signext %s, <vscale x 4 x i8> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_and_nxv4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, mu
+; CHECK-NEXT: vredand.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.and.nxv4i8(i8 %s, <vscale x 4 x i8> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.or.nxv4i8(i8, <vscale x 4 x i8>, <vscale x 4 x i1>, i32)
+
+define signext i8 @vpreduce_or_nxv4i8(i8 signext %s, <vscale x 4 x i8> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_or_nxv4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, mu
+; CHECK-NEXT: vredor.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.or.nxv4i8(i8 %s, <vscale x 4 x i8> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i8 @llvm.vp.reduce.xor.nxv4i8(i8, <vscale x 4 x i8>, <vscale x 4 x i1>, i32)
+
+define signext i8 @vpreduce_xor_nxv4i8(i8 signext %s, <vscale x 4 x i8> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_xor_nxv4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, mu
+; CHECK-NEXT: vredxor.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i8 @llvm.vp.reduce.xor.nxv4i8(i8 %s, <vscale x 4 x i8> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret i8 %r
+}
+
+declare i16 @llvm.vp.reduce.add.nxv1i16(i16, <vscale x 1 x i16>, <vscale x 1 x i1>, i32)
+
+define signext i16 @vpreduce_add_nxv1i16(i16 signext %s, <vscale x 1 x i16> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_add_nxv1i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu
+; CHECK-NEXT: vredsum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.add.nxv1i16(i16 %s, <vscale x 1 x i16> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i16 @llvm.vp.reduce.umax.nxv1i16(i16, <vscale x 1 x i16>, <vscale x 1 x i1>, i32)
+
+define signext i16 @vpreduce_umax_nxv1i16(i16 signext %s, <vscale x 1 x i16> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_umax_nxv1i16:
+; RV32: # %bb.0:
+; RV32-NEXT: lui a2, 16
+; RV32-NEXT: addi a2, a2, -1
+; RV32-NEXT: and a0, a0, a2
+; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; RV32-NEXT: vmv.v.x v25, a0
+; RV32-NEXT: vsetvli zero, a1, e16, mf4, tu, mu
+; RV32-NEXT: vredmaxu.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umax_nxv1i16:
+; RV64: # %bb.0:
+; RV64-NEXT: lui a2, 16
+; RV64-NEXT: addiw a2, a2, -1
+; RV64-NEXT: and a0, a0, a2
+; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e16, mf4, tu, mu
+; RV64-NEXT: vredmaxu.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.umax.nxv1i16(i16 %s, <vscale x 1 x i16> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i16 @llvm.vp.reduce.smax.nxv1i16(i16, <vscale x 1 x i16>, <vscale x 1 x i1>, i32)
+
+define signext i16 @vpreduce_smax_nxv1i16(i16 signext %s, <vscale x 1 x i16> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_smax_nxv1i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu
+; CHECK-NEXT: vredmax.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.smax.nxv1i16(i16 %s, <vscale x 1 x i16> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i16 @llvm.vp.reduce.umin.nxv1i16(i16, <vscale x 1 x i16>, <vscale x 1 x i1>, i32)
+
+define signext i16 @vpreduce_umin_nxv1i16(i16 signext %s, <vscale x 1 x i16> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_umin_nxv1i16:
+; RV32: # %bb.0:
+; RV32-NEXT: lui a2, 16
+; RV32-NEXT: addi a2, a2, -1
+; RV32-NEXT: and a0, a0, a2
+; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; RV32-NEXT: vmv.v.x v25, a0
+; RV32-NEXT: vsetvli zero, a1, e16, mf4, tu, mu
+; RV32-NEXT: vredminu.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umin_nxv1i16:
+; RV64: # %bb.0:
+; RV64-NEXT: lui a2, 16
+; RV64-NEXT: addiw a2, a2, -1
+; RV64-NEXT: and a0, a0, a2
+; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e16, mf4, tu, mu
+; RV64-NEXT: vredminu.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.umin.nxv1i16(i16 %s, <vscale x 1 x i16> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i16 @llvm.vp.reduce.smin.nxv1i16(i16, <vscale x 1 x i16>, <vscale x 1 x i1>, i32)
+
+define signext i16 @vpreduce_smin_nxv1i16(i16 signext %s, <vscale x 1 x i16> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_smin_nxv1i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu
+; CHECK-NEXT: vredmin.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.smin.nxv1i16(i16 %s, <vscale x 1 x i16> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i16 @llvm.vp.reduce.and.nxv1i16(i16, <vscale x 1 x i16>, <vscale x 1 x i1>, i32)
+
+define signext i16 @vpreduce_and_nxv1i16(i16 signext %s, <vscale x 1 x i16> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_and_nxv1i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu
+; CHECK-NEXT: vredand.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.and.nxv1i16(i16 %s, <vscale x 1 x i16> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i16 @llvm.vp.reduce.or.nxv1i16(i16, <vscale x 1 x i16>, <vscale x 1 x i1>, i32)
+
+define signext i16 @vpreduce_or_nxv1i16(i16 signext %s, <vscale x 1 x i16> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_or_nxv1i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu
+; CHECK-NEXT: vredor.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.or.nxv1i16(i16 %s, <vscale x 1 x i16> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i16 @llvm.vp.reduce.xor.nxv1i16(i16, <vscale x 1 x i16>, <vscale x 1 x i1>, i32)
+
+define signext i16 @vpreduce_xor_nxv1i16(i16 signext %s, <vscale x 1 x i16> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_xor_nxv1i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu
+; CHECK-NEXT: vredxor.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.xor.nxv1i16(i16 %s, <vscale x 1 x i16> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i16 @llvm.vp.reduce.add.nxv2i16(i16, <vscale x 2 x i16>, <vscale x 2 x i1>, i32)
+
+define signext i16 @vpreduce_add_nxv2i16(i16 signext %s, <vscale x 2 x i16> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_add_nxv2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu
+; CHECK-NEXT: vredsum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.add.nxv2i16(i16 %s, <vscale x 2 x i16> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i16 @llvm.vp.reduce.umax.nxv2i16(i16, <vscale x 2 x i16>, <vscale x 2 x i1>, i32)
+
+define signext i16 @vpreduce_umax_nxv2i16(i16 signext %s, <vscale x 2 x i16> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_umax_nxv2i16:
+; RV32: # %bb.0:
+; RV32-NEXT: lui a2, 16
+; RV32-NEXT: addi a2, a2, -1
+; RV32-NEXT: and a0, a0, a2
+; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; RV32-NEXT: vmv.v.x v25, a0
+; RV32-NEXT: vsetvli zero, a1, e16, mf2, tu, mu
+; RV32-NEXT: vredmaxu.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umax_nxv2i16:
+; RV64: # %bb.0:
+; RV64-NEXT: lui a2, 16
+; RV64-NEXT: addiw a2, a2, -1
+; RV64-NEXT: and a0, a0, a2
+; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e16, mf2, tu, mu
+; RV64-NEXT: vredmaxu.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.umax.nxv2i16(i16 %s, <vscale x 2 x i16> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i16 @llvm.vp.reduce.smax.nxv2i16(i16, <vscale x 2 x i16>, <vscale x 2 x i1>, i32)
+
+define signext i16 @vpreduce_smax_nxv2i16(i16 signext %s, <vscale x 2 x i16> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_smax_nxv2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu
+; CHECK-NEXT: vredmax.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.smax.nxv2i16(i16 %s, <vscale x 2 x i16> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i16 @llvm.vp.reduce.umin.nxv2i16(i16, <vscale x 2 x i16>, <vscale x 2 x i1>, i32)
+
+define signext i16 @vpreduce_umin_nxv2i16(i16 signext %s, <vscale x 2 x i16> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_umin_nxv2i16:
+; RV32: # %bb.0:
+; RV32-NEXT: lui a2, 16
+; RV32-NEXT: addi a2, a2, -1
+; RV32-NEXT: and a0, a0, a2
+; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; RV32-NEXT: vmv.v.x v25, a0
+; RV32-NEXT: vsetvli zero, a1, e16, mf2, tu, mu
+; RV32-NEXT: vredminu.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umin_nxv2i16:
+; RV64: # %bb.0:
+; RV64-NEXT: lui a2, 16
+; RV64-NEXT: addiw a2, a2, -1
+; RV64-NEXT: and a0, a0, a2
+; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e16, mf2, tu, mu
+; RV64-NEXT: vredminu.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.umin.nxv2i16(i16 %s, <vscale x 2 x i16> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i16 @llvm.vp.reduce.smin.nxv2i16(i16, <vscale x 2 x i16>, <vscale x 2 x i1>, i32)
+
+define signext i16 @vpreduce_smin_nxv2i16(i16 signext %s, <vscale x 2 x i16> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_smin_nxv2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu
+; CHECK-NEXT: vredmin.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.smin.nxv2i16(i16 %s, <vscale x 2 x i16> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i16 @llvm.vp.reduce.and.nxv2i16(i16, <vscale x 2 x i16>, <vscale x 2 x i1>, i32)
+
+define signext i16 @vpreduce_and_nxv2i16(i16 signext %s, <vscale x 2 x i16> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_and_nxv2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu
+; CHECK-NEXT: vredand.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.and.nxv2i16(i16 %s, <vscale x 2 x i16> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i16 @llvm.vp.reduce.or.nxv2i16(i16, <vscale x 2 x i16>, <vscale x 2 x i1>, i32)
+
+define signext i16 @vpreduce_or_nxv2i16(i16 signext %s, <vscale x 2 x i16> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_or_nxv2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu
+; CHECK-NEXT: vredor.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.or.nxv2i16(i16 %s, <vscale x 2 x i16> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i16 @llvm.vp.reduce.xor.nxv2i16(i16, <vscale x 2 x i16>, <vscale x 2 x i1>, i32)
+
+define signext i16 @vpreduce_xor_nxv2i16(i16 signext %s, <vscale x 2 x i16> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_xor_nxv2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu
+; CHECK-NEXT: vredxor.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.xor.nxv2i16(i16 %s, <vscale x 2 x i16> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i16 @llvm.vp.reduce.add.nxv4i16(i16, <vscale x 4 x i16>, <vscale x 4 x i1>, i32)
+
+define signext i16 @vpreduce_add_nxv4i16(i16 signext %s, <vscale x 4 x i16> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_add_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, mu
+; CHECK-NEXT: vredsum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.add.nxv4i16(i16 %s, <vscale x 4 x i16> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i16 @llvm.vp.reduce.umax.nxv4i16(i16, <vscale x 4 x i16>, <vscale x 4 x i1>, i32)
+
+define signext i16 @vpreduce_umax_nxv4i16(i16 signext %s, <vscale x 4 x i16> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_umax_nxv4i16:
+; RV32: # %bb.0:
+; RV32-NEXT: lui a2, 16
+; RV32-NEXT: addi a2, a2, -1
+; RV32-NEXT: and a0, a0, a2
+; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; RV32-NEXT: vmv.v.x v25, a0
+; RV32-NEXT: vsetvli zero, a1, e16, m1, tu, mu
+; RV32-NEXT: vredmaxu.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umax_nxv4i16:
+; RV64: # %bb.0:
+; RV64-NEXT: lui a2, 16
+; RV64-NEXT: addiw a2, a2, -1
+; RV64-NEXT: and a0, a0, a2
+; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e16, m1, tu, mu
+; RV64-NEXT: vredmaxu.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.umax.nxv4i16(i16 %s, <vscale x 4 x i16> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i16 @llvm.vp.reduce.smax.nxv4i16(i16, <vscale x 4 x i16>, <vscale x 4 x i1>, i32)
+
+define signext i16 @vpreduce_smax_nxv4i16(i16 signext %s, <vscale x 4 x i16> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_smax_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, mu
+; CHECK-NEXT: vredmax.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.smax.nxv4i16(i16 %s, <vscale x 4 x i16> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i16 @llvm.vp.reduce.umin.nxv4i16(i16, <vscale x 4 x i16>, <vscale x 4 x i1>, i32)
+
+define signext i16 @vpreduce_umin_nxv4i16(i16 signext %s, <vscale x 4 x i16> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_umin_nxv4i16:
+; RV32: # %bb.0:
+; RV32-NEXT: lui a2, 16
+; RV32-NEXT: addi a2, a2, -1
+; RV32-NEXT: and a0, a0, a2
+; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; RV32-NEXT: vmv.v.x v25, a0
+; RV32-NEXT: vsetvli zero, a1, e16, m1, tu, mu
+; RV32-NEXT: vredminu.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umin_nxv4i16:
+; RV64: # %bb.0:
+; RV64-NEXT: lui a2, 16
+; RV64-NEXT: addiw a2, a2, -1
+; RV64-NEXT: and a0, a0, a2
+; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e16, m1, tu, mu
+; RV64-NEXT: vredminu.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.umin.nxv4i16(i16 %s, <vscale x 4 x i16> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i16 @llvm.vp.reduce.smin.nxv4i16(i16, <vscale x 4 x i16>, <vscale x 4 x i1>, i32)
+
+define signext i16 @vpreduce_smin_nxv4i16(i16 signext %s, <vscale x 4 x i16> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_smin_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, mu
+; CHECK-NEXT: vredmin.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.smin.nxv4i16(i16 %s, <vscale x 4 x i16> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i16 @llvm.vp.reduce.and.nxv4i16(i16, <vscale x 4 x i16>, <vscale x 4 x i1>, i32)
+
+define signext i16 @vpreduce_and_nxv4i16(i16 signext %s, <vscale x 4 x i16> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_and_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, mu
+; CHECK-NEXT: vredand.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.and.nxv4i16(i16 %s, <vscale x 4 x i16> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i16 @llvm.vp.reduce.or.nxv4i16(i16, <vscale x 4 x i16>, <vscale x 4 x i1>, i32)
+
+define signext i16 @vpreduce_or_nxv4i16(i16 signext %s, <vscale x 4 x i16> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_or_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, mu
+; CHECK-NEXT: vredor.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.or.nxv4i16(i16 %s, <vscale x 4 x i16> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i16 @llvm.vp.reduce.xor.nxv4i16(i16, <vscale x 4 x i16>, <vscale x 4 x i1>, i32)
+
+define signext i16 @vpreduce_xor_nxv4i16(i16 signext %s, <vscale x 4 x i16> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_xor_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, mu
+; CHECK-NEXT: vredxor.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i16 @llvm.vp.reduce.xor.nxv4i16(i16 %s, <vscale x 4 x i16> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret i16 %r
+}
+
+declare i32 @llvm.vp.reduce.add.nxv1i32(i32, <vscale x 1 x i32>, <vscale x 1 x i1>, i32)
+
+define signext i32 @vpreduce_add_nxv1i32(i32 signext %s, <vscale x 1 x i32> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_add_nxv1i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu
+; CHECK-NEXT: vredsum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.add.nxv1i32(i32 %s, <vscale x 1 x i32> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
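+; In the unsigned i32 tests below, RV64 zero-extends the start value with an
+; slli/srli pair by 32 before splatting it, so RV32 and RV64 diverge and use
+; separate check prefixes.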
+declare i32 @llvm.vp.reduce.umax.nxv1i32(i32, <vscale x 1 x i32>, <vscale x 1 x i1>, i32)
+
+define signext i32 @vpreduce_umax_nxv1i32(i32 signext %s, <vscale x 1 x i32> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_umax_nxv1i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; RV32-NEXT: vmv.v.x v25, a0
+; RV32-NEXT: vsetvli zero, a1, e32, mf2, tu, mu
+; RV32-NEXT: vredmaxu.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umax_nxv1i32:
+; RV64: # %bb.0:
+; RV64-NEXT: slli a0, a0, 32
+; RV64-NEXT: srli a0, a0, 32
+; RV64-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e32, mf2, tu, mu
+; RV64-NEXT: vredmaxu.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.umax.nxv1i32(i32 %s, <vscale x 1 x i32> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
+declare i32 @llvm.vp.reduce.smax.nxv1i32(i32, <vscale x 1 x i32>, <vscale x 1 x i1>, i32)
+
+define signext i32 @vpreduce_smax_nxv1i32(i32 signext %s, <vscale x 1 x i32> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_smax_nxv1i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu
+; CHECK-NEXT: vredmax.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.smax.nxv1i32(i32 %s, <vscale x 1 x i32> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
+declare i32 @llvm.vp.reduce.umin.nxv1i32(i32, <vscale x 1 x i32>, <vscale x 1 x i1>, i32)
+
+define signext i32 @vpreduce_umin_nxv1i32(i32 signext %s, <vscale x 1 x i32> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_umin_nxv1i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; RV32-NEXT: vmv.v.x v25, a0
+; RV32-NEXT: vsetvli zero, a1, e32, mf2, tu, mu
+; RV32-NEXT: vredminu.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umin_nxv1i32:
+; RV64: # %bb.0:
+; RV64-NEXT: slli a0, a0, 32
+; RV64-NEXT: srli a0, a0, 32
+; RV64-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e32, mf2, tu, mu
+; RV64-NEXT: vredminu.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.umin.nxv1i32(i32 %s, <vscale x 1 x i32> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
+declare i32 @llvm.vp.reduce.smin.nxv1i32(i32, <vscale x 1 x i32>, <vscale x 1 x i1>, i32)
+
+define signext i32 @vpreduce_smin_nxv1i32(i32 signext %s, <vscale x 1 x i32> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_smin_nxv1i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu
+; CHECK-NEXT: vredmin.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.smin.nxv1i32(i32 %s, <vscale x 1 x i32> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
+declare i32 @llvm.vp.reduce.and.nxv1i32(i32, <vscale x 1 x i32>, <vscale x 1 x i1>, i32)
+
+define signext i32 @vpreduce_and_nxv1i32(i32 signext %s, <vscale x 1 x i32> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_and_nxv1i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu
+; CHECK-NEXT: vredand.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.and.nxv1i32(i32 %s, <vscale x 1 x i32> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
+declare i32 @llvm.vp.reduce.or.nxv1i32(i32, <vscale x 1 x i32>, <vscale x 1 x i1>, i32)
+
+define signext i32 @vpreduce_or_nxv1i32(i32 signext %s, <vscale x 1 x i32> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_or_nxv1i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu
+; CHECK-NEXT: vredor.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.or.nxv1i32(i32 %s, <vscale x 1 x i32> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
+declare i32 @llvm.vp.reduce.xor.nxv1i32(i32, <vscale x 1 x i32>, <vscale x 1 x i1>, i32)
+
+define signext i32 @vpreduce_xor_nxv1i32(i32 signext %s, <vscale x 1 x i32> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_xor_nxv1i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu
+; CHECK-NEXT: vredxor.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.xor.nxv1i32(i32 %s, <vscale x 1 x i32> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
+declare i32 @llvm.vp.reduce.add.nxv2i32(i32, <vscale x 2 x i32>, <vscale x 2 x i1>, i32)
+
+define signext i32 @vpreduce_add_nxv2i32(i32 signext %s, <vscale x 2 x i32> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_add_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu
+; CHECK-NEXT: vredsum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.add.nxv2i32(i32 %s, <vscale x 2 x i32> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
+declare i32 @llvm.vp.reduce.umax.nxv2i32(i32, <vscale x 2 x i32>, <vscale x 2 x i1>, i32)
+
+define signext i32 @vpreduce_umax_nxv2i32(i32 signext %s, <vscale x 2 x i32> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_umax_nxv2i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; RV32-NEXT: vmv.v.x v25, a0
+; RV32-NEXT: vsetvli zero, a1, e32, m1, tu, mu
+; RV32-NEXT: vredmaxu.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umax_nxv2i32:
+; RV64: # %bb.0:
+; RV64-NEXT: slli a0, a0, 32
+; RV64-NEXT: srli a0, a0, 32
+; RV64-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e32, m1, tu, mu
+; RV64-NEXT: vredmaxu.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.umax.nxv2i32(i32 %s, <vscale x 2 x i32> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
+declare i32 @llvm.vp.reduce.smax.nxv2i32(i32, <vscale x 2 x i32>, <vscale x 2 x i1>, i32)
+
+define signext i32 @vpreduce_smax_nxv2i32(i32 signext %s, <vscale x 2 x i32> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_smax_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu
+; CHECK-NEXT: vredmax.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.smax.nxv2i32(i32 %s, <vscale x 2 x i32> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
+declare i32 @llvm.vp.reduce.umin.nxv2i32(i32, <vscale x 2 x i32>, <vscale x 2 x i1>, i32)
+
+define signext i32 @vpreduce_umin_nxv2i32(i32 signext %s, <vscale x 2 x i32> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_umin_nxv2i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; RV32-NEXT: vmv.v.x v25, a0
+; RV32-NEXT: vsetvli zero, a1, e32, m1, tu, mu
+; RV32-NEXT: vredminu.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umin_nxv2i32:
+; RV64: # %bb.0:
+; RV64-NEXT: slli a0, a0, 32
+; RV64-NEXT: srli a0, a0, 32
+; RV64-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e32, m1, tu, mu
+; RV64-NEXT: vredminu.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.umin.nxv2i32(i32 %s, <vscale x 2 x i32> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
+declare i32 @llvm.vp.reduce.smin.nxv2i32(i32, <vscale x 2 x i32>, <vscale x 2 x i1>, i32)
+
+define signext i32 @vpreduce_smin_nxv2i32(i32 signext %s, <vscale x 2 x i32> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_smin_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu
+; CHECK-NEXT: vredmin.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.smin.nxv2i32(i32 %s, <vscale x 2 x i32> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
+declare i32 @llvm.vp.reduce.and.nxv2i32(i32, <vscale x 2 x i32>, <vscale x 2 x i1>, i32)
+
+define signext i32 @vpreduce_and_nxv2i32(i32 signext %s, <vscale x 2 x i32> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_and_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu
+; CHECK-NEXT: vredand.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.and.nxv2i32(i32 %s, <vscale x 2 x i32> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
+declare i32 @llvm.vp.reduce.or.nxv2i32(i32, <vscale x 2 x i32>, <vscale x 2 x i1>, i32)
+
+define signext i32 @vpreduce_or_nxv2i32(i32 signext %s, <vscale x 2 x i32> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_or_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu
+; CHECK-NEXT: vredor.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.or.nxv2i32(i32 %s, <vscale x 2 x i32> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
+declare i32 @llvm.vp.reduce.xor.nxv2i32(i32, <vscale x 2 x i32>, <vscale x 2 x i1>, i32)
+
+define signext i32 @vpreduce_xor_nxv2i32(i32 signext %s, <vscale x 2 x i32> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_xor_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu
+; CHECK-NEXT: vredxor.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.xor.nxv2i32(i32 %s, <vscale x 2 x i32> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
+declare i32 @llvm.vp.reduce.add.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i1>, i32)
+
+define signext i32 @vpreduce_add_nxv4i32(i32 signext %s, <vscale x 4 x i32> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_add_nxv4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, tu, mu
+; CHECK-NEXT: vredsum.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.add.nxv4i32(i32 %s, <vscale x 4 x i32> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
+declare i32 @llvm.vp.reduce.umax.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i1>, i32)
+
+define signext i32 @vpreduce_umax_nxv4i32(i32 signext %s, <vscale x 4 x i32> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_umax_nxv4i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; RV32-NEXT: vmv.v.x v25, a0
+; RV32-NEXT: vsetvli zero, a1, e32, m2, tu, mu
+; RV32-NEXT: vredmaxu.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umax_nxv4i32:
+; RV64: # %bb.0:
+; RV64-NEXT: slli a0, a0, 32
+; RV64-NEXT: srli a0, a0, 32
+; RV64-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e32, m2, tu, mu
+; RV64-NEXT: vredmaxu.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.umax.nxv4i32(i32 %s, <vscale x 4 x i32> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
+declare i32 @llvm.vp.reduce.smax.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i1>, i32)
+
+define signext i32 @vpreduce_smax_nxv4i32(i32 signext %s, <vscale x 4 x i32> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_smax_nxv4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, tu, mu
+; CHECK-NEXT: vredmax.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.smax.nxv4i32(i32 %s, <vscale x 4 x i32> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
+declare i32 @llvm.vp.reduce.umin.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i1>, i32)
+
+define signext i32 @vpreduce_umin_nxv4i32(i32 signext %s, <vscale x 4 x i32> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_umin_nxv4i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; RV32-NEXT: vmv.v.x v25, a0
+; RV32-NEXT: vsetvli zero, a1, e32, m2, tu, mu
+; RV32-NEXT: vredminu.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umin_nxv4i32:
+; RV64: # %bb.0:
+; RV64-NEXT: slli a0, a0, 32
+; RV64-NEXT: srli a0, a0, 32
+; RV64-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e32, m2, tu, mu
+; RV64-NEXT: vredminu.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.umin.nxv4i32(i32 %s, <vscale x 4 x i32> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
+declare i32 @llvm.vp.reduce.smin.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i1>, i32)
+
+define signext i32 @vpreduce_smin_nxv4i32(i32 signext %s, <vscale x 4 x i32> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_smin_nxv4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, tu, mu
+; CHECK-NEXT: vredmin.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.smin.nxv4i32(i32 %s, <vscale x 4 x i32> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
+declare i32 @llvm.vp.reduce.and.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i1>, i32)
+
+define signext i32 @vpreduce_and_nxv4i32(i32 signext %s, <vscale x 4 x i32> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_and_nxv4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, tu, mu
+; CHECK-NEXT: vredand.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.and.nxv4i32(i32 %s, <vscale x 4 x i32> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
+declare i32 @llvm.vp.reduce.or.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i1>, i32)
+
+define signext i32 @vpreduce_or_nxv4i32(i32 signext %s, <vscale x 4 x i32> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_or_nxv4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, tu, mu
+; CHECK-NEXT: vredor.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.or.nxv4i32(i32 %s, <vscale x 4 x i32> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
+declare i32 @llvm.vp.reduce.xor.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i1>, i32)
+
+define signext i32 @vpreduce_xor_nxv4i32(i32 signext %s, <vscale x 4 x i32> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_xor_nxv4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; CHECK-NEXT: vmv.v.x v25, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, tu, mu
+; CHECK-NEXT: vredxor.vs v25, v8, v25, v0.t
+; CHECK-NEXT: vmv.x.s a0, v25
+; CHECK-NEXT: ret
+ %r = call i32 @llvm.vp.reduce.xor.nxv4i32(i32 %s, <vscale x 4 x i32> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret i32 %r
+}
+
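+; In the i64 tests below, RV32 receives the scalar start value in a register
+; pair, spills it to the stack, and splats it with a zero-strided vlse64.v;
+; the upper half of the result is recovered with a 32-bit vsrl.vx. RV64 splats
+; the start value directly with vmv.v.x.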
+declare i64 @llvm.vp.reduce.add.nxv1i64(i64, <vscale x 1 x i64>, <vscale x 1 x i1>, i32)
+
+define signext i64 @vpreduce_add_nxv1i64(i64 signext %s, <vscale x 1 x i64> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_add_nxv1i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m1, tu, mu
+; RV32-NEXT: vredsum.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_add_nxv1i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu
+; RV64-NEXT: vredsum.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.add.nxv1i64(i64 %s, <vscale x 1 x i64> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret i64 %r
+}
+
+declare i64 @llvm.vp.reduce.umax.nxv1i64(i64, <vscale x 1 x i64>, <vscale x 1 x i1>, i32)
+
+define signext i64 @vpreduce_umax_nxv1i64(i64 signext %s, <vscale x 1 x i64> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_umax_nxv1i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m1, tu, mu
+; RV32-NEXT: vredmaxu.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umax_nxv1i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu
+; RV64-NEXT: vredmaxu.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.umax.nxv1i64(i64 %s, <vscale x 1 x i64> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret i64 %r
+}
+
+declare i64 @llvm.vp.reduce.smax.nxv1i64(i64, <vscale x 1 x i64>, <vscale x 1 x i1>, i32)
+
+define signext i64 @vpreduce_smax_nxv1i64(i64 signext %s, <vscale x 1 x i64> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_smax_nxv1i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m1, tu, mu
+; RV32-NEXT: vredmax.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_smax_nxv1i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu
+; RV64-NEXT: vredmax.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.smax.nxv1i64(i64 %s, <vscale x 1 x i64> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret i64 %r
+}
+
+declare i64 @llvm.vp.reduce.umin.nxv1i64(i64, <vscale x 1 x i64>, <vscale x 1 x i1>, i32)
+
+define signext i64 @vpreduce_umin_nxv1i64(i64 signext %s, <vscale x 1 x i64> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_umin_nxv1i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m1, tu, mu
+; RV32-NEXT: vredminu.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umin_nxv1i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu
+; RV64-NEXT: vredminu.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.umin.nxv1i64(i64 %s, <vscale x 1 x i64> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret i64 %r
+}
+
+declare i64 @llvm.vp.reduce.smin.nxv1i64(i64, <vscale x 1 x i64>, <vscale x 1 x i1>, i32)
+
+define signext i64 @vpreduce_smin_nxv1i64(i64 signext %s, <vscale x 1 x i64> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_smin_nxv1i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m1, tu, mu
+; RV32-NEXT: vredmin.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_smin_nxv1i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu
+; RV64-NEXT: vredmin.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.smin.nxv1i64(i64 %s, <vscale x 1 x i64> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret i64 %r
+}
+
+declare i64 @llvm.vp.reduce.and.nxv1i64(i64, <vscale x 1 x i64>, <vscale x 1 x i1>, i32)
+
+define signext i64 @vpreduce_and_nxv1i64(i64 signext %s, <vscale x 1 x i64> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_and_nxv1i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m1, tu, mu
+; RV32-NEXT: vredand.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_and_nxv1i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu
+; RV64-NEXT: vredand.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.and.nxv1i64(i64 %s, <vscale x 1 x i64> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret i64 %r
+}
+
+declare i64 @llvm.vp.reduce.or.nxv1i64(i64, <vscale x 1 x i64>, <vscale x 1 x i1>, i32)
+
+define signext i64 @vpreduce_or_nxv1i64(i64 signext %s, <vscale x 1 x i64> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_or_nxv1i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m1, tu, mu
+; RV32-NEXT: vredor.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_or_nxv1i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu
+; RV64-NEXT: vredor.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.or.nxv1i64(i64 %s, <vscale x 1 x i64> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret i64 %r
+}
+
+declare i64 @llvm.vp.reduce.xor.nxv1i64(i64, <vscale x 1 x i64>, <vscale x 1 x i1>, i32)
+
+define signext i64 @vpreduce_xor_nxv1i64(i64 signext %s, <vscale x 1 x i64> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_xor_nxv1i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m1, tu, mu
+; RV32-NEXT: vredxor.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_xor_nxv1i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu
+; RV64-NEXT: vredxor.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.xor.nxv1i64(i64 %s, <vscale x 1 x i64> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret i64 %r
+}
+
+declare i64 @llvm.vp.reduce.add.nxv2i64(i64, <vscale x 2 x i64>, <vscale x 2 x i1>, i32)
+
+define signext i64 @vpreduce_add_nxv2i64(i64 signext %s, <vscale x 2 x i64> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_add_nxv2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m2, tu, mu
+; RV32-NEXT: vredsum.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_add_nxv2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu
+; RV64-NEXT: vredsum.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.add.nxv2i64(i64 %s, <vscale x 2 x i64> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret i64 %r
+}
+
+declare i64 @llvm.vp.reduce.umax.nxv2i64(i64, <vscale x 2 x i64>, <vscale x 2 x i1>, i32)
+
+define signext i64 @vpreduce_umax_nxv2i64(i64 signext %s, <vscale x 2 x i64> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_umax_nxv2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m2, tu, mu
+; RV32-NEXT: vredmaxu.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umax_nxv2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu
+; RV64-NEXT: vredmaxu.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.umax.nxv2i64(i64 %s, <vscale x 2 x i64> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret i64 %r
+}
+
+declare i64 @llvm.vp.reduce.smax.nxv2i64(i64, <vscale x 2 x i64>, <vscale x 2 x i1>, i32)
+
+define signext i64 @vpreduce_smax_nxv2i64(i64 signext %s, <vscale x 2 x i64> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_smax_nxv2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m2, tu, mu
+; RV32-NEXT: vredmax.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_smax_nxv2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu
+; RV64-NEXT: vredmax.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.smax.nxv2i64(i64 %s, <vscale x 2 x i64> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret i64 %r
+}
+
+declare i64 @llvm.vp.reduce.umin.nxv2i64(i64, <vscale x 2 x i64>, <vscale x 2 x i1>, i32)
+
+define signext i64 @vpreduce_umin_nxv2i64(i64 signext %s, <vscale x 2 x i64> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_umin_nxv2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m2, tu, mu
+; RV32-NEXT: vredminu.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umin_nxv2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu
+; RV64-NEXT: vredminu.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.umin.nxv2i64(i64 %s, <vscale x 2 x i64> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret i64 %r
+}
+
+declare i64 @llvm.vp.reduce.smin.nxv2i64(i64, <vscale x 2 x i64>, <vscale x 2 x i1>, i32)
+
+define signext i64 @vpreduce_smin_nxv2i64(i64 signext %s, <vscale x 2 x i64> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_smin_nxv2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m2, tu, mu
+; RV32-NEXT: vredmin.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_smin_nxv2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu
+; RV64-NEXT: vredmin.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.smin.nxv2i64(i64 %s, <vscale x 2 x i64> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret i64 %r
+}
+
+declare i64 @llvm.vp.reduce.and.nxv2i64(i64, <vscale x 2 x i64>, <vscale x 2 x i1>, i32)
+
+define signext i64 @vpreduce_and_nxv2i64(i64 signext %s, <vscale x 2 x i64> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_and_nxv2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m2, tu, mu
+; RV32-NEXT: vredand.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_and_nxv2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu
+; RV64-NEXT: vredand.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.and.nxv2i64(i64 %s, <vscale x 2 x i64> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret i64 %r
+}
+
+declare i64 @llvm.vp.reduce.or.nxv2i64(i64, <vscale x 2 x i64>, <vscale x 2 x i1>, i32)
+
+define signext i64 @vpreduce_or_nxv2i64(i64 signext %s, <vscale x 2 x i64> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_or_nxv2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m2, tu, mu
+; RV32-NEXT: vredor.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_or_nxv2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu
+; RV64-NEXT: vredor.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.or.nxv2i64(i64 %s, <vscale x 2 x i64> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret i64 %r
+}
+
+declare i64 @llvm.vp.reduce.xor.nxv2i64(i64, <vscale x 2 x i64>, <vscale x 2 x i1>, i32)
+
+define signext i64 @vpreduce_xor_nxv2i64(i64 signext %s, <vscale x 2 x i64> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_xor_nxv2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m2, tu, mu
+; RV32-NEXT: vredxor.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_xor_nxv2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu
+; RV64-NEXT: vredxor.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.xor.nxv2i64(i64 %s, <vscale x 2 x i64> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret i64 %r
+}
+
+declare i64 @llvm.vp.reduce.add.nxv4i64(i64, <vscale x 4 x i64>, <vscale x 4 x i1>, i32)
+
+define signext i64 @vpreduce_add_nxv4i64(i64 signext %s, <vscale x 4 x i64> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_add_nxv4i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m4, tu, mu
+; RV32-NEXT: vredsum.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_add_nxv4i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m4, tu, mu
+; RV64-NEXT: vredsum.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.add.nxv4i64(i64 %s, <vscale x 4 x i64> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret i64 %r
+}
+
+declare i64 @llvm.vp.reduce.umax.nxv4i64(i64, <vscale x 4 x i64>, <vscale x 4 x i1>, i32)
+
+define signext i64 @vpreduce_umax_nxv4i64(i64 signext %s, <vscale x 4 x i64> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_umax_nxv4i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m4, tu, mu
+; RV32-NEXT: vredmaxu.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umax_nxv4i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m4, tu, mu
+; RV64-NEXT: vredmaxu.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.umax.nxv4i64(i64 %s, <vscale x 4 x i64> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret i64 %r
+}
+
+declare i64 @llvm.vp.reduce.smax.nxv4i64(i64, <vscale x 4 x i64>, <vscale x 4 x i1>, i32)
+
+define signext i64 @vpreduce_smax_nxv4i64(i64 signext %s, <vscale x 4 x i64> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_smax_nxv4i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m4, tu, mu
+; RV32-NEXT: vredmax.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_smax_nxv4i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m4, tu, mu
+; RV64-NEXT: vredmax.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.smax.nxv4i64(i64 %s, <vscale x 4 x i64> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret i64 %r
+}
+
+declare i64 @llvm.vp.reduce.umin.nxv4i64(i64, <vscale x 4 x i64>, <vscale x 4 x i1>, i32)
+
+define signext i64 @vpreduce_umin_nxv4i64(i64 signext %s, <vscale x 4 x i64> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_umin_nxv4i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m4, tu, mu
+; RV32-NEXT: vredminu.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_umin_nxv4i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m4, tu, mu
+; RV64-NEXT: vredminu.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.umin.nxv4i64(i64 %s, <vscale x 4 x i64> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret i64 %r
+}
+
+declare i64 @llvm.vp.reduce.smin.nxv4i64(i64, <vscale x 4 x i64>, <vscale x 4 x i1>, i32)
+
+define signext i64 @vpreduce_smin_nxv4i64(i64 signext %s, <vscale x 4 x i64> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_smin_nxv4i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m4, tu, mu
+; RV32-NEXT: vredmin.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_smin_nxv4i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m4, tu, mu
+; RV64-NEXT: vredmin.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.smin.nxv4i64(i64 %s, <vscale x 4 x i64> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret i64 %r
+}
+
+declare i64 @llvm.vp.reduce.and.nxv4i64(i64, <vscale x 4 x i64>, <vscale x 4 x i1>, i32)
+
+define signext i64 @vpreduce_and_nxv4i64(i64 signext %s, <vscale x 4 x i64> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_and_nxv4i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m4, tu, mu
+; RV32-NEXT: vredand.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_and_nxv4i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m4, tu, mu
+; RV64-NEXT: vredand.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.and.nxv4i64(i64 %s, <vscale x 4 x i64> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret i64 %r
+}
+
+declare i64 @llvm.vp.reduce.or.nxv4i64(i64, <vscale x 4 x i64>, <vscale x 4 x i1>, i32)
+
+define signext i64 @vpreduce_or_nxv4i64(i64 signext %s, <vscale x 4 x i64> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_or_nxv4i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m4, tu, mu
+; RV32-NEXT: vredor.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_or_nxv4i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m4, tu, mu
+; RV64-NEXT: vredor.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.or.nxv4i64(i64 %s, <vscale x 4 x i64> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret i64 %r
+}
+
+declare i64 @llvm.vp.reduce.xor.nxv4i64(i64, <vscale x 4 x i64>, <vscale x 4 x i1>, i32)
+
+define signext i64 @vpreduce_xor_nxv4i64(i64 signext %s, <vscale x 4 x i64> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpreduce_xor_nxv4i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v25, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m4, tu, mu
+; RV32-NEXT: vredxor.vs v25, v8, v25, v0.t
+; RV32-NEXT: vmv.x.s a0, v25
+; RV32-NEXT: addi a1, zero, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vsrl.vx v25, v25, a1
+; RV32-NEXT: vmv.x.s a1, v25
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpreduce_xor_nxv4i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT: vmv.v.x v25, a0
+; RV64-NEXT: vsetvli zero, a1, e64, m4, tu, mu
+; RV64-NEXT: vredxor.vs v25, v8, v25, v0.t
+; RV64-NEXT: vmv.x.s a0, v25
+; RV64-NEXT: ret
+ %r = call i64 @llvm.vp.reduce.xor.nxv4i64(i64 %s, <vscale x 4 x i64> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret i64 %r
+}
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -verify-machineinstrs < %s | FileCheck %s
+
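+; Mask (i1) reductions have no dedicated vector reduction instruction; as the
+; CHECK lines below show, they are lowered via vpopc.m. An AND reduction
+; inverts the operand mask (vmnand.mm) and tests that no active lane of the
+; inverted mask is set (seqz); OR tests whether any active lane of the mask is
+; set (snez); XOR takes the parity of the active-lane count. Each result is
+; combined with the start value, and the trailing neg (after an andi where
+; needed) sign-extends the i1 result to match the signext return attribute.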
+declare i1 @llvm.vp.reduce.and.nxv1i1(i1, <vscale x 1 x i1>, <vscale x 1 x i1>, i32)
+
+define signext i1 @vpreduce_and_nxv1i1(i1 signext %s, <vscale x 1 x i1> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_and_nxv1i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu
+; CHECK-NEXT: vmnand.mm v25, v0, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vpopc.m a1, v25, v0.t
+; CHECK-NEXT: seqz a1, a1
+; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: ret
+ %r = call i1 @llvm.vp.reduce.and.nxv1i1(i1 %s, <vscale x 1 x i1> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret i1 %r
+}
+
+declare i1 @llvm.vp.reduce.or.nxv1i1(i1, <vscale x 1 x i1>, <vscale x 1 x i1>, i32)
+
+define signext i1 @vpreduce_or_nxv1i1(i1 signext %s, <vscale x 1 x i1> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_or_nxv1i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v25, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vpopc.m a1, v25, v0.t
+; CHECK-NEXT: snez a1, a1
+; CHECK-NEXT: or a0, a1, a0
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: ret
+ %r = call i1 @llvm.vp.reduce.or.nxv1i1(i1 %s, <vscale x 1 x i1> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret i1 %r
+}
+
+declare i1 @llvm.vp.reduce.xor.nxv1i1(i1, <vscale x 1 x i1>, <vscale x 1 x i1>, i32)
+
+define signext i1 @vpreduce_xor_nxv1i1(i1 signext %s, <vscale x 1 x i1> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_xor_nxv1i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v25, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vpopc.m a1, v25, v0.t
+; CHECK-NEXT: xor a0, a1, a0
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: ret
+ %r = call i1 @llvm.vp.reduce.xor.nxv1i1(i1 %s, <vscale x 1 x i1> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret i1 %r
+}
+
+declare i1 @llvm.vp.reduce.and.nxv2i1(i1, <vscale x 2 x i1>, <vscale x 2 x i1>, i32)
+
+define signext i1 @vpreduce_and_nxv2i1(i1 signext %s, <vscale x 2 x i1> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_and_nxv2i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu
+; CHECK-NEXT: vmnand.mm v25, v0, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vpopc.m a1, v25, v0.t
+; CHECK-NEXT: seqz a1, a1
+; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: ret
+ %r = call i1 @llvm.vp.reduce.and.nxv2i1(i1 %s, <vscale x 2 x i1> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret i1 %r
+}
+
+declare i1 @llvm.vp.reduce.or.nxv2i1(i1, <vscale x 2 x i1>, <vscale x 2 x i1>, i32)
+
+define signext i1 @vpreduce_or_nxv2i1(i1 signext %s, <vscale x 2 x i1> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_or_nxv2i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v25, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vpopc.m a1, v25, v0.t
+; CHECK-NEXT: snez a1, a1
+; CHECK-NEXT: or a0, a1, a0
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: ret
+ %r = call i1 @llvm.vp.reduce.or.nxv2i1(i1 %s, <vscale x 2 x i1> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret i1 %r
+}
+
+declare i1 @llvm.vp.reduce.xor.nxv2i1(i1, <vscale x 2 x i1>, <vscale x 2 x i1>, i32)
+
+define signext i1 @vpreduce_xor_nxv2i1(i1 signext %s, <vscale x 2 x i1> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_xor_nxv2i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v25, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vpopc.m a1, v25, v0.t
+; CHECK-NEXT: xor a0, a1, a0
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: ret
+ %r = call i1 @llvm.vp.reduce.xor.nxv2i1(i1 %s, <vscale x 2 x i1> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret i1 %r
+}
+
+declare i1 @llvm.vp.reduce.and.nxv4i1(i1, <vscale x 4 x i1>, <vscale x 4 x i1>, i32)
+
+define signext i1 @vpreduce_and_nxv4i1(i1 signext %s, <vscale x 4 x i1> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_and_nxv4i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu
+; CHECK-NEXT: vmnand.mm v25, v0, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vpopc.m a1, v25, v0.t
+; CHECK-NEXT: seqz a1, a1
+; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: ret
+ %r = call i1 @llvm.vp.reduce.and.nxv4i1(i1 %s, <vscale x 4 x i1> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret i1 %r
+}
+
+declare i1 @llvm.vp.reduce.or.nxv4i1(i1, <vscale x 4 x i1>, <vscale x 4 x i1>, i32)
+
+define signext i1 @vpreduce_or_nxv4i1(i1 signext %s, <vscale x 4 x i1> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_or_nxv4i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v25, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vpopc.m a1, v25, v0.t
+; CHECK-NEXT: snez a1, a1
+; CHECK-NEXT: or a0, a1, a0
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: ret
+ %r = call i1 @llvm.vp.reduce.or.nxv4i1(i1 %s, <vscale x 4 x i1> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret i1 %r
+}
+
+declare i1 @llvm.vp.reduce.xor.nxv4i1(i1, <vscale x 4 x i1>, <vscale x 4 x i1>, i32)
+
+define signext i1 @vpreduce_xor_nxv4i1(i1 signext %s, <vscale x 4 x i1> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_xor_nxv4i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v25, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vpopc.m a1, v25, v0.t
+; CHECK-NEXT: xor a0, a1, a0
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: ret
+ %r = call i1 @llvm.vp.reduce.xor.nxv4i1(i1 %s, <vscale x 4 x i1> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret i1 %r
+}
+
+declare i1 @llvm.vp.reduce.and.nxv8i1(i1, <vscale x 8 x i1>, <vscale x 8 x i1>, i32)
+
+define signext i1 @vpreduce_and_nxv8i1(i1 signext %s, <vscale x 8 x i1> %v, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_and_nxv8i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu
+; CHECK-NEXT: vmnand.mm v25, v0, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vpopc.m a1, v25, v0.t
+; CHECK-NEXT: seqz a1, a1
+; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: ret
+ %r = call i1 @llvm.vp.reduce.and.nxv8i1(i1 %s, <vscale x 8 x i1> %v, <vscale x 8 x i1> %m, i32 %evl)
+ ret i1 %r
+}
+
+declare i1 @llvm.vp.reduce.or.nxv8i1(i1, <vscale x 8 x i1>, <vscale x 8 x i1>, i32)
+
+define signext i1 @vpreduce_or_nxv8i1(i1 signext %s, <vscale x 8 x i1> %v, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_or_nxv8i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v25, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vpopc.m a1, v25, v0.t
+; CHECK-NEXT: snez a1, a1
+; CHECK-NEXT: or a0, a1, a0
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: ret
+ %r = call i1 @llvm.vp.reduce.or.nxv8i1(i1 %s, <vscale x 8 x i1> %v, <vscale x 8 x i1> %m, i32 %evl)
+ ret i1 %r
+}
+
+declare i1 @llvm.vp.reduce.xor.nxv8i1(i1, <vscale x 8 x i1>, <vscale x 8 x i1>, i32)
+
+define signext i1 @vpreduce_xor_nxv8i1(i1 signext %s, <vscale x 8 x i1> %v, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_xor_nxv8i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v25, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vpopc.m a1, v25, v0.t
+; CHECK-NEXT: xor a0, a1, a0
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: ret
+ %r = call i1 @llvm.vp.reduce.xor.nxv8i1(i1 %s, <vscale x 8 x i1> %v, <vscale x 8 x i1> %m, i32 %evl)
+ ret i1 %r
+}
+
+declare i1 @llvm.vp.reduce.and.nxv16i1(i1, <vscale x 16 x i1>, <vscale x 16 x i1>, i32)
+
+define signext i1 @vpreduce_and_nxv16i1(i1 signext %s, <vscale x 16 x i1> %v, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_and_nxv16i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu
+; CHECK-NEXT: vmnand.mm v25, v0, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vpopc.m a1, v25, v0.t
+; CHECK-NEXT: seqz a1, a1
+; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: ret
+ %r = call i1 @llvm.vp.reduce.and.nxv16i1(i1 %s, <vscale x 16 x i1> %v, <vscale x 16 x i1> %m, i32 %evl)
+ ret i1 %r
+}
+
+declare i1 @llvm.vp.reduce.or.nxv16i1(i1, <vscale x 16 x i1>, <vscale x 16 x i1>, i32)
+
+define signext i1 @vpreduce_or_nxv16i1(i1 signext %s, <vscale x 16 x i1> %v, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_or_nxv16i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v25, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vpopc.m a1, v25, v0.t
+; CHECK-NEXT: snez a1, a1
+; CHECK-NEXT: or a0, a1, a0
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: ret
+ %r = call i1 @llvm.vp.reduce.or.nxv16i1(i1 %s, <vscale x 16 x i1> %v, <vscale x 16 x i1> %m, i32 %evl)
+ ret i1 %r
+}
+
+declare i1 @llvm.vp.reduce.xor.nxv16i1(i1, <vscale x 16 x i1>, <vscale x 16 x i1>, i32)
+
+define signext i1 @vpreduce_xor_nxv16i1(i1 signext %s, <vscale x 16 x i1> %v, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_xor_nxv16i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v25, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vpopc.m a1, v25, v0.t
+; CHECK-NEXT: xor a0, a1, a0
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: ret
+ %r = call i1 @llvm.vp.reduce.xor.nxv16i1(i1 %s, <vscale x 16 x i1> %v, <vscale x 16 x i1> %m, i32 %evl)
+ ret i1 %r
+}
+
+declare i1 @llvm.vp.reduce.and.nxv32i1(i1, <vscale x 32 x i1>, <vscale x 32 x i1>, i32)
+
+define signext i1 @vpreduce_and_nxv32i1(i1 signext %s, <vscale x 32 x i1> %v, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_and_nxv32i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu
+; CHECK-NEXT: vmnand.mm v25, v0, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vpopc.m a1, v25, v0.t
+; CHECK-NEXT: seqz a1, a1
+; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: ret
+ %r = call i1 @llvm.vp.reduce.and.nxv32i1(i1 %s, <vscale x 32 x i1> %v, <vscale x 32 x i1> %m, i32 %evl)
+ ret i1 %r
+}
+
+declare i1 @llvm.vp.reduce.or.nxv32i1(i1, <vscale x 32 x i1>, <vscale x 32 x i1>, i32)
+
+define signext i1 @vpreduce_or_nxv32i1(i1 signext %s, <vscale x 32 x i1> %v, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_or_nxv32i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v25, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vpopc.m a1, v25, v0.t
+; CHECK-NEXT: snez a1, a1
+; CHECK-NEXT: or a0, a1, a0
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: ret
+ %r = call i1 @llvm.vp.reduce.or.nxv32i1(i1 %s, <vscale x 32 x i1> %v, <vscale x 32 x i1> %m, i32 %evl)
+ ret i1 %r
+}
+
+declare i1 @llvm.vp.reduce.xor.nxv32i1(i1, <vscale x 32 x i1>, <vscale x 32 x i1>, i32)
+
+define signext i1 @vpreduce_xor_nxv32i1(i1 signext %s, <vscale x 32 x i1> %v, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_xor_nxv32i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v25, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vpopc.m a1, v25, v0.t
+; CHECK-NEXT: xor a0, a1, a0
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: ret
+ %r = call i1 @llvm.vp.reduce.xor.nxv32i1(i1 %s, <vscale x 32 x i1> %v, <vscale x 32 x i1> %m, i32 %evl)
+ ret i1 %r
+}
+
+declare i1 @llvm.vp.reduce.and.nxv64i1(i1, <vscale x 64 x i1>, <vscale x 64 x i1>, i32)
+
+define signext i1 @vpreduce_and_nxv64i1(i1 signext %s, <vscale x 64 x i1> %v, <vscale x 64 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_and_nxv64i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu
+; CHECK-NEXT: vmnand.mm v25, v0, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vpopc.m a1, v25, v0.t
+; CHECK-NEXT: seqz a1, a1
+; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: ret
+ %r = call i1 @llvm.vp.reduce.and.nxv64i1(i1 %s, <vscale x 64 x i1> %v, <vscale x 64 x i1> %m, i32 %evl)
+ ret i1 %r
+}
+
+declare i1 @llvm.vp.reduce.or.nxv64i1(i1, <vscale x 64 x i1>, <vscale x 64 x i1>, i32)
+
+define signext i1 @vpreduce_or_nxv64i1(i1 signext %s, <vscale x 64 x i1> %v, <vscale x 64 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_or_nxv64i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v25, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vpopc.m a1, v25, v0.t
+; CHECK-NEXT: snez a1, a1
+; CHECK-NEXT: or a0, a1, a0
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: ret
+ %r = call i1 @llvm.vp.reduce.or.nxv64i1(i1 %s, <vscale x 64 x i1> %v, <vscale x 64 x i1> %m, i32 %evl)
+ ret i1 %r
+}
+
+declare i1 @llvm.vp.reduce.xor.nxv64i1(i1, <vscale x 64 x i1>, <vscale x 64 x i1>, i32)
+
+define signext i1 @vpreduce_xor_nxv64i1(i1 signext %s, <vscale x 64 x i1> %v, <vscale x 64 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_xor_nxv64i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v25, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vpopc.m a1, v25, v0.t
+; CHECK-NEXT: xor a0, a1, a0
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: ret
+ %r = call i1 @llvm.vp.reduce.xor.nxv64i1(i1 %s, <vscale x 64 x i1> %v, <vscale x 64 x i1> %m, i32 %evl)
+ ret i1 %r
+}