return false;
}
+/// \returns Whether \p Opcode is one of the VVP reduction opcodes registered
+/// through ADD_REDUCE_VVP_OP in VVPNodes.def.
+bool isVVPReductionOp(unsigned Opcode) {
+  switch (Opcode) {
+#define ADD_REDUCE_VVP_OP(VVP_NAME, SDNAME) case VEISD::VVP_NAME:
+#include "VVPNodes.def"
+    return true;
+  }
+  return false;
+}
+
+
// Return the AVL operand position for this VVP or VEC Op.
Optional<int> getAVLPos(unsigned Opc) {
// This is only available for VP SDNodes
}
// Translate to VVP where possible.
+ unsigned OriginalOC = OC;
if (auto VVPOpc = getVVPOpcode(OC))
OC = *VVPOpc;
+ if (isVVPReductionOp(OC))
+ return Op->getOperand(hasReductionStartParam(OriginalOC) ? 1 : 0)
+ .getValueType();
+
switch (OC) {
default:
case VEISD::VVP_SETCC:
return SDValue();
}
+/// \returns Whether the reduction SDNode opcode \p OPC carries an explicit
+/// start (initial value) operand. Currently only the VP reduction opcodes do;
+/// VECREDUCE_* and the VVP reductions take the vector operand first.
+bool hasReductionStartParam(unsigned OPC) {
+  // TODO: Ordered reduction opcodes.
+  // Collapse the redundant `if (X) return true; return false;` pattern.
+  return ISD::isVPReduction(OPC);
+}
+
+
+/// \returns The scalar ISD opcode that combines two elements for the VVP
+/// reduction opcode \p VVPOC (e.g. VVP_REDUCE_ADD -> ISD::ADD), as registered
+/// through HANDLE_VVP_REDUCE_TO_SCALAR in VVPNodes.def.
+/// Mask (i1 vector) reductions are not implemented yet.
+unsigned getScalarReductionOpcode(unsigned VVPOC, bool IsMask) {
+  assert(!IsMask && "Mask reduction isel");
+
+  switch (VVPOC) {
+#define HANDLE_VVP_REDUCE_TO_SCALAR(VVP_RED_ISD, REDUCE_ISD)                   \
+  case VEISD::VVP_RED_ISD:                                                     \
+    return ISD::REDUCE_ISD;
+#include "VVPNodes.def"
+  default:
+    break;
+  }
+  // Fix double-negative diagnostic ("Cannot not scalarize").
+  llvm_unreachable("Cannot scalarize this reduction Opcode!");
+}
+
/// } Node Properties
SDValue getNodeAVL(SDValue Op) {
return ResPtr;
}
+/// Build a legal VVP reduction node, folding the start value in with a scalar
+/// op when the VVP opcode has no start operand.
+///
+/// \p VVPOpcode The VVP reduction opcode.
+/// \p ResVT     The scalar result type of the reduction.
+/// \p StartV    Optional start (initial) value; may be an empty SDValue.
+/// \p VectorV   The vector operand being reduced.
+/// \p Mask      The element mask.
+/// \p AVL       The active vector length.
+SDValue VECustomDAG::getLegalReductionOpVVP(unsigned VVPOpcode, EVT ResVT,
+                                            SDValue StartV, SDValue VectorV,
+                                            SDValue Mask, SDValue AVL,
+                                            SDNodeFlags Flags) const {
+
+  // Attach the start param with a scalar op where the VVP opcode does not
+  // support it as an operand.
+  bool ScalarizeStartParam = StartV && !hasReductionStartParam(VVPOpcode);
+  bool IsMaskReduction = isMaskType(VectorV.getValueType());
+  assert(!IsMaskReduction && "TODO Implement");
+  auto AttachStartValue = [&](SDValue ReductionResV) {
+    if (!ScalarizeStartParam)
+      return ReductionResV;
+    auto ScalarOC = getScalarReductionOpcode(VVPOpcode, IsMaskReduction);
+    return getNode(ScalarOC, ResVT, {StartV, ReductionResV});
+  };
+
+  // Pass the start value as the first operand where supported; otherwise
+  // AttachStartValue folds it into the scalar result afterwards.
+  if (!ScalarizeStartParam && StartV) {
+    assert(hasReductionStartParam(VVPOpcode));
+    return AttachStartValue(
+        getNode(VVPOpcode, ResVT, {StartV, VectorV, Mask, AVL}, Flags));
+  }
+  return AttachStartValue(
+      getNode(VVPOpcode, ResVT, {VectorV, Mask, AVL}, Flags));
+}
+
+
} // namespace llvm
Optional<unsigned> getVVPOpcode(unsigned Opcode);
bool isVVPBinaryOp(unsigned Opcode);
+bool isVVPReductionOp(unsigned Opcode);
MVT splitVectorType(MVT VT);
SDValue getGatherScatterScale(SDValue Op);
+unsigned getScalarReductionOpcode(unsigned VVPOC, bool IsMask);
+
+// Whether this VP_REDUCE_* / VECREDUCE_* / VVP_REDUCE_* SDNode has a start
+// parameter.
+bool hasReductionStartParam(unsigned VVPOC);
+
/// } Node Properties
enum class Packing {
SDValue getUNDEF(EVT VT) const { return DAG.getUNDEF(VT); }
/// } getNode
+ /// Legalizing getNode {
+ SDValue getLegalReductionOpVVP(unsigned VVPOpcode, EVT ResVT, SDValue StartV,
+ SDValue VectorV, SDValue Mask, SDValue AVL,
+ SDNodeFlags Flags) const;
+ /// } Legalizing getNode
+
/// Packing {
SDValue getUnpack(EVT DestVT, SDValue Vec, PackElem Part, SDValue AVL) const;
SDValue getPack(EVT DestVT, SDValue LoVec, SDValue HiVec, SDValue AVL) const;
for (unsigned MemOpc : {ISD::MLOAD, ISD::MSTORE, ISD::LOAD, ISD::STORE})
setOperationAction(MemOpc, VT, Custom);
+
+ const ISD::NodeType IntReductionOCs[] = {
+ ISD::VECREDUCE_ADD, ISD::VECREDUCE_MUL, ISD::VECREDUCE_AND,
+ ISD::VECREDUCE_OR, ISD::VECREDUCE_XOR, ISD::VECREDUCE_SMIN,
+ ISD::VECREDUCE_SMAX, ISD::VECREDUCE_UMIN, ISD::VECREDUCE_UMAX};
+
+ for (unsigned IntRedOpc : IntReductionOCs)
+ setOperationAction(IntRedOpc, VT, Custom);
}
}
bool enableVPU() const { return getST()->enableVPU(); }
+  /// Whether the VE lowering handles this vector/VP reduction intrinsic
+  /// natively, i.e. it should not be expanded by the generic expansion pass.
+  /// Keep this list in sync with the VVP_REDUCE_* opcodes in VVPNodes.def.
+  static bool isSupportedReduction(Intrinsic::ID ReductionID) {
+#define VEC_VP_CASE(SUFFIX)                                                    \
+  case Intrinsic::vp_reduce_##SUFFIX:                                          \
+  case Intrinsic::vector_reduce_##SUFFIX:
+
+    switch (ReductionID) {
+      VEC_VP_CASE(add)
+      VEC_VP_CASE(and)
+      VEC_VP_CASE(or)
+      VEC_VP_CASE(xor)
+      VEC_VP_CASE(smax)
+      return true;
+
+    default:
+      return false;
+    }
+#undef VEC_VP_CASE
+  }
+
+
public:
explicit VETTIImpl(const VETargetMachine *TM, const Function &F)
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
return isVectorLaneType(*getLaneType(DataType));
}
// } Load & Store
+
+  /// Request generic expansion for reduction intrinsics the VE cannot lower
+  /// natively (and for all reductions when the VPU feature is disabled).
+  bool shouldExpandReduction(const IntrinsicInst *II) const {
+    if (!enableVPU())
+      return true;
+    return !isSupportedReduction(II->getIntrinsicID());
+  }
};
} // namespace llvm
return lowerVVP_GATHER_SCATTER(Op, CDAG);
}
- EVT OpVecVT = Op.getValueType();
+ EVT OpVecVT = *getIdiomaticVectorType(Op.getNode());
EVT LegalVecVT = getTypeToTransformTo(*DAG.getContext(), OpVecVT);
auto Packing = getTypePacking(LegalVecVT.getSimpleVT());
return CDAG.getNode(VVPOpcode, LegalVecVT,
{Op->getOperand(0), Op->getOperand(1), Mask, AVL});
}
+ if (isVVPReductionOp(VVPOpcode)) {
+ auto SrcHasStart = hasReductionStartParam(Op->getOpcode());
+ SDValue StartV = SrcHasStart ? Op->getOperand(0) : SDValue();
+ SDValue VectorV = Op->getOperand(SrcHasStart ? 1 : 0);
+ return CDAG.getLegalReductionOpVVP(VVPOpcode, Op.getValueType(), StartV,
+ VectorV, Mask, AVL, Op->getFlags());
+ }
+
if (VVPOpcode == VEISD::VVP_SELECT) {
auto Mask = Op->getOperand(0);
auto OnTrue = Op->getOperand(1);
return CDAG.getNode(VVPOpcode, LegalVecVT, {OnTrue, OnFalse, Mask, AVL});
}
if (VVPOpcode == VEISD::VVP_SETCC) {
+ EVT LegalResVT = getTypeToTransformTo(*DAG.getContext(), Op.getValueType());
auto LHS = Op->getOperand(0);
auto RHS = Op->getOperand(1);
auto Pred = Op->getOperand(2);
- return CDAG.getNode(VVPOpcode, LegalVecVT, {LHS, RHS, Pred, Mask, AVL});
+ return CDAG.getNode(VVPOpcode, LegalResVT, {LHS, RHS, Pred, Mask, AVL});
}
llvm_unreachable("lowerToVVP called for unexpected SDNode.");
}
IsVLVT<3>
]>;
-// Binary Operators {
-
// BinaryOp(x,y,mask,vl)
def SDTIntBinOpVVP : SDTypeProfile<1, 4, [ // vp_add, vp_and, etc.
SDTCisSameAs<0, 1>,
IsVLVT<5>
]>;
+// vvp_reduce(vector, mask, vl)
+def SDTReduceVVP : SDTypeProfile<1, 3, [
+  SDTCisVec<1>,             // op 1: vector operand being reduced
+  SDTCisInt<2>,             // op 2: mask — integer vector with as many
+  SDTCisVec<2>,             //       elements as the data vector
+  SDTCisSameNumEltsAs<1,2>,
+  IsVLVT<3>                 // op 3: active vector length
+]>;
+
// Binary operator commutative pattern.
class vvp_commutative<SDNode RootOp> :
def c_vvp_fmul : vvp_commutative<vvp_fmul>;
def vvp_fdiv : SDNode<"VEISD::VVP_FDIV", SDTFPBinOpVVP>;
-// } Binary Operators
-
def vvp_scatter : SDNode<"VEISD::VVP_SCATTER", SDTScatterVVP,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def vvp_gather : SDNode<"VEISD::VVP_GATHER", SDTGatherVVP,
def vvp_store : SDNode<"VEISD::VVP_STORE", SDTStoreVVP,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+// Reductions
+
+// int reductions
+def vvp_reduce_add : SDNode<"VEISD::VVP_REDUCE_ADD", SDTReduceVVP>;
+def vvp_reduce_and : SDNode<"VEISD::VVP_REDUCE_AND", SDTReduceVVP>;
+def vvp_reduce_or : SDNode<"VEISD::VVP_REDUCE_OR", SDTReduceVVP>;
+def vvp_reduce_xor : SDNode<"VEISD::VVP_REDUCE_XOR", SDTReduceVVP>;
+def vvp_reduce_smax : SDNode<"VEISD::VVP_REDUCE_SMAX", SDTReduceVVP>;
+
def vvp_select : SDNode<"VEISD::VVP_SELECT", SDTSelectVVP>;
defm : Set_CC<v256i32,"VFMKW","VCMPUW",CCUIOp,icond2cc>;
defm : Set_CC<v256i32,"VFMKW","VCMPSWZX",CCSIOp,icond2cc>;
defm : Set_CC<v256f32,"VFMKS","VFCMPS",cond,fcond2cc>;
+
+// Selection patterns for one integer vvp_reduce_* opcode: run the vector
+// reduction instruction (unmasked "...vl" or masked "...vml" form), then read
+// element 0 of the result vector back into a scalar register with LVSvi.
+multiclass Reduce_GenericInt<ValueType VectorVT,
+                             RegisterClass ResRC, ValueType ResVT,
+                             string VVPRedOp, string RedInstName> {
+  // Unmasked.
+  def : Pat <(ResVT (!cast<SDPatternOperator>("vvp_reduce_"#VVPRedOp)
+                VectorVT:$vx, (v256i1 true_mask), i32:$vl)),
+             (COPY_TO_REGCLASS
+                (!cast<Instruction>("LVSvi")
+                   (!cast<Instruction>(RedInstName#"vl") $vx, $vl), 0),
+                ResRC)>;
+
+  // Masked.
+  def : Pat <(ResVT (!cast<SDPatternOperator>("vvp_reduce_"#VVPRedOp)
+                VectorVT:$vx, v256i1:$vm, i32:$vl)),
+             (COPY_TO_REGCLASS
+                (!cast<Instruction>("LVSvi")
+                   (!cast<Instruction>(RedInstName#"vml") $vx, $vm, $vl), 0),
+                ResRC)>;
+}
+
+// Instantiate all supported integer reductions for one element width.
+// SumSuffix/MinMaxSuffix pick the width-specific VSUM/VRMAX variants
+// (e.g. "L" for i64, "WSX" for sign-extended i32).
+multiclass IntReduce_ShortLong<ValueType VectorVT,
+                               RegisterClass ResRC, ValueType ResVT,
+                               string SumSuffix, string MinMaxSuffix> {
+  defm: Reduce_GenericInt<VectorVT, ResRC, ResVT, "or", "VROR">;
+  defm: Reduce_GenericInt<VectorVT, ResRC, ResVT, "and", "VRAND">;
+  defm: Reduce_GenericInt<VectorVT, ResRC, ResVT, "xor", "VRXOR">;
+  defm: Reduce_GenericInt<VectorVT, ResRC, ResVT, "add", "VSUM"#SumSuffix>;
+  defm: Reduce_GenericInt<VectorVT, ResRC, ResVT, "smax", "VRMAX"#MinMaxSuffix>;
+}
+
+defm: IntReduce_ShortLong<v256i64, I64, i64, "L","SLFST">;
+defm: IntReduce_ShortLong<v256i32, I32, i32, "WSX","SWFSTSX">;
#define REGISTER_PACKED(OPC)
#endif
-ADD_VVP_OP(VVP_GATHER, MGATHER) HANDLE_VP_TO_VVP(VP_GATHER, VVP_GATHER)
-ADD_VVP_OP(VVP_SCATTER, MSCATTER) HANDLE_VP_TO_VVP(VP_SCATTER, VVP_SCATTER)
+/// ADD_REDUCE_VVP_OP(OPC, SDNAME)
+/// \p OPC The VVP opcode of the operation.
+/// \p SDNAME The standard opcode of the operation.
+#ifndef ADD_REDUCE_VVP_OP
+#define ADD_REDUCE_VVP_OP(OPC, SDNAME) ADD_VVP_OP(OPC, SDNAME)
+#endif
+
+// Scalar standard ISD to perform this reduction.
+#ifndef HANDLE_VVP_REDUCE_TO_SCALAR
+#define HANDLE_VVP_REDUCE_TO_SCALAR(VVP_RED_ISD, REDUCE_ISD)
+#endif
+
+/// Reductions.
+#define HELPER_REDUCTION(OPC, SCALAR_OPC) \
+ ADD_REDUCE_VVP_OP(VVP_REDUCE_##OPC,VECREDUCE_##OPC) \
+ HANDLE_VP_TO_VVP(VP_REDUCE_##OPC, VVP_REDUCE_##OPC) \
+ HANDLE_VVP_REDUCE_TO_SCALAR(VVP_REDUCE_##OPC, SCALAR_OPC)
+
+HELPER_REDUCTION(ADD, ADD)
+HELPER_REDUCTION(AND, AND)
+HELPER_REDUCTION(OR, OR)
+HELPER_REDUCTION(XOR, XOR)
+HELPER_REDUCTION(SMAX, SMAX)
+
+#undef HELPER_REDUCTION
ADD_VVP_OP(VVP_LOAD,LOAD) HANDLE_VP_TO_VVP(VP_LOAD, VVP_LOAD) REGISTER_PACKED(VVP_LOAD)
ADD_VVP_OP(VVP_STORE,STORE) HANDLE_VP_TO_VVP(VP_STORE, VVP_STORE) REGISTER_PACKED(VVP_STORE)
+ADD_VVP_OP(VVP_GATHER, MGATHER) HANDLE_VP_TO_VVP(VP_GATHER, VVP_GATHER)
+ADD_VVP_OP(VVP_SCATTER, MSCATTER) HANDLE_VP_TO_VVP(VP_SCATTER, VVP_SCATTER)
+
// Integer arithmetic.
ADD_BINARY_VVP_OP_COMPACT(ADD) REGISTER_PACKED(VVP_ADD)
ADD_BINARY_VVP_OP_COMPACT(SUB) REGISTER_PACKED(VVP_SUB)
HANDLE_VP_TO_VVP(VP_SELECT, VVP_SELECT)
HANDLE_VP_TO_VVP(VP_MERGE, VVP_SELECT)
+
#undef ADD_BINARY_VVP_OP
#undef ADD_BINARY_VVP_OP_COMPACT
+#undef ADD_REDUCE_VVP_OP
#undef ADD_VVP_OP
#undef HANDLE_VP_TO_VVP
+#undef HANDLE_VVP_REDUCE_TO_SCALAR
#undef REGISTER_PACKED
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 --march=ve -mattr=+vpu %s -o=/dev/stdout | FileCheck %s
+
+declare i64 @llvm.vector.reduce.add.v256i64(<256 x i64>)
+
+define fastcc i64 @vec_reduce_add_v256i64(<256 x i64> %v) {
+; CHECK-LABEL: vec_reduce_add_v256i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s0, 256
+; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0
+; CHECK-NEXT: lvl %s0
+; CHECK-NEXT: vsum.l %v0, %v0
+; CHECK-NEXT: lvs %s0, %v0(0)
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i64 @llvm.vector.reduce.add.v256i64( <256 x i64> %v)
+ ret i64 %r
+}
+
+declare i32 @llvm.vector.reduce.add.v256i32(<256 x i32>)
+
+define fastcc i32 @vec_reduce_add_v256i32(<256 x i32> %v) {
+; CHECK-LABEL: vec_reduce_add_v256i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s0, 256
+; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0
+; CHECK-NEXT: lvl %s0
+; CHECK-NEXT: vsum.w.sx %v0, %v0
+; CHECK-NEXT: lvs %s0, %v0(0)
+; CHECK-NEXT: or %s1, 0, %s0
+; CHECK-NEXT: # implicit-def: $sx0
+; CHECK-NEXT: or %s0, 0, %s1
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i32 @llvm.vector.reduce.add.v256i32( <256 x i32> %v)
+ ret i32 %r
+}
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 --march=ve -mattr=+vpu %s -o=/dev/stdout | FileCheck %s
+
+declare i64 @llvm.vector.reduce.and.v256i64(<256 x i64>)
+
+define fastcc i64 @vec_reduce_and_v256i64(<256 x i64> %v) {
+; CHECK-LABEL: vec_reduce_and_v256i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s0, 256
+; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0
+; CHECK-NEXT: lvl %s0
+; CHECK-NEXT: vrand %v0, %v0
+; CHECK-NEXT: lvs %s0, %v0(0)
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i64 @llvm.vector.reduce.and.v256i64( <256 x i64> %v)
+ ret i64 %r
+}
+
+declare i32 @llvm.vector.reduce.and.v256i32(<256 x i32>)
+
+define fastcc i32 @vec_reduce_and_v256i32(<256 x i32> %v) {
+; CHECK-LABEL: vec_reduce_and_v256i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s0, 256
+; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0
+; CHECK-NEXT: lvl %s0
+; CHECK-NEXT: vrand %v0, %v0
+; CHECK-NEXT: lvs %s0, %v0(0)
+; CHECK-NEXT: or %s1, 0, %s0
+; CHECK-NEXT: # implicit-def: $sx0
+; CHECK-NEXT: or %s0, 0, %s1
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i32 @llvm.vector.reduce.and.v256i32( <256 x i32> %v)
+ ret i32 %r
+}
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 --march=ve -mattr=+vpu %s -o=/dev/stdout | FileCheck %s
+
+declare i64 @llvm.vector.reduce.or.v256i64(<256 x i64>)
+
+define fastcc i64 @vec_reduce_or_v256i64(<256 x i64> %v) {
+; CHECK-LABEL: vec_reduce_or_v256i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s0, 256
+; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0
+; CHECK-NEXT: lvl %s0
+; CHECK-NEXT: vror %v0, %v0
+; CHECK-NEXT: lvs %s0, %v0(0)
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i64 @llvm.vector.reduce.or.v256i64( <256 x i64> %v)
+ ret i64 %r
+}
+
+declare i32 @llvm.vector.reduce.or.v256i32(<256 x i32>)
+
+define fastcc i32 @vec_reduce_or_v256i32(<256 x i32> %v) {
+; CHECK-LABEL: vec_reduce_or_v256i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s0, 256
+; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0
+; CHECK-NEXT: lvl %s0
+; CHECK-NEXT: vror %v0, %v0
+; CHECK-NEXT: lvs %s0, %v0(0)
+; CHECK-NEXT: or %s1, 0, %s0
+; CHECK-NEXT: # implicit-def: $sx0
+; CHECK-NEXT: or %s0, 0, %s1
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i32 @llvm.vector.reduce.or.v256i32( <256 x i32> %v)
+ ret i32 %r
+}
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 --march=ve -mattr=+vpu %s -o=/dev/stdout | FileCheck %s
+
+declare i64 @llvm.vector.reduce.smax.v256i64(<256 x i64>)
+
+define fastcc i64 @vec_reduce_smax_v256i64(<256 x i64> %v) {
+; CHECK-LABEL: vec_reduce_smax_v256i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s0, 256
+; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0
+; CHECK-NEXT: lvl %s0
+; CHECK-NEXT: vrmaxs.l.fst %v0, %v0
+; CHECK-NEXT: lvs %s0, %v0(0)
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i64 @llvm.vector.reduce.smax.v256i64( <256 x i64> %v)
+ ret i64 %r
+}
+
+declare i32 @llvm.vector.reduce.smax.v256i32(<256 x i32>)
+
+define fastcc i32 @vec_reduce_smax_v256i32(<256 x i32> %v) {
+; CHECK-LABEL: vec_reduce_smax_v256i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s0, 256
+; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0
+; CHECK-NEXT: lvl %s0
+; CHECK-NEXT: vrmaxs.w.fst.sx %v0, %v0
+; CHECK-NEXT: lvs %s0, %v0(0)
+; CHECK-NEXT: or %s1, 0, %s0
+; CHECK-NEXT: # implicit-def: $sx0
+; CHECK-NEXT: or %s0, 0, %s1
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i32 @llvm.vector.reduce.smax.v256i32( <256 x i32> %v)
+ ret i32 %r
+}
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 --march=ve -mattr=+vpu %s -o=/dev/stdout | FileCheck %s
+
+declare i64 @llvm.vector.reduce.xor.v256i64(<256 x i64>)
+
+define fastcc i64 @vec_reduce_xor_v256i64(<256 x i64> %v) {
+; CHECK-LABEL: vec_reduce_xor_v256i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s0, 256
+; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0
+; CHECK-NEXT: lvl %s0
+; CHECK-NEXT: vrxor %v0, %v0
+; CHECK-NEXT: lvs %s0, %v0(0)
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i64 @llvm.vector.reduce.xor.v256i64( <256 x i64> %v)
+ ret i64 %r
+}
+
+declare i32 @llvm.vector.reduce.xor.v256i32(<256 x i32>)
+
+define fastcc i32 @vec_reduce_xor_v256i32(<256 x i32> %v) {
+; CHECK-LABEL: vec_reduce_xor_v256i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s0, 256
+; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0
+; CHECK-NEXT: lvl %s0
+; CHECK-NEXT: vrxor %v0, %v0
+; CHECK-NEXT: lvs %s0, %v0(0)
+; CHECK-NEXT: or %s1, 0, %s0
+; CHECK-NEXT: # implicit-def: $sx0
+; CHECK-NEXT: or %s0, 0, %s1
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i32 @llvm.vector.reduce.xor.v256i32( <256 x i32> %v)
+ ret i32 %r
+}
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 --march=ve -mattr=+vpu %s -o=/dev/stdout | FileCheck %s
+
+declare i64 @llvm.vp.reduce.add.v256i64(i64, <256 x i64>, <256 x i1>, i32)
+
+define fastcc i64 @vp_reduce_add_v256i64(i64 %s, <256 x i64> %v, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: vp_reduce_add_v256i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: # kill: def $sw1 killed $sw1 killed $sx1
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vsum.l %v0, %v0, %vm1
+; CHECK-NEXT: lvs %s1, %v0(0)
+; CHECK-NEXT: adds.l %s0, %s0, %s1
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i64 @llvm.vp.reduce.add.v256i64(i64 %s, <256 x i64> %v, <256 x i1> %m, i32 %n)
+ ret i64 %r
+}
+
+declare i32 @llvm.vp.reduce.add.v256i32(i32, <256 x i32>, <256 x i1>, i32)
+
+define fastcc i32 @vp_reduce_add_v256i32(i32 %s, <256 x i32> %v, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: vp_reduce_add_v256i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: # kill: def $sw1 killed $sw1 killed $sx1
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vsum.w.sx %v0, %v0, %vm1
+; CHECK-NEXT: lvs %s1, %v0(0)
+; CHECK-NEXT: or %s1, 0, %s1
+; CHECK-NEXT: adds.w.sx %s1, %s0, %s1
+; CHECK-NEXT: # implicit-def: $sx0
+; CHECK-NEXT: or %s0, 0, %s1
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i32 @llvm.vp.reduce.add.v256i32(i32 %s, <256 x i32> %v, <256 x i1> %m, i32 %n)
+ ret i32 %r
+}
+
+
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 --march=ve -mattr=+vpu %s -o=/dev/stdout | FileCheck %s
+
+declare i64 @llvm.vp.reduce.and.v256i64(i64, <256 x i64>, <256 x i1>, i32)
+
+define fastcc i64 @vp_reduce_and_v256i64(i64 %s, <256 x i64> %v, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: vp_reduce_and_v256i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: # kill: def $sw1 killed $sw1 killed $sx1
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vrand %v0, %v0, %vm1
+; CHECK-NEXT: lvs %s1, %v0(0)
+; CHECK-NEXT: and %s0, %s0, %s1
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i64 @llvm.vp.reduce.and.v256i64(i64 %s, <256 x i64> %v, <256 x i1> %m, i32 %n)
+ ret i64 %r
+}
+
+declare i32 @llvm.vp.reduce.and.v256i32(i32, <256 x i32>, <256 x i1>, i32)
+
+define fastcc i32 @vp_reduce_and_v256i32(i32 %s, <256 x i32> %v, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: vp_reduce_and_v256i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s2, %s0, (32)0
+; CHECK-NEXT: # kill: def $sw2 killed $sw2 killed $sx2
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: # kill: def $sw1 killed $sw1 killed $sx1
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vrand %v0, %v0, %vm1
+; CHECK-NEXT: lvs %s1, %v0(0)
+; CHECK-NEXT: or %s2, 0, %s1
+; CHECK-NEXT: # implicit-def: $sx1
+; CHECK-NEXT: or %s1, 0, %s2
+; CHECK-NEXT: and %s0, %s0, %s1
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i32 @llvm.vp.reduce.and.v256i32(i32 %s, <256 x i32> %v, <256 x i1> %m, i32 %n)
+ ret i32 %r
+}
+
+
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 --march=ve -mattr=+vpu %s -o=/dev/stdout | FileCheck %s
+
+declare i64 @llvm.vp.reduce.or.v256i64(i64, <256 x i64>, <256 x i1>, i32)
+
+define fastcc i64 @vp_reduce_or_v256i64(i64 %s, <256 x i64> %v, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: vp_reduce_or_v256i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: # kill: def $sw1 killed $sw1 killed $sx1
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vror %v0, %v0, %vm1
+; CHECK-NEXT: lvs %s1, %v0(0)
+; CHECK-NEXT: or %s0, %s0, %s1
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i64 @llvm.vp.reduce.or.v256i64(i64 %s, <256 x i64> %v, <256 x i1> %m, i32 %n)
+ ret i64 %r
+}
+
+declare i32 @llvm.vp.reduce.or.v256i32(i32, <256 x i32>, <256 x i1>, i32)
+
+define fastcc i32 @vp_reduce_or_v256i32(i32 %s, <256 x i32> %v, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: vp_reduce_or_v256i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s2, %s0, (32)0
+; CHECK-NEXT: # kill: def $sw2 killed $sw2 killed $sx2
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: # kill: def $sw1 killed $sw1 killed $sx1
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vror %v0, %v0, %vm1
+; CHECK-NEXT: lvs %s1, %v0(0)
+; CHECK-NEXT: or %s2, 0, %s1
+; CHECK-NEXT: # implicit-def: $sx1
+; CHECK-NEXT: or %s1, 0, %s2
+; CHECK-NEXT: or %s0, %s0, %s1
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i32 @llvm.vp.reduce.or.v256i32(i32 %s, <256 x i32> %v, <256 x i1> %m, i32 %n)
+ ret i32 %r
+}
+
+
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 --march=ve -mattr=+vpu %s -o=/dev/stdout | FileCheck %s
+
+declare i64 @llvm.vp.reduce.smax.v256i64(i64, <256 x i64>, <256 x i1>, i32)
+
+define fastcc i64 @vp_reduce_smax_v256i64(i64 %s, <256 x i64> %v, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: vp_reduce_smax_v256i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: # kill: def $sw1 killed $sw1 killed $sx1
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vrmaxs.l.fst %v0, %v0, %vm1
+; CHECK-NEXT: lvs %s1, %v0(0)
+; CHECK-NEXT: maxs.l %s0, %s0, %s1
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i64 @llvm.vp.reduce.smax.v256i64(i64 %s, <256 x i64> %v, <256 x i1> %m, i32 %n)
+ ret i64 %r
+}
+
+declare i32 @llvm.vp.reduce.smax.v256i32(i32, <256 x i32>, <256 x i1>, i32)
+
+define fastcc i32 @vp_reduce_smax_v256i32(i32 %s, <256 x i32> %v, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: vp_reduce_smax_v256i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: # kill: def $sw1 killed $sw1 killed $sx1
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vrmaxs.w.fst.sx %v0, %v0, %vm1
+; CHECK-NEXT: lvs %s1, %v0(0)
+; CHECK-NEXT: or %s1, 0, %s1
+; CHECK-NEXT: maxs.w.sx %s1, %s0, %s1
+; CHECK-NEXT: # implicit-def: $sx0
+; CHECK-NEXT: or %s0, 0, %s1
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i32 @llvm.vp.reduce.smax.v256i32(i32 %s, <256 x i32> %v, <256 x i1> %m, i32 %n)
+ ret i32 %r
+}
+
+
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 --march=ve -mattr=+vpu %s -o=/dev/stdout | FileCheck %s
+
+declare i64 @llvm.vp.reduce.xor.v256i64(i64, <256 x i64>, <256 x i1>, i32)
+
+define fastcc i64 @vp_reduce_xor_v256i64(i64 %s, <256 x i64> %v, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: vp_reduce_xor_v256i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: # kill: def $sw1 killed $sw1 killed $sx1
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vrxor %v0, %v0, %vm1
+; CHECK-NEXT: lvs %s1, %v0(0)
+; CHECK-NEXT: xor %s0, %s0, %s1
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i64 @llvm.vp.reduce.xor.v256i64(i64 %s, <256 x i64> %v, <256 x i1> %m, i32 %n)
+ ret i64 %r
+}
+
+declare i32 @llvm.vp.reduce.xor.v256i32(i32, <256 x i32>, <256 x i1>, i32)
+
+define fastcc i32 @vp_reduce_xor_v256i32(i32 %s, <256 x i32> %v, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: vp_reduce_xor_v256i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s2, %s0, (32)0
+; CHECK-NEXT: # kill: def $sw2 killed $sw2 killed $sx2
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: # kill: def $sw1 killed $sw1 killed $sx1
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vrxor %v0, %v0, %vm1
+; CHECK-NEXT: lvs %s1, %v0(0)
+; CHECK-NEXT: or %s2, 0, %s1
+; CHECK-NEXT: # implicit-def: $sx1
+; CHECK-NEXT: or %s1, 0, %s2
+; CHECK-NEXT: xor %s0, %s0, %s1
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i32 @llvm.vp.reduce.xor.v256i32(i32 %s, <256 x i32> %v, <256 x i1> %m, i32 %n)
+ ret i32 %r
+}
+
+