setOperationAction(ISD::VECREDUCE_UMAX, VT, Legal);
setOperationAction(ISD::VECREDUCE_SMIN, VT, Legal);
setOperationAction(ISD::VECREDUCE_UMIN, VT, Legal);
+ setOperationAction(ISD::VECREDUCE_MUL, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
if (!HasMVEFP) {
setOperationAction(ISD::SINT_TO_FP, VT, Expand);
setOperationAction(ISD::FMINNUM, VT, Legal);
setOperationAction(ISD::FMAXNUM, VT, Legal);
setOperationAction(ISD::FROUND, VT, Legal);
+ setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
// No native support for these.
setOperationAction(ISD::FDIV, VT, Expand);
}
}
+ // Custom-lower vector reductions on types smaller than a legal vector to
+ // prevent false zero items from being introduced during legalization.
+ setOperationAction(ISD::VECREDUCE_FADD, MVT::v4f16, Custom);
+ setOperationAction(ISD::VECREDUCE_FMUL, MVT::v4f16, Custom);
+ setOperationAction(ISD::VECREDUCE_FMIN, MVT::v4f16, Custom);
+ setOperationAction(ISD::VECREDUCE_FMAX, MVT::v4f16, Custom);
+ setOperationAction(ISD::VECREDUCE_FADD, MVT::v2f16, Custom);
+ setOperationAction(ISD::VECREDUCE_FMUL, MVT::v2f16, Custom);
+ setOperationAction(ISD::VECREDUCE_FMIN, MVT::v2f16, Custom);
+ setOperationAction(ISD::VECREDUCE_FMAX, MVT::v2f16, Custom);
+
// We 'support' these types up to bitcast/load/store level, regardless of
// MVE integer-only / float support. Only doing FP data processing on the FP
// vector types is inhibited at integer-only level.
return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
}
+// Lower a VECREDUCE_* node for MVE by repeatedly combining the vector with a
+// lane-reversed copy of itself (ARMISD::VREV16/VREV32), halving the number of
+// active lanes each step, until 4 (or 2) lanes remain; those are then reduced
+// with scalar EXTRACT_VECTOR_ELT operations. Returns SDValue() when MVE
+// integer ops are unavailable so the default expansion is used instead.
+static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ if (!ST->hasMVEIntegerOps())
+ return SDValue();
+
+ SDLoc dl(Op);
+ // Map the reduction opcode to the corresponding binary opcode used for the
+ // pairwise combines below.
+ unsigned BaseOpcode = 0;
+ switch (Op->getOpcode()) {
+ default: llvm_unreachable("Expected VECREDUCE opcode");
+ case ISD::VECREDUCE_FADD: BaseOpcode = ISD::FADD; break;
+ case ISD::VECREDUCE_FMUL: BaseOpcode = ISD::FMUL; break;
+ case ISD::VECREDUCE_MUL: BaseOpcode = ISD::MUL; break;
+ case ISD::VECREDUCE_AND: BaseOpcode = ISD::AND; break;
+ case ISD::VECREDUCE_OR: BaseOpcode = ISD::OR; break;
+ case ISD::VECREDUCE_XOR: BaseOpcode = ISD::XOR; break;
+ case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break;
+ case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break;
+ }
+
+ SDValue Op0 = Op->getOperand(0);
+ EVT VT = Op0.getValueType();
+ EVT EltVT = VT.getVectorElementType();
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumActiveLanes = NumElts;
+
+ assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
+ NumActiveLanes == 2) &&
+ "Only expected a power 2 vector size");
+
+ // Use Mul(X, Rev(X)) until 4 items remain. Going down to 4 vector elements
+ // allows us to easily extract vector elements from the lanes.
+ // VREV16 swaps adjacent i8 lanes (16 active lanes); VREV32 swaps adjacent
+ // pairs within 32-bit chunks (8 active lanes).
+ // NOTE(review): Op->getFlags() is not propagated on these in-vector stages,
+ // unlike the scalar combines below — confirm whether that is intentional.
+ while (NumActiveLanes > 4) {
+ unsigned RevOpcode = NumActiveLanes == 16 ? ARMISD::VREV16 : ARMISD::VREV32;
+ SDValue Rev = DAG.getNode(RevOpcode, dl, VT, Op0);
+ Op0 = DAG.getNode(BaseOpcode, dl, VT, Op0, Rev);
+ NumActiveLanes /= 2;
+ }
+
+ SDValue Res;
+ if (NumActiveLanes == 4) {
+ // The remaining 4 elements are summed sequentially
+ // The live results sit at lane indices i * NumElts/4 after the reversal
+ // steps above, hence the scaled extract indices.
+ SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
+ DAG.getConstant(0 * NumElts / 4, dl, MVT::i32));
+ SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
+ DAG.getConstant(1 * NumElts / 4, dl, MVT::i32));
+ SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
+ DAG.getConstant(2 * NumElts / 4, dl, MVT::i32));
+ SDValue Ext3 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
+ DAG.getConstant(3 * NumElts / 4, dl, MVT::i32));
+ SDValue Res0 = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
+ SDValue Res1 = DAG.getNode(BaseOpcode, dl, EltVT, Ext2, Ext3, Op->getFlags());
+ Res = DAG.getNode(BaseOpcode, dl, EltVT, Res0, Res1, Op->getFlags());
+ } else {
+ // Two-element vectors: a single scalar combine of lanes 0 and 1.
+ SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
+ DAG.getConstant(0, dl, MVT::i32));
+ SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
+ DAG.getConstant(1, dl, MVT::i32));
+ Res = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
+ }
+
+ // Result type may be wider than element type.
+ if (EltVT != Op->getValueType(0))
+ Res = DAG.getNode(ISD::ANY_EXTEND, dl, Op->getValueType(0), Res);
+ return Res;
+}
+
+// Floating-point VECREDUCE variant: delegates to LowerVecReduce for the
+// actual lowering, but is gated on MVE float support (rather than integer
+// support) so FP reductions fall back to default expansion without MVE-FP.
+static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ if (!ST->hasMVEFloatOps())
+ return SDValue();
+ return LowerVecReduce(Op, DAG, ST);
+}
+
static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering()))
// Acquire/Release load/store is not legal for targets without a dmb or
return LowerSTORE(Op, DAG, Subtarget);
case ISD::MLOAD:
return LowerMLOAD(Op, DAG);
+ case ISD::VECREDUCE_MUL:
+ case ISD::VECREDUCE_AND:
+ case ISD::VECREDUCE_OR:
+ case ISD::VECREDUCE_XOR:
+ return LowerVecReduce(Op, DAG, Subtarget);
+ case ISD::VECREDUCE_FADD:
+ case ISD::VECREDUCE_FMUL:
+ case ISD::VECREDUCE_FMIN:
+ case ISD::VECREDUCE_FMAX:
+ return LowerVecReduceF(Op, DAG, Subtarget);
case ISD::ATOMIC_LOAD:
case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG);
case ISD::FSINCOS: return LowerFSINCOS(Op, DAG);
define arm_aapcs_vfpcc i32 @and_v4i32(<4 x i32> %x) {
; CHECK-LABEL: and_v4i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: ands r0, r1
+; CHECK-NEXT: vmov r0, s3
; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov r1, s3
+; CHECK-NEXT: vmov r1, s1
+; CHECK-NEXT: ands r1, r2
; CHECK-NEXT: ands r0, r1
; CHECK-NEXT: bx lr
entry:
; CHECK-LABEL: and_v8i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vand q0, q0, q1
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: ands r0, r1
+; CHECK-NEXT: vmov r0, s3
; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov r1, s3
+; CHECK-NEXT: vmov r1, s1
+; CHECK-NEXT: ands r1, r2
; CHECK-NEXT: ands r0, r1
; CHECK-NEXT: bx lr
entry:
define arm_aapcs_vfpcc i16 @and_v4i16(<4 x i16> %x) {
; CHECK-LABEL: and_v4i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: ands r0, r1
+; CHECK-NEXT: vmov r0, s3
; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov r1, s3
+; CHECK-NEXT: vmov r1, s1
+; CHECK-NEXT: ands r1, r2
; CHECK-NEXT: ands r0, r1
; CHECK-NEXT: bx lr
entry:
define arm_aapcs_vfpcc i16 @and_v8i16(<8 x i16> %x) {
; CHECK-LABEL: and_v8i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.u16 r0, q0[1]
-; CHECK-NEXT: vmov.u16 r1, q0[0]
-; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[2]
-; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[3]
-; CHECK-NEXT: ands r0, r1
+; CHECK-NEXT: vrev32.16 q1, q0
+; CHECK-NEXT: vand q0, q0, q1
+; CHECK-NEXT: vmov.u16 r0, q0[6]
; CHECK-NEXT: vmov.u16 r1, q0[4]
; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[5]
-; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[6]
-; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[7]
+; CHECK-NEXT: vmov.u16 r1, q0[2]
+; CHECK-NEXT: vmov.u16 r2, q0[0]
+; CHECK-NEXT: ands r1, r2
; CHECK-NEXT: ands r0, r1
; CHECK-NEXT: bx lr
entry:
; CHECK-LABEL: and_v16i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vand q0, q0, q1
-; CHECK-NEXT: vmov.u16 r0, q0[1]
-; CHECK-NEXT: vmov.u16 r1, q0[0]
-; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[2]
-; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[3]
-; CHECK-NEXT: ands r0, r1
+; CHECK-NEXT: vrev32.16 q1, q0
+; CHECK-NEXT: vand q0, q0, q1
+; CHECK-NEXT: vmov.u16 r0, q0[6]
; CHECK-NEXT: vmov.u16 r1, q0[4]
; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[5]
-; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[6]
-; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[7]
+; CHECK-NEXT: vmov.u16 r1, q0[2]
+; CHECK-NEXT: vmov.u16 r2, q0[0]
+; CHECK-NEXT: ands r1, r2
; CHECK-NEXT: ands r0, r1
; CHECK-NEXT: bx lr
entry:
define arm_aapcs_vfpcc i8 @and_v8i8(<8 x i8> %x) {
; CHECK-LABEL: and_v8i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.u16 r0, q0[1]
-; CHECK-NEXT: vmov.u16 r1, q0[0]
-; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[2]
-; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[3]
-; CHECK-NEXT: ands r0, r1
+; CHECK-NEXT: vrev32.16 q1, q0
+; CHECK-NEXT: vand q0, q0, q1
+; CHECK-NEXT: vmov.u16 r0, q0[6]
; CHECK-NEXT: vmov.u16 r1, q0[4]
; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[5]
-; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[6]
-; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[7]
+; CHECK-NEXT: vmov.u16 r1, q0[2]
+; CHECK-NEXT: vmov.u16 r2, q0[0]
+; CHECK-NEXT: ands r1, r2
; CHECK-NEXT: ands r0, r1
; CHECK-NEXT: bx lr
entry:
define arm_aapcs_vfpcc i8 @and_v16i8(<16 x i8> %x) {
; CHECK-LABEL: and_v16i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.u8 r0, q0[1]
-; CHECK-NEXT: vmov.u8 r1, q0[0]
-; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[2]
-; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[3]
-; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[4]
-; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[5]
-; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[6]
-; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[7]
-; CHECK-NEXT: ands r0, r1
+; CHECK-NEXT: vrev16.8 q1, q0
+; CHECK-NEXT: vand q0, q0, q1
+; CHECK-NEXT: vrev32.8 q1, q0
+; CHECK-NEXT: vand q0, q0, q1
+; CHECK-NEXT: vmov.u8 r0, q0[12]
; CHECK-NEXT: vmov.u8 r1, q0[8]
; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[9]
-; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[10]
-; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[11]
-; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[12]
-; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[13]
-; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[14]
-; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[15]
+; CHECK-NEXT: vmov.u8 r1, q0[4]
+; CHECK-NEXT: vmov.u8 r2, q0[0]
+; CHECK-NEXT: ands r1, r2
; CHECK-NEXT: ands r0, r1
; CHECK-NEXT: bx lr
entry:
; CHECK-LABEL: and_v32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vand q0, q0, q1
-; CHECK-NEXT: vmov.u8 r0, q0[1]
-; CHECK-NEXT: vmov.u8 r1, q0[0]
-; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[2]
-; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[3]
-; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[4]
-; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[5]
-; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[6]
-; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[7]
-; CHECK-NEXT: ands r0, r1
+; CHECK-NEXT: vrev16.8 q1, q0
+; CHECK-NEXT: vand q0, q0, q1
+; CHECK-NEXT: vrev32.8 q1, q0
+; CHECK-NEXT: vand q0, q0, q1
+; CHECK-NEXT: vmov.u8 r0, q0[12]
; CHECK-NEXT: vmov.u8 r1, q0[8]
; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[9]
-; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[10]
-; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[11]
-; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[12]
-; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[13]
-; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[14]
-; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[15]
+; CHECK-NEXT: vmov.u8 r1, q0[4]
+; CHECK-NEXT: vmov.u8 r2, q0[0]
+; CHECK-NEXT: ands r1, r2
; CHECK-NEXT: ands r0, r1
; CHECK-NEXT: bx lr
entry:
define arm_aapcs_vfpcc i32 @and_v4i32_acc(<4 x i32> %x, i32 %y) {
; CHECK-LABEL: and_v4i32_acc:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov r1, s1
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: ands r1, r2
+; CHECK-NEXT: vmov r1, s3
; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov r2, s3
+; CHECK-NEXT: vmov r2, s1
+; CHECK-NEXT: ands r2, r3
; CHECK-NEXT: ands r1, r2
; CHECK-NEXT: ands r0, r1
; CHECK-NEXT: bx lr
; CHECK-LABEL: and_v8i32_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vand q0, q0, q1
-; CHECK-NEXT: vmov r1, s1
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: ands r1, r2
+; CHECK-NEXT: vmov r1, s3
; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov r2, s3
+; CHECK-NEXT: vmov r2, s1
+; CHECK-NEXT: ands r2, r3
; CHECK-NEXT: ands r1, r2
; CHECK-NEXT: ands r0, r1
; CHECK-NEXT: bx lr
define arm_aapcs_vfpcc i16 @and_v4i16_acc(<4 x i16> %x, i16 %y) {
; CHECK-LABEL: and_v4i16_acc:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov r1, s1
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: ands r1, r2
+; CHECK-NEXT: vmov r1, s3
; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov r2, s3
+; CHECK-NEXT: vmov r2, s1
+; CHECK-NEXT: ands r2, r3
; CHECK-NEXT: ands r1, r2
; CHECK-NEXT: ands r0, r1
; CHECK-NEXT: bx lr
define arm_aapcs_vfpcc i16 @and_v8i16_acc(<8 x i16> %x, i16 %y) {
; CHECK-LABEL: and_v8i16_acc:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.u16 r1, q0[1]
-; CHECK-NEXT: vmov.u16 r2, q0[0]
-; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[2]
-; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[3]
-; CHECK-NEXT: ands r1, r2
+; CHECK-NEXT: vrev32.16 q1, q0
+; CHECK-NEXT: vand q0, q0, q1
+; CHECK-NEXT: vmov.u16 r1, q0[6]
; CHECK-NEXT: vmov.u16 r2, q0[4]
; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[5]
-; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[6]
-; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[7]
+; CHECK-NEXT: vmov.u16 r2, q0[2]
+; CHECK-NEXT: vmov.u16 r3, q0[0]
+; CHECK-NEXT: ands r2, r3
; CHECK-NEXT: ands r1, r2
; CHECK-NEXT: ands r0, r1
; CHECK-NEXT: bx lr
; CHECK-LABEL: and_v16i16_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vand q0, q0, q1
-; CHECK-NEXT: vmov.u16 r1, q0[1]
-; CHECK-NEXT: vmov.u16 r2, q0[0]
-; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[2]
-; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[3]
-; CHECK-NEXT: ands r1, r2
+; CHECK-NEXT: vrev32.16 q1, q0
+; CHECK-NEXT: vand q0, q0, q1
+; CHECK-NEXT: vmov.u16 r1, q0[6]
; CHECK-NEXT: vmov.u16 r2, q0[4]
; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[5]
-; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[6]
-; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[7]
+; CHECK-NEXT: vmov.u16 r2, q0[2]
+; CHECK-NEXT: vmov.u16 r3, q0[0]
+; CHECK-NEXT: ands r2, r3
; CHECK-NEXT: ands r1, r2
; CHECK-NEXT: ands r0, r1
; CHECK-NEXT: bx lr
define arm_aapcs_vfpcc i8 @and_v8i8_acc(<8 x i8> %x, i8 %y) {
; CHECK-LABEL: and_v8i8_acc:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.u16 r1, q0[1]
-; CHECK-NEXT: vmov.u16 r2, q0[0]
-; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[2]
-; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[3]
-; CHECK-NEXT: ands r1, r2
+; CHECK-NEXT: vrev32.16 q1, q0
+; CHECK-NEXT: vand q0, q0, q1
+; CHECK-NEXT: vmov.u16 r1, q0[6]
; CHECK-NEXT: vmov.u16 r2, q0[4]
; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[5]
-; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[6]
-; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[7]
+; CHECK-NEXT: vmov.u16 r2, q0[2]
+; CHECK-NEXT: vmov.u16 r3, q0[0]
+; CHECK-NEXT: ands r2, r3
; CHECK-NEXT: ands r1, r2
; CHECK-NEXT: ands r0, r1
; CHECK-NEXT: bx lr
define arm_aapcs_vfpcc i8 @and_v16i8_acc(<16 x i8> %x, i8 %y) {
; CHECK-LABEL: and_v16i8_acc:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.u8 r1, q0[1]
-; CHECK-NEXT: vmov.u8 r2, q0[0]
-; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[2]
-; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[3]
-; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[4]
-; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[5]
-; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[6]
-; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[7]
-; CHECK-NEXT: ands r1, r2
+; CHECK-NEXT: vrev16.8 q1, q0
+; CHECK-NEXT: vand q0, q0, q1
+; CHECK-NEXT: vrev32.8 q1, q0
+; CHECK-NEXT: vand q0, q0, q1
+; CHECK-NEXT: vmov.u8 r1, q0[12]
; CHECK-NEXT: vmov.u8 r2, q0[8]
; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[9]
-; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[10]
-; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[11]
-; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[12]
-; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[13]
-; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[14]
-; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[15]
+; CHECK-NEXT: vmov.u8 r2, q0[4]
+; CHECK-NEXT: vmov.u8 r3, q0[0]
+; CHECK-NEXT: ands r2, r3
; CHECK-NEXT: ands r1, r2
; CHECK-NEXT: ands r0, r1
; CHECK-NEXT: bx lr
; CHECK-LABEL: and_v32i8_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vand q0, q0, q1
-; CHECK-NEXT: vmov.u8 r1, q0[1]
-; CHECK-NEXT: vmov.u8 r2, q0[0]
-; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[2]
-; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[3]
-; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[4]
-; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[5]
-; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[6]
-; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[7]
-; CHECK-NEXT: ands r1, r2
+; CHECK-NEXT: vrev16.8 q1, q0
+; CHECK-NEXT: vand q0, q0, q1
+; CHECK-NEXT: vrev32.8 q1, q0
+; CHECK-NEXT: vand q0, q0, q1
+; CHECK-NEXT: vmov.u8 r1, q0[12]
; CHECK-NEXT: vmov.u8 r2, q0[8]
; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[9]
-; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[10]
-; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[11]
-; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[12]
-; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[13]
-; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[14]
-; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[15]
+; CHECK-NEXT: vmov.u8 r2, q0[4]
+; CHECK-NEXT: vmov.u8 r3, q0[0]
+; CHECK-NEXT: ands r2, r3
; CHECK-NEXT: ands r1, r2
; CHECK-NEXT: ands r0, r1
; CHECK-NEXT: bx lr
define arm_aapcs_vfpcc i32 @or_v4i32(<4 x i32> %x) {
; CHECK-LABEL: or_v4i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: orrs r0, r1
+; CHECK-NEXT: vmov r0, s3
; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov r1, s3
+; CHECK-NEXT: vmov r1, s1
+; CHECK-NEXT: orrs r1, r2
; CHECK-NEXT: orrs r0, r1
; CHECK-NEXT: bx lr
entry:
; CHECK-LABEL: or_v8i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vorr q0, q0, q1
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: orrs r0, r1
+; CHECK-NEXT: vmov r0, s3
; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov r1, s3
+; CHECK-NEXT: vmov r1, s1
+; CHECK-NEXT: orrs r1, r2
; CHECK-NEXT: orrs r0, r1
; CHECK-NEXT: bx lr
entry:
define arm_aapcs_vfpcc i16 @or_v4i16(<4 x i16> %x) {
; CHECK-LABEL: or_v4i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: orrs r0, r1
+; CHECK-NEXT: vmov r0, s3
; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov r1, s3
+; CHECK-NEXT: vmov r1, s1
+; CHECK-NEXT: orrs r1, r2
; CHECK-NEXT: orrs r0, r1
; CHECK-NEXT: bx lr
entry:
define arm_aapcs_vfpcc i16 @or_v8i16(<8 x i16> %x) {
; CHECK-LABEL: or_v8i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.u16 r0, q0[1]
-; CHECK-NEXT: vmov.u16 r1, q0[0]
-; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[2]
-; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[3]
-; CHECK-NEXT: orrs r0, r1
+; CHECK-NEXT: vrev32.16 q1, q0
+; CHECK-NEXT: vorr q0, q0, q1
+; CHECK-NEXT: vmov.u16 r0, q0[6]
; CHECK-NEXT: vmov.u16 r1, q0[4]
; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[5]
-; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[6]
-; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[7]
+; CHECK-NEXT: vmov.u16 r1, q0[2]
+; CHECK-NEXT: vmov.u16 r2, q0[0]
+; CHECK-NEXT: orrs r1, r2
; CHECK-NEXT: orrs r0, r1
; CHECK-NEXT: bx lr
entry:
; CHECK-LABEL: or_v16i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vorr q0, q0, q1
-; CHECK-NEXT: vmov.u16 r0, q0[1]
-; CHECK-NEXT: vmov.u16 r1, q0[0]
-; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[2]
-; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[3]
-; CHECK-NEXT: orrs r0, r1
+; CHECK-NEXT: vrev32.16 q1, q0
+; CHECK-NEXT: vorr q0, q0, q1
+; CHECK-NEXT: vmov.u16 r0, q0[6]
; CHECK-NEXT: vmov.u16 r1, q0[4]
; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[5]
-; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[6]
-; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[7]
+; CHECK-NEXT: vmov.u16 r1, q0[2]
+; CHECK-NEXT: vmov.u16 r2, q0[0]
+; CHECK-NEXT: orrs r1, r2
; CHECK-NEXT: orrs r0, r1
; CHECK-NEXT: bx lr
entry:
define arm_aapcs_vfpcc i8 @or_v8i8(<8 x i8> %x) {
; CHECK-LABEL: or_v8i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.u16 r0, q0[1]
-; CHECK-NEXT: vmov.u16 r1, q0[0]
-; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[2]
-; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[3]
-; CHECK-NEXT: orrs r0, r1
+; CHECK-NEXT: vrev32.16 q1, q0
+; CHECK-NEXT: vorr q0, q0, q1
+; CHECK-NEXT: vmov.u16 r0, q0[6]
; CHECK-NEXT: vmov.u16 r1, q0[4]
; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[5]
-; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[6]
-; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[7]
+; CHECK-NEXT: vmov.u16 r1, q0[2]
+; CHECK-NEXT: vmov.u16 r2, q0[0]
+; CHECK-NEXT: orrs r1, r2
; CHECK-NEXT: orrs r0, r1
; CHECK-NEXT: bx lr
entry:
define arm_aapcs_vfpcc i8 @or_v16i8(<16 x i8> %x) {
; CHECK-LABEL: or_v16i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.u8 r0, q0[1]
-; CHECK-NEXT: vmov.u8 r1, q0[0]
-; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[2]
-; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[3]
-; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[4]
-; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[5]
-; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[6]
-; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[7]
-; CHECK-NEXT: orrs r0, r1
+; CHECK-NEXT: vrev16.8 q1, q0
+; CHECK-NEXT: vorr q0, q0, q1
+; CHECK-NEXT: vrev32.8 q1, q0
+; CHECK-NEXT: vorr q0, q0, q1
+; CHECK-NEXT: vmov.u8 r0, q0[12]
; CHECK-NEXT: vmov.u8 r1, q0[8]
; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[9]
-; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[10]
-; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[11]
-; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[12]
-; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[13]
-; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[14]
-; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[15]
+; CHECK-NEXT: vmov.u8 r1, q0[4]
+; CHECK-NEXT: vmov.u8 r2, q0[0]
+; CHECK-NEXT: orrs r1, r2
; CHECK-NEXT: orrs r0, r1
; CHECK-NEXT: bx lr
entry:
; CHECK-LABEL: or_v32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vorr q0, q0, q1
-; CHECK-NEXT: vmov.u8 r0, q0[1]
-; CHECK-NEXT: vmov.u8 r1, q0[0]
-; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[2]
-; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[3]
-; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[4]
-; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[5]
-; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[6]
-; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[7]
-; CHECK-NEXT: orrs r0, r1
+; CHECK-NEXT: vrev16.8 q1, q0
+; CHECK-NEXT: vorr q0, q0, q1
+; CHECK-NEXT: vrev32.8 q1, q0
+; CHECK-NEXT: vorr q0, q0, q1
+; CHECK-NEXT: vmov.u8 r0, q0[12]
; CHECK-NEXT: vmov.u8 r1, q0[8]
; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[9]
-; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[10]
-; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[11]
-; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[12]
-; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[13]
-; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[14]
-; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[15]
+; CHECK-NEXT: vmov.u8 r1, q0[4]
+; CHECK-NEXT: vmov.u8 r2, q0[0]
+; CHECK-NEXT: orrs r1, r2
; CHECK-NEXT: orrs r0, r1
; CHECK-NEXT: bx lr
entry:
define arm_aapcs_vfpcc i32 @or_v4i32_acc(<4 x i32> %x, i32 %y) {
; CHECK-LABEL: or_v4i32_acc:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov r1, s1
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: orrs r1, r2
+; CHECK-NEXT: vmov r1, s3
; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov r2, s3
+; CHECK-NEXT: vmov r2, s1
+; CHECK-NEXT: orrs r2, r3
; CHECK-NEXT: orrs r1, r2
; CHECK-NEXT: orrs r0, r1
; CHECK-NEXT: bx lr
; CHECK-LABEL: or_v8i32_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vorr q0, q0, q1
-; CHECK-NEXT: vmov r1, s1
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: orrs r1, r2
+; CHECK-NEXT: vmov r1, s3
; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov r2, s3
+; CHECK-NEXT: vmov r2, s1
+; CHECK-NEXT: orrs r2, r3
; CHECK-NEXT: orrs r1, r2
; CHECK-NEXT: orrs r0, r1
; CHECK-NEXT: bx lr
define arm_aapcs_vfpcc i16 @or_v4i16_acc(<4 x i16> %x, i16 %y) {
; CHECK-LABEL: or_v4i16_acc:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov r1, s1
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: orrs r1, r2
+; CHECK-NEXT: vmov r1, s3
; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov r2, s3
+; CHECK-NEXT: vmov r2, s1
+; CHECK-NEXT: orrs r2, r3
; CHECK-NEXT: orrs r1, r2
; CHECK-NEXT: orrs r0, r1
; CHECK-NEXT: bx lr
define arm_aapcs_vfpcc i16 @or_v8i16_acc(<8 x i16> %x, i16 %y) {
; CHECK-LABEL: or_v8i16_acc:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.u16 r1, q0[1]
-; CHECK-NEXT: vmov.u16 r2, q0[0]
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[2]
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[3]
-; CHECK-NEXT: orrs r1, r2
+; CHECK-NEXT: vrev32.16 q1, q0
+; CHECK-NEXT: vorr q0, q0, q1
+; CHECK-NEXT: vmov.u16 r1, q0[6]
; CHECK-NEXT: vmov.u16 r2, q0[4]
; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[5]
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[6]
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[7]
+; CHECK-NEXT: vmov.u16 r2, q0[2]
+; CHECK-NEXT: vmov.u16 r3, q0[0]
+; CHECK-NEXT: orrs r2, r3
; CHECK-NEXT: orrs r1, r2
; CHECK-NEXT: orrs r0, r1
; CHECK-NEXT: bx lr
; CHECK-LABEL: or_v16i16_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vorr q0, q0, q1
-; CHECK-NEXT: vmov.u16 r1, q0[1]
-; CHECK-NEXT: vmov.u16 r2, q0[0]
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[2]
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[3]
-; CHECK-NEXT: orrs r1, r2
+; CHECK-NEXT: vrev32.16 q1, q0
+; CHECK-NEXT: vorr q0, q0, q1
+; CHECK-NEXT: vmov.u16 r1, q0[6]
; CHECK-NEXT: vmov.u16 r2, q0[4]
; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[5]
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[6]
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[7]
+; CHECK-NEXT: vmov.u16 r2, q0[2]
+; CHECK-NEXT: vmov.u16 r3, q0[0]
+; CHECK-NEXT: orrs r2, r3
; CHECK-NEXT: orrs r1, r2
; CHECK-NEXT: orrs r0, r1
; CHECK-NEXT: bx lr
define arm_aapcs_vfpcc i8 @or_v8i8_acc(<8 x i8> %x, i8 %y) {
; CHECK-LABEL: or_v8i8_acc:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.u16 r1, q0[1]
-; CHECK-NEXT: vmov.u16 r2, q0[0]
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[2]
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[3]
-; CHECK-NEXT: orrs r1, r2
+; CHECK-NEXT: vrev32.16 q1, q0
+; CHECK-NEXT: vorr q0, q0, q1
+; CHECK-NEXT: vmov.u16 r1, q0[6]
; CHECK-NEXT: vmov.u16 r2, q0[4]
; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[5]
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[6]
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[7]
+; CHECK-NEXT: vmov.u16 r2, q0[2]
+; CHECK-NEXT: vmov.u16 r3, q0[0]
+; CHECK-NEXT: orrs r2, r3
; CHECK-NEXT: orrs r1, r2
; CHECK-NEXT: orrs r0, r1
; CHECK-NEXT: bx lr
define arm_aapcs_vfpcc i8 @or_v16i8_acc(<16 x i8> %x, i8 %y) {
; CHECK-LABEL: or_v16i8_acc:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.u8 r1, q0[1]
-; CHECK-NEXT: vmov.u8 r2, q0[0]
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[2]
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[3]
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[4]
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[5]
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[6]
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[7]
-; CHECK-NEXT: orrs r1, r2
+; CHECK-NEXT: vrev16.8 q1, q0
+; CHECK-NEXT: vorr q0, q0, q1
+; CHECK-NEXT: vrev32.8 q1, q0
+; CHECK-NEXT: vorr q0, q0, q1
+; CHECK-NEXT: vmov.u8 r1, q0[12]
; CHECK-NEXT: vmov.u8 r2, q0[8]
; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[9]
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[10]
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[11]
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[12]
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[13]
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[14]
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[15]
+; CHECK-NEXT: vmov.u8 r2, q0[4]
+; CHECK-NEXT: vmov.u8 r3, q0[0]
+; CHECK-NEXT: orrs r2, r3
; CHECK-NEXT: orrs r1, r2
; CHECK-NEXT: orrs r0, r1
; CHECK-NEXT: bx lr
; CHECK-LABEL: or_v32i8_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vorr q0, q0, q1
-; CHECK-NEXT: vmov.u8 r1, q0[1]
-; CHECK-NEXT: vmov.u8 r2, q0[0]
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[2]
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[3]
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[4]
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[5]
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[6]
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[7]
-; CHECK-NEXT: orrs r1, r2
+; CHECK-NEXT: vrev16.8 q1, q0
+; CHECK-NEXT: vorr q0, q0, q1
+; CHECK-NEXT: vrev32.8 q1, q0
+; CHECK-NEXT: vorr q0, q0, q1
+; CHECK-NEXT: vmov.u8 r1, q0[12]
; CHECK-NEXT: vmov.u8 r2, q0[8]
; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[9]
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[10]
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[11]
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[12]
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[13]
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[14]
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[15]
+; CHECK-NEXT: vmov.u8 r2, q0[4]
+; CHECK-NEXT: vmov.u8 r3, q0[0]
+; CHECK-NEXT: orrs r2, r3
; CHECK-NEXT: orrs r1, r2
; CHECK-NEXT: orrs r0, r1
; CHECK-NEXT: bx lr
define arm_aapcs_vfpcc i32 @xor_v4i32(<4 x i32> %x) {
; CHECK-LABEL: xor_v4i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: eors r0, r1
+; CHECK-NEXT: vmov r0, s3
; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov r1, s3
+; CHECK-NEXT: vmov r1, s1
+; CHECK-NEXT: eors r1, r2
; CHECK-NEXT: eors r0, r1
; CHECK-NEXT: bx lr
entry:
; CHECK-LABEL: xor_v8i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: veor q0, q0, q1
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: eors r0, r1
+; CHECK-NEXT: vmov r0, s3
; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov r1, s3
+; CHECK-NEXT: vmov r1, s1
+; CHECK-NEXT: eors r1, r2
; CHECK-NEXT: eors r0, r1
; CHECK-NEXT: bx lr
entry:
define arm_aapcs_vfpcc i16 @xor_v4i16(<4 x i16> %x) {
; CHECK-LABEL: xor_v4i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: eors r0, r1
+; CHECK-NEXT: vmov r0, s3
; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov r1, s3
+; CHECK-NEXT: vmov r1, s1
+; CHECK-NEXT: eors r1, r2
; CHECK-NEXT: eors r0, r1
; CHECK-NEXT: bx lr
entry:
define arm_aapcs_vfpcc i16 @xor_v8i16(<8 x i16> %x) {
; CHECK-LABEL: xor_v8i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.u16 r0, q0[1]
-; CHECK-NEXT: vmov.u16 r1, q0[0]
-; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[2]
-; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[3]
-; CHECK-NEXT: eors r0, r1
+; CHECK-NEXT: vrev32.16 q1, q0
+; CHECK-NEXT: veor q0, q0, q1
+; CHECK-NEXT: vmov.u16 r0, q0[6]
; CHECK-NEXT: vmov.u16 r1, q0[4]
; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[5]
-; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[6]
-; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[7]
+; CHECK-NEXT: vmov.u16 r1, q0[2]
+; CHECK-NEXT: vmov.u16 r2, q0[0]
+; CHECK-NEXT: eors r1, r2
; CHECK-NEXT: eors r0, r1
; CHECK-NEXT: bx lr
entry:
; CHECK-LABEL: xor_v16i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: veor q0, q0, q1
-; CHECK-NEXT: vmov.u16 r0, q0[1]
-; CHECK-NEXT: vmov.u16 r1, q0[0]
-; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[2]
-; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[3]
-; CHECK-NEXT: eors r0, r1
+; CHECK-NEXT: vrev32.16 q1, q0
+; CHECK-NEXT: veor q0, q0, q1
+; CHECK-NEXT: vmov.u16 r0, q0[6]
; CHECK-NEXT: vmov.u16 r1, q0[4]
; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[5]
-; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[6]
-; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[7]
+; CHECK-NEXT: vmov.u16 r1, q0[2]
+; CHECK-NEXT: vmov.u16 r2, q0[0]
+; CHECK-NEXT: eors r1, r2
; CHECK-NEXT: eors r0, r1
; CHECK-NEXT: bx lr
entry:
define arm_aapcs_vfpcc i8 @xor_v8i8(<8 x i8> %x) {
; CHECK-LABEL: xor_v8i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.u16 r0, q0[1]
-; CHECK-NEXT: vmov.u16 r1, q0[0]
-; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[2]
-; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[3]
-; CHECK-NEXT: eors r0, r1
+; CHECK-NEXT: vrev32.16 q1, q0
+; CHECK-NEXT: veor q0, q0, q1
+; CHECK-NEXT: vmov.u16 r0, q0[6]
; CHECK-NEXT: vmov.u16 r1, q0[4]
; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[5]
-; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[6]
-; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u16 r1, q0[7]
+; CHECK-NEXT: vmov.u16 r1, q0[2]
+; CHECK-NEXT: vmov.u16 r2, q0[0]
+; CHECK-NEXT: eors r1, r2
; CHECK-NEXT: eors r0, r1
; CHECK-NEXT: bx lr
entry:
define arm_aapcs_vfpcc i8 @xor_v16i8(<16 x i8> %x) {
; CHECK-LABEL: xor_v16i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.u8 r0, q0[1]
-; CHECK-NEXT: vmov.u8 r1, q0[0]
-; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[2]
-; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[3]
-; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[4]
-; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[5]
-; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[6]
-; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[7]
-; CHECK-NEXT: eors r0, r1
+; CHECK-NEXT: vrev16.8 q1, q0
+; CHECK-NEXT: veor q0, q0, q1
+; CHECK-NEXT: vrev32.8 q1, q0
+; CHECK-NEXT: veor q0, q0, q1
+; CHECK-NEXT: vmov.u8 r0, q0[12]
; CHECK-NEXT: vmov.u8 r1, q0[8]
; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[9]
-; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[10]
-; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[11]
-; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[12]
-; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[13]
-; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[14]
-; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[15]
+; CHECK-NEXT: vmov.u8 r1, q0[4]
+; CHECK-NEXT: vmov.u8 r2, q0[0]
+; CHECK-NEXT: eors r1, r2
; CHECK-NEXT: eors r0, r1
; CHECK-NEXT: bx lr
entry:
; CHECK-LABEL: xor_v32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: veor q0, q0, q1
-; CHECK-NEXT: vmov.u8 r0, q0[1]
-; CHECK-NEXT: vmov.u8 r1, q0[0]
-; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[2]
-; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[3]
-; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[4]
-; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[5]
-; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[6]
-; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[7]
-; CHECK-NEXT: eors r0, r1
+; CHECK-NEXT: vrev16.8 q1, q0
+; CHECK-NEXT: veor q0, q0, q1
+; CHECK-NEXT: vrev32.8 q1, q0
+; CHECK-NEXT: veor q0, q0, q1
+; CHECK-NEXT: vmov.u8 r0, q0[12]
; CHECK-NEXT: vmov.u8 r1, q0[8]
; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[9]
-; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[10]
-; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[11]
-; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[12]
-; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[13]
-; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[14]
-; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: vmov.u8 r1, q0[15]
+; CHECK-NEXT: vmov.u8 r1, q0[4]
+; CHECK-NEXT: vmov.u8 r2, q0[0]
+; CHECK-NEXT: eors r1, r2
; CHECK-NEXT: eors r0, r1
; CHECK-NEXT: bx lr
entry:
define arm_aapcs_vfpcc i32 @xor_v4i32_acc(<4 x i32> %x, i32 %y) {
; CHECK-LABEL: xor_v4i32_acc:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov r1, s1
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: eors r1, r2
+; CHECK-NEXT: vmov r1, s3
; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov r2, s3
+; CHECK-NEXT: vmov r2, s1
+; CHECK-NEXT: eors r2, r3
; CHECK-NEXT: eors r1, r2
; CHECK-NEXT: eors r0, r1
; CHECK-NEXT: bx lr
; CHECK-LABEL: xor_v8i32_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: veor q0, q0, q1
-; CHECK-NEXT: vmov r1, s1
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: eors r1, r2
+; CHECK-NEXT: vmov r1, s3
; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov r2, s3
+; CHECK-NEXT: vmov r2, s1
+; CHECK-NEXT: eors r2, r3
; CHECK-NEXT: eors r1, r2
; CHECK-NEXT: eors r0, r1
; CHECK-NEXT: bx lr
define arm_aapcs_vfpcc i16 @xor_v4i16_acc(<4 x i16> %x, i16 %y) {
; CHECK-LABEL: xor_v4i16_acc:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov r1, s1
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: eors r1, r2
+; CHECK-NEXT: vmov r1, s3
; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov r2, s3
+; CHECK-NEXT: vmov r2, s1
+; CHECK-NEXT: eors r2, r3
; CHECK-NEXT: eors r1, r2
; CHECK-NEXT: eors r0, r1
; CHECK-NEXT: bx lr
define arm_aapcs_vfpcc i16 @xor_v8i16_acc(<8 x i16> %x, i16 %y) {
; CHECK-LABEL: xor_v8i16_acc:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.u16 r1, q0[1]
-; CHECK-NEXT: vmov.u16 r2, q0[0]
-; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[2]
-; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[3]
-; CHECK-NEXT: eors r1, r2
+; CHECK-NEXT: vrev32.16 q1, q0
+; CHECK-NEXT: veor q0, q0, q1
+; CHECK-NEXT: vmov.u16 r1, q0[6]
; CHECK-NEXT: vmov.u16 r2, q0[4]
; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[5]
-; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[6]
-; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[7]
+; CHECK-NEXT: vmov.u16 r2, q0[2]
+; CHECK-NEXT: vmov.u16 r3, q0[0]
+; CHECK-NEXT: eors r2, r3
; CHECK-NEXT: eors r1, r2
; CHECK-NEXT: eors r0, r1
; CHECK-NEXT: bx lr
; CHECK-LABEL: xor_v16i16_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: veor q0, q0, q1
-; CHECK-NEXT: vmov.u16 r1, q0[1]
-; CHECK-NEXT: vmov.u16 r2, q0[0]
-; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[2]
-; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[3]
-; CHECK-NEXT: eors r1, r2
+; CHECK-NEXT: vrev32.16 q1, q0
+; CHECK-NEXT: veor q0, q0, q1
+; CHECK-NEXT: vmov.u16 r1, q0[6]
; CHECK-NEXT: vmov.u16 r2, q0[4]
; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[5]
-; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[6]
-; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[7]
+; CHECK-NEXT: vmov.u16 r2, q0[2]
+; CHECK-NEXT: vmov.u16 r3, q0[0]
+; CHECK-NEXT: eors r2, r3
; CHECK-NEXT: eors r1, r2
; CHECK-NEXT: eors r0, r1
; CHECK-NEXT: bx lr
define arm_aapcs_vfpcc i8 @xor_v8i8_acc(<8 x i8> %x, i8 %y) {
; CHECK-LABEL: xor_v8i8_acc:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.u16 r1, q0[1]
-; CHECK-NEXT: vmov.u16 r2, q0[0]
-; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[2]
-; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[3]
-; CHECK-NEXT: eors r1, r2
+; CHECK-NEXT: vrev32.16 q1, q0
+; CHECK-NEXT: veor q0, q0, q1
+; CHECK-NEXT: vmov.u16 r1, q0[6]
; CHECK-NEXT: vmov.u16 r2, q0[4]
; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[5]
-; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[6]
-; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u16 r2, q0[7]
+; CHECK-NEXT: vmov.u16 r2, q0[2]
+; CHECK-NEXT: vmov.u16 r3, q0[0]
+; CHECK-NEXT: eors r2, r3
; CHECK-NEXT: eors r1, r2
; CHECK-NEXT: eors r0, r1
; CHECK-NEXT: bx lr
define arm_aapcs_vfpcc i8 @xor_v16i8_acc(<16 x i8> %x, i8 %y) {
; CHECK-LABEL: xor_v16i8_acc:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.u8 r1, q0[1]
-; CHECK-NEXT: vmov.u8 r2, q0[0]
-; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[2]
-; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[3]
-; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[4]
-; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[5]
-; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[6]
-; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[7]
-; CHECK-NEXT: eors r1, r2
+; CHECK-NEXT: vrev16.8 q1, q0
+; CHECK-NEXT: veor q0, q0, q1
+; CHECK-NEXT: vrev32.8 q1, q0
+; CHECK-NEXT: veor q0, q0, q1
+; CHECK-NEXT: vmov.u8 r1, q0[12]
; CHECK-NEXT: vmov.u8 r2, q0[8]
; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[9]
-; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[10]
-; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[11]
-; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[12]
-; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[13]
-; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[14]
-; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[15]
+; CHECK-NEXT: vmov.u8 r2, q0[4]
+; CHECK-NEXT: vmov.u8 r3, q0[0]
+; CHECK-NEXT: eors r2, r3
; CHECK-NEXT: eors r1, r2
; CHECK-NEXT: eors r0, r1
; CHECK-NEXT: bx lr
; CHECK-LABEL: xor_v32i8_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: veor q0, q0, q1
-; CHECK-NEXT: vmov.u8 r1, q0[1]
-; CHECK-NEXT: vmov.u8 r2, q0[0]
-; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[2]
-; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[3]
-; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[4]
-; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[5]
-; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[6]
-; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[7]
-; CHECK-NEXT: eors r1, r2
+; CHECK-NEXT: vrev16.8 q1, q0
+; CHECK-NEXT: veor q0, q0, q1
+; CHECK-NEXT: vrev32.8 q1, q0
+; CHECK-NEXT: veor q0, q0, q1
+; CHECK-NEXT: vmov.u8 r1, q0[12]
; CHECK-NEXT: vmov.u8 r2, q0[8]
; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[9]
-; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[10]
-; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[11]
-; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[12]
-; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[13]
-; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[14]
-; CHECK-NEXT: eors r1, r2
-; CHECK-NEXT: vmov.u8 r2, q0[15]
+; CHECK-NEXT: vmov.u8 r2, q0[4]
+; CHECK-NEXT: vmov.u8 r3, q0[0]
+; CHECK-NEXT: eors r2, r3
; CHECK-NEXT: eors r1, r2
; CHECK-NEXT: eors r0, r1
; CHECK-NEXT: bx lr
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16,+fp64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NOFP
define arm_aapcs_vfpcc float @fadd_v2f32(<2 x float> %x, float %y) {
-; CHECK-LABEL: fadd_v2f32:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vadd.f32 s0, s0, s1
-; CHECK-NEXT: vldr s2, .LCPI0_0
-; CHECK-NEXT: vadd.f32 s0, s0, s2
-; CHECK-NEXT: vadd.f32 s0, s0, s2
-; CHECK-NEXT: vadd.f32 s0, s4, s0
-; CHECK-NEXT: bx lr
-; CHECK-NEXT: .p2align 2
-; CHECK-NEXT: @ %bb.1:
-; CHECK-NEXT: .LCPI0_0:
-; CHECK-NEXT: .long 0x00000000 @ float 0
+; CHECK-FP-LABEL: fadd_v2f32:
+; CHECK-FP: @ %bb.0: @ %entry
+; CHECK-FP-NEXT: vadd.f32 s0, s0, s1
+; CHECK-FP-NEXT: vldr s2, .LCPI0_0
+; CHECK-FP-NEXT: vadd.f32 s0, s0, s2
+; CHECK-FP-NEXT: vadd.f32 s0, s4, s0
+; CHECK-FP-NEXT: bx lr
+; CHECK-FP-NEXT: .p2align 2
+; CHECK-FP-NEXT: @ %bb.1:
+; CHECK-FP-NEXT: .LCPI0_0:
+; CHECK-FP-NEXT: .long 0x00000000 @ float 0
+;
+; CHECK-NOFP-LABEL: fadd_v2f32:
+; CHECK-NOFP: @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT: vadd.f32 s0, s0, s1
+; CHECK-NOFP-NEXT: vldr s2, .LCPI0_0
+; CHECK-NOFP-NEXT: vadd.f32 s0, s0, s2
+; CHECK-NOFP-NEXT: vadd.f32 s0, s0, s2
+; CHECK-NOFP-NEXT: vadd.f32 s0, s4, s0
+; CHECK-NOFP-NEXT: bx lr
+; CHECK-NOFP-NEXT: .p2align 2
+; CHECK-NOFP-NEXT: @ %bb.1:
+; CHECK-NOFP-NEXT: .LCPI0_0:
+; CHECK-NOFP-NEXT: .long 0x00000000 @ float 0
entry:
%z = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float %y, <2 x float> %x)
ret float %z
}
define arm_aapcs_vfpcc float @fadd_v4f32(<4 x float> %x, float %y) {
-; CHECK-LABEL: fadd_v4f32:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vadd.f32 s6, s0, s1
-; CHECK-NEXT: vadd.f32 s6, s6, s2
-; CHECK-NEXT: vadd.f32 s0, s6, s3
-; CHECK-NEXT: vadd.f32 s0, s4, s0
-; CHECK-NEXT: bx lr
+; CHECK-FP-LABEL: fadd_v4f32:
+; CHECK-FP: @ %bb.0: @ %entry
+; CHECK-FP-NEXT: vadd.f32 s6, s2, s3
+; CHECK-FP-NEXT: vadd.f32 s0, s0, s1
+; CHECK-FP-NEXT: vadd.f32 s0, s0, s6
+; CHECK-FP-NEXT: vadd.f32 s0, s4, s0
+; CHECK-FP-NEXT: bx lr
+;
+; CHECK-NOFP-LABEL: fadd_v4f32:
+; CHECK-NOFP: @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT: vadd.f32 s6, s0, s1
+; CHECK-NOFP-NEXT: vadd.f32 s6, s6, s2
+; CHECK-NOFP-NEXT: vadd.f32 s0, s6, s3
+; CHECK-NOFP-NEXT: vadd.f32 s0, s4, s0
+; CHECK-NOFP-NEXT: bx lr
entry:
%z = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %y, <4 x float> %x)
ret float %z
; CHECK-FP-LABEL: fadd_v8f32:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vadd.f32 q0, q0, q1
-; CHECK-FP-NEXT: vadd.f32 s4, s0, s1
-; CHECK-FP-NEXT: vadd.f32 s4, s4, s2
-; CHECK-FP-NEXT: vadd.f32 s0, s4, s3
+; CHECK-FP-NEXT: vadd.f32 s4, s2, s3
+; CHECK-FP-NEXT: vadd.f32 s0, s0, s1
+; CHECK-FP-NEXT: vadd.f32 s0, s0, s4
; CHECK-FP-NEXT: vadd.f32 s0, s8, s0
; CHECK-FP-NEXT: bx lr
;
ret float %z
}
+define arm_aapcs_vfpcc void @fadd_v2f16(<2 x half> %x, half* %yy) {
+; CHECK-FP-LABEL: fadd_v2f16:
+; CHECK-FP: @ %bb.0: @ %entry
+; CHECK-FP-NEXT: vmovx.f16 s4, s0
+; CHECK-FP-NEXT: vadd.f16 s0, s0, s4
+; CHECK-FP-NEXT: vldr.16 s2, [r0]
+; CHECK-FP-NEXT: vadd.f16 s0, s2, s0
+; CHECK-FP-NEXT: vstr.16 s0, [r0]
+; CHECK-FP-NEXT: bx lr
+;
+; CHECK-NOFP-LABEL: fadd_v2f16:
+; CHECK-NOFP: @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT: vmovx.f16 s4, s0
+; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s4
+; CHECK-NOFP-NEXT: vldr.16 s2, .LCPI3_0
+; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2
+; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2
+; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2
+; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2
+; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2
+; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2
+; CHECK-NOFP-NEXT: vldr.16 s2, [r0]
+; CHECK-NOFP-NEXT: vadd.f16 s0, s2, s0
+; CHECK-NOFP-NEXT: vstr.16 s0, [r0]
+; CHECK-NOFP-NEXT: bx lr
+; CHECK-NOFP-NEXT: .p2align 1
+; CHECK-NOFP-NEXT: @ %bb.1:
+; CHECK-NOFP-NEXT: .LCPI3_0:
+; CHECK-NOFP-NEXT: .short 0x0000 @ half 0
+entry:
+ %y = load half, half* %yy
+ %z = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v2f16(half %y, <2 x half> %x)
+ store half %z, half* %yy
+ ret void
+}
+
define arm_aapcs_vfpcc void @fadd_v4f16(<4 x half> %x, half* %yy) {
-; CHECK-LABEL: fadd_v4f16:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovx.f16 s4, s0
-; CHECK-NEXT: vadd.f16 s4, s0, s4
-; CHECK-NEXT: vmovx.f16 s0, s1
-; CHECK-NEXT: vadd.f16 s4, s4, s1
-; CHECK-NEXT: vldr.16 s2, .LCPI3_0
-; CHECK-NEXT: vadd.f16 s0, s4, s0
-; CHECK-NEXT: vadd.f16 s0, s0, s2
-; CHECK-NEXT: vadd.f16 s0, s0, s2
-; CHECK-NEXT: vadd.f16 s0, s0, s2
-; CHECK-NEXT: vadd.f16 s0, s0, s2
-; CHECK-NEXT: vldr.16 s2, [r0]
-; CHECK-NEXT: vadd.f16 s0, s2, s0
-; CHECK-NEXT: vstr.16 s0, [r0]
-; CHECK-NEXT: bx lr
-; CHECK-NEXT: .p2align 1
-; CHECK-NEXT: @ %bb.1:
-; CHECK-NEXT: .LCPI3_0:
-; CHECK-NEXT: .short 0x0000 @ half 0
+; CHECK-FP-LABEL: fadd_v4f16:
+; CHECK-FP: @ %bb.0: @ %entry
+; CHECK-FP-NEXT: vmovx.f16 s4, s1
+; CHECK-FP-NEXT: vmovx.f16 s6, s0
+; CHECK-FP-NEXT: vadd.f16 s0, s0, s6
+; CHECK-FP-NEXT: vadd.f16 s4, s1, s4
+; CHECK-FP-NEXT: vldr.16 s2, [r0]
+; CHECK-FP-NEXT: vadd.f16 s0, s0, s4
+; CHECK-FP-NEXT: vadd.f16 s0, s2, s0
+; CHECK-FP-NEXT: vstr.16 s0, [r0]
+; CHECK-FP-NEXT: bx lr
+;
+; CHECK-NOFP-LABEL: fadd_v4f16:
+; CHECK-NOFP: @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT: vmovx.f16 s4, s0
+; CHECK-NOFP-NEXT: vadd.f16 s4, s0, s4
+; CHECK-NOFP-NEXT: vmovx.f16 s0, s1
+; CHECK-NOFP-NEXT: vadd.f16 s4, s4, s1
+; CHECK-NOFP-NEXT: vldr.16 s2, .LCPI4_0
+; CHECK-NOFP-NEXT: vadd.f16 s0, s4, s0
+; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2
+; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2
+; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2
+; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2
+; CHECK-NOFP-NEXT: vldr.16 s2, [r0]
+; CHECK-NOFP-NEXT: vadd.f16 s0, s2, s0
+; CHECK-NOFP-NEXT: vstr.16 s0, [r0]
+; CHECK-NOFP-NEXT: bx lr
+; CHECK-NOFP-NEXT: .p2align 1
+; CHECK-NOFP-NEXT: @ %bb.1:
+; CHECK-NOFP-NEXT: .LCPI4_0:
+; CHECK-NOFP-NEXT: .short 0x0000 @ half 0
entry:
%y = load half, half* %yy
%z = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v4f16(half %y, <4 x half> %x)
}
define arm_aapcs_vfpcc void @fadd_v8f16(<8 x half> %x, half* %yy) {
-; CHECK-LABEL: fadd_v8f16:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovx.f16 s4, s0
-; CHECK-NEXT: vmovx.f16 s6, s1
-; CHECK-NEXT: vadd.f16 s4, s0, s4
-; CHECK-NEXT: vmovx.f16 s0, s3
-; CHECK-NEXT: vadd.f16 s4, s4, s1
-; CHECK-NEXT: vadd.f16 s4, s4, s6
-; CHECK-NEXT: vmovx.f16 s6, s2
-; CHECK-NEXT: vadd.f16 s4, s4, s2
-; CHECK-NEXT: vldr.16 s2, [r0]
-; CHECK-NEXT: vadd.f16 s4, s4, s6
-; CHECK-NEXT: vadd.f16 s4, s4, s3
-; CHECK-NEXT: vadd.f16 s0, s4, s0
-; CHECK-NEXT: vadd.f16 s0, s2, s0
-; CHECK-NEXT: vstr.16 s0, [r0]
-; CHECK-NEXT: bx lr
+; CHECK-FP-LABEL: fadd_v8f16:
+; CHECK-FP: @ %bb.0: @ %entry
+; CHECK-FP-NEXT: vrev32.16 q1, q0
+; CHECK-FP-NEXT: vadd.f16 q0, q0, q1
+; CHECK-FP-NEXT: vadd.f16 s4, s2, s3
+; CHECK-FP-NEXT: vadd.f16 s0, s0, s1
+; CHECK-FP-NEXT: vldr.16 s2, [r0]
+; CHECK-FP-NEXT: vadd.f16 s0, s0, s4
+; CHECK-FP-NEXT: vadd.f16 s0, s2, s0
+; CHECK-FP-NEXT: vstr.16 s0, [r0]
+; CHECK-FP-NEXT: bx lr
+;
+; CHECK-NOFP-LABEL: fadd_v8f16:
+; CHECK-NOFP: @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT: vmovx.f16 s4, s0
+; CHECK-NOFP-NEXT: vmovx.f16 s6, s1
+; CHECK-NOFP-NEXT: vadd.f16 s4, s0, s4
+; CHECK-NOFP-NEXT: vmovx.f16 s0, s3
+; CHECK-NOFP-NEXT: vadd.f16 s4, s4, s1
+; CHECK-NOFP-NEXT: vadd.f16 s4, s4, s6
+; CHECK-NOFP-NEXT: vmovx.f16 s6, s2
+; CHECK-NOFP-NEXT: vadd.f16 s4, s4, s2
+; CHECK-NOFP-NEXT: vldr.16 s2, [r0]
+; CHECK-NOFP-NEXT: vadd.f16 s4, s4, s6
+; CHECK-NOFP-NEXT: vadd.f16 s4, s4, s3
+; CHECK-NOFP-NEXT: vadd.f16 s0, s4, s0
+; CHECK-NOFP-NEXT: vadd.f16 s0, s2, s0
+; CHECK-NOFP-NEXT: vstr.16 s0, [r0]
+; CHECK-NOFP-NEXT: bx lr
entry:
%y = load half, half* %yy
%z = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v8f16(half %y, <8 x half> %x)
; CHECK-FP-LABEL: fadd_v16f16:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vadd.f16 q0, q0, q1
-; CHECK-FP-NEXT: vmovx.f16 s4, s0
-; CHECK-FP-NEXT: vmovx.f16 s6, s1
-; CHECK-FP-NEXT: vadd.f16 s4, s0, s4
-; CHECK-FP-NEXT: vmovx.f16 s0, s3
-; CHECK-FP-NEXT: vadd.f16 s4, s4, s1
-; CHECK-FP-NEXT: vadd.f16 s4, s4, s6
-; CHECK-FP-NEXT: vmovx.f16 s6, s2
-; CHECK-FP-NEXT: vadd.f16 s4, s4, s2
+; CHECK-FP-NEXT: vrev32.16 q1, q0
+; CHECK-FP-NEXT: vadd.f16 q0, q0, q1
+; CHECK-FP-NEXT: vadd.f16 s4, s2, s3
+; CHECK-FP-NEXT: vadd.f16 s0, s0, s1
; CHECK-FP-NEXT: vldr.16 s2, [r0]
-; CHECK-FP-NEXT: vadd.f16 s4, s4, s6
-; CHECK-FP-NEXT: vadd.f16 s4, s4, s3
-; CHECK-FP-NEXT: vadd.f16 s0, s4, s0
+; CHECK-FP-NEXT: vadd.f16 s0, s0, s4
; CHECK-FP-NEXT: vadd.f16 s0, s2, s0
; CHECK-FP-NEXT: vstr.16 s0, [r0]
; CHECK-FP-NEXT: bx lr
declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float, <8 x float>)
declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v16f16(half, <16 x half>)
+declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v2f16(half, <2 x half>)
declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v4f16(half, <4 x half>)
declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v8f16(half, <8 x half>)
; FIXME minnum nonan X, +Inf -> X ?
define arm_aapcs_vfpcc float @fmin_v2f32(<2 x float> %x) {
-; CHECK-LABEL: fmin_v2f32:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldr s4, .LCPI0_0
-; CHECK-NEXT: vminnm.f32 s0, s0, s1
-; CHECK-NEXT: vminnm.f32 s0, s0, s4
-; CHECK-NEXT: vminnm.f32 s0, s0, s4
-; CHECK-NEXT: bx lr
-; CHECK-NEXT: .p2align 2
-; CHECK-NEXT: @ %bb.1:
-; CHECK-NEXT: .LCPI0_0:
-; CHECK-NEXT: .long 0x7f800000 @ float +Inf
+; CHECK-FP-LABEL: fmin_v2f32:
+; CHECK-FP: @ %bb.0: @ %entry
+; CHECK-FP-NEXT: vldr s4, .LCPI0_0
+; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1
+; CHECK-FP-NEXT: vminnm.f32 s0, s0, s4
+; CHECK-FP-NEXT: bx lr
+; CHECK-FP-NEXT: .p2align 2
+; CHECK-FP-NEXT: @ %bb.1:
+; CHECK-FP-NEXT: .LCPI0_0:
+; CHECK-FP-NEXT: .long 0x7f800000 @ float +Inf
+;
+; CHECK-NOFP-LABEL: fmin_v2f32:
+; CHECK-NOFP: @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT: vldr s4, .LCPI0_0
+; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s1
+; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s4
+; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s4
+; CHECK-NOFP-NEXT: bx lr
+; CHECK-NOFP-NEXT: .p2align 2
+; CHECK-NOFP-NEXT: @ %bb.1:
+; CHECK-NOFP-NEXT: .LCPI0_0:
+; CHECK-NOFP-NEXT: .long 0x7f800000 @ float +Inf
entry:
%z = call fast float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %x)
ret float %z
}
define arm_aapcs_vfpcc float @fmin_v4f32(<4 x float> %x) {
-; CHECK-LABEL: fmin_v4f32:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vminnm.f32 s4, s0, s1
-; CHECK-NEXT: vminnm.f32 s4, s4, s2
-; CHECK-NEXT: vminnm.f32 s0, s4, s3
-; CHECK-NEXT: bx lr
+; CHECK-FP-LABEL: fmin_v4f32:
+; CHECK-FP: @ %bb.0: @ %entry
+; CHECK-FP-NEXT: vminnm.f32 s4, s2, s3
+; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1
+; CHECK-FP-NEXT: vminnm.f32 s0, s0, s4
+; CHECK-FP-NEXT: bx lr
+;
+; CHECK-NOFP-LABEL: fmin_v4f32:
+; CHECK-NOFP: @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT: vminnm.f32 s4, s0, s1
+; CHECK-NOFP-NEXT: vminnm.f32 s4, s4, s2
+; CHECK-NOFP-NEXT: vminnm.f32 s0, s4, s3
+; CHECK-NOFP-NEXT: bx lr
entry:
%z = call fast float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %x)
ret float %z
; CHECK-FP-LABEL: fmin_v8f32:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1
-; CHECK-FP-NEXT: vminnm.f32 s4, s0, s1
-; CHECK-FP-NEXT: vminnm.f32 s4, s4, s2
-; CHECK-FP-NEXT: vminnm.f32 s0, s4, s3
+; CHECK-FP-NEXT: vminnm.f32 s4, s2, s3
+; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1
+; CHECK-FP-NEXT: vminnm.f32 s0, s0, s4
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmin_v8f32:
}
define arm_aapcs_vfpcc half @fmin_v4f16(<4 x half> %x) {
-; CHECK-LABEL: fmin_v4f16:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovx.f16 s4, s0
-; CHECK-NEXT: vminnm.f16 s4, s0, s4
-; CHECK-NEXT: vmovx.f16 s0, s1
-; CHECK-NEXT: vminnm.f16 s4, s4, s1
-; CHECK-NEXT: vldr.16 s2, .LCPI3_0
-; CHECK-NEXT: vminnm.f16 s0, s4, s0
-; CHECK-NEXT: vminnm.f16 s0, s0, s2
-; CHECK-NEXT: vminnm.f16 s0, s0, s2
-; CHECK-NEXT: vminnm.f16 s0, s0, s2
-; CHECK-NEXT: vminnm.f16 s0, s0, s2
-; CHECK-NEXT: bx lr
-; CHECK-NEXT: .p2align 1
-; CHECK-NEXT: @ %bb.1:
-; CHECK-NEXT: .LCPI3_0:
-; CHECK-NEXT: .short 0x7c00 @ half +Inf
+; CHECK-FP-LABEL: fmin_v4f16:
+; CHECK-FP: @ %bb.0: @ %entry
+; CHECK-FP-NEXT: vmovx.f16 s4, s1
+; CHECK-FP-NEXT: vmovx.f16 s6, s0
+; CHECK-FP-NEXT: vminnm.f16 s4, s1, s4
+; CHECK-FP-NEXT: vminnm.f16 s0, s0, s6
+; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4
+; CHECK-FP-NEXT: bx lr
+;
+; CHECK-NOFP-LABEL: fmin_v4f16:
+; CHECK-NOFP: @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT: vmovx.f16 s4, s0
+; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4
+; CHECK-NOFP-NEXT: vmovx.f16 s0, s1
+; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1
+; CHECK-NOFP-NEXT: vldr.16 s2, .LCPI3_0
+; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0
+; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT: bx lr
+; CHECK-NOFP-NEXT: .p2align 1
+; CHECK-NOFP-NEXT: @ %bb.1:
+; CHECK-NOFP-NEXT: .LCPI3_0:
+; CHECK-NOFP-NEXT: .short 0x7c00 @ half +Inf
entry:
%z = call fast half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %x)
ret half %z
}
define arm_aapcs_vfpcc half @fmin_v8f16(<8 x half> %x) {
-; CHECK-LABEL: fmin_v8f16:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovx.f16 s4, s0
-; CHECK-NEXT: vmovx.f16 s6, s1
-; CHECK-NEXT: vminnm.f16 s4, s0, s4
-; CHECK-NEXT: vmovx.f16 s0, s3
-; CHECK-NEXT: vminnm.f16 s4, s4, s1
-; CHECK-NEXT: vminnm.f16 s4, s4, s6
-; CHECK-NEXT: vmovx.f16 s6, s2
-; CHECK-NEXT: vminnm.f16 s4, s4, s2
-; CHECK-NEXT: vminnm.f16 s4, s4, s6
-; CHECK-NEXT: vminnm.f16 s4, s4, s3
-; CHECK-NEXT: vminnm.f16 s0, s4, s0
-; CHECK-NEXT: bx lr
+; CHECK-FP-LABEL: fmin_v8f16:
+; CHECK-FP: @ %bb.0: @ %entry
+; CHECK-FP-NEXT: vrev32.16 q1, q0
+; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1
+; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3
+; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1
+; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4
+; CHECK-FP-NEXT: bx lr
+;
+; CHECK-NOFP-LABEL: fmin_v8f16:
+; CHECK-NOFP: @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT: vmovx.f16 s4, s0
+; CHECK-NOFP-NEXT: vmovx.f16 s6, s1
+; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4
+; CHECK-NOFP-NEXT: vmovx.f16 s0, s3
+; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1
+; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s6
+; CHECK-NOFP-NEXT: vmovx.f16 s6, s2
+; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s2
+; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s6
+; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s3
+; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0
+; CHECK-NOFP-NEXT: bx lr
entry:
%z = call fast half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half> %x)
ret half %z
; CHECK-FP-LABEL: fmin_v16f16:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1
-; CHECK-FP-NEXT: vmovx.f16 s4, s0
-; CHECK-FP-NEXT: vmovx.f16 s6, s1
-; CHECK-FP-NEXT: vminnm.f16 s4, s0, s4
-; CHECK-FP-NEXT: vmovx.f16 s0, s3
-; CHECK-FP-NEXT: vminnm.f16 s4, s4, s1
-; CHECK-FP-NEXT: vminnm.f16 s4, s4, s6
-; CHECK-FP-NEXT: vmovx.f16 s6, s2
-; CHECK-FP-NEXT: vminnm.f16 s4, s4, s2
-; CHECK-FP-NEXT: vminnm.f16 s4, s4, s6
-; CHECK-FP-NEXT: vminnm.f16 s4, s4, s3
-; CHECK-FP-NEXT: vminnm.f16 s0, s4, s0
+; CHECK-FP-NEXT: vrev32.16 q1, q0
+; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1
+; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3
+; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1
+; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmin_v16f16:
}
define arm_aapcs_vfpcc float @fmin_v2f32_acc(<2 x float> %x, float %y) {
-; CHECK-LABEL: fmin_v2f32_acc:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldr s6, .LCPI18_0
-; CHECK-NEXT: vminnm.f32 s0, s0, s1
-; CHECK-NEXT: vminnm.f32 s0, s0, s6
-; CHECK-NEXT: vminnm.f32 s0, s0, s6
-; CHECK-NEXT: vminnm.f32 s0, s4, s0
-; CHECK-NEXT: bx lr
-; CHECK-NEXT: .p2align 2
-; CHECK-NEXT: @ %bb.1:
-; CHECK-NEXT: .LCPI18_0:
-; CHECK-NEXT: .long 0x7f800000 @ float +Inf
+; CHECK-FP-LABEL: fmin_v2f32_acc:
+; CHECK-FP: @ %bb.0: @ %entry
+; CHECK-FP-NEXT: vldr s6, .LCPI18_0
+; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1
+; CHECK-FP-NEXT: vminnm.f32 s0, s0, s6
+; CHECK-FP-NEXT: vminnm.f32 s0, s4, s0
+; CHECK-FP-NEXT: bx lr
+; CHECK-FP-NEXT: .p2align 2
+; CHECK-FP-NEXT: @ %bb.1:
+; CHECK-FP-NEXT: .LCPI18_0:
+; CHECK-FP-NEXT: .long 0x7f800000 @ float +Inf
+;
+; CHECK-NOFP-LABEL: fmin_v2f32_acc:
+; CHECK-NOFP: @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT: vldr s6, .LCPI18_0
+; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s1
+; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s6
+; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s6
+; CHECK-NOFP-NEXT: vminnm.f32 s0, s4, s0
+; CHECK-NOFP-NEXT: bx lr
+; CHECK-NOFP-NEXT: .p2align 2
+; CHECK-NOFP-NEXT: @ %bb.1:
+; CHECK-NOFP-NEXT: .LCPI18_0:
+; CHECK-NOFP-NEXT: .long 0x7f800000 @ float +Inf
entry:
%z = call fast float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %x)
%c = fcmp fast olt float %y, %z
}
define arm_aapcs_vfpcc float @fmin_v4f32_acc(<4 x float> %x, float %y) {
-; CHECK-LABEL: fmin_v4f32_acc:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vminnm.f32 s6, s0, s1
-; CHECK-NEXT: vminnm.f32 s6, s6, s2
-; CHECK-NEXT: vminnm.f32 s0, s6, s3
-; CHECK-NEXT: vminnm.f32 s0, s4, s0
-; CHECK-NEXT: bx lr
+; CHECK-FP-LABEL: fmin_v4f32_acc:
+; CHECK-FP: @ %bb.0: @ %entry
+; CHECK-FP-NEXT: vminnm.f32 s6, s2, s3
+; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1
+; CHECK-FP-NEXT: vminnm.f32 s0, s0, s6
+; CHECK-FP-NEXT: vminnm.f32 s0, s4, s0
+; CHECK-FP-NEXT: bx lr
+;
+; CHECK-NOFP-LABEL: fmin_v4f32_acc:
+; CHECK-NOFP: @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT: vminnm.f32 s6, s0, s1
+; CHECK-NOFP-NEXT: vminnm.f32 s6, s6, s2
+; CHECK-NOFP-NEXT: vminnm.f32 s0, s6, s3
+; CHECK-NOFP-NEXT: vminnm.f32 s0, s4, s0
+; CHECK-NOFP-NEXT: bx lr
entry:
%z = call fast float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %x)
%c = fcmp fast olt float %y, %z
; CHECK-FP-LABEL: fmin_v8f32_acc:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1
-; CHECK-FP-NEXT: vminnm.f32 s4, s0, s1
-; CHECK-FP-NEXT: vminnm.f32 s4, s4, s2
-; CHECK-FP-NEXT: vminnm.f32 s0, s4, s3
+; CHECK-FP-NEXT: vminnm.f32 s4, s2, s3
+; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1
+; CHECK-FP-NEXT: vminnm.f32 s0, s0, s4
; CHECK-FP-NEXT: vminnm.f32 s0, s8, s0
; CHECK-FP-NEXT: bx lr
;
}
define arm_aapcs_vfpcc void @fmin_v4f16_acc(<4 x half> %x, half* %yy) {
-; CHECK-LABEL: fmin_v4f16_acc:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovx.f16 s4, s0
-; CHECK-NEXT: vminnm.f16 s4, s0, s4
-; CHECK-NEXT: vmovx.f16 s0, s1
-; CHECK-NEXT: vminnm.f16 s4, s4, s1
-; CHECK-NEXT: vldr.16 s2, .LCPI21_0
-; CHECK-NEXT: vminnm.f16 s0, s4, s0
-; CHECK-NEXT: vminnm.f16 s0, s0, s2
-; CHECK-NEXT: vminnm.f16 s0, s0, s2
-; CHECK-NEXT: vminnm.f16 s0, s0, s2
-; CHECK-NEXT: vminnm.f16 s0, s0, s2
-; CHECK-NEXT: vldr.16 s2, [r0]
-; CHECK-NEXT: vminnm.f16 s0, s2, s0
-; CHECK-NEXT: vstr.16 s0, [r0]
-; CHECK-NEXT: bx lr
-; CHECK-NEXT: .p2align 1
-; CHECK-NEXT: @ %bb.1:
-; CHECK-NEXT: .LCPI21_0:
-; CHECK-NEXT: .short 0x7c00 @ half +Inf
+; CHECK-FP-LABEL: fmin_v4f16_acc:
+; CHECK-FP: @ %bb.0: @ %entry
+; CHECK-FP-NEXT: vmovx.f16 s4, s1
+; CHECK-FP-NEXT: vmovx.f16 s6, s0
+; CHECK-FP-NEXT: vminnm.f16 s0, s0, s6
+; CHECK-FP-NEXT: vminnm.f16 s4, s1, s4
+; CHECK-FP-NEXT: vldr.16 s2, [r0]
+; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4
+; CHECK-FP-NEXT: vminnm.f16 s0, s2, s0
+; CHECK-FP-NEXT: vstr.16 s0, [r0]
+; CHECK-FP-NEXT: bx lr
+;
+; CHECK-NOFP-LABEL: fmin_v4f16_acc:
+; CHECK-NOFP: @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT: vmovx.f16 s4, s0
+; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4
+; CHECK-NOFP-NEXT: vmovx.f16 s0, s1
+; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1
+; CHECK-NOFP-NEXT: vldr.16 s2, .LCPI21_0
+; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0
+; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT: vldr.16 s2, [r0]
+; CHECK-NOFP-NEXT: vminnm.f16 s0, s2, s0
+; CHECK-NOFP-NEXT: vstr.16 s0, [r0]
+; CHECK-NOFP-NEXT: bx lr
+; CHECK-NOFP-NEXT: .p2align 1
+; CHECK-NOFP-NEXT: @ %bb.1:
+; CHECK-NOFP-NEXT: .LCPI21_0:
+; CHECK-NOFP-NEXT: .short 0x7c00 @ half +Inf
entry:
%y = load half, half* %yy
%z = call fast half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %x)
ret void
}
+define arm_aapcs_vfpcc void @fmin_v2f16_acc(<2 x half> %x, half* %yy) {
+; CHECK-FP-LABEL: fmin_v2f16_acc:
+; CHECK-FP: @ %bb.0: @ %entry
+; CHECK-FP-NEXT: vmovx.f16 s4, s0
+; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4
+; CHECK-FP-NEXT: vldr.16 s2, [r0]
+; CHECK-FP-NEXT: vminnm.f16 s0, s2, s0
+; CHECK-FP-NEXT: vstr.16 s0, [r0]
+; CHECK-FP-NEXT: bx lr
+;
+; CHECK-NOFP-LABEL: fmin_v2f16_acc:
+; CHECK-NOFP: @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT: vmovx.f16 s4, s0
+; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4
+; CHECK-NOFP-NEXT: vldr.16 s2, .LCPI22_0
+; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT: vldr.16 s2, [r0]
+; CHECK-NOFP-NEXT: vminnm.f16 s0, s2, s0
+; CHECK-NOFP-NEXT: vstr.16 s0, [r0]
+; CHECK-NOFP-NEXT: bx lr
+; CHECK-NOFP-NEXT: .p2align 1
+; CHECK-NOFP-NEXT: @ %bb.1:
+; CHECK-NOFP-NEXT: .LCPI22_0:
+; CHECK-NOFP-NEXT: .short 0x7c00 @ half +Inf
+entry:
+ %y = load half, half* %yy
+ %z = call fast half @llvm.experimental.vector.reduce.fmin.v2f16(<2 x half> %x)
+ %c = fcmp fast olt half %y, %z
+ %r = select i1 %c, half %y, half %z
+ store half %r, half* %yy
+ ret void
+}
+
define arm_aapcs_vfpcc void @fmin_v8f16_acc(<8 x half> %x, half* %yy) {
-; CHECK-LABEL: fmin_v8f16_acc:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovx.f16 s4, s0
-; CHECK-NEXT: vmovx.f16 s6, s1
-; CHECK-NEXT: vminnm.f16 s4, s0, s4
-; CHECK-NEXT: vmovx.f16 s0, s3
-; CHECK-NEXT: vminnm.f16 s4, s4, s1
-; CHECK-NEXT: vminnm.f16 s4, s4, s6
-; CHECK-NEXT: vmovx.f16 s6, s2
-; CHECK-NEXT: vminnm.f16 s4, s4, s2
-; CHECK-NEXT: vldr.16 s2, [r0]
-; CHECK-NEXT: vminnm.f16 s4, s4, s6
-; CHECK-NEXT: vminnm.f16 s4, s4, s3
-; CHECK-NEXT: vminnm.f16 s0, s4, s0
-; CHECK-NEXT: vminnm.f16 s0, s2, s0
-; CHECK-NEXT: vstr.16 s0, [r0]
-; CHECK-NEXT: bx lr
+; CHECK-FP-LABEL: fmin_v8f16_acc:
+; CHECK-FP: @ %bb.0: @ %entry
+; CHECK-FP-NEXT: vrev32.16 q1, q0
+; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1
+; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3
+; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1
+; CHECK-FP-NEXT: vldr.16 s2, [r0]
+; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4
+; CHECK-FP-NEXT: vminnm.f16 s0, s2, s0
+; CHECK-FP-NEXT: vstr.16 s0, [r0]
+; CHECK-FP-NEXT: bx lr
+;
+; CHECK-NOFP-LABEL: fmin_v8f16_acc:
+; CHECK-NOFP: @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT: vmovx.f16 s4, s0
+; CHECK-NOFP-NEXT: vmovx.f16 s6, s1
+; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4
+; CHECK-NOFP-NEXT: vmovx.f16 s0, s3
+; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1
+; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s6
+; CHECK-NOFP-NEXT: vmovx.f16 s6, s2
+; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s2
+; CHECK-NOFP-NEXT: vldr.16 s2, [r0]
+; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s6
+; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s3
+; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0
+; CHECK-NOFP-NEXT: vminnm.f16 s0, s2, s0
+; CHECK-NOFP-NEXT: vstr.16 s0, [r0]
+; CHECK-NOFP-NEXT: bx lr
entry:
%y = load half, half* %yy
%z = call fast half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half> %x)
; CHECK-FP-LABEL: fmin_v16f16_acc:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1
-; CHECK-FP-NEXT: vmovx.f16 s4, s0
-; CHECK-FP-NEXT: vmovx.f16 s6, s1
-; CHECK-FP-NEXT: vminnm.f16 s4, s0, s4
-; CHECK-FP-NEXT: vmovx.f16 s0, s3
-; CHECK-FP-NEXT: vminnm.f16 s4, s4, s1
-; CHECK-FP-NEXT: vminnm.f16 s4, s4, s6
-; CHECK-FP-NEXT: vmovx.f16 s6, s2
-; CHECK-FP-NEXT: vminnm.f16 s4, s4, s2
+; CHECK-FP-NEXT: vrev32.16 q1, q0
+; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1
+; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3
+; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1
; CHECK-FP-NEXT: vldr.16 s2, [r0]
-; CHECK-FP-NEXT: vminnm.f16 s4, s4, s6
-; CHECK-FP-NEXT: vminnm.f16 s4, s4, s3
-; CHECK-FP-NEXT: vminnm.f16 s0, s4, s0
+; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4
; CHECK-FP-NEXT: vminnm.f16 s0, s2, s0
; CHECK-FP-NEXT: vstr.16 s0, [r0]
; CHECK-FP-NEXT: bx lr
}
define arm_aapcs_vfpcc float @fmax_v2f32(<2 x float> %x) {
-; CHECK-LABEL: fmax_v2f32:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldr s4, .LCPI36_0
-; CHECK-NEXT: vmaxnm.f32 s0, s0, s1
-; CHECK-NEXT: vmaxnm.f32 s0, s0, s4
-; CHECK-NEXT: vmaxnm.f32 s0, s0, s4
-; CHECK-NEXT: bx lr
-; CHECK-NEXT: .p2align 2
-; CHECK-NEXT: @ %bb.1:
-; CHECK-NEXT: .LCPI36_0:
-; CHECK-NEXT: .long 0xff800000 @ float -Inf
+; CHECK-FP-LABEL: fmax_v2f32:
+; CHECK-FP: @ %bb.0: @ %entry
+; CHECK-FP-NEXT: vldr s4, .LCPI37_0
+; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1
+; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s4
+; CHECK-FP-NEXT: bx lr
+; CHECK-FP-NEXT: .p2align 2
+; CHECK-FP-NEXT: @ %bb.1:
+; CHECK-FP-NEXT: .LCPI37_0:
+; CHECK-FP-NEXT: .long 0xff800000 @ float -Inf
+;
+; CHECK-NOFP-LABEL: fmax_v2f32:
+; CHECK-NOFP: @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT: vldr s4, .LCPI37_0
+; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s1
+; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s4
+; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s4
+; CHECK-NOFP-NEXT: bx lr
+; CHECK-NOFP-NEXT: .p2align 2
+; CHECK-NOFP-NEXT: @ %bb.1:
+; CHECK-NOFP-NEXT: .LCPI37_0:
+; CHECK-NOFP-NEXT: .long 0xff800000 @ float -Inf
entry:
%z = call fast float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %x)
ret float %z
}
define arm_aapcs_vfpcc float @fmax_v4f32(<4 x float> %x) {
-; CHECK-LABEL: fmax_v4f32:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmaxnm.f32 s4, s0, s1
-; CHECK-NEXT: vmaxnm.f32 s4, s4, s2
-; CHECK-NEXT: vmaxnm.f32 s0, s4, s3
-; CHECK-NEXT: bx lr
+; CHECK-FP-LABEL: fmax_v4f32:
+; CHECK-FP: @ %bb.0: @ %entry
+; CHECK-FP-NEXT: vmaxnm.f32 s4, s2, s3
+; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1
+; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s4
+; CHECK-FP-NEXT: bx lr
+;
+; CHECK-NOFP-LABEL: fmax_v4f32:
+; CHECK-NOFP: @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT: vmaxnm.f32 s4, s0, s1
+; CHECK-NOFP-NEXT: vmaxnm.f32 s4, s4, s2
+; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s4, s3
+; CHECK-NOFP-NEXT: bx lr
entry:
%z = call fast float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %x)
ret float %z
; CHECK-FP-LABEL: fmax_v8f32:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1
-; CHECK-FP-NEXT: vmaxnm.f32 s4, s0, s1
-; CHECK-FP-NEXT: vmaxnm.f32 s4, s4, s2
-; CHECK-FP-NEXT: vmaxnm.f32 s0, s4, s3
+; CHECK-FP-NEXT: vmaxnm.f32 s4, s2, s3
+; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1
+; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s4
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmax_v8f32:
}
define arm_aapcs_vfpcc half @fmax_v4f16(<4 x half> %x) {
-; CHECK-LABEL: fmax_v4f16:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovx.f16 s4, s0
-; CHECK-NEXT: vmaxnm.f16 s4, s0, s4
-; CHECK-NEXT: vmovx.f16 s0, s1
-; CHECK-NEXT: vmaxnm.f16 s4, s4, s1
-; CHECK-NEXT: vldr.16 s2, .LCPI39_0
-; CHECK-NEXT: vmaxnm.f16 s0, s4, s0
-; CHECK-NEXT: vmaxnm.f16 s0, s0, s2
-; CHECK-NEXT: vmaxnm.f16 s0, s0, s2
-; CHECK-NEXT: vmaxnm.f16 s0, s0, s2
-; CHECK-NEXT: vmaxnm.f16 s0, s0, s2
-; CHECK-NEXT: bx lr
-; CHECK-NEXT: .p2align 1
-; CHECK-NEXT: @ %bb.1:
-; CHECK-NEXT: .LCPI39_0:
-; CHECK-NEXT: .short 0xfc00 @ half -Inf
+; CHECK-FP-LABEL: fmax_v4f16:
+; CHECK-FP: @ %bb.0: @ %entry
+; CHECK-FP-NEXT: vmovx.f16 s4, s1
+; CHECK-FP-NEXT: vmovx.f16 s6, s0
+; CHECK-FP-NEXT: vmaxnm.f16 s4, s1, s4
+; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s6
+; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4
+; CHECK-FP-NEXT: bx lr
+;
+; CHECK-NOFP-LABEL: fmax_v4f16:
+; CHECK-NOFP: @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT: vmovx.f16 s4, s0
+; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4
+; CHECK-NOFP-NEXT: vmovx.f16 s0, s1
+; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1
+; CHECK-NOFP-NEXT: vldr.16 s2, .LCPI40_0
+; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0
+; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT: bx lr
+; CHECK-NOFP-NEXT: .p2align 1
+; CHECK-NOFP-NEXT: @ %bb.1:
+; CHECK-NOFP-NEXT: .LCPI40_0:
+; CHECK-NOFP-NEXT: .short 0xfc00 @ half -Inf
entry:
%z = call fast half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %x)
ret half %z
}
define arm_aapcs_vfpcc half @fmax_v8f16(<8 x half> %x) {
-; CHECK-LABEL: fmax_v8f16:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovx.f16 s4, s0
-; CHECK-NEXT: vmovx.f16 s6, s1
-; CHECK-NEXT: vmaxnm.f16 s4, s0, s4
-; CHECK-NEXT: vmovx.f16 s0, s3
-; CHECK-NEXT: vmaxnm.f16 s4, s4, s1
-; CHECK-NEXT: vmaxnm.f16 s4, s4, s6
-; CHECK-NEXT: vmovx.f16 s6, s2
-; CHECK-NEXT: vmaxnm.f16 s4, s4, s2
-; CHECK-NEXT: vmaxnm.f16 s4, s4, s6
-; CHECK-NEXT: vmaxnm.f16 s4, s4, s3
-; CHECK-NEXT: vmaxnm.f16 s0, s4, s0
-; CHECK-NEXT: bx lr
+; CHECK-FP-LABEL: fmax_v8f16:
+; CHECK-FP: @ %bb.0: @ %entry
+; CHECK-FP-NEXT: vrev32.16 q1, q0
+; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1
+; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3
+; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1
+; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4
+; CHECK-FP-NEXT: bx lr
+;
+; CHECK-NOFP-LABEL: fmax_v8f16:
+; CHECK-NOFP: @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT: vmovx.f16 s4, s0
+; CHECK-NOFP-NEXT: vmovx.f16 s6, s1
+; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4
+; CHECK-NOFP-NEXT: vmovx.f16 s0, s3
+; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1
+; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s6
+; CHECK-NOFP-NEXT: vmovx.f16 s6, s2
+; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s2
+; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s6
+; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s3
+; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0
+; CHECK-NOFP-NEXT: bx lr
entry:
%z = call fast half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half> %x)
ret half %z
; CHECK-FP-LABEL: fmax_v16f16:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1
-; CHECK-FP-NEXT: vmovx.f16 s4, s0
-; CHECK-FP-NEXT: vmovx.f16 s6, s1
-; CHECK-FP-NEXT: vmaxnm.f16 s4, s0, s4
-; CHECK-FP-NEXT: vmovx.f16 s0, s3
-; CHECK-FP-NEXT: vmaxnm.f16 s4, s4, s1
-; CHECK-FP-NEXT: vmaxnm.f16 s4, s4, s6
-; CHECK-FP-NEXT: vmovx.f16 s6, s2
-; CHECK-FP-NEXT: vmaxnm.f16 s4, s4, s2
-; CHECK-FP-NEXT: vmaxnm.f16 s4, s4, s6
-; CHECK-FP-NEXT: vmaxnm.f16 s4, s4, s3
-; CHECK-FP-NEXT: vmaxnm.f16 s0, s4, s0
+; CHECK-FP-NEXT: vrev32.16 q1, q0
+; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1
+; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3
+; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1
+; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmax_v16f16:
}
define arm_aapcs_vfpcc float @fmax_v2f32_acc(<2 x float> %x, float %y) {
-; CHECK-LABEL: fmax_v2f32_acc:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldr s6, .LCPI54_0
-; CHECK-NEXT: vmaxnm.f32 s0, s0, s1
-; CHECK-NEXT: vmaxnm.f32 s0, s0, s6
-; CHECK-NEXT: vmaxnm.f32 s0, s0, s6
-; CHECK-NEXT: vmaxnm.f32 s0, s4, s0
-; CHECK-NEXT: bx lr
-; CHECK-NEXT: .p2align 2
-; CHECK-NEXT: @ %bb.1:
-; CHECK-NEXT: .LCPI54_0:
-; CHECK-NEXT: .long 0xff800000 @ float -Inf
+; CHECK-FP-LABEL: fmax_v2f32_acc:
+; CHECK-FP: @ %bb.0: @ %entry
+; CHECK-FP-NEXT: vldr s6, .LCPI55_0
+; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1
+; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s6
+; CHECK-FP-NEXT: vmaxnm.f32 s0, s4, s0
+; CHECK-FP-NEXT: bx lr
+; CHECK-FP-NEXT: .p2align 2
+; CHECK-FP-NEXT: @ %bb.1:
+; CHECK-FP-NEXT: .LCPI55_0:
+; CHECK-FP-NEXT: .long 0xff800000 @ float -Inf
+;
+; CHECK-NOFP-LABEL: fmax_v2f32_acc:
+; CHECK-NOFP: @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT: vldr s6, .LCPI55_0
+; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s1
+; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s6
+; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s6
+; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s4, s0
+; CHECK-NOFP-NEXT: bx lr
+; CHECK-NOFP-NEXT: .p2align 2
+; CHECK-NOFP-NEXT: @ %bb.1:
+; CHECK-NOFP-NEXT: .LCPI55_0:
+; CHECK-NOFP-NEXT: .long 0xff800000 @ float -Inf
entry:
%z = call fast float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %x)
%c = fcmp fast ogt float %y, %z
}
define arm_aapcs_vfpcc float @fmax_v4f32_acc(<4 x float> %x, float %y) {
-; CHECK-LABEL: fmax_v4f32_acc:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmaxnm.f32 s6, s0, s1
-; CHECK-NEXT: vmaxnm.f32 s6, s6, s2
-; CHECK-NEXT: vmaxnm.f32 s0, s6, s3
-; CHECK-NEXT: vmaxnm.f32 s0, s4, s0
-; CHECK-NEXT: bx lr
+; CHECK-FP-LABEL: fmax_v4f32_acc:
+; CHECK-FP: @ %bb.0: @ %entry
+; CHECK-FP-NEXT: vmaxnm.f32 s6, s2, s3
+; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1
+; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s6
+; CHECK-FP-NEXT: vmaxnm.f32 s0, s4, s0
+; CHECK-FP-NEXT: bx lr
+;
+; CHECK-NOFP-LABEL: fmax_v4f32_acc:
+; CHECK-NOFP: @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT: vmaxnm.f32 s6, s0, s1
+; CHECK-NOFP-NEXT: vmaxnm.f32 s6, s6, s2
+; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s6, s3
+; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s4, s0
+; CHECK-NOFP-NEXT: bx lr
entry:
%z = call fast float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %x)
%c = fcmp fast ogt float %y, %z
; CHECK-FP-LABEL: fmax_v8f32_acc:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1
-; CHECK-FP-NEXT: vmaxnm.f32 s4, s0, s1
-; CHECK-FP-NEXT: vmaxnm.f32 s4, s4, s2
-; CHECK-FP-NEXT: vmaxnm.f32 s0, s4, s3
+; CHECK-FP-NEXT: vmaxnm.f32 s4, s2, s3
+; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1
+; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s4
; CHECK-FP-NEXT: vmaxnm.f32 s0, s8, s0
; CHECK-FP-NEXT: bx lr
;
ret float %r
}
+define arm_aapcs_vfpcc void @fmax_v2f16_acc(<2 x half> %x, half* %yy) {
+; CHECK-FP-LABEL: fmax_v2f16_acc:
+; CHECK-FP: @ %bb.0: @ %entry
+; CHECK-FP-NEXT: vmovx.f16 s4, s0
+; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4
+; CHECK-FP-NEXT: vldr.16 s2, [r0]
+; CHECK-FP-NEXT: vmaxnm.f16 s0, s2, s0
+; CHECK-FP-NEXT: vstr.16 s0, [r0]
+; CHECK-FP-NEXT: bx lr
+;
+; CHECK-NOFP-LABEL: fmax_v2f16_acc:
+; CHECK-NOFP: @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT: vmovx.f16 s4, s0
+; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4
+; CHECK-NOFP-NEXT: vldr.16 s2, .LCPI58_0
+; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT: vldr.16 s2, [r0]
+; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s2, s0
+; CHECK-NOFP-NEXT: vstr.16 s0, [r0]
+; CHECK-NOFP-NEXT: bx lr
+; CHECK-NOFP-NEXT: .p2align 1
+; CHECK-NOFP-NEXT: @ %bb.1:
+; CHECK-NOFP-NEXT: .LCPI58_0:
+; CHECK-NOFP-NEXT: .short 0xfc00 @ half -Inf
+entry:
+ %y = load half, half* %yy
+ %z = call fast half @llvm.experimental.vector.reduce.fmax.v2f16(<2 x half> %x)
+ %c = fcmp fast ogt half %y, %z
+ %r = select i1 %c, half %y, half %z
+ store half %r, half* %yy
+ ret void
+}
+
define arm_aapcs_vfpcc void @fmax_v4f16_acc(<4 x half> %x, half* %yy) {
-; CHECK-LABEL: fmax_v4f16_acc:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovx.f16 s4, s0
-; CHECK-NEXT: vmaxnm.f16 s4, s0, s4
-; CHECK-NEXT: vmovx.f16 s0, s1
-; CHECK-NEXT: vmaxnm.f16 s4, s4, s1
-; CHECK-NEXT: vldr.16 s2, .LCPI57_0
-; CHECK-NEXT: vmaxnm.f16 s0, s4, s0
-; CHECK-NEXT: vmaxnm.f16 s0, s0, s2
-; CHECK-NEXT: vmaxnm.f16 s0, s0, s2
-; CHECK-NEXT: vmaxnm.f16 s0, s0, s2
-; CHECK-NEXT: vmaxnm.f16 s0, s0, s2
-; CHECK-NEXT: vldr.16 s2, [r0]
-; CHECK-NEXT: vmaxnm.f16 s0, s2, s0
-; CHECK-NEXT: vstr.16 s0, [r0]
-; CHECK-NEXT: bx lr
-; CHECK-NEXT: .p2align 1
-; CHECK-NEXT: @ %bb.1:
-; CHECK-NEXT: .LCPI57_0:
-; CHECK-NEXT: .short 0xfc00 @ half -Inf
+; CHECK-FP-LABEL: fmax_v4f16_acc:
+; CHECK-FP: @ %bb.0: @ %entry
+; CHECK-FP-NEXT: vmovx.f16 s4, s1
+; CHECK-FP-NEXT: vmovx.f16 s6, s0
+; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s6
+; CHECK-FP-NEXT: vmaxnm.f16 s4, s1, s4
+; CHECK-FP-NEXT: vldr.16 s2, [r0]
+; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4
+; CHECK-FP-NEXT: vmaxnm.f16 s0, s2, s0
+; CHECK-FP-NEXT: vstr.16 s0, [r0]
+; CHECK-FP-NEXT: bx lr
+;
+; CHECK-NOFP-LABEL: fmax_v4f16_acc:
+; CHECK-NOFP: @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT: vmovx.f16 s4, s0
+; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4
+; CHECK-NOFP-NEXT: vmovx.f16 s0, s1
+; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1
+; CHECK-NOFP-NEXT: vldr.16 s2, .LCPI59_0
+; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0
+; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT: vldr.16 s2, [r0]
+; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s2, s0
+; CHECK-NOFP-NEXT: vstr.16 s0, [r0]
+; CHECK-NOFP-NEXT: bx lr
+; CHECK-NOFP-NEXT: .p2align 1
+; CHECK-NOFP-NEXT: @ %bb.1:
+; CHECK-NOFP-NEXT: .LCPI59_0:
+; CHECK-NOFP-NEXT: .short 0xfc00 @ half -Inf
entry:
%y = load half, half* %yy
%z = call fast half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %x)
}
define arm_aapcs_vfpcc void @fmax_v8f16_acc(<8 x half> %x, half* %yy) {
-; CHECK-LABEL: fmax_v8f16_acc:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovx.f16 s4, s0
-; CHECK-NEXT: vmovx.f16 s6, s1
-; CHECK-NEXT: vmaxnm.f16 s4, s0, s4
-; CHECK-NEXT: vmovx.f16 s0, s3
-; CHECK-NEXT: vmaxnm.f16 s4, s4, s1
-; CHECK-NEXT: vmaxnm.f16 s4, s4, s6
-; CHECK-NEXT: vmovx.f16 s6, s2
-; CHECK-NEXT: vmaxnm.f16 s4, s4, s2
-; CHECK-NEXT: vldr.16 s2, [r0]
-; CHECK-NEXT: vmaxnm.f16 s4, s4, s6
-; CHECK-NEXT: vmaxnm.f16 s4, s4, s3
-; CHECK-NEXT: vmaxnm.f16 s0, s4, s0
-; CHECK-NEXT: vmaxnm.f16 s0, s2, s0
-; CHECK-NEXT: vstr.16 s0, [r0]
-; CHECK-NEXT: bx lr
+; CHECK-FP-LABEL: fmax_v8f16_acc:
+; CHECK-FP: @ %bb.0: @ %entry
+; CHECK-FP-NEXT: vrev32.16 q1, q0
+; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1
+; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3
+; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1
+; CHECK-FP-NEXT: vldr.16 s2, [r0]
+; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4
+; CHECK-FP-NEXT: vmaxnm.f16 s0, s2, s0
+; CHECK-FP-NEXT: vstr.16 s0, [r0]
+; CHECK-FP-NEXT: bx lr
+;
+; CHECK-NOFP-LABEL: fmax_v8f16_acc:
+; CHECK-NOFP: @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT: vmovx.f16 s4, s0
+; CHECK-NOFP-NEXT: vmovx.f16 s6, s1
+; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4
+; CHECK-NOFP-NEXT: vmovx.f16 s0, s3
+; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1
+; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s6
+; CHECK-NOFP-NEXT: vmovx.f16 s6, s2
+; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s2
+; CHECK-NOFP-NEXT: vldr.16 s2, [r0]
+; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s6
+; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s3
+; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0
+; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s2, s0
+; CHECK-NOFP-NEXT: vstr.16 s0, [r0]
+; CHECK-NOFP-NEXT: bx lr
entry:
%y = load half, half* %yy
%z = call fast half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half> %x)
; CHECK-FP-LABEL: fmax_v16f16_acc:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1
-; CHECK-FP-NEXT: vmovx.f16 s4, s0
-; CHECK-FP-NEXT: vmovx.f16 s6, s1
-; CHECK-FP-NEXT: vmaxnm.f16 s4, s0, s4
-; CHECK-FP-NEXT: vmovx.f16 s0, s3
-; CHECK-FP-NEXT: vmaxnm.f16 s4, s4, s1
-; CHECK-FP-NEXT: vmaxnm.f16 s4, s4, s6
-; CHECK-FP-NEXT: vmovx.f16 s6, s2
-; CHECK-FP-NEXT: vmaxnm.f16 s4, s4, s2
+; CHECK-FP-NEXT: vrev32.16 q1, q0
+; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1
+; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3
+; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1
; CHECK-FP-NEXT: vldr.16 s2, [r0]
-; CHECK-FP-NEXT: vmaxnm.f16 s4, s4, s6
-; CHECK-FP-NEXT: vmaxnm.f16 s4, s4, s3
-; CHECK-FP-NEXT: vmaxnm.f16 s0, s4, s0
+; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4
; CHECK-FP-NEXT: vmaxnm.f16 s0, s2, s0
; CHECK-FP-NEXT: vstr.16 s0, [r0]
; CHECK-FP-NEXT: bx lr
declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float>)
declare float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float>)
declare half @llvm.experimental.vector.reduce.fmax.v16f16(<16 x half>)
+declare half @llvm.experimental.vector.reduce.fmax.v2f16(<2 x half>)
declare half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half>)
declare half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half>)
declare half @llvm.experimental.vector.reduce.fmin.v16f16(<16 x half>)
+declare half @llvm.experimental.vector.reduce.fmin.v2f16(<2 x half>)
declare half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half>)
declare half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half>)
}
define arm_aapcs_vfpcc float @fmul_v4f32(<4 x float> %x, float %y) {
-; CHECK-LABEL: fmul_v4f32:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmul.f32 s6, s0, s1
-; CHECK-NEXT: vmul.f32 s6, s6, s2
-; CHECK-NEXT: vmul.f32 s0, s6, s3
-; CHECK-NEXT: vmul.f32 s0, s4, s0
-; CHECK-NEXT: bx lr
+; CHECK-FP-LABEL: fmul_v4f32:
+; CHECK-FP: @ %bb.0: @ %entry
+; CHECK-FP-NEXT: vmul.f32 s6, s2, s3
+; CHECK-FP-NEXT: vmul.f32 s0, s0, s1
+; CHECK-FP-NEXT: vmul.f32 s0, s0, s6
+; CHECK-FP-NEXT: vmul.f32 s0, s4, s0
+; CHECK-FP-NEXT: bx lr
+;
+; CHECK-NOFP-LABEL: fmul_v4f32:
+; CHECK-NOFP: @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT: vmul.f32 s6, s0, s1
+; CHECK-NOFP-NEXT: vmul.f32 s6, s6, s2
+; CHECK-NOFP-NEXT: vmul.f32 s0, s6, s3
+; CHECK-NOFP-NEXT: vmul.f32 s0, s4, s0
+; CHECK-NOFP-NEXT: bx lr
entry:
%z = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %y, <4 x float> %x)
ret float %z
; CHECK-FP-LABEL: fmul_v8f32:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vmul.f32 q0, q0, q1
-; CHECK-FP-NEXT: vmul.f32 s4, s0, s1
-; CHECK-FP-NEXT: vmul.f32 s4, s4, s2
-; CHECK-FP-NEXT: vmul.f32 s0, s4, s3
+; CHECK-FP-NEXT: vmul.f32 s4, s2, s3
+; CHECK-FP-NEXT: vmul.f32 s0, s0, s1
+; CHECK-FP-NEXT: vmul.f32 s0, s0, s4
; CHECK-FP-NEXT: vmul.f32 s0, s8, s0
; CHECK-FP-NEXT: bx lr
;
ret float %z
}
-define arm_aapcs_vfpcc void @fmul_v4f16(<4 x half> %x, half* %yy) {
-; CHECK-LABEL: fmul_v4f16:
+define arm_aapcs_vfpcc void @fmul_v2f16(<2 x half> %x, half* %yy) {
+; CHECK-LABEL: fmul_v2f16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovx.f16 s4, s0
-; CHECK-NEXT: vmul.f16 s4, s0, s4
-; CHECK-NEXT: vmovx.f16 s0, s1
-; CHECK-NEXT: vmul.f16 s4, s4, s1
+; CHECK-NEXT: vmul.f16 s0, s0, s4
; CHECK-NEXT: vldr.16 s2, [r0]
-; CHECK-NEXT: vmul.f16 s0, s4, s0
; CHECK-NEXT: vmul.f16 s0, s2, s0
; CHECK-NEXT: vstr.16 s0, [r0]
; CHECK-NEXT: bx lr
entry:
%y = load half, half* %yy
+ %z = call fast half @llvm.experimental.vector.reduce.v2.fmul.f16.v2f16(half %y, <2 x half> %x)
+ store half %z, half* %yy
+ ret void
+}
+
+define arm_aapcs_vfpcc void @fmul_v4f16(<4 x half> %x, half* %yy) {
+; CHECK-FP-LABEL: fmul_v4f16:
+; CHECK-FP: @ %bb.0: @ %entry
+; CHECK-FP-NEXT: vmovx.f16 s4, s1
+; CHECK-FP-NEXT: vmovx.f16 s6, s0
+; CHECK-FP-NEXT: vmul.f16 s0, s0, s6
+; CHECK-FP-NEXT: vmul.f16 s4, s1, s4
+; CHECK-FP-NEXT: vldr.16 s2, [r0]
+; CHECK-FP-NEXT: vmul.f16 s0, s0, s4
+; CHECK-FP-NEXT: vmul.f16 s0, s2, s0
+; CHECK-FP-NEXT: vstr.16 s0, [r0]
+; CHECK-FP-NEXT: bx lr
+;
+; CHECK-NOFP-LABEL: fmul_v4f16:
+; CHECK-NOFP: @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT: vmovx.f16 s4, s0
+; CHECK-NOFP-NEXT: vmul.f16 s4, s0, s4
+; CHECK-NOFP-NEXT: vmovx.f16 s0, s1
+; CHECK-NOFP-NEXT: vmul.f16 s4, s4, s1
+; CHECK-NOFP-NEXT: vldr.16 s2, [r0]
+; CHECK-NOFP-NEXT: vmul.f16 s0, s4, s0
+; CHECK-NOFP-NEXT: vmul.f16 s0, s2, s0
+; CHECK-NOFP-NEXT: vstr.16 s0, [r0]
+; CHECK-NOFP-NEXT: bx lr
+entry:
+ %y = load half, half* %yy
%z = call fast half @llvm.experimental.vector.reduce.v2.fmul.f16.v4f16(half %y, <4 x half> %x)
store half %z, half* %yy
ret void
}
define arm_aapcs_vfpcc void @fmul_v8f16(<8 x half> %x, half* %yy) {
-; CHECK-LABEL: fmul_v8f16:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovx.f16 s4, s0
-; CHECK-NEXT: vmovx.f16 s6, s1
-; CHECK-NEXT: vmul.f16 s4, s0, s4
-; CHECK-NEXT: vmovx.f16 s0, s3
-; CHECK-NEXT: vmul.f16 s4, s4, s1
-; CHECK-NEXT: vmul.f16 s4, s4, s6
-; CHECK-NEXT: vmovx.f16 s6, s2
-; CHECK-NEXT: vmul.f16 s4, s4, s2
-; CHECK-NEXT: vldr.16 s2, [r0]
-; CHECK-NEXT: vmul.f16 s4, s4, s6
-; CHECK-NEXT: vmul.f16 s4, s4, s3
-; CHECK-NEXT: vmul.f16 s0, s4, s0
-; CHECK-NEXT: vmul.f16 s0, s2, s0
-; CHECK-NEXT: vstr.16 s0, [r0]
-; CHECK-NEXT: bx lr
+; CHECK-FP-LABEL: fmul_v8f16:
+; CHECK-FP: @ %bb.0: @ %entry
+; CHECK-FP-NEXT: vrev32.16 q1, q0
+; CHECK-FP-NEXT: vmul.f16 q0, q0, q1
+; CHECK-FP-NEXT: vmul.f16 s4, s2, s3
+; CHECK-FP-NEXT: vmul.f16 s0, s0, s1
+; CHECK-FP-NEXT: vldr.16 s2, [r0]
+; CHECK-FP-NEXT: vmul.f16 s0, s0, s4
+; CHECK-FP-NEXT: vmul.f16 s0, s2, s0
+; CHECK-FP-NEXT: vstr.16 s0, [r0]
+; CHECK-FP-NEXT: bx lr
+;
+; CHECK-NOFP-LABEL: fmul_v8f16:
+; CHECK-NOFP: @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT: vmovx.f16 s4, s0
+; CHECK-NOFP-NEXT: vmovx.f16 s6, s1
+; CHECK-NOFP-NEXT: vmul.f16 s4, s0, s4
+; CHECK-NOFP-NEXT: vmovx.f16 s0, s3
+; CHECK-NOFP-NEXT: vmul.f16 s4, s4, s1
+; CHECK-NOFP-NEXT: vmul.f16 s4, s4, s6
+; CHECK-NOFP-NEXT: vmovx.f16 s6, s2
+; CHECK-NOFP-NEXT: vmul.f16 s4, s4, s2
+; CHECK-NOFP-NEXT: vldr.16 s2, [r0]
+; CHECK-NOFP-NEXT: vmul.f16 s4, s4, s6
+; CHECK-NOFP-NEXT: vmul.f16 s4, s4, s3
+; CHECK-NOFP-NEXT: vmul.f16 s0, s4, s0
+; CHECK-NOFP-NEXT: vmul.f16 s0, s2, s0
+; CHECK-NOFP-NEXT: vstr.16 s0, [r0]
+; CHECK-NOFP-NEXT: bx lr
entry:
%y = load half, half* %yy
%z = call fast half @llvm.experimental.vector.reduce.v2.fmul.f16.v8f16(half %y, <8 x half> %x)
; CHECK-FP-LABEL: fmul_v16f16:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vmul.f16 q0, q0, q1
-; CHECK-FP-NEXT: vmovx.f16 s4, s0
-; CHECK-FP-NEXT: vmovx.f16 s6, s1
-; CHECK-FP-NEXT: vmul.f16 s4, s0, s4
-; CHECK-FP-NEXT: vmovx.f16 s0, s3
-; CHECK-FP-NEXT: vmul.f16 s4, s4, s1
-; CHECK-FP-NEXT: vmul.f16 s4, s4, s6
-; CHECK-FP-NEXT: vmovx.f16 s6, s2
-; CHECK-FP-NEXT: vmul.f16 s4, s4, s2
+; CHECK-FP-NEXT: vrev32.16 q1, q0
+; CHECK-FP-NEXT: vmul.f16 q0, q0, q1
+; CHECK-FP-NEXT: vmul.f16 s4, s2, s3
+; CHECK-FP-NEXT: vmul.f16 s0, s0, s1
; CHECK-FP-NEXT: vldr.16 s2, [r0]
-; CHECK-FP-NEXT: vmul.f16 s4, s4, s6
-; CHECK-FP-NEXT: vmul.f16 s4, s4, s3
-; CHECK-FP-NEXT: vmul.f16 s0, s4, s0
+; CHECK-FP-NEXT: vmul.f16 s0, s0, s4
; CHECK-FP-NEXT: vmul.f16 s0, s2, s0
; CHECK-FP-NEXT: vstr.16 s0, [r0]
; CHECK-FP-NEXT: bx lr
ret float %z
}
+define arm_aapcs_vfpcc void @fmul_v2f16_nofast(<2 x half> %x, half* %yy) {
+; CHECK-LABEL: fmul_v2f16_nofast:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldr.16 s4, [r0]
+; CHECK-NEXT: vmul.f16 s4, s4, s0
+; CHECK-NEXT: vmovx.f16 s0, s0
+; CHECK-NEXT: vmul.f16 s0, s4, s0
+; CHECK-NEXT: vstr.16 s0, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %y = load half, half* %yy
+ %z = call half @llvm.experimental.vector.reduce.v2.fmul.f16.v2f16(half %y, <2 x half> %x)
+ store half %z, half* %yy
+ ret void
+}
+
define arm_aapcs_vfpcc void @fmul_v4f16_nofast(<4 x half> %x, half* %yy) {
; CHECK-LABEL: fmul_v4f16_nofast:
; CHECK: @ %bb.0: @ %entry
declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>)
declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float, <8 x float>)
declare half @llvm.experimental.vector.reduce.v2.fmul.f16.v16f16(half, <16 x half>)
+declare half @llvm.experimental.vector.reduce.v2.fmul.f16.v2f16(half, <2 x half>)
declare half @llvm.experimental.vector.reduce.v2.fmul.f16.v4f16(half, <4 x half>)
declare half @llvm.experimental.vector.reduce.v2.fmul.f16.v8f16(half, <8 x half>)
; CHECK-NEXT: vmul.i32 q0, q1, q0
; CHECK-NEXT: le lr, .LBB1_4
; CHECK-NEXT: @ %bb.5: @ %middle.block
-; CHECK-NEXT: vmov r2, s1
+; CHECK-NEXT: vmov r2, s3
; CHECK-NEXT: cmp r12, r1
-; CHECK-NEXT: vmov r3, s0
-; CHECK-NEXT: mul r2, r3, r2
; CHECK-NEXT: vmov r3, s2
+; CHECK-NEXT: mul lr, r3, r2
+; CHECK-NEXT: vmov r3, s1
+; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: mul r2, r3, r2
-; CHECK-NEXT: vmov r3, s3
-; CHECK-NEXT: mul r2, r3, r2
+; CHECK-NEXT: mul r2, r2, lr
; CHECK-NEXT: beq .LBB1_8
; CHECK-NEXT: .LBB1_6: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r12
; CHECK-NEXT: vand q0, q1, q0
; CHECK-NEXT: le lr, .LBB2_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
-; CHECK-NEXT: vmov r12, s1
+; CHECK-NEXT: vmov r12, s3
; CHECK-NEXT: cmp r3, r1
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: and.w r12, r12, r2
; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: vmov lr, s1
; CHECK-NEXT: and.w r12, r12, r2
-; CHECK-NEXT: vmov r2, s3
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: and.w r2, r2, lr
; CHECK-NEXT: and.w r2, r2, r12
; CHECK-NEXT: beq .LBB2_9
; CHECK-NEXT: .LBB2_7: @ %for.body.preheader1
; CHECK-NEXT: vorr q0, q1, q0
; CHECK-NEXT: le lr, .LBB3_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
-; CHECK-NEXT: vmov r12, s1
+; CHECK-NEXT: vmov r12, s3
; CHECK-NEXT: cmp r3, r1
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: orr.w r12, r12, r2
; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: vmov lr, s1
; CHECK-NEXT: orr.w r12, r12, r2
-; CHECK-NEXT: vmov r2, s3
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: orr.w r2, r2, lr
; CHECK-NEXT: orr.w r2, r2, r12
; CHECK-NEXT: beq .LBB3_9
; CHECK-NEXT: .LBB3_7: @ %for.body.preheader1
; CHECK-NEXT: veor q0, q1, q0
; CHECK-NEXT: le lr, .LBB4_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
-; CHECK-NEXT: vmov r12, s1
+; CHECK-NEXT: vmov r12, s3
; CHECK-NEXT: cmp r3, r1
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: eor.w r12, r12, r2
; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: vmov lr, s1
; CHECK-NEXT: eor.w r12, r12, r2
-; CHECK-NEXT: vmov r2, s3
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: eor.w r2, r2, lr
; CHECK-NEXT: eor.w r2, r2, r12
; CHECK-NEXT: beq .LBB4_9
; CHECK-NEXT: .LBB4_7: @ %for.body.preheader1
; CHECK-NEXT: vadd.f32 q0, q1, q0
; CHECK-NEXT: le lr, .LBB5_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
-; CHECK-NEXT: vadd.f32 s4, s0, s1
+; CHECK-NEXT: vadd.f32 s4, s2, s3
; CHECK-NEXT: cmp r2, r1
-; CHECK-NEXT: vadd.f32 s4, s4, s2
-; CHECK-NEXT: vadd.f32 s0, s4, s3
+; CHECK-NEXT: vadd.f32 s0, s0, s1
+; CHECK-NEXT: vadd.f32 s0, s0, s4
; CHECK-NEXT: beq .LBB5_9
; CHECK-NEXT: .LBB5_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r2
; CHECK-NEXT: vmul.f32 q0, q1, q0
; CHECK-NEXT: le lr, .LBB6_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
-; CHECK-NEXT: vmul.f32 s4, s0, s1
+; CHECK-NEXT: vmul.f32 s4, s2, s3
; CHECK-NEXT: cmp r2, r1
-; CHECK-NEXT: vmul.f32 s4, s4, s2
-; CHECK-NEXT: vmul.f32 s0, s4, s3
+; CHECK-NEXT: vmul.f32 s0, s0, s1
+; CHECK-NEXT: vmul.f32 s0, s0, s4
; CHECK-NEXT: beq .LBB6_9
; CHECK-NEXT: .LBB6_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r2
define arm_aapcs_vfpcc i32 @mul_v4i32(<4 x i32> %x) {
; CHECK-LABEL: mul_v4i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: muls r0, r1, r0
+; CHECK-NEXT: vmov r0, s3
; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov r1, s3
+; CHECK-NEXT: vmov r1, s1
+; CHECK-NEXT: muls r1, r2, r1
; CHECK-NEXT: muls r0, r1, r0
; CHECK-NEXT: bx lr
entry:
; CHECK-LABEL: mul_v8i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmul.i32 q0, q0, q1
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: muls r0, r1, r0
+; CHECK-NEXT: vmov r0, s3
; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov r1, s3
+; CHECK-NEXT: vmov r1, s1
+; CHECK-NEXT: muls r1, r2, r1
; CHECK-NEXT: muls r0, r1, r0
; CHECK-NEXT: bx lr
entry:
define arm_aapcs_vfpcc i16 @mul_v4i16(<4 x i16> %x) {
; CHECK-LABEL: mul_v4i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: muls r0, r1, r0
+; CHECK-NEXT: vmov r0, s3
; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov r1, s3
+; CHECK-NEXT: vmov r1, s1
+; CHECK-NEXT: muls r1, r2, r1
; CHECK-NEXT: muls r0, r1, r0
; CHECK-NEXT: bx lr
entry:
define arm_aapcs_vfpcc i16 @mul_v8i16(<8 x i16> %x) {
; CHECK-LABEL: mul_v8i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.u16 r0, q0[1]
-; CHECK-NEXT: vmov.u16 r1, q0[0]
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u16 r1, q0[2]
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u16 r1, q0[3]
-; CHECK-NEXT: muls r0, r1, r0
+; CHECK-NEXT: vrev32.16 q1, q0
+; CHECK-NEXT: vmul.i16 q0, q0, q1
+; CHECK-NEXT: vmov.u16 r0, q0[6]
; CHECK-NEXT: vmov.u16 r1, q0[4]
; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u16 r1, q0[5]
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u16 r1, q0[6]
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u16 r1, q0[7]
+; CHECK-NEXT: vmov.u16 r1, q0[2]
+; CHECK-NEXT: vmov.u16 r2, q0[0]
+; CHECK-NEXT: muls r1, r2, r1
; CHECK-NEXT: muls r0, r1, r0
; CHECK-NEXT: bx lr
entry:
; CHECK-LABEL: mul_v16i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmul.i16 q0, q0, q1
-; CHECK-NEXT: vmov.u16 r0, q0[1]
-; CHECK-NEXT: vmov.u16 r1, q0[0]
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u16 r1, q0[2]
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u16 r1, q0[3]
-; CHECK-NEXT: muls r0, r1, r0
+; CHECK-NEXT: vrev32.16 q1, q0
+; CHECK-NEXT: vmul.i16 q0, q0, q1
+; CHECK-NEXT: vmov.u16 r0, q0[6]
; CHECK-NEXT: vmov.u16 r1, q0[4]
; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u16 r1, q0[5]
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u16 r1, q0[6]
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u16 r1, q0[7]
+; CHECK-NEXT: vmov.u16 r1, q0[2]
+; CHECK-NEXT: vmov.u16 r2, q0[0]
+; CHECK-NEXT: muls r1, r2, r1
; CHECK-NEXT: muls r0, r1, r0
; CHECK-NEXT: bx lr
entry:
define arm_aapcs_vfpcc i8 @mul_v8i8(<8 x i8> %x) {
; CHECK-LABEL: mul_v8i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.u16 r0, q0[1]
-; CHECK-NEXT: vmov.u16 r1, q0[0]
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u16 r1, q0[2]
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u16 r1, q0[3]
-; CHECK-NEXT: muls r0, r1, r0
+; CHECK-NEXT: vrev32.16 q1, q0
+; CHECK-NEXT: vmul.i16 q0, q0, q1
+; CHECK-NEXT: vmov.u16 r0, q0[6]
; CHECK-NEXT: vmov.u16 r1, q0[4]
; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u16 r1, q0[5]
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u16 r1, q0[6]
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u16 r1, q0[7]
+; CHECK-NEXT: vmov.u16 r1, q0[2]
+; CHECK-NEXT: vmov.u16 r2, q0[0]
+; CHECK-NEXT: muls r1, r2, r1
; CHECK-NEXT: muls r0, r1, r0
; CHECK-NEXT: bx lr
entry:
define arm_aapcs_vfpcc i8 @mul_v16i8(<16 x i8> %x) {
; CHECK-LABEL: mul_v16i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.u8 r0, q0[1]
-; CHECK-NEXT: vmov.u8 r1, q0[0]
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u8 r1, q0[2]
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u8 r1, q0[3]
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u8 r1, q0[4]
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u8 r1, q0[5]
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u8 r1, q0[6]
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u8 r1, q0[7]
-; CHECK-NEXT: muls r0, r1, r0
+; CHECK-NEXT: vrev16.8 q1, q0
+; CHECK-NEXT: vmul.i8 q0, q0, q1
+; CHECK-NEXT: vrev32.8 q1, q0
+; CHECK-NEXT: vmul.i8 q0, q0, q1
+; CHECK-NEXT: vmov.u8 r0, q0[12]
; CHECK-NEXT: vmov.u8 r1, q0[8]
; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u8 r1, q0[9]
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u8 r1, q0[10]
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u8 r1, q0[11]
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u8 r1, q0[12]
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u8 r1, q0[13]
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u8 r1, q0[14]
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u8 r1, q0[15]
+; CHECK-NEXT: vmov.u8 r1, q0[4]
+; CHECK-NEXT: vmov.u8 r2, q0[0]
+; CHECK-NEXT: muls r1, r2, r1
; CHECK-NEXT: muls r0, r1, r0
; CHECK-NEXT: bx lr
entry:
; CHECK-LABEL: mul_v32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmul.i8 q0, q0, q1
-; CHECK-NEXT: vmov.u8 r0, q0[1]
-; CHECK-NEXT: vmov.u8 r1, q0[0]
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u8 r1, q0[2]
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u8 r1, q0[3]
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u8 r1, q0[4]
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u8 r1, q0[5]
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u8 r1, q0[6]
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u8 r1, q0[7]
-; CHECK-NEXT: muls r0, r1, r0
+; CHECK-NEXT: vrev16.8 q1, q0
+; CHECK-NEXT: vmul.i8 q0, q0, q1
+; CHECK-NEXT: vrev32.8 q1, q0
+; CHECK-NEXT: vmul.i8 q0, q0, q1
+; CHECK-NEXT: vmov.u8 r0, q0[12]
; CHECK-NEXT: vmov.u8 r1, q0[8]
; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u8 r1, q0[9]
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u8 r1, q0[10]
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u8 r1, q0[11]
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u8 r1, q0[12]
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u8 r1, q0[13]
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u8 r1, q0[14]
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: vmov.u8 r1, q0[15]
+; CHECK-NEXT: vmov.u8 r1, q0[4]
+; CHECK-NEXT: vmov.u8 r2, q0[0]
+; CHECK-NEXT: muls r1, r2, r1
; CHECK-NEXT: muls r0, r1, r0
; CHECK-NEXT: bx lr
entry:
define arm_aapcs_vfpcc i32 @mul_v4i32_acc(<4 x i32> %x, i32 %y) {
; CHECK-LABEL: mul_v4i32_acc:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov r1, s1
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: muls r1, r2, r1
+; CHECK-NEXT: vmov r1, s3
; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov r2, s3
+; CHECK-NEXT: vmov r2, s1
+; CHECK-NEXT: muls r2, r3, r2
; CHECK-NEXT: muls r1, r2, r1
; CHECK-NEXT: muls r0, r1, r0
; CHECK-NEXT: bx lr
; CHECK-LABEL: mul_v8i32_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmul.i32 q0, q0, q1
-; CHECK-NEXT: vmov r1, s1
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: muls r1, r2, r1
+; CHECK-NEXT: vmov r1, s3
; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov r2, s3
+; CHECK-NEXT: vmov r2, s1
+; CHECK-NEXT: muls r2, r3, r2
; CHECK-NEXT: muls r1, r2, r1
; CHECK-NEXT: muls r0, r1, r0
; CHECK-NEXT: bx lr
define arm_aapcs_vfpcc i16 @mul_v4i16_acc(<4 x i16> %x, i16 %y) {
; CHECK-LABEL: mul_v4i16_acc:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov r1, s1
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: muls r1, r2, r1
+; CHECK-NEXT: vmov r1, s3
; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov r2, s3
+; CHECK-NEXT: vmov r2, s1
+; CHECK-NEXT: muls r2, r3, r2
; CHECK-NEXT: muls r1, r2, r1
; CHECK-NEXT: muls r0, r1, r0
; CHECK-NEXT: bx lr
define arm_aapcs_vfpcc i16 @mul_v8i16_acc(<8 x i16> %x, i16 %y) {
; CHECK-LABEL: mul_v8i16_acc:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.u16 r1, q0[1]
-; CHECK-NEXT: vmov.u16 r2, q0[0]
-; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u16 r2, q0[2]
-; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u16 r2, q0[3]
-; CHECK-NEXT: muls r1, r2, r1
+; CHECK-NEXT: vrev32.16 q1, q0
+; CHECK-NEXT: vmul.i16 q0, q0, q1
+; CHECK-NEXT: vmov.u16 r1, q0[6]
; CHECK-NEXT: vmov.u16 r2, q0[4]
; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u16 r2, q0[5]
-; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u16 r2, q0[6]
-; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u16 r2, q0[7]
+; CHECK-NEXT: vmov.u16 r2, q0[2]
+; CHECK-NEXT: vmov.u16 r3, q0[0]
+; CHECK-NEXT: muls r2, r3, r2
; CHECK-NEXT: muls r1, r2, r1
; CHECK-NEXT: muls r0, r1, r0
; CHECK-NEXT: bx lr
; CHECK-LABEL: mul_v16i16_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmul.i16 q0, q0, q1
-; CHECK-NEXT: vmov.u16 r1, q0[1]
-; CHECK-NEXT: vmov.u16 r2, q0[0]
-; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u16 r2, q0[2]
-; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u16 r2, q0[3]
-; CHECK-NEXT: muls r1, r2, r1
+; CHECK-NEXT: vrev32.16 q1, q0
+; CHECK-NEXT: vmul.i16 q0, q0, q1
+; CHECK-NEXT: vmov.u16 r1, q0[6]
; CHECK-NEXT: vmov.u16 r2, q0[4]
; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u16 r2, q0[5]
-; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u16 r2, q0[6]
-; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u16 r2, q0[7]
+; CHECK-NEXT: vmov.u16 r2, q0[2]
+; CHECK-NEXT: vmov.u16 r3, q0[0]
+; CHECK-NEXT: muls r2, r3, r2
; CHECK-NEXT: muls r1, r2, r1
; CHECK-NEXT: muls r0, r1, r0
; CHECK-NEXT: bx lr
define arm_aapcs_vfpcc i8 @mul_v8i8_acc(<8 x i8> %x, i8 %y) {
; CHECK-LABEL: mul_v8i8_acc:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.u16 r1, q0[1]
-; CHECK-NEXT: vmov.u16 r2, q0[0]
-; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u16 r2, q0[2]
-; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u16 r2, q0[3]
-; CHECK-NEXT: muls r1, r2, r1
+; CHECK-NEXT: vrev32.16 q1, q0
+; CHECK-NEXT: vmul.i16 q0, q0, q1
+; CHECK-NEXT: vmov.u16 r1, q0[6]
; CHECK-NEXT: vmov.u16 r2, q0[4]
; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u16 r2, q0[5]
-; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u16 r2, q0[6]
-; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u16 r2, q0[7]
+; CHECK-NEXT: vmov.u16 r2, q0[2]
+; CHECK-NEXT: vmov.u16 r3, q0[0]
+; CHECK-NEXT: muls r2, r3, r2
; CHECK-NEXT: muls r1, r2, r1
; CHECK-NEXT: muls r0, r1, r0
; CHECK-NEXT: bx lr
define arm_aapcs_vfpcc i8 @mul_v16i8_acc(<16 x i8> %x, i8 %y) {
; CHECK-LABEL: mul_v16i8_acc:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.u8 r1, q0[1]
-; CHECK-NEXT: vmov.u8 r2, q0[0]
-; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u8 r2, q0[2]
-; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u8 r2, q0[3]
-; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u8 r2, q0[4]
-; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u8 r2, q0[5]
-; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u8 r2, q0[6]
-; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u8 r2, q0[7]
-; CHECK-NEXT: muls r1, r2, r1
+; CHECK-NEXT: vrev16.8 q1, q0
+; CHECK-NEXT: vmul.i8 q0, q0, q1
+; CHECK-NEXT: vrev32.8 q1, q0
+; CHECK-NEXT: vmul.i8 q0, q0, q1
+; CHECK-NEXT: vmov.u8 r1, q0[12]
; CHECK-NEXT: vmov.u8 r2, q0[8]
; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u8 r2, q0[9]
-; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u8 r2, q0[10]
-; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u8 r2, q0[11]
-; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u8 r2, q0[12]
-; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u8 r2, q0[13]
-; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u8 r2, q0[14]
-; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u8 r2, q0[15]
+; CHECK-NEXT: vmov.u8 r2, q0[4]
+; CHECK-NEXT: vmov.u8 r3, q0[0]
+; CHECK-NEXT: muls r2, r3, r2
; CHECK-NEXT: muls r1, r2, r1
; CHECK-NEXT: muls r0, r1, r0
; CHECK-NEXT: bx lr
; CHECK-LABEL: mul_v32i8_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmul.i8 q0, q0, q1
-; CHECK-NEXT: vmov.u8 r1, q0[1]
-; CHECK-NEXT: vmov.u8 r2, q0[0]
-; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u8 r2, q0[2]
-; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u8 r2, q0[3]
-; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u8 r2, q0[4]
-; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u8 r2, q0[5]
-; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u8 r2, q0[6]
-; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u8 r2, q0[7]
-; CHECK-NEXT: muls r1, r2, r1
+; CHECK-NEXT: vrev16.8 q1, q0
+; CHECK-NEXT: vmul.i8 q0, q0, q1
+; CHECK-NEXT: vrev32.8 q1, q0
+; CHECK-NEXT: vmul.i8 q0, q0, q1
+; CHECK-NEXT: vmov.u8 r1, q0[12]
; CHECK-NEXT: vmov.u8 r2, q0[8]
; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u8 r2, q0[9]
-; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u8 r2, q0[10]
-; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u8 r2, q0[11]
-; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u8 r2, q0[12]
-; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u8 r2, q0[13]
-; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u8 r2, q0[14]
-; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: vmov.u8 r2, q0[15]
+; CHECK-NEXT: vmov.u8 r2, q0[4]
+; CHECK-NEXT: vmov.u8 r3, q0[0]
+; CHECK-NEXT: muls r2, r3, r2
; CHECK-NEXT: muls r1, r2, r1
; CHECK-NEXT: muls r0, r1, r0
; CHECK-NEXT: bx lr