/// Try to select SBFX/UBFX instructions for ARM.
bool tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned);
+ bool tryInsertVectorElt(SDNode *N);
+
// Select special operations if node forms integer ABS pattern
bool tryABSOp(SDNode *N);
CurDAG->RemoveDeadNode(N);
}
+/// Try to fold a pair of adjacent INSERT_VECTOR_ELT nodes (into lanes 2k and
+/// 2k+1 of a v8f16/v8i16 vector) into a single f32 lane move, or a
+/// VINS/VMOVX combination. Returns true and replaces the uses of the top
+/// insert if a selection was made, false to fall back to normal selection.
+bool ARMDAGToDAGISel::tryInsertVectorElt(SDNode *N) {
+  if (!Subtarget->hasMVEIntegerOps())
+    return false;
+
+  SDLoc dl(N);
+
+  // We are trying to use VMOV/VMOVX/VINS to more efficiently lower insert and
+  // extracts of v8f16 and v8i16 vectors. Check that we have two adjacent
+  // inserts of the correct type:
+  SDValue Ins1 = SDValue(N, 0);
+  SDValue Ins2 = N->getOperand(0);
+  EVT VT = Ins1.getValueType();
+  if (Ins2.getOpcode() != ISD::INSERT_VECTOR_ELT || !Ins2.hasOneUse() ||
+      !isa<ConstantSDNode>(Ins1.getOperand(2)) ||
+      !isa<ConstantSDNode>(Ins2.getOperand(2)) ||
+      (VT != MVT::v8f16 && VT != MVT::v8i16) || (Ins2.getValueType() != VT))
+    return false;
+
+  // The two inserts must target an even lane and the odd lane directly above
+  // it, so that together they fill one 32-bit (ssub) subregister.
+  unsigned Lane1 = Ins1.getConstantOperandVal(2);
+  unsigned Lane2 = Ins2.getConstantOperandVal(2);
+  if (Lane2 % 2 != 0 || Lane1 != Lane2 + 1)
+    return false;
+
+  // If the inserted values will be able to use T/B already, leave it to the
+  // existing tablegen patterns. For example VCVTT/VCVTB.
+  SDValue Val1 = Ins1.getOperand(1);
+  SDValue Val2 = Ins2.getOperand(1);
+  if (Val1.getOpcode() == ISD::FP_ROUND || Val2.getOpcode() == ISD::FP_ROUND)
+    return false;
+
+  // Check if the inserted values are both extracts.
+  if ((Val1.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+       Val1.getOpcode() == ARMISD::VGETLANEu) &&
+      (Val2.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+       Val2.getOpcode() == ARMISD::VGETLANEu) &&
+      isa<ConstantSDNode>(Val1.getOperand(1)) &&
+      isa<ConstantSDNode>(Val2.getOperand(1)) &&
+      (Val1.getOperand(0).getValueType() == MVT::v8f16 ||
+       Val1.getOperand(0).getValueType() == MVT::v8i16) &&
+      (Val2.getOperand(0).getValueType() == MVT::v8f16 ||
+       Val2.getOperand(0).getValueType() == MVT::v8i16)) {
+    unsigned ExtractLane1 = Val1.getConstantOperandVal(1);
+    unsigned ExtractLane2 = Val2.getConstantOperandVal(1);
+
+    // If the two extracted lanes are from the same place and adjacent, this
+    // simplifies into a f32 lane move.
+    if (Val1.getOperand(0) == Val2.getOperand(0) && ExtractLane2 % 2 == 0 &&
+        ExtractLane1 == ExtractLane2 + 1) {
+      SDValue NewExt = CurDAG->getTargetExtractSubreg(
+          ARM::ssub_0 + ExtractLane2 / 2, dl, MVT::f32, Val1.getOperand(0));
+      SDValue NewIns = CurDAG->getTargetInsertSubreg(
+          ARM::ssub_0 + Lane2 / 2, dl, VT, Ins2.getOperand(0),
+          NewExt);
+      ReplaceUses(Ins1, NewIns);
+      return true;
+    }
+
+    // Else v8i16 pattern of an extract and an insert, with an optional vmovx
+    // for extracting odd lanes.
+    if (VT == MVT::v8i16) {
+      SDValue Inp1 = CurDAG->getTargetExtractSubreg(
+          ARM::ssub_0 + ExtractLane1 / 2, dl, MVT::f32, Val1.getOperand(0));
+      SDValue Inp2 = CurDAG->getTargetExtractSubreg(
+          ARM::ssub_0 + ExtractLane2 / 2, dl, MVT::f32, Val2.getOperand(0));
+      // Odd source lanes first need moving to the bottom half with a VMOVX.
+      if (ExtractLane1 % 2 != 0)
+        Inp1 =
+            SDValue(CurDAG->getMachineNode(ARM::VMOVH, dl, MVT::f32, Inp1), 0);
+      if (ExtractLane2 % 2 != 0)
+        Inp2 =
+            SDValue(CurDAG->getMachineNode(ARM::VMOVH, dl, MVT::f32, Inp2), 0);
+      SDNode *VINS = CurDAG->getMachineNode(ARM::VINSH, dl, MVT::f32, Inp2, Inp1);
+      SDValue NewIns =
+          CurDAG->getTargetInsertSubreg(ARM::ssub_0 + Lane2 / 2, dl, MVT::v4f32,
+                                        Ins2.getOperand(0), SDValue(VINS, 0));
+      ReplaceUses(Ins1, NewIns);
+      return true;
+    }
+  }
+
+  // The inserted values are not extracted - if they are f16 then insert them
+  // directly using a VINS. The register classes are reconciled implicitly by
+  // the INSERT_SUBREG/VINSH operand constraints, so no explicit
+  // COPY_TO_REGCLASS nodes are needed here.
+  if (VT == MVT::v8f16) {
+    SDNode *VINS = CurDAG->getMachineNode(ARM::VINSH, dl, MVT::f32, Val2, Val1);
+    SDValue NewIns =
+        CurDAG->getTargetInsertSubreg(ARM::ssub_0 + Lane2 / 2, dl, MVT::v4f32,
+                                      Ins2.getOperand(0), SDValue(VINS, 0));
+    ReplaceUses(Ins1, NewIns);
+    return true;
+  }
+
+  return false;
+}
+
bool ARMDAGToDAGISel::tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned) {
if (!Subtarget->hasV6T2Ops())
return false;
return;
}
}
+ case ISD::INSERT_VECTOR_ELT: {
+ if (tryInsertVectorElt(N))
+ return;
+ break;
+ }
case ISD::SRL:
if (tryV6T2BitfieldExtractOp(N, false))
return;
(INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), (f16 HPR:$src), ssub_0)>;
def : Pat<(v8f16 (scalar_to_vector GPR:$src)),
(MVE_VMOV_to_lane_16 (v8f16 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>;
-
- foreach LANE = [0, 2, 4, 6] in {
- defvar SSUB = !cast<SubRegIndex>("ssub_"#!srl(LANE, 1));
-
- // v8f16 pattern for inserting two lanes using a VINS
- def : Pat<(insertelt (insertelt (v8f16 MQPR:$srcV), (f16 HPR:$src1), LANE),
- (f16 HPR:$src2), !add(LANE,1)),
- (COPY_TO_REGCLASS (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS MQPR:$srcV, MQPR)),
- (VINSH (COPY_TO_REGCLASS HPR:$src1, SPR),
- (COPY_TO_REGCLASS HPR:$src2, SPR)),
- SSUB), MQPR)>;
-
- // v8i16 pattern for extracting 2 even lane elements and inserting them using a VINS
- def : Pat<(ARMinsertelt (ARMinsertelt (v8i16 MQPR:$srcV),
- (ARMvgetlaneu (v8i16 MQPR:$src1), imm_even:$lane1),
- LANE),
- (ARMvgetlaneu (v8i16 MQPR:$src2), imm_even:$lane2),
- !add(LANE,1)),
- (COPY_TO_REGCLASS (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS MQPR:$srcV, MQPR)),
- (VINSH (EXTRACT_SUBREG (v8f16 (COPY_TO_REGCLASS MQPR:$src1, MQPR)),
- (SSubReg_f16_reg imm_even:$lane1)),
- (EXTRACT_SUBREG (v8f16 (COPY_TO_REGCLASS MQPR:$src2, MQPR)),
- (SSubReg_f16_reg imm_even:$lane2))),
- SSUB), MQPR)>;
-
- // v8i16 pattern for extracting an element using VMOVX and inserting another using a VINS
- def : Pat<(ARMinsertelt (ARMinsertelt (v8i16 MQPR:$srcV),
- (ARMvgetlaneu (v8i16 MQPR:$src1), imm_odd:$lane1),
- LANE),
- (ARMvgetlaneu (v8i16 MQPR:$src2), imm_even:$lane2),
- !add(LANE,1)),
- (COPY_TO_REGCLASS (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS MQPR:$srcV, MQPR)),
- (VINSH (VMOVH (EXTRACT_SUBREG (v8f16 (COPY_TO_REGCLASS MQPR:$src1, MQPR)),
- (SSubReg_f16_reg imm_odd:$lane1))),
- (EXTRACT_SUBREG (v8f16 (COPY_TO_REGCLASS MQPR:$src2, MQPR)),
- (SSubReg_f16_reg imm_even:$lane2))),
- SSUB), MQPR)>;
- }
}
// end of mve_bit instructions
Requires<[HasFP16]>,
Sched<[WriteFPCVT]>;
-// AddedComplexity to use over the dual-insert MVE pattern
-let AddedComplexity = 6 in
def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_odd:$lane),
(v8f16 (INSERT_SUBREG (v8f16 MQPR:$src1), (VCVTTSH SPR:$src2),
(SSubReg_f16_reg imm:$lane)))>;
; CHECK-NEXT: bpl .LBB2_8
; CHECK-NEXT: .LBB2_7: @ %cond.load12
; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1
-; CHECK-NEXT: vmovx.f16 s24, s20
-; CHECK-NEXT: vins.f16 s20, s24
; CHECK-NEXT: vldr.16 s24, [r0, #6]
; CHECK-NEXT: vins.f16 s21, s24
; CHECK-NEXT: .LBB2_8: @ %else13
; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1
; CHECK-NEXT: vldr.16 s24, [r0, #2]
; CHECK-NEXT: vins.f16 s20, s24
-; CHECK-NEXT: vmovx.f16 s24, s21
-; CHECK-NEXT: vins.f16 s21, s24
; CHECK-NEXT: lsls r4, r2, #29
; CHECK-NEXT: bpl .LBB2_6
; CHECK-NEXT: .LBB2_14: @ %cond.load9
; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1
-; CHECK-NEXT: vmovx.f16 s24, s20
-; CHECK-NEXT: vins.f16 s20, s24
; CHECK-NEXT: vmovx.f16 s24, s21
; CHECK-NEXT: vldr.16 s21, [r0, #4]
; CHECK-NEXT: vins.f16 s21, s24
; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1
; CHECK-NEXT: vldr.16 s28, [r1, #2]
; CHECK-NEXT: vins.f16 s24, s28
-; CHECK-NEXT: vmovx.f16 s28, s25
-; CHECK-NEXT: vins.f16 s25, s28
; CHECK-NEXT: lsls r4, r2, #29
; CHECK-NEXT: bpl .LBB2_11
; CHECK-NEXT: .LBB2_17: @ %cond.load22
; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1
-; CHECK-NEXT: vmovx.f16 s28, s24
-; CHECK-NEXT: vins.f16 s24, s28
; CHECK-NEXT: vmovx.f16 s28, s25
; CHECK-NEXT: vldr.16 s25, [r1, #4]
; CHECK-NEXT: vins.f16 s25, s28
; CHECK-NEXT: bpl.w .LBB2_2
; CHECK-NEXT: .LBB2_18: @ %cond.load25
; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1
-; CHECK-NEXT: vmovx.f16 s28, s24
-; CHECK-NEXT: vins.f16 s24, s28
; CHECK-NEXT: vldr.16 s28, [r1, #6]
; CHECK-NEXT: vins.f16 s25, s28
; CHECK-NEXT: b .LBB2_2
define arm_aapcs_vfpcc <8 x half> @fdiv_f16(<8 x half> %in1, <8 x half> %in2) {
; CHECK-LABEL: fdiv_f16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovx.f16 s8, s4
-; CHECK-NEXT: vmovx.f16 s10, s0
-; CHECK-NEXT: vdiv.f16 s12, s10, s8
-; CHECK-NEXT: vdiv.f16 s8, s0, s4
-; CHECK-NEXT: vins.f16 s8, s12
+; CHECK-NEXT: vmov q2, q0
+; CHECK-NEXT: vmovx.f16 s0, s4
+; CHECK-NEXT: vmovx.f16 s2, s8
+; CHECK-NEXT: vmovx.f16 s14, s9
+; CHECK-NEXT: vdiv.f16 s12, s2, s0
+; CHECK-NEXT: vdiv.f16 s0, s8, s4
+; CHECK-NEXT: vins.f16 s0, s12
; CHECK-NEXT: vmovx.f16 s12, s5
-; CHECK-NEXT: vmovx.f16 s14, s1
-; CHECK-NEXT: vdiv.f16 s9, s1, s5
; CHECK-NEXT: vdiv.f16 s12, s14, s12
-; CHECK-NEXT: vmovx.f16 s14, s2
-; CHECK-NEXT: vins.f16 s9, s12
+; CHECK-NEXT: vdiv.f16 s1, s9, s5
+; CHECK-NEXT: vins.f16 s1, s12
; CHECK-NEXT: vmovx.f16 s12, s6
+; CHECK-NEXT: vmovx.f16 s14, s10
+; CHECK-NEXT: vdiv.f16 s2, s10, s6
; CHECK-NEXT: vdiv.f16 s12, s14, s12
-; CHECK-NEXT: vdiv.f16 s10, s2, s6
-; CHECK-NEXT: vins.f16 s10, s12
+; CHECK-NEXT: vmovx.f16 s14, s11
+; CHECK-NEXT: vins.f16 s2, s12
; CHECK-NEXT: vmovx.f16 s12, s7
-; CHECK-NEXT: vmovx.f16 s14, s3
-; CHECK-NEXT: vdiv.f16 s11, s3, s7
; CHECK-NEXT: vdiv.f16 s12, s14, s12
-; CHECK-NEXT: vins.f16 s11, s12
-; CHECK-NEXT: vmov q0, q2
+; CHECK-NEXT: vdiv.f16 s3, s11, s7
+; CHECK-NEXT: vins.f16 s3, s12
; CHECK-NEXT: bx lr
entry:
%out = fdiv <8 x half> %in1, %in2
; CHECK-NEXT: vmov.u16 r3, q2[1]
; CHECK-NEXT: vfma.f16 q2, q6, r3
; CHECK-NEXT: strh r3, [r5, #2]
-; CHECK-NEXT: vmovx.f16 s6, s9
; CHECK-NEXT: vmov.f32 s8, s9
-; CHECK-NEXT: vins.f16 s8, s6
; CHECK-NEXT: strh r7, [r5], #4
; CHECK-NEXT: vmov.16 q2[2], r4
; CHECK-NEXT: le lr, .LBB17_5
; CHECK-NEXT: vldr.16 s8, [r2]
; CHECK-NEXT: vmov r2, s4
; CHECK-NEXT: vldr.16 s0, [r2]
-; CHECK-NEXT: vmov r2, s7
-; CHECK-NEXT: vins.f16 s0, s8
-; CHECK-NEXT: vldr.16 s8, [r2]
+; CHECK-NEXT: vmov r3, s7
; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: vldrh.s32 q1, [r1, #8]
+; CHECK-NEXT: vldr.16 s4, [r3]
+; CHECK-NEXT: vins.f16 s0, s8
; CHECK-NEXT: vldr.16 s1, [r2]
+; CHECK-NEXT: vins.f16 s1, s4
+; CHECK-NEXT: vldrh.s32 q1, [r1, #8]
; CHECK-NEXT: vshl.i32 q1, q1, #1
; CHECK-NEXT: vadd.i32 q1, q1, r0
-; CHECK-NEXT: vins.f16 s1, s8
-; CHECK-NEXT: vmov r0, s5
-; CHECK-NEXT: vldr.16 s8, [r0]
; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vmov r1, s5
; CHECK-NEXT: vldr.16 s2, [r0]
+; CHECK-NEXT: vldr.16 s8, [r1]
; CHECK-NEXT: vmov r0, s7
; CHECK-NEXT: vins.f16 s2, s8
; CHECK-NEXT: vldr.16 s8, [r0]
; CHECK-NEXT: vldr.16 s8, [r1]
; CHECK-NEXT: vmov r1, s4
; CHECK-NEXT: vldr.16 s0, [r1]
-; CHECK-NEXT: vmov r1, s7
-; CHECK-NEXT: vins.f16 s0, s8
-; CHECK-NEXT: vldr.16 s8, [r1]
+; CHECK-NEXT: vmov r2, s7
; CHECK-NEXT: vmov r1, s6
-; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT: vldr.16 s4, [r2]
+; CHECK-NEXT: vins.f16 s0, s8
; CHECK-NEXT: vldr.16 s1, [r1]
-; CHECK-NEXT: vmov r0, s5
-; CHECK-NEXT: vins.f16 s1, s8
-; CHECK-NEXT: vldr.16 s8, [r0]
+; CHECK-NEXT: vins.f16 s1, s4
+; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vmov r1, s5
; CHECK-NEXT: vldr.16 s2, [r0]
+; CHECK-NEXT: vldr.16 s8, [r1]
; CHECK-NEXT: vmov r0, s7
; CHECK-NEXT: vins.f16 s2, s8
; CHECK-NEXT: vldr.16 s8, [r0]
; CHECK-LE-NEXT: lsls r1, r1, #28
; CHECK-LE-NEXT: bpl .LBB18_5
; CHECK-LE-NEXT: .LBB18_4: @ %cond.load7
-; CHECK-LE-NEXT: vmovx.f16 s4, s0
-; CHECK-LE-NEXT: vins.f16 s0, s4
; CHECK-LE-NEXT: vldr.16 s4, [r2, #6]
; CHECK-LE-NEXT: vins.f16 s1, s4
; CHECK-LE-NEXT: .LBB18_5: @ %else8
; CHECK-LE-NEXT: .LBB18_7: @ %cond.load1
; CHECK-LE-NEXT: vldr.16 s4, [r2, #2]
; CHECK-LE-NEXT: vins.f16 s0, s4
-; CHECK-LE-NEXT: vmovx.f16 s4, s1
-; CHECK-LE-NEXT: vins.f16 s1, s4
; CHECK-LE-NEXT: lsls r3, r1, #29
; CHECK-LE-NEXT: bpl .LBB18_3
; CHECK-LE-NEXT: .LBB18_8: @ %cond.load4
-; CHECK-LE-NEXT: vmovx.f16 s4, s0
-; CHECK-LE-NEXT: vins.f16 s0, s4
; CHECK-LE-NEXT: vmovx.f16 s4, s1
; CHECK-LE-NEXT: vldr.16 s1, [r2, #4]
; CHECK-LE-NEXT: vins.f16 s1, s4
; CHECK-BE-NEXT: lsls r1, r1, #31
; CHECK-BE-NEXT: beq .LBB18_5
; CHECK-BE-NEXT: .LBB18_4: @ %cond.load7
-; CHECK-BE-NEXT: vmovx.f16 s4, s0
-; CHECK-BE-NEXT: vins.f16 s0, s4
; CHECK-BE-NEXT: vldr.16 s4, [r2, #6]
; CHECK-BE-NEXT: vins.f16 s1, s4
; CHECK-BE-NEXT: .LBB18_5: @ %else8
; CHECK-BE-NEXT: .LBB18_7: @ %cond.load1
; CHECK-BE-NEXT: vldr.16 s4, [r2, #2]
; CHECK-BE-NEXT: vins.f16 s0, s4
-; CHECK-BE-NEXT: vmovx.f16 s4, s1
-; CHECK-BE-NEXT: vins.f16 s1, s4
; CHECK-BE-NEXT: lsls r3, r1, #30
; CHECK-BE-NEXT: bpl .LBB18_3
; CHECK-BE-NEXT: .LBB18_8: @ %cond.load4
-; CHECK-BE-NEXT: vmovx.f16 s4, s0
-; CHECK-BE-NEXT: vins.f16 s0, s4
; CHECK-BE-NEXT: vmovx.f16 s4, s1
; CHECK-BE-NEXT: vldr.16 s1, [r2, #4]
; CHECK-BE-NEXT: vins.f16 s1, s4
; CHECK-LE-NEXT: lsls r1, r1, #28
; CHECK-LE-NEXT: bpl .LBB19_5
; CHECK-LE-NEXT: .LBB19_4: @ %cond.load7
-; CHECK-LE-NEXT: vmovx.f16 s4, s0
-; CHECK-LE-NEXT: vins.f16 s0, s4
; CHECK-LE-NEXT: vldr.16 s4, [r2, #6]
; CHECK-LE-NEXT: vins.f16 s1, s4
; CHECK-LE-NEXT: .LBB19_5: @ %else8
; CHECK-LE-NEXT: .LBB19_7: @ %cond.load1
; CHECK-LE-NEXT: vldr.16 s4, [r2, #2]
; CHECK-LE-NEXT: vins.f16 s0, s4
-; CHECK-LE-NEXT: vmovx.f16 s4, s1
-; CHECK-LE-NEXT: vins.f16 s1, s4
; CHECK-LE-NEXT: lsls r3, r1, #29
; CHECK-LE-NEXT: bpl .LBB19_3
; CHECK-LE-NEXT: .LBB19_8: @ %cond.load4
-; CHECK-LE-NEXT: vmovx.f16 s4, s0
-; CHECK-LE-NEXT: vins.f16 s0, s4
; CHECK-LE-NEXT: vmovx.f16 s4, s1
; CHECK-LE-NEXT: vldr.16 s1, [r2, #4]
; CHECK-LE-NEXT: vins.f16 s1, s4
; CHECK-BE-NEXT: lsls r1, r1, #31
; CHECK-BE-NEXT: beq .LBB19_5
; CHECK-BE-NEXT: .LBB19_4: @ %cond.load7
-; CHECK-BE-NEXT: vmovx.f16 s4, s0
-; CHECK-BE-NEXT: vins.f16 s0, s4
; CHECK-BE-NEXT: vldr.16 s4, [r2, #6]
; CHECK-BE-NEXT: vins.f16 s1, s4
; CHECK-BE-NEXT: .LBB19_5: @ %else8
; CHECK-BE-NEXT: .LBB19_7: @ %cond.load1
; CHECK-BE-NEXT: vldr.16 s4, [r2, #2]
; CHECK-BE-NEXT: vins.f16 s0, s4
-; CHECK-BE-NEXT: vmovx.f16 s4, s1
-; CHECK-BE-NEXT: vins.f16 s1, s4
; CHECK-BE-NEXT: lsls r3, r1, #30
; CHECK-BE-NEXT: bpl .LBB19_3
; CHECK-BE-NEXT: .LBB19_8: @ %cond.load4
-; CHECK-BE-NEXT: vmovx.f16 s4, s0
-; CHECK-BE-NEXT: vins.f16 s0, s4
; CHECK-BE-NEXT: vmovx.f16 s4, s1
; CHECK-BE-NEXT: vldr.16 s1, [r2, #4]
; CHECK-BE-NEXT: vins.f16 s1, s4
define arm_aapcs_vfpcc <8 x half> @minnm_float16_t(<8 x half> %src1, <8 x half> %src2) {
; CHECK-MVE-LABEL: minnm_float16_t:
; CHECK-MVE: @ %bb.0: @ %entry
-; CHECK-MVE-NEXT: vmovx.f16 s8, s0
-; CHECK-MVE-NEXT: vmovx.f16 s10, s4
-; CHECK-MVE-NEXT: vminnm.f16 s12, s10, s8
-; CHECK-MVE-NEXT: vminnm.f16 s8, s4, s0
-; CHECK-MVE-NEXT: vins.f16 s8, s12
-; CHECK-MVE-NEXT: vmovx.f16 s12, s1
+; CHECK-MVE-NEXT: vmov q2, q0
+; CHECK-MVE-NEXT: vmovx.f16 s2, s4
+; CHECK-MVE-NEXT: vmovx.f16 s0, s8
; CHECK-MVE-NEXT: vmovx.f16 s14, s5
-; CHECK-MVE-NEXT: vminnm.f16 s9, s5, s1
+; CHECK-MVE-NEXT: vminnm.f16 s12, s2, s0
+; CHECK-MVE-NEXT: vminnm.f16 s0, s4, s8
+; CHECK-MVE-NEXT: vins.f16 s0, s12
+; CHECK-MVE-NEXT: vmovx.f16 s12, s9
; CHECK-MVE-NEXT: vminnm.f16 s12, s14, s12
+; CHECK-MVE-NEXT: vminnm.f16 s1, s5, s9
+; CHECK-MVE-NEXT: vins.f16 s1, s12
+; CHECK-MVE-NEXT: vmovx.f16 s12, s10
; CHECK-MVE-NEXT: vmovx.f16 s14, s6
-; CHECK-MVE-NEXT: vins.f16 s9, s12
-; CHECK-MVE-NEXT: vmovx.f16 s12, s2
+; CHECK-MVE-NEXT: vminnm.f16 s2, s6, s10
; CHECK-MVE-NEXT: vminnm.f16 s12, s14, s12
-; CHECK-MVE-NEXT: vminnm.f16 s10, s6, s2
-; CHECK-MVE-NEXT: vins.f16 s10, s12
-; CHECK-MVE-NEXT: vmovx.f16 s12, s3
; CHECK-MVE-NEXT: vmovx.f16 s14, s7
-; CHECK-MVE-NEXT: vminnm.f16 s11, s7, s3
+; CHECK-MVE-NEXT: vins.f16 s2, s12
+; CHECK-MVE-NEXT: vmovx.f16 s12, s11
; CHECK-MVE-NEXT: vminnm.f16 s12, s14, s12
-; CHECK-MVE-NEXT: vins.f16 s11, s12
-; CHECK-MVE-NEXT: vmov q0, q2
+; CHECK-MVE-NEXT: vminnm.f16 s3, s7, s11
+; CHECK-MVE-NEXT: vins.f16 s3, s12
; CHECK-MVE-NEXT: bx lr
;
; CHECK-MVEFP-LABEL: minnm_float16_t:
define arm_aapcs_vfpcc <8 x i16> @shuffle1_i16(<8 x i16> %src) {
; CHECK-LABEL: shuffle1_i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovx.f16 s4, s3
-; CHECK-NEXT: vins.f16 s4, s3
-; CHECK-NEXT: vmovx.f16 s5, s2
-; CHECK-NEXT: vins.f16 s5, s2
-; CHECK-NEXT: vmovx.f16 s6, s1
-; CHECK-NEXT: vins.f16 s6, s1
-; CHECK-NEXT: vmovx.f16 s7, s0
-; CHECK-NEXT: vins.f16 s7, s0
-; CHECK-NEXT: vmov q0, q1
+; CHECK-NEXT: vmov q1, q0
+; CHECK-NEXT: vmovx.f16 s0, s7
+; CHECK-NEXT: vins.f16 s0, s7
+; CHECK-NEXT: vmovx.f16 s1, s6
+; CHECK-NEXT: vins.f16 s1, s6
+; CHECK-NEXT: vmovx.f16 s2, s5
+; CHECK-NEXT: vins.f16 s2, s5
+; CHECK-NEXT: vmovx.f16 s3, s4
+; CHECK-NEXT: vins.f16 s3, s4
; CHECK-NEXT: bx lr
entry:
%out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
define arm_aapcs_vfpcc <8 x i16> @shuffle3_i16(<8 x i16> %src) {
; CHECK-LABEL: shuffle3_i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovx.f16 s5, s3
-; CHECK-NEXT: vmov.u16 r0, q0[3]
-; CHECK-NEXT: vins.f16 s5, s3
-; CHECK-NEXT: vmov.16 q1[4], r0
-; CHECK-NEXT: vmov.u16 r0, q0[1]
-; CHECK-NEXT: vmov.16 q1[5], r0
-; CHECK-NEXT: vins.f16 s1, s0
-; CHECK-NEXT: vmov.f32 s7, s1
-; CHECK-NEXT: vmov.f32 s4, s2
-; CHECK-NEXT: vmov q0, q1
+; CHECK-NEXT: vmov q1, q0
+; CHECK-NEXT: vmovx.f16 s1, s7
+; CHECK-NEXT: vmovx.f16 s8, s4
+; CHECK-NEXT: vins.f16 s1, s7
+; CHECK-NEXT: vmovx.f16 s2, s5
+; CHECK-NEXT: vins.f16 s5, s4
+; CHECK-NEXT: vins.f16 s2, s8
+; CHECK-NEXT: vmov.f32 s3, s5
+; CHECK-NEXT: vmov.f32 s0, s6
; CHECK-NEXT: bx lr
entry:
%out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 7, i32 6, i32 3, i32 1, i32 2, i32 0>
define arm_aapcs_vfpcc <8 x i16> @shuffle2step_i16(<16 x i16> %src) {
; CHECK-LABEL: shuffle2step_i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.u16 r0, q0[1]
-; CHECK-NEXT: vmov.f32 s8, s0
-; CHECK-NEXT: vmov.16 q3[0], r0
-; CHECK-NEXT: vmov.u16 r0, q0[3]
-; CHECK-NEXT: vins.f16 s8, s1
-; CHECK-NEXT: vmov.16 q3[1], r0
-; CHECK-NEXT: vmov.u16 r0, q0[5]
-; CHECK-NEXT: vmov.f32 s9, s2
-; CHECK-NEXT: vmov.16 q3[2], r0
-; CHECK-NEXT: vmov.u16 r0, q0[7]
-; CHECK-NEXT: vins.f16 s9, s3
-; CHECK-NEXT: vmov.16 q3[3], r0
-; CHECK-NEXT: vmov.u16 r0, q1[1]
-; CHECK-NEXT: vmov.f32 s10, s4
-; CHECK-NEXT: vmov.16 q3[4], r0
-; CHECK-NEXT: vmov.u16 r0, q1[3]
-; CHECK-NEXT: vins.f16 s10, s5
-; CHECK-NEXT: vmov.16 q3[5], r0
-; CHECK-NEXT: vmov.u16 r0, q1[5]
-; CHECK-NEXT: vmov.f32 s11, s6
-; CHECK-NEXT: vmov.16 q3[6], r0
-; CHECK-NEXT: vmov.u16 r0, q1[7]
-; CHECK-NEXT: vins.f16 s11, s7
-; CHECK-NEXT: vmov.16 q3[7], r0
-; CHECK-NEXT: vadd.i16 q0, q2, q3
+; CHECK-NEXT: .vsave {d8}
+; CHECK-NEXT: vpush {d8}
+; CHECK-NEXT: vmovx.f16 s8, s0
+; CHECK-NEXT: vins.f16 s0, s1
+; CHECK-NEXT: vmovx.f16 s9, s2
+; CHECK-NEXT: vins.f16 s2, s3
+; CHECK-NEXT: vmovx.f16 s10, s4
+; CHECK-NEXT: vmovx.f16 s16, s1
+; CHECK-NEXT: vmov q3, q0
+; CHECK-NEXT: vins.f16 s8, s16
+; CHECK-NEXT: vmovx.f16 s0, s3
+; CHECK-NEXT: vins.f16 s4, s5
+; CHECK-NEXT: vins.f16 s9, s0
+; CHECK-NEXT: vmovx.f16 s0, s5
+; CHECK-NEXT: vins.f16 s10, s0
+; CHECK-NEXT: vmov.f32 s13, s2
+; CHECK-NEXT: vmov.f32 s14, s4
+; CHECK-NEXT: vmovx.f16 s0, s7
+; CHECK-NEXT: vmovx.f16 s11, s6
+; CHECK-NEXT: vins.f16 s6, s7
+; CHECK-NEXT: vins.f16 s11, s0
+; CHECK-NEXT: vmov.f32 s15, s6
+; CHECK-NEXT: vadd.i16 q0, q3, q2
+; CHECK-NEXT: vpop {d8}
; CHECK-NEXT: bx lr
entry:
%s1 = shufflevector <16 x i16> %src, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
define arm_aapcs_vfpcc <8 x i16> @shuffle3step_i16(<32 x i16> %src) {
; CHECK-LABEL: shuffle3step_i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vmov.u16 r0, q0[0]
-; CHECK-NEXT: vmov.16 q3[0], r0
-; CHECK-NEXT: vmov.u16 r0, q0[3]
-; CHECK-NEXT: vmov.16 q3[1], r0
-; CHECK-NEXT: vmov.u16 r0, q0[6]
-; CHECK-NEXT: vmov.16 q3[2], r0
-; CHECK-NEXT: vmov.u16 r0, q1[1]
-; CHECK-NEXT: vmov.16 q3[3], r0
-; CHECK-NEXT: vmov.u16 r0, q1[4]
-; CHECK-NEXT: vmov.16 q3[4], r0
-; CHECK-NEXT: vmov.u16 r0, q2[2]
-; CHECK-NEXT: vmov.16 q4[6], r0
-; CHECK-NEXT: vmov.u16 r0, q2[5]
-; CHECK-NEXT: vmov.16 q4[7], r0
-; CHECK-NEXT: vmov.u16 r0, q1[7]
-; CHECK-NEXT: vmov.16 q3[5], r0
-; CHECK-NEXT: vmov.u16 r0, q2[4]
-; CHECK-NEXT: vmov.16 q5[6], r0
-; CHECK-NEXT: vmov.u16 r0, q0[2]
-; CHECK-NEXT: vmov.f32 s15, s19
-; CHECK-NEXT: vmov.16 q4[0], r0
-; CHECK-NEXT: vmov.u16 r0, q0[5]
-; CHECK-NEXT: vmov.16 q4[1], r0
-; CHECK-NEXT: vmov.u16 r0, q1[0]
-; CHECK-NEXT: vmov.16 q4[2], r0
-; CHECK-NEXT: vmov.u16 r0, q1[3]
-; CHECK-NEXT: vmov.16 q4[3], r0
-; CHECK-NEXT: vmov.u16 r0, q2[7]
-; CHECK-NEXT: vmov.16 q5[7], r0
-; CHECK-NEXT: vmov.f32 s18, s7
-; CHECK-NEXT: vmov.f32 s22, s8
+; CHECK-NEXT: .vsave {d8, d9, d10, d11}
+; CHECK-NEXT: vpush {d8, d9, d10, d11}
+; CHECK-NEXT: vmovx.f16 s16, s2
+; CHECK-NEXT: vmov.f32 s12, s1
+; CHECK-NEXT: vins.f16 s12, s16
+; CHECK-NEXT: vmovx.f16 s16, s5
+; CHECK-NEXT: vmov.f32 s13, s4
+; CHECK-NEXT: vmovx.f16 s20, s11
+; CHECK-NEXT: vins.f16 s13, s16
+; CHECK-NEXT: vmov.f32 s19, s10
+; CHECK-NEXT: vins.f16 s19, s20
+; CHECK-NEXT: vmov.f32 s14, s7
+; CHECK-NEXT: vmov.f32 s18, s8
; CHECK-NEXT: vmov.u16 r0, q1[5]
-; CHECK-NEXT: vmov q6, q5
-; CHECK-NEXT: vmovnb.i32 q6, q4
-; CHECK-NEXT: vmov.f32 s18, s26
-; CHECK-NEXT: vmov.f32 s19, s23
-; CHECK-NEXT: vins.f16 s22, s8
-; CHECK-NEXT: vmovx.f16 s23, s9
-; CHECK-NEXT: vins.f16 s23, s11
-; CHECK-NEXT: vmovx.f16 s8, s0
-; CHECK-NEXT: vins.f16 s8, s2
-; CHECK-NEXT: vmovx.f16 s9, s3
-; CHECK-NEXT: vmov q0, q5
-; CHECK-NEXT: vins.f16 s9, s5
-; CHECK-NEXT: vmov.16 q2[4], r0
-; CHECK-NEXT: vmovnb.i32 q0, q2
-; CHECK-NEXT: vmov.f32 s10, s2
-; CHECK-NEXT: vmov.f32 s11, s23
-; CHECK-NEXT: vadd.i16 q0, q3, q2
-; CHECK-NEXT: vadd.i16 q0, q0, q4
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vmov q5, q4
+; CHECK-NEXT: vmovnb.i32 q5, q3
+; CHECK-NEXT: vmov.f32 s14, s22
+; CHECK-NEXT: vmovx.f16 s20, s1
+; CHECK-NEXT: vmov.f32 s15, s19
+; CHECK-NEXT: vmov.f32 s16, s0
+; CHECK-NEXT: vins.f16 s16, s20
+; CHECK-NEXT: vmovx.f16 s20, s4
+; CHECK-NEXT: vmov.f32 s17, s3
+; CHECK-NEXT: vins.f16 s17, s20
+; CHECK-NEXT: vmovx.f16 s20, s7
+; CHECK-NEXT: vmov.f32 s18, s6
+; CHECK-NEXT: vins.f16 s18, s20
+; CHECK-NEXT: vmovx.f16 s20, s10
+; CHECK-NEXT: vmov.f32 s19, s9
+; CHECK-NEXT: vins.f16 s19, s20
+; CHECK-NEXT: vmovx.f16 s20, s0
+; CHECK-NEXT: vins.f16 s20, s2
+; CHECK-NEXT: vmovx.f16 s21, s3
+; CHECK-NEXT: vins.f16 s2, s8
+; CHECK-NEXT: vmovx.f16 s3, s9
+; CHECK-NEXT: vins.f16 s21, s5
+; CHECK-NEXT: vins.f16 s3, s11
+; CHECK-NEXT: vmov.16 q5[4], r0
+; CHECK-NEXT: vmov q1, q0
+; CHECK-NEXT: vmovnb.i32 q1, q5
+; CHECK-NEXT: vmov.f32 s22, s6
+; CHECK-NEXT: vmov.f32 s23, s3
+; CHECK-NEXT: vadd.i16 q0, q4, q5
+; CHECK-NEXT: vadd.i16 q0, q0, q3
+; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: bx lr
entry:
%s1 = shufflevector <32 x i16> %src, <32 x i16> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
define arm_aapcs_vfpcc <8 x i16> @shuffle4step_i16(<32 x i16> %src) {
; CHECK-LABEL: shuffle4step_i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: vmov.u16 r0, q0[3]
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12}
+; CHECK-NEXT: vmovx.f16 s20, s11
+; CHECK-NEXT: vmovx.f16 s18, s9
+; CHECK-NEXT: vins.f16 s18, s20
+; CHECK-NEXT: vmovx.f16 s20, s15
+; CHECK-NEXT: vmovx.f16 s19, s13
+; CHECK-NEXT: vins.f16 s9, s11
+; CHECK-NEXT: vins.f16 s19, s20
+; CHECK-NEXT: vmovx.f16 s20, s3
+; CHECK-NEXT: vmovx.f16 s16, s1
+; CHECK-NEXT: vins.f16 s13, s15
+; CHECK-NEXT: vins.f16 s16, s20
+; CHECK-NEXT: vmovx.f16 s20, s7
+; CHECK-NEXT: vmovx.f16 s17, s5
+; CHECK-NEXT: vins.f16 s1, s3
+; CHECK-NEXT: vins.f16 s17, s20
; CHECK-NEXT: vmov.f32 s22, s9
-; CHECK-NEXT: vmov.16 q6[0], r0
-; CHECK-NEXT: vmov.u16 r0, q0[7]
-; CHECK-NEXT: vmov.16 q6[1], r0
-; CHECK-NEXT: vmov.u16 r0, q1[3]
-; CHECK-NEXT: vmov.16 q6[2], r0
-; CHECK-NEXT: vmov.u16 r0, q2[3]
-; CHECK-NEXT: vins.f16 s22, s11
-; CHECK-NEXT: vmov.16 q7[4], r0
-; CHECK-NEXT: vmov.u16 r0, q2[7]
; CHECK-NEXT: vmov.f32 s23, s13
-; CHECK-NEXT: vmov.16 q7[5], r0
-; CHECK-NEXT: vmov.u16 r0, q3[3]
-; CHECK-NEXT: vins.f16 s23, s15
-; CHECK-NEXT: vmov.16 q7[6], r0
-; CHECK-NEXT: vmov.u16 r0, q3[7]
+; CHECK-NEXT: vins.f16 s5, s7
; CHECK-NEXT: vmov.f32 s20, s1
-; CHECK-NEXT: vmov.16 q7[7], r0
-; CHECK-NEXT: vmov.u16 r0, q1[7]
-; CHECK-NEXT: vmov.16 q6[3], r0
-; CHECK-NEXT: vins.f16 s20, s3
-; CHECK-NEXT: vmov.f32 s18, s8
-; CHECK-NEXT: vmov.u16 r0, q0[1]
+; CHECK-NEXT: vmovx.f16 s24, s10
; CHECK-NEXT: vmov.f32 s21, s5
-; CHECK-NEXT: vins.f16 s18, s10
-; CHECK-NEXT: vmov.f32 s26, s30
-; CHECK-NEXT: vins.f16 s21, s7
-; CHECK-NEXT: vmov.f32 s27, s31
-; CHECK-NEXT: vmov.f32 s19, s12
-; CHECK-NEXT: vadd.i16 q5, q5, q6
-; CHECK-NEXT: vmov.16 q6[0], r0
-; CHECK-NEXT: vmov.u16 r0, q0[5]
-; CHECK-NEXT: vins.f16 s19, s14
-; CHECK-NEXT: vmov.16 q6[1], r0
-; CHECK-NEXT: vmov.u16 r0, q1[1]
-; CHECK-NEXT: vmov.f32 s16, s0
-; CHECK-NEXT: vmov.16 q6[2], r0
-; CHECK-NEXT: vmov.u16 r0, q2[1]
-; CHECK-NEXT: vins.f16 s16, s2
-; CHECK-NEXT: vmov.16 q0[4], r0
-; CHECK-NEXT: vmov.u16 r0, q2[5]
-; CHECK-NEXT: vmov.f32 s17, s4
-; CHECK-NEXT: vmov.16 q0[5], r0
-; CHECK-NEXT: vmov.u16 r0, q3[1]
-; CHECK-NEXT: vmov.16 q0[6], r0
-; CHECK-NEXT: vmov.u16 r0, q3[5]
-; CHECK-NEXT: vmov.16 q0[7], r0
-; CHECK-NEXT: vmov.u16 r0, q1[5]
-; CHECK-NEXT: vmov.16 q6[3], r0
-; CHECK-NEXT: vins.f16 s17, s6
-; CHECK-NEXT: vmov.f32 s26, s2
-; CHECK-NEXT: vmov.f32 s27, s3
-; CHECK-NEXT: vadd.i16 q0, q4, q6
+; CHECK-NEXT: vadd.i16 q4, q5, q4
+; CHECK-NEXT: vmovx.f16 s22, s8
+; CHECK-NEXT: vins.f16 s22, s24
+; CHECK-NEXT: vins.f16 s8, s10
+; CHECK-NEXT: vmovx.f16 s24, s14
+; CHECK-NEXT: vmovx.f16 s23, s12
+; CHECK-NEXT: vins.f16 s12, s14
+; CHECK-NEXT: vmov.f32 s10, s8
+; CHECK-NEXT: vmov.f32 s11, s12
+; CHECK-NEXT: vins.f16 s23, s24
+; CHECK-NEXT: vmovx.f16 s24, s2
+; CHECK-NEXT: vmovx.f16 s20, s0
+; CHECK-NEXT: vins.f16 s20, s24
+; CHECK-NEXT: vmovx.f16 s24, s6
+; CHECK-NEXT: vmovx.f16 s21, s4
+; CHECK-NEXT: vins.f16 s0, s2
+; CHECK-NEXT: vins.f16 s4, s6
+; CHECK-NEXT: vins.f16 s21, s24
+; CHECK-NEXT: vmov.f32 s1, s4
+; CHECK-NEXT: vmov.f32 s2, s10
+; CHECK-NEXT: vmov.f32 s3, s11
; CHECK-NEXT: vadd.i16 q0, q0, q5
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: vadd.i16 q0, q0, q4
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12}
; CHECK-NEXT: bx lr
entry:
%s1 = shufflevector <32 x i16> %src, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
define arm_aapcs_vfpcc <8 x half> @shuffle2step_f16(<16 x half> %src) {
; CHECKFP-LABEL: shuffle2step_f16:
; CHECKFP: @ %bb.0: @ %entry
-; CHECKFP-NEXT: .vsave {d8}
-; CHECKFP-NEXT: vpush {d8}
-; CHECKFP-NEXT: vmovx.f16 s16, s1
-; CHECKFP-NEXT: vmovx.f16 s12, s0
-; CHECKFP-NEXT: vmov.f32 s8, s0
-; CHECKFP-NEXT: vins.f16 s12, s16
-; CHECKFP-NEXT: vins.f16 s8, s1
-; CHECKFP-NEXT: vmovx.f16 s13, s2
-; CHECKFP-NEXT: vmovx.f16 s16, s3
-; CHECKFP-NEXT: vmov.f32 s9, s2
-; CHECKFP-NEXT: vins.f16 s13, s16
-; CHECKFP-NEXT: vins.f16 s9, s3
-; CHECKFP-NEXT: vmovx.f16 s0, s5
-; CHECKFP-NEXT: vmovx.f16 s14, s4
-; CHECKFP-NEXT: vmov.f32 s10, s4
-; CHECKFP-NEXT: vins.f16 s14, s0
-; CHECKFP-NEXT: vins.f16 s10, s5
-; CHECKFP-NEXT: vmovx.f16 s0, s7
-; CHECKFP-NEXT: vmovx.f16 s15, s6
+; CHECKFP-NEXT: vmovx.f16 s12, s1
+; CHECKFP-NEXT: vmovx.f16 s8, s0
+; CHECKFP-NEXT: vins.f16 s8, s12
+; CHECKFP-NEXT: vmovx.f16 s12, s3
+; CHECKFP-NEXT: vmovx.f16 s9, s2
+; CHECKFP-NEXT: vins.f16 s0, s1
+; CHECKFP-NEXT: vins.f16 s9, s12
+; CHECKFP-NEXT: vins.f16 s2, s3
+; CHECKFP-NEXT: vmovx.f16 s12, s5
+; CHECKFP-NEXT: vmovx.f16 s10, s4
+; CHECKFP-NEXT: vins.f16 s10, s12
+; CHECKFP-NEXT: vins.f16 s4, s5
+; CHECKFP-NEXT: vmov.f32 s1, s2
+; CHECKFP-NEXT: vmovx.f16 s12, s7
+; CHECKFP-NEXT: vmovx.f16 s11, s6
; CHECKFP-NEXT: vins.f16 s6, s7
-; CHECKFP-NEXT: vins.f16 s15, s0
-; CHECKFP-NEXT: vmov.f32 s11, s6
-; CHECKFP-NEXT: vadd.f16 q0, q2, q3
-; CHECKFP-NEXT: vpop {d8}
+; CHECKFP-NEXT: vmov.f32 s2, s4
+; CHECKFP-NEXT: vins.f16 s11, s12
+; CHECKFP-NEXT: vmov.f32 s3, s6
+; CHECKFP-NEXT: vadd.f16 q0, q0, q2
; CHECKFP-NEXT: bx lr
entry:
%s1 = shufflevector <16 x half> %src, <16 x half> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
define arm_aapcs_vfpcc <8 x half> @shuffle3step_f16(<32 x half> %src) {
; CHECKFP-LABEL: shuffle3step_f16:
; CHECKFP: @ %bb.0: @ %entry
-; CHECKFP-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
-; CHECKFP-NEXT: vpush {d8, d9, d10, d11, d12, d13}
+; CHECKFP-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECKFP-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECKFP-NEXT: vmovx.f16 s16, s2
; CHECKFP-NEXT: vmov.f32 s12, s1
; CHECKFP-NEXT: vins.f16 s12, s16
; CHECKFP-NEXT: vins.f16 s19, s20
; CHECKFP-NEXT: vmov.f32 s14, s7
; CHECKFP-NEXT: vmovx.f16 s20, s8
-; CHECKFP-NEXT: vmovx.f16 s24, s1
+; CHECKFP-NEXT: vmov.f32 s28, s6
; CHECKFP-NEXT: vins.f16 s14, s20
-; CHECKFP-NEXT: vmov.f32 s20, s0
-; CHECKFP-NEXT: vins.f16 s20, s24
-; CHECKFP-NEXT: vmovx.f16 s24, s4
-; CHECKFP-NEXT: vmov.f32 s21, s3
-; CHECKFP-NEXT: vins.f16 s21, s24
-; CHECKFP-NEXT: vmovx.f16 s24, s7
-; CHECKFP-NEXT: vmov.f32 s22, s6
-; CHECKFP-NEXT: vins.f16 s22, s24
-; CHECKFP-NEXT: vmovx.f16 s24, s0
-; CHECKFP-NEXT: vins.f16 s24, s2
+; CHECKFP-NEXT: vmovx.f16 s20, s7
+; CHECKFP-NEXT: vins.f16 s28, s20
+; CHECKFP-NEXT: vmovx.f16 s24, s1
+; CHECKFP-NEXT: vmovx.f16 s20, s0
+; CHECKFP-NEXT: vins.f16 s0, s24
+; CHECKFP-NEXT: vins.f16 s20, s2
+; CHECKFP-NEXT: vmovx.f16 s26, s4
+; CHECKFP-NEXT: vmovx.f16 s21, s3
+; CHECKFP-NEXT: vins.f16 s3, s26
+; CHECKFP-NEXT: vins.f16 s21, s5
+; CHECKFP-NEXT: vmovx.f16 s30, s10
+; CHECKFP-NEXT: vmovx.f16 s23, s9
; CHECKFP-NEXT: vmov.f32 s18, s8
-; CHECKFP-NEXT: vmovx.f16 s25, s3
-; CHECKFP-NEXT: vmovx.f16 s0, s10
-; CHECKFP-NEXT: vins.f16 s25, s5
+; CHECKFP-NEXT: vmov.f32 s1, s3
+; CHECKFP-NEXT: vins.f16 s9, s30
+; CHECKFP-NEXT: vins.f16 s23, s11
+; CHECKFP-NEXT: vmov.f32 s2, s28
+; CHECKFP-NEXT: vmovx.f16 s22, s6
+; CHECKFP-NEXT: vmov.f32 s3, s9
+; CHECKFP-NEXT: vins.f16 s22, s8
; CHECKFP-NEXT: vmov.f32 s15, s19
-; CHECKFP-NEXT: vmovx.f16 s27, s9
-; CHECKFP-NEXT: vins.f16 s9, s0
-; CHECKFP-NEXT: vins.f16 s27, s11
-; CHECKFP-NEXT: vmov.f32 s23, s9
-; CHECKFP-NEXT: vmovx.f16 s26, s6
-; CHECKFP-NEXT: vins.f16 s26, s8
-; CHECKFP-NEXT: vadd.f16 q0, q5, q6
+; CHECKFP-NEXT: vadd.f16 q0, q0, q5
; CHECKFP-NEXT: vadd.f16 q0, q0, q3
-; CHECKFP-NEXT: vpop {d8, d9, d10, d11, d12, d13}
+; CHECKFP-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECKFP-NEXT: bx lr
entry:
%s1 = shufflevector <32 x half> %src, <32 x half> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
define arm_aapcs_vfpcc <8 x half> @shuffle4step_f16(<32 x half> %src) {
; CHECKFP-LABEL: shuffle4step_f16:
; CHECKFP: @ %bb.0: @ %entry
-; CHECKFP-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14}
-; CHECKFP-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14}
+; CHECKFP-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
+; CHECKFP-NEXT: vpush {d8, d9, d10, d11, d12, d13}
; CHECKFP-NEXT: vmovx.f16 s20, s11
; CHECKFP-NEXT: vmovx.f16 s18, s9
; CHECKFP-NEXT: vins.f16 s18, s20
; CHECKFP-NEXT: vins.f16 s19, s20
; CHECKFP-NEXT: vmovx.f16 s20, s3
; CHECKFP-NEXT: vmovx.f16 s16, s1
-; CHECKFP-NEXT: vmovx.f16 s28, s10
+; CHECKFP-NEXT: vmovx.f16 s24, s10
; CHECKFP-NEXT: vins.f16 s16, s20
-; CHECKFP-NEXT: vmovx.f16 s26, s8
; CHECKFP-NEXT: vmovx.f16 s20, s7
; CHECKFP-NEXT: vmovx.f16 s17, s5
-; CHECKFP-NEXT: vins.f16 s17, s20
-; CHECKFP-NEXT: vmov.f32 s22, s9
-; CHECKFP-NEXT: vins.f16 s8, s10
; CHECKFP-NEXT: vins.f16 s13, s15
-; CHECKFP-NEXT: vins.f16 s26, s28
-; CHECKFP-NEXT: vmov.f32 s23, s13
-; CHECKFP-NEXT: vmovx.f16 s28, s14
-; CHECKFP-NEXT: vmovx.f16 s27, s12
-; CHECKFP-NEXT: vmov.f32 s10, s8
-; CHECKFP-NEXT: vins.f16 s12, s14
-; CHECKFP-NEXT: vmov.f32 s11, s12
-; CHECKFP-NEXT: vins.f16 s27, s28
+; CHECKFP-NEXT: vins.f16 s17, s20
+; CHECKFP-NEXT: vmovx.f16 s22, s8
+; CHECKFP-NEXT: vins.f16 s22, s24
+; CHECKFP-NEXT: vmovx.f16 s24, s14
+; CHECKFP-NEXT: vmovx.f16 s23, s12
; CHECKFP-NEXT: vins.f16 s1, s3
-; CHECKFP-NEXT: vmovx.f16 s28, s2
-; CHECKFP-NEXT: vmovx.f16 s24, s0
-; CHECKFP-NEXT: vmov.f32 s20, s1
+; CHECKFP-NEXT: vins.f16 s23, s24
+; CHECKFP-NEXT: vmovx.f16 s24, s2
+; CHECKFP-NEXT: vmovx.f16 s20, s0
; CHECKFP-NEXT: vins.f16 s5, s7
-; CHECKFP-NEXT: vins.f16 s24, s28
-; CHECKFP-NEXT: vmov.f32 s21, s5
-; CHECKFP-NEXT: vmovx.f16 s28, s6
-; CHECKFP-NEXT: vmovx.f16 s25, s4
+; CHECKFP-NEXT: vins.f16 s20, s24
+; CHECKFP-NEXT: vmovx.f16 s24, s6
+; CHECKFP-NEXT: vmovx.f16 s21, s4
+; CHECKFP-NEXT: vins.f16 s8, s10
+; CHECKFP-NEXT: vins.f16 s21, s24
+; CHECKFP-NEXT: vmov.f32 s26, s9
+; CHECKFP-NEXT: vins.f16 s12, s14
; CHECKFP-NEXT: vins.f16 s0, s2
+; CHECKFP-NEXT: vmov.f32 s27, s13
; CHECKFP-NEXT: vins.f16 s4, s6
-; CHECKFP-NEXT: vins.f16 s25, s28
+; CHECKFP-NEXT: vmov.f32 s24, s1
+; CHECKFP-NEXT: vmov.f32 s2, s8
+; CHECKFP-NEXT: vmov.f32 s3, s12
; CHECKFP-NEXT: vmov.f32 s1, s4
-; CHECKFP-NEXT: vadd.f16 q1, q5, q4
-; CHECKFP-NEXT: vmov.f32 s2, s10
-; CHECKFP-NEXT: vmov.f32 s3, s11
-; CHECKFP-NEXT: vadd.f16 q0, q0, q6
-; CHECKFP-NEXT: vadd.f16 q0, q0, q1
-; CHECKFP-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14}
+; CHECKFP-NEXT: vmov.f32 s25, s5
+; CHECKFP-NEXT: vadd.f16 q0, q0, q5
+; CHECKFP-NEXT: vadd.f16 q4, q6, q4
+; CHECKFP-NEXT: vadd.f16 q0, q0, q4
+; CHECKFP-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; CHECKFP-NEXT: bx lr
entry:
%s1 = shufflevector <32 x half> %src, <32 x half> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
define arm_aapcs_vfpcc <8 x i16> @shuffle_i16_76543210(<8 x i16> %s1, <8 x i16> %s2) {
; CHECK-LABEL: shuffle_i16_76543210:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovx.f16 s4, s3
-; CHECK-NEXT: vins.f16 s4, s3
-; CHECK-NEXT: vmovx.f16 s5, s2
-; CHECK-NEXT: vins.f16 s5, s2
-; CHECK-NEXT: vmovx.f16 s6, s1
-; CHECK-NEXT: vins.f16 s6, s1
-; CHECK-NEXT: vmovx.f16 s7, s0
-; CHECK-NEXT: vins.f16 s7, s0
-; CHECK-NEXT: vmov q0, q1
+; CHECK-NEXT: vmov q1, q0
+; CHECK-NEXT: vmovx.f16 s0, s7
+; CHECK-NEXT: vins.f16 s0, s7
+; CHECK-NEXT: vmovx.f16 s1, s6
+; CHECK-NEXT: vins.f16 s1, s6
+; CHECK-NEXT: vmovx.f16 s2, s5
+; CHECK-NEXT: vins.f16 s2, s5
+; CHECK-NEXT: vmovx.f16 s3, s4
+; CHECK-NEXT: vins.f16 s3, s4
; CHECK-NEXT: bx lr
entry:
%out = shufflevector <8 x i16> %s1, <8 x i16> %s2, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
define arm_aapcs_vfpcc <8 x half> @add_float16_t(<8 x half> %src1, <8 x half> %src2) {
; CHECK-MVE-LABEL: add_float16_t:
; CHECK-MVE: @ %bb.0: @ %entry
-; CHECK-MVE-NEXT: vmovx.f16 s8, s0
-; CHECK-MVE-NEXT: vmovx.f16 s10, s4
-; CHECK-MVE-NEXT: vadd.f16 s12, s10, s8
-; CHECK-MVE-NEXT: vadd.f16 s8, s4, s0
-; CHECK-MVE-NEXT: vins.f16 s8, s12
-; CHECK-MVE-NEXT: vmovx.f16 s12, s1
+; CHECK-MVE-NEXT: vmov q2, q0
+; CHECK-MVE-NEXT: vmovx.f16 s2, s4
+; CHECK-MVE-NEXT: vmovx.f16 s0, s8
; CHECK-MVE-NEXT: vmovx.f16 s14, s5
-; CHECK-MVE-NEXT: vadd.f16 s9, s5, s1
+; CHECK-MVE-NEXT: vadd.f16 s12, s2, s0
+; CHECK-MVE-NEXT: vadd.f16 s0, s4, s8
+; CHECK-MVE-NEXT: vins.f16 s0, s12
+; CHECK-MVE-NEXT: vmovx.f16 s12, s9
; CHECK-MVE-NEXT: vadd.f16 s12, s14, s12
+; CHECK-MVE-NEXT: vadd.f16 s1, s5, s9
+; CHECK-MVE-NEXT: vins.f16 s1, s12
+; CHECK-MVE-NEXT: vmovx.f16 s12, s10
; CHECK-MVE-NEXT: vmovx.f16 s14, s6
-; CHECK-MVE-NEXT: vins.f16 s9, s12
-; CHECK-MVE-NEXT: vmovx.f16 s12, s2
+; CHECK-MVE-NEXT: vadd.f16 s2, s6, s10
; CHECK-MVE-NEXT: vadd.f16 s12, s14, s12
-; CHECK-MVE-NEXT: vadd.f16 s10, s6, s2
-; CHECK-MVE-NEXT: vins.f16 s10, s12
-; CHECK-MVE-NEXT: vmovx.f16 s12, s3
; CHECK-MVE-NEXT: vmovx.f16 s14, s7
-; CHECK-MVE-NEXT: vadd.f16 s11, s7, s3
+; CHECK-MVE-NEXT: vins.f16 s2, s12
+; CHECK-MVE-NEXT: vmovx.f16 s12, s11
; CHECK-MVE-NEXT: vadd.f16 s12, s14, s12
-; CHECK-MVE-NEXT: vins.f16 s11, s12
-; CHECK-MVE-NEXT: vmov q0, q2
+; CHECK-MVE-NEXT: vadd.f16 s3, s7, s11
+; CHECK-MVE-NEXT: vins.f16 s3, s12
; CHECK-MVE-NEXT: bx lr
;
; CHECK-MVEFP-LABEL: add_float16_t:
define arm_aapcs_vfpcc <8 x half> @sub_float16_t(<8 x half> %src1, <8 x half> %src2) {
; CHECK-MVE-LABEL: sub_float16_t:
; CHECK-MVE: @ %bb.0: @ %entry
-; CHECK-MVE-NEXT: vmovx.f16 s8, s0
-; CHECK-MVE-NEXT: vmovx.f16 s10, s4
-; CHECK-MVE-NEXT: vsub.f16 s12, s10, s8
-; CHECK-MVE-NEXT: vsub.f16 s8, s4, s0
-; CHECK-MVE-NEXT: vins.f16 s8, s12
-; CHECK-MVE-NEXT: vmovx.f16 s12, s1
+; CHECK-MVE-NEXT: vmov q2, q0
+; CHECK-MVE-NEXT: vmovx.f16 s2, s4
+; CHECK-MVE-NEXT: vmovx.f16 s0, s8
; CHECK-MVE-NEXT: vmovx.f16 s14, s5
-; CHECK-MVE-NEXT: vsub.f16 s9, s5, s1
+; CHECK-MVE-NEXT: vsub.f16 s12, s2, s0
+; CHECK-MVE-NEXT: vsub.f16 s0, s4, s8
+; CHECK-MVE-NEXT: vins.f16 s0, s12
+; CHECK-MVE-NEXT: vmovx.f16 s12, s9
; CHECK-MVE-NEXT: vsub.f16 s12, s14, s12
+; CHECK-MVE-NEXT: vsub.f16 s1, s5, s9
+; CHECK-MVE-NEXT: vins.f16 s1, s12
+; CHECK-MVE-NEXT: vmovx.f16 s12, s10
; CHECK-MVE-NEXT: vmovx.f16 s14, s6
-; CHECK-MVE-NEXT: vins.f16 s9, s12
-; CHECK-MVE-NEXT: vmovx.f16 s12, s2
+; CHECK-MVE-NEXT: vsub.f16 s2, s6, s10
; CHECK-MVE-NEXT: vsub.f16 s12, s14, s12
-; CHECK-MVE-NEXT: vsub.f16 s10, s6, s2
-; CHECK-MVE-NEXT: vins.f16 s10, s12
-; CHECK-MVE-NEXT: vmovx.f16 s12, s3
; CHECK-MVE-NEXT: vmovx.f16 s14, s7
-; CHECK-MVE-NEXT: vsub.f16 s11, s7, s3
+; CHECK-MVE-NEXT: vins.f16 s2, s12
+; CHECK-MVE-NEXT: vmovx.f16 s12, s11
; CHECK-MVE-NEXT: vsub.f16 s12, s14, s12
-; CHECK-MVE-NEXT: vins.f16 s11, s12
-; CHECK-MVE-NEXT: vmov q0, q2
+; CHECK-MVE-NEXT: vsub.f16 s3, s7, s11
+; CHECK-MVE-NEXT: vins.f16 s3, s12
; CHECK-MVE-NEXT: bx lr
;
; CHECK-MVEFP-LABEL: sub_float16_t:
define arm_aapcs_vfpcc <8 x half> @mul_float16_t(<8 x half> %src1, <8 x half> %src2) {
; CHECK-MVE-LABEL: mul_float16_t:
; CHECK-MVE: @ %bb.0: @ %entry
-; CHECK-MVE-NEXT: vmovx.f16 s8, s0
-; CHECK-MVE-NEXT: vmovx.f16 s10, s4
-; CHECK-MVE-NEXT: vmul.f16 s12, s10, s8
-; CHECK-MVE-NEXT: vmul.f16 s8, s4, s0
-; CHECK-MVE-NEXT: vins.f16 s8, s12
-; CHECK-MVE-NEXT: vmovx.f16 s12, s1
+; CHECK-MVE-NEXT: vmov q2, q0
+; CHECK-MVE-NEXT: vmovx.f16 s2, s4
+; CHECK-MVE-NEXT: vmovx.f16 s0, s8
; CHECK-MVE-NEXT: vmovx.f16 s14, s5
-; CHECK-MVE-NEXT: vmul.f16 s9, s5, s1
+; CHECK-MVE-NEXT: vmul.f16 s12, s2, s0
+; CHECK-MVE-NEXT: vmul.f16 s0, s4, s8
+; CHECK-MVE-NEXT: vins.f16 s0, s12
+; CHECK-MVE-NEXT: vmovx.f16 s12, s9
; CHECK-MVE-NEXT: vmul.f16 s12, s14, s12
+; CHECK-MVE-NEXT: vmul.f16 s1, s5, s9
+; CHECK-MVE-NEXT: vins.f16 s1, s12
+; CHECK-MVE-NEXT: vmovx.f16 s12, s10
; CHECK-MVE-NEXT: vmovx.f16 s14, s6
-; CHECK-MVE-NEXT: vins.f16 s9, s12
-; CHECK-MVE-NEXT: vmovx.f16 s12, s2
+; CHECK-MVE-NEXT: vmul.f16 s2, s6, s10
; CHECK-MVE-NEXT: vmul.f16 s12, s14, s12
-; CHECK-MVE-NEXT: vmul.f16 s10, s6, s2
-; CHECK-MVE-NEXT: vins.f16 s10, s12
-; CHECK-MVE-NEXT: vmovx.f16 s12, s3
; CHECK-MVE-NEXT: vmovx.f16 s14, s7
-; CHECK-MVE-NEXT: vmul.f16 s11, s7, s3
+; CHECK-MVE-NEXT: vins.f16 s2, s12
+; CHECK-MVE-NEXT: vmovx.f16 s12, s11
; CHECK-MVE-NEXT: vmul.f16 s12, s14, s12
-; CHECK-MVE-NEXT: vins.f16 s11, s12
-; CHECK-MVE-NEXT: vmov q0, q2
+; CHECK-MVE-NEXT: vmul.f16 s3, s7, s11
+; CHECK-MVE-NEXT: vins.f16 s3, s12
; CHECK-MVE-NEXT: bx lr
;
; CHECK-MVEFP-LABEL: mul_float16_t:
; CHECK-MVE-LABEL: foo_half_int16:
; CHECK-MVE: @ %bb.0: @ %entry
; CHECK-MVE-NEXT: vmov q1, q0
-; CHECK-MVE-NEXT: vmov.s16 r0, q0[1]
+; CHECK-MVE-NEXT: vmov.s16 r0, q0[0]
; CHECK-MVE-NEXT: vmov s0, r0
-; CHECK-MVE-NEXT: vmov.s16 r0, q1[0]
-; CHECK-MVE-NEXT: vcvt.f16.s32 s8, s0
-; CHECK-MVE-NEXT: vmov s0, r0
-; CHECK-MVE-NEXT: vcvt.f16.s32 s0, s0
+; CHECK-MVE-NEXT: vmov.s16 r0, q1[1]
+; CHECK-MVE-NEXT: vmov s2, r0
; CHECK-MVE-NEXT: vmov.s16 r0, q1[3]
+; CHECK-MVE-NEXT: vcvt.f16.s32 s8, s2
+; CHECK-MVE-NEXT: vcvt.f16.s32 s0, s0
; CHECK-MVE-NEXT: vins.f16 s0, s8
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vmov.s16 r0, q1[2]
; CHECK-MVE-NEXT: vcvt.f16.s32 s8, s8
; CHECK-MVE-NEXT: vmov s10, r0
-; CHECK-MVE-NEXT: vmov.s16 r0, q1[5]
+; CHECK-MVE-NEXT: vmov.s16 r0, q1[4]
; CHECK-MVE-NEXT: vcvt.f16.s32 s1, s10
; CHECK-MVE-NEXT: vins.f16 s1, s8
; CHECK-MVE-NEXT: vmov s8, r0
-; CHECK-MVE-NEXT: vmov.s16 r0, q1[4]
-; CHECK-MVE-NEXT: vcvt.f16.s32 s8, s8
+; CHECK-MVE-NEXT: vmov.s16 r0, q1[5]
+; CHECK-MVE-NEXT: vcvt.f16.s32 s2, s8
; CHECK-MVE-NEXT: vmov s10, r0
; CHECK-MVE-NEXT: vmov.s16 r0, q1[7]
-; CHECK-MVE-NEXT: vcvt.f16.s32 s2, s10
-; CHECK-MVE-NEXT: vins.f16 s2, s8
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vmov.s16 r0, q1[6]
-; CHECK-MVE-NEXT: vcvt.f16.s32 s8, s8
+; CHECK-MVE-NEXT: vcvt.f16.s32 s10, s10
; CHECK-MVE-NEXT: vmov s4, r0
+; CHECK-MVE-NEXT: vins.f16 s2, s10
+; CHECK-MVE-NEXT: vcvt.f16.s32 s8, s8
; CHECK-MVE-NEXT: vcvt.f16.s32 s3, s4
; CHECK-MVE-NEXT: vins.f16 s3, s8
; CHECK-MVE-NEXT: bx lr
; CHECK-MVE-LABEL: foo_half_uint16:
; CHECK-MVE: @ %bb.0: @ %entry
; CHECK-MVE-NEXT: vmov q1, q0
-; CHECK-MVE-NEXT: vmov.u16 r0, q0[1]
+; CHECK-MVE-NEXT: vmov.u16 r0, q0[0]
; CHECK-MVE-NEXT: vmov s0, r0
-; CHECK-MVE-NEXT: vmov.u16 r0, q1[0]
-; CHECK-MVE-NEXT: vcvt.f16.u32 s8, s0
-; CHECK-MVE-NEXT: vmov s0, r0
-; CHECK-MVE-NEXT: vcvt.f16.u32 s0, s0
+; CHECK-MVE-NEXT: vmov.u16 r0, q1[1]
+; CHECK-MVE-NEXT: vmov s2, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[3]
+; CHECK-MVE-NEXT: vcvt.f16.u32 s8, s2
+; CHECK-MVE-NEXT: vcvt.f16.u32 s0, s0
; CHECK-MVE-NEXT: vins.f16 s0, s8
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[2]
; CHECK-MVE-NEXT: vcvt.f16.u32 s8, s8
; CHECK-MVE-NEXT: vmov s10, r0
-; CHECK-MVE-NEXT: vmov.u16 r0, q1[5]
+; CHECK-MVE-NEXT: vmov.u16 r0, q1[4]
; CHECK-MVE-NEXT: vcvt.f16.u32 s1, s10
; CHECK-MVE-NEXT: vins.f16 s1, s8
; CHECK-MVE-NEXT: vmov s8, r0
-; CHECK-MVE-NEXT: vmov.u16 r0, q1[4]
-; CHECK-MVE-NEXT: vcvt.f16.u32 s8, s8
+; CHECK-MVE-NEXT: vmov.u16 r0, q1[5]
+; CHECK-MVE-NEXT: vcvt.f16.u32 s2, s8
; CHECK-MVE-NEXT: vmov s10, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[7]
-; CHECK-MVE-NEXT: vcvt.f16.u32 s2, s10
-; CHECK-MVE-NEXT: vins.f16 s2, s8
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[6]
-; CHECK-MVE-NEXT: vcvt.f16.u32 s8, s8
+; CHECK-MVE-NEXT: vcvt.f16.u32 s10, s10
; CHECK-MVE-NEXT: vmov s4, r0
+; CHECK-MVE-NEXT: vins.f16 s2, s10
+; CHECK-MVE-NEXT: vcvt.f16.u32 s8, s8
; CHECK-MVE-NEXT: vcvt.f16.u32 s3, s4
; CHECK-MVE-NEXT: vins.f16 s3, s8
; CHECK-MVE-NEXT: bx lr
define void @vld2_v8i16_align1(<16 x i16> *%src, <8 x i16> *%dst) {
; CHECK-LABEL: vld2_v8i16_align1:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u8 q2, [r0]
-; CHECK-NEXT: vldrb.u8 q0, [r0, #16]
-; CHECK-NEXT: vmov.f64 d2, d4
-; CHECK-NEXT: vmov.u16 r0, q2[1]
-; CHECK-NEXT: vmov.16 q3[0], r0
-; CHECK-NEXT: vmov.u16 r0, q2[3]
-; CHECK-NEXT: vmov.16 q3[1], r0
-; CHECK-NEXT: vmov.u16 r0, q2[5]
-; CHECK-NEXT: vmov.16 q3[2], r0
-; CHECK-NEXT: vmov.u16 r0, q2[7]
-; CHECK-NEXT: vins.f16 s4, s9
-; CHECK-NEXT: vmov.16 q3[3], r0
-; CHECK-NEXT: vmov.f32 s5, s10
-; CHECK-NEXT: vmov.u16 r0, q0[1]
-; CHECK-NEXT: vins.f16 s5, s11
-; CHECK-NEXT: vmov.16 q3[4], r0
-; CHECK-NEXT: vmov.f32 s6, s0
-; CHECK-NEXT: vmov.u16 r0, q0[3]
-; CHECK-NEXT: vins.f16 s6, s1
-; CHECK-NEXT: vmov.16 q3[5], r0
-; CHECK-NEXT: vmov.u16 r0, q0[5]
-; CHECK-NEXT: vmov.f32 s7, s2
-; CHECK-NEXT: vmov.16 q3[6], r0
-; CHECK-NEXT: vmov.u16 r0, q0[7]
-; CHECK-NEXT: vins.f16 s7, s3
-; CHECK-NEXT: vmov.16 q3[7], r0
-; CHECK-NEXT: vadd.i16 q0, q1, q3
+; CHECK-NEXT: vldrb.u8 q0, [r0]
+; CHECK-NEXT: vmovx.f16 s8, s1
+; CHECK-NEXT: vmovx.f16 s4, s0
+; CHECK-NEXT: vins.f16 s4, s8
+; CHECK-NEXT: vmovx.f16 s8, s3
+; CHECK-NEXT: vmovx.f16 s5, s2
+; CHECK-NEXT: vins.f16 s0, s1
+; CHECK-NEXT: vins.f16 s5, s8
+; CHECK-NEXT: vldrb.u8 q2, [r0, #16]
+; CHECK-NEXT: vins.f16 s2, s3
+; CHECK-NEXT: vmovx.f16 s12, s9
+; CHECK-NEXT: vmovx.f16 s6, s8
+; CHECK-NEXT: vins.f16 s6, s12
+; CHECK-NEXT: vmovx.f16 s12, s11
+; CHECK-NEXT: vmovx.f16 s7, s10
+; CHECK-NEXT: vmov.f32 s1, s2
+; CHECK-NEXT: vins.f16 s8, s9
+; CHECK-NEXT: vins.f16 s7, s12
+; CHECK-NEXT: vmov.f32 s2, s8
+; CHECK-NEXT: vins.f16 s10, s11
+; CHECK-NEXT: vmov.f32 s3, s10
+; CHECK-NEXT: vadd.i16 q0, q0, q1
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
; CHECK-LABEL: vld2_v4f16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r0]
-; CHECK-NEXT: vmovx.f16 s4, s0
-; CHECK-NEXT: vins.f16 s0, s1
; CHECK-NEXT: vmovx.f16 s8, s1
+; CHECK-NEXT: vmovx.f16 s4, s0
; CHECK-NEXT: vins.f16 s4, s8
; CHECK-NEXT: vmovx.f16 s8, s3
; CHECK-NEXT: vmovx.f16 s5, s2
-; CHECK-NEXT: vmov.f32 s1, s2
+; CHECK-NEXT: vins.f16 s0, s1
+; CHECK-NEXT: vins.f16 s2, s3
; CHECK-NEXT: vins.f16 s5, s8
-; CHECK-NEXT: vins.f16 s1, s3
+; CHECK-NEXT: vmov.f32 s1, s2
; CHECK-NEXT: vadd.f16 q0, q0, q1
; CHECK-NEXT: vmov r2, s1
; CHECK-NEXT: vmov r0, s0
define void @vld2_v8f16_align1(<16 x half> *%src, <8 x half> *%dst) {
; CHECK-LABEL: vld2_v8f16_align1:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8}
-; CHECK-NEXT: vpush {d8}
-; CHECK-NEXT: vldrb.u8 q3, [r0]
-; CHECK-NEXT: vldrb.u8 q1, [r0, #16]
-; CHECK-NEXT: vmov.f64 d0, d6
-; CHECK-NEXT: vmovx.f16 s16, s13
-; CHECK-NEXT: vmovx.f16 s8, s12
-; CHECK-NEXT: vins.f16 s8, s16
-; CHECK-NEXT: vmovx.f16 s16, s15
-; CHECK-NEXT: vmovx.f16 s9, s14
-; CHECK-NEXT: vmovx.f16 s12, s5
-; CHECK-NEXT: vins.f16 s0, s13
-; CHECK-NEXT: vins.f16 s9, s16
-; CHECK-NEXT: vmov.f32 s1, s14
-; CHECK-NEXT: vmovx.f16 s10, s4
-; CHECK-NEXT: vins.f16 s1, s15
-; CHECK-NEXT: vins.f16 s10, s12
-; CHECK-NEXT: vmov.f32 s2, s4
-; CHECK-NEXT: vmovx.f16 s12, s7
-; CHECK-NEXT: vmovx.f16 s11, s6
-; CHECK-NEXT: vins.f16 s2, s5
-; CHECK-NEXT: vins.f16 s6, s7
-; CHECK-NEXT: vins.f16 s11, s12
-; CHECK-NEXT: vmov.f32 s3, s6
-; CHECK-NEXT: vadd.f16 q0, q0, q2
+; CHECK-NEXT: vldrb.u8 q0, [r0]
+; CHECK-NEXT: vmovx.f16 s8, s1
+; CHECK-NEXT: vmovx.f16 s4, s0
+; CHECK-NEXT: vins.f16 s4, s8
+; CHECK-NEXT: vmovx.f16 s8, s3
+; CHECK-NEXT: vmovx.f16 s5, s2
+; CHECK-NEXT: vins.f16 s0, s1
+; CHECK-NEXT: vins.f16 s5, s8
+; CHECK-NEXT: vldrb.u8 q2, [r0, #16]
+; CHECK-NEXT: vins.f16 s2, s3
+; CHECK-NEXT: vmovx.f16 s12, s9
+; CHECK-NEXT: vmovx.f16 s6, s8
+; CHECK-NEXT: vins.f16 s6, s12
+; CHECK-NEXT: vmovx.f16 s12, s11
+; CHECK-NEXT: vmovx.f16 s7, s10
+; CHECK-NEXT: vins.f16 s8, s9
+; CHECK-NEXT: vmov.f32 s1, s2
+; CHECK-NEXT: vins.f16 s10, s11
+; CHECK-NEXT: vmov.f32 s2, s8
+; CHECK-NEXT: vins.f16 s7, s12
+; CHECK-NEXT: vmov.f32 s3, s10
+; CHECK-NEXT: vadd.f16 q0, q0, q1
; CHECK-NEXT: vstrw.32 q0, [r1]
-; CHECK-NEXT: vpop {d8}
; CHECK-NEXT: bx lr
entry:
%l1 = load <16 x half>, <16 x half>* %src, align 1
define void @vld3_v8i16(<24 x i16> *%src, <8 x i16> *%dst) {
; CHECK-LABEL: vld3_v8i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vldrw.u32 q3, [r0]
-; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT: .vsave {d8, d9, d10, d11}
+; CHECK-NEXT: vpush {d8, d9, d10, d11}
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vmovx.f16 s8, s6
+; CHECK-NEXT: vmov.f32 s0, s5
+; CHECK-NEXT: vins.f16 s0, s8
; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT: vmov.u16 r0, q3[2]
-; CHECK-NEXT: vmov.16 q0[0], r0
-; CHECK-NEXT: vmov.u16 r0, q3[5]
-; CHECK-NEXT: vmov.16 q0[1], r0
-; CHECK-NEXT: vmov.u16 r0, q2[0]
-; CHECK-NEXT: vmov.16 q0[2], r0
-; CHECK-NEXT: vmov.u16 r0, q2[3]
-; CHECK-NEXT: vmov.16 q0[3], r0
-; CHECK-NEXT: vmov.u16 r0, q1[4]
-; CHECK-NEXT: vmov.16 q5[6], r0
-; CHECK-NEXT: vmov.u16 r0, q1[7]
-; CHECK-NEXT: vmov.16 q5[7], r0
-; CHECK-NEXT: vmov.u16 r0, q3[0]
-; CHECK-NEXT: vmov.16 q4[0], r0
-; CHECK-NEXT: vmov.u16 r0, q3[3]
-; CHECK-NEXT: vmov.f32 s22, s4
-; CHECK-NEXT: vmov.16 q4[1], r0
-; CHECK-NEXT: vmov.u16 r0, q3[6]
+; CHECK-NEXT: vmovx.f16 s12, s9
+; CHECK-NEXT: vmov.f32 s1, s8
+; CHECK-NEXT: vins.f16 s1, s12
+; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
; CHECK-NEXT: vmov.f32 s2, s11
-; CHECK-NEXT: vmov q6, q5
-; CHECK-NEXT: vmov.16 q4[2], r0
-; CHECK-NEXT: vmov.u16 r0, q2[1]
-; CHECK-NEXT: vmovnb.i32 q6, q0
-; CHECK-NEXT: vmov.16 q4[3], r0
-; CHECK-NEXT: vmov.u16 r0, q2[4]
-; CHECK-NEXT: vmov.f32 s2, s26
-; CHECK-NEXT: vmov.16 q4[4], r0
-; CHECK-NEXT: vmov.u16 r0, q1[2]
-; CHECK-NEXT: vmov.f32 s3, s23
-; CHECK-NEXT: vmov.16 q5[6], r0
-; CHECK-NEXT: vmov.u16 r0, q1[5]
-; CHECK-NEXT: vmov.16 q5[7], r0
-; CHECK-NEXT: vmov.u16 r0, q2[7]
-; CHECK-NEXT: vmov.16 q4[5], r0
; CHECK-NEXT: vmov.u16 r0, q2[5]
-; CHECK-NEXT: vmov.f32 s19, s23
-; CHECK-NEXT: vmovx.f16 s20, s12
-; CHECK-NEXT: vins.f16 s20, s14
-; CHECK-NEXT: vmovx.f16 s21, s15
+; CHECK-NEXT: vmovx.f16 s20, s15
+; CHECK-NEXT: vmov.f32 s19, s14
+; CHECK-NEXT: vins.f16 s19, s20
+; CHECK-NEXT: vmov.f32 s18, s12
+; CHECK-NEXT: vmov q5, q4
+; CHECK-NEXT: vmovnb.i32 q5, q0
+; CHECK-NEXT: vmov.f32 s2, s22
+; CHECK-NEXT: vmovx.f16 s20, s5
+; CHECK-NEXT: vmov.f32 s3, s19
+; CHECK-NEXT: vmov.f64 d8, d2
+; CHECK-NEXT: vins.f16 s16, s20
+; CHECK-NEXT: vmovx.f16 s20, s8
+; CHECK-NEXT: vmov.f32 s17, s7
+; CHECK-NEXT: vins.f16 s17, s20
+; CHECK-NEXT: vmovx.f16 s20, s11
+; CHECK-NEXT: vmov.f32 s18, s10
+; CHECK-NEXT: vins.f16 s18, s20
+; CHECK-NEXT: vmovx.f16 s20, s14
+; CHECK-NEXT: vmov.f32 s19, s13
+; CHECK-NEXT: vins.f16 s19, s20
+; CHECK-NEXT: vmovx.f16 s20, s4
+; CHECK-NEXT: vins.f16 s20, s6
+; CHECK-NEXT: vmovx.f16 s21, s7
+; CHECK-NEXT: vins.f16 s6, s12
+; CHECK-NEXT: vmovx.f16 s7, s13
; CHECK-NEXT: vins.f16 s21, s9
-; CHECK-NEXT: vins.f16 s10, s4
-; CHECK-NEXT: vmovx.f16 s11, s5
+; CHECK-NEXT: vins.f16 s7, s15
; CHECK-NEXT: vmov.16 q5[4], r0
-; CHECK-NEXT: vins.f16 s11, s7
-; CHECK-NEXT: vmov q1, q2
-; CHECK-NEXT: vmovnb.i32 q1, q5
-; CHECK-NEXT: vmov.f32 s22, s6
-; CHECK-NEXT: vmov.f32 s23, s11
+; CHECK-NEXT: vmov q2, q1
+; CHECK-NEXT: vmovnb.i32 q2, q5
+; CHECK-NEXT: vmov.f32 s22, s10
+; CHECK-NEXT: vmov.f32 s23, s7
; CHECK-NEXT: vadd.i16 q1, q4, q5
; CHECK-NEXT: vadd.i16 q0, q1, q0
; CHECK-NEXT: vstrw.32 q0, [r1]
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: bx lr
entry:
%l1 = load <24 x i16>, <24 x i16>* %src, align 4
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: .pad #16
; CHECK-NEXT: sub sp, #16
-; CHECK-NEXT: vldrw.u32 q2, [r0, #48]
-; CHECK-NEXT: vldrw.u32 q1, [r0, #64]
+; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT: vmovx.f16 s8, s6
+; CHECK-NEXT: vmov.f32 s0, s5
+; CHECK-NEXT: vins.f16 s0, s8
+; CHECK-NEXT: vldrw.u32 q2, [r0, #64]
+; CHECK-NEXT: vmovx.f16 s12, s9
+; CHECK-NEXT: vmov.f32 s1, s8
+; CHECK-NEXT: vins.f16 s1, s12
; CHECK-NEXT: vldrw.u32 q3, [r0, #80]
-; CHECK-NEXT: vmov.u16 r2, q2[0]
-; CHECK-NEXT: vmov.16 q0[0], r2
-; CHECK-NEXT: vmov.u16 r2, q2[3]
-; CHECK-NEXT: vmov.16 q0[1], r2
-; CHECK-NEXT: vmov.u16 r2, q2[6]
-; CHECK-NEXT: vmov.16 q0[2], r2
-; CHECK-NEXT: vmov.u16 r2, q1[1]
-; CHECK-NEXT: vmov.16 q0[3], r2
-; CHECK-NEXT: vmov.u16 r2, q1[4]
-; CHECK-NEXT: vmov.16 q0[4], r2
-; CHECK-NEXT: vmov.u16 r2, q3[2]
-; CHECK-NEXT: vmov.16 q4[6], r2
-; CHECK-NEXT: vmov.u16 r2, q3[5]
-; CHECK-NEXT: vmov.16 q4[7], r2
-; CHECK-NEXT: vmov.u16 r2, q1[7]
-; CHECK-NEXT: vmov.16 q0[5], r2
-; CHECK-NEXT: vmov.u16 r2, q3[4]
-; CHECK-NEXT: vmov.16 q5[6], r2
-; CHECK-NEXT: vmov.u16 r2, q2[2]
-; CHECK-NEXT: vmov.f32 s3, s19
-; CHECK-NEXT: vmov.16 q4[0], r2
+; CHECK-NEXT: vmov.f32 s2, s11
; CHECK-NEXT: vmov.u16 r2, q2[5]
-; CHECK-NEXT: vmov.16 q4[1], r2
-; CHECK-NEXT: vmov.u16 r2, q1[0]
-; CHECK-NEXT: vmov.16 q4[2], r2
-; CHECK-NEXT: vmov.u16 r2, q1[3]
-; CHECK-NEXT: vmov.16 q4[3], r2
-; CHECK-NEXT: vmov.u16 r2, q3[7]
-; CHECK-NEXT: vmov.16 q5[7], r2
-; CHECK-NEXT: vmov.f32 s18, s7
-; CHECK-NEXT: vmov.f32 s22, s12
-; CHECK-NEXT: vmov.u16 r2, q1[5]
-; CHECK-NEXT: vmov q6, q5
-; CHECK-NEXT: vmovnb.i32 q6, q4
-; CHECK-NEXT: vmov.f32 s18, s26
-; CHECK-NEXT: vmov.f32 s19, s23
-; CHECK-NEXT: vins.f16 s22, s12
-; CHECK-NEXT: vmovx.f16 s23, s13
-; CHECK-NEXT: vins.f16 s23, s15
-; CHECK-NEXT: vmovx.f16 s12, s8
-; CHECK-NEXT: vins.f16 s12, s10
-; CHECK-NEXT: vmovx.f16 s13, s11
-; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT: vins.f16 s13, s5
-; CHECK-NEXT: vmov q1, q5
-; CHECK-NEXT: vmov.16 q3[4], r2
-; CHECK-NEXT: vmovnb.i32 q1, q3
-; CHECK-NEXT: vmov.f32 s14, s6
-; CHECK-NEXT: vmov.f32 s15, s23
-; CHECK-NEXT: vldrw.u32 q5, [r0, #32]
-; CHECK-NEXT: vadd.i16 q0, q0, q3
-; CHECK-NEXT: vldrw.u32 q3, [r0]
-; CHECK-NEXT: vmov.u16 r0, q5[4]
-; CHECK-NEXT: vadd.i16 q0, q0, q4
-; CHECK-NEXT: vmov.u16 r2, q3[2]
-; CHECK-NEXT: vmov.16 q6[6], r0
-; CHECK-NEXT: vmov.16 q1[0], r2
-; CHECK-NEXT: vmov.u16 r2, q3[5]
-; CHECK-NEXT: vmov.16 q1[1], r2
-; CHECK-NEXT: vmov.u16 r2, q2[0]
-; CHECK-NEXT: vmov.u16 r0, q5[7]
-; CHECK-NEXT: vmov.16 q1[2], r2
-; CHECK-NEXT: vmov.16 q6[7], r0
-; CHECK-NEXT: vmov.u16 r2, q2[3]
-; CHECK-NEXT: vmov.16 q1[3], r2
-; CHECK-NEXT: vmov.f32 s26, s20
-; CHECK-NEXT: vmov.u16 r0, q3[0]
-; CHECK-NEXT: vmov.f32 s6, s11
+; CHECK-NEXT: vmovx.f16 s20, s15
+; CHECK-NEXT: vmov.f32 s19, s14
+; CHECK-NEXT: vins.f16 s19, s20
+; CHECK-NEXT: vmov.f32 s18, s12
+; CHECK-NEXT: vmov q5, q4
+; CHECK-NEXT: vmovnb.i32 q5, q0
+; CHECK-NEXT: vmov.f32 s2, s22
+; CHECK-NEXT: vmovx.f16 s20, s5
+; CHECK-NEXT: vmov.f32 s3, s19
+; CHECK-NEXT: vmov.f64 d8, d2
+; CHECK-NEXT: vins.f16 s16, s20
+; CHECK-NEXT: vmovx.f16 s20, s8
+; CHECK-NEXT: vmov.f32 s17, s7
+; CHECK-NEXT: vins.f16 s17, s20
+; CHECK-NEXT: vmovx.f16 s20, s11
+; CHECK-NEXT: vmov.f32 s18, s10
+; CHECK-NEXT: vins.f16 s18, s20
+; CHECK-NEXT: vmovx.f16 s20, s14
+; CHECK-NEXT: vmov.f32 s19, s13
+; CHECK-NEXT: vins.f16 s19, s20
+; CHECK-NEXT: vmovx.f16 s20, s4
+; CHECK-NEXT: vins.f16 s20, s6
+; CHECK-NEXT: vmovx.f16 s21, s7
+; CHECK-NEXT: vins.f16 s6, s12
+; CHECK-NEXT: vmovx.f16 s7, s13
+; CHECK-NEXT: vins.f16 s21, s9
+; CHECK-NEXT: vins.f16 s7, s15
+; CHECK-NEXT: vmov.16 q5[4], r2
+; CHECK-NEXT: vmov q2, q1
+; CHECK-NEXT: vmovnb.i32 q2, q5
+; CHECK-NEXT: vmov.f32 s22, s10
+; CHECK-NEXT: vldrw.u32 q2, [r0]
+; CHECK-NEXT: vmov.f32 s23, s7
+; CHECK-NEXT: vadd.i16 q1, q4, q5
+; CHECK-NEXT: vmovx.f16 s12, s10
+; CHECK-NEXT: vadd.i16 q0, q1, q0
+; CHECK-NEXT: vmov.f32 s4, s9
+; CHECK-NEXT: vins.f16 s4, s12
+; CHECK-NEXT: vldrw.u32 q3, [r0, #16]
+; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT: vmovx.f16 s0, s9
+; CHECK-NEXT: vmovx.f16 s16, s13
+; CHECK-NEXT: vmov.f32 s5, s12
+; CHECK-NEXT: vins.f16 s5, s16
+; CHECK-NEXT: vldrw.u32 q4, [r0, #32]
+; CHECK-NEXT: vmov.f32 s6, s15
+; CHECK-NEXT: vmov.u16 r0, q3[5]
+; CHECK-NEXT: vmovx.f16 s20, s19
+; CHECK-NEXT: vmov.f32 s27, s18
+; CHECK-NEXT: vins.f16 s27, s20
+; CHECK-NEXT: vmov.f64 d10, d4
+; CHECK-NEXT: vins.f16 s20, s0
+; CHECK-NEXT: vmov.f32 s26, s16
+; CHECK-NEXT: vmovx.f16 s0, s12
+; CHECK-NEXT: vmov.f32 s21, s11
+; CHECK-NEXT: vins.f16 s21, s0
; CHECK-NEXT: vmov q7, q6
-; CHECK-NEXT: vmov.16 q4[0], r0
-; CHECK-NEXT: vmov.u16 r0, q3[3]
; CHECK-NEXT: vmovnb.i32 q7, q1
-; CHECK-NEXT: vmov.16 q4[1], r0
-; CHECK-NEXT: vmov.u16 r0, q3[6]
+; CHECK-NEXT: vmovx.f16 s0, s15
+; CHECK-NEXT: vmov.f32 s22, s14
+; CHECK-NEXT: vins.f16 s22, s0
; CHECK-NEXT: vmov.f32 s6, s30
-; CHECK-NEXT: vmov.16 q4[2], r0
-; CHECK-NEXT: vmov.u16 r0, q2[1]
; CHECK-NEXT: vmov.f32 s7, s27
-; CHECK-NEXT: vins.f16 s26, s20
-; CHECK-NEXT: vmov.16 q4[3], r0
-; CHECK-NEXT: vmov.u16 r0, q2[4]
-; CHECK-NEXT: vmovx.f16 s27, s21
-; CHECK-NEXT: vmov.16 q4[4], r0
-; CHECK-NEXT: vmov.u16 r0, q5[2]
-; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
-; CHECK-NEXT: vmov.16 q0[6], r0
-; CHECK-NEXT: vmov.u16 r0, q5[5]
-; CHECK-NEXT: vins.f16 s27, s23
-; CHECK-NEXT: vmovx.f16 s20, s12
-; CHECK-NEXT: vmov.16 q0[7], r0
-; CHECK-NEXT: vins.f16 s20, s14
-; CHECK-NEXT: vmov.u16 r0, q2[7]
-; CHECK-NEXT: vmovx.f16 s21, s15
-; CHECK-NEXT: vmov.16 q4[5], r0
-; CHECK-NEXT: vins.f16 s21, s9
-; CHECK-NEXT: vmov.u16 r0, q2[5]
-; CHECK-NEXT: vmov.f32 s19, s3
-; CHECK-NEXT: vmov.16 q5[4], r0
-; CHECK-NEXT: vmov q0, q6
-; CHECK-NEXT: vmovnb.i32 q0, q5
-; CHECK-NEXT: vmov.f32 s22, s2
-; CHECK-NEXT: vmov.f32 s23, s27
-; CHECK-NEXT: vadd.i16 q0, q4, q5
+; CHECK-NEXT: vmovx.f16 s24, s8
+; CHECK-NEXT: vmovx.f16 s0, s18
+; CHECK-NEXT: vmov.f32 s23, s17
+; CHECK-NEXT: vins.f16 s24, s10
+; CHECK-NEXT: vins.f16 s23, s0
+; CHECK-NEXT: vins.f16 s2, s16
+; CHECK-NEXT: vmovx.f16 s25, s11
+; CHECK-NEXT: vmovx.f16 s3, s17
+; CHECK-NEXT: vins.f16 s25, s13
+; CHECK-NEXT: vins.f16 s3, s19
+; CHECK-NEXT: vmov.16 q6[4], r0
+; CHECK-NEXT: vmov q2, q0
+; CHECK-NEXT: vmovnb.i32 q2, q6
+; CHECK-NEXT: vmov.f32 s26, s10
+; CHECK-NEXT: vmov.f32 s27, s3
+; CHECK-NEXT: vadd.i16 q0, q5, q6
; CHECK-NEXT: vadd.i16 q0, q0, q1
; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload
; CHECK-NEXT: vstrw.32 q0, [r1]
define void @vld3_v8i8(<24 x i8> *%src, <8 x i8> *%dst) {
; CHECK-LABEL: vld3_v8i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: vldrb.u16 q0, [r0, #16]
-; CHECK-NEXT: vmov.u8 r0, q1[1]
-; CHECK-NEXT: vmov.16 q2[0], r0
-; CHECK-NEXT: vmov.u8 r0, q1[4]
-; CHECK-NEXT: vmov.16 q2[1], r0
-; CHECK-NEXT: vmov.u8 r0, q1[7]
-; CHECK-NEXT: vmov.16 q2[2], r0
-; CHECK-NEXT: vmov.u8 r0, q1[10]
-; CHECK-NEXT: vmov.16 q2[3], r0
-; CHECK-NEXT: vmov.u8 r0, q1[13]
-; CHECK-NEXT: vmov.16 q2[4], r0
-; CHECK-NEXT: vmov.u8 r0, q1[0]
+; CHECK-NEXT: .vsave {d8, d9}
+; CHECK-NEXT: vpush {d8, d9}
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vldrb.u16 q1, [r0, #16]
+; CHECK-NEXT: vmov.u8 r2, q0[1]
+; CHECK-NEXT: vmov.u8 r0, q0[0]
+; CHECK-NEXT: vmov.16 q2[0], r2
+; CHECK-NEXT: vmov.u8 r2, q0[4]
+; CHECK-NEXT: vmov.16 q2[1], r2
+; CHECK-NEXT: vmov.u8 r2, q0[7]
; CHECK-NEXT: vmov.16 q3[0], r0
-; CHECK-NEXT: vmov.u8 r0, q1[3]
+; CHECK-NEXT: vmov.u8 r0, q0[3]
+; CHECK-NEXT: vmov.16 q2[2], r2
+; CHECK-NEXT: vmov.u8 r2, q0[10]
; CHECK-NEXT: vmov.16 q3[1], r0
-; CHECK-NEXT: vmov.u8 r0, q1[6]
+; CHECK-NEXT: vmov.u8 r0, q0[6]
+; CHECK-NEXT: vmov.16 q2[3], r2
+; CHECK-NEXT: vmov.u8 r2, q0[13]
; CHECK-NEXT: vmov.16 q3[2], r0
-; CHECK-NEXT: vmov.u8 r0, q1[9]
+; CHECK-NEXT: vmov.u8 r0, q0[9]
+; CHECK-NEXT: vmov.16 q2[4], r2
; CHECK-NEXT: vmov.16 q3[3], r0
-; CHECK-NEXT: vmov.u8 r0, q1[12]
+; CHECK-NEXT: vmov.u8 r0, q0[12]
+; CHECK-NEXT: vins.f16 s10, s4
; CHECK-NEXT: vmov.16 q3[4], r0
-; CHECK-NEXT: vmov.u8 r0, q1[15]
-; CHECK-NEXT: vins.f16 s10, s0
+; CHECK-NEXT: vmov.u8 r0, q0[15]
+; CHECK-NEXT: vmovx.f16 s16, s6
+; CHECK-NEXT: vmov.f32 s18, s5
+; CHECK-NEXT: vmovx.f16 s11, s5
; CHECK-NEXT: vmov.16 q3[5], r0
-; CHECK-NEXT: vmov.u16 r0, q0[2]
-; CHECK-NEXT: vmovx.f16 s11, s1
-; CHECK-NEXT: vmov.16 q3[6], r0
-; CHECK-NEXT: vmov.u16 r0, q0[5]
-; CHECK-NEXT: vmov.16 q3[7], r0
-; CHECK-NEXT: vins.f16 s11, s3
-; CHECK-NEXT: vmov.u8 r0, q1[2]
+; CHECK-NEXT: vins.f16 s18, s16
+; CHECK-NEXT: vins.f16 s11, s7
+; CHECK-NEXT: vmov.f32 s15, s18
+; CHECK-NEXT: vmov.u8 r0, q0[2]
; CHECK-NEXT: vadd.i16 q2, q3, q2
; CHECK-NEXT: vmov.16 q3[0], r0
-; CHECK-NEXT: vmov.u8 r0, q1[5]
+; CHECK-NEXT: vmov.u8 r0, q0[5]
; CHECK-NEXT: vmov.16 q3[1], r0
-; CHECK-NEXT: vmov.u8 r0, q1[8]
+; CHECK-NEXT: vmov.u8 r0, q0[8]
; CHECK-NEXT: vmov.16 q3[2], r0
-; CHECK-NEXT: vmov.u8 r0, q1[11]
+; CHECK-NEXT: vmov.u8 r0, q0[11]
; CHECK-NEXT: vmov.16 q3[3], r0
-; CHECK-NEXT: vmov.u8 r0, q1[14]
+; CHECK-NEXT: vmov.u8 r0, q0[14]
; CHECK-NEXT: vmov.16 q3[4], r0
-; CHECK-NEXT: vmov.u16 r0, q0[1]
+; CHECK-NEXT: vmov.u16 r0, q1[1]
+; CHECK-NEXT: vmovx.f16 s0, s7
+; CHECK-NEXT: vmov.f32 s2, s6
+; CHECK-NEXT: vins.f16 s2, s0
; CHECK-NEXT: vmov.16 q3[5], r0
-; CHECK-NEXT: vmov.u16 r0, q0[4]
-; CHECK-NEXT: vmov.16 q3[6], r0
-; CHECK-NEXT: vmov.u16 r0, q0[7]
-; CHECK-NEXT: vmov.16 q3[7], r0
+; CHECK-NEXT: vmov.f32 s15, s2
; CHECK-NEXT: vadd.i16 q0, q2, q3
; CHECK-NEXT: vstrb.16 q0, [r1]
+; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
entry:
%l1 = load <24 x i8>, <24 x i8>* %src, align 4
; CHECK-NEXT: vmov.32 q0[0], r2
; CHECK-NEXT: vmov.32 q0[1], r3
; CHECK-NEXT: vmov.32 q0[2], r0
-; CHECK-NEXT: vmovx.f16 s4, s0
-; CHECK-NEXT: vmovx.f16 s8, s1
-; CHECK-NEXT: vins.f16 s4, s2
-; CHECK-NEXT: vins.f16 s0, s8
-; CHECK-NEXT: vmovx.f16 s8, s2
-; CHECK-NEXT: vadd.f16 q1, q0, q1
-; CHECK-NEXT: vins.f16 s1, s8
+; CHECK-NEXT: vmovx.f16 s8, s0
+; CHECK-NEXT: vmovx.f16 s4, s2
+; CHECK-NEXT: vins.f16 s8, s2
+; CHECK-NEXT: vmovx.f16 s6, s1
+; CHECK-NEXT: vins.f16 s1, s4
+; CHECK-NEXT: vins.f16 s0, s6
+; CHECK-NEXT: vadd.f16 q1, q0, q2
; CHECK-NEXT: vmov.f32 s0, s1
; CHECK-NEXT: vadd.f16 q0, q1, q0
; CHECK-NEXT: vmov r0, s0
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
-; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: ldrd r2, r0, [r0, #16]
-; CHECK-NEXT: vmovx.f16 s4, s0
-; CHECK-NEXT: vmovx.f16 s8, s1
-; CHECK-NEXT: vins.f16 s4, s2
-; CHECK-NEXT: vins.f16 s0, s8
+; CHECK-NEXT: ldrd r2, r3, [r0, #16]
; CHECK-NEXT: vmov.32 q2[0], r2
-; CHECK-NEXT: vmov.f32 s14, s3
-; CHECK-NEXT: vmov.32 q2[1], r0
-; CHECK-NEXT: vmovx.f16 s5, s3
-; CHECK-NEXT: vmovx.f16 s12, s8
-; CHECK-NEXT: vmov q4, q0
-; CHECK-NEXT: vins.f16 s14, s12
-; CHECK-NEXT: vmovx.f16 s12, s2
-; CHECK-NEXT: vins.f16 s1, s12
-; CHECK-NEXT: vmovx.f16 s12, s9
-; CHECK-NEXT: vins.f16 s5, s9
-; CHECK-NEXT: vmov.f32 s17, s14
-; CHECK-NEXT: vmov.f32 s0, s1
-; CHECK-NEXT: vins.f16 s8, s12
-; CHECK-NEXT: vadd.f16 q1, q4, q1
+; CHECK-NEXT: vmov.32 q2[1], r3
; CHECK-NEXT: vmov.f32 s1, s8
+; CHECK-NEXT: vmovx.f16 s4, s9
+; CHECK-NEXT: vins.f16 s1, s4
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vmovx.f16 s8, s8
+; CHECK-NEXT: vmovx.f16 s12, s4
+; CHECK-NEXT: vmovx.f16 s16, s6
+; CHECK-NEXT: vins.f16 s12, s6
+; CHECK-NEXT: vmovx.f16 s18, s5
+; CHECK-NEXT: vins.f16 s5, s16
+; CHECK-NEXT: vins.f16 s4, s18
+; CHECK-NEXT: vmovx.f16 s13, s7
+; CHECK-NEXT: vins.f16 s7, s8
+; CHECK-NEXT: vmov.f32 s0, s5
+; CHECK-NEXT: vins.f16 s13, s9
+; CHECK-NEXT: vmov.f32 s5, s7
+; CHECK-NEXT: vadd.f16 q1, q1, q3
; CHECK-NEXT: vadd.f16 q0, q1, q0
; CHECK-NEXT: vmov r2, s1
; CHECK-NEXT: vmov r0, s0
define void @vld3_v8f16(<24 x half> *%src, <8 x half> *%dst) {
; CHECK-LABEL: vld3_v8f16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vldrw.u32 q3, [r0]
-; CHECK-NEXT: vldrw.u32 q4, [r0, #32]
-; CHECK-NEXT: vmovx.f16 s4, s14
-; CHECK-NEXT: vmov.f32 s0, s13
-; CHECK-NEXT: vins.f16 s0, s4
-; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vldrw.u32 q4, [r0, #16]
+; CHECK-NEXT: vmovx.f16 s8, s2
+; CHECK-NEXT: vmov.f32 s4, s1
+; CHECK-NEXT: vins.f16 s4, s8
+; CHECK-NEXT: vmovx.f16 s8, s17
+; CHECK-NEXT: vmov.f32 s5, s16
+; CHECK-NEXT: vmovx.f16 s24, s1
+; CHECK-NEXT: vins.f16 s5, s8
+; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT: vmov.f32 s6, s19
+; CHECK-NEXT: vmovx.f16 s26, s16
+; CHECK-NEXT: vmovx.f16 s20, s11
+; CHECK-NEXT: vmov.f32 s15, s10
+; CHECK-NEXT: vins.f16 s15, s20
+; CHECK-NEXT: vmovx.f16 s20, s8
+; CHECK-NEXT: vins.f16 s6, s20
; CHECK-NEXT: vmovx.f16 s20, s19
-; CHECK-NEXT: vmovx.f16 s24, s13
-; CHECK-NEXT: vmovx.f16 s8, s5
-; CHECK-NEXT: vmov.f32 s1, s4
-; CHECK-NEXT: vins.f16 s1, s8
-; CHECK-NEXT: vmov.f32 s11, s18
-; CHECK-NEXT: vins.f16 s11, s20
-; CHECK-NEXT: vmov.f32 s2, s7
-; CHECK-NEXT: vmovx.f16 s20, s16
-; CHECK-NEXT: vmov.f32 s10, s16
-; CHECK-NEXT: vins.f16 s2, s20
-; CHECK-NEXT: vmov.f64 d10, d6
-; CHECK-NEXT: vins.f16 s20, s24
-; CHECK-NEXT: vmovx.f16 s24, s4
-; CHECK-NEXT: vmov.f32 s21, s15
-; CHECK-NEXT: vins.f16 s21, s24
-; CHECK-NEXT: vmovx.f16 s24, s7
-; CHECK-NEXT: vmov.f32 s22, s6
-; CHECK-NEXT: vins.f16 s22, s24
-; CHECK-NEXT: vmovx.f16 s24, s12
-; CHECK-NEXT: vins.f16 s24, s14
-; CHECK-NEXT: vmov.f32 s3, s11
-; CHECK-NEXT: vmovx.f16 s25, s15
-; CHECK-NEXT: vmovx.f16 s12, s18
-; CHECK-NEXT: vins.f16 s25, s5
-; CHECK-NEXT: vmovx.f16 s27, s17
-; CHECK-NEXT: vins.f16 s17, s12
-; CHECK-NEXT: vins.f16 s27, s19
-; CHECK-NEXT: vmov.f32 s23, s17
-; CHECK-NEXT: vmovx.f16 s26, s6
-; CHECK-NEXT: vins.f16 s26, s16
-; CHECK-NEXT: vadd.f16 q1, q5, q6
-; CHECK-NEXT: vadd.f16 q0, q1, q0
+; CHECK-NEXT: vmov.f32 s28, s18
+; CHECK-NEXT: vmovx.f16 s30, s10
+; CHECK-NEXT: vins.f16 s28, s20
+; CHECK-NEXT: vmovx.f16 s20, s0
+; CHECK-NEXT: vins.f16 s0, s24
+; CHECK-NEXT: vins.f16 s20, s2
+; CHECK-NEXT: vmovx.f16 s21, s3
+; CHECK-NEXT: vins.f16 s3, s26
+; CHECK-NEXT: vins.f16 s21, s17
+; CHECK-NEXT: vmov.f32 s14, s8
+; CHECK-NEXT: vmovx.f16 s23, s9
+; CHECK-NEXT: vmov.f32 s1, s3
+; CHECK-NEXT: vins.f16 s9, s30
+; CHECK-NEXT: vins.f16 s23, s11
+; CHECK-NEXT: vmovx.f16 s22, s18
+; CHECK-NEXT: vmov.f32 s2, s28
+; CHECK-NEXT: vins.f16 s22, s8
+; CHECK-NEXT: vmov.f32 s3, s9
+; CHECK-NEXT: vmov.f32 s7, s15
+; CHECK-NEXT: vadd.f16 q0, q0, q5
+; CHECK-NEXT: vadd.f16 q0, q0, q1
; CHECK-NEXT: vstrw.32 q0, [r1]
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: bx lr
entry:
%l1 = load <24 x half>, <24 x half>* %src, align 4
define void @vld3_v16f16(<48 x half> *%src, <16 x half> *%dst) {
; CHECK-LABEL: vld3_v16f16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vldrw.u32 q3, [r0, #48]
-; CHECK-NEXT: vldrw.u32 q4, [r0, #80]
-; CHECK-NEXT: vmovx.f16 s4, s14
-; CHECK-NEXT: vmov.f32 s0, s13
-; CHECK-NEXT: vins.f16 s0, s4
-; CHECK-NEXT: vldrw.u32 q1, [r0, #64]
-; CHECK-NEXT: vmovx.f16 s20, s19
-; CHECK-NEXT: vmovx.f16 s24, s13
-; CHECK-NEXT: vmovx.f16 s8, s5
-; CHECK-NEXT: vmov.f32 s1, s4
-; CHECK-NEXT: vins.f16 s1, s8
-; CHECK-NEXT: vmov.f32 s11, s18
-; CHECK-NEXT: vins.f16 s11, s20
-; CHECK-NEXT: vmov.f32 s2, s7
-; CHECK-NEXT: vmovx.f16 s20, s16
-; CHECK-NEXT: vmov.f32 s10, s16
-; CHECK-NEXT: vins.f16 s2, s20
-; CHECK-NEXT: vmov.f64 d10, d6
-; CHECK-NEXT: vins.f16 s20, s24
-; CHECK-NEXT: vmovx.f16 s24, s4
-; CHECK-NEXT: vmov.f32 s21, s15
-; CHECK-NEXT: vins.f16 s21, s24
-; CHECK-NEXT: vmovx.f16 s24, s7
-; CHECK-NEXT: vmov.f32 s22, s6
-; CHECK-NEXT: vins.f16 s22, s24
-; CHECK-NEXT: vmovx.f16 s24, s12
-; CHECK-NEXT: vins.f16 s24, s14
-; CHECK-NEXT: vmov.f32 s3, s11
-; CHECK-NEXT: vmovx.f16 s25, s15
-; CHECK-NEXT: vmovx.f16 s12, s18
-; CHECK-NEXT: vins.f16 s25, s5
-; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT: vmovx.f16 s27, s17
-; CHECK-NEXT: vins.f16 s17, s12
-; CHECK-NEXT: vins.f16 s27, s19
-; CHECK-NEXT: vmov.f32 s23, s17
-; CHECK-NEXT: vmovx.f16 s26, s6
-; CHECK-NEXT: vins.f16 s26, s16
-; CHECK-NEXT: vldrw.u32 q4, [r0]
-; CHECK-NEXT: vadd.f16 q1, q5, q6
-; CHECK-NEXT: vadd.f16 q0, q1, q0
-; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
-; CHECK-NEXT: vstrw.32 q0, [r1, #16]
-; CHECK-NEXT: vmovx.f16 s12, s18
-; CHECK-NEXT: vmov.f32 s0, s17
-; CHECK-NEXT: vmovx.f16 s20, s7
-; CHECK-NEXT: vins.f16 s0, s12
-; CHECK-NEXT: vmovx.f16 s12, s9
-; CHECK-NEXT: vmov.f32 s1, s8
-; CHECK-NEXT: vmovx.f16 s24, s17
-; CHECK-NEXT: vins.f16 s1, s12
-; CHECK-NEXT: vmov.f32 s15, s6
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
+; CHECK-NEXT: vldrw.u32 q4, [r0, #64]
+; CHECK-NEXT: vmovx.f16 s8, s2
+; CHECK-NEXT: vmov.f32 s4, s1
+; CHECK-NEXT: vins.f16 s4, s8
+; CHECK-NEXT: vmovx.f16 s8, s17
+; CHECK-NEXT: vmov.f32 s5, s16
+; CHECK-NEXT: vmovx.f16 s24, s1
+; CHECK-NEXT: vins.f16 s5, s8
+; CHECK-NEXT: vldrw.u32 q2, [r0, #80]
+; CHECK-NEXT: vmov.f32 s6, s19
+; CHECK-NEXT: vmovx.f16 s26, s16
+; CHECK-NEXT: vmovx.f16 s20, s11
+; CHECK-NEXT: vmov.f32 s15, s10
; CHECK-NEXT: vins.f16 s15, s20
-; CHECK-NEXT: vmov.f32 s2, s11
-; CHECK-NEXT: vmovx.f16 s20, s4
-; CHECK-NEXT: vmov.f32 s14, s4
-; CHECK-NEXT: vins.f16 s2, s20
-; CHECK-NEXT: vmov.f64 d10, d8
-; CHECK-NEXT: vins.f16 s20, s24
-; CHECK-NEXT: vmovx.f16 s24, s8
-; CHECK-NEXT: vmov.f32 s21, s19
-; CHECK-NEXT: vins.f16 s21, s24
-; CHECK-NEXT: vmovx.f16 s24, s11
-; CHECK-NEXT: vmov.f32 s22, s10
-; CHECK-NEXT: vins.f16 s22, s24
-; CHECK-NEXT: vmovx.f16 s24, s16
-; CHECK-NEXT: vins.f16 s24, s18
-; CHECK-NEXT: vmov.f32 s3, s15
-; CHECK-NEXT: vmovx.f16 s25, s19
-; CHECK-NEXT: vmovx.f16 s16, s6
-; CHECK-NEXT: vins.f16 s25, s9
-; CHECK-NEXT: vmovx.f16 s27, s5
+; CHECK-NEXT: vmovx.f16 s20, s8
+; CHECK-NEXT: vins.f16 s6, s20
+; CHECK-NEXT: vmovx.f16 s20, s19
+; CHECK-NEXT: vmov.f32 s28, s18
+; CHECK-NEXT: vmovx.f16 s30, s10
+; CHECK-NEXT: vins.f16 s28, s20
+; CHECK-NEXT: vmovx.f16 s20, s0
+; CHECK-NEXT: vins.f16 s0, s24
+; CHECK-NEXT: vins.f16 s20, s2
+; CHECK-NEXT: vmovx.f16 s21, s3
+; CHECK-NEXT: vins.f16 s3, s26
+; CHECK-NEXT: vins.f16 s21, s17
+; CHECK-NEXT: vmov.f32 s14, s8
+; CHECK-NEXT: vmovx.f16 s23, s9
+; CHECK-NEXT: vmov.f32 s1, s3
+; CHECK-NEXT: vins.f16 s9, s30
+; CHECK-NEXT: vins.f16 s23, s11
+; CHECK-NEXT: vmovx.f16 s22, s18
+; CHECK-NEXT: vmov.f32 s2, s28
+; CHECK-NEXT: vins.f16 s22, s8
+; CHECK-NEXT: vmov.f32 s3, s9
+; CHECK-NEXT: vmov.f32 s7, s15
+; CHECK-NEXT: vadd.f16 q0, q0, q5
+; CHECK-NEXT: vadd.f16 q1, q0, q1
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT: vldrw.u32 q3, [r0, #16]
+; CHECK-NEXT: vstrw.32 q1, [r1, #16]
+; CHECK-NEXT: vmovx.f16 s16, s2
+; CHECK-NEXT: vmov.f32 s4, s1
+; CHECK-NEXT: vmovx.f16 s20, s11
+; CHECK-NEXT: vins.f16 s4, s16
+; CHECK-NEXT: vmovx.f16 s16, s13
+; CHECK-NEXT: vmov.f32 s5, s12
+; CHECK-NEXT: vmovx.f16 s24, s1
; CHECK-NEXT: vins.f16 s5, s16
-; CHECK-NEXT: vins.f16 s27, s7
-; CHECK-NEXT: vmov.f32 s23, s5
-; CHECK-NEXT: vmovx.f16 s26, s10
-; CHECK-NEXT: vins.f16 s26, s4
-; CHECK-NEXT: vadd.f16 q1, q5, q6
-; CHECK-NEXT: vadd.f16 q0, q1, q0
+; CHECK-NEXT: vmov.f32 s19, s10
+; CHECK-NEXT: vins.f16 s19, s20
+; CHECK-NEXT: vmov.f32 s6, s15
+; CHECK-NEXT: vmovx.f16 s20, s8
+; CHECK-NEXT: vmov.f32 s28, s14
+; CHECK-NEXT: vins.f16 s6, s20
+; CHECK-NEXT: vmovx.f16 s20, s15
+; CHECK-NEXT: vins.f16 s28, s20
+; CHECK-NEXT: vmovx.f16 s20, s0
+; CHECK-NEXT: vins.f16 s0, s24
+; CHECK-NEXT: vins.f16 s20, s2
+; CHECK-NEXT: vmovx.f16 s21, s3
+; CHECK-NEXT: vmovx.f16 s26, s12
+; CHECK-NEXT: vins.f16 s21, s13
+; CHECK-NEXT: vins.f16 s3, s26
+; CHECK-NEXT: vmovx.f16 s30, s10
+; CHECK-NEXT: vmovx.f16 s23, s9
+; CHECK-NEXT: vmov.f32 s18, s8
+; CHECK-NEXT: vins.f16 s9, s30
+; CHECK-NEXT: vins.f16 s23, s11
+; CHECK-NEXT: vmov.f32 s1, s3
+; CHECK-NEXT: vmovx.f16 s22, s14
+; CHECK-NEXT: vmov.f32 s2, s28
+; CHECK-NEXT: vins.f16 s22, s8
+; CHECK-NEXT: vmov.f32 s3, s9
+; CHECK-NEXT: vmov.f32 s7, s19
+; CHECK-NEXT: vadd.f16 q0, q0, q5
+; CHECK-NEXT: vadd.f16 q0, q0, q1
; CHECK-NEXT: vstrw.32 q0, [r1]
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: bx lr
entry:
%l1 = load <48 x half>, <48 x half>* %src, align 4
define void @vld4_v8i16_align1(<32 x i16> *%src, <8 x i16> *%dst) {
; CHECK-LABEL: vld4_v8i16_align1:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vldrb.u8 q3, [r0]
-; CHECK-NEXT: vldrb.u8 q0, [r0, #32]
-; CHECK-NEXT: vldrb.u8 q1, [r0, #48]
-; CHECK-NEXT: vldrb.u8 q2, [r0, #16]
-; CHECK-NEXT: vmov.u16 r0, q3[3]
-; CHECK-NEXT: vmov.f32 s18, s1
-; CHECK-NEXT: vmov.16 q5[0], r0
-; CHECK-NEXT: vmov.u16 r0, q3[7]
-; CHECK-NEXT: vmov.16 q5[1], r0
-; CHECK-NEXT: vmov.u16 r0, q2[3]
-; CHECK-NEXT: vmov.16 q5[2], r0
-; CHECK-NEXT: vmov.u16 r0, q0[3]
-; CHECK-NEXT: vins.f16 s18, s3
-; CHECK-NEXT: vmov.16 q6[4], r0
-; CHECK-NEXT: vmov.u16 r0, q0[7]
-; CHECK-NEXT: vmov.f32 s19, s5
-; CHECK-NEXT: vmov.16 q6[5], r0
-; CHECK-NEXT: vmov.u16 r0, q1[3]
-; CHECK-NEXT: vins.f16 s19, s7
-; CHECK-NEXT: vmov.16 q6[6], r0
-; CHECK-NEXT: vmov.u16 r0, q1[7]
-; CHECK-NEXT: vmov.f32 s16, s13
-; CHECK-NEXT: vmov.16 q6[7], r0
-; CHECK-NEXT: vmov.u16 r0, q2[7]
-; CHECK-NEXT: vmov.16 q5[3], r0
-; CHECK-NEXT: vins.f16 s16, s15
-; CHECK-NEXT: vmov.f32 s17, s9
-; CHECK-NEXT: vmov.u16 r0, q3[1]
-; CHECK-NEXT: vmov.f32 s22, s26
-; CHECK-NEXT: vins.f16 s17, s11
-; CHECK-NEXT: vmov.f32 s23, s27
-; CHECK-NEXT: vmov.16 q6[0], r0
-; CHECK-NEXT: vadd.i16 q4, q4, q5
-; CHECK-NEXT: vmov.f64 d11, d0
-; CHECK-NEXT: vmov.u16 r0, q3[5]
-; CHECK-NEXT: vmov.16 q6[1], r0
-; CHECK-NEXT: vmov.u16 r0, q2[1]
-; CHECK-NEXT: vmov.16 q6[2], r0
-; CHECK-NEXT: vmov.u16 r0, q0[1]
-; CHECK-NEXT: vins.f16 s22, s2
-; CHECK-NEXT: vmov.f32 s23, s4
-; CHECK-NEXT: vins.f16 s23, s6
-; CHECK-NEXT: vmov.f32 s20, s12
-; CHECK-NEXT: vins.f16 s20, s14
-; CHECK-NEXT: vmov.16 q3[4], r0
-; CHECK-NEXT: vmov.u16 r0, q0[5]
-; CHECK-NEXT: vmov.f32 s21, s8
-; CHECK-NEXT: vmov.16 q3[5], r0
-; CHECK-NEXT: vmov.u16 r0, q1[1]
-; CHECK-NEXT: vmov.16 q3[6], r0
-; CHECK-NEXT: vmov.u16 r0, q1[5]
-; CHECK-NEXT: vmov.16 q3[7], r0
-; CHECK-NEXT: vmov.u16 r0, q2[5]
-; CHECK-NEXT: vmov.16 q6[3], r0
-; CHECK-NEXT: vins.f16 s21, s10
-; CHECK-NEXT: vmov.f32 s26, s14
-; CHECK-NEXT: vmov.f32 s27, s15
-; CHECK-NEXT: vadd.i16 q0, q5, q6
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12}
+; CHECK-NEXT: vldrb.u8 q1, [r0, #32]
+; CHECK-NEXT: vldrb.u8 q2, [r0, #48]
+; CHECK-NEXT: vmovx.f16 s0, s7
+; CHECK-NEXT: vmovx.f16 s18, s5
+; CHECK-NEXT: vins.f16 s18, s0
+; CHECK-NEXT: vmovx.f16 s0, s11
+; CHECK-NEXT: vmovx.f16 s19, s9
+; CHECK-NEXT: vins.f16 s5, s7
+; CHECK-NEXT: vins.f16 s19, s0
+; CHECK-NEXT: vldrb.u8 q0, [r0]
+; CHECK-NEXT: vins.f16 s9, s11
+; CHECK-NEXT: vmovx.f16 s24, s6
+; CHECK-NEXT: vmovx.f16 s12, s3
+; CHECK-NEXT: vmovx.f16 s16, s1
+; CHECK-NEXT: vins.f16 s16, s12
+; CHECK-NEXT: vldrb.u8 q3, [r0, #16]
+; CHECK-NEXT: vins.f16 s1, s3
+; CHECK-NEXT: vmovx.f16 s20, s15
+; CHECK-NEXT: vmovx.f16 s17, s13
+; CHECK-NEXT: vins.f16 s17, s20
+; CHECK-NEXT: vmov.f32 s22, s5
+; CHECK-NEXT: vmov.f32 s23, s9
+; CHECK-NEXT: vins.f16 s13, s15
+; CHECK-NEXT: vmov.f32 s20, s1
+; CHECK-NEXT: vmov.f32 s21, s13
+; CHECK-NEXT: vadd.i16 q4, q5, q4
+; CHECK-NEXT: vmovx.f16 s22, s4
+; CHECK-NEXT: vins.f16 s22, s24
+; CHECK-NEXT: vins.f16 s4, s6
+; CHECK-NEXT: vmovx.f16 s24, s10
+; CHECK-NEXT: vmovx.f16 s23, s8
+; CHECK-NEXT: vins.f16 s8, s10
+; CHECK-NEXT: vmov.f32 s6, s4
+; CHECK-NEXT: vmov.f32 s7, s8
+; CHECK-NEXT: vins.f16 s23, s24
+; CHECK-NEXT: vmovx.f16 s24, s2
+; CHECK-NEXT: vmovx.f16 s20, s0
+; CHECK-NEXT: vins.f16 s20, s24
+; CHECK-NEXT: vmovx.f16 s24, s14
+; CHECK-NEXT: vmovx.f16 s21, s12
+; CHECK-NEXT: vins.f16 s0, s2
+; CHECK-NEXT: vins.f16 s12, s14
+; CHECK-NEXT: vins.f16 s21, s24
+; CHECK-NEXT: vmov.f32 s1, s12
+; CHECK-NEXT: vmov.f32 s2, s6
+; CHECK-NEXT: vmov.f32 s3, s7
+; CHECK-NEXT: vadd.i16 q0, q0, q5
; CHECK-NEXT: vadd.i16 q0, q0, q4
; CHECK-NEXT: vstrw.32 q0, [r1]
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12}
; CHECK-NEXT: bx lr
entry:
%l1 = load <32 x i16>, <32 x i16>* %src, align 1
; CHECK-NEXT: vmovx.f16 s8, s1
; CHECK-NEXT: vins.f16 s1, s3
; CHECK-NEXT: vins.f16 s8, s4
-; CHECK-NEXT: vmov.f32 s4, s1
+; CHECK-NEXT: vmovx.f16 s4, s2
; CHECK-NEXT: vmovx.f16 s12, s0
-; CHECK-NEXT: vadd.f16 q1, q1, q2
-; CHECK-NEXT: vmovx.f16 s8, s2
-; CHECK-NEXT: vins.f16 s12, s8
+; CHECK-NEXT: vins.f16 s12, s4
; CHECK-NEXT: vins.f16 s0, s2
+; CHECK-NEXT: vmov.f32 s4, s1
; CHECK-NEXT: vadd.f16 q0, q0, q3
+; CHECK-NEXT: vadd.f16 q1, q1, q2
; CHECK-NEXT: vadd.f16 q0, q0, q1
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: str r0, [r1]
define void @vld4_v4f16(<16 x half> *%src, <4 x half> *%dst) {
; CHECK-LABEL: vld4_v4f16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8}
-; CHECK-NEXT: vpush {d8}
+; CHECK-NEXT: .vsave {d8, d9}
+; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vldrh.u16 q0, [r0]
-; CHECK-NEXT: vmovx.f16 s8, s1
-; CHECK-NEXT: vins.f16 s1, s3
; CHECK-NEXT: vmovx.f16 s4, s3
-; CHECK-NEXT: vmov.f32 s12, s1
+; CHECK-NEXT: vmovx.f16 s8, s1
; CHECK-NEXT: vins.f16 s8, s4
; CHECK-NEXT: vldrh.u16 q1, [r0, #16]
-; CHECK-NEXT: vmovx.f16 s16, s7
-; CHECK-NEXT: vmovx.f16 s9, s5
-; CHECK-NEXT: vins.f16 s5, s7
-; CHECK-NEXT: vins.f16 s9, s16
-; CHECK-NEXT: vmov.f32 s13, s5
+; CHECK-NEXT: vins.f16 s1, s3
; CHECK-NEXT: vmovx.f16 s16, s2
-; CHECK-NEXT: vadd.f16 q2, q3, q2
+; CHECK-NEXT: vmovx.f16 s12, s7
+; CHECK-NEXT: vmovx.f16 s9, s5
+; CHECK-NEXT: vins.f16 s9, s12
; CHECK-NEXT: vmovx.f16 s12, s0
; CHECK-NEXT: vins.f16 s12, s16
-; CHECK-NEXT: vins.f16 s0, s2
+; CHECK-NEXT: vins.f16 s5, s7
; CHECK-NEXT: vmovx.f16 s16, s6
; CHECK-NEXT: vmovx.f16 s13, s4
-; CHECK-NEXT: vins.f16 s4, s6
; CHECK-NEXT: vins.f16 s13, s16
+; CHECK-NEXT: vins.f16 s0, s2
+; CHECK-NEXT: vins.f16 s4, s6
+; CHECK-NEXT: vmov.f32 s16, s1
; CHECK-NEXT: vmov.f32 s1, s4
+; CHECK-NEXT: vmov.f32 s17, s5
; CHECK-NEXT: vadd.f16 q0, q0, q3
+; CHECK-NEXT: vadd.f16 q2, q4, q2
; CHECK-NEXT: vadd.f16 q0, q0, q2
; CHECK-NEXT: vmov r2, s1
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: strd r0, r2, [r1]
-; CHECK-NEXT: vpop {d8}
+; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
entry:
%l1 = load <16 x half>, <16 x half>* %src, align 2
define void @vld4_v8f16_align1(<32 x half> *%src, <8 x half> *%dst) {
; CHECK-LABEL: vld4_v8f16_align1:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14}
-; CHECK-NEXT: vldrb.u8 q2, [r0, #32]
-; CHECK-NEXT: vldrb.u8 q3, [r0, #48]
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vldrb.u8 q0, [r0, #32]
+; CHECK-NEXT: vldrb.u8 q2, [r0, #48]
+; CHECK-NEXT: vmovx.f16 s4, s3
+; CHECK-NEXT: vmovx.f16 s18, s1
+; CHECK-NEXT: vins.f16 s18, s4
; CHECK-NEXT: vmovx.f16 s4, s11
-; CHECK-NEXT: vmovx.f16 s2, s9
-; CHECK-NEXT: vins.f16 s2, s4
-; CHECK-NEXT: vmovx.f16 s4, s15
-; CHECK-NEXT: vmovx.f16 s3, s13
-; CHECK-NEXT: vins.f16 s9, s11
-; CHECK-NEXT: vins.f16 s3, s4
+; CHECK-NEXT: vmovx.f16 s19, s9
+; CHECK-NEXT: vins.f16 s1, s3
+; CHECK-NEXT: vins.f16 s19, s4
; CHECK-NEXT: vldrb.u8 q1, [r0]
-; CHECK-NEXT: vmovx.f16 s28, s10
-; CHECK-NEXT: vmovx.f16 s26, s8
-; CHECK-NEXT: vmovx.f16 s16, s7
-; CHECK-NEXT: vmovx.f16 s0, s5
-; CHECK-NEXT: vins.f16 s0, s16
-; CHECK-NEXT: vldrb.u8 q4, [r0, #16]
+; CHECK-NEXT: vmovx.f16 s24, s2
+; CHECK-NEXT: vins.f16 s9, s11
+; CHECK-NEXT: vmovx.f16 s12, s7
+; CHECK-NEXT: vmovx.f16 s16, s5
+; CHECK-NEXT: vins.f16 s16, s12
+; CHECK-NEXT: vldrb.u8 q3, [r0, #16]
+; CHECK-NEXT: vins.f16 s5, s7
+; CHECK-NEXT: vmovx.f16 s20, s15
+; CHECK-NEXT: vmovx.f16 s17, s13
+; CHECK-NEXT: vins.f16 s17, s20
+; CHECK-NEXT: vmovx.f16 s22, s0
+; CHECK-NEXT: vins.f16 s22, s24
+; CHECK-NEXT: vmovx.f16 s24, s10
+; CHECK-NEXT: vmovx.f16 s23, s8
; CHECK-NEXT: vins.f16 s13, s15
-; CHECK-NEXT: vins.f16 s26, s28
-; CHECK-NEXT: vmovx.f16 s20, s19
-; CHECK-NEXT: vmovx.f16 s1, s17
-; CHECK-NEXT: vins.f16 s1, s20
-; CHECK-NEXT: vmov.f32 s22, s9
+; CHECK-NEXT: vins.f16 s23, s24
+; CHECK-NEXT: vmovx.f16 s24, s6
+; CHECK-NEXT: vmovx.f16 s20, s4
+; CHECK-NEXT: vins.f16 s0, s2
+; CHECK-NEXT: vins.f16 s20, s24
+; CHECK-NEXT: vmovx.f16 s24, s14
+; CHECK-NEXT: vmovx.f16 s21, s12
; CHECK-NEXT: vins.f16 s8, s10
-; CHECK-NEXT: vmov.f32 s23, s13
-; CHECK-NEXT: vmovx.f16 s28, s14
-; CHECK-NEXT: vmovx.f16 s27, s12
-; CHECK-NEXT: vmov.f32 s10, s8
-; CHECK-NEXT: vins.f16 s12, s14
-; CHECK-NEXT: vmov.f32 s11, s12
-; CHECK-NEXT: vins.f16 s27, s28
-; CHECK-NEXT: vins.f16 s5, s7
-; CHECK-NEXT: vmovx.f16 s28, s6
-; CHECK-NEXT: vmovx.f16 s24, s4
-; CHECK-NEXT: vmov.f32 s20, s5
-; CHECK-NEXT: vins.f16 s17, s19
-; CHECK-NEXT: vins.f16 s24, s28
-; CHECK-NEXT: vmov.f32 s21, s17
-; CHECK-NEXT: vmovx.f16 s28, s18
-; CHECK-NEXT: vmovx.f16 s25, s16
+; CHECK-NEXT: vins.f16 s21, s24
+; CHECK-NEXT: vmov.f32 s26, s1
; CHECK-NEXT: vins.f16 s4, s6
-; CHECK-NEXT: vins.f16 s16, s18
-; CHECK-NEXT: vins.f16 s25, s28
-; CHECK-NEXT: vmov.f32 s5, s16
-; CHECK-NEXT: vadd.f16 q0, q5, q0
-; CHECK-NEXT: vmov.f32 s6, s10
-; CHECK-NEXT: vmov.f32 s7, s11
-; CHECK-NEXT: vadd.f16 q1, q1, q6
-; CHECK-NEXT: vadd.f16 q0, q1, q0
+; CHECK-NEXT: vmov.f32 s27, s9
+; CHECK-NEXT: vmov.f32 s24, s5
+; CHECK-NEXT: vins.f16 s12, s14
+; CHECK-NEXT: vmov.f32 s6, s0
+; CHECK-NEXT: vmov.f32 s7, s8
+; CHECK-NEXT: vmov.f32 s25, s13
+; CHECK-NEXT: vmov.f32 s5, s12
+; CHECK-NEXT: vadd.f16 q4, q6, q4
+; CHECK-NEXT: vadd.f16 q0, q1, q5
+; CHECK-NEXT: vadd.f16 q0, q0, q4
; CHECK-NEXT: vstrw.32 q0, [r1]
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14}
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: bx lr
entry:
%l1 = load <32 x half>, <32 x half>* %src, align 1
; CHECK-NEXT: add.w lr, r3, r12, lsr #3
; CHECK-NEXT: .LBB0_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrh.u16 q3, [r0, #32]
+; CHECK-NEXT: vldrh.u16 q4, [r0, #32]
; CHECK-NEXT: vldrh.u16 q5, [r0, #48]
-; CHECK-NEXT: vldrh.u16 q4, [r0], #64
-; CHECK-NEXT: vmovx.f16 s4, s15
-; CHECK-NEXT: vmovx.f16 s2, s13
-; CHECK-NEXT: vins.f16 s2, s4
-; CHECK-NEXT: vmovx.f16 s4, s23
-; CHECK-NEXT: vmovx.f16 s3, s21
-; CHECK-NEXT: vldrh.u16 q7, [r0, #-48]
-; CHECK-NEXT: vins.f16 s3, s4
-; CHECK-NEXT: vmovx.f16 s4, s19
-; CHECK-NEXT: vmovx.f16 s0, s17
-; CHECK-NEXT: vins.f16 s13, s15
-; CHECK-NEXT: vins.f16 s0, s4
-; CHECK-NEXT: vmovx.f16 s4, s31
-; CHECK-NEXT: vmovx.f16 s1, s29
-; CHECK-NEXT: vins.f16 s21, s23
-; CHECK-NEXT: vins.f16 s1, s4
-; CHECK-NEXT: vins.f16 s17, s19
-; CHECK-NEXT: vmul.f16 q1, q0, r2
-; CHECK-NEXT: vmov.f32 s2, s13
+; CHECK-NEXT: vldrh.u16 q3, [r0], #64
+; CHECK-NEXT: vmov.f32 s2, s17
+; CHECK-NEXT: vmovx.f16 s8, s19
+; CHECK-NEXT: vldrh.u16 q6, [r0, #-48]
+; CHECK-NEXT: vins.f16 s2, s19
; CHECK-NEXT: vmov.f32 s3, s21
-; CHECK-NEXT: vins.f16 s29, s31
-; CHECK-NEXT: vmov.f32 s0, s17
-; CHECK-NEXT: vmov.f32 s1, s29
-; CHECK-NEXT: vmul.f16 q2, q0, r2
-; CHECK-NEXT: vmovx.f16 s0, s4
-; CHECK-NEXT: vmovx.f16 s27, s8
-; CHECK-NEXT: vins.f16 s8, s4
-; CHECK-NEXT: vmov.f32 s25, s8
-; CHECK-NEXT: vmovx.f16 s4, s7
-; CHECK-NEXT: vins.f16 s27, s0
-; CHECK-NEXT: vmovx.f16 s2, s12
-; CHECK-NEXT: vstrw.32 q6, [sp] @ 16-byte Spill
-; CHECK-NEXT: vmovx.f16 s24, s14
-; CHECK-NEXT: vins.f16 s2, s24
-; CHECK-NEXT: vmovx.f16 s24, s22
-; CHECK-NEXT: vmovx.f16 s3, s20
-; CHECK-NEXT: vins.f16 s12, s14
-; CHECK-NEXT: vins.f16 s3, s24
-; CHECK-NEXT: vmovx.f16 s24, s18
-; CHECK-NEXT: vmovx.f16 s0, s16
-; CHECK-NEXT: vins.f16 s20, s22
-; CHECK-NEXT: vins.f16 s0, s24
-; CHECK-NEXT: vmovx.f16 s24, s30
-; CHECK-NEXT: vmovx.f16 s1, s28
+; CHECK-NEXT: vmovx.f16 s5, s25
+; CHECK-NEXT: vins.f16 s3, s23
+; CHECK-NEXT: vmovx.f16 s6, s17
+; CHECK-NEXT: vmov.f32 s0, s13
+; CHECK-NEXT: vins.f16 s6, s8
+; CHECK-NEXT: vmovx.f16 s8, s23
+; CHECK-NEXT: vmovx.f16 s7, s21
+; CHECK-NEXT: vins.f16 s0, s15
+; CHECK-NEXT: vins.f16 s7, s8
+; CHECK-NEXT: vmovx.f16 s8, s15
+; CHECK-NEXT: vmovx.f16 s4, s13
+; CHECK-NEXT: vins.f16 s25, s27
+; CHECK-NEXT: vins.f16 s4, s8
+; CHECK-NEXT: vmovx.f16 s8, s27
+; CHECK-NEXT: vins.f16 s5, s8
+; CHECK-NEXT: vmov.f32 s1, s25
+; CHECK-NEXT: vmul.f16 q2, q1, r2
+; CHECK-NEXT: vmul.f16 q0, q0, r2
+; CHECK-NEXT: vmovx.f16 s7, s0
+; CHECK-NEXT: vmovx.f16 s28, s8
+; CHECK-NEXT: vins.f16 s7, s28
+; CHECK-NEXT: vmovx.f16 s30, s16
+; CHECK-NEXT: vmovx.f16 s31, s20
+; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill
+; CHECK-NEXT: vmovx.f16 s28, s12
; CHECK-NEXT: vins.f16 s16, s18
-; CHECK-NEXT: vins.f16 s1, s24
-; CHECK-NEXT: vins.f16 s28, s30
-; CHECK-NEXT: vmul.f16 q6, q0, r2
-; CHECK-NEXT: vmov.f32 s2, s12
-; CHECK-NEXT: vmov.f32 s3, s20
-; CHECK-NEXT: vmov.f32 s17, s28
-; CHECK-NEXT: vmov.f32 s18, s2
-; CHECK-NEXT: vmov.f32 s19, s3
-; CHECK-NEXT: vmovx.f16 s2, s24
-; CHECK-NEXT: vmul.f16 q5, q4, r2
-; CHECK-NEXT: vmovx.f16 s0, s20
-; CHECK-NEXT: vins.f16 s20, s24
-; CHECK-NEXT: vins.f16 s0, s2
-; CHECK-NEXT: vmov q3, q5
-; CHECK-NEXT: vmov.f32 s14, s0
-; CHECK-NEXT: vmovx.f16 s0, s9
-; CHECK-NEXT: vins.f16 s9, s5
-; CHECK-NEXT: vmovx.f16 s2, s5
-; CHECK-NEXT: vins.f16 s0, s2
-; CHECK-NEXT: vmov q4, q2
-; CHECK-NEXT: vmov.f32 s19, s0
-; CHECK-NEXT: vmovx.f16 s31, s10
-; CHECK-NEXT: vmovx.f16 s18, s21
-; CHECK-NEXT: vins.f16 s21, s25
-; CHECK-NEXT: vins.f16 s10, s6
-; CHECK-NEXT: vmov.f32 s16, s21
-; CHECK-NEXT: vmovx.f16 s0, s25
-; CHECK-NEXT: vmov.f32 s29, s10
-; CHECK-NEXT: vins.f16 s18, s0
-; CHECK-NEXT: vmovx.f16 s0, s6
-; CHECK-NEXT: vins.f16 s31, s0
-; CHECK-NEXT: vmovx.f16 s0, s26
-; CHECK-NEXT: vmovx.f16 s30, s22
-; CHECK-NEXT: vins.f16 s22, s26
-; CHECK-NEXT: vmov.f32 s28, s22
-; CHECK-NEXT: vins.f16 s30, s0
-; CHECK-NEXT: vmovx.f16 s3, s11
-; CHECK-NEXT: vins.f16 s11, s7
-; CHECK-NEXT: vstrh.16 q7, [r1, #32]
-; CHECK-NEXT: vmov.f32 s1, s11
-; CHECK-NEXT: vins.f16 s3, s4
-; CHECK-NEXT: vmovx.f16 s4, s27
-; CHECK-NEXT: vmovx.f16 s2, s23
-; CHECK-NEXT: vins.f16 s23, s27
-; CHECK-NEXT: vmov.f32 s0, s23
-; CHECK-NEXT: vins.f16 s2, s4
-; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload
-; CHECK-NEXT: vstrh.16 q0, [r1, #48]
-; CHECK-NEXT: vmov.f32 s13, s5
-; CHECK-NEXT: vmov.f32 s15, s7
-; CHECK-NEXT: vstrh.16 q3, [r1], #64
+; CHECK-NEXT: vmovx.f16 s29, s24
+; CHECK-NEXT: vmovx.f16 s4, s18
+; CHECK-NEXT: vins.f16 s20, s22
+; CHECK-NEXT: vins.f16 s30, s4
+; CHECK-NEXT: vmovx.f16 s4, s22
+; CHECK-NEXT: vins.f16 s12, s14
+; CHECK-NEXT: vins.f16 s31, s4
+; CHECK-NEXT: vmovx.f16 s4, s14
+; CHECK-NEXT: vmov.f32 s14, s16
+; CHECK-NEXT: vins.f16 s24, s26
+; CHECK-NEXT: vmov.f32 s15, s20
+; CHECK-NEXT: vins.f16 s28, s4
+; CHECK-NEXT: vmovx.f16 s4, s26
+; CHECK-NEXT: vmov.f32 s13, s24
+; CHECK-NEXT: vins.f16 s29, s4
+; CHECK-NEXT: vmul.f16 q3, q3, r2
+; CHECK-NEXT: vmul.f16 q7, q7, r2
+; CHECK-NEXT: vmovx.f16 s4, s12
+; CHECK-NEXT: vmovx.f16 s6, s28
+; CHECK-NEXT: vins.f16 s0, s8
+; CHECK-NEXT: vins.f16 s4, s6
+; CHECK-NEXT: vmovx.f16 s6, s1
+; CHECK-NEXT: vmovx.f16 s5, s9
+; CHECK-NEXT: vins.f16 s12, s28
+; CHECK-NEXT: vins.f16 s6, s5
+; CHECK-NEXT: vmovx.f16 s18, s13
+; CHECK-NEXT: vmovx.f16 s5, s29
+; CHECK-NEXT: vins.f16 s1, s9
+; CHECK-NEXT: vins.f16 s18, s5
+; CHECK-NEXT: vmovx.f16 s23, s2
+; CHECK-NEXT: vmovx.f16 s5, s10
+; CHECK-NEXT: vins.f16 s2, s10
+; CHECK-NEXT: vins.f16 s23, s5
+; CHECK-NEXT: vins.f16 s13, s29
+; CHECK-NEXT: vmovx.f16 s27, s3
+; CHECK-NEXT: vmovx.f16 s8, s11
+; CHECK-NEXT: vmovx.f16 s22, s14
+; CHECK-NEXT: vins.f16 s27, s8
+; CHECK-NEXT: vins.f16 s14, s30
+; CHECK-NEXT: vmovx.f16 s26, s15
+; CHECK-NEXT: vins.f16 s15, s31
+; CHECK-NEXT: vmovx.f16 s8, s31
+; CHECK-NEXT: vins.f16 s3, s11
+; CHECK-NEXT: vins.f16 s26, s8
+; CHECK-NEXT: vmov q2, q3
+; CHECK-NEXT: vmovx.f16 s5, s30
+; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s10, s4
+; CHECK-NEXT: vmov.f32 s29, s0
+; CHECK-NEXT: vins.f16 s22, s5
+; CHECK-NEXT: vmov.f32 s9, s0
+; CHECK-NEXT: vmov.f32 s11, s31
+; CHECK-NEXT: vmov q7, q0
+; CHECK-NEXT: vmov.f32 s31, s6
+; CHECK-NEXT: vmov.f32 s16, s13
+; CHECK-NEXT: vmov.f32 s21, s2
+; CHECK-NEXT: vmov.f32 s25, s3
+; CHECK-NEXT: vmov.f32 s17, s29
+; CHECK-NEXT: vmov.f32 s20, s14
+; CHECK-NEXT: vmov.f32 s24, s15
+; CHECK-NEXT: vstrh.16 q5, [r1, #32]
+; CHECK-NEXT: vstrh.16 q6, [r1, #48]
+; CHECK-NEXT: vstrh.16 q2, [r1], #64
+; CHECK-NEXT: vmov.f32 s19, s31
; CHECK-NEXT: vstrh.16 q4, [r1, #-48]
; CHECK-NEXT: le lr, .LBB0_2
; CHECK-NEXT: .LBB0_3: @ %while.end
define void @vst2_v8i16_align1(<8 x i16> *%src, <16 x i16> *%dst) {
; CHECK-LABEL: vst2_v8i16_align1:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT: vmov.f64 d4, d3
-; CHECK-NEXT: vmov.u16 r0, q1[5]
-; CHECK-NEXT: vins.f16 s8, s2
-; CHECK-NEXT: vmov.f64 d6, d2
-; CHECK-NEXT: vmov.16 q2[2], r0
-; CHECK-NEXT: vmov.u16 r0, q0[5]
-; CHECK-NEXT: vmov.16 q2[3], r0
-; CHECK-NEXT: vmov.u16 r0, q1[7]
-; CHECK-NEXT: vmov.f32 s10, s7
-; CHECK-NEXT: vins.f16 s12, s0
-; CHECK-NEXT: vins.f16 s10, s3
-; CHECK-NEXT: vmov.16 q2[6], r0
-; CHECK-NEXT: vmov.u16 r0, q0[7]
-; CHECK-NEXT: vmov.16 q2[7], r0
-; CHECK-NEXT: vmov.u16 r0, q1[1]
-; CHECK-NEXT: vmov.16 q3[2], r0
-; CHECK-NEXT: vmov.u16 r0, q0[1]
-; CHECK-NEXT: vmov.16 q3[3], r0
-; CHECK-NEXT: vmov.u16 r0, q1[3]
-; CHECK-NEXT: vmov.f32 s14, s5
-; CHECK-NEXT: vstrb.8 q2, [r1, #16]
-; CHECK-NEXT: vins.f16 s14, s1
-; CHECK-NEXT: vmov.16 q3[6], r0
-; CHECK-NEXT: vmov.u16 r0, q0[3]
-; CHECK-NEXT: vmov.16 q3[7], r0
-; CHECK-NEXT: vstrb.8 q3, [r1]
+; CHECK-NEXT: .vsave {d8, d9}
+; CHECK-NEXT: vpush {d8, d9}
+; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT: vldrw.u32 q2, [r0]
+; CHECK-NEXT: vmovx.f16 s1, s10
+; CHECK-NEXT: vmovx.f16 s12, s6
+; CHECK-NEXT: vins.f16 s1, s12
+; CHECK-NEXT: vins.f16 s10, s6
+; CHECK-NEXT: vmov.f32 s0, s10
+; CHECK-NEXT: vmovx.f16 s12, s7
+; CHECK-NEXT: vmovx.f16 s3, s11
+; CHECK-NEXT: vins.f16 s11, s7
+; CHECK-NEXT: vmov.f32 s2, s11
+; CHECK-NEXT: vmovx.f16 s14, s4
+; CHECK-NEXT: vins.f16 s3, s12
+; CHECK-NEXT: vmovx.f16 s12, s8
+; CHECK-NEXT: vins.f16 s8, s4
+; CHECK-NEXT: vins.f16 s12, s14
+; CHECK-NEXT: vmov q4, q2
+; CHECK-NEXT: vmovx.f16 s4, s5
+; CHECK-NEXT: vmov.f32 s17, s12
+; CHECK-NEXT: vstrb.8 q0, [r1, #16]
+; CHECK-NEXT: vmovx.f16 s19, s9
+; CHECK-NEXT: vins.f16 s9, s5
+; CHECK-NEXT: vmov.f32 s18, s9
+; CHECK-NEXT: vins.f16 s19, s4
+; CHECK-NEXT: vstrb.8 q4, [r1]
+; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
entry:
%s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0
; CHECK-NEXT: vins.f16 s4, s0
; CHECK-NEXT: vmovx.f16 s0, s0
; CHECK-NEXT: vins.f16 s5, s0
+; CHECK-NEXT: vmov r0, s5
+; CHECK-NEXT: str r0, [r1, #4]
; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov r2, s5
; CHECK-NEXT: str r0, [r1]
-; CHECK-NEXT: str r2, [r1, #4]
; CHECK-NEXT: bx lr
entry:
%s1 = getelementptr <2 x half>, <2 x half>* %src, i32 0
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldrd r2, r12, [r0]
; CHECK-NEXT: ldrd r3, r0, [r0, #8]
-; CHECK-NEXT: vmov.32 q1[0], r2
-; CHECK-NEXT: vmov.32 q0[0], r3
-; CHECK-NEXT: vmov.32 q1[1], r12
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: vmovx.f16 s12, s4
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: vmov.32 q1[0], r3
+; CHECK-NEXT: vmov.32 q0[1], r12
+; CHECK-NEXT: vmov.32 q1[1], r0
; CHECK-NEXT: vmovx.f16 s8, s0
-; CHECK-NEXT: vins.f16 s4, s0
-; CHECK-NEXT: vins.f16 s12, s8
-; CHECK-NEXT: vmov q2, q1
-; CHECK-NEXT: vmov.f32 s9, s12
-; CHECK-NEXT: vmovx.f16 s0, s1
-; CHECK-NEXT: vmovx.f16 s11, s5
-; CHECK-NEXT: vins.f16 s5, s1
-; CHECK-NEXT: vmov.f32 s10, s5
-; CHECK-NEXT: vins.f16 s11, s0
-; CHECK-NEXT: vstrh.16 q2, [r1]
+; CHECK-NEXT: vmovx.f16 s10, s4
+; CHECK-NEXT: vins.f16 s0, s4
+; CHECK-NEXT: vins.f16 s8, s10
+; CHECK-NEXT: vmovx.f16 s10, s1
+; CHECK-NEXT: vmovx.f16 s4, s5
+; CHECK-NEXT: vins.f16 s1, s5
+; CHECK-NEXT: vins.f16 s10, s4
+; CHECK-NEXT: vmov q1, q0
+; CHECK-NEXT: vmov.f32 s5, s8
+; CHECK-NEXT: vmov.f32 s6, s1
+; CHECK-NEXT: vmov.f32 s7, s10
+; CHECK-NEXT: vstrh.16 q1, [r1]
; CHECK-NEXT: bx lr
entry:
%s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0
define void @vst2_v8f16_align1(<8 x half> *%src, <16 x half> *%dst) {
; CHECK-LABEL: vst2_v8f16_align1:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: vmovx.f16 s9, s6
-; CHECK-NEXT: vins.f16 s6, s2
-; CHECK-NEXT: vmov.f32 s8, s6
-; CHECK-NEXT: vmovx.f16 s12, s2
-; CHECK-NEXT: vins.f16 s9, s12
-; CHECK-NEXT: vmovx.f16 s12, s3
-; CHECK-NEXT: vmovx.f16 s11, s7
-; CHECK-NEXT: vins.f16 s7, s3
-; CHECK-NEXT: vmov.f32 s10, s7
-; CHECK-NEXT: vins.f16 s11, s12
+; CHECK-NEXT: vmovx.f16 s1, s6
+; CHECK-NEXT: vmovx.f16 s12, s10
+; CHECK-NEXT: vins.f16 s1, s12
+; CHECK-NEXT: vins.f16 s6, s10
+; CHECK-NEXT: vmovx.f16 s3, s7
+; CHECK-NEXT: vmovx.f16 s12, s11
+; CHECK-NEXT: vins.f16 s7, s11
+; CHECK-NEXT: vins.f16 s3, s12
; CHECK-NEXT: vmovx.f16 s12, s4
-; CHECK-NEXT: vstrb.8 q2, [r1, #16]
-; CHECK-NEXT: vmovx.f16 s8, s0
-; CHECK-NEXT: vins.f16 s4, s0
-; CHECK-NEXT: vins.f16 s12, s8
+; CHECK-NEXT: vmovx.f16 s14, s8
+; CHECK-NEXT: vins.f16 s4, s8
+; CHECK-NEXT: vins.f16 s12, s14
+; CHECK-NEXT: vmovx.f16 s14, s5
+; CHECK-NEXT: vins.f16 s5, s9
+; CHECK-NEXT: vmovx.f16 s8, s9
+; CHECK-NEXT: vmov.f32 s0, s6
+; CHECK-NEXT: vins.f16 s14, s8
; CHECK-NEXT: vmov q2, q1
-; CHECK-NEXT: vmovx.f16 s0, s1
; CHECK-NEXT: vmov.f32 s9, s12
-; CHECK-NEXT: vmovx.f16 s11, s5
-; CHECK-NEXT: vins.f16 s5, s1
; CHECK-NEXT: vmov.f32 s10, s5
-; CHECK-NEXT: vins.f16 s11, s0
+; CHECK-NEXT: vmov.f32 s2, s7
+; CHECK-NEXT: vmov.f32 s11, s14
+; CHECK-NEXT: vstrb.8 q0, [r1, #16]
; CHECK-NEXT: vstrb.8 q2, [r1]
; CHECK-NEXT: bx lr
entry:
define void @vst3_v8i16(<8 x i16> *%src, <24 x i16> *%dst) {
; CHECK-LABEL: vst3_v8i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vldrw.u32 q2, [r0]
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12}
+; CHECK-NEXT: vldrw.u32 q3, [r0]
; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
-; CHECK-NEXT: vmov.f64 d0, d4
+; CHECK-NEXT: vmov.f64 d0, d6
; CHECK-NEXT: vmov.u16 r2, q1[1]
+; CHECK-NEXT: vmovx.f16 s20, s12
; CHECK-NEXT: vins.f16 s0, s4
-; CHECK-NEXT: vmov.f32 s17, s12
+; CHECK-NEXT: vmov.f32 s8, s13
+; CHECK-NEXT: vins.f16 s8, s5
; CHECK-NEXT: vmov.16 q0[4], r2
-; CHECK-NEXT: vmov.f32 s18, s12
-; CHECK-NEXT: vmov.f32 s3, s9
-; CHECK-NEXT: vmov.u16 r0, q4[2]
-; CHECK-NEXT: vins.f16 s3, s5
-; CHECK-NEXT: vmov.16 q5[2], r0
-; CHECK-NEXT: vmov.f32 s1, s8
-; CHECK-NEXT: vmov.u16 r2, q0[3]
-; CHECK-NEXT: vmov.u16 r0, q0[4]
-; CHECK-NEXT: vmov.16 q5[3], r2
-; CHECK-NEXT: vmov.16 q5[4], r0
-; CHECK-NEXT: vmov.u16 r0, q4[5]
-; CHECK-NEXT: vmov.16 q5[5], r0
-; CHECK-NEXT: vmov.u16 r0, q1[5]
-; CHECK-NEXT: vmov.16 q4[0], r0
-; CHECK-NEXT: vmov.u16 r0, q3[5]
-; CHECK-NEXT: vmov.16 q4[1], r0
-; CHECK-NEXT: vmov.f32 s1, s21
+; CHECK-NEXT: vmov.f32 s3, s8
+; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT: vmov.f32 s1, s12
+; CHECK-NEXT: vmov.f32 s17, s8
+; CHECK-NEXT: vmov.f32 s18, s8
+; CHECK-NEXT: vins.f16 s17, s20
+; CHECK-NEXT: vmovx.f16 s20, s18
+; CHECK-NEXT: vins.f16 s2, s20
+; CHECK-NEXT: vmovx.f16 s20, s10
+; CHECK-NEXT: vmov.f32 s18, s2
+; CHECK-NEXT: vmov.f32 s1, s17
+; CHECK-NEXT: vmov.f32 s2, s18
+; CHECK-NEXT: vmovx.f16 s16, s6
+; CHECK-NEXT: vins.f16 s16, s20
+; CHECK-NEXT: vmovx.f16 s20, s11
; CHECK-NEXT: vins.f16 s17, s7
-; CHECK-NEXT: vmov.u16 r0, q1[7]
-; CHECK-NEXT: vmov.f32 s2, s22
-; CHECK-NEXT: vmov.16 q4[6], r0
-; CHECK-NEXT: vmov.f32 s21, s11
-; CHECK-NEXT: vmov.u16 r0, q3[7]
-; CHECK-NEXT: vmov.16 q4[7], r0
-; CHECK-NEXT: vmov.f32 s22, s11
-; CHECK-NEXT: vmov.f32 s18, s15
-; CHECK-NEXT: vmov.u16 r2, q5[2]
-; CHECK-NEXT: vmov.u16 r0, q4[3]
-; CHECK-NEXT: vmov.16 q6[2], r2
-; CHECK-NEXT: vmov.16 q6[3], r0
-; CHECK-NEXT: vmov.u16 r0, q4[4]
-; CHECK-NEXT: vmov.16 q6[4], r0
-; CHECK-NEXT: vmov.u16 r0, q5[5]
-; CHECK-NEXT: vmov.16 q6[5], r0
-; CHECK-NEXT: vmov.u16 r0, q3[2]
-; CHECK-NEXT: vmov.16 q5[0], r0
-; CHECK-NEXT: vmov.u16 r0, q2[3]
-; CHECK-NEXT: vmov.16 q5[1], r0
-; CHECK-NEXT: vmov.u16 r0, q3[4]
-; CHECK-NEXT: vmov.16 q5[6], r0
-; CHECK-NEXT: vmov.u16 r0, q2[5]
-; CHECK-NEXT: vmov.16 q5[7], r0
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: vmovx.f16 s19, s7
; CHECK-NEXT: vrev32.16 q1, q1
-; CHECK-NEXT: vmov.f32 s21, s13
-; CHECK-NEXT: vmov.u16 r2, q1[2]
-; CHECK-NEXT: vmov.f32 s22, s10
-; CHECK-NEXT: vmov.16 q2[2], r2
-; CHECK-NEXT: vmov.u16 r0, q5[3]
-; CHECK-NEXT: vmov.f32 s17, s25
-; CHECK-NEXT: vmov.16 q2[3], r0
-; CHECK-NEXT: vmov.u16 r0, q5[4]
-; CHECK-NEXT: vmov.16 q2[4], r0
-; CHECK-NEXT: vmov.u16 r0, q1[5]
-; CHECK-NEXT: vmov.16 q2[5], r0
-; CHECK-NEXT: vmov.f32 s18, s26
-; CHECK-NEXT: vmov.f32 s21, s9
+; CHECK-NEXT: vins.f16 s19, s20
+; CHECK-NEXT: vmov.f32 s21, s15
+; CHECK-NEXT: vmov.f32 s18, s11
+; CHECK-NEXT: vmovx.f16 s24, s17
+; CHECK-NEXT: vmov.f32 s22, s15
+; CHECK-NEXT: vins.f16 s21, s24
+; CHECK-NEXT: vmovx.f16 s24, s22
+; CHECK-NEXT: vins.f16 s18, s24
+; CHECK-NEXT: vmov.f32 s8, s9
+; CHECK-NEXT: vmov.f32 s22, s18
+; CHECK-NEXT: vmov.f32 s17, s21
+; CHECK-NEXT: vmov.f32 s18, s22
+; CHECK-NEXT: vmovx.f16 s20, s13
+; CHECK-NEXT: vins.f16 s8, s20
+; CHECK-NEXT: vmovx.f16 s20, s14
+; CHECK-NEXT: vins.f16 s10, s20
; CHECK-NEXT: vstrw.32 q4, [r1, #32]
-; CHECK-NEXT: vmov.f32 s22, s10
-; CHECK-NEXT: vstrw.32 q0, [r1]
-; CHECK-NEXT: vstrw.32 q5, [r1, #16]
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vmov.f32 s11, s10
+; CHECK-NEXT: vmov.f32 s10, s14
+; CHECK-NEXT: vmovx.f16 s12, s9
+; CHECK-NEXT: vins.f16 s5, s12
+; CHECK-NEXT: vmovx.f16 s12, s6
+; CHECK-NEXT: vins.f16 s10, s12
+; CHECK-NEXT: vmov.f32 s6, s10
+; CHECK-NEXT: vmov.f32 s9, s5
+; CHECK-NEXT: vmov.f32 s10, s6
+; CHECK-NEXT: vstrw.32 q2, [r1, #16]
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12}
; CHECK-NEXT: bx lr
entry:
%s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #112
-; CHECK-NEXT: sub sp, #112
-; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
-; CHECK-NEXT: vldrw.u32 q7, [r0, #64]
-; CHECK-NEXT: vmov.f64 d12, d2
-; CHECK-NEXT: vmov.u16 r2, q0[1]
-; CHECK-NEXT: vmov q2, q1
-; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill
-; CHECK-NEXT: vstrw.32 q7, [sp] @ 16-byte Spill
-; CHECK-NEXT: vins.f16 s24, s0
-; CHECK-NEXT: vmov.16 q6[4], r2
-; CHECK-NEXT: vmov.f32 s27, s5
-; CHECK-NEXT: vldrw.u32 q1, [r0, #80]
-; CHECK-NEXT: vins.f16 s27, s1
-; CHECK-NEXT: vmov.f32 s13, s4
-; CHECK-NEXT: vstrw.32 q1, [sp, #80] @ 16-byte Spill
-; CHECK-NEXT: vmov.f32 s14, s4
-; CHECK-NEXT: vmov.f32 s25, s8
-; CHECK-NEXT: vmov.u16 r3, q3[2]
-; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT: vmov.u16 r2, q6[3]
-; CHECK-NEXT: vmov.16 q3[2], r3
-; CHECK-NEXT: vmov.16 q3[3], r2
-; CHECK-NEXT: vmov.u16 r2, q6[4]
-; CHECK-NEXT: vmov.16 q3[4], r2
-; CHECK-NEXT: vmov.u16 r2, q0[5]
-; CHECK-NEXT: vmov.16 q5[0], r2
-; CHECK-NEXT: vmov.u16 r2, q1[5]
-; CHECK-NEXT: vmov.16 q5[1], r2
-; CHECK-NEXT: vmov.u16 r2, q0[7]
-; CHECK-NEXT: vins.f16 s21, s3
-; CHECK-NEXT: vmov q0, q2
-; CHECK-NEXT: vmov.f32 s9, s3
-; CHECK-NEXT: vmov.16 q5[6], r2
-; CHECK-NEXT: vmov.u16 r2, q1[7]
-; CHECK-NEXT: vmov.f32 s10, s3
-; CHECK-NEXT: vmov.16 q5[7], r2
-; CHECK-NEXT: vmov.u16 r3, q2[2]
-; CHECK-NEXT: vmov.f32 s22, s7
-; CHECK-NEXT: vmov.16 q1[2], r3
-; CHECK-NEXT: vmov.u16 r2, q5[3]
-; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
-; CHECK-NEXT: vmov.16 q1[3], r2
-; CHECK-NEXT: vmov.u16 r2, q5[4]
-; CHECK-NEXT: vmov.16 q1[4], r2
-; CHECK-NEXT: vmov.u16 r2, q2[5]
-; CHECK-NEXT: vldrw.u32 q2, [r0]
-; CHECK-NEXT: vmov.16 q1[5], r2
-; CHECK-NEXT: vmov.u16 r2, q0[1]
-; CHECK-NEXT: vmov.f32 s21, s5
-; CHECK-NEXT: vmov.f64 d8, d4
-; CHECK-NEXT: vstrw.32 q2, [sp, #96] @ 16-byte Spill
-; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT: vins.f16 s16, s0
-; CHECK-NEXT: vmov.f32 s22, s6
-; CHECK-NEXT: vmov.16 q4[4], r2
-; CHECK-NEXT: vstrw.32 q5, [r1, #80]
-; CHECK-NEXT: vmov.f32 s19, s9
-; CHECK-NEXT: vins.f16 s19, s1
-; CHECK-NEXT: vmov.f32 s17, s8
-; CHECK-NEXT: vmov.f32 s9, s28
-; CHECK-NEXT: vmov.u16 r0, q4[3]
-; CHECK-NEXT: vmov.f32 s10, s28
-; CHECK-NEXT: vmov.u16 r2, q2[2]
-; CHECK-NEXT: vmov.16 q1[2], r2
-; CHECK-NEXT: vmov.16 q1[3], r0
-; CHECK-NEXT: vmov.u16 r0, q4[4]
-; CHECK-NEXT: vmov.16 q1[4], r0
-; CHECK-NEXT: vmov.u16 r0, q2[5]
-; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: vmov.16 q1[5], r0
-; CHECK-NEXT: vmov.f32 s17, s5
-; CHECK-NEXT: vmov.u16 r0, q2[5]
-; CHECK-NEXT: vmov.f32 s18, s6
-; CHECK-NEXT: vmov.16 q3[5], r0
-; CHECK-NEXT: vmov.u16 r0, q0[5]
-; CHECK-NEXT: vmov.16 q2[0], r0
-; CHECK-NEXT: vmov.u16 r0, q7[5]
-; CHECK-NEXT: vmov.16 q2[1], r0
-; CHECK-NEXT: vmov.u16 r0, q0[7]
-; CHECK-NEXT: vins.f16 s9, s3
-; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload
-; CHECK-NEXT: vmov.16 q2[6], r0
-; CHECK-NEXT: vmov.u16 r0, q7[7]
-; CHECK-NEXT: vmov.f32 s5, s3
-; CHECK-NEXT: vmov.16 q2[7], r0
-; CHECK-NEXT: vmov.f32 s6, s3
-; CHECK-NEXT: vstrw.32 q4, [r1]
-; CHECK-NEXT: vmov.f32 s10, s31
-; CHECK-NEXT: vmov.u16 r2, q1[2]
-; CHECK-NEXT: vmov.u16 r0, q2[3]
-; CHECK-NEXT: vmov.16 q0[2], r2
-; CHECK-NEXT: vmov.16 q0[3], r0
-; CHECK-NEXT: vmov.u16 r0, q2[4]
-; CHECK-NEXT: vmov.f32 s25, s13
-; CHECK-NEXT: vmov.16 q0[4], r0
-; CHECK-NEXT: vmov.u16 r0, q1[5]
-; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT: vldrw.u32 q7, [sp, #80] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s26, s14
-; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT: vmov.16 q0[5], r0
-; CHECK-NEXT: vrev32.16 q1, q1
-; CHECK-NEXT: vmov.u16 r0, q7[2]
+; CHECK-NEXT: .pad #80
+; CHECK-NEXT: sub sp, #80
+; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT: vldrw.u32 q3, [r0, #80]
+; CHECK-NEXT: vldrw.u32 q6, [r0, #32]
+; CHECK-NEXT: vldrw.u32 q5, [r0, #64]
+; CHECK-NEXT: vmovx.f16 s0, s14
+; CHECK-NEXT: vmovx.f16 s8, s6
+; CHECK-NEXT: vins.f16 s8, s0
+; CHECK-NEXT: vmovx.f16 s0, s15
+; CHECK-NEXT: vins.f16 s9, s7
; CHECK-NEXT: vstrw.32 q1, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT: vmov.16 q1[0], r0
-; CHECK-NEXT: vmov.u16 r0, q3[3]
-; CHECK-NEXT: vmov.f32 s9, s1
-; CHECK-NEXT: vmov.16 q1[1], r0
-; CHECK-NEXT: vmov.u16 r0, q7[4]
-; CHECK-NEXT: vmov.16 q1[6], r0
-; CHECK-NEXT: vmov.u16 r0, q3[5]
-; CHECK-NEXT: vmov.16 q1[7], r0
-; CHECK-NEXT: vmov.f32 s10, s2
-; CHECK-NEXT: vmov.f32 s5, s29
-; CHECK-NEXT: vldrw.u32 q7, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s6, s14
-; CHECK-NEXT: vstrw.32 q2, [r1, #32]
-; CHECK-NEXT: vmov.u16 r2, q7[2]
-; CHECK-NEXT: vmov.u16 r0, q1[3]
-; CHECK-NEXT: vmov.16 q3[2], r2
-; CHECK-NEXT: vstrw.32 q6, [r1, #48]
-; CHECK-NEXT: vmov.16 q3[3], r0
-; CHECK-NEXT: vmov.u16 r0, q1[4]
-; CHECK-NEXT: vmov.16 q3[4], r0
-; CHECK-NEXT: vmov.u16 r0, q7[5]
-; CHECK-NEXT: vmov.16 q3[5], r0
-; CHECK-NEXT: vldrw.u32 q7, [sp, #96] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s5, s13
-; CHECK-NEXT: vmov.f32 s6, s14
-; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q1, [r1, #64]
-; CHECK-NEXT: vmov.u16 r0, q3[2]
-; CHECK-NEXT: vmov.16 q0[0], r0
-; CHECK-NEXT: vmov.u16 r0, q7[3]
-; CHECK-NEXT: vmov.16 q0[1], r0
-; CHECK-NEXT: vmov.u16 r0, q3[4]
-; CHECK-NEXT: vmov.16 q0[6], r0
-; CHECK-NEXT: vmov.u16 r0, q7[5]
-; CHECK-NEXT: vmov.16 q0[7], r0
-; CHECK-NEXT: vmov.f32 s1, s13
-; CHECK-NEXT: vldrw.u32 q3, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s2, s30
-; CHECK-NEXT: vrev32.16 q3, q3
-; CHECK-NEXT: vmov.u16 r0, q0[3]
-; CHECK-NEXT: vmov.u16 r2, q3[2]
-; CHECK-NEXT: vmov.16 q7[2], r2
-; CHECK-NEXT: vmov.16 q7[3], r0
-; CHECK-NEXT: vmov.u16 r0, q0[4]
+; CHECK-NEXT: vmovx.f16 s11, s7
+; CHECK-NEXT: vmov.u16 r2, q6[1]
+; CHECK-NEXT: vins.f16 s11, s0
+; CHECK-NEXT: vstrw.32 q6, [sp] @ 16-byte Spill
+; CHECK-NEXT: vmov.f32 s10, s15
+; CHECK-NEXT: vmovx.f16 s4, s9
+; CHECK-NEXT: vmov q4, q2
+; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT: vmov.f32 s1, s11
+; CHECK-NEXT: vstrw.32 q2, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT: vmov.f32 s2, s11
+; CHECK-NEXT: vins.f16 s1, s4
+; CHECK-NEXT: vmovx.f16 s4, s2
+; CHECK-NEXT: vins.f16 s18, s4
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vmov.f32 s2, s18
+; CHECK-NEXT: vmov.f64 d4, d2
+; CHECK-NEXT: vstrw.32 q1, [sp, #64] @ 16-byte Spill
+; CHECK-NEXT: vmovx.f16 s28, s4
+; CHECK-NEXT: vins.f16 s8, s24
+; CHECK-NEXT: vmov.f32 s17, s1
+; CHECK-NEXT: vmov.16 q2[4], r2
+; CHECK-NEXT: vmov.f32 s11, s5
+; CHECK-NEXT: vins.f16 s11, s25
+; CHECK-NEXT: vmov.f32 s18, s2
+; CHECK-NEXT: vmov.f32 s9, s4
+; CHECK-NEXT: vstrw.32 q4, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT: vmov.f32 s5, s20
+; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s6, s20
+; CHECK-NEXT: vins.f16 s5, s28
+; CHECK-NEXT: vmovx.f16 s28, s6
+; CHECK-NEXT: vins.f16 s10, s28
+; CHECK-NEXT: vmov.f64 d14, d8
+; CHECK-NEXT: vmov.f32 s6, s10
+; CHECK-NEXT: vmov.f32 s9, s5
+; CHECK-NEXT: vmov.f32 s0, s17
+; CHECK-NEXT: vmov.f32 s10, s6
+; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q2, [r1]
+; CHECK-NEXT: vins.f16 s28, s4
+; CHECK-NEXT: vmov.u16 r0, q1[1]
+; CHECK-NEXT: vins.f16 s0, s5
; CHECK-NEXT: vmov.16 q7[4], r0
-; CHECK-NEXT: vmov.u16 r0, q3[5]
-; CHECK-NEXT: vmov.16 q7[5], r0
-; CHECK-NEXT: vmov.f32 s1, s29
+; CHECK-NEXT: vmov.f32 s31, s0
+; CHECK-NEXT: vmovx.f16 s4, s16
+; CHECK-NEXT: vmov.f32 s1, s12
+; CHECK-NEXT: vmov.f32 s2, s12
+; CHECK-NEXT: vins.f16 s1, s4
+; CHECK-NEXT: vmov.f32 s29, s16
+; CHECK-NEXT: vmovx.f16 s4, s2
+; CHECK-NEXT: vldrw.u32 q4, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT: vins.f16 s30, s4
+; CHECK-NEXT: vmovx.f16 s4, s22
; CHECK-NEXT: vmov.f32 s2, s30
-; CHECK-NEXT: vstrw.32 q0, [r1, #16]
-; CHECK-NEXT: add sp, #112
+; CHECK-NEXT: vmov.f32 s29, s1
+; CHECK-NEXT: vmov.f32 s12, s13
+; CHECK-NEXT: vmov.f32 s30, s2
+; CHECK-NEXT: vmovx.f16 s0, s26
+; CHECK-NEXT: vins.f16 s0, s4
+; CHECK-NEXT: vmovx.f16 s4, s23
+; CHECK-NEXT: vins.f16 s1, s27
+; CHECK-NEXT: vstrw.32 q7, [r1, #48]
+; CHECK-NEXT: vmovx.f16 s3, s27
+; CHECK-NEXT: vins.f16 s3, s4
+; CHECK-NEXT: vmov.f32 s5, s19
+; CHECK-NEXT: vmov.f32 s2, s23
+; CHECK-NEXT: vmovx.f16 s24, s1
+; CHECK-NEXT: vmov.f32 s6, s19
+; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT: vins.f16 s5, s24
+; CHECK-NEXT: vmovx.f16 s24, s6
+; CHECK-NEXT: vins.f16 s2, s24
+; CHECK-NEXT: vmovx.f16 s24, s17
+; CHECK-NEXT: vmov.f32 s6, s2
+; CHECK-NEXT: vins.f16 s12, s24
+; CHECK-NEXT: vmovx.f16 s24, s18
+; CHECK-NEXT: vmov.f32 s1, s5
+; CHECK-NEXT: vins.f16 s14, s24
+; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s15, s14
+; CHECK-NEXT: vmov.f32 s14, s18
+; CHECK-NEXT: vmovx.f16 s16, s13
+; CHECK-NEXT: vrev32.16 q6, q6
+; CHECK-NEXT: vmov.f32 s20, s21
+; CHECK-NEXT: vins.f16 s25, s16
+; CHECK-NEXT: vmovx.f16 s16, s26
+; CHECK-NEXT: vins.f16 s14, s16
+; CHECK-NEXT: vldrw.u32 q4, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s2, s6
+; CHECK-NEXT: vmovx.f16 s4, s17
+; CHECK-NEXT: vmov.f32 s26, s14
+; CHECK-NEXT: vins.f16 s20, s4
+; CHECK-NEXT: vmovx.f16 s4, s18
+; CHECK-NEXT: vins.f16 s22, s4
+; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s23, s22
+; CHECK-NEXT: vstrw.32 q0, [r1, #32]
+; CHECK-NEXT: vmov.f32 s22, s18
+; CHECK-NEXT: vmovx.f16 s16, s21
+; CHECK-NEXT: vrev32.16 q1, q1
+; CHECK-NEXT: vmov.f32 s13, s25
+; CHECK-NEXT: vins.f16 s5, s16
+; CHECK-NEXT: vmovx.f16 s16, s6
+; CHECK-NEXT: vins.f16 s22, s16
+; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s6, s22
+; CHECK-NEXT: vmov.f32 s21, s5
+; CHECK-NEXT: vstrw.32 q0, [r1, #80]
+; CHECK-NEXT: vmov.f32 s14, s26
+; CHECK-NEXT: vstrw.32 q3, [r1, #64]
+; CHECK-NEXT: vmov.f32 s22, s6
+; CHECK-NEXT: vstrw.32 q5, [r1, #16]
+; CHECK-NEXT: add sp, #80
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: bx lr
entry:
define void @vst3_v8i8(<8 x i8> *%src, <24 x i8> *%dst) {
; CHECK-LABEL: vst3_v8i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9, d10, d11}
-; CHECK-NEXT: vpush {d8, d9, d10, d11}
-; CHECK-NEXT: vldrb.u16 q1, [r0, #8]
-; CHECK-NEXT: vldrb.u16 q2, [r0, #16]
+; CHECK-NEXT: .vsave {d8, d9, d10}
+; CHECK-NEXT: vpush {d8, d9, d10}
+; CHECK-NEXT: vldrb.u16 q1, [r0, #16]
+; CHECK-NEXT: vldrb.u16 q2, [r0, #8]
+; CHECK-NEXT: vmovx.f16 s12, s6
+; CHECK-NEXT: vmovx.f16 s0, s10
+; CHECK-NEXT: vins.f16 s0, s12
+; CHECK-NEXT: vmovx.f16 s12, s7
+; CHECK-NEXT: vins.f16 s1, s11
+; CHECK-NEXT: vmovx.f16 s3, s11
+; CHECK-NEXT: vins.f16 s3, s12
; CHECK-NEXT: vldrb.u16 q3, [r0]
-; CHECK-NEXT: vmov.u16 r2, q1[5]
-; CHECK-NEXT: vmov.16 q0[0], r2
-; CHECK-NEXT: vmov.u16 r2, q2[5]
-; CHECK-NEXT: vmov.16 q0[1], r2
-; CHECK-NEXT: vmov.u16 r2, q1[7]
-; CHECK-NEXT: vins.f16 s1, s7
+; CHECK-NEXT: vmov.f32 s2, s7
+; CHECK-NEXT: vmovx.f16 s20, s1
; CHECK-NEXT: vmov.f32 s17, s15
-; CHECK-NEXT: vmov.16 q0[6], r2
-; CHECK-NEXT: vmov.u16 r2, q2[7]
-; CHECK-NEXT: vmov.16 q0[7], r2
-; CHECK-NEXT: vmov.f32 s18, s15
-; CHECK-NEXT: vmov.f32 s2, s11
-; CHECK-NEXT: vmov.u16 r0, q4[2]
-; CHECK-NEXT: vmov.16 q5[2], r0
-; CHECK-NEXT: vmov.u16 r2, q0[3]
-; CHECK-NEXT: vmov.16 q5[3], r2
-; CHECK-NEXT: vmov.u16 r0, q0[4]
-; CHECK-NEXT: vmov.16 q5[4], r0
-; CHECK-NEXT: vmov.u16 r0, q4[5]
-; CHECK-NEXT: vmov.16 q5[5], r0
; CHECK-NEXT: vmov.u16 r0, q3[0]
+; CHECK-NEXT: vmov.f32 s18, s15
+; CHECK-NEXT: vins.f16 s17, s20
+; CHECK-NEXT: vmovx.f16 s20, s18
+; CHECK-NEXT: vins.f16 s2, s20
+; CHECK-NEXT: vmov.f32 s18, s2
+; CHECK-NEXT: vmov.f32 s1, s17
+; CHECK-NEXT: vmov.f32 s2, s18
; CHECK-NEXT: vmov.8 q4[0], r0
-; CHECK-NEXT: vmov.u16 r0, q1[0]
-; CHECK-NEXT: vmov.8 q4[1], r0
; CHECK-NEXT: vmov.u16 r0, q2[0]
+; CHECK-NEXT: vstrb.16 q0, [r1, #16]
+; CHECK-NEXT: vmov.8 q4[1], r0
+; CHECK-NEXT: vmov.u16 r0, q1[0]
; CHECK-NEXT: vmov.8 q4[2], r0
; CHECK-NEXT: vmov.u16 r0, q3[1]
; CHECK-NEXT: vmov.8 q4[3], r0
-; CHECK-NEXT: vmov.u16 r0, q1[1]
-; CHECK-NEXT: vmov.8 q4[4], r0
; CHECK-NEXT: vmov.u16 r0, q2[1]
+; CHECK-NEXT: vmov.8 q4[4], r0
+; CHECK-NEXT: vmov.u16 r0, q1[1]
; CHECK-NEXT: vmov.8 q4[5], r0
; CHECK-NEXT: vmov.u16 r0, q3[2]
; CHECK-NEXT: vmov.8 q4[6], r0
-; CHECK-NEXT: vmov.u16 r0, q1[2]
-; CHECK-NEXT: vmov.8 q4[7], r0
; CHECK-NEXT: vmov.u16 r0, q2[2]
+; CHECK-NEXT: vmov.8 q4[7], r0
+; CHECK-NEXT: vmov.u16 r0, q1[2]
; CHECK-NEXT: vmov.8 q4[8], r0
; CHECK-NEXT: vmov.u16 r0, q3[3]
; CHECK-NEXT: vmov.8 q4[9], r0
-; CHECK-NEXT: vmov.u16 r0, q1[3]
-; CHECK-NEXT: vmov.8 q4[10], r0
; CHECK-NEXT: vmov.u16 r0, q2[3]
+; CHECK-NEXT: vmov.8 q4[10], r0
+; CHECK-NEXT: vmov.u16 r0, q1[3]
; CHECK-NEXT: vmov.8 q4[11], r0
; CHECK-NEXT: vmov.u16 r0, q3[4]
; CHECK-NEXT: vmov.8 q4[12], r0
-; CHECK-NEXT: vmov.u16 r0, q1[4]
-; CHECK-NEXT: vmov.8 q4[13], r0
; CHECK-NEXT: vmov.u16 r0, q2[4]
-; CHECK-NEXT: vmov.f32 s1, s21
+; CHECK-NEXT: vmov.8 q4[13], r0
+; CHECK-NEXT: vmov.u16 r0, q1[4]
; CHECK-NEXT: vmov.8 q4[14], r0
; CHECK-NEXT: vmov.u16 r0, q3[5]
-; CHECK-NEXT: vmov.f32 s2, s22
; CHECK-NEXT: vmov.8 q4[15], r0
-; CHECK-NEXT: vstrb.16 q0, [r1, #16]
; CHECK-NEXT: vstrw.32 q4, [r1]
-; CHECK-NEXT: vpop {d8, d9, d10, d11}
+; CHECK-NEXT: vpop {d8, d9, d10}
; CHECK-NEXT: bx lr
entry:
%s1 = getelementptr <8 x i8>, <8 x i8>* %src, i32 0
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldmia r0, {s0, s1}
; CHECK-NEXT: ldr r0, [r0, #8]
-; CHECK-NEXT: vmovx.f16 s4, s0
+; CHECK-NEXT: vmovx.f16 s8, s0
; CHECK-NEXT: vins.f16 s0, s1
-; CHECK-NEXT: vmov.32 q2[0], r0
-; CHECK-NEXT: vmov q3, q0
-; CHECK-NEXT: vmovx.f16 s6, s8
-; CHECK-NEXT: vins.f16 s8, s4
-; CHECK-NEXT: vmov.f32 s13, s8
-; CHECK-NEXT: vmovx.f16 s14, s1
-; CHECK-NEXT: vins.f16 s14, s6
-; CHECK-NEXT: vmov r0, s12
-; CHECK-NEXT: vmov r2, s13
-; CHECK-NEXT: vmov r3, s14
-; CHECK-NEXT: stm r1!, {r0, r2, r3}
+; CHECK-NEXT: vmov.32 q1[0], r0
+; CHECK-NEXT: vmovx.f16 s2, s1
+; CHECK-NEXT: vmovx.f16 s10, s4
+; CHECK-NEXT: vins.f16 s4, s8
+; CHECK-NEXT: vins.f16 s2, s10
+; CHECK-NEXT: vmov.f32 s1, s4
+; CHECK-NEXT: vmov r0, s2
+; CHECK-NEXT: vmov r3, s0
+; CHECK-NEXT: vmov r2, s1
+; CHECK-NEXT: str r0, [r1, #8]
+; CHECK-NEXT: strd r3, r2, [r1]
; CHECK-NEXT: bx lr
entry:
%s1 = getelementptr <2 x half>, <2 x half>* %src, i32 0
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: .vsave {d8, d9}
+; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: ldrd r2, r12, [r0]
; CHECK-NEXT: ldrd r3, lr, [r0, #8]
; CHECK-NEXT: vmov.32 q0[0], r2
; CHECK-NEXT: vmov.32 q1[1], lr
; CHECK-NEXT: vmov.f32 s2, s4
; CHECK-NEXT: vmov.f32 s3, s5
+; CHECK-NEXT: vmovx.f16 s10, s0
+; CHECK-NEXT: vmov.f32 s8, s1
+; CHECK-NEXT: vins.f16 s0, s2
+; CHECK-NEXT: vins.f16 s8, s5
; CHECK-NEXT: vmov.32 q1[0], r2
-; CHECK-NEXT: vmovx.f16 s8, s0
; CHECK-NEXT: vmov.32 q1[1], r0
-; CHECK-NEXT: vins.f16 s0, s2
+; CHECK-NEXT: vmovx.f16 s2, s2
; CHECK-NEXT: vmovx.f16 s12, s4
-; CHECK-NEXT: vins.f16 s4, s8
-; CHECK-NEXT: vmov q2, q0
-; CHECK-NEXT: vmov.f32 s9, s4
-; CHECK-NEXT: vmovx.f16 s10, s2
-; CHECK-NEXT: vins.f16 s10, s12
-; CHECK-NEXT: vmovx.f16 s12, s1
-; CHECK-NEXT: vins.f16 s1, s3
-; CHECK-NEXT: vmov.f32 s11, s1
-; CHECK-NEXT: vmovx.f16 s1, s3
-; CHECK-NEXT: vstrw.32 q2, [r1]
-; CHECK-NEXT: vmovx.f16 s8, s5
-; CHECK-NEXT: vins.f16 s5, s12
-; CHECK-NEXT: vmov.f32 s0, s5
-; CHECK-NEXT: vins.f16 s1, s8
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: strd r2, r0, [r1, #16]
+; CHECK-NEXT: vins.f16 s4, s10
+; CHECK-NEXT: vins.f16 s2, s12
+; CHECK-NEXT: vmovx.f16 s10, s1
+; CHECK-NEXT: vmovx.f16 s12, s5
+; CHECK-NEXT: vmovx.f16 s17, s3
+; CHECK-NEXT: vins.f16 s5, s10
+; CHECK-NEXT: vins.f16 s17, s12
+; CHECK-NEXT: vmov.f32 s16, s5
+; CHECK-NEXT: vmov r2, s17
+; CHECK-NEXT: vmov.f32 s1, s4
+; CHECK-NEXT: vmov.f32 s3, s8
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: vmov r0, s16
+; CHECK-NEXT: strd r0, r2, [r1, #16]
+; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: pop {r7, pc}
entry:
%s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14}
-; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT: vldrw.u32 q3, [r0]
+; CHECK-NEXT: vldrw.u32 q2, [r0]
+; CHECK-NEXT: vldrw.u32 q5, [r0, #16]
+; CHECK-NEXT: vmov.f64 d0, d4
+; CHECK-NEXT: vmovx.f16 s6, s20
+; CHECK-NEXT: vmovx.f16 s12, s8
+; CHECK-NEXT: vmovx.f16 s24, s23
+; CHECK-NEXT: vmov.f32 s4, s9
+; CHECK-NEXT: vins.f16 s0, s20
+; CHECK-NEXT: vmov r2, s6
+; CHECK-NEXT: vins.f16 s4, s21
+; CHECK-NEXT: vmov.16 q0[4], r2
+; CHECK-NEXT: vmov.f32 s3, s4
; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
-; CHECK-NEXT: vmovx.f16 s0, s8
-; CHECK-NEXT: vmovx.f16 s20, s12
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmovx.f16 s24, s6
-; CHECK-NEXT: vmov.f64 d0, d6
-; CHECK-NEXT: vins.f16 s0, s8
+; CHECK-NEXT: vmov.f32 s1, s8
; CHECK-NEXT: vmov.f32 s17, s4
-; CHECK-NEXT: vmov.16 q0[4], r2
+; CHECK-NEXT: vmovx.f16 s26, s6
; CHECK-NEXT: vmov.f32 s18, s4
-; CHECK-NEXT: vmov.f32 s3, s13
-; CHECK-NEXT: vins.f16 s17, s20
-; CHECK-NEXT: vins.f16 s3, s9
-; CHECK-NEXT: vmovx.f16 s20, s18
-; CHECK-NEXT: vmov.f32 s1, s12
-; CHECK-NEXT: vins.f16 s2, s20
-; CHECK-NEXT: vmovx.f16 s20, s10
-; CHECK-NEXT: vins.f16 s20, s24
-; CHECK-NEXT: vmov r0, s11
-; CHECK-NEXT: vmov.16 q5[3], r0
-; CHECK-NEXT: vmovx.f16 s24, s7
-; CHECK-NEXT: vmovx.f16 s23, s11
-; CHECK-NEXT: vrev32.16 q2, q2
-; CHECK-NEXT: vins.f16 s23, s24
-; CHECK-NEXT: vmov.f32 s25, s15
-; CHECK-NEXT: vmov.f32 s22, s7
-; CHECK-NEXT: vmovx.f16 s28, s21
-; CHECK-NEXT: vmov.f32 s26, s15
+; CHECK-NEXT: vins.f16 s17, s12
+; CHECK-NEXT: vmovx.f16 s12, s18
+; CHECK-NEXT: vins.f16 s2, s12
+; CHECK-NEXT: vmovx.f16 s12, s7
+; CHECK-NEXT: vins.f16 s24, s12
+; CHECK-NEXT: vmovx.f16 s12, s22
+; CHECK-NEXT: vmov r0, s23
+; CHECK-NEXT: vins.f16 s12, s26
+; CHECK-NEXT: vmov.16 q3[3], r0
+; CHECK-NEXT: vrev32.16 q5, q5
+; CHECK-NEXT: vmov.f32 s15, s24
+; CHECK-NEXT: vmov.f32 s25, s11
+; CHECK-NEXT: vmov.f32 s14, s7
+; CHECK-NEXT: vmovx.f16 s28, s13
+; CHECK-NEXT: vmov.f32 s26, s11
; CHECK-NEXT: vins.f16 s25, s28
; CHECK-NEXT: vmovx.f16 s28, s26
-; CHECK-NEXT: vins.f16 s22, s28
-; CHECK-NEXT: vmovx.f16 s28, s13
+; CHECK-NEXT: vins.f16 s14, s28
+; CHECK-NEXT: vmovx.f16 s28, s9
; CHECK-NEXT: vmov.f32 s4, s5
; CHECK-NEXT: vins.f16 s4, s28
-; CHECK-NEXT: vmovx.f16 s28, s14
+; CHECK-NEXT: vmovx.f16 s28, s10
; CHECK-NEXT: vins.f16 s6, s28
; CHECK-NEXT: vmov.f32 s18, s2
; CHECK-NEXT: vmov.f32 s7, s6
-; CHECK-NEXT: vmov.f32 s6, s14
-; CHECK-NEXT: vmovx.f16 s12, s5
-; CHECK-NEXT: vins.f16 s9, s12
-; CHECK-NEXT: vmovx.f16 s12, s10
-; CHECK-NEXT: vins.f16 s6, s12
-; CHECK-NEXT: vmov.f32 s26, s22
-; CHECK-NEXT: vmov.f32 s10, s6
+; CHECK-NEXT: vmov.f32 s6, s10
+; CHECK-NEXT: vmovx.f16 s8, s5
+; CHECK-NEXT: vins.f16 s21, s8
+; CHECK-NEXT: vmovx.f16 s8, s22
+; CHECK-NEXT: vins.f16 s6, s8
+; CHECK-NEXT: vmov.f32 s26, s14
+; CHECK-NEXT: vmov.f32 s22, s6
; CHECK-NEXT: vmov.f32 s1, s17
-; CHECK-NEXT: vmov.f32 s21, s25
-; CHECK-NEXT: vmov.f32 s5, s9
+; CHECK-NEXT: vmov.f32 s13, s25
+; CHECK-NEXT: vmov.f32 s5, s21
; CHECK-NEXT: vmov.f32 s2, s18
; CHECK-NEXT: vstrw.32 q0, [r1]
-; CHECK-NEXT: vmov.f32 s22, s26
-; CHECK-NEXT: vmov.f32 s6, s10
-; CHECK-NEXT: vstrw.32 q5, [r1, #32]
+; CHECK-NEXT: vmov.f32 s14, s26
+; CHECK-NEXT: vmov.f32 s6, s22
+; CHECK-NEXT: vstrw.32 q3, [r1, #32]
; CHECK-NEXT: vstrw.32 q1, [r1, #16]
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14}
; CHECK-NEXT: bx lr
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #144
-; CHECK-NEXT: sub sp, #144
-; CHECK-NEXT: vldrw.u32 q2, [r0, #48]
-; CHECK-NEXT: vldrw.u32 q5, [r0, #80]
-; CHECK-NEXT: vldrw.u32 q7, [r0, #16]
-; CHECK-NEXT: vldrw.u32 q4, [r0, #32]
-; CHECK-NEXT: vmovx.f16 s0, s22
-; CHECK-NEXT: vmovx.f16 s4, s10
+; CHECK-NEXT: .pad #128
+; CHECK-NEXT: sub sp, #128
+; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT: vldrw.u32 q7, [r0, #64]
+; CHECK-NEXT: vldrw.u32 q4, [r0]
+; CHECK-NEXT: vldrw.u32 q5, [r0, #48]
+; CHECK-NEXT: vmovx.f16 s0, s31
+; CHECK-NEXT: vmovx.f16 s2, s15
+; CHECK-NEXT: vins.f16 s2, s0
+; CHECK-NEXT: vmovx.f16 s0, s30
+; CHECK-NEXT: vmovx.f16 s4, s14
+; CHECK-NEXT: vmov r2, s15
; CHECK-NEXT: vins.f16 s4, s0
-; CHECK-NEXT: vmov r2, s11
+; CHECK-NEXT: vmov q6, q5
; CHECK-NEXT: vmov.16 q1[3], r2
-; CHECK-NEXT: vmovx.f16 s0, s23
-; CHECK-NEXT: vmovx.f16 s7, s11
-; CHECK-NEXT: vstrw.32 q2, [sp, #112] @ 16-byte Spill
-; CHECK-NEXT: vins.f16 s7, s0
-; CHECK-NEXT: vmov.f32 s9, s31
-; CHECK-NEXT: vmov.f32 s6, s23
+; CHECK-NEXT: vstrw.32 q3, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT: vmov.f32 s7, s2
+; CHECK-NEXT: vmovx.f16 s2, s20
+; CHECK-NEXT: vmov.f32 s6, s31
; CHECK-NEXT: vmovx.f16 s0, s5
-; CHECK-NEXT: vmov.f32 s10, s31
-; CHECK-NEXT: vldrw.u32 q3, [r0]
-; CHECK-NEXT: vins.f16 s9, s0
-; CHECK-NEXT: vmovx.f16 s0, s10
-; CHECK-NEXT: vins.f16 s6, s0
-; CHECK-NEXT: vmovx.f16 s0, s16
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vldrw.u32 q6, [r0, #64]
-; CHECK-NEXT: vmov.f64 d0, d6
-; CHECK-NEXT: vstrw.32 q1, [sp, #128] @ 16-byte Spill
-; CHECK-NEXT: vins.f16 s0, s16
-; CHECK-NEXT: vmov.f32 s10, s6
-; CHECK-NEXT: vmov.16 q0[4], r2
-; CHECK-NEXT: vstrw.32 q2, [sp, #96] @ 16-byte Spill
-; CHECK-NEXT: vmov.f32 s3, s13
-; CHECK-NEXT: vmov.f32 s9, s24
-; CHECK-NEXT: vins.f16 s3, s17
-; CHECK-NEXT: vmov q1, q0
-; CHECK-NEXT: vmovx.f16 s0, s12
-; CHECK-NEXT: vmov.f32 s10, s24
-; CHECK-NEXT: vins.f16 s9, s0
-; CHECK-NEXT: vmov.f32 s5, s12
-; CHECK-NEXT: vmovx.f16 s0, s10
-; CHECK-NEXT: vins.f16 s6, s0
-; CHECK-NEXT: vmov.f32 s10, s6
+; CHECK-NEXT: vmov q2, q1
+; CHECK-NEXT: vmov.f32 s5, s19
+; CHECK-NEXT: vmov.f32 s6, s19
+; CHECK-NEXT: vstrw.32 q4, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT: vins.f16 s5, s0
+; CHECK-NEXT: vmovx.f16 s0, s6
; CHECK-NEXT: vstrw.32 q1, [sp, #64] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q1, [sp, #112] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q2, [sp, #80] @ 16-byte Spill
-; CHECK-NEXT: vmov.f32 s9, s20
-; CHECK-NEXT: vmovx.f16 s0, s4
-; CHECK-NEXT: vmov.f32 s10, s20
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vmov.f64 d0, d14
-; CHECK-NEXT: vins.f16 s0, s4
-; CHECK-NEXT: vmov.f32 s20, s21
-; CHECK-NEXT: vmov.16 q0[4], r0
-; CHECK-NEXT: vmov.f32 s3, s29
-; CHECK-NEXT: vins.f16 s3, s5
-; CHECK-NEXT: vmov q1, q0
-; CHECK-NEXT: vmovx.f16 s0, s28
-; CHECK-NEXT: vins.f16 s9, s0
-; CHECK-NEXT: vmov.f32 s5, s28
-; CHECK-NEXT: vmovx.f16 s0, s10
-; CHECK-NEXT: vins.f16 s6, s0
-; CHECK-NEXT: vmov.f32 s10, s6
-; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT: vmov q1, q4
+; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT: vins.f16 s10, s0
+; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT: vmovx.f16 s16, s26
-; CHECK-NEXT: vmovx.f16 s8, s6
-; CHECK-NEXT: vins.f16 s8, s16
-; CHECK-NEXT: vmov r0, s7
-; CHECK-NEXT: vmov.16 q2[3], r0
+; CHECK-NEXT: vmov.f64 d4, d2
; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill
-; CHECK-NEXT: vmovx.f16 s11, s7
-; CHECK-NEXT: vmovx.f16 s16, s27
-; CHECK-NEXT: vmov q1, q3
-; CHECK-NEXT: vins.f16 s11, s16
-; CHECK-NEXT: vmov.f32 s1, s7
-; CHECK-NEXT: vmov.f32 s10, s27
-; CHECK-NEXT: vmovx.f16 s16, s9
-; CHECK-NEXT: vmov.f32 s2, s7
+; CHECK-NEXT: vmovx.f16 s2, s12
+; CHECK-NEXT: vins.f16 s8, s20
+; CHECK-NEXT: vmov.f32 s0, s5
+; CHECK-NEXT: vins.f16 s0, s21
+; CHECK-NEXT: vmov.16 q2[4], r2
+; CHECK-NEXT: vldrw.u32 q5, [r0, #80]
+; CHECK-NEXT: vmov.f32 s11, s0
+; CHECK-NEXT: vmov.f32 s9, s4
+; CHECK-NEXT: vmovx.f16 s0, s4
+; CHECK-NEXT: vmov.f32 s5, s20
+; CHECK-NEXT: vmov.f32 s6, s20
+; CHECK-NEXT: vins.f16 s5, s0
+; CHECK-NEXT: vmovx.f16 s0, s6
+; CHECK-NEXT: vstrw.32 q1, [sp, #112] @ 16-byte Spill
+; CHECK-NEXT: vmov.f64 d2, d8
+; CHECK-NEXT: vins.f16 s10, s0
+; CHECK-NEXT: vstrw.32 q2, [sp, #96] @ 16-byte Spill
+; CHECK-NEXT: vmov q2, q6
+; CHECK-NEXT: vmovx.f16 s24, s10
+; CHECK-NEXT: vmov.f32 s0, s17
+; CHECK-NEXT: vins.f16 s4, s12
+; CHECK-NEXT: vmov r0, s2
+; CHECK-NEXT: vins.f16 s0, s13
+; CHECK-NEXT: vmov.16 q1[4], r0
+; CHECK-NEXT: vmov.f32 s13, s28
+; CHECK-NEXT: vmov.f32 s7, s0
+; CHECK-NEXT: vmovx.f16 s0, s16
+; CHECK-NEXT: vmov.f32 s14, s28
+; CHECK-NEXT: vmovx.f16 s2, s11
+; CHECK-NEXT: vins.f16 s13, s0
+; CHECK-NEXT: vmov.f32 s5, s16
+; CHECK-NEXT: vmovx.f16 s0, s14
+; CHECK-NEXT: vmov r0, s11
+; CHECK-NEXT: vins.f16 s6, s0
+; CHECK-NEXT: vmovx.f16 s0, s22
+; CHECK-NEXT: vins.f16 s24, s0
+; CHECK-NEXT: vstrw.32 q1, [sp, #80] @ 16-byte Spill
+; CHECK-NEXT: vmov q1, q2
+; CHECK-NEXT: vmovx.f16 s0, s23
+; CHECK-NEXT: vldrw.u32 q2, [sp] @ 16-byte Reload
+; CHECK-NEXT: vins.f16 s2, s0
+; CHECK-NEXT: vmov.16 q6[3], r0
+; CHECK-NEXT: vrev32.16 q1, q1
+; CHECK-NEXT: vmov.f32 s27, s2
+; CHECK-NEXT: vmov.f32 s1, s11
+; CHECK-NEXT: vmov.f32 s26, s23
+; CHECK-NEXT: vmovx.f16 s16, s25
+; CHECK-NEXT: vmov.f32 s2, s11
; CHECK-NEXT: vins.f16 s1, s16
; CHECK-NEXT: vmovx.f16 s16, s2
-; CHECK-NEXT: vins.f16 s10, s16
-; CHECK-NEXT: vmovx.f16 s16, s29
-; CHECK-NEXT: vmov.f32 s2, s10
+; CHECK-NEXT: vins.f16 s26, s16
+; CHECK-NEXT: vmovx.f16 s16, s9
+; CHECK-NEXT: vmov.f32 s20, s21
; CHECK-NEXT: vins.f16 s20, s16
-; CHECK-NEXT: vmovx.f16 s16, s30
-; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT: vmovx.f16 s16, s10
; CHECK-NEXT: vins.f16 s22, s16
-; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s2, s26
; CHECK-NEXT: vmov.f32 s23, s22
-; CHECK-NEXT: vmov.f32 s22, s30
-; CHECK-NEXT: vrev32.16 q3, q0
+; CHECK-NEXT: vmov.f32 s22, s10
; CHECK-NEXT: vmovx.f16 s16, s21
-; CHECK-NEXT: vmov.f32 s24, s25
-; CHECK-NEXT: vins.f16 s13, s16
-; CHECK-NEXT: vmovx.f16 s16, s14
-; CHECK-NEXT: vins.f16 s22, s16
-; CHECK-NEXT: vmovx.f16 s16, s5
-; CHECK-NEXT: vins.f16 s24, s16
+; CHECK-NEXT: vldrw.u32 q2, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT: vins.f16 s5, s16
+; CHECK-NEXT: vldrw.u32 q4, [sp, #112] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s25, s1
+; CHECK-NEXT: vmov.f32 s18, s10
+; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q4, [sp, #112] @ 16-byte Spill
; CHECK-NEXT: vmovx.f16 s16, s6
-; CHECK-NEXT: vins.f16 s26, s16
-; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s27, s26
-; CHECK-NEXT: vmov.f32 s26, s6
-; CHECK-NEXT: vrev32.16 q4, q0
-; CHECK-NEXT: vmovx.f16 s4, s25
-; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload
-; CHECK-NEXT: vins.f16 s17, s4
-; CHECK-NEXT: vldrw.u32 q1, [sp, #128] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s5, s1
-; CHECK-NEXT: vmovx.f16 s28, s18
-; CHECK-NEXT: vmov.f32 s6, s2
-; CHECK-NEXT: vins.f16 s26, s28
-; CHECK-NEXT: vstrw.32 q1, [sp, #128] @ 16-byte Spill
+; CHECK-NEXT: vmov.f32 s14, s10
+; CHECK-NEXT: vins.f16 s22, s16
+; CHECK-NEXT: vldrw.u32 q4, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q1, [sp, #80] @ 16-byte Reload
-; CHECK-NEXT: vldrw.u32 q7, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s14, s22
-; CHECK-NEXT: vmov.f32 s18, s26
-; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s29, s5
-; CHECK-NEXT: vmov.f32 s30, s6
-; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s5, s1
-; CHECK-NEXT: vstrw.32 q7, [r1]
-; CHECK-NEXT: vmov.f32 s6, s2
-; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s21, s13
-; CHECK-NEXT: vstrw.32 q1, [r1, #48]
-; CHECK-NEXT: vmov.f32 s9, s1
-; CHECK-NEXT: vmov.f32 s10, s2
-; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s25, s17
-; CHECK-NEXT: vstrw.32 q2, [r1, #32]
-; CHECK-NEXT: vmov.f32 s22, s14
-; CHECK-NEXT: vstrw.32 q0, [r1, #80]
-; CHECK-NEXT: vmov.f32 s26, s18
+; CHECK-NEXT: vmov.f32 s28, s29
+; CHECK-NEXT: vmovx.f16 s8, s17
+; CHECK-NEXT: vmov.f32 s26, s2
+; CHECK-NEXT: vmov.f32 s5, s13
+; CHECK-NEXT: vins.f16 s28, s8
+; CHECK-NEXT: vmovx.f16 s0, s18
+; CHECK-NEXT: vldrw.u32 q2, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT: vins.f16 s30, s0
+; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s6, s14
+; CHECK-NEXT: vldrw.u32 q3, [sp, #112] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s31, s30
+; CHECK-NEXT: vrev32.16 q0, q0
+; CHECK-NEXT: vmov.f32 s30, s18
+; CHECK-NEXT: vmovx.f16 s16, s29
+; CHECK-NEXT: vmov.f32 s9, s13
+; CHECK-NEXT: vins.f16 s1, s16
+; CHECK-NEXT: vmov.f32 s10, s14
+; CHECK-NEXT: vmovx.f16 s16, s2
+; CHECK-NEXT: vldrw.u32 q3, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT: vins.f16 s30, s16
+; CHECK-NEXT: vldrw.u32 q4, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s2, s30
+; CHECK-NEXT: vmov.f32 s18, s14
+; CHECK-NEXT: vstrw.32 q2, [r1, #48]
+; CHECK-NEXT: vmov.f32 s13, s17
+; CHECK-NEXT: vstrw.32 q6, [r1, #80]
+; CHECK-NEXT: vmov.f32 s29, s1
+; CHECK-NEXT: vstrw.32 q1, [r1]
+; CHECK-NEXT: vmov.f32 s30, s2
+; CHECK-NEXT: vmov.f32 s14, s18
+; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s18, s22
+; CHECK-NEXT: vstrw.32 q7, [r1, #16]
+; CHECK-NEXT: vmov.f32 s21, s17
+; CHECK-NEXT: vstrw.32 q3, [r1, #32]
+; CHECK-NEXT: vmov.f32 s22, s18
; CHECK-NEXT: vstrw.32 q5, [r1, #64]
-; CHECK-NEXT: vstrw.32 q6, [r1, #16]
-; CHECK-NEXT: add sp, #144
+; CHECK-NEXT: add sp, #128
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: bx lr
entry:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vldrw.u32 q4, [r0]
-; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT: vmov.f64 d0, d8
-; CHECK-NEXT: vmov.u16 r0, q4[1]
-; CHECK-NEXT: vmov.f32 s21, s5
-; CHECK-NEXT: vins.f16 s0, s8
-; CHECK-NEXT: vmov.16 q0[4], r0
-; CHECK-NEXT: vins.f16 s21, s21
-; CHECK-NEXT: vmov.u16 r0, q1[3]
-; CHECK-NEXT: vmov.f32 s12, s17
-; CHECK-NEXT: vmov.16 q5[6], r0
-; CHECK-NEXT: vins.f16 s12, s9
-; CHECK-NEXT: vmov.16 q5[7], r0
-; CHECK-NEXT: vmov.u16 r0, q4[3]
-; CHECK-NEXT: vmov.f32 s25, s4
-; CHECK-NEXT: vmov.16 q3[4], r0
-; CHECK-NEXT: vmov.u16 r0, q2[3]
-; CHECK-NEXT: vins.f16 s25, s25
-; CHECK-NEXT: vmov.16 q3[5], r0
-; CHECK-NEXT: vmov.u16 r0, q1[1]
-; CHECK-NEXT: vmov.16 q6[6], r0
-; CHECK-NEXT: vmov.f32 s13, s21
-; CHECK-NEXT: vmov.16 q6[7], r0
-; CHECK-NEXT: vmov.u16 r0, q2[1]
-; CHECK-NEXT: vmov.16 q0[5], r0
-; CHECK-NEXT: vmov.f32 s15, s23
-; CHECK-NEXT: vmov.f32 s20, s19
-; CHECK-NEXT: vmov.u16 r0, q4[7]
-; CHECK-NEXT: vmov.f32 s1, s25
-; CHECK-NEXT: vins.f16 s20, s11
-; CHECK-NEXT: vmov.f32 s3, s27
-; CHECK-NEXT: vmov.16 q5[4], r0
-; CHECK-NEXT: vmov.f32 s25, s7
-; CHECK-NEXT: vmov.u16 r0, q1[7]
-; CHECK-NEXT: vins.f16 s25, s25
-; CHECK-NEXT: vstrb.8 q3, [r1, #16]
-; CHECK-NEXT: vmov.16 q6[6], r0
-; CHECK-NEXT: vstrb.8 q0, [r1]
-; CHECK-NEXT: vmov.16 q6[7], r0
-; CHECK-NEXT: vmov.u16 r0, q2[7]
-; CHECK-NEXT: vmov.16 q5[5], r0
-; CHECK-NEXT: vmov.u16 r0, q4[5]
-; CHECK-NEXT: vmov.f32 s21, s25
-; CHECK-NEXT: vmov.f32 s23, s27
-; CHECK-NEXT: vmov.f64 d12, d9
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vmovx.f16 s2, s5
+; CHECK-NEXT: vmovx.f16 s12, s9
+; CHECK-NEXT: vins.f16 s2, s12
+; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT: vins.f16 s5, s9
+; CHECK-NEXT: vmovx.f16 s24, s11
+; CHECK-NEXT: vmovx.f16 s16, s13
+; CHECK-NEXT: vins.f16 s13, s13
+; CHECK-NEXT: vins.f16 s16, s16
+; CHECK-NEXT: vmov q5, q3
+; CHECK-NEXT: vmov.f32 s23, s16
+; CHECK-NEXT: vmovx.f16 s16, s8
+; CHECK-NEXT: vmov.f32 s0, s5
+; CHECK-NEXT: vmov.f32 s1, s21
+; CHECK-NEXT: vmov.f32 s3, s23
+; CHECK-NEXT: vmovx.f16 s20, s4
+; CHECK-NEXT: vins.f16 s4, s8
+; CHECK-NEXT: vins.f16 s20, s16
+; CHECK-NEXT: vmov q4, q1
+; CHECK-NEXT: vmovx.f16 s8, s10
+; CHECK-NEXT: vmov.f32 s18, s20
+; CHECK-NEXT: vmovx.f16 s22, s7
+; CHECK-NEXT: vins.f16 s7, s11
+; CHECK-NEXT: vins.f16 s22, s24
+; CHECK-NEXT: vmovx.f16 s26, s6
+; CHECK-NEXT: vmovx.f16 s19, s12
+; CHECK-NEXT: vins.f16 s12, s12
+; CHECK-NEXT: vmov.f32 s20, s7
+; CHECK-NEXT: vins.f16 s6, s10
+; CHECK-NEXT: vins.f16 s26, s8
+; CHECK-NEXT: vmov.f32 s17, s12
+; CHECK-NEXT: vmovx.f16 s23, s15
+; CHECK-NEXT: vins.f16 s15, s15
+; CHECK-NEXT: vmov.f32 s24, s6
+; CHECK-NEXT: vmov.f32 s21, s15
+; CHECK-NEXT: vmovx.f16 s27, s14
+; CHECK-NEXT: vins.f16 s14, s14
+; CHECK-NEXT: vins.f16 s19, s19
+; CHECK-NEXT: vmov.f32 s25, s14
+; CHECK-NEXT: vins.f16 s23, s23
+; CHECK-NEXT: vins.f16 s27, s27
; CHECK-NEXT: vstrb.8 q5, [r1, #48]
-; CHECK-NEXT: vmov.f32 s17, s6
-; CHECK-NEXT: vins.f16 s24, s10
-; CHECK-NEXT: vmov.16 q6[4], r0
-; CHECK-NEXT: vins.f16 s17, s17
-; CHECK-NEXT: vmov.u16 r0, q1[5]
-; CHECK-NEXT: vmov.16 q4[6], r0
-; CHECK-NEXT: vmov.16 q4[7], r0
-; CHECK-NEXT: vmov.u16 r0, q2[5]
-; CHECK-NEXT: vmov.16 q6[5], r0
-; CHECK-NEXT: vmov.f32 s25, s17
-; CHECK-NEXT: vmov.f32 s27, s19
; CHECK-NEXT: vstrb.8 q6, [r1, #32]
+; CHECK-NEXT: vstrb.8 q0, [r1, #16]
+; CHECK-NEXT: vstrb.8 q4, [r1]
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: bx lr
entry:
define void @vst4_v2f16(<2 x half> *%src, <8 x half> *%dst) {
; CHECK-LABEL: vst4_v2f16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldmia r0, {s4, s5}
-; CHECK-NEXT: vldr s0, [r0, #8]
-; CHECK-NEXT: vmovx.f16 s12, s4
-; CHECK-NEXT: vins.f16 s4, s5
-; CHECK-NEXT: vmov.f32 s1, s0
-; CHECK-NEXT: vmovx.f16 s14, s0
-; CHECK-NEXT: vmov q2, q1
+; CHECK-NEXT: vldmia r0, {s0, s1}
+; CHECK-NEXT: vldr s4, [r0, #8]
+; CHECK-NEXT: vmovx.f16 s2, s0
; CHECK-NEXT: vins.f16 s0, s1
-; CHECK-NEXT: vmovx.f16 s4, s5
-; CHECK-NEXT: vmov.f32 s9, s0
-; CHECK-NEXT: vins.f16 s12, s4
-; CHECK-NEXT: vmovx.f16 s0, s1
-; CHECK-NEXT: vmov.f32 s10, s12
-; CHECK-NEXT: vins.f16 s14, s0
-; CHECK-NEXT: vmov.f32 s11, s14
-; CHECK-NEXT: vstrh.16 q2, [r1]
+; CHECK-NEXT: vmov.f32 s5, s4
+; CHECK-NEXT: vmovx.f16 s3, s4
+; CHECK-NEXT: vmovx.f16 s8, s1
+; CHECK-NEXT: vins.f16 s4, s5
+; CHECK-NEXT: vins.f16 s2, s8
+; CHECK-NEXT: vmovx.f16 s8, s5
+; CHECK-NEXT: vins.f16 s3, s8
+; CHECK-NEXT: vmov.f32 s1, s4
+; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
%s1 = getelementptr <2 x half>, <2 x half>* %src, i32 0
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
-; CHECK-NEXT: ldrd lr, r12, [r0]
-; CHECK-NEXT: ldrd r3, r2, [r0, #8]
-; CHECK-NEXT: vmov.32 q0[0], lr
+; CHECK-NEXT: ldrd r2, r12, [r0]
+; CHECK-NEXT: ldrd r3, lr, [r0, #8]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrd r2, r0, [r0, #16]
; CHECK-NEXT: vmov.32 q1[0], r3
; CHECK-NEXT: vmov.32 q0[1], r12
-; CHECK-NEXT: vmov.32 q1[1], r2
-; CHECK-NEXT: ldrd r2, r0, [r0, #16]
+; CHECK-NEXT: vmov.32 q1[1], lr
; CHECK-NEXT: vmov.f32 s2, s4
; CHECK-NEXT: vmov.f32 s3, s5
-; CHECK-NEXT: vmov.f32 s8, s1
-; CHECK-NEXT: vmovx.f16 s12, s3
-; CHECK-NEXT: vins.f16 s8, s5
-; CHECK-NEXT: vmov.32 q1[0], r2
-; CHECK-NEXT: vmov.32 q1[1], r0
-; CHECK-NEXT: vmov.f32 s6, s4
-; CHECK-NEXT: vmov.f32 s7, s5
-; CHECK-NEXT: vmov.f32 s9, s5
-; CHECK-NEXT: vins.f16 s9, s5
-; CHECK-NEXT: vmovx.f16 s10, s1
-; CHECK-NEXT: vins.f16 s10, s12
-; CHECK-NEXT: vmovx.f16 s12, s5
-; CHECK-NEXT: vmovx.f16 s11, s5
-; CHECK-NEXT: vins.f16 s11, s12
-; CHECK-NEXT: vstrh.16 q2, [r1, #16]
-; CHECK-NEXT: vmovx.f16 s8, s0
+; CHECK-NEXT: vmovx.f16 s12, s0
; CHECK-NEXT: vins.f16 s0, s2
-; CHECK-NEXT: vmovx.f16 s3, s4
-; CHECK-NEXT: vins.f16 s4, s6
-; CHECK-NEXT: vmov.f32 s1, s4
-; CHECK-NEXT: vmovx.f16 s10, s2
+; CHECK-NEXT: vmovx.f16 s4, s2
+; CHECK-NEXT: vins.f16 s12, s4
+; CHECK-NEXT: vmovx.f16 s6, s1
+; CHECK-NEXT: vins.f16 s1, s3
+; CHECK-NEXT: vmovx.f16 s8, s3
+; CHECK-NEXT: vins.f16 s6, s8
+; CHECK-NEXT: vmov.32 q2[0], r2
+; CHECK-NEXT: vmov.32 q2[1], r0
+; CHECK-NEXT: vmov.f32 s10, s8
+; CHECK-NEXT: vmov.f32 s11, s9
+; CHECK-NEXT: vmovx.f16 s3, s8
; CHECK-NEXT: vins.f16 s8, s10
-; CHECK-NEXT: vmovx.f16 s4, s6
-; CHECK-NEXT: vmov.f32 s2, s8
-; CHECK-NEXT: vins.f16 s3, s4
+; CHECK-NEXT: vmovx.f16 s14, s10
+; CHECK-NEXT: vmovx.f16 s7, s9
+; CHECK-NEXT: vins.f16 s3, s14
+; CHECK-NEXT: vins.f16 s9, s11
+; CHECK-NEXT: vmovx.f16 s14, s11
+; CHECK-NEXT: vins.f16 s7, s14
+; CHECK-NEXT: vmov.f32 s4, s1
+; CHECK-NEXT: vmov.f32 s1, s8
+; CHECK-NEXT: vmov.f32 s5, s9
+; CHECK-NEXT: vmov.f32 s2, s12
+; CHECK-NEXT: vstrh.16 q1, [r1, #16]
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: pop {r7, pc}
entry:
define void @vst4_v8f16_align1(<8 x half> *%src, <32 x half> *%dst) {
; CHECK-LABEL: vst4_v8f16_align1:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12}
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: .pad #8
+; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: vldrw.u32 q4, [r0, #16]
-; CHECK-NEXT: vldrw.u32 q5, [r0]
-; CHECK-NEXT: vmovx.f16 s2, s21
-; CHECK-NEXT: vins.f16 s21, s17
-; CHECK-NEXT: vmov.f32 s0, s21
-; CHECK-NEXT: vmovx.f16 s4, s17
-; CHECK-NEXT: vins.f16 s2, s4
-; CHECK-NEXT: vmovx.f16 s8, s20
-; CHECK-NEXT: vmovx.f16 s4, s16
-; CHECK-NEXT: vins.f16 s20, s16
-; CHECK-NEXT: vins.f16 s8, s4
-; CHECK-NEXT: vmov q1, q5
-; CHECK-NEXT: vmov.f32 s6, s8
-; CHECK-NEXT: vmovx.f16 s10, s23
-; CHECK-NEXT: vins.f16 s23, s19
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vmovx.f16 s30, s5
+; CHECK-NEXT: vmovx.f16 s8, s17
+; CHECK-NEXT: vins.f16 s5, s17
+; CHECK-NEXT: vins.f16 s30, s8
+; CHECK-NEXT: vmovx.f16 s0, s4
+; CHECK-NEXT: vmovx.f16 s8, s16
+; CHECK-NEXT: vins.f16 s4, s16
+; CHECK-NEXT: vins.f16 s0, s8
+; CHECK-NEXT: vmovx.f16 s10, s7
; CHECK-NEXT: vmovx.f16 s12, s19
-; CHECK-NEXT: vmov.f32 s8, s23
-; CHECK-NEXT: vmovx.f16 s16, s18
+; CHECK-NEXT: vins.f16 s7, s19
; CHECK-NEXT: vins.f16 s10, s12
-; CHECK-NEXT: vmovx.f16 s14, s22
-; CHECK-NEXT: vins.f16 s22, s18
-; CHECK-NEXT: vmov.f32 s12, s22
+; CHECK-NEXT: vmovx.f16 s14, s6
+; CHECK-NEXT: vmovx.f16 s16, s18
+; CHECK-NEXT: vins.f16 s6, s18
; CHECK-NEXT: vins.f16 s14, s16
; CHECK-NEXT: vldrw.u32 q4, [r0, #32]
-; CHECK-NEXT: vmov.f32 s13, s18
+; CHECK-NEXT: vstr s0, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: vmov.f32 s28, s5
; CHECK-NEXT: vmovx.f16 s24, s17
; CHECK-NEXT: vins.f16 s17, s17
-; CHECK-NEXT: vins.f16 s24, s24
-; CHECK-NEXT: vmov q5, q4
-; CHECK-NEXT: vmovx.f16 s7, s16
-; CHECK-NEXT: vmov.f32 s23, s24
+; CHECK-NEXT: vmovx.f16 s23, s16
; CHECK-NEXT: vins.f16 s16, s16
-; CHECK-NEXT: vmov.f32 s5, s16
; CHECK-NEXT: vmovx.f16 s11, s19
; CHECK-NEXT: vins.f16 s19, s19
-; CHECK-NEXT: vins.f16 s13, s13
-; CHECK-NEXT: vmov.f32 s9, s19
; CHECK-NEXT: vmovx.f16 s15, s18
-; CHECK-NEXT: vmov.f32 s1, s21
-; CHECK-NEXT: vins.f16 s7, s7
+; CHECK-NEXT: vins.f16 s18, s18
+; CHECK-NEXT: vins.f16 s24, s24
+; CHECK-NEXT: vmov q0, q4
+; CHECK-NEXT: vmov.f32 s3, s24
+; CHECK-NEXT: vmov q6, q1
+; CHECK-NEXT: vmov.f32 s29, s1
; CHECK-NEXT: vins.f16 s11, s11
; CHECK-NEXT: vins.f16 s15, s15
+; CHECK-NEXT: vins.f16 s23, s23
+; CHECK-NEXT: vmov.f32 s8, s7
+; CHECK-NEXT: vmov.f32 s12, s6
+; CHECK-NEXT: vmov.f32 s9, s19
+; CHECK-NEXT: vmov.f32 s13, s18
; CHECK-NEXT: vstrb.8 q2, [r1, #48]
+; CHECK-NEXT: vmov.f32 s31, s3
+; CHECK-NEXT: vldr s0, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: vmov.f32 s21, s16
; CHECK-NEXT: vstrb.8 q3, [r1, #32]
-; CHECK-NEXT: vmov.f32 s3, s23
-; CHECK-NEXT: vstrb.8 q1, [r1]
-; CHECK-NEXT: vstrb.8 q0, [r1, #16]
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12}
+; CHECK-NEXT: vmov.f32 s26, s0
+; CHECK-NEXT: vstrb.8 q7, [r1, #16]
+; CHECK-NEXT: vmov.f32 s25, s16
+; CHECK-NEXT: vmov.f32 s27, s23
+; CHECK-NEXT: vstrb.8 q6, [r1]
+; CHECK-NEXT: add sp, #8
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: bx lr
entry:
%s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0