From: David Green
Date: Fri, 18 Dec 2020 13:33:40 +0000 (+0000)
Subject: Revert "[ARM] Match dual lane vmovs from insert_vector_elt"
X-Git-Tag: llvmorg-13-init~2967
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=6e913e44519245a79ec098c9b1459007dae84804;p=platform%2Fupstream%2Fllvm.git

Revert "[ARM] Match dual lane vmovs from insert_vector_elt"

This one needed more testing.
---

diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 2d93793..d792240 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -4790,14 +4790,6 @@ bool ARMBaseInstrInfo::verifyInstruction(const MachineInstr &MI,
       }
     }
   }
-  if (MI.getOpcode() == ARM::MVE_VMOV_q_rr) {
-    assert(MI.getOperand(4).isImm() && MI.getOperand(5).isImm());
-    if ((MI.getOperand(4).getImm() != 2 && MI.getOperand(4).getImm() != 3) ||
-        MI.getOperand(4).getImm() != MI.getOperand(5).getImm() + 2) {
-      ErrInfo = "Incorrect array index for MVE_VMOV_q_rr";
-      return false;
-    }
-  }
   return true;
 }
 
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 8595705..42498be 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -5845,41 +5845,6 @@ def MVE_VMOV_rr_q : MVE_VMOV_64bit<(outs rGPR:$Rt, rGPR:$Rt2), (ins MQPR:$Qd),
   let AsmMatchConverter = "cvtMVEVMOVQtoDReg";
 }
 
-let Predicates = [HasMVEInt] in {
-  // Double lane moves. There are a number of patterns here. We know that the
-  // insertelt's will be in descending order by index, and need to match the 5
-  // patterns that might contain 2-0 or 3-1 pairs. These are:
-  // 3 2 1 0 -> vmovqrr 31; vmovqrr 20
-  // 3 2 1   -> vmovqrr 31; vmov 2
-  // 3   1   -> vmovqrr 31
-  //   2 1 0 -> vmovqrr 20; vmov 1
-  //   2   0 -> vmovqrr 20
-  // The other potential patterns will be handled by single lane inserts.
-  def : Pat<(insertelt (insertelt (insertelt (insertelt (v4i32 MQPR:$src1),
-                                                         rGPR:$srcA, (i32 0)),
-                                              rGPR:$srcB, (i32 1)),
-                                   rGPR:$srcC, (i32 2)),
-                        rGPR:$srcD, (i32 3)),
-            (MVE_VMOV_q_rr (MVE_VMOV_q_rr MQPR:$src1, rGPR:$srcC, rGPR:$srcA, (i32 2), (i32 0)),
-                           rGPR:$srcD, rGPR:$srcB, (i32 3), (i32 1))>;
-  def : Pat<(insertelt (insertelt (insertelt (v4i32 MQPR:$src1),
-                                              rGPR:$srcB, (i32 1)),
-                                   rGPR:$srcC, (i32 2)),
-                        rGPR:$srcD, (i32 3)),
-            (MVE_VMOV_q_rr (MVE_VMOV_to_lane_32 MQPR:$src1, rGPR:$srcC, (i32 2)),
-                           rGPR:$srcD, rGPR:$srcB, (i32 3), (i32 1))>;
-  def : Pat<(insertelt (insertelt (v4i32 MQPR:$src1), rGPR:$srcA, (i32 1)), rGPR:$srcB, (i32 3)),
-            (MVE_VMOV_q_rr MQPR:$src1, rGPR:$srcB, rGPR:$srcA, (i32 3), (i32 1))>;
-  def : Pat<(insertelt (insertelt (insertelt (v4i32 MQPR:$src1),
-                                              rGPR:$srcB, (i32 0)),
-                                   rGPR:$srcC, (i32 1)),
-                        rGPR:$srcD, (i32 2)),
-            (MVE_VMOV_q_rr (MVE_VMOV_to_lane_32 MQPR:$src1, rGPR:$srcC, (i32 1)),
-                           rGPR:$srcD, rGPR:$srcB, (i32 2), (i32 0))>;
-  def : Pat<(insertelt (insertelt (v4i32 MQPR:$src1), rGPR:$srcA, (i32 0)), rGPR:$srcB, (i32 2)),
-            (MVE_VMOV_q_rr MQPR:$src1, rGPR:$srcB, rGPR:$srcA, (i32 2), (i32 0))>;
-}
-
 // end of coproc mov
 
 // start of MVE interleaving load/store
diff --git a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
index 09f5942..7294931 100644
--- a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
+++ b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
@@ -39,7 +39,6 @@ define <7 x i32> @v7i32(i32 %index, i32 %TC, <7 x i32> %V1, <7 x i32> %V2) {
 ; CHECK-NEXT:    adr r3, .LCPI1_0
 ; CHECK-NEXT:    vdup.32 q1, r1
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
-; CHECK-NEXT:    ldr r3, [sp, #40]
 ; CHECK-NEXT:    vadd.i32 q2, q0, r1
 ; CHECK-NEXT:    vdup.32 q0, r2
 ; CHECK-NEXT:    vcmp.u32 hi, q1, q2
@@ -47,16 +46,21 @@ define <7 x i32> @v7i32(i32 %index, i32 %TC, <7 x i32> %V1, <7 x i32> %V2) {
 ; CHECK-NEXT:    vpnot
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vcmpt.u32 hi, q0, q2
-; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT:    vmov.32 q2[0], r2
 ; CHECK-NEXT:    ldr r2, [sp, #36]
-; CHECK-NEXT:    ldr r3, [sp, #44]
-; CHECK-NEXT:    vmov q2[3], q2[1], r3, r2
+; CHECK-NEXT:    vmov.32 q2[1], r2
+; CHECK-NEXT:    ldr r2, [sp, #40]
+; CHECK-NEXT:    vmov.32 q2[2], r2
+; CHECK-NEXT:    ldr r2, [sp, #44]
+; CHECK-NEXT:    vmov.32 q2[3], r2
 ; CHECK-NEXT:    ldr r2, [sp]
-; CHECK-NEXT:    ldr r3, [sp, #8]
-; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
+; CHECK-NEXT:    vmov.32 q3[0], r2
 ; CHECK-NEXT:    ldr r2, [sp, #4]
-; CHECK-NEXT:    ldr r3, [sp, #12]
-; CHECK-NEXT:    vmov q3[3], q3[1], r3, r2
+; CHECK-NEXT:    vmov.32 q3[1], r2
+; CHECK-NEXT:    ldr r2, [sp, #8]
+; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    ldr r2, [sp, #12]
+; CHECK-NEXT:    vmov.32 q3[3], r2
 ; CHECK-NEXT:    adr r2, .LCPI1_1
 ; CHECK-NEXT:    vpsel q2, q3, q2
 ; CHECK-NEXT:    vstrw.32 q2, [r0]
@@ -66,19 +70,21 @@ define <7 x i32> @v7i32(i32 %index, i32 %TC, <7 x i32> %V1, <7 x i32> %V2) {
 ; CHECK-NEXT:    vcmp.u32 hi, q1, q2
 ; CHECK-NEXT:    vmrs r1, p0
 ; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    ldr r2, [sp, #56]
 ; CHECK-NEXT:    vmsr p0, r1
-; CHECK-NEXT:    ldr r1, [sp, #52]
+; CHECK-NEXT:    ldr r1, [sp, #48]
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vcmpt.u32 hi, q0, q2
+; CHECK-NEXT:    vmov.32 q0[0], r1
+; CHECK-NEXT:    ldr r1, [sp, #52]
 ; CHECK-NEXT:    vmov.32 q0[1], r1
-; CHECK-NEXT:    ldr r1, [sp, #48]
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r1
+; CHECK-NEXT:    ldr r1, [sp, #56]
+; CHECK-NEXT:    vmov.32 q0[2], r1
+; CHECK-NEXT:    ldr r1, [sp, #16]
+; CHECK-NEXT:    vmov.32 q1[0], r1
 ; CHECK-NEXT:    ldr r1, [sp, #20]
-; CHECK-NEXT:    ldr r2, [sp, #24]
 ; CHECK-NEXT:    vmov.32 q1[1], r1
-; CHECK-NEXT: ldr r1, [sp, #16] -; CHECK-NEXT: vmov q1[2], q1[0], r2, r1 +; CHECK-NEXT: ldr r1, [sp, #24] +; CHECK-NEXT: vmov.32 q1[2], r1 ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: vmov.f32 s2, s1 @@ -407,75 +413,81 @@ define <16 x i8> @v16i8(i32 %index, i32 %TC, <16 x i8> %V1, <16 x i8> %V2) { define void @test_width2(i32* nocapture readnone %x, i32* nocapture %y, i8 zeroext %m) { ; CHECK-LABEL: test_width2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: beq.w .LBB4_3 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: adds r0, r2, #1 -; CHECK-NEXT: vmov q1[2], q1[0], r2, r2 +; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: bic r0, r0, #1 -; CHECK-NEXT: adr r2, .LCPI4_0 +; CHECK-NEXT: vmov.32 q2[0], r2 ; CHECK-NEXT: subs r0, #2 -; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: vmov.i64 q0, #0xffffffff -; CHECK-NEXT: vldrw.u32 q2, [r2] +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: movs r6, #0 ; CHECK-NEXT: add.w lr, r3, r0, lsr #1 -; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: adr r3, .LCPI4_0 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: vand q1, q1, q0 +; CHECK-NEXT: vldrw.u32 q1, [r3] +; CHECK-NEXT: vand q2, q2, q0 ; CHECK-NEXT: .LBB4_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmov q3[2], q3[0], r8, r8 -; CHECK-NEXT: vmov r7, s4 +; CHECK-NEXT: vmov.32 q3[0], r6 +; CHECK-NEXT: vmov r5, s8 +; CHECK-NEXT: vmov.32 q3[2], r6 +; CHECK-NEXT: vmov r0, s9 ; CHECK-NEXT: vand q3, q3, q0 -; CHECK-NEXT: vmov r6, s5 +; CHECK-NEXT: adds r6, #2 ; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: add.w r8, r8, #2 -; CHECK-NEXT: vmov r9, s12 ; CHECK-NEXT: vmov r2, s15 -; CHECK-NEXT: vmov r0, s7 ; CHECK-NEXT: adds r3, #1 -; CHECK-NEXT: vmov q3[2], q3[0], r3, r9 -; CHECK-NEXT: vand q3, q3, q0 ; CHECK-NEXT: adc r12, r2, #0 -; CHECK-NEXT: vmov r5, s12 -; CHECK-NEXT: vmov r4, s13 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: subs r7, r5, r7 -; CHECK-NEXT: vmov r7, s14 -; CHECK-NEXT: sbcs r4, r6 -; CHECK-NEXT: vmov r6, s15 -; CHECK-NEXT: mov.w r4, #0 -; CHECK-NEXT: it lo -; CHECK-NEXT: movlo r4, #1 -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: csetm r4, ne -; CHECK-NEXT: subs r2, r7, r2 -; CHECK-NEXT: sbcs.w r0, r6, r0 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: vmov.32 q3[2], r3 +; CHECK-NEXT: vand q3, q3, q0 +; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: teq.w r4, r2 +; CHECK-NEXT: cset r2, ne +; CHECK-NEXT: tst.w r2, #1 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: vmov.32 q4[0], r2 +; CHECK-NEXT: vmov.32 q4[1], r2 +; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: eors r3, r2 +; CHECK-NEXT: orrs.w r3, r3, r12 +; CHECK-NEXT: cset r3, ne +; CHECK-NEXT: tst.w r3, #1 +; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: subs r5, r4, r5 +; CHECK-NEXT: vmov.32 q4[2], r3 +; CHECK-NEXT: vmov r5, s10 +; CHECK-NEXT: vmov.32 q4[3], r3 +; CHECK-NEXT: vmov r3, s13 +; CHECK-NEXT: veor q4, q4, q1 +; CHECK-NEXT: sbcs.w r0, r3, r0 +; CHECK-NEXT: vmov r3, s11 ; CHECK-NEXT: mov.w r0, #0 ; CHECK-NEXT: it lo ; CHECK-NEXT: movlo r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vmov q3[2], q3[0], r0, r4 -; CHECK-NEXT: vmov q3[3], q3[1], r0, r4 -; CHECK-NEXT: eor.w r0, r7, r3 -; CHECK-NEXT: orrs.w r0, r0, r12 -; CHECK-NEXT: cset r0, ne -; CHECK-NEXT: tst.w r0, #1 -; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: teq.w r5, r9 -; CHECK-NEXT: cset r2, ne 
-; CHECK-NEXT: tst.w r2, #1 -; CHECK-NEXT: csetm r2, ne -; CHECK-NEXT: vmov q4[2], q4[0], r0, r2 -; CHECK-NEXT: vmov q4[3], q4[1], r0, r2 -; CHECK-NEXT: veor q4, q4, q2 -; CHECK-NEXT: vand q4, q4, q3 +; CHECK-NEXT: subs r2, r2, r5 +; CHECK-NEXT: vmov.32 q5[0], r0 +; CHECK-NEXT: vmov.32 q5[1], r0 +; CHECK-NEXT: vmov r0, s15 ; CHECK-NEXT: @ implicit-def: $q3 +; CHECK-NEXT: sbcs r0, r3 +; CHECK-NEXT: mov.w r0, #0 +; CHECK-NEXT: it lo +; CHECK-NEXT: movlo r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q5[2], r0 +; CHECK-NEXT: vmov.32 q5[3], r0 +; CHECK-NEXT: vand q4, q4, q5 ; CHECK-NEXT: vmov r2, s16 ; CHECK-NEXT: vmov r0, s18 ; CHECK-NEXT: and r2, r2, #1 @@ -507,9 +519,8 @@ define void @test_width2(i32* nocapture readnone %x, i32* nocapture %y, i8 zeroe ; CHECK-NEXT: le lr, .LBB4_2 ; CHECK-NEXT: .LBB4_3: @ %for.cond.cleanup ; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: pop {r4, r5, r6, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.4: ; CHECK-NEXT: .LCPI4_0: diff --git a/llvm/test/CodeGen/Thumb2/mve-abs.ll b/llvm/test/CodeGen/Thumb2/mve-abs.ll index 80681d1..8a9b881 100644 --- a/llvm/test/CodeGen/Thumb2/mve-abs.ll +++ b/llvm/test/CodeGen/Thumb2/mve-abs.ll @@ -42,18 +42,21 @@ define arm_aapcs_vfpcc <2 x i64> @abs_v2i64(<2 x i64> %s1) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov r0, s1 ; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: adds.w r1, r1, r0, asr #31 -; CHECK-NEXT: adc.w r12, r0, r0, asr #31 +; CHECK-NEXT: adc.w r2, r0, r0, asr #31 +; CHECK-NEXT: eor.w r2, r2, r0, asr #31 +; CHECK-NEXT: eor.w r0, r1, r0, asr #31 +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov.32 q1[1], r2 +; CHECK-NEXT: adds.w r1, r1, r0, asr #31 ; CHECK-NEXT: eor.w r1, r1, r0, asr #31 -; CHECK-NEXT: adds.w r2, r2, r3, asr #31 -; CHECK-NEXT: eor.w r0, r12, r0, asr #31 -; CHECK-NEXT: eor.w r2, r2, r3, asr #31 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r1 -; CHECK-NEXT: adc.w r1, r3, r3, asr #31 -; CHECK-NEXT: eor.w r1, r1, r3, asr #31 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 +; CHECK-NEXT: vmov.32 q1[2], r1 +; CHECK-NEXT: adc.w r1, r0, r0, asr #31 +; CHECK-NEXT: eor.w r0, r1, r0, asr #31 +; CHECK-NEXT: vmov.32 q1[3], r0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %0 = icmp slt <2 x i64> %s1, zeroinitializer diff --git a/llvm/test/CodeGen/Thumb2/mve-div-expand.ll b/llvm/test/CodeGen/Thumb2/mve-div-expand.ll index ad21dc1..dc1ed2e 100644 --- a/llvm/test/CodeGen/Thumb2/mve-div-expand.ll +++ b/llvm/test/CodeGen/Thumb2/mve-div-expand.ll @@ -8,17 +8,19 @@ define arm_aapcs_vfpcc <4 x i32> @udiv_i32(<4 x i32> %in1, <4 x i32> %in2) { ; CHECK-NEXT: vmov r0, s4 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: udiv r0, r1, r0 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vmov.32 q2[0], r0 ; CHECK-NEXT: udiv r1, r2, r1 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.32 q2[1], r1 +; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: udiv r0, r1, r0 -; CHECK-NEXT: vmov r1, s7 -; CHECK-NEXT: udiv r1, r2, r1 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: udiv r0, r1, r0 +; CHECK-NEXT: 
vmov.32 q2[3], r0 ; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: bx lr entry: @@ -32,17 +34,19 @@ define arm_aapcs_vfpcc <4 x i32> @sdiv_i32(<4 x i32> %in1, <4 x i32> %in2) { ; CHECK-NEXT: vmov r0, s4 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: sdiv r0, r1, r0 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vmov.32 q2[0], r0 ; CHECK-NEXT: sdiv r1, r2, r1 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.32 q2[1], r1 +; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: sdiv r0, r1, r0 -; CHECK-NEXT: vmov r1, s7 -; CHECK-NEXT: sdiv r1, r2, r1 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: sdiv r0, r1, r0 +; CHECK-NEXT: vmov.32 q2[3], r0 ; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: bx lr entry: @@ -59,20 +63,22 @@ define arm_aapcs_vfpcc <4 x i32> @urem_i32(<4 x i32> %in1, <4 x i32> %in2) { ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: udiv r2, r1, r0 ; CHECK-NEXT: mls r12, r2, r0, r1 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: vmov r2, s1 ; CHECK-NEXT: udiv r3, r2, r1 ; CHECK-NEXT: mls lr, r3, r1, r2 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: udiv r0, r3, r2 ; CHECK-NEXT: mls r0, r0, r2, r3 ; CHECK-NEXT: vmov r2, s7 ; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: vmov q0[2], q0[0], lr, r12 +; CHECK-NEXT: vmov.32 q0[0], r12 ; CHECK-NEXT: udiv r1, r3, r2 +; CHECK-NEXT: vmov.32 q0[1], lr +; CHECK-NEXT: vmov.32 q0[2], r0 ; CHECK-NEXT: mls r1, r1, r2, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 +; CHECK-NEXT: vmov.32 q0[3], r1 ; CHECK-NEXT: pop {r7, pc} entry: %out = urem <4 x i32> %in1, %in2 @@ -88,20 +94,22 @@ define arm_aapcs_vfpcc <4 x i32> @srem_i32(<4 x i32> %in1, <4 x i32> %in2) { ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: sdiv r2, r1, r0 ; CHECK-NEXT: mls r12, r2, r0, r1 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: vmov r2, s1 ; CHECK-NEXT: sdiv r3, r2, r1 ; CHECK-NEXT: mls lr, r3, r1, r2 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: sdiv r0, r3, r2 ; CHECK-NEXT: mls r0, r0, r2, r3 ; CHECK-NEXT: vmov r2, s7 ; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: vmov q0[2], q0[0], lr, r12 +; CHECK-NEXT: vmov.32 q0[0], r12 ; CHECK-NEXT: sdiv r1, r3, r2 +; CHECK-NEXT: vmov.32 q0[1], lr +; CHECK-NEXT: vmov.32 q0[2], r0 ; CHECK-NEXT: mls r1, r1, r2, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 +; CHECK-NEXT: vmov.32 q0[3], r1 ; CHECK-NEXT: pop {r7, pc} entry: %out = srem <4 x i32> %in1, %in2 @@ -731,8 +739,8 @@ entry: define arm_aapcs_vfpcc <2 x i64> @udiv_i64(<2 x i64> %in1, <2 x i64> %in2) { ; CHECK-LABEL: udiv_i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov q4, q1 @@ -742,17 +750,20 @@ define arm_aapcs_vfpcc <2 x i64> @udiv_i64(<2 x i64> %in1, <2 x i64> %in2) { ; CHECK-NEXT: vmov r2, s16 ; CHECK-NEXT: vmov r3, s17 ; CHECK-NEXT: bl __aeabi_uldivmod -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r5, r1 -; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: vmov r1, s23 +; CHECK-NEXT: vmov r12, s22 +; CHECK-NEXT: 
vmov lr, s23 ; CHECK-NEXT: vmov r2, s18 ; CHECK-NEXT: vmov r3, s19 +; CHECK-NEXT: vmov.32 q4[0], r0 +; CHECK-NEXT: vmov.32 q4[1], r1 +; CHECK-NEXT: mov r0, r12 +; CHECK-NEXT: mov r1, lr ; CHECK-NEXT: bl __aeabi_uldivmod -; CHECK-NEXT: vmov q0[2], q0[0], r0, r4 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r5 +; CHECK-NEXT: vmov.32 q4[2], r0 +; CHECK-NEXT: vmov.32 q4[3], r1 +; CHECK-NEXT: vmov q0, q4 ; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r7, pc} entry: %out = udiv <2 x i64> %in1, %in2 ret <2 x i64> %out @@ -761,8 +772,8 @@ entry: define arm_aapcs_vfpcc <2 x i64> @sdiv_i64(<2 x i64> %in1, <2 x i64> %in2) { ; CHECK-LABEL: sdiv_i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov q4, q1 @@ -772,17 +783,20 @@ define arm_aapcs_vfpcc <2 x i64> @sdiv_i64(<2 x i64> %in1, <2 x i64> %in2) { ; CHECK-NEXT: vmov r2, s16 ; CHECK-NEXT: vmov r3, s17 ; CHECK-NEXT: bl __aeabi_ldivmod -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r5, r1 -; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: vmov r1, s23 +; CHECK-NEXT: vmov r12, s22 +; CHECK-NEXT: vmov lr, s23 ; CHECK-NEXT: vmov r2, s18 ; CHECK-NEXT: vmov r3, s19 +; CHECK-NEXT: vmov.32 q4[0], r0 +; CHECK-NEXT: vmov.32 q4[1], r1 +; CHECK-NEXT: mov r0, r12 +; CHECK-NEXT: mov r1, lr ; CHECK-NEXT: bl __aeabi_ldivmod -; CHECK-NEXT: vmov q0[2], q0[0], r0, r4 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r5 +; CHECK-NEXT: vmov.32 q4[2], r0 +; CHECK-NEXT: vmov.32 q4[3], r1 +; CHECK-NEXT: vmov q0, q4 ; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r7, pc} entry: %out = sdiv <2 x i64> %in1, %in2 ret <2 x i64> %out @@ -791,8 +805,8 @@ entry: define arm_aapcs_vfpcc <2 x i64> @urem_i64(<2 x i64> %in1, <2 x i64> %in2) { ; CHECK-LABEL: urem_i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov q4, q1 @@ -802,17 +816,20 @@ define arm_aapcs_vfpcc <2 x i64> @urem_i64(<2 x i64> %in1, <2 x i64> %in2) { ; CHECK-NEXT: vmov r2, s16 ; CHECK-NEXT: vmov r3, s17 ; CHECK-NEXT: bl __aeabi_uldivmod -; CHECK-NEXT: mov r4, r2 -; CHECK-NEXT: mov r5, r3 +; CHECK-NEXT: vmov r12, s18 +; CHECK-NEXT: vmov lr, s19 +; CHECK-NEXT: vmov.32 q4[0], r2 ; CHECK-NEXT: vmov r0, s22 +; CHECK-NEXT: vmov.32 q4[1], r3 ; CHECK-NEXT: vmov r1, s23 -; CHECK-NEXT: vmov r2, s18 -; CHECK-NEXT: vmov r3, s19 +; CHECK-NEXT: mov r2, r12 +; CHECK-NEXT: mov r3, lr ; CHECK-NEXT: bl __aeabi_uldivmod -; CHECK-NEXT: vmov q0[2], q0[0], r2, r4 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r5 +; CHECK-NEXT: vmov.32 q4[2], r2 +; CHECK-NEXT: vmov.32 q4[3], r3 +; CHECK-NEXT: vmov q0, q4 ; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r7, pc} entry: %out = urem <2 x i64> %in1, %in2 ret <2 x i64> %out @@ -821,8 +838,8 @@ entry: define arm_aapcs_vfpcc <2 x i64> @srem_i64(<2 x i64> %in1, <2 x i64> %in2) { ; CHECK-LABEL: srem_i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov q4, q1 @@ -832,17 +849,20 @@ 
define arm_aapcs_vfpcc <2 x i64> @srem_i64(<2 x i64> %in1, <2 x i64> %in2) { ; CHECK-NEXT: vmov r2, s16 ; CHECK-NEXT: vmov r3, s17 ; CHECK-NEXT: bl __aeabi_ldivmod -; CHECK-NEXT: mov r4, r2 -; CHECK-NEXT: mov r5, r3 +; CHECK-NEXT: vmov r12, s18 +; CHECK-NEXT: vmov lr, s19 +; CHECK-NEXT: vmov.32 q4[0], r2 ; CHECK-NEXT: vmov r0, s22 +; CHECK-NEXT: vmov.32 q4[1], r3 ; CHECK-NEXT: vmov r1, s23 -; CHECK-NEXT: vmov r2, s18 -; CHECK-NEXT: vmov r3, s19 +; CHECK-NEXT: mov r2, r12 +; CHECK-NEXT: mov r3, lr ; CHECK-NEXT: bl __aeabi_ldivmod -; CHECK-NEXT: vmov q0[2], q0[0], r2, r4 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r5 +; CHECK-NEXT: vmov.32 q4[2], r2 +; CHECK-NEXT: vmov.32 q4[3], r3 +; CHECK-NEXT: vmov q0, q4 ; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r7, pc} entry: %out = srem <2 x i64> %in1, %in2 ret <2 x i64> %out diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll index 45c5343..0f3a91c 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll @@ -589,46 +589,50 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_simple(i16* noalias nocapture read ; CHECK-NEXT: @ Parent Loop BB11_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vmov.u16 r7, q2[4] -; CHECK-NEXT: vmov.u16 r3, q2[6] -; CHECK-NEXT: vmov q4[2], q4[0], r3, r7 -; CHECK-NEXT: vmov.u16 r3, q2[5] -; CHECK-NEXT: vmov.u16 r7, q2[7] ; CHECK-NEXT: vmov.u16 r5, q2[0] -; CHECK-NEXT: vmov q4[3], q4[1], r7, r3 -; CHECK-NEXT: vmov.u16 r6, q2[2] +; CHECK-NEXT: vmov.32 q4[0], r7 +; CHECK-NEXT: vmov.u16 r7, q2[5] +; CHECK-NEXT: vmov.32 q4[1], r7 +; CHECK-NEXT: vmov.u16 r7, q2[6] +; CHECK-NEXT: vmov.32 q4[2], r7 +; CHECK-NEXT: vmov.u16 r7, q2[7] +; CHECK-NEXT: vmov.32 q4[3], r7 +; CHECK-NEXT: vmov.32 q3[0], r5 ; CHECK-NEXT: vmovlb.s16 q4, q4 -; CHECK-NEXT: vmov q3[2], q3[0], r6, r5 -; CHECK-NEXT: vshl.i32 q4, q4, #1 ; CHECK-NEXT: vmov.u16 r5, q2[1] +; CHECK-NEXT: vshl.i32 q4, q4, #1 +; CHECK-NEXT: vmov.32 q3[1], r5 ; CHECK-NEXT: vadd.i32 q4, q4, r0 -; CHECK-NEXT: vmov.u16 r6, q2[3] -; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: vmov q3[3], q3[1], r6, r5 +; CHECK-NEXT: vmov.u16 r5, q2[2] +; CHECK-NEXT: vmov r7, s16 +; CHECK-NEXT: vmov.32 q3[2], r5 +; CHECK-NEXT: vmov.u16 r5, q2[3] +; CHECK-NEXT: vmov r3, s17 +; CHECK-NEXT: vmov.32 q3[3], r5 +; CHECK-NEXT: vadd.i16 q2, q2, q1 ; CHECK-NEXT: vmovlb.s16 q3, q3 -; CHECK-NEXT: vmov r7, s17 ; CHECK-NEXT: vshl.i32 q3, q3, #1 -; CHECK-NEXT: vadd.i16 q2, q2, q1 ; CHECK-NEXT: vadd.i32 q3, q3, r0 ; CHECK-NEXT: vmov r5, s15 ; CHECK-NEXT: vmov r6, s14 ; CHECK-NEXT: vmov r12, s13 -; CHECK-NEXT: ldrh.w r11, [r3] -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: ldrh r7, [r7] +; CHECK-NEXT: ldrh.w r11, [r7] +; CHECK-NEXT: vmov r7, s12 +; CHECK-NEXT: ldrh r3, [r3] ; CHECK-NEXT: ldrh.w r9, [r5] ; CHECK-NEXT: vmov r5, s18 ; CHECK-NEXT: ldrh.w r10, [r6] ; CHECK-NEXT: vmov r6, s19 ; CHECK-NEXT: ldrh.w r1, [r12] -; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: vmov.16 q3[0], r3 +; CHECK-NEXT: ldrh r7, [r7] +; CHECK-NEXT: vmov.16 q3[0], r7 ; CHECK-NEXT: vmov.16 q3[1], r1 ; CHECK-NEXT: vmov.16 q3[2], r10 -; CHECK-NEXT: ldrh r5, [r5] ; CHECK-NEXT: vmov.16 q3[3], r9 -; CHECK-NEXT: ldrh r6, [r6] ; CHECK-NEXT: vmov.16 q3[4], r11 -; CHECK-NEXT: vmov.16 q3[5], r7 +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: vmov.16 q3[5], r3 +; CHECK-NEXT: ldrh r6, [r6] ; CHECK-NEXT: vmov.16 q3[6], r5 ; CHECK-NEXT: vmov.16 q3[7], r6 ; CHECK-NEXT: vstrb.8 q3, [r4], #16 @@ 
-700,27 +704,26 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(i16* noalias nocapture rea ; CHECK-NEXT: str r1, [sp, #60] @ 4-byte Spill ; CHECK-NEXT: blt.w .LBB12_5 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader -; CHECK-NEXT: bic r1, r2, #7 +; CHECK-NEXT: bic r8, r2, #7 ; CHECK-NEXT: adr r6, .LCPI12_2 -; CHECK-NEXT: sub.w r3, r1, #8 +; CHECK-NEXT: sub.w r3, r8, #8 ; CHECK-NEXT: vldrw.u32 q0, [r6] -; CHECK-NEXT: str r1, [sp, #56] @ 4-byte Spill ; CHECK-NEXT: movs r7, #1 +; CHECK-NEXT: vmov.i16 q3, #0x18 ; CHECK-NEXT: add.w r1, r7, r3, lsr #3 ; CHECK-NEXT: adr r3, .LCPI12_0 ; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r3] ; CHECK-NEXT: adr r7, .LCPI12_1 -; CHECK-NEXT: vmov.i16 q3, #0x18 +; CHECK-NEXT: str r1, [sp, #56] @ 4-byte Spill ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r7] -; CHECK-NEXT: str r1, [sp, #52] @ 4-byte Spill ; CHECK-NEXT: vstrw.32 q3, [sp, #64] @ 16-byte Spill ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill ; CHECK-NEXT: .LBB12_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB12_3 Depth 2 -; CHECK-NEXT: ldr r1, [sp, #52] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #56] @ 4-byte Reload ; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: ldr r4, [sp, #60] @ 4-byte Reload ; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload @@ -730,27 +733,41 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(i16* noalias nocapture rea ; CHECK-NEXT: @ Parent Loop BB12_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vmov.u16 r3, q5[0] -; CHECK-NEXT: vmov.u16 r5, q5[2] -; CHECK-NEXT: vmov q0[2], q0[0], r5, r3 -; CHECK-NEXT: vmov.u16 r3, q5[1] -; CHECK-NEXT: vmov.u16 r5, q5[3] ; CHECK-NEXT: vmov.u16 r7, q7[4] -; CHECK-NEXT: vmov q0[3], q0[1], r5, r3 -; CHECK-NEXT: vmov.u16 r5, q5[6] +; CHECK-NEXT: vmov.32 q0[0], r3 +; CHECK-NEXT: vmov.u16 r3, q5[1] +; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: vmov.u16 r3, q5[2] +; CHECK-NEXT: vmov.32 q0[2], r3 +; CHECK-NEXT: vmov.u16 r3, q5[3] +; CHECK-NEXT: vmov.32 q0[3], r3 +; CHECK-NEXT: vmov.u16 r12, q6[0] ; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vmov.u16 r12, q7[6] +; CHECK-NEXT: vmov.32 q1[0], r12 ; CHECK-NEXT: vshl.i32 q0, q0, #1 -; CHECK-NEXT: vmov.u16 r1, q7[7] +; CHECK-NEXT: vmov.u16 r1, q6[1] ; CHECK-NEXT: vadd.i32 q2, q0, r0 +; CHECK-NEXT: vmov.32 q1[1], r1 ; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: vmov.u16 r1, q6[2] +; CHECK-NEXT: vmov.32 q1[2], r1 +; CHECK-NEXT: vmov.u16 r1, q6[3] +; CHECK-NEXT: vmov.32 q1[3], r1 +; CHECK-NEXT: vmov.u16 r1, q6[4] +; CHECK-NEXT: vmovlb.s16 q1, q1 ; CHECK-NEXT: vmov r6, s11 +; CHECK-NEXT: vshl.i32 q1, q1, #1 +; CHECK-NEXT: vadd.i32 q4, q1, r0 ; CHECK-NEXT: ldrh.w r9, [r3] ; CHECK-NEXT: vmov.u16 r3, q5[4] -; CHECK-NEXT: vmov q0[2], q0[0], r5, r3 +; CHECK-NEXT: vmov.32 q0[0], r3 ; CHECK-NEXT: vmov.u16 r3, q5[5] -; CHECK-NEXT: vmov.u16 r5, q5[7] +; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: vmov.u16 r3, q5[6] +; CHECK-NEXT: vmov.32 q0[2], r3 +; CHECK-NEXT: vmov.u16 r3, q5[7] +; CHECK-NEXT: vmov.32 q0[3], r3 ; CHECK-NEXT: ldrh r6, [r6] -; CHECK-NEXT: vmov q0[3], q0[1], r5, r3 ; CHECK-NEXT: vmovlb.s16 q0, q0 ; CHECK-NEXT: vshl.i32 q0, q0, #1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 @@ -761,44 +778,42 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(i16* noalias nocapture rea ; CHECK-NEXT: ldrh r5, [r5] ; CHECK-NEXT: ldrh.w r11, [r3] ; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov q0[2], q0[0], r12, r7 +; CHECK-NEXT: vmov.32 q0[0], r7 ; CHECK-NEXT: vmov.u16 r7, q7[5] 
-; CHECK-NEXT: vmov q0[3], q0[1], r1, r7 +; CHECK-NEXT: vmov.32 q0[1], r7 +; CHECK-NEXT: vmov.u16 r7, q7[6] +; CHECK-NEXT: vmov.32 q0[2], r7 +; CHECK-NEXT: vmov.u16 r7, q7[7] +; CHECK-NEXT: vmov.32 q0[3], r7 ; CHECK-NEXT: vmovlb.s16 q0, q0 ; CHECK-NEXT: vshl.i32 q0, q0, #1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r7, s2 ; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: ldrh.w r8, [r3] -; CHECK-NEXT: vmov.u16 r3, q6[2] -; CHECK-NEXT: ldrh r7, [r1] -; CHECK-NEXT: vmov.u16 r1, q6[0] -; CHECK-NEXT: vmov q1[2], q1[0], r3, r1 -; CHECK-NEXT: vmov.u16 r1, q6[1] -; CHECK-NEXT: vmov.u16 r3, q6[3] -; CHECK-NEXT: vmov q1[3], q1[1], r3, r1 -; CHECK-NEXT: vmov.u16 r1, q6[4] -; CHECK-NEXT: vmov.u16 r3, q6[6] -; CHECK-NEXT: vmovlb.s16 q1, q1 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r1 +; CHECK-NEXT: vmov.32 q0[0], r1 ; CHECK-NEXT: vmov.u16 r1, q6[5] -; CHECK-NEXT: vmov.u16 r3, q6[7] -; CHECK-NEXT: vshl.i32 q1, q1, #1 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 +; CHECK-NEXT: vmov.32 q0[1], r1 +; CHECK-NEXT: vmov.u16 r1, q6[6] +; CHECK-NEXT: vmov.32 q0[2], r1 +; CHECK-NEXT: vmov.u16 r1, q6[7] +; CHECK-NEXT: vmov.32 q0[3], r1 ; CHECK-NEXT: vmov.u16 r1, q7[0] -; CHECK-NEXT: vmov.u16 r3, q7[2] -; CHECK-NEXT: vadd.i32 q4, q1, r0 -; CHECK-NEXT: vmov q3[2], q3[0], r3, r1 +; CHECK-NEXT: vmov.32 q3[0], r1 ; CHECK-NEXT: vmov.u16 r1, q7[1] -; CHECK-NEXT: vmov.u16 r3, q7[3] -; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vmov q3[3], q3[1], r3, r1 +; CHECK-NEXT: vmov.32 q3[1], r1 +; CHECK-NEXT: vmov.u16 r1, q7[2] +; CHECK-NEXT: vmov.32 q3[2], r1 +; CHECK-NEXT: vmov.u16 r1, q7[3] +; CHECK-NEXT: vmov.32 q3[3], r1 ; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vmovlb.s16 q0, q0 ; CHECK-NEXT: vmovlb.s16 q3, q3 -; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vshl.i32 q0, q0, #1 ; CHECK-NEXT: vshl.i32 q3, q3, #1 +; CHECK-NEXT: vadd.i32 q0, q0, r0 ; CHECK-NEXT: vadd.i32 q3, q3, r0 +; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: ldrh r7, [r7] ; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: vmov.16 q1[0], r1 ; CHECK-NEXT: vmov r1, s9 @@ -809,7 +824,7 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(i16* noalias nocapture rea ; CHECK-NEXT: vmov.16 q1[3], r6 ; CHECK-NEXT: vmov.16 q1[4], r10 ; CHECK-NEXT: vmov.16 q1[5], r11 -; CHECK-NEXT: vmov.16 q1[6], r8 +; CHECK-NEXT: vmov.16 q1[6], r3 ; CHECK-NEXT: vmov.16 q1[7], r5 ; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: vmov.16 q2[0], r1 @@ -867,8 +882,7 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(i16* noalias nocapture rea ; CHECK-NEXT: le lr, .LBB12_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB12_2 Depth=1 -; CHECK-NEXT: ldr r1, [sp, #56] @ 4-byte Reload -; CHECK-NEXT: cmp r1, r2 +; CHECK-NEXT: cmp r8, r2 ; CHECK-NEXT: bne.w .LBB12_2 ; CHECK-NEXT: .LBB12_5: @ %for.cond.cleanup ; CHECK-NEXT: add sp, #104 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll index 67522ee..20e258d 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll @@ -462,15 +462,17 @@ define arm_aapcs_vfpcc <4 x i32> @qi4(<4 x i32*> %p) { ; CHECK-NEXT: vmov.i32 q1, #0x10 ; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov r3, s1 ; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: vmov r3, s3 +; CHECK-NEXT: vmov r2, s3 ; CHECK-NEXT: ldr r0, [r0] +; CHECK-NEXT: ldr r3, [r3] +; CHECK-NEXT: vmov.32 q0[0], r0 ; 
CHECK-NEXT: ldr r1, [r1] +; CHECK-NEXT: vmov.32 q0[1], r3 ; CHECK-NEXT: ldr r2, [r2] -; CHECK-NEXT: ldr r3, [r3] -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: vmov.32 q0[2], r1 +; CHECK-NEXT: vmov.32 q0[3], r2 ; CHECK-NEXT: bx lr entry: %g = getelementptr inbounds i32, <4 x i32*> %p, i32 4 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll index 2738e2c..236a695 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll @@ -63,13 +63,15 @@ define arm_aapcs_vfpcc <2 x i8> @unscaled_v2i8_i8(i8* %base, <2 x i8>* %offptr) ; CHECK-NEXT: ldrb r2, [r1] ; CHECK-NEXT: vmov.i32 q0, #0xff ; CHECK-NEXT: ldrb r1, [r1, #1] -; CHECK-NEXT: vmov q1[2], q1[0], r1, r2 +; CHECK-NEXT: vmov.32 q1[0], r2 +; CHECK-NEXT: vmov.32 q1[2], r1 ; CHECK-NEXT: vand q0, q1, q0 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: ldrb r1, [r0, r1] ; CHECK-NEXT: ldrb r0, [r0, r2] -; CHECK-NEXT: vmov q0[2], q0[0], r0, r1 +; CHECK-NEXT: vmov.32 q0[0], r1 +; CHECK-NEXT: vmov.32 q0[2], r0 ; CHECK-NEXT: bx lr entry: %offs = load <2 x i8>, <2 x i8>* %offptr, align 1 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll index 4f16967d..2a86ddb 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll @@ -9,7 +9,8 @@ define arm_aapcs_vfpcc <2 x i32> @ptr_v2i32(<2 x i32*>* %offptr) { ; CHECK-NEXT: ldrd r1, r0, [r0] ; CHECK-NEXT: ldr r0, [r0] ; CHECK-NEXT: ldr r1, [r1] -; CHECK-NEXT: vmov q0[2], q0[0], r0, r1 +; CHECK-NEXT: vmov.32 q0[0], r1 +; CHECK-NEXT: vmov.32 q0[2], r0 ; CHECK-NEXT: bx lr entry: %offs = load <2 x i32*>, <2 x i32*>* %offptr, align 4 @@ -36,26 +37,30 @@ define arm_aapcs_vfpcc <8 x i32> @ptr_v8i32(<8 x i32*>* %offptr) { ; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmov r5, s1 ; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: vmov r5, s3 ; CHECK-NEXT: ldr.w r12, [r1] -; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vmov r1, s5 ; CHECK-NEXT: ldr.w lr, [r2] -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: ldr r0, [r0] +; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: ldr r3, [r3] -; CHECK-NEXT: vmov q0[2], q0[0], lr, r12 -; CHECK-NEXT: ldr r4, [r4] +; CHECK-NEXT: ldr r0, [r0] ; CHECK-NEXT: ldr r5, [r5] -; CHECK-NEXT: vmov q0[3], q0[1], r5, r3 +; CHECK-NEXT: vmov.32 q0[0], r3 +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: ldr r4, [r4] +; CHECK-NEXT: vmov.32 q0[1], r5 +; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: vmov.32 q0[3], lr ; CHECK-NEXT: ldr r1, [r1] ; CHECK-NEXT: ldr r2, [r2] -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r4, r2 +; CHECK-NEXT: vmov.32 q1[1], r1 +; CHECK-NEXT: vmov.32 q1[2], r2 +; CHECK-NEXT: vmov.32 q1[3], r4 ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %offs = load <8 x i32*>, <8 x i32*>* %offptr, align 4 @@ -66,53 +71,61 @@ entry: define arm_aapcs_vfpcc <16 x i32> @ptr_v16i32(<16 x i32*>* %offptr) { ; CHECK-LABEL: ptr_v16i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: vldrw.u32 q2, [r0, #32] 
; CHECK-NEXT: vldrw.u32 q0, [r0, #48] ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: vmov r1, s10 ; CHECK-NEXT: vmov r5, s4 -; CHECK-NEXT: vmov r6, s6 ; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r7, s7 ; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov r4, s10 +; CHECK-NEXT: vmov r6, s7 +; CHECK-NEXT: vmov r4, s11 ; CHECK-NEXT: ldr.w r12, [r1] ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: ldr r5, [r5] -; CHECK-NEXT: ldr r6, [r6] ; CHECK-NEXT: ldr r2, [r2] -; CHECK-NEXT: ldr r7, [r7] ; CHECK-NEXT: ldr r0, [r0] +; CHECK-NEXT: ldr r6, [r6] ; CHECK-NEXT: ldr r4, [r4] ; CHECK-NEXT: ldr.w lr, [r1] -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: ldr r3, [r1] ; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: vmov q0[2], q0[0], r6, r5 -; CHECK-NEXT: vmov r6, s5 -; CHECK-NEXT: vmov r5, s9 +; CHECK-NEXT: ldr r3, [r1] +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov.32 q0[0], r5 +; CHECK-NEXT: vmov r5, s5 ; CHECK-NEXT: ldr r1, [r1] -; CHECK-NEXT: ldr r6, [r6] ; CHECK-NEXT: ldr r5, [r5] -; CHECK-NEXT: vmov q0[3], q0[1], r7, r6 -; CHECK-NEXT: vmov r6, s14 -; CHECK-NEXT: ldr r6, [r6] -; CHECK-NEXT: vmov q1[2], q1[0], r6, r0 +; CHECK-NEXT: vmov.32 q0[1], r5 +; CHECK-NEXT: vmov r5, s6 +; CHECK-NEXT: vmov.32 q1[0], r0 ; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: vmov r6, s15 -; CHECK-NEXT: vmov q3[2], q3[0], r3, lr -; CHECK-NEXT: vmov q3[3], q3[1], r2, r1 +; CHECK-NEXT: ldr r5, [r5] ; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: ldr r6, [r6] -; CHECK-NEXT: vmov q1[3], q1[1], r6, r0 -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov q2[2], q2[0], r4, r12 +; CHECK-NEXT: vmov.32 q0[2], r5 +; CHECK-NEXT: vmov r5, s8 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov.32 q0[3], r6 +; CHECK-NEXT: ldr r5, [r5] ; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: vmov q2[3], q2[1], r0, r5 -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: vmov.32 q3[0], lr +; CHECK-NEXT: vmov.32 q3[1], r3 +; CHECK-NEXT: vmov.32 q3[2], r1 +; CHECK-NEXT: vmov.32 q3[3], r2 +; CHECK-NEXT: ldr r0, [r0] +; CHECK-NEXT: vmov.32 q1[3], r0 +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vmov.32 q2[0], r5 +; CHECK-NEXT: ldr r0, [r0] +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov.32 q2[2], r12 +; CHECK-NEXT: vmov.32 q2[3], r4 +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %offs = load <16 x i32*>, <16 x i32*>* %offptr, align 4 %gather = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %offs, i32 4, <16 x i1> , <16 x i32> undef) @@ -220,10 +233,12 @@ define arm_aapcs_vfpcc <2 x i32> @ptr_v2i16_sext(<2 x i16*>* %offptr) { ; CHECK-NEXT: ldrd r1, r0, [r0] ; CHECK-NEXT: ldrsh.w r0, [r0] ; CHECK-NEXT: ldrsh.w r1, [r1] -; CHECK-NEXT: vmov q0[2], q0[0], r0, r1 -; CHECK-NEXT: asrs r0, r0, #31 +; CHECK-NEXT: asrs r2, r0, #31 +; CHECK-NEXT: vmov.32 q0[0], r1 ; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r1 +; CHECK-NEXT: vmov.32 q0[1], r1 +; CHECK-NEXT: vmov.32 q0[2], r0 +; CHECK-NEXT: vmov.32 q0[3], r2 ; CHECK-NEXT: bx lr entry: %offs = load <2 x i16*>, <2 x i16*>* %offptr, align 4 @@ -239,7 +254,8 @@ define arm_aapcs_vfpcc <2 x i32> @ptr_v2i16_zext(<2 x i16*>* %offptr) { ; CHECK-NEXT: vmov.i64 q0, #0xffff ; CHECK-NEXT: ldrh r0, [r0] ; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: vmov q1[2], q1[0], r0, r1 +; CHECK-NEXT: vmov.32 q1[0], r1 +; CHECK-NEXT: vmov.32 q1[2], r0 ; CHECK-NEXT: vand q0, q1, q0 ; CHECK-NEXT: bx lr entry: @@ -253,16 +269,18 @@ define arm_aapcs_vfpcc <4 x i32> @ptr_v4i16_sext(<4 x i16*>* 
%offptr) { ; CHECK-LABEL: ptr_v4i16_sext: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r1, s3 ; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: vmov.32 q0[0], r2 ; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 +; CHECK-NEXT: vmov.32 q0[1], r0 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.32 q0[2], r3 +; CHECK-NEXT: vmov.32 q0[3], r1 ; CHECK-NEXT: vmovlb.s16 q0, q0 ; CHECK-NEXT: bx lr entry: @@ -276,16 +294,18 @@ define arm_aapcs_vfpcc <4 x i32> @ptr_v4i16_zext(<4 x i16*>* %offptr) { ; CHECK-LABEL: ptr_v4i16_zext: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r1, s3 ; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: vmov.32 q0[0], r2 ; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 +; CHECK-NEXT: vmov.32 q0[1], r0 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.32 q0[2], r3 +; CHECK-NEXT: vmov.32 q0[3], r1 ; CHECK-NEXT: vmovlb.u16 q0, q0 ; CHECK-NEXT: bx lr entry: @@ -302,27 +322,31 @@ define arm_aapcs_vfpcc <8 x i32> @ptr_v8i16_sext(<8 x i16*>* %offptr) { ; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmov r5, s1 ; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: vmov r5, s3 ; CHECK-NEXT: ldrh.w r12, [r1] -; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vmov r1, s5 ; CHECK-NEXT: ldrh.w lr, [r2] -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: vmov q0[2], q0[0], lr, r12 -; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: ldrh r0, [r0] ; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: vmov q0[3], q0[1], r5, r3 +; CHECK-NEXT: vmov.32 q0[0], r3 +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: vmov.32 q0[1], r5 +; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: vmov.32 q0[3], lr ; CHECK-NEXT: vmovlb.s16 q0, q0 ; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r4, r2 +; CHECK-NEXT: vmov.32 q1[1], r1 +; CHECK-NEXT: vmov.32 q1[2], r2 +; CHECK-NEXT: vmov.32 q1[3], r4 ; CHECK-NEXT: vmovlb.s16 q1, q1 ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: @@ -339,27 +363,31 @@ define arm_aapcs_vfpcc <8 x i32> @ptr_v8i16_zext(<8 x i16*>* %offptr) { ; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmov r5, s1 ; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: vmov r5, s3 ; CHECK-NEXT: ldrh.w r12, [r1] -; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: 
vmov r1, s5 ; CHECK-NEXT: ldrh.w lr, [r2] -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: vmov q0[2], q0[0], lr, r12 -; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: ldrh r0, [r0] ; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: vmov q0[3], q0[1], r5, r3 +; CHECK-NEXT: vmov.32 q0[0], r3 +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: vmov.32 q0[1], r5 +; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: vmov.32 q0[3], lr ; CHECK-NEXT: vmovlb.u16 q0, q0 ; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r4, r2 +; CHECK-NEXT: vmov.32 q1[1], r1 +; CHECK-NEXT: vmov.32 q1[2], r2 +; CHECK-NEXT: vmov.32 q1[3], r4 ; CHECK-NEXT: vmovlb.u16 q1, q1 ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: @@ -566,15 +594,17 @@ define arm_aapcs_vfpcc <4 x i32> @ptr_v4i8_sext32(<4 x i8*>* %offptr) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov r3, s1 ; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: vmov r3, s3 +; CHECK-NEXT: vmov r2, s3 ; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: vmov.32 q0[0], r0 ; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.32 q0[1], r3 ; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: vmov.32 q0[2], r1 +; CHECK-NEXT: vmov.32 q0[3], r2 ; CHECK-NEXT: vmovlb.s8 q0, q0 ; CHECK-NEXT: vmovlb.s16 q0, q0 ; CHECK-NEXT: bx lr @@ -590,16 +620,18 @@ define arm_aapcs_vfpcc <4 x i32> @ptr_v4i8_zext32(<4 x i8*>* %offptr) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vmov.i32 q1, #0xff -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r0, s3 ; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.32 q0[0], r2 ; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov q0[2], q0[0], r2, r1 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r0 +; CHECK-NEXT: vmov.32 q0[1], r1 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.32 q0[2], r3 +; CHECK-NEXT: vmov.32 q0[3], r0 ; CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: bx lr entry: @@ -616,28 +648,32 @@ define arm_aapcs_vfpcc <8 x i32> @ptr_v8i8_sext32(<8 x i8*>* %offptr) { ; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmov r5, s1 ; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: vmov r5, s3 ; CHECK-NEXT: ldrb.w r12, [r1] -; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vmov r1, s5 ; CHECK-NEXT: ldrb.w lr, [r2] -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov q0[2], q0[0], lr, r12 -; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb r0, [r0] ; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: vmov q0[3], q0[1], r5, r3 +; CHECK-NEXT: vmov.32 q0[0], r3 +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: vmov.32 q0[1], r5 +; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: vmov.32 q0[3], lr ; CHECK-NEXT: vmovlb.s8 q0, q0 ; 
CHECK-NEXT: vmovlb.s16 q0, q0 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r4, r2 +; CHECK-NEXT: vmov.32 q1[1], r1 +; CHECK-NEXT: vmov.32 q1[2], r2 +; CHECK-NEXT: vmov.32 q1[3], r4 ; CHECK-NEXT: vmovlb.s8 q1, q1 ; CHECK-NEXT: vmovlb.s16 q1, q1 ; CHECK-NEXT: pop {r4, r5, r7, pc} @@ -655,28 +691,32 @@ define arm_aapcs_vfpcc <8 x i32> @ptr_v8i8_zext32(<8 x i8*>* %offptr) { ; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s3 ; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: vmov r4, s1 -; CHECK-NEXT: vmov r5, s3 +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: vmov r3, s5 +; CHECK-NEXT: vmov r5, s1 ; CHECK-NEXT: ldrb.w r12, [r1] -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: ldrb.w lr, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: ldrb.w lr, [r2] +; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: ldrb r0, [r0] ; CHECK-NEXT: vmov.i32 q1, #0xff -; CHECK-NEXT: ldrb r3, [r3] ; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: vmov q0[2], q0[0], r2, r12 +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: vmov.32 q2[0], r0 ; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: vmov q2[2], q2[0], r3, lr -; CHECK-NEXT: vmov q0[3], q0[1], r5, r4 +; CHECK-NEXT: vmov.32 q0[0], r4 +; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: vmov.32 q0[1], r5 +; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: vmov.32 q0[3], lr ; CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov q2[3], q2[1], r0, r1 +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: vmov.32 q2[2], r1 +; CHECK-NEXT: vmov.32 q2[3], r2 ; CHECK-NEXT: vand q1, q2, q1 ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: @@ -803,30 +843,34 @@ define arm_aapcs_vfpcc <8 x i32> @sext_unsigned_unscaled_i8_i8_toi64(i8* %base, ; CHECK-NEXT: vldrb.u32 q1, [r1, #4] ; CHECK-NEXT: vadd.i32 q0, q0, r0 ; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r3, s3 ; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: vmov r5, s1 ; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: vmov r5, s3 ; CHECK-NEXT: ldrb.w r12, [r2] -; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: ldrb.w lr, [r3] -; CHECK-NEXT: vmov r3, s5 +; CHECK-NEXT: vmov r3, s6 ; CHECK-NEXT: ldrb r0, [r0] ; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: vmov q0[2], q0[0], lr, r12 -; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: vmov.32 q1[1], r1 ; CHECK-NEXT: ldrb r2, [r2] ; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov q0[3], q0[1], r5, r2 -; CHECK-NEXT: vmov q1[3], q1[1], r4, r3 -; CHECK-NEXT: vmovlb.s8 q0, q0 +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmov.32 q0[1], r5 +; CHECK-NEXT: vmov.32 q1[2], r3 +; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: vmov.32 q1[3], r4 +; CHECK-NEXT: vmov.32 q0[3], lr ; CHECK-NEXT: vmovlb.s8 q1, q1 -; CHECK-NEXT: vmovlb.s16 q0, q0 +; CHECK-NEXT: vmovlb.s8 q0, q0 ; CHECK-NEXT: vmovlb.s16 q1, q1 +; CHECK-NEXT: vmovlb.s16 q0, q0 ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll 
b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll index 8d146d1..17b2881 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll @@ -8,15 +8,17 @@ define arm_aapcs_vfpcc <4 x i32> @unscaled_i32_i32_gather(i8* %base, <4 x i32>* ; NOGATSCAT-NEXT: vldrw.u32 q0, [r1] ; NOGATSCAT-NEXT: vadd.i32 q0, q0, r0 ; NOGATSCAT-NEXT: vmov r0, s0 +; NOGATSCAT-NEXT: vmov r3, s1 ; NOGATSCAT-NEXT: vmov r1, s2 -; NOGATSCAT-NEXT: vmov r2, s1 -; NOGATSCAT-NEXT: vmov r3, s3 +; NOGATSCAT-NEXT: vmov r2, s3 ; NOGATSCAT-NEXT: ldr r0, [r0] +; NOGATSCAT-NEXT: ldr r3, [r3] +; NOGATSCAT-NEXT: vmov.32 q0[0], r0 ; NOGATSCAT-NEXT: ldr r1, [r1] +; NOGATSCAT-NEXT: vmov.32 q0[1], r3 ; NOGATSCAT-NEXT: ldr r2, [r2] -; NOGATSCAT-NEXT: ldr r3, [r3] -; NOGATSCAT-NEXT: vmov q0[2], q0[0], r1, r0 -; NOGATSCAT-NEXT: vmov q0[3], q0[1], r3, r2 +; NOGATSCAT-NEXT: vmov.32 q0[2], r1 +; NOGATSCAT-NEXT: vmov.32 q0[3], r2 ; NOGATSCAT-NEXT: bx lr ; ; NOMVE-LABEL: unscaled_i32_i32_gather: diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll index 7b2343a..a3f06e5 100644 --- a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll +++ b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll @@ -93,61 +93,64 @@ define void @foo_sext_v2i64_v2i32(<2 x i64> *%dest, <2 x i32> *%mask, <2 x i32> ; CHECK-LE-NEXT: push {r4, r5, r7, lr} ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 -; CHECK-LE-NEXT: ldrd lr, r5, [r1] -; CHECK-LE-NEXT: movs r3, #0 -; CHECK-LE-NEXT: @ implicit-def: $q0 -; CHECK-LE-NEXT: rsbs.w r1, lr, #0 -; CHECK-LE-NEXT: vmov q1[2], q1[0], r5, lr -; CHECK-LE-NEXT: sbcs.w r1, r3, lr, asr #31 -; CHECK-LE-NEXT: mov.w r1, #0 +; CHECK-LE-NEXT: ldrd lr, r12, [r1] +; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: @ implicit-def: $q1 +; CHECK-LE-NEXT: movs r4, #0 +; CHECK-LE-NEXT: rsbs.w r3, lr, #0 +; CHECK-LE-NEXT: vmov.32 q0[0], lr +; CHECK-LE-NEXT: sbcs.w r3, r1, lr, asr #31 +; CHECK-LE-NEXT: mov.w lr, #0 ; CHECK-LE-NEXT: it lt -; CHECK-LE-NEXT: movlt r1, #1 -; CHECK-LE-NEXT: rsbs r4, r5, #0 -; CHECK-LE-NEXT: sbcs.w r4, r3, r5, asr #31 +; CHECK-LE-NEXT: movlt.w lr, #1 +; CHECK-LE-NEXT: rsbs.w r3, r12, #0 +; CHECK-LE-NEXT: sbcs.w r3, r1, r12, asr #31 ; CHECK-LE-NEXT: it lt -; CHECK-LE-NEXT: movlt r3, #1 -; CHECK-LE-NEXT: cmp r3, #0 +; CHECK-LE-NEXT: movlt r1, #1 +; CHECK-LE-NEXT: cmp r1, #0 ; CHECK-LE-NEXT: it ne -; CHECK-LE-NEXT: mvnne r3, #1 -; CHECK-LE-NEXT: bfi r3, r1, #0, #1 -; CHECK-LE-NEXT: vmov r4, s4 -; CHECK-LE-NEXT: and r12, r3, #3 -; CHECK-LE-NEXT: lsls r1, r3, #31 +; CHECK-LE-NEXT: mvnne r1, #1 +; CHECK-LE-NEXT: bfi r1, lr, #0, #1 +; CHECK-LE-NEXT: vmov.32 q0[2], r12 +; CHECK-LE-NEXT: and r3, r1, #3 +; CHECK-LE-NEXT: lsls r1, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: ldrne r1, [r2] -; CHECK-LE-NEXT: vmovne.32 q0[0], r1 -; CHECK-LE-NEXT: lsls.w r1, r12, #30 +; CHECK-LE-NEXT: vmovne.32 q1[0], r1 +; CHECK-LE-NEXT: lsls r1, r3, #30 ; CHECK-LE-NEXT: itt mi ; CHECK-LE-NEXT: ldrmi r1, [r2, #4] -; CHECK-LE-NEXT: vmovmi.32 q0[2], r1 -; CHECK-LE-NEXT: vmov r3, s2 -; CHECK-LE-NEXT: movs r2, #0 -; CHECK-LE-NEXT: vmov r1, s0 -; CHECK-LE-NEXT: vmov q0[2], q0[0], r3, r1 -; CHECK-LE-NEXT: rsbs r5, r4, #0 +; CHECK-LE-NEXT: vmovmi.32 q1[2], r1 +; CHECK-LE-NEXT: vmov r2, s0 +; CHECK-LE-NEXT: vmov r3, s4 +; CHECK-LE-NEXT: vmov r1, s6 +; CHECK-LE-NEXT: vmov.32 q1[0], r3 +; CHECK-LE-NEXT: rsbs r5, r2, #0 +; CHECK-LE-NEXT: sbcs.w r2, r4, r2, asr #31 +; CHECK-LE-NEXT: vmov r2, s2 ; CHECK-LE-NEXT: asr.w lr, r3, #31 -; CHECK-LE-NEXT: vmov r3, s6 +; 
CHECK-LE-NEXT: vmov.32 q1[1], lr ; CHECK-LE-NEXT: asr.w r12, r1, #31 -; CHECK-LE-NEXT: sbcs.w r1, r2, r4, asr #31 +; CHECK-LE-NEXT: vmov.32 q1[2], r1 ; CHECK-LE-NEXT: mov.w r1, #0 -; CHECK-LE-NEXT: vmov q0[3], q0[1], lr, r12 ; CHECK-LE-NEXT: it lt ; CHECK-LE-NEXT: movlt r1, #1 -; CHECK-LE-NEXT: rsbs r5, r3, #0 -; CHECK-LE-NEXT: sbcs.w r3, r2, r3, asr #31 +; CHECK-LE-NEXT: vmov.32 q1[3], r12 +; CHECK-LE-NEXT: rsbs r3, r2, #0 +; CHECK-LE-NEXT: sbcs.w r2, r4, r2, asr #31 ; CHECK-LE-NEXT: it lt -; CHECK-LE-NEXT: movlt r2, #1 -; CHECK-LE-NEXT: cmp r2, #0 +; CHECK-LE-NEXT: movlt r4, #1 +; CHECK-LE-NEXT: cmp r4, #0 ; CHECK-LE-NEXT: it ne -; CHECK-LE-NEXT: mvnne r2, #1 -; CHECK-LE-NEXT: bfi r2, r1, #0, #1 -; CHECK-LE-NEXT: and r1, r2, #3 -; CHECK-LE-NEXT: lsls r2, r2, #31 +; CHECK-LE-NEXT: mvnne r4, #1 +; CHECK-LE-NEXT: bfi r4, r1, #0, #1 +; CHECK-LE-NEXT: and r1, r4, #3 +; CHECK-LE-NEXT: lsls r2, r4, #31 ; CHECK-LE-NEXT: it ne -; CHECK-LE-NEXT: vstrne d0, [r0] +; CHECK-LE-NEXT: vstrne d2, [r0] ; CHECK-LE-NEXT: lsls r1, r1, #30 ; CHECK-LE-NEXT: it mi -; CHECK-LE-NEXT: vstrmi d1, [r0, #8] +; CHECK-LE-NEXT: vstrmi d3, [r0, #8] ; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: pop {r4, r5, r7, pc} ; @@ -161,7 +164,9 @@ define void @foo_sext_v2i64_v2i32(<2 x i64> *%dest, <2 x i32> *%mask, <2 x i32> ; CHECK-BE-NEXT: rsbs.w r1, lr, #0 ; CHECK-BE-NEXT: mov.w r3, #0 ; CHECK-BE-NEXT: sbcs.w r1, r3, lr, asr #31 -; CHECK-BE-NEXT: vmov q0[3], q0[1], lr, r12 +; CHECK-BE-NEXT: vmov.32 q0[1], r12 +; CHECK-BE-NEXT: @ implicit-def: $q2 +; CHECK-BE-NEXT: vmov.32 q0[3], lr ; CHECK-BE-NEXT: mov.w lr, #0 ; CHECK-BE-NEXT: it lt ; CHECK-BE-NEXT: movlt.w lr, #1 @@ -173,7 +178,6 @@ define void @foo_sext_v2i64_v2i32(<2 x i64> *%dest, <2 x i32> *%mask, <2 x i32> ; CHECK-BE-NEXT: it ne ; CHECK-BE-NEXT: mvnne r3, #1 ; CHECK-BE-NEXT: bfi r3, lr, #0, #1 -; CHECK-BE-NEXT: @ implicit-def: $q2 ; CHECK-BE-NEXT: and r1, r3, #3 ; CHECK-BE-NEXT: lsls r3, r3, #31 ; CHECK-BE-NEXT: beq .LBB5_2 @@ -195,15 +199,17 @@ define void @foo_sext_v2i64_v2i32(<2 x i64> *%dest, <2 x i32> *%mask, <2 x i32> ; CHECK-BE-NEXT: vrev64.32 q2, q1 ; CHECK-BE-NEXT: vmov r2, s11 ; CHECK-BE-NEXT: movs r4, #0 -; CHECK-BE-NEXT: vmov r1, s1 -; CHECK-BE-NEXT: vmov r3, s3 +; CHECK-BE-NEXT: vmov r3, s1 +; CHECK-BE-NEXT: vmov r1, s3 ; CHECK-BE-NEXT: rsbs r5, r2, #0 ; CHECK-BE-NEXT: sbcs.w r2, r4, r2, asr #31 ; CHECK-BE-NEXT: vmov r2, s9 -; CHECK-BE-NEXT: asr.w r12, r1, #31 ; CHECK-BE-NEXT: asr.w lr, r3, #31 -; CHECK-BE-NEXT: vmov q1[2], q1[0], lr, r12 -; CHECK-BE-NEXT: vmov q1[3], q1[1], r3, r1 +; CHECK-BE-NEXT: vmov.32 q1[0], lr +; CHECK-BE-NEXT: asr.w r12, r1, #31 +; CHECK-BE-NEXT: vmov.32 q1[1], r3 +; CHECK-BE-NEXT: vmov.32 q1[2], r12 +; CHECK-BE-NEXT: vmov.32 q1[3], r1 ; CHECK-BE-NEXT: mov.w r1, #0 ; CHECK-BE-NEXT: it lt ; CHECK-BE-NEXT: movlt r1, #1 @@ -241,62 +247,65 @@ define void @foo_sext_v2i64_v2i32_unaligned(<2 x i64> *%dest, <2 x i32> *%mask, ; CHECK-LE-NEXT: push {r4, r5, r7, lr} ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 -; CHECK-LE-NEXT: ldrd lr, r5, [r1] -; CHECK-LE-NEXT: movs r3, #0 -; CHECK-LE-NEXT: @ implicit-def: $q0 -; CHECK-LE-NEXT: rsbs.w r1, lr, #0 -; CHECK-LE-NEXT: vmov q1[2], q1[0], r5, lr -; CHECK-LE-NEXT: sbcs.w r1, r3, lr, asr #31 -; CHECK-LE-NEXT: mov.w r1, #0 +; CHECK-LE-NEXT: ldrd lr, r12, [r1] +; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: @ implicit-def: $q1 +; CHECK-LE-NEXT: movs r4, #0 +; CHECK-LE-NEXT: rsbs.w r3, lr, #0 +; CHECK-LE-NEXT: vmov.32 q0[0], lr +; CHECK-LE-NEXT: sbcs.w r3, r1, lr, asr #31 +; CHECK-LE-NEXT: mov.w 
lr, #0 ; CHECK-LE-NEXT: it lt -; CHECK-LE-NEXT: movlt r1, #1 -; CHECK-LE-NEXT: rsbs r4, r5, #0 -; CHECK-LE-NEXT: sbcs.w r4, r3, r5, asr #31 +; CHECK-LE-NEXT: movlt.w lr, #1 +; CHECK-LE-NEXT: rsbs.w r3, r12, #0 +; CHECK-LE-NEXT: sbcs.w r3, r1, r12, asr #31 ; CHECK-LE-NEXT: it lt -; CHECK-LE-NEXT: movlt r3, #1 -; CHECK-LE-NEXT: cmp r3, #0 +; CHECK-LE-NEXT: movlt r1, #1 +; CHECK-LE-NEXT: cmp r1, #0 ; CHECK-LE-NEXT: it ne -; CHECK-LE-NEXT: mvnne r3, #1 -; CHECK-LE-NEXT: bfi r3, r1, #0, #1 -; CHECK-LE-NEXT: vmov r4, s4 -; CHECK-LE-NEXT: and r12, r3, #3 -; CHECK-LE-NEXT: lsls r1, r3, #31 +; CHECK-LE-NEXT: mvnne r1, #1 +; CHECK-LE-NEXT: bfi r1, lr, #0, #1 +; CHECK-LE-NEXT: vmov.32 q0[2], r12 +; CHECK-LE-NEXT: and r3, r1, #3 +; CHECK-LE-NEXT: lsls r1, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: ldrne r1, [r2] -; CHECK-LE-NEXT: vmovne.32 q0[0], r1 -; CHECK-LE-NEXT: lsls.w r1, r12, #30 +; CHECK-LE-NEXT: vmovne.32 q1[0], r1 +; CHECK-LE-NEXT: lsls r1, r3, #30 ; CHECK-LE-NEXT: itt mi ; CHECK-LE-NEXT: ldrmi r1, [r2, #4] -; CHECK-LE-NEXT: vmovmi.32 q0[2], r1 -; CHECK-LE-NEXT: vmov r3, s2 -; CHECK-LE-NEXT: movs r2, #0 -; CHECK-LE-NEXT: vmov r1, s0 -; CHECK-LE-NEXT: vmov q0[2], q0[0], r3, r1 -; CHECK-LE-NEXT: rsbs r5, r4, #0 +; CHECK-LE-NEXT: vmovmi.32 q1[2], r1 +; CHECK-LE-NEXT: vmov r2, s0 +; CHECK-LE-NEXT: vmov r3, s4 +; CHECK-LE-NEXT: vmov r1, s6 +; CHECK-LE-NEXT: vmov.32 q1[0], r3 +; CHECK-LE-NEXT: rsbs r5, r2, #0 +; CHECK-LE-NEXT: sbcs.w r2, r4, r2, asr #31 +; CHECK-LE-NEXT: vmov r2, s2 ; CHECK-LE-NEXT: asr.w lr, r3, #31 -; CHECK-LE-NEXT: vmov r3, s6 +; CHECK-LE-NEXT: vmov.32 q1[1], lr ; CHECK-LE-NEXT: asr.w r12, r1, #31 -; CHECK-LE-NEXT: sbcs.w r1, r2, r4, asr #31 +; CHECK-LE-NEXT: vmov.32 q1[2], r1 ; CHECK-LE-NEXT: mov.w r1, #0 -; CHECK-LE-NEXT: vmov q0[3], q0[1], lr, r12 ; CHECK-LE-NEXT: it lt ; CHECK-LE-NEXT: movlt r1, #1 -; CHECK-LE-NEXT: rsbs r5, r3, #0 -; CHECK-LE-NEXT: sbcs.w r3, r2, r3, asr #31 +; CHECK-LE-NEXT: vmov.32 q1[3], r12 +; CHECK-LE-NEXT: rsbs r3, r2, #0 +; CHECK-LE-NEXT: sbcs.w r2, r4, r2, asr #31 ; CHECK-LE-NEXT: it lt -; CHECK-LE-NEXT: movlt r2, #1 -; CHECK-LE-NEXT: cmp r2, #0 +; CHECK-LE-NEXT: movlt r4, #1 +; CHECK-LE-NEXT: cmp r4, #0 ; CHECK-LE-NEXT: it ne -; CHECK-LE-NEXT: mvnne r2, #1 -; CHECK-LE-NEXT: bfi r2, r1, #0, #1 -; CHECK-LE-NEXT: and r1, r2, #3 -; CHECK-LE-NEXT: lsls r2, r2, #31 +; CHECK-LE-NEXT: mvnne r4, #1 +; CHECK-LE-NEXT: bfi r4, r1, #0, #1 +; CHECK-LE-NEXT: and r1, r4, #3 +; CHECK-LE-NEXT: lsls r2, r4, #31 ; CHECK-LE-NEXT: itt ne -; CHECK-LE-NEXT: vmovne r2, r3, d0 +; CHECK-LE-NEXT: vmovne r2, r3, d2 ; CHECK-LE-NEXT: strdne r2, r3, [r0] ; CHECK-LE-NEXT: lsls r1, r1, #30 ; CHECK-LE-NEXT: itt mi -; CHECK-LE-NEXT: vmovmi r1, r2, d1 +; CHECK-LE-NEXT: vmovmi r1, r2, d3 ; CHECK-LE-NEXT: strdmi r1, r2, [r0, #8] ; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: pop {r4, r5, r7, pc} @@ -311,7 +320,9 @@ define void @foo_sext_v2i64_v2i32_unaligned(<2 x i64> *%dest, <2 x i32> *%mask, ; CHECK-BE-NEXT: rsbs.w r1, lr, #0 ; CHECK-BE-NEXT: mov.w r3, #0 ; CHECK-BE-NEXT: sbcs.w r1, r3, lr, asr #31 -; CHECK-BE-NEXT: vmov q0[3], q0[1], lr, r12 +; CHECK-BE-NEXT: vmov.32 q0[1], r12 +; CHECK-BE-NEXT: @ implicit-def: $q2 +; CHECK-BE-NEXT: vmov.32 q0[3], lr ; CHECK-BE-NEXT: mov.w lr, #0 ; CHECK-BE-NEXT: it lt ; CHECK-BE-NEXT: movlt.w lr, #1 @@ -323,7 +334,6 @@ define void @foo_sext_v2i64_v2i32_unaligned(<2 x i64> *%dest, <2 x i32> *%mask, ; CHECK-BE-NEXT: it ne ; CHECK-BE-NEXT: mvnne r3, #1 ; CHECK-BE-NEXT: bfi r3, lr, #0, #1 -; CHECK-BE-NEXT: @ implicit-def: $q2 ; CHECK-BE-NEXT: and 
r1, r3, #3 ; CHECK-BE-NEXT: lsls r3, r3, #31 ; CHECK-BE-NEXT: beq .LBB6_2 @@ -345,15 +355,17 @@ define void @foo_sext_v2i64_v2i32_unaligned(<2 x i64> *%dest, <2 x i32> *%mask, ; CHECK-BE-NEXT: vrev64.32 q2, q1 ; CHECK-BE-NEXT: vmov r2, s11 ; CHECK-BE-NEXT: movs r4, #0 -; CHECK-BE-NEXT: vmov r1, s1 -; CHECK-BE-NEXT: vmov r3, s3 +; CHECK-BE-NEXT: vmov r3, s1 +; CHECK-BE-NEXT: vmov r1, s3 ; CHECK-BE-NEXT: rsbs r5, r2, #0 ; CHECK-BE-NEXT: sbcs.w r2, r4, r2, asr #31 ; CHECK-BE-NEXT: vmov r2, s9 -; CHECK-BE-NEXT: asr.w r12, r1, #31 ; CHECK-BE-NEXT: asr.w lr, r3, #31 -; CHECK-BE-NEXT: vmov q1[2], q1[0], lr, r12 -; CHECK-BE-NEXT: vmov q1[3], q1[1], r3, r1 +; CHECK-BE-NEXT: vmov.32 q1[0], lr +; CHECK-BE-NEXT: asr.w r12, r1, #31 +; CHECK-BE-NEXT: vmov.32 q1[1], r3 +; CHECK-BE-NEXT: vmov.32 q1[2], r12 +; CHECK-BE-NEXT: vmov.32 q1[3], r1 ; CHECK-BE-NEXT: mov.w r1, #0 ; CHECK-BE-NEXT: it lt ; CHECK-BE-NEXT: movlt r1, #1 @@ -389,63 +401,64 @@ entry: define void @foo_zext_v2i64_v2i32(<2 x i64> *%dest, <2 x i32> *%mask, <2 x i32> *%src) { ; CHECK-LE-LABEL: foo_zext_v2i64_v2i32: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .save {r4, r5, r7, lr} -; CHECK-LE-NEXT: push {r4, r5, r7, lr} +; CHECK-LE-NEXT: .save {r7, lr} +; CHECK-LE-NEXT: push {r7, lr} ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 -; CHECK-LE-NEXT: ldrd lr, r5, [r1] -; CHECK-LE-NEXT: movs r3, #0 -; CHECK-LE-NEXT: @ implicit-def: $q0 +; CHECK-LE-NEXT: ldrd lr, r12, [r1] +; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: @ implicit-def: $q1 ; CHECK-LE-NEXT: vmov.i64 q2, #0xffffffff -; CHECK-LE-NEXT: rsbs.w r1, lr, #0 -; CHECK-LE-NEXT: vmov q1[2], q1[0], r5, lr -; CHECK-LE-NEXT: sbcs.w r1, r3, lr, asr #31 -; CHECK-LE-NEXT: mov.w r1, #0 +; CHECK-LE-NEXT: rsbs.w r3, lr, #0 +; CHECK-LE-NEXT: vmov.32 q0[0], lr +; CHECK-LE-NEXT: sbcs.w r3, r1, lr, asr #31 +; CHECK-LE-NEXT: mov.w lr, #0 ; CHECK-LE-NEXT: it lt -; CHECK-LE-NEXT: movlt r1, #1 -; CHECK-LE-NEXT: rsbs r4, r5, #0 -; CHECK-LE-NEXT: sbcs.w r4, r3, r5, asr #31 +; CHECK-LE-NEXT: movlt.w lr, #1 +; CHECK-LE-NEXT: rsbs.w r3, r12, #0 +; CHECK-LE-NEXT: sbcs.w r3, r1, r12, asr #31 ; CHECK-LE-NEXT: it lt -; CHECK-LE-NEXT: movlt r3, #1 -; CHECK-LE-NEXT: cmp r3, #0 +; CHECK-LE-NEXT: movlt r1, #1 +; CHECK-LE-NEXT: cmp r1, #0 ; CHECK-LE-NEXT: it ne -; CHECK-LE-NEXT: mvnne r3, #1 -; CHECK-LE-NEXT: bfi r3, r1, #0, #1 -; CHECK-LE-NEXT: and r12, r3, #3 -; CHECK-LE-NEXT: lsls r1, r3, #31 +; CHECK-LE-NEXT: mvnne r1, #1 +; CHECK-LE-NEXT: bfi r1, lr, #0, #1 +; CHECK-LE-NEXT: vmov.32 q0[2], r12 +; CHECK-LE-NEXT: and r3, r1, #3 +; CHECK-LE-NEXT: mov.w r12, #0 +; CHECK-LE-NEXT: lsls r1, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: ldrne r1, [r2] -; CHECK-LE-NEXT: vmovne.32 q0[0], r1 -; CHECK-LE-NEXT: lsls.w r1, r12, #30 +; CHECK-LE-NEXT: vmovne.32 q1[0], r1 +; CHECK-LE-NEXT: lsls r1, r3, #30 ; CHECK-LE-NEXT: itt mi ; CHECK-LE-NEXT: ldrmi r1, [r2, #4] -; CHECK-LE-NEXT: vmovmi.32 q0[2], r1 -; CHECK-LE-NEXT: vmov r1, s4 +; CHECK-LE-NEXT: vmovmi.32 q1[2], r1 +; CHECK-LE-NEXT: vmov r1, s0 ; CHECK-LE-NEXT: movs r2, #0 -; CHECK-LE-NEXT: vand q0, q0, q2 +; CHECK-LE-NEXT: vand q1, q1, q2 ; CHECK-LE-NEXT: rsbs r3, r1, #0 -; CHECK-LE-NEXT: vmov r3, s6 +; CHECK-LE-NEXT: vmov r3, s2 ; CHECK-LE-NEXT: sbcs.w r1, r2, r1, asr #31 -; CHECK-LE-NEXT: mov.w r1, #0 ; CHECK-LE-NEXT: it lt -; CHECK-LE-NEXT: movlt r1, #1 -; CHECK-LE-NEXT: rsbs r5, r3, #0 -; CHECK-LE-NEXT: sbcs.w r3, r2, r3, asr #31 +; CHECK-LE-NEXT: movlt.w r12, #1 +; CHECK-LE-NEXT: rsbs r1, r3, #0 +; CHECK-LE-NEXT: sbcs.w r1, r2, r3, asr #31 ; CHECK-LE-NEXT: it lt 
; CHECK-LE-NEXT: movlt r2, #1 ; CHECK-LE-NEXT: cmp r2, #0 ; CHECK-LE-NEXT: it ne ; CHECK-LE-NEXT: mvnne r2, #1 -; CHECK-LE-NEXT: bfi r2, r1, #0, #1 +; CHECK-LE-NEXT: bfi r2, r12, #0, #1 ; CHECK-LE-NEXT: and r1, r2, #3 ; CHECK-LE-NEXT: lsls r2, r2, #31 ; CHECK-LE-NEXT: it ne -; CHECK-LE-NEXT: vstrne d0, [r0] +; CHECK-LE-NEXT: vstrne d2, [r0] ; CHECK-LE-NEXT: lsls r1, r1, #30 ; CHECK-LE-NEXT: it mi -; CHECK-LE-NEXT: vstrmi d1, [r0, #8] +; CHECK-LE-NEXT: vstrmi d3, [r0, #8] ; CHECK-LE-NEXT: add sp, #4 -; CHECK-LE-NEXT: pop {r4, r5, r7, pc} +; CHECK-LE-NEXT: pop {r7, pc} ; ; CHECK-BE-LABEL: foo_zext_v2i64_v2i32: ; CHECK-BE: @ %bb.0: @ %entry @@ -457,7 +470,9 @@ define void @foo_zext_v2i64_v2i32(<2 x i64> *%dest, <2 x i32> *%mask, <2 x i32> ; CHECK-BE-NEXT: rsbs.w r1, lr, #0 ; CHECK-BE-NEXT: mov.w r3, #0 ; CHECK-BE-NEXT: sbcs.w r1, r3, lr, asr #31 -; CHECK-BE-NEXT: vmov q0[3], q0[1], lr, r12 +; CHECK-BE-NEXT: vmov.32 q0[1], r12 +; CHECK-BE-NEXT: @ implicit-def: $q1 +; CHECK-BE-NEXT: vmov.32 q0[3], lr ; CHECK-BE-NEXT: mov.w lr, #0 ; CHECK-BE-NEXT: it lt ; CHECK-BE-NEXT: movlt.w lr, #1 @@ -469,7 +484,6 @@ define void @foo_zext_v2i64_v2i32(<2 x i64> *%dest, <2 x i32> *%mask, <2 x i32> ; CHECK-BE-NEXT: it ne ; CHECK-BE-NEXT: mvnne r3, #1 ; CHECK-BE-NEXT: bfi r3, lr, #0, #1 -; CHECK-BE-NEXT: @ implicit-def: $q1 ; CHECK-BE-NEXT: and r1, r3, #3 ; CHECK-BE-NEXT: lsls r3, r3, #31 ; CHECK-BE-NEXT: beq .LBB7_2 @@ -527,65 +541,66 @@ entry: define void @foo_zext_v2i64_v2i32_unaligned(<2 x i64> *%dest, <2 x i32> *%mask, <2 x i32> *%src) { ; CHECK-LE-LABEL: foo_zext_v2i64_v2i32_unaligned: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .save {r4, r5, r7, lr} -; CHECK-LE-NEXT: push {r4, r5, r7, lr} +; CHECK-LE-NEXT: .save {r7, lr} +; CHECK-LE-NEXT: push {r7, lr} ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 -; CHECK-LE-NEXT: ldrd lr, r5, [r1] -; CHECK-LE-NEXT: movs r3, #0 -; CHECK-LE-NEXT: @ implicit-def: $q0 +; CHECK-LE-NEXT: ldrd lr, r12, [r1] +; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: @ implicit-def: $q1 ; CHECK-LE-NEXT: vmov.i64 q2, #0xffffffff -; CHECK-LE-NEXT: rsbs.w r1, lr, #0 -; CHECK-LE-NEXT: vmov q1[2], q1[0], r5, lr -; CHECK-LE-NEXT: sbcs.w r1, r3, lr, asr #31 -; CHECK-LE-NEXT: mov.w r1, #0 +; CHECK-LE-NEXT: rsbs.w r3, lr, #0 +; CHECK-LE-NEXT: vmov.32 q0[0], lr +; CHECK-LE-NEXT: sbcs.w r3, r1, lr, asr #31 +; CHECK-LE-NEXT: mov.w lr, #0 ; CHECK-LE-NEXT: it lt -; CHECK-LE-NEXT: movlt r1, #1 -; CHECK-LE-NEXT: rsbs r4, r5, #0 -; CHECK-LE-NEXT: sbcs.w r4, r3, r5, asr #31 +; CHECK-LE-NEXT: movlt.w lr, #1 +; CHECK-LE-NEXT: rsbs.w r3, r12, #0 +; CHECK-LE-NEXT: sbcs.w r3, r1, r12, asr #31 ; CHECK-LE-NEXT: it lt -; CHECK-LE-NEXT: movlt r3, #1 -; CHECK-LE-NEXT: cmp r3, #0 +; CHECK-LE-NEXT: movlt r1, #1 +; CHECK-LE-NEXT: cmp r1, #0 ; CHECK-LE-NEXT: it ne -; CHECK-LE-NEXT: mvnne r3, #1 -; CHECK-LE-NEXT: bfi r3, r1, #0, #1 -; CHECK-LE-NEXT: and r12, r3, #3 -; CHECK-LE-NEXT: lsls r1, r3, #31 +; CHECK-LE-NEXT: mvnne r1, #1 +; CHECK-LE-NEXT: bfi r1, lr, #0, #1 +; CHECK-LE-NEXT: vmov.32 q0[2], r12 +; CHECK-LE-NEXT: and r3, r1, #3 +; CHECK-LE-NEXT: mov.w r12, #0 +; CHECK-LE-NEXT: lsls r1, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: ldrne r1, [r2] -; CHECK-LE-NEXT: vmovne.32 q0[0], r1 -; CHECK-LE-NEXT: lsls.w r1, r12, #30 +; CHECK-LE-NEXT: vmovne.32 q1[0], r1 +; CHECK-LE-NEXT: lsls r1, r3, #30 ; CHECK-LE-NEXT: itt mi ; CHECK-LE-NEXT: ldrmi r1, [r2, #4] -; CHECK-LE-NEXT: vmovmi.32 q0[2], r1 -; CHECK-LE-NEXT: vmov r1, s4 +; CHECK-LE-NEXT: vmovmi.32 q1[2], r1 +; CHECK-LE-NEXT: vmov r1, s0 ; CHECK-LE-NEXT: 
movs r2, #0 -; CHECK-LE-NEXT: vand q0, q0, q2 +; CHECK-LE-NEXT: vand q1, q1, q2 ; CHECK-LE-NEXT: rsbs r3, r1, #0 -; CHECK-LE-NEXT: vmov r3, s6 +; CHECK-LE-NEXT: vmov r3, s2 ; CHECK-LE-NEXT: sbcs.w r1, r2, r1, asr #31 -; CHECK-LE-NEXT: mov.w r1, #0 ; CHECK-LE-NEXT: it lt -; CHECK-LE-NEXT: movlt r1, #1 -; CHECK-LE-NEXT: rsbs r5, r3, #0 -; CHECK-LE-NEXT: sbcs.w r3, r2, r3, asr #31 +; CHECK-LE-NEXT: movlt.w r12, #1 +; CHECK-LE-NEXT: rsbs r1, r3, #0 +; CHECK-LE-NEXT: sbcs.w r1, r2, r3, asr #31 ; CHECK-LE-NEXT: it lt ; CHECK-LE-NEXT: movlt r2, #1 ; CHECK-LE-NEXT: cmp r2, #0 ; CHECK-LE-NEXT: it ne ; CHECK-LE-NEXT: mvnne r2, #1 -; CHECK-LE-NEXT: bfi r2, r1, #0, #1 +; CHECK-LE-NEXT: bfi r2, r12, #0, #1 ; CHECK-LE-NEXT: and r1, r2, #3 ; CHECK-LE-NEXT: lsls r2, r2, #31 ; CHECK-LE-NEXT: itt ne -; CHECK-LE-NEXT: vmovne r2, r3, d0 +; CHECK-LE-NEXT: vmovne r2, r3, d2 ; CHECK-LE-NEXT: strdne r2, r3, [r0] ; CHECK-LE-NEXT: lsls r1, r1, #30 ; CHECK-LE-NEXT: itt mi -; CHECK-LE-NEXT: vmovmi r1, r2, d1 +; CHECK-LE-NEXT: vmovmi r1, r2, d3 ; CHECK-LE-NEXT: strdmi r1, r2, [r0, #8] ; CHECK-LE-NEXT: add sp, #4 -; CHECK-LE-NEXT: pop {r4, r5, r7, pc} +; CHECK-LE-NEXT: pop {r7, pc} ; ; CHECK-BE-LABEL: foo_zext_v2i64_v2i32_unaligned: ; CHECK-BE: @ %bb.0: @ %entry @@ -597,7 +612,9 @@ define void @foo_zext_v2i64_v2i32_unaligned(<2 x i64> *%dest, <2 x i32> *%mask, ; CHECK-BE-NEXT: rsbs.w r1, lr, #0 ; CHECK-BE-NEXT: mov.w r3, #0 ; CHECK-BE-NEXT: sbcs.w r1, r3, lr, asr #31 -; CHECK-BE-NEXT: vmov q0[3], q0[1], lr, r12 +; CHECK-BE-NEXT: vmov.32 q0[1], r12 +; CHECK-BE-NEXT: @ implicit-def: $q1 +; CHECK-BE-NEXT: vmov.32 q0[3], lr ; CHECK-BE-NEXT: mov.w lr, #0 ; CHECK-BE-NEXT: it lt ; CHECK-BE-NEXT: movlt.w lr, #1 @@ -609,7 +626,6 @@ define void @foo_zext_v2i64_v2i32_unaligned(<2 x i64> *%dest, <2 x i32> *%mask, ; CHECK-BE-NEXT: it ne ; CHECK-BE-NEXT: mvnne r3, #1 ; CHECK-BE-NEXT: bfi r3, lr, #0, #1 -; CHECK-BE-NEXT: @ implicit-def: $q1 ; CHECK-BE-NEXT: and r1, r3, #3 ; CHECK-BE-NEXT: lsls r3, r3, #31 ; CHECK-BE-NEXT: beq .LBB8_2 diff --git a/llvm/test/CodeGen/Thumb2/mve-minmax.ll b/llvm/test/CodeGen/Thumb2/mve-minmax.ll index 9f56367..c533216 100644 --- a/llvm/test/CodeGen/Thumb2/mve-minmax.ll +++ b/llvm/test/CodeGen/Thumb2/mve-minmax.ll @@ -40,17 +40,17 @@ define arm_aapcs_vfpcc <2 x i64> @smin_v2i64(<2 x i64> %s1, <2 x i64> %s2) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r12, s5 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: vmov lr, s3 -; CHECK-NEXT: subs r2, r3, r2 ; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: sbcs.w r1, r1, r12 ; CHECK-NEXT: vmov r12, s7 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov lr, s1 +; CHECK-NEXT: subs r2, r3, r2 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: sbcs.w r1, r1, r12 +; CHECK-NEXT: vmov r12, s5 ; CHECK-NEXT: mov.w r1, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #1 @@ -62,8 +62,10 @@ define arm_aapcs_vfpcc <2 x i64> @smin_v2i64(<2 x i64> %s1, <2 x i64> %s2) { ; CHECK-NEXT: movlt r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vmov q2[2], q2[0], r0, r1 -; CHECK-NEXT: vmov q2[3], q2[1], r0, r1 +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov.32 q2[2], r1 +; CHECK-NEXT: vmov.32 q2[3], r1 ; CHECK-NEXT: vbic q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vorr q0, q0, q1 @@ -112,17 +114,17 @@ define arm_aapcs_vfpcc <2 
x i64> @umin_v2i64(<2 x i64> %s1, <2 x i64> %s2) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r12, s5 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: vmov lr, s3 -; CHECK-NEXT: subs r2, r3, r2 ; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: sbcs.w r1, r1, r12 ; CHECK-NEXT: vmov r12, s7 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov lr, s1 +; CHECK-NEXT: subs r2, r3, r2 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: sbcs.w r1, r1, r12 +; CHECK-NEXT: vmov r12, s5 ; CHECK-NEXT: mov.w r1, #0 ; CHECK-NEXT: it lo ; CHECK-NEXT: movlo r1, #1 @@ -134,8 +136,10 @@ define arm_aapcs_vfpcc <2 x i64> @umin_v2i64(<2 x i64> %s1, <2 x i64> %s2) { ; CHECK-NEXT: movlo r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vmov q2[2], q2[0], r0, r1 -; CHECK-NEXT: vmov q2[3], q2[1], r0, r1 +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov.32 q2[2], r1 +; CHECK-NEXT: vmov.32 q2[3], r1 ; CHECK-NEXT: vbic q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vorr q0, q0, q1 @@ -185,17 +189,17 @@ define arm_aapcs_vfpcc <2 x i64> @smax_v2i64(<2 x i64> %s1, <2 x i64> %s2) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r12, s1 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: vmov lr, s7 -; CHECK-NEXT: subs r2, r3, r2 ; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: sbcs.w r1, r1, r12 ; CHECK-NEXT: vmov r12, s3 +; CHECK-NEXT: vmov r1, s7 +; CHECK-NEXT: vmov lr, s5 +; CHECK-NEXT: subs r2, r3, r2 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: sbcs.w r1, r1, r12 +; CHECK-NEXT: vmov r12, s1 ; CHECK-NEXT: mov.w r1, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #1 @@ -207,8 +211,10 @@ define arm_aapcs_vfpcc <2 x i64> @smax_v2i64(<2 x i64> %s1, <2 x i64> %s2) { ; CHECK-NEXT: movlt r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vmov q2[2], q2[0], r0, r1 -; CHECK-NEXT: vmov q2[3], q2[1], r0, r1 +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov.32 q2[2], r1 +; CHECK-NEXT: vmov.32 q2[3], r1 ; CHECK-NEXT: vbic q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vorr q0, q0, q1 @@ -257,17 +263,17 @@ define arm_aapcs_vfpcc <2 x i64> @umax_v2i64(<2 x i64> %s1, <2 x i64> %s2) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r12, s1 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: vmov lr, s7 -; CHECK-NEXT: subs r2, r3, r2 ; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: sbcs.w r1, r1, r12 ; CHECK-NEXT: vmov r12, s3 +; CHECK-NEXT: vmov r1, s7 +; CHECK-NEXT: vmov lr, s5 +; CHECK-NEXT: subs r2, r3, r2 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: sbcs.w r1, r1, r12 +; CHECK-NEXT: vmov r12, s1 ; CHECK-NEXT: mov.w r1, #0 ; CHECK-NEXT: it lo ; CHECK-NEXT: movlo r1, #1 @@ -279,8 +285,10 @@ define arm_aapcs_vfpcc <2 x i64> @umax_v2i64(<2 x i64> %s1, <2 x i64> %s2) { ; CHECK-NEXT: movlo r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vmov q2[2], q2[0], r0, r1 -; CHECK-NEXT: vmov q2[3], q2[1], r0, r1 +; 
CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov.32 q2[2], r1 +; CHECK-NEXT: vmov.32 q2[3], r1 ; CHECK-NEXT: vbic q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vorr q0, q0, q1 @@ -369,12 +377,12 @@ define arm_aapcs_vfpcc <2 x double> @maxnm_float64_t(<2 x double> %src1, <2 x do ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov q4, q1 ; CHECK-NEXT: vmov q5, q0 -; CHECK-NEXT: vmov r0, r1, d8 -; CHECK-NEXT: vmov r2, r3, d10 +; CHECK-NEXT: vmov r0, r1, d9 +; CHECK-NEXT: vmov r2, r3, d11 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: vmov r12, r1, d9 +; CHECK-NEXT: vmov r12, r1, d8 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: vmov r2, r3, d11 +; CHECK-NEXT: vmov r2, r3, d10 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r0, #1 ; CHECK-NEXT: cmp r0, #0 @@ -386,8 +394,10 @@ define arm_aapcs_vfpcc <2 x double> @maxnm_float64_t(<2 x double> %src1, <2 x do ; CHECK-NEXT: movne r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vmov q0[2], q0[0], r0, r4 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r4 +; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: vmov.32 q0[1], r0 +; CHECK-NEXT: vmov.32 q0[2], r4 +; CHECK-NEXT: vmov.32 q0[3], r4 ; CHECK-NEXT: vbic q1, q5, q0 ; CHECK-NEXT: vand q0, q4, q0 ; CHECK-NEXT: vorr q0, q0, q1 diff --git a/llvm/test/CodeGen/Thumb2/mve-neg.ll b/llvm/test/CodeGen/Thumb2/mve-neg.ll index f3f3fea..2d8d0f4 100644 --- a/llvm/test/CodeGen/Thumb2/mve-neg.ll +++ b/llvm/test/CodeGen/Thumb2/mve-neg.ll @@ -34,17 +34,19 @@ entry: define arm_aapcs_vfpcc <2 x i64> @neg_v2i64(<2 x i64> %s1) { ; CHECK-LABEL: neg_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s3 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r3, s1 ; CHECK-NEXT: rsbs r1, r1, #0 ; CHECK-NEXT: sbc.w r0, r12, r0 ; CHECK-NEXT: rsbs r2, r2, #0 ; CHECK-NEXT: sbc.w r3, r12, r3 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r1 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r0 +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: vmov.32 q0[2], r1 +; CHECK-NEXT: vmov.32 q0[3], r0 ; CHECK-NEXT: bx lr entry: %0 = sub nsw <2 x i64> zeroinitializer, %s1 diff --git a/llvm/test/CodeGen/Thumb2/mve-phireg.ll b/llvm/test/CodeGen/Thumb2/mve-phireg.ll index b08f884..0fe26fb 100644 --- a/llvm/test/CodeGen/Thumb2/mve-phireg.ll +++ b/llvm/test/CodeGen/Thumb2/mve-phireg.ll @@ -149,60 +149,67 @@ define dso_local i32 @e() #0 { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #392 ; CHECK-NEXT: sub sp, #392 -; CHECK-NEXT: movw r10, :lower16:.L_MergedGlobals +; CHECK-NEXT: movw r9, :lower16:.L_MergedGlobals ; CHECK-NEXT: vldr s0, .LCPI1_0 -; CHECK-NEXT: movt r10, :upper16:.L_MergedGlobals +; CHECK-NEXT: movt r9, :upper16:.L_MergedGlobals ; CHECK-NEXT: vldr s3, .LCPI1_1 -; CHECK-NEXT: mov r6, r10 -; CHECK-NEXT: mov r7, r10 -; CHECK-NEXT: ldr r1, [r6, #4]! -; CHECK-NEXT: movw r5, :lower16:e -; CHECK-NEXT: ldr r0, [r7, #8]! 
-; CHECK-NEXT: vmov r4, s3 -; CHECK-NEXT: vmov s1, r6 -; CHECK-NEXT: movt r5, :upper16:e -; CHECK-NEXT: vmov q1[2], q1[0], r7, r7 -; CHECK-NEXT: vmov s9, r5 -; CHECK-NEXT: vdup.32 q4, r6 +; CHECK-NEXT: mov r5, r9 +; CHECK-NEXT: mov r7, r9 +; CHECK-NEXT: ldr r1, [r5, #8]! +; CHECK-NEXT: vmov r6, s3 +; CHECK-NEXT: ldr r0, [r7, #4]! +; CHECK-NEXT: movw r4, :lower16:e +; CHECK-NEXT: vmov.32 q4[0], r5 +; CHECK-NEXT: movt r4, :upper16:e +; CHECK-NEXT: vmov q1, q4 +; CHECK-NEXT: vmov s1, r7 +; CHECK-NEXT: vmov.32 q1[1], r6 +; CHECK-NEXT: vmov.32 q5[0], r7 +; CHECK-NEXT: vmov.32 q1[2], r5 +; CHECK-NEXT: vmov s9, r4 +; CHECK-NEXT: vmov.32 q1[3], r4 +; CHECK-NEXT: vdup.32 q6, r7 +; CHECK-NEXT: vstrw.32 q1, [sp, #76] +; CHECK-NEXT: vmov q1, q5 +; CHECK-NEXT: vmov.32 q1[1], r7 ; CHECK-NEXT: vmov.f32 s2, s1 ; CHECK-NEXT: vmov.f32 s8, s0 -; CHECK-NEXT: vmov q1[3], q1[1], r5, r4 -; CHECK-NEXT: vmov q3, q4 -; CHECK-NEXT: vmov q5, q4 +; CHECK-NEXT: vmov.32 q1[2], r6 +; CHECK-NEXT: vmov q3, q6 +; CHECK-NEXT: vmov q7, q6 ; CHECK-NEXT: vmov.f32 s10, s1 -; CHECK-NEXT: vstrw.32 q1, [sp, #76] -; CHECK-NEXT: vmov q1[2], q1[0], r4, r6 ; CHECK-NEXT: mov.w r8, #4 -; CHECK-NEXT: mov.w r9, #0 -; CHECK-NEXT: vmov q1[3], q1[1], r5, r6 -; CHECK-NEXT: vmov.32 q3[0], r5 -; CHECK-NEXT: vmov.32 q5[1], r5 -; CHECK-NEXT: vmov.f32 s11, s3 -; CHECK-NEXT: str r1, [sp, #24] +; CHECK-NEXT: mov.w r10, #0 +; CHECK-NEXT: vmov.32 q1[3], r4 +; CHECK-NEXT: vmov.32 q3[0], r4 +; CHECK-NEXT: vmov.32 q7[1], r4 ; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: vmov.f32 s11, s3 ; CHECK-NEXT: movs r1, #64 ; CHECK-NEXT: strh.w r8, [sp, #390] +; CHECK-NEXT: strd r0, r10, [sp, #24] ; CHECK-NEXT: vstrw.32 q0, [sp, #44] -; CHECK-NEXT: str.w r9, [sp, #28] -; CHECK-NEXT: vstrw.32 q2, [r0] ; CHECK-NEXT: str r0, [r0] -; CHECK-NEXT: vstrw.32 q5, [r0] +; CHECK-NEXT: vstrw.32 q2, [r0] +; CHECK-NEXT: vstrw.32 q7, [r0] ; CHECK-NEXT: vstrw.32 q3, [r0] ; CHECK-NEXT: vstrw.32 q1, [r0] ; CHECK-NEXT: bl __aeabi_memclr4 -; CHECK-NEXT: vmov q0[2], q0[0], r6, r7 -; CHECK-NEXT: vmov q1[2], q1[0], r6, r6 -; CHECK-NEXT: vmov q0[3], q0[1], r7, r5 -; CHECK-NEXT: vmov q1[3], q1[1], r4, r7 -; CHECK-NEXT: vmov.32 q4[0], r9 -; CHECK-NEXT: vstrw.32 q0, [r0] -; CHECK-NEXT: str.w r9, [r10] +; CHECK-NEXT: vmov.32 q5[1], r5 +; CHECK-NEXT: vmov.32 q4[1], r4 +; CHECK-NEXT: vmov.32 q5[2], r7 +; CHECK-NEXT: vmov.32 q4[2], r7 +; CHECK-NEXT: vmov.32 q5[3], r6 +; CHECK-NEXT: vmov.32 q6[0], r10 +; CHECK-NEXT: vmov.32 q4[3], r5 +; CHECK-NEXT: str.w r10, [r9] ; CHECK-NEXT: vstrw.32 q4, [r0] -; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: vstrw.32 q6, [r0] +; CHECK-NEXT: vstrw.32 q5, [r0] ; CHECK-NEXT: str.w r8, [sp, #308] ; CHECK-NEXT: .LBB1_1: @ %for.cond ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-and.ll b/llvm/test/CodeGen/Thumb2/mve-pred-and.ll index 49cc4ca..9848a56 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pred-and.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-and.ll @@ -577,20 +577,22 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeqz_v2i1(<2 x i64> %a, <2 x i64> %b) { ; CHECK-NEXT: vorr q2, q0, q1 ; CHECK-NEXT: vmov r0, s9 ; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: vmov r2, s10 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s11 +; CHECK-NEXT: vmov r1, s10 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vbic q1, 
q1, q2 -; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: orrs r0, r1 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.32 q3[3], r0 +; CHECK-NEXT: vbic q1, q1, q3 +; CHECK-NEXT: vand q0, q0, q3 ; CHECK-NEXT: vorr q0, q0, q1 ; CHECK-NEXT: bx lr entry: @@ -607,40 +609,44 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeq_v2i1(<2 x i64> %a, <2 x i64> %b, <2 x i6 ; CHECK-NEXT: vmov r0, s9 ; CHECK-NEXT: vmov r1, s5 ; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s6 ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: vmov r1, s8 ; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s11 +; CHECK-NEXT: vmov r1, s7 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: eors r0, r1 +; CHECK-NEXT: vmov r1, s10 ; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: eors r2, r3 -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.32 q3[3], r0 +; CHECK-NEXT: vmov r0, s1 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 -; CHECK-NEXT: vand q2, q3, q2 +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: orrs r0, r1 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov.32 q2[3], r0 +; CHECK-NEXT: vand q2, q2, q3 ; CHECK-NEXT: vbic q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vorr q0, q0, q1 @@ -661,33 +667,37 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeqr_v2i1(<2 x i64> %a, <2 x i64> %b, i64 %c ; CHECK-NEXT: eors r2, r1 ; CHECK-NEXT: eors r3, r0 ; CHECK-NEXT: orrs r2, r3 -; CHECK-NEXT: vmov r3, s7 ; CHECK-NEXT: cset r2, eq ; CHECK-NEXT: tst.w r2, #1 ; CHECK-NEXT: csetm r2, ne -; CHECK-NEXT: eors r1, r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: eors r0, r3 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.32 q2[1], r2 +; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: eors r1, r2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: eors r0, r2 ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vmov q2[2], q2[0], r0, r2 -; CHECK-NEXT: vmov q2[3], q2[1], r0, r2 +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov.32 q2[3], r0 ; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: 
orrs r0, r1 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.32 q3[3], r0 ; CHECK-NEXT: vand q2, q3, q2 ; CHECK-NEXT: vbic q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll b/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll index ebe39e8..c7e553f 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll @@ -13,12 +13,14 @@ define arm_aapcs_vfpcc <4 x i32> @bitcast_to_v4i1(i4 %b, <4 x i32> %a) { ; CHECK-LE-NEXT: vmsr p0, r0 ; CHECK-LE-NEXT: vpsel q1, q2, q1 ; CHECK-LE-NEXT: vmov.u8 r0, q1[0] -; CHECK-LE-NEXT: vmov.u8 r1, q1[2] -; CHECK-LE-NEXT: vmov q2[2], q2[0], r1, r0 +; CHECK-LE-NEXT: vmov.32 q2[0], r0 ; CHECK-LE-NEXT: vmov.u8 r0, q1[1] -; CHECK-LE-NEXT: vmov.u8 r1, q1[3] +; CHECK-LE-NEXT: vmov.32 q2[1], r0 +; CHECK-LE-NEXT: vmov.u8 r0, q1[2] +; CHECK-LE-NEXT: vmov.32 q2[2], r0 +; CHECK-LE-NEXT: vmov.u8 r0, q1[3] +; CHECK-LE-NEXT: vmov.32 q2[3], r0 ; CHECK-LE-NEXT: vmov.i32 q1, #0x0 -; CHECK-LE-NEXT: vmov q2[3], q2[1], r1, r0 ; CHECK-LE-NEXT: vcmp.i32 ne, q2, zr ; CHECK-LE-NEXT: vpsel q0, q0, q1 ; CHECK-LE-NEXT: add sp, #4 @@ -34,14 +36,16 @@ define arm_aapcs_vfpcc <4 x i32> @bitcast_to_v4i1(i4 %b, <4 x i32> %a) { ; CHECK-BE-NEXT: vmsr p0, r0 ; CHECK-BE-NEXT: vpsel q1, q2, q1 ; CHECK-BE-NEXT: vmov.u8 r0, q1[0] -; CHECK-BE-NEXT: vmov.u8 r1, q1[2] -; CHECK-BE-NEXT: vmov q2[2], q2[0], r1, r0 +; CHECK-BE-NEXT: vmov.32 q2[0], r0 ; CHECK-BE-NEXT: vmov.u8 r0, q1[1] -; CHECK-BE-NEXT: vmov.u8 r1, q1[3] +; CHECK-BE-NEXT: vmov.32 q2[1], r0 +; CHECK-BE-NEXT: vmov.u8 r0, q1[2] +; CHECK-BE-NEXT: vmov.32 q2[2], r0 +; CHECK-BE-NEXT: vmov.u8 r0, q1[3] +; CHECK-BE-NEXT: vmov.32 q2[3], r0 ; CHECK-BE-NEXT: vrev64.32 q1, q0 -; CHECK-BE-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-BE-NEXT: vmov.i32 q0, #0x0 ; CHECK-BE-NEXT: vcmp.i32 ne, q2, zr +; CHECK-BE-NEXT: vmov.i32 q0, #0x0 ; CHECK-BE-NEXT: vpsel q1, q1, q0 ; CHECK-BE-NEXT: vrev64.32 q0, q1 ; CHECK-BE-NEXT: add sp, #4 @@ -175,11 +179,13 @@ define arm_aapcs_vfpcc <2 x i64> @bitcast_to_v2i1(i2 %b, <2 x i64> %a) { ; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: and r1, r0, #2 ; CHECK-LE-NEXT: and r0, r0, #1 -; CHECK-LE-NEXT: movs r2, #0 ; CHECK-LE-NEXT: rsbs r0, r0, #0 +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: vmov.32 q1[0], r0 ; CHECK-LE-NEXT: sub.w r1, r2, r1, lsr #1 -; CHECK-LE-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-LE-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-LE-NEXT: vmov.32 q1[1], r0 +; CHECK-LE-NEXT: vmov.32 q1[2], r1 +; CHECK-LE-NEXT: vmov.32 q1[3], r1 ; CHECK-LE-NEXT: vand q0, q0, q1 ; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: bx lr @@ -189,12 +195,14 @@ define arm_aapcs_vfpcc <2 x i64> @bitcast_to_v2i1(i2 %b, <2 x i64> %a) { ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: and r1, r0, #2 -; CHECK-BE-NEXT: and r0, r0, #1 ; CHECK-BE-NEXT: movs r2, #0 -; CHECK-BE-NEXT: rsbs r0, r0, #0 +; CHECK-BE-NEXT: and r0, r0, #1 ; CHECK-BE-NEXT: sub.w r1, r2, r1, lsr #1 -; CHECK-BE-NEXT: vmov q1[2], q1[0], r0, r1 -; CHECK-BE-NEXT: vmov q1[3], q1[1], r0, r1 +; CHECK-BE-NEXT: rsbs r0, r0, #0 +; CHECK-BE-NEXT: vmov.32 q1[0], r1 +; CHECK-BE-NEXT: vmov.32 q1[1], r1 +; CHECK-BE-NEXT: vmov.32 q1[2], r0 +; CHECK-BE-NEXT: vmov.32 q1[3], r0 ; CHECK-BE-NEXT: vrev64.32 q2, q1 ; 
CHECK-BE-NEXT: vand q0, q0, q2 ; CHECK-BE-NEXT: add sp, #4 diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll b/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll index a6e0068..b88576a 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll @@ -46,12 +46,12 @@ entry: define arm_aapcs_vfpcc <2 x i64> @sext_v2i1_v2i64(<2 x i64> %src) { ; CHECK-LABEL: sext_v2i1_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov r1, s1 ; CHECK-NEXT: sbcs.w r0, r2, r0 ; CHECK-NEXT: mov.w r0, #0 ; CHECK-NEXT: it lt @@ -64,8 +64,10 @@ define arm_aapcs_vfpcc <2 x i64> @sext_v2i1_v2i64(<2 x i64> %src) { ; CHECK-NEXT: movlt r2, #1 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 +; CHECK-NEXT: vmov.32 q0[0], r1 +; CHECK-NEXT: vmov.32 q0[1], r1 +; CHECK-NEXT: vmov.32 q0[2], r0 +; CHECK-NEXT: vmov.32 q0[3], r0 ; CHECK-NEXT: bx lr entry: %c = icmp sgt <2 x i64> %src, zeroinitializer @@ -119,14 +121,14 @@ entry: define arm_aapcs_vfpcc <2 x i64> @zext_v2i1_v2i64(<2 x i64> %src) { ; CHECK-LABEL: zext_v2i1_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: adr r1, .LCPI7_0 ; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: movs r0, #0 ; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r2, s1 ; CHECK-NEXT: sbcs.w r1, r0, r1 ; CHECK-NEXT: mov.w r1, #0 ; CHECK-NEXT: it lt @@ -139,7 +141,8 @@ define arm_aapcs_vfpcc <2 x i64> @zext_v2i1_v2i64(<2 x i64> %src) { ; CHECK-NEXT: movlt r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vmov q0[2], q0[0], r0, r1 +; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: vmov.32 q0[2], r1 ; CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 @@ -198,14 +201,16 @@ entry: define arm_aapcs_vfpcc <2 x i64> @trunc_v2i1_v2i64(<2 x i64> %src) { ; CHECK-LABEL: trunc_v2i1_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: and r0, r0, #1 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: and r1, r1, #1 -; CHECK-NEXT: rsbs r0, r0, #0 ; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: and r0, r0, #1 +; CHECK-NEXT: vmov.32 q1[0], r1 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: vmov.32 q1[1], r1 +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov.32 q1[3], r0 ; CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-loadstore.ll b/llvm/test/CodeGen/Thumb2/mve-pred-loadstore.ll index 1250b68..df6a38f 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pred-loadstore.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-loadstore.ll @@ -11,12 +11,14 @@ define arm_aapcs_vfpcc <4 x i32> @load_v4i1(<4 x i1> *%src, <4 x i32> %a) { ; CHECK-LE-NEXT: vmsr p0, r0 ; CHECK-LE-NEXT: vpsel q1, q2, q1 ; CHECK-LE-NEXT: vmov.u8 r0, q1[0] -; CHECK-LE-NEXT: vmov.u8 r1, q1[2] -; CHECK-LE-NEXT: vmov q2[2], q2[0], r1, r0 +; CHECK-LE-NEXT: vmov.32 q2[0], r0 ; CHECK-LE-NEXT: vmov.u8 r0, q1[1] -; CHECK-LE-NEXT: vmov.u8 r1, q1[3] +; CHECK-LE-NEXT: vmov.32 q2[1], r0 +; 
CHECK-LE-NEXT: vmov.u8 r0, q1[2] +; CHECK-LE-NEXT: vmov.32 q2[2], r0 +; CHECK-LE-NEXT: vmov.u8 r0, q1[3] +; CHECK-LE-NEXT: vmov.32 q2[3], r0 ; CHECK-LE-NEXT: vmov.i32 q1, #0x0 -; CHECK-LE-NEXT: vmov q2[3], q2[1], r1, r0 ; CHECK-LE-NEXT: vcmp.i32 ne, q2, zr ; CHECK-LE-NEXT: vpsel q0, q0, q1 ; CHECK-LE-NEXT: bx lr @@ -29,14 +31,16 @@ define arm_aapcs_vfpcc <4 x i32> @load_v4i1(<4 x i1> *%src, <4 x i32> %a) { ; CHECK-BE-NEXT: vmsr p0, r0 ; CHECK-BE-NEXT: vpsel q1, q2, q1 ; CHECK-BE-NEXT: vmov.u8 r0, q1[0] -; CHECK-BE-NEXT: vmov.u8 r1, q1[2] -; CHECK-BE-NEXT: vmov q2[2], q2[0], r1, r0 +; CHECK-BE-NEXT: vmov.32 q2[0], r0 ; CHECK-BE-NEXT: vmov.u8 r0, q1[1] -; CHECK-BE-NEXT: vmov.u8 r1, q1[3] +; CHECK-BE-NEXT: vmov.32 q2[1], r0 +; CHECK-BE-NEXT: vmov.u8 r0, q1[2] +; CHECK-BE-NEXT: vmov.32 q2[2], r0 +; CHECK-BE-NEXT: vmov.u8 r0, q1[3] +; CHECK-BE-NEXT: vmov.32 q2[3], r0 ; CHECK-BE-NEXT: vrev64.32 q1, q0 -; CHECK-BE-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-BE-NEXT: vmov.i32 q0, #0x0 ; CHECK-BE-NEXT: vcmp.i32 ne, q2, zr +; CHECK-BE-NEXT: vmov.i32 q0, #0x0 ; CHECK-BE-NEXT: vpsel q1, q1, q0 ; CHECK-BE-NEXT: vrev64.32 q0, q1 ; CHECK-BE-NEXT: bx lr @@ -141,11 +145,13 @@ define arm_aapcs_vfpcc <2 x i64> @load_v2i1(<2 x i1> *%src, <2 x i64> %a) { ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: ldrb r0, [r0] ; CHECK-LE-NEXT: and r1, r0, #1 -; CHECK-LE-NEXT: ubfx r0, r0, #1, #1 ; CHECK-LE-NEXT: rsbs r1, r1, #0 +; CHECK-LE-NEXT: ubfx r0, r0, #1, #1 +; CHECK-LE-NEXT: vmov.32 q1[0], r1 ; CHECK-LE-NEXT: rsbs r0, r0, #0 -; CHECK-LE-NEXT: vmov q1[2], q1[0], r0, r1 -; CHECK-LE-NEXT: vmov q1[3], q1[1], r0, r1 +; CHECK-LE-NEXT: vmov.32 q1[1], r1 +; CHECK-LE-NEXT: vmov.32 q1[2], r0 +; CHECK-LE-NEXT: vmov.32 q1[3], r0 ; CHECK-LE-NEXT: vand q0, q0, q1 ; CHECK-LE-NEXT: bx lr ; @@ -156,8 +162,10 @@ define arm_aapcs_vfpcc <2 x i64> @load_v2i1(<2 x i1> *%src, <2 x i64> %a) { ; CHECK-BE-NEXT: and r0, r0, #1 ; CHECK-BE-NEXT: rsbs r1, r1, #0 ; CHECK-BE-NEXT: rsbs r0, r0, #0 -; CHECK-BE-NEXT: vmov q1[2], q1[0], r0, r1 -; CHECK-BE-NEXT: vmov q1[3], q1[1], r0, r1 +; CHECK-BE-NEXT: vmov.32 q1[0], r1 +; CHECK-BE-NEXT: vmov.32 q1[1], r1 +; CHECK-BE-NEXT: vmov.32 q1[2], r0 +; CHECK-BE-NEXT: vmov.32 q1[3], r0 ; CHECK-BE-NEXT: vrev64.32 q2, q1 ; CHECK-BE-NEXT: vand q0, q0, q2 ; CHECK-BE-NEXT: bx lr diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-not.ll b/llvm/test/CodeGen/Thumb2/mve-pred-not.ll index 3bbda5c..35fb1bf 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pred-not.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-not.ll @@ -325,18 +325,20 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeqz_v2i1(<2 x i64> %a, <2 x i64> %b) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov r0, s1 ; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: orrs r0, r1 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov.32 q2[3], r0 ; CHECK-NEXT: vbic q0, q0, q2 ; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vorr q0, q1, q0 @@ -353,18 +355,20 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeq_v2i1(<2 x i64> %a, <2 x i64> %b, <2 x i6 ; CHECK: @ %bb.0: @ 
%entry ; CHECK-NEXT: vmov r0, s1 ; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov.32 q2[3], r0 ; CHECK-NEXT: vbic q0, q0, q2 ; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vorr q0, q1, q0 diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-or.ll b/llvm/test/CodeGen/Thumb2/mve-pred-or.ll index 7883bb5..fbc268fa 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pred-or.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-or.ll @@ -379,32 +379,36 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeqz_v2i1(<2 x i64> %a, <2 x i64> %b) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov r0, s5 ; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s7 +; CHECK-NEXT: vmov r1, s6 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov.32 q2[3], r0 +; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: orrs r0, r1 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov r0, s3 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s3 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.32 q3[3], r0 ; CHECK-NEXT: vorr q2, q3, q2 ; CHECK-NEXT: vbic q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 @@ -424,40 +428,44 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeq_v2i1(<2 x i64> %a, <2 x i64> %b, <2 x i6 ; CHECK-NEXT: vmov r0, s9 ; CHECK-NEXT: vmov r1, s5 ; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s6 ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: vmov r1, s8 ; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s11 +; CHECK-NEXT: vmov r1, s7 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: eors r0, r1 +; CHECK-NEXT: vmov r1, s10 ; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: eors r2, r3 -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q2[2], 
q2[0], r1, r0 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.32 q3[3], r0 +; CHECK-NEXT: vmov r0, s1 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 -; CHECK-NEXT: vorr q2, q3, q2 +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: orrs r0, r1 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov.32 q2[3], r0 +; CHECK-NEXT: vorr q2, q2, q3 ; CHECK-NEXT: vbic q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vorr q0, q0, q1 diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll index 805d938..f6d3baf 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll @@ -316,11 +316,13 @@ define <4 x i32> @shuffle5_b_v4i32(<8 x i16> %src, <4 x i32> %a, <4 x i32> %b) { ; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 +; CHECK-NEXT: vmov.32 q1[0], r0 ; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov.u16 r0, q0[2] +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: vmov.32 q1[3], r0 ; CHECK-NEXT: add r0, sp, #16 ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: mov r0, sp @@ -347,11 +349,13 @@ define <4 x i32> @shuffle5_t_v4i32(<8 x i16> %src, <4 x i32> %a, <4 x i32> %b) { ; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.u16 r1, q0[6] -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 +; CHECK-NEXT: vmov.32 q1[0], r0 ; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.u16 r1, q0[7] -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vmov.32 q1[3], r0 ; CHECK-NEXT: add r0, sp, #16 ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: mov r0, sp diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-xor.ll b/llvm/test/CodeGen/Thumb2/mve-pred-xor.ll index 0173142..f92a4bd 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pred-xor.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-xor.ll @@ -459,32 +459,36 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeqz_v2i1(<2 x i64> %a, <2 x i64> %b) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov r0, s5 ; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s7 +; CHECK-NEXT: vmov r1, s6 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: vmov.32 q2[1], 
r0 +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov.32 q2[3], r0 +; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: orrs r0, r1 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov r0, s3 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s3 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.32 q3[3], r0 ; CHECK-NEXT: veor q2, q3, q2 ; CHECK-NEXT: vbic q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 @@ -504,40 +508,44 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeq_v2i1(<2 x i64> %a, <2 x i64> %b, <2 x i6 ; CHECK-NEXT: vmov r0, s9 ; CHECK-NEXT: vmov r1, s5 ; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s6 ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: vmov r1, s8 ; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s11 +; CHECK-NEXT: vmov r1, s7 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: eors r0, r1 +; CHECK-NEXT: vmov r1, s10 ; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: eors r2, r3 -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.32 q3[3], r0 +; CHECK-NEXT: vmov r0, s1 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 -; CHECK-NEXT: veor q2, q3, q2 +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: orrs r0, r1 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov.32 q2[3], r0 +; CHECK-NEXT: veor q2, q2, q3 ; CHECK-NEXT: vbic q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vorr q0, q0, q1 diff --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll index cd44721..4579c27 100644 --- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll @@ -20,7 +20,7 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(i32* nocapture readonly %pSrcA, i32* ; CHECK-NEXT: @ %bb.2: ; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: mov r10, r1 +; CHECK-NEXT: mov r6, r1 ; CHECK-NEXT: mov r11, r2 ; CHECK-NEXT: b .LBB0_6 ; CHECK-NEXT: .LBB0_3: @ %vector.ph @@ -32,65 +32,69 @@ define arm_aapcs_vfpcc void 
@ssatmul_s_q31(i32* nocapture readonly %pSrcA, i32* ; CHECK-NEXT: str r3, [sp] @ 4-byte Spill ; CHECK-NEXT: add.w lr, r6, r7, lsr #1 ; CHECK-NEXT: add.w r11, r2, r3, lsl #2 -; CHECK-NEXT: add.w r10, r1, r3, lsl #2 +; CHECK-NEXT: add.w r6, r1, r3, lsl #2 ; CHECK-NEXT: add.w r12, r0, r3, lsl #2 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vldrw.u32 q0, [r4] +; CHECK-NEXT: mvn r10, #-2147483648 ; CHECK-NEXT: vmvn.i32 q1, #0x80000000 ; CHECK-NEXT: .LBB0_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrd r4, r5, [r0] -; CHECK-NEXT: mov.w r3, #-1 -; CHECK-NEXT: ldrd r7, r8, [r1] +; CHECK-NEXT: ldrd r4, r8, [r0] ; CHECK-NEXT: adds r0, #8 -; CHECK-NEXT: smull r6, r5, r8, r5 +; CHECK-NEXT: ldrd r7, r5, [r1] ; CHECK-NEXT: adds r1, #8 +; CHECK-NEXT: smull r8, r5, r5, r8 ; CHECK-NEXT: smull r4, r7, r7, r4 -; CHECK-NEXT: asrl r6, r5, #31 +; CHECK-NEXT: asrl r8, r5, #31 ; CHECK-NEXT: asrl r4, r7, #31 ; CHECK-NEXT: rsbs.w r9, r4, #-2147483648 -; CHECK-NEXT: vmov q4[2], q4[0], r6, r4 -; CHECK-NEXT: sbcs r3, r7 -; CHECK-NEXT: vmov q4[3], q4[1], r5, r7 +; CHECK-NEXT: vmov.32 q4[0], r4 +; CHECK-NEXT: mov.w r9, #-1 +; CHECK-NEXT: sbcs.w r3, r9, r7 ; CHECK-NEXT: mov.w r3, #0 +; CHECK-NEXT: vmov.32 q4[1], r7 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r3, #1 ; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r9, ne -; CHECK-NEXT: rsbs.w r3, r6, #-2147483648 -; CHECK-NEXT: mov.w r3, #-1 -; CHECK-NEXT: mvn r6, #-2147483648 -; CHECK-NEXT: sbcs r3, r5 +; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: vmov.32 q4[2], r8 +; CHECK-NEXT: vmov.32 q2[0], r3 +; CHECK-NEXT: vmov.32 q4[3], r5 +; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: rsbs.w r3, r8, #-2147483648 +; CHECK-NEXT: sbcs.w r3, r9, r5 ; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r3, #1 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: vmov q2[2], q2[0], r3, r9 -; CHECK-NEXT: vmov q2[3], q2[1], r3, r9 +; CHECK-NEXT: vmov.32 q2[2], r3 +; CHECK-NEXT: vmov.32 q2[3], r3 ; CHECK-NEXT: vbic q3, q0, q2 ; CHECK-NEXT: vand q2, q4, q2 ; CHECK-NEXT: vorr q2, q2, q3 ; CHECK-NEXT: vmov r4, s8 ; CHECK-NEXT: vmov r3, s9 -; CHECK-NEXT: vmov r5, s10 -; CHECK-NEXT: subs r4, r4, r6 -; CHECK-NEXT: vmov r4, s11 +; CHECK-NEXT: subs.w r4, r4, r10 ; CHECK-NEXT: sbcs r3, r3, #0 +; CHECK-NEXT: vmov r4, s10 ; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r3, #1 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: subs r5, r5, r6 +; CHECK-NEXT: vmov.32 q3[0], r3 ; CHECK-NEXT: vmov.32 q3[1], r3 -; CHECK-NEXT: sbcs r4, r4, #0 -; CHECK-NEXT: mov.w r4, #0 +; CHECK-NEXT: vmov r3, s11 +; CHECK-NEXT: subs.w r4, r4, r10 +; CHECK-NEXT: sbcs r3, r3, #0 +; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r4, #1 -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: csetm r4, ne -; CHECK-NEXT: vmov q3[2], q3[0], r4, r3 +; CHECK-NEXT: movlt r3, #1 +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: vmov.32 q3[2], r3 ; CHECK-NEXT: vbic q4, q1, q3 ; CHECK-NEXT: vand q2, q2, q3 ; CHECK-NEXT: vorr q2, q2, q4 @@ -112,7 +116,7 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(i32* nocapture readonly %pSrcA, i32* ; CHECK-NEXT: .LBB0_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r3, [r12], #4 -; CHECK-NEXT: ldr r4, [r10], #4 +; CHECK-NEXT: ldr r4, [r6], #4 ; CHECK-NEXT: smull r4, r3, r4, r3 ; CHECK-NEXT: asrl r4, r3, #31 ; CHECK-NEXT: subs r5, r1, r4 @@ -225,141 +229,149 @@ define arm_aapcs_vfpcc void @ssatmul_4_q31(i32* nocapture readonly %pSrcA, i32* ; CHECK-NEXT: sub 
sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq.w .LBB1_8 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader +; CHECK-NEXT: mov r9, r1 ; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: bhi .LBB1_3 ; CHECK-NEXT: @ %bb.2: ; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: mov r9, r1 +; CHECK-NEXT: mov r1, r9 ; CHECK-NEXT: mov r11, r2 ; CHECK-NEXT: b .LBB1_6 ; CHECK-NEXT: .LBB1_3: @ %vector.ph -; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: subs r7, r3, #4 +; CHECK-NEXT: bic r7, r3, #3 ; CHECK-NEXT: adr r4, .LCPI1_0 -; CHECK-NEXT: movs r6, #1 +; CHECK-NEXT: subs r1, r7, #4 +; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: vldrw.u32 q0, [r4] +; CHECK-NEXT: add.w lr, r3, r1, lsr #2 +; CHECK-NEXT: str r7, [sp] @ 4-byte Spill ; CHECK-NEXT: adr r4, .LCPI1_1 -; CHECK-NEXT: add.w lr, r6, r7, lsr #2 -; CHECK-NEXT: str r3, [sp] @ 4-byte Spill -; CHECK-NEXT: add.w r11, r2, r3, lsl #2 -; CHECK-NEXT: add.w r9, r1, r3, lsl #2 -; CHECK-NEXT: add.w r12, r0, r3, lsl #2 +; CHECK-NEXT: add.w r11, r2, r7, lsl #2 +; CHECK-NEXT: add.w r1, r9, r7, lsl #2 +; CHECK-NEXT: add.w r12, r0, r7, lsl #2 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vldrw.u32 q1, [r4] +; CHECK-NEXT: mov.w r10, #-1 ; CHECK-NEXT: .LBB1_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q3, [r1], #16 ; CHECK-NEXT: vldrw.u32 q2, [r0], #16 -; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: mov.w r2, #-1 +; CHECK-NEXT: vldrw.u32 q3, [r9], #16 ; CHECK-NEXT: vmov.f32 s16, s10 -; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: vmov.f32 s20, s14 ; CHECK-NEXT: vmov.f32 s18, s11 ; CHECK-NEXT: vmov.f32 s22, s15 ; CHECK-NEXT: vmullb.s32 q6, q5, q4 -; CHECK-NEXT: vmov.f32 s14, s13 +; CHECK-NEXT: vmov.f32 s10, s9 ; CHECK-NEXT: vmov r7, s25 ; CHECK-NEXT: vmov r4, s24 ; CHECK-NEXT: asrl r4, r7, #31 -; CHECK-NEXT: vmov r10, s26 +; CHECK-NEXT: vmov r8, s26 ; CHECK-NEXT: rsbs.w r5, r4, #-2147483648 -; CHECK-NEXT: vmov.f32 s10, s9 -; CHECK-NEXT: sbcs.w r5, r2, r7 +; CHECK-NEXT: vmov.f32 s14, s13 +; CHECK-NEXT: sbcs.w r5, r10, r7 ; CHECK-NEXT: mov.w r5, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r5, #1 ; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: csetm r5, ne +; CHECK-NEXT: vmov.32 q4[0], r5 +; CHECK-NEXT: vmov.32 q4[1], r5 ; CHECK-NEXT: vmov r5, s27 -; CHECK-NEXT: csetm r8, ne -; CHECK-NEXT: asrl r10, r5, #31 -; CHECK-NEXT: vmov r6, s14 -; CHECK-NEXT: rsbs.w r3, r10, #-2147483648 -; CHECK-NEXT: vmov q6[2], q6[0], r10, r4 -; CHECK-NEXT: sbcs.w r3, r2, r5 -; CHECK-NEXT: vmov q6[3], q6[1], r5, r7 -; CHECK-NEXT: mov.w r3, #0 +; CHECK-NEXT: asrl r8, r5, #31 +; CHECK-NEXT: vmov.32 q6[0], r4 +; CHECK-NEXT: rsbs.w r6, r8, #-2147483648 +; CHECK-NEXT: vmov.32 q6[1], r7 +; CHECK-NEXT: sbcs.w r6, r10, r5 +; CHECK-NEXT: vmov.32 q6[2], r8 +; CHECK-NEXT: mov.w r6, #0 +; CHECK-NEXT: vmov.32 q6[3], r5 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r3, #1 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: vmov q4[2], q4[0], r3, r8 -; CHECK-NEXT: vmov q4[3], q4[1], r3, r8 +; CHECK-NEXT: movlt r6, #1 +; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: csetm r6, ne ; CHECK-NEXT: mvn r8, #-2147483648 +; CHECK-NEXT: vmov.32 q4[2], r6 +; CHECK-NEXT: vmov.32 q4[3], r6 +; CHECK-NEXT: vmov r6, s14 ; CHECK-NEXT: vbic q5, q0, q4 ; CHECK-NEXT: vand q4, q6, q4 ; 
CHECK-NEXT: vorr q4, q4, q5 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: vmov r3, s17 +; CHECK-NEXT: vmov r5, s16 +; CHECK-NEXT: vmov r4, s17 +; CHECK-NEXT: subs.w r5, r5, r8 +; CHECK-NEXT: sbcs r4, r4, #0 ; CHECK-NEXT: vmov r5, s18 -; CHECK-NEXT: subs.w r4, r4, r8 -; CHECK-NEXT: vmov r4, s19 -; CHECK-NEXT: sbcs r3, r3, #0 -; CHECK-NEXT: mov.w r3, #0 +; CHECK-NEXT: mov.w r4, #0 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r3, #1 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: movlt r4, #1 +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: csetm r4, ne +; CHECK-NEXT: vmov.32 q5[0], r4 +; CHECK-NEXT: vmov.32 q5[1], r4 +; CHECK-NEXT: vmov r4, s19 ; CHECK-NEXT: subs.w r5, r5, r8 -; CHECK-NEXT: vmov.32 q5[1], r3 -; CHECK-NEXT: vmov r5, s10 +; CHECK-NEXT: vmov r5, s12 ; CHECK-NEXT: sbcs r4, r4, #0 ; CHECK-NEXT: mov.w r4, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r4, #1 ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: csetm r4, ne -; CHECK-NEXT: vmov q5[2], q5[0], r4, r3 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: smull r6, r5, r6, r5 +; CHECK-NEXT: vmov.32 q5[2], r4 +; CHECK-NEXT: vmov r4, s8 ; CHECK-NEXT: vbic q6, q1, q5 ; CHECK-NEXT: vand q4, q4, q5 ; CHECK-NEXT: vorr q4, q4, q6 -; CHECK-NEXT: asrl r6, r5, #31 -; CHECK-NEXT: smull r4, r7, r4, r3 +; CHECK-NEXT: smull r4, r7, r5, r4 ; CHECK-NEXT: asrl r4, r7, #31 -; CHECK-NEXT: rsbs.w r3, r4, #-2147483648 -; CHECK-NEXT: vmov q5[2], q5[0], r6, r4 -; CHECK-NEXT: sbcs.w r3, r2, r7 -; CHECK-NEXT: vmov q5[3], q5[1], r5, r7 +; CHECK-NEXT: rsbs.w r5, r4, #-2147483648 +; CHECK-NEXT: vmov.32 q3[0], r4 +; CHECK-NEXT: sbcs.w r5, r10, r7 +; CHECK-NEXT: vmov.32 q3[1], r7 +; CHECK-NEXT: mov.w r5, #0 +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r5, #1 +; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: csetm r5, ne +; CHECK-NEXT: vmov.32 q5[0], r5 +; CHECK-NEXT: vmov.32 q5[1], r5 +; CHECK-NEXT: vmov r5, s10 +; CHECK-NEXT: smull r6, r5, r6, r5 +; CHECK-NEXT: asrl r6, r5, #31 +; CHECK-NEXT: rsbs.w r3, r6, #-2147483648 +; CHECK-NEXT: vmov.32 q3[2], r6 +; CHECK-NEXT: sbcs.w r3, r10, r5 +; CHECK-NEXT: vmov.32 q3[3], r5 ; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r3, #1 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: rsbs.w r1, r6, #-2147483648 -; CHECK-NEXT: sbcs.w r1, r2, r5 -; CHECK-NEXT: mov.w r1, #0 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #1 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q2[2], q2[0], r1, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r3 -; CHECK-NEXT: ldrd r1, r2, [sp, #8] @ 8-byte Folded Reload -; CHECK-NEXT: vbic q3, q0, q2 -; CHECK-NEXT: vand q2, q5, q2 -; CHECK-NEXT: vorr q2, q2, q3 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r4, s9 -; CHECK-NEXT: subs.w r3, r3, r8 -; CHECK-NEXT: sbcs r3, r4, #0 +; CHECK-NEXT: vmov.32 q5[2], r3 +; CHECK-NEXT: vmov.32 q5[3], r3 +; CHECK-NEXT: vbic q2, q0, q5 +; CHECK-NEXT: vand q3, q3, q5 +; CHECK-NEXT: vorr q2, q3, q2 +; CHECK-NEXT: vmov r4, s8 +; CHECK-NEXT: vmov r3, s9 +; CHECK-NEXT: subs.w r4, r4, r8 +; CHECK-NEXT: sbcs r3, r3, #0 ; CHECK-NEXT: vmov r4, s10 ; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r3, #1 ; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: vmov.32 q3[0], r3 +; CHECK-NEXT: vmov.32 q3[1], r3 ; CHECK-NEXT: vmov r3, s11 -; CHECK-NEXT: csetm r5, ne -; CHECK-NEXT: vmov.32 q3[1], r5 ; CHECK-NEXT: subs.w r4, r4, r8 ; CHECK-NEXT: sbcs r3, r3, #0 ; CHECK-NEXT: mov.w r3, #0 @@ -367,7 +379,7 @@ define arm_aapcs_vfpcc void @ssatmul_4_q31(i32* nocapture readonly %pSrcA, i32* ; 
CHECK-NEXT: movlt r3, #1 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: vmov q3[2], q3[0], r3, r5 +; CHECK-NEXT: vmov.32 q3[2], r3 ; CHECK-NEXT: vbic q5, q1, q3 ; CHECK-NEXT: vand q2, q2, q3 ; CHECK-NEXT: vorr q2, q2, q5 @@ -388,25 +400,25 @@ define arm_aapcs_vfpcc void @ssatmul_4_q31(i32* nocapture readonly %pSrcA, i32* ; CHECK-NEXT: mvn r2, #-2147483648 ; CHECK-NEXT: .LBB1_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr r1, [r12], #4 -; CHECK-NEXT: ldr r4, [r9], #4 -; CHECK-NEXT: smull r4, r1, r4, r1 -; CHECK-NEXT: asrl r4, r1, #31 -; CHECK-NEXT: subs r5, r3, r4 -; CHECK-NEXT: sbcs.w r5, r0, r1 -; CHECK-NEXT: mov.w r5, #0 +; CHECK-NEXT: ldr r4, [r12], #4 +; CHECK-NEXT: ldr r5, [r1], #4 +; CHECK-NEXT: smull r4, r5, r5, r4 +; CHECK-NEXT: asrl r4, r5, #31 +; CHECK-NEXT: subs r6, r3, r4 +; CHECK-NEXT: sbcs.w r6, r0, r5 +; CHECK-NEXT: mov.w r6, #0 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r5, #1 -; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: movlt r6, #1 +; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: csel r4, r4, r3, ne -; CHECK-NEXT: csel r1, r1, r0, ne -; CHECK-NEXT: subs r5, r4, r2 -; CHECK-NEXT: sbcs r1, r1, #0 -; CHECK-NEXT: csel r1, r4, r2, lt -; CHECK-NEXT: str r1, [r11], #4 +; CHECK-NEXT: csel r5, r5, r0, ne +; CHECK-NEXT: subs r6, r4, r2 +; CHECK-NEXT: sbcs r5, r5, #0 +; CHECK-NEXT: csel r4, r4, r2, lt +; CHECK-NEXT: str r4, [r11], #4 ; CHECK-NEXT: le lr, .LBB1_7 ; CHECK-NEXT: .LBB1_8: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: add sp, #8 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -503,8 +515,10 @@ for.body: ; preds = %for.body.preheader2 define arm_aapcs_vfpcc void @ssatmul_4t_q31(i32* nocapture readonly %pSrcA, i32* nocapture readonly %pSrcB, i32* noalias nocapture %pDst, i32 %N) { ; CHECK-LABEL: ssatmul_4t_q31: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #16 @@ -513,19 +527,19 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(i32* nocapture readonly %pSrcA, i32* ; CHECK-NEXT: beq.w .LBB2_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: adds r7, r3, #3 -; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: bic r7, r7, #3 ; CHECK-NEXT: adr r4, .LCPI2_1 +; CHECK-NEXT: bic r7, r7, #3 +; CHECK-NEXT: movs r6, #1 ; CHECK-NEXT: subs r7, #4 -; CHECK-NEXT: adr r5, .LCPI2_2 ; CHECK-NEXT: vldrw.u32 q2, [r4] -; CHECK-NEXT: vldrw.u32 q3, [r5] +; CHECK-NEXT: adr r4, .LCPI2_2 +; CHECK-NEXT: mov.w r9, #0 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2 ; CHECK-NEXT: adr r6, .LCPI2_0 ; CHECK-NEXT: subs r7, r3, #1 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vldrw.u32 q0, [r6] -; CHECK-NEXT: mov.w r9, #0 +; CHECK-NEXT: vldrw.u32 q3, [r4] ; CHECK-NEXT: vdup.32 q1, r7 ; CHECK-NEXT: mov.w r12, #-1 ; CHECK-NEXT: mvn r8, #-2147483648 @@ -549,59 +563,63 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(i32* nocapture readonly %pSrcA, i32* ; CHECK-NEXT: vmov r5, s1 ; CHECK-NEXT: vmov r6, s0 ; CHECK-NEXT: asrl r6, r5, #31 -; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: rsbs.w r7, r6, #-2147483648 +; CHECK-NEXT: vmov r7, s3 +; CHECK-NEXT: rsbs.w r4, r6, #-2147483648 +; CHECK-NEXT: vmov.32 q7[0], r6 +; CHECK-NEXT: sbcs.w 
r4, r12, r5 +; CHECK-NEXT: vmov.32 q7[1], r5 +; CHECK-NEXT: mov.w r4, #0 ; CHECK-NEXT: vmov.f32 s22, s21 -; CHECK-NEXT: sbcs.w r7, r12, r5 -; CHECK-NEXT: mov.w r7, #0 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r7, #1 -; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: vmov r7, s3 -; CHECK-NEXT: csetm r10, ne +; CHECK-NEXT: movlt r4, #1 +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: csetm r4, ne +; CHECK-NEXT: vmov.32 q6[0], r4 +; CHECK-NEXT: vmov.32 q6[1], r4 +; CHECK-NEXT: vmov r4, s2 ; CHECK-NEXT: asrl r4, r7, #31 ; CHECK-NEXT: rsbs.w r3, r4, #-2147483648 -; CHECK-NEXT: vmov q7[2], q7[0], r4, r6 +; CHECK-NEXT: vmov.32 q7[2], r4 ; CHECK-NEXT: sbcs.w r3, r12, r7 -; CHECK-NEXT: vmov q7[3], q7[1], r7, r5 +; CHECK-NEXT: vmov.32 q7[3], r7 ; CHECK-NEXT: mov.w r3, #0 -; CHECK-NEXT: vmov r7, s22 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r3, #1 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: vmov q0[2], q0[0], r3, r10 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r10 -; CHECK-NEXT: vbic q6, q2, q0 -; CHECK-NEXT: vand q0, q7, q0 -; CHECK-NEXT: vorr q6, q0, q6 +; CHECK-NEXT: vmov.32 q6[2], r3 +; CHECK-NEXT: vmov.32 q6[3], r3 +; CHECK-NEXT: vbic q0, q2, q6 +; CHECK-NEXT: vand q6, q7, q6 +; CHECK-NEXT: vorr q6, q6, q0 ; CHECK-NEXT: vmov r4, s24 ; CHECK-NEXT: vmov r3, s25 -; CHECK-NEXT: vmov r5, s26 ; CHECK-NEXT: subs.w r4, r4, r8 -; CHECK-NEXT: vmov r4, s27 ; CHECK-NEXT: sbcs r3, r3, #0 +; CHECK-NEXT: vmov r4, s26 ; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r3, #1 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: subs.w r5, r5, r8 +; CHECK-NEXT: vmov.32 q0[0], r3 ; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: sbcs r4, r4, #0 -; CHECK-NEXT: mov.w r4, #0 +; CHECK-NEXT: vmov r3, s27 +; CHECK-NEXT: subs.w r4, r4, r8 +; CHECK-NEXT: vmov r4, s20 +; CHECK-NEXT: sbcs r3, r3, #0 +; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r4, #1 -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: csetm r4, ne -; CHECK-NEXT: vmov q0[2], q0[0], r4, r3 +; CHECK-NEXT: movlt r3, #1 +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: vmov.32 q0[2], r3 ; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: vmov r4, s20 ; CHECK-NEXT: vbic q7, q3, q0 ; CHECK-NEXT: vand q0, q6, q0 ; CHECK-NEXT: vorr q6, q0, q7 ; CHECK-NEXT: smull r6, r5, r4, r3 -; CHECK-NEXT: vmov r4, s18 +; CHECK-NEXT: vmov r4, s22 ; CHECK-NEXT: asrl r6, r5, #31 ; CHECK-NEXT: rsbs.w r3, r6, #-2147483648 ; CHECK-NEXT: sbcs.w r3, r12, r5 @@ -609,43 +627,49 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(i32* nocapture readonly %pSrcA, i32* ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r3, #1 ; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r10, ne -; CHECK-NEXT: smull r4, r7, r7, r4 +; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: vmov.32 q7[0], r3 +; CHECK-NEXT: vmov.32 q7[1], r3 +; CHECK-NEXT: vmov r3, s18 +; CHECK-NEXT: vmov.32 q4[0], r6 +; CHECK-NEXT: vmov.32 q4[1], r5 +; CHECK-NEXT: smull r4, r7, r4, r3 ; CHECK-NEXT: asrl r4, r7, #31 ; CHECK-NEXT: rsbs.w r3, r4, #-2147483648 -; CHECK-NEXT: vmov q5[2], q5[0], r4, r6 +; CHECK-NEXT: vmov.32 q4[2], r4 ; CHECK-NEXT: sbcs.w r3, r12, r7 -; CHECK-NEXT: vmov q5[3], q5[1], r7, r5 +; CHECK-NEXT: vmov.32 q4[3], r7 ; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r3, #1 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: vmov q0[2], q0[0], r3, r10 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r10 -; CHECK-NEXT: vbic q4, q2, q0 -; CHECK-NEXT: vand q0, q5, q0 -; CHECK-NEXT: vorr q4, q0, q4 +; CHECK-NEXT: vmov.32 q7[2], r3 +; CHECK-NEXT: vmov.32 q7[3], r3 +; CHECK-NEXT: vbic 
q0, q2, q7 +; CHECK-NEXT: vand q4, q4, q7 +; CHECK-NEXT: vorr q4, q4, q0 ; CHECK-NEXT: vmov r4, s16 ; CHECK-NEXT: vmov r3, s17 -; CHECK-NEXT: vmov r5, s18 ; CHECK-NEXT: subs.w r4, r4, r8 -; CHECK-NEXT: vmov r4, s19 ; CHECK-NEXT: sbcs r3, r3, #0 +; CHECK-NEXT: vmov r4, s18 ; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r3, #1 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: subs.w r5, r5, r8 +; CHECK-NEXT: vmov.32 q0[0], r3 ; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: sbcs r4, r4, #0 -; CHECK-NEXT: mov.w r4, #0 +; CHECK-NEXT: vmov r3, s19 +; CHECK-NEXT: subs.w r4, r4, r8 +; CHECK-NEXT: sbcs r3, r3, #0 +; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r4, #1 -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: csetm r4, ne -; CHECK-NEXT: vmov q0[2], q0[0], r4, r3 +; CHECK-NEXT: movlt r3, #1 +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: vmov.32 q0[2], r3 ; CHECK-NEXT: vbic q5, q3, q0 ; CHECK-NEXT: vand q0, q4, q0 ; CHECK-NEXT: vorr q0, q0, q5 @@ -658,7 +682,8 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(i32* nocapture readonly %pSrcA, i32* ; CHECK-NEXT: .LBB2_3: @ %for.cond.cleanup ; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.4: ; CHECK-NEXT: .LCPI2_0: @@ -750,31 +775,33 @@ define arm_aapcs_vfpcc void @usatmul_2_q31(i32* nocapture readonly %pSrcA, i32* ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB3_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrd r4, r7, [r0] +; CHECK-NEXT: ldrd r4, r9, [r0] ; CHECK-NEXT: adds r0, #8 ; CHECK-NEXT: ldrd r5, r10, [r1] ; CHECK-NEXT: adds r1, #8 ; CHECK-NEXT: umull r4, r5, r5, r4 ; CHECK-NEXT: lsrl r4, r5, #31 ; CHECK-NEXT: subs.w r6, r4, #-1 -; CHECK-NEXT: umull r6, r7, r10, r7 +; CHECK-NEXT: vmov.32 q1[0], r4 ; CHECK-NEXT: sbcs r5, r5, #0 ; CHECK-NEXT: mov.w r5, #0 ; CHECK-NEXT: it lo ; CHECK-NEXT: movlo r5, #1 ; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: lsrl r6, r7, #31 -; CHECK-NEXT: csetm r9, ne -; CHECK-NEXT: subs.w r5, r6, #-1 -; CHECK-NEXT: vmov.32 q0[1], r9 -; CHECK-NEXT: sbcs r5, r7, #0 -; CHECK-NEXT: vmov q1[2], q1[0], r6, r4 +; CHECK-NEXT: csetm r5, ne +; CHECK-NEXT: vmov.32 q0[0], r5 +; CHECK-NEXT: vmov.32 q0[1], r5 +; CHECK-NEXT: umull r6, r5, r10, r9 +; CHECK-NEXT: lsrl r6, r5, #31 +; CHECK-NEXT: subs.w r7, r6, #-1 +; CHECK-NEXT: vmov.32 q1[2], r6 +; CHECK-NEXT: sbcs r5, r5, #0 ; CHECK-NEXT: mov.w r5, #0 ; CHECK-NEXT: it lo ; CHECK-NEXT: movlo r5, #1 ; CHECK-NEXT: cmp r5, #0 ; CHECK-NEXT: csetm r5, ne -; CHECK-NEXT: vmov q0[2], q0[0], r5, r9 +; CHECK-NEXT: vmov.32 q0[2], r5 ; CHECK-NEXT: vand q1, q1, q0 ; CHECK-NEXT: vorn q0, q1, q0 ; CHECK-NEXT: vmov r4, s2 @@ -879,10 +906,8 @@ for.body: ; preds = %for.body.preheader, define arm_aapcs_vfpcc void @usatmul_4_q31(i32* nocapture readonly %pSrcA, i32* nocapture readonly %pSrcB, i32* noalias nocapture %pDst, i32 %N) { ; CHECK-LABEL: usatmul_4_q31: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: cmp r3, #0 @@ -918,53 +943,57 @@ define arm_aapcs_vfpcc void 
@usatmul_4_q31(i32* nocapture readonly %pSrcA, i32* ; CHECK-NEXT: vmov r5, s17 ; CHECK-NEXT: vmov r4, s16 ; CHECK-NEXT: lsrl r4, r5, #31 -; CHECK-NEXT: vmov r7, s19 -; CHECK-NEXT: subs.w r6, r4, #-1 ; CHECK-NEXT: vmov.f32 s10, s9 +; CHECK-NEXT: subs.w r6, r4, #-1 +; CHECK-NEXT: vmov.32 q3[0], r4 ; CHECK-NEXT: sbcs r5, r5, #0 ; CHECK-NEXT: vmov r6, s18 ; CHECK-NEXT: mov.w r5, #0 -; CHECK-NEXT: lsrl r6, r7, #31 ; CHECK-NEXT: it lo ; CHECK-NEXT: movlo r5, #1 ; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: csetm r11, ne -; CHECK-NEXT: subs.w r5, r6, #-1 -; CHECK-NEXT: sbcs r5, r7, #0 -; CHECK-NEXT: vmov.32 q1[1], r11 +; CHECK-NEXT: csetm r5, ne +; CHECK-NEXT: vmov.32 q1[0], r5 +; CHECK-NEXT: vmov.32 q1[1], r5 +; CHECK-NEXT: vmov r5, s19 +; CHECK-NEXT: lsrl r6, r5, #31 +; CHECK-NEXT: subs.w r7, r6, #-1 +; CHECK-NEXT: vmov.32 q3[2], r6 +; CHECK-NEXT: sbcs r5, r5, #0 ; CHECK-NEXT: mov.w r5, #0 -; CHECK-NEXT: vmov q3[2], q3[0], r6, r4 ; CHECK-NEXT: it lo ; CHECK-NEXT: movlo r5, #1 ; CHECK-NEXT: cmp r5, #0 ; CHECK-NEXT: csetm r5, ne -; CHECK-NEXT: vmov q1[2], q1[0], r5, r11 +; CHECK-NEXT: vmov.32 q1[2], r5 ; CHECK-NEXT: vand q3, q3, q1 ; CHECK-NEXT: vorn q1, q3, q1 ; CHECK-NEXT: vmullb.u32 q3, q2, q0 ; CHECK-NEXT: vmov r5, s13 ; CHECK-NEXT: vmov r4, s12 ; CHECK-NEXT: lsrl r4, r5, #31 -; CHECK-NEXT: vmov r7, s15 ; CHECK-NEXT: subs.w r6, r4, #-1 +; CHECK-NEXT: vmov.32 q2[0], r4 ; CHECK-NEXT: sbcs r5, r5, #0 ; CHECK-NEXT: vmov r6, s14 ; CHECK-NEXT: mov.w r5, #0 -; CHECK-NEXT: lsrl r6, r7, #31 ; CHECK-NEXT: it lo ; CHECK-NEXT: movlo r5, #1 ; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: csetm r11, ne -; CHECK-NEXT: subs.w r5, r6, #-1 -; CHECK-NEXT: sbcs r5, r7, #0 -; CHECK-NEXT: vmov.32 q0[1], r11 +; CHECK-NEXT: csetm r5, ne +; CHECK-NEXT: vmov.32 q0[0], r5 +; CHECK-NEXT: vmov.32 q0[1], r5 +; CHECK-NEXT: vmov r5, s15 +; CHECK-NEXT: lsrl r6, r5, #31 +; CHECK-NEXT: subs.w r7, r6, #-1 +; CHECK-NEXT: vmov.32 q2[2], r6 +; CHECK-NEXT: sbcs r5, r5, #0 ; CHECK-NEXT: mov.w r5, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r6, r4 ; CHECK-NEXT: it lo ; CHECK-NEXT: movlo r5, #1 ; CHECK-NEXT: cmp r5, #0 ; CHECK-NEXT: csetm r5, ne -; CHECK-NEXT: vmov q0[2], q0[0], r5, r11 +; CHECK-NEXT: vmov.32 q0[2], r5 ; CHECK-NEXT: vand q2, q2, q0 ; CHECK-NEXT: vorn q0, q2, q0 ; CHECK-NEXT: vmov.f32 s1, s2 @@ -992,8 +1021,7 @@ define arm_aapcs_vfpcc void @usatmul_4_q31(i32* nocapture readonly %pSrcA, i32* ; CHECK-NEXT: le lr, .LBB4_7 ; CHECK-NEXT: .LBB4_8: @ %for.cond.cleanup ; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} entry: %cmp8 = icmp eq i32 %N, 0 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader @@ -1563,12 +1591,12 @@ for.cond.cleanup: ; preds = %vector.body, %entry define arm_aapcs_vfpcc void @ssatmul_8t_q15(i16* nocapture readonly %pSrcA, i16* nocapture readonly %pSrcB, i16* noalias nocapture %pDst, i32 %N) { ; CHECK-LABEL: ssatmul_8t_q15: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: .pad #24 +; CHECK-NEXT: sub sp, #24 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq.w .LBB9_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph @@ -1579,99 +1607,107 @@ define arm_aapcs_vfpcc void @ssatmul_8t_q15(i16* nocapture readonly %pSrcA, i16* ; 
CHECK-NEXT: sub.w r12, r12, #8 ; CHECK-NEXT: vldrw.u32 q0, [r4] ; CHECK-NEXT: adr r4, .LCPI9_1 -; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: vmov.i8 q2, #0x0 ; CHECK-NEXT: add.w lr, lr, r12, lsr #3 ; CHECK-NEXT: sub.w r12, r3, #1 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vldrw.u32 q4, [r4] +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vdup.32 q1, r12 -; CHECK-NEXT: vmov.i8 q2, #0x0 ; CHECK-NEXT: vmov.i8 q3, #0xff ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill ; CHECK-NEXT: .LBB9_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vdup.32 q6, r5 -; CHECK-NEXT: adds r5, #8 -; CHECK-NEXT: vorr q5, q6, q0 -; CHECK-NEXT: vorr q6, q6, q4 +; CHECK-NEXT: vldrw.u32 q5, [sp] @ 16-byte Reload +; CHECK-NEXT: vdup.32 q0, r3 +; CHECK-NEXT: adds r3, #8 +; CHECK-NEXT: vorr q5, q0, q5 +; CHECK-NEXT: vorr q0, q0, q4 ; CHECK-NEXT: vcmp.u32 cs, q1, q5 -; CHECK-NEXT: vpsel q7, q3, q2 -; CHECK-NEXT: vcmp.u32 cs, q1, q6 -; CHECK-NEXT: vmov r4, s28 ; CHECK-NEXT: vpsel q6, q3, q2 +; CHECK-NEXT: vcmp.u32 cs, q1, q0 +; CHECK-NEXT: vmov r4, s24 +; CHECK-NEXT: vpsel q0, q3, q2 ; CHECK-NEXT: vmov.16 q5[0], r4 -; CHECK-NEXT: vmov r4, s29 +; CHECK-NEXT: vmov r4, s25 ; CHECK-NEXT: vmov.16 q5[1], r4 -; CHECK-NEXT: vmov r4, s30 +; CHECK-NEXT: vmov r4, s26 ; CHECK-NEXT: vmov.16 q5[2], r4 -; CHECK-NEXT: vmov r4, s31 +; CHECK-NEXT: vmov r4, s27 ; CHECK-NEXT: vmov.16 q5[3], r4 -; CHECK-NEXT: vmov r4, s24 +; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: vmov.16 q5[4], r4 -; CHECK-NEXT: vmov r4, s25 +; CHECK-NEXT: vmov r4, s1 ; CHECK-NEXT: vmov.16 q5[5], r4 -; CHECK-NEXT: vmov r4, s26 +; CHECK-NEXT: vmov r4, s2 ; CHECK-NEXT: vmov.16 q5[6], r4 -; CHECK-NEXT: vmov r4, s27 +; CHECK-NEXT: vmov r4, s3 ; CHECK-NEXT: vmov.16 q5[7], r4 ; CHECK-NEXT: vpt.i16 ne, q5, zr ; CHECK-NEXT: vldrht.u16 q6, [r0], #16 ; CHECK-NEXT: vmov.u16 r4, q6[0] -; CHECK-NEXT: vmov.u16 r3, q6[2] -; CHECK-NEXT: vmov q5[2], q5[0], r3, r4 -; CHECK-NEXT: vmov.u16 r3, q6[1] -; CHECK-NEXT: vmov.u16 r4, q6[3] ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrht.u16 q7, [r1], #16 -; CHECK-NEXT: vmov q5[3], q5[1], r4, r3 -; CHECK-NEXT: vmov.u16 r3, q7[0] +; CHECK-NEXT: vmov.32 q5[0], r4 +; CHECK-NEXT: vmov.u16 r4, q6[1] +; CHECK-NEXT: vmov.32 q5[1], r4 +; CHECK-NEXT: vmov.u16 r4, q6[2] +; CHECK-NEXT: vmov.32 q5[2], r4 +; CHECK-NEXT: vmov.u16 r4, q6[3] +; CHECK-NEXT: vmov.32 q5[3], r4 +; CHECK-NEXT: vmov.u16 r4, q7[0] +; CHECK-NEXT: vmov.32 q0[0], r4 +; CHECK-NEXT: vmov.u16 r4, q7[1] +; CHECK-NEXT: vmov.32 q0[1], r4 ; CHECK-NEXT: vmov.u16 r4, q7[2] -; CHECK-NEXT: vmov q0[2], q0[0], r4, r3 -; CHECK-NEXT: vmov.u16 r3, q7[1] +; CHECK-NEXT: vmov.32 q0[2], r4 ; CHECK-NEXT: vmov.u16 r4, q7[3] -; CHECK-NEXT: vmov q0[3], q0[1], r4, r3 -; CHECK-NEXT: vmov.u16 r4, q6[6] +; CHECK-NEXT: vmov.32 q0[3], r4 ; CHECK-NEXT: vmullb.s16 q0, q0, q5 ; CHECK-NEXT: vqshrnb.s32 q0, q0, #15 ; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov.16 q5[0], r3 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: vmov.16 q5[1], r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov.16 q5[2], r3 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: vmov.16 q5[3], r3 -; CHECK-NEXT: vmov.u16 r3, q6[4] -; CHECK-NEXT: vmov q0[2], q0[0], r4, r3 -; CHECK-NEXT: vmov.u16 r3, q6[5] +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: vmov.16 q5[0], r4 +; CHECK-NEXT: vmov r4, s1 +; CHECK-NEXT: vmov.16 q5[1], r4 +; CHECK-NEXT: vmov r4, s2 +; CHECK-NEXT: vmov.16 q5[2], r4 +; CHECK-NEXT: vmov r4, s3 +; CHECK-NEXT: vmov.16 q5[3], r4 +; CHECK-NEXT: vmov.u16 r4, q6[4] +; CHECK-NEXT: 
vmov.32 q0[0], r4 +; CHECK-NEXT: vmov.u16 r4, q6[5] +; CHECK-NEXT: vmov.32 q0[1], r4 +; CHECK-NEXT: vmov.u16 r4, q6[6] +; CHECK-NEXT: vmov.32 q0[2], r4 ; CHECK-NEXT: vmov.u16 r4, q6[7] -; CHECK-NEXT: vmov q0[3], q0[1], r4, r3 -; CHECK-NEXT: vmov.u16 r3, q7[4] +; CHECK-NEXT: vmov.32 q0[3], r4 +; CHECK-NEXT: vmov.u16 r4, q7[4] +; CHECK-NEXT: vmov.32 q6[0], r4 +; CHECK-NEXT: vmov.u16 r4, q7[5] +; CHECK-NEXT: vmov.32 q6[1], r4 ; CHECK-NEXT: vmov.u16 r4, q7[6] -; CHECK-NEXT: vmov q6[2], q6[0], r4, r3 -; CHECK-NEXT: vmov.u16 r3, q7[5] +; CHECK-NEXT: vmov.32 q6[2], r4 ; CHECK-NEXT: vmov.u16 r4, q7[7] -; CHECK-NEXT: vmov q6[3], q6[1], r4, r3 +; CHECK-NEXT: vmov.32 q6[3], r4 ; CHECK-NEXT: vmullb.s16 q0, q6, q0 ; CHECK-NEXT: vqshrnb.s32 q0, q0, #15 ; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov.16 q5[4], r3 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: vmov.16 q5[5], r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov.16 q5[6], r3 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: vmov.16 q5[7], r3 +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: vmov.16 q5[4], r4 +; CHECK-NEXT: vmov r4, s1 +; CHECK-NEXT: vmov.16 q5[5], r4 +; CHECK-NEXT: vmov r4, s2 +; CHECK-NEXT: vmov.16 q5[6], r4 +; CHECK-NEXT: vmov r4, s3 +; CHECK-NEXT: vmov.16 q5[7], r4 ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrht.16 q5, [r2], #16 ; CHECK-NEXT: le lr, .LBB9_2 ; CHECK-NEXT: .LBB9_3: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.4: ; CHECK-NEXT: .LCPI9_0: diff --git a/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll b/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll index 0021ff3..7313cb6 100644 --- a/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll +++ b/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll @@ -34,13 +34,12 @@ entry: define arm_aapcs_vfpcc <2 x i64> @sadd_int64_t(<2 x i64> %src1, <2 x i64> %src2) { ; CHECK-LABEL: sadd_int64_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: vmov r0, s5 ; CHECK-NEXT: vmov r2, s1 ; CHECK-NEXT: vmov lr, s4 -; CHECK-NEXT: vmov r5, s6 -; CHECK-NEXT: vmov r6, s2 +; CHECK-NEXT: vmov r4, s2 ; CHECK-NEXT: cmp.w r0, #-1 ; CHECK-NEXT: cset r1, gt ; CHECK-NEXT: cmp.w r2, #-1 @@ -49,49 +48,53 @@ define arm_aapcs_vfpcc <2 x i64> @sadd_int64_t(<2 x i64> %src1, <2 x i64> %src2) ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: cset r12, eq ; CHECK-NEXT: adds.w r1, r1, lr +; CHECK-NEXT: adcs r2, r0 +; CHECK-NEXT: cmp.w r2, #-1 +; CHECK-NEXT: cset r0, gt +; CHECK-NEXT: cmp r3, r0 +; CHECK-NEXT: cset r0, ne +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: and.w r0, r0, r12 +; CHECK-NEXT: mvn r12, #-2147483648 +; CHECK-NEXT: and r3, r0, #1 +; CHECK-NEXT: cset r0, mi +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: cinv r0, r12, eq +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: asrne r1, r2, #31 +; CHECK-NEXT: csel r0, r0, r2, ne +; CHECK-NEXT: vmov.32 q2[0], r1 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: cmp.w r0, #-1 +; CHECK-NEXT: cset r1, gt +; CHECK-NEXT: cmp.w r2, #-1 +; CHECK-NEXT: cset r3, gt +; CHECK-NEXT: cmp r3, r1 +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: cset lr, eq +; CHECK-NEXT: adds r1, r1, r4 ; CHECK-NEXT: adcs r0, r2 ; CHECK-NEXT: cmp.w r0, #-1 ; CHECK-NEXT: cset r2, gt ; CHECK-NEXT: cmp r3, r2 -; CHECK-NEXT: vmov r3, 
s7 ; CHECK-NEXT: cset r2, ne -; CHECK-NEXT: and.w r2, r2, r12 -; CHECK-NEXT: ands r12, r2, #1 -; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: and.w r2, r2, lr +; CHECK-NEXT: ands r2, r2, #1 ; CHECK-NEXT: it ne ; CHECK-NEXT: asrne r1, r0, #31 -; CHECK-NEXT: cmp.w r3, #-1 -; CHECK-NEXT: cset lr, gt -; CHECK-NEXT: cmp.w r2, #-1 -; CHECK-NEXT: cset r4, gt -; CHECK-NEXT: cmp r4, lr -; CHECK-NEXT: cset lr, eq -; CHECK-NEXT: adds r5, r5, r6 -; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: mvn r6, #-2147483648 -; CHECK-NEXT: cmp.w r2, #-1 -; CHECK-NEXT: cset r3, gt -; CHECK-NEXT: cmp r4, r3 -; CHECK-NEXT: cset r3, ne -; CHECK-NEXT: and.w r3, r3, lr -; CHECK-NEXT: ands r3, r3, #1 -; CHECK-NEXT: it ne -; CHECK-NEXT: asrne r5, r2, #31 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: vmov q0[2], q0[0], r5, r1 +; CHECK-NEXT: vmov.32 q2[2], r1 ; CHECK-NEXT: cset r1, mi ; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: cinv r1, r6, eq -; CHECK-NEXT: cmp.w r12, #0 -; CHECK-NEXT: csel r0, r1, r0, ne +; CHECK-NEXT: cinv r1, r12, eq ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: cset r1, mi -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: cinv r1, r6, eq -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csel r1, r1, r2, ne -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: csel r0, r1, r0, ne +; CHECK-NEXT: vmov.32 q2[3], r0 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: pop {r4, pc} entry: %0 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %src1, <2 x i64> %src2) ret <2 x i64> %0 @@ -130,36 +133,34 @@ entry: define arm_aapcs_vfpcc <2 x i64> @uadd_int64_t(<2 x i64> %src1, <2 x i64> %src2) { ; CHECK-LABEL: uadd_int64_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: vmov r2, s4 ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r0, s5 ; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: vmov r4, s6 -; CHECK-NEXT: vmov r5, s2 ; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, s7 +; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: adcs r0, r1 +; CHECK-NEXT: adcs r1, r12, #0 +; CHECK-NEXT: itt ne +; CHECK-NEXT: movne.w r0, #-1 +; CHECK-NEXT: movne.w r2, #-1 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov r0, s7 ; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: adcs lr, r12, #0 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adcs r0, r1 +; CHECK-NEXT: adcs r1, r12, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r2, #-1 -; CHECK-NEXT: adds r4, r4, r5 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: adcs r3, r12, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r4, #-1 -; CHECK-NEXT: cmp.w lr, #0 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r2 +; CHECK-NEXT: vmov.32 q2[2], r2 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r0, #-1 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r1, #-1 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: vmov.32 q2[3], r0 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr entry: %0 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %src1, <2 x i64> %src2) ret <2 x i64> %0 @@ -199,13 +200,12 @@ entry: define arm_aapcs_vfpcc <2 x i64> @ssub_int64_t(<2 x i64> %src1, <2 x i64> %src2) { ; CHECK-LABEL: ssub_int64_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: vmov r0, s5 ; CHECK-NEXT: vmov r2, s1 ; CHECK-NEXT: vmov lr, s4 -; CHECK-NEXT: vmov r5, s6 -; CHECK-NEXT: vmov r6, s2 +; CHECK-NEXT: vmov r4, s2 ; CHECK-NEXT: cmp.w 
r0, #-1 ; CHECK-NEXT: cset r1, gt ; CHECK-NEXT: cmp.w r2, #-1 @@ -214,49 +214,53 @@ define arm_aapcs_vfpcc <2 x i64> @ssub_int64_t(<2 x i64> %src1, <2 x i64> %src2) ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: cset r12, ne ; CHECK-NEXT: subs.w r1, r1, lr +; CHECK-NEXT: sbcs r2, r0 +; CHECK-NEXT: cmp.w r2, #-1 +; CHECK-NEXT: cset r0, gt +; CHECK-NEXT: cmp r3, r0 +; CHECK-NEXT: cset r0, ne +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: and.w r0, r0, r12 +; CHECK-NEXT: mvn r12, #-2147483648 +; CHECK-NEXT: and r3, r0, #1 +; CHECK-NEXT: cset r0, mi +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: cinv r0, r12, eq +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: asrne r1, r2, #31 +; CHECK-NEXT: csel r0, r0, r2, ne +; CHECK-NEXT: vmov.32 q2[0], r1 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: cmp.w r0, #-1 +; CHECK-NEXT: cset r1, gt +; CHECK-NEXT: cmp.w r2, #-1 +; CHECK-NEXT: cset r3, gt +; CHECK-NEXT: cmp r3, r1 +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: cset lr, ne +; CHECK-NEXT: subs r1, r4, r1 ; CHECK-NEXT: sbc.w r0, r2, r0 ; CHECK-NEXT: cmp.w r0, #-1 ; CHECK-NEXT: cset r2, gt ; CHECK-NEXT: cmp r3, r2 -; CHECK-NEXT: vmov r3, s7 ; CHECK-NEXT: cset r2, ne -; CHECK-NEXT: and.w r2, r2, r12 -; CHECK-NEXT: ands r12, r2, #1 -; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: and.w r2, r2, lr +; CHECK-NEXT: ands r2, r2, #1 ; CHECK-NEXT: it ne ; CHECK-NEXT: asrne r1, r0, #31 -; CHECK-NEXT: cmp.w r3, #-1 -; CHECK-NEXT: cset lr, gt -; CHECK-NEXT: cmp.w r2, #-1 -; CHECK-NEXT: cset r4, gt -; CHECK-NEXT: cmp r4, lr -; CHECK-NEXT: cset lr, ne -; CHECK-NEXT: subs r5, r6, r5 -; CHECK-NEXT: sbcs r2, r3 -; CHECK-NEXT: mvn r6, #-2147483648 -; CHECK-NEXT: cmp.w r2, #-1 -; CHECK-NEXT: cset r3, gt -; CHECK-NEXT: cmp r4, r3 -; CHECK-NEXT: cset r3, ne -; CHECK-NEXT: and.w r3, r3, lr -; CHECK-NEXT: ands r3, r3, #1 -; CHECK-NEXT: it ne -; CHECK-NEXT: asrne r5, r2, #31 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: vmov q0[2], q0[0], r5, r1 +; CHECK-NEXT: vmov.32 q2[2], r1 ; CHECK-NEXT: cset r1, mi ; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: cinv r1, r6, eq -; CHECK-NEXT: cmp.w r12, #0 -; CHECK-NEXT: csel r0, r1, r0, ne +; CHECK-NEXT: cinv r1, r12, eq ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: cset r1, mi -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: cinv r1, r6, eq -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csel r1, r1, r2, ne -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: csel r0, r1, r0, ne +; CHECK-NEXT: vmov.32 q2[3], r0 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: pop {r4, pc} entry: %0 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %src1, <2 x i64> %src2) ret <2 x i64> %0 @@ -295,38 +299,36 @@ entry: define arm_aapcs_vfpcc <2 x i64> @usub_int64_t(<2 x i64> %src1, <2 x i64> %src2) { ; CHECK-LABEL: usub_int64_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: vmov r2, s4 ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r0, s5 ; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: vmov r4, s6 -; CHECK-NEXT: vmov r5, s2 ; CHECK-NEXT: subs r2, r3, r2 -; CHECK-NEXT: vmov r3, s7 +; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: sbcs.w r0, r1, r0 ; CHECK-NEXT: adc r1, r12, #0 -; CHECK-NEXT: rsbs.w lr, r1, #1 +; CHECK-NEXT: rsbs.w r1, r1, #1 +; CHECK-NEXT: itt ne +; CHECK-NEXT: movne r0, #0 +; CHECK-NEXT: movne r2, #0 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov r0, s7 ; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: subs r2, r3, 
r2 +; CHECK-NEXT: sbcs.w r0, r1, r0 +; CHECK-NEXT: adc r1, r12, #0 +; CHECK-NEXT: rsbs.w r1, r1, #1 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r2, #0 -; CHECK-NEXT: subs r4, r5, r4 -; CHECK-NEXT: sbcs r1, r3 -; CHECK-NEXT: adc r3, r12, #0 -; CHECK-NEXT: rsbs.w r3, r3, #1 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne r4, #0 -; CHECK-NEXT: cmp.w lr, #0 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r2 +; CHECK-NEXT: vmov.32 q2[2], r2 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r0, #0 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne r1, #0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: vmov.32 q2[3], r0 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr entry: %0 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %src1, <2 x i64> %src2) ret <2 x i64> %0 diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll index 7d6ce1f..4514de4 100644 --- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll @@ -64,8 +64,9 @@ define arm_aapcs_vfpcc void @unscaled_v2i8_i8(i8* %base, <2 x i8>* %offptr, <2 x ; CHECK-NEXT: ldrb r2, [r1] ; CHECK-NEXT: vmov.i32 q1, #0xff ; CHECK-NEXT: ldrb r1, [r1, #1] -; CHECK-NEXT: vmov q2[2], q2[0], r1, r2 +; CHECK-NEXT: vmov.32 q2[0], r2 ; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.32 q2[2], r1 ; CHECK-NEXT: vand q1, q2, q1 ; CHECK-NEXT: vmov r1, s4 ; CHECK-NEXT: strb r2, [r0, r1] diff --git a/llvm/test/CodeGen/Thumb2/mve-sext.ll b/llvm/test/CodeGen/Thumb2/mve-sext.ll index e0e6fef..26f524d 100644 --- a/llvm/test/CodeGen/Thumb2/mve-sext.ll +++ b/llvm/test/CodeGen/Thumb2/mve-sext.ll @@ -63,11 +63,14 @@ define arm_aapcs_vfpcc <2 x i64> @sext_v2i64_v2i64_v2i32(<2 x i64> %m) { ; CHECK-LABEL: sext_v2i64_v2i64_v2i32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 +; CHECK-NEXT: vmov.32 q1[0], r0 ; CHECK-NEXT: asrs r0, r0, #31 -; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: asrs r0, r0, #31 +; CHECK-NEXT: vmov.32 q1[3], r0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %shl = shl <2 x i64> %m, @@ -79,13 +82,15 @@ define arm_aapcs_vfpcc <2 x i64> @sext_v2i64_v2i64_v2i35(<2 x i64> %m) { ; CHECK-LABEL: sext_v2i64_v2i64_v2i35: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 +; CHECK-NEXT: vmov.32 q1[0], r0 ; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov r1, s3 ; CHECK-NEXT: sbfx r0, r0, #0, #3 -; CHECK-NEXT: sbfx r1, r1, #0, #3 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: sbfx r0, r0, #0, #3 +; CHECK-NEXT: vmov.32 q1[3], r0 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: @@ -173,19 +178,23 @@ define arm_aapcs_vfpcc <8 x i32> @sext_v8i16_v8i32(<8 x i16> %src) { ; CHECK-LABEL: sext_v8i16_v8i32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 +; CHECK-NEXT: vmov.32 q1[0], r0 ; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov.u16 r0, q0[2] +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov.u16 r0, q0[3] +; 
CHECK-NEXT: vmov.32 q1[3], r0 ; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.u16 r1, q0[6] ; CHECK-NEXT: vmovlb.s16 q2, q1 -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 +; CHECK-NEXT: vmov.32 q1[0], r0 ; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vmov.32 q1[3], r0 ; CHECK-NEXT: vmov q0, q2 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 ; CHECK-NEXT: vmovlb.s16 q1, q1 ; CHECK-NEXT: bx lr entry: @@ -199,37 +208,45 @@ define arm_aapcs_vfpcc <16 x i32> @sext_v16i8_v16i32(<16 x i8> %src) { ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov.u8 r0, q0[0] -; CHECK-NEXT: vmov.u8 r1, q0[2] -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 +; CHECK-NEXT: vmov.32 q1[0], r0 ; CHECK-NEXT: vmov.u8 r0, q0[1] -; CHECK-NEXT: vmov.u8 r1, q0[3] -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov.u8 r0, q0[2] +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov.u8 r0, q0[3] +; CHECK-NEXT: vmov.32 q1[3], r0 ; CHECK-NEXT: vmov.u8 r0, q0[4] ; CHECK-NEXT: vmovlb.s8 q1, q1 -; CHECK-NEXT: vmov.u8 r1, q0[6] ; CHECK-NEXT: vmovlb.s16 q4, q1 -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 +; CHECK-NEXT: vmov.32 q1[0], r0 ; CHECK-NEXT: vmov.u8 r0, q0[5] -; CHECK-NEXT: vmov.u8 r1, q0[7] -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov.u8 r0, q0[6] +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov.u8 r0, q0[7] +; CHECK-NEXT: vmov.32 q1[3], r0 ; CHECK-NEXT: vmov.u8 r0, q0[8] -; CHECK-NEXT: vmov.u8 r1, q0[10] -; CHECK-NEXT: vmovlb.s8 q1, q1 -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 +; CHECK-NEXT: vmov.32 q2[0], r0 ; CHECK-NEXT: vmov.u8 r0, q0[9] -; CHECK-NEXT: vmov.u8 r1, q0[11] -; CHECK-NEXT: vmovlb.s16 q1, q1 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov.u8 r0, q0[10] +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov.u8 r0, q0[11] +; CHECK-NEXT: vmov.32 q2[3], r0 ; CHECK-NEXT: vmov.u8 r0, q0[12] -; CHECK-NEXT: vmov.u8 r1, q0[14] -; CHECK-NEXT: vmovlb.s8 q2, q2 -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 +; CHECK-NEXT: vmov.32 q3[0], r0 ; CHECK-NEXT: vmov.u8 r0, q0[13] -; CHECK-NEXT: vmov.u8 r1, q0[15] -; CHECK-NEXT: vmovlb.s16 q2, q2 -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov.u8 r0, q0[14] +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.u8 r0, q0[15] +; CHECK-NEXT: vmov.32 q3[3], r0 +; CHECK-NEXT: vmovlb.s8 q1, q1 +; CHECK-NEXT: vmovlb.s8 q2, q2 ; CHECK-NEXT: vmovlb.s8 q0, q3 ; CHECK-NEXT: vmovlb.s16 q3, q0 +; CHECK-NEXT: vmovlb.s16 q1, q1 +; CHECK-NEXT: vmovlb.s16 q2, q2 ; CHECK-NEXT: vmov q0, q4 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr @@ -242,11 +259,14 @@ define arm_aapcs_vfpcc <2 x i64> @sext_v2i32_v2i64(<2 x i32> %src) { ; CHECK-LABEL: sext_v2i32_v2i64: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 +; CHECK-NEXT: vmov.32 q1[0], r0 ; CHECK-NEXT: asrs r0, r0, #31 -; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: asrs r0, r0, #31 +; CHECK-NEXT: vmov.32 q1[3], r0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %0 = sext <2 x i32> %src to <2 x i64> @@ -333,19 +353,23 @@ define arm_aapcs_vfpcc <8 x i32> @zext_v8i16_v8i32(<8 x i16> %src) { ; 
CHECK-LABEL: zext_v8i16_v8i32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 +; CHECK-NEXT: vmov.32 q1[0], r0 ; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov.u16 r0, q0[2] +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: vmov.32 q1[3], r0 ; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.u16 r1, q0[6] ; CHECK-NEXT: vmovlb.u16 q2, q1 -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 +; CHECK-NEXT: vmov.32 q1[0], r0 ; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vmov.32 q1[3], r0 ; CHECK-NEXT: vmov q0, q2 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 ; CHECK-NEXT: vmovlb.u16 q1, q1 ; CHECK-NEXT: bx lr entry: @@ -359,35 +383,43 @@ define arm_aapcs_vfpcc <16 x i32> @zext_v16i8_v16i32(<16 x i8> %src) { ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov.u8 r0, q0[0] -; CHECK-NEXT: vmov.u8 r1, q0[2] -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov.u8 r0, q0[1] -; CHECK-NEXT: vmov.u8 r1, q0[3] ; CHECK-NEXT: vmov.i32 q3, #0xff -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vmov.u8 r0, q0[1] +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov.u8 r0, q0[2] +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov.u8 r0, q0[3] +; CHECK-NEXT: vmov.32 q1[3], r0 ; CHECK-NEXT: vmov.u8 r0, q0[4] -; CHECK-NEXT: vmov.u8 r1, q0[6] ; CHECK-NEXT: vand q4, q1, q3 -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 +; CHECK-NEXT: vmov.32 q1[0], r0 ; CHECK-NEXT: vmov.u8 r0, q0[5] -; CHECK-NEXT: vmov.u8 r1, q0[7] -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov.u8 r0, q0[6] +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov.u8 r0, q0[7] +; CHECK-NEXT: vmov.32 q1[3], r0 ; CHECK-NEXT: vmov.u8 r0, q0[8] -; CHECK-NEXT: vmov.u8 r1, q0[10] -; CHECK-NEXT: vand q1, q1, q3 -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 +; CHECK-NEXT: vmov.32 q2[0], r0 ; CHECK-NEXT: vmov.u8 r0, q0[9] -; CHECK-NEXT: vmov.u8 r1, q0[11] -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov.u8 r0, q0[10] +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov.u8 r0, q0[11] +; CHECK-NEXT: vmov.32 q2[3], r0 ; CHECK-NEXT: vmov.u8 r0, q0[12] -; CHECK-NEXT: vmov.u8 r1, q0[14] -; CHECK-NEXT: vand q2, q2, q3 -; CHECK-NEXT: vmov q5[2], q5[0], r1, r0 +; CHECK-NEXT: vmov.32 q5[0], r0 ; CHECK-NEXT: vmov.u8 r0, q0[13] -; CHECK-NEXT: vmov.u8 r1, q0[15] -; CHECK-NEXT: vmov q0, q4 -; CHECK-NEXT: vmov q5[3], q5[1], r1, r0 +; CHECK-NEXT: vmov.32 q5[1], r0 +; CHECK-NEXT: vmov.u8 r0, q0[14] +; CHECK-NEXT: vmov.32 q5[2], r0 +; CHECK-NEXT: vmov.u8 r0, q0[15] +; CHECK-NEXT: vmov.32 q5[3], r0 +; CHECK-NEXT: vand q1, q1, q3 +; CHECK-NEXT: vand q2, q2, q3 ; CHECK-NEXT: vand q3, q5, q3 +; CHECK-NEXT: vmov q0, q4 ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-shifts.ll b/llvm/test/CodeGen/Thumb2/mve-shifts.ll index 1db3ebe..8243e0d 100644 --- a/llvm/test/CodeGen/Thumb2/mve-shifts.ll +++ b/llvm/test/CodeGen/Thumb2/mve-shifts.ll @@ -38,12 +38,15 @@ define arm_aapcs_vfpcc <2 x i64> @shl_qq_int64_t(<2 x i64> %src1, <2 x i64> %src ; CHECK-NEXT: vmov r1, s1 ; CHECK-NEXT: 
vmov r2, s0 ; CHECK-NEXT: lsll r2, r1, r0 -; CHECK-NEXT: vmov r12, s6 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: lsll r0, r3, r12 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov.32 q2[1], r1 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: lsll r2, r1, r0 +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.32 q2[3], r1 +; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: bx lr entry: %0 = shl <2 x i64> %src1, %src2 @@ -87,21 +90,22 @@ entry: define arm_aapcs_vfpcc <2 x i64> @shru_qq_int64_t(<2 x i64> %src1, <2 x i64> %src2) { ; CHECK-LABEL: shru_qq_int64_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r5, lr} -; CHECK-NEXT: push {r5, lr} ; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: vmov r1, s1 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov r3, s3 ; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: lsll r0, r5, r2 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: lsll r2, r3, r1 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r5 -; CHECK-NEXT: pop {r5, pc} +; CHECK-NEXT: lsll r0, r1, r2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov.32 q2[1], r1 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: lsll r0, r1, r2 +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov.32 q2[3], r1 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr entry: %0 = lshr <2 x i64> %src1, %src2 ret <2 x i64> %0 @@ -148,12 +152,15 @@ define arm_aapcs_vfpcc <2 x i64> @shrs_qq_int64_t(<2 x i64> %src1, <2 x i64> %sr ; CHECK-NEXT: vmov r1, s1 ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: asrl r2, r1, r0 -; CHECK-NEXT: vmov r12, s6 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: asrl r0, r3, r12 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov.32 q2[1], r1 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: asrl r2, r1, r0 +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.32 q2[3], r1 +; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: bx lr entry: %0 = ashr <2 x i64> %src1, %src2 @@ -196,12 +203,15 @@ define arm_aapcs_vfpcc <2 x i64> @shl_qi_int64_t(<2 x i64> %src1) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov r1, s1 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov r3, s3 ; CHECK-NEXT: lsll r0, r1, #4 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: lsll r2, r3, #4 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov.32 q1[1], r1 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: lsll r0, r1, #4 +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov.32 q1[3], r1 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %0 = shl <2 x i64> %src1, @@ -244,12 +254,15 @@ define arm_aapcs_vfpcc <2 x i64> @shru_qi_int64_t(<2 x i64> %src1) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov r1, s1 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov r3, s3 ; CHECK-NEXT: lsrl r0, r1, #4 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: lsrl r2, r3, #4 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov.32 q1[1], r1 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: lsrl r0, r1, #4 +; CHECK-NEXT: vmov.32 q1[2], 
r0 +; CHECK-NEXT: vmov.32 q1[3], r1 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %0 = lshr <2 x i64> %src1, @@ -292,12 +305,15 @@ define arm_aapcs_vfpcc <2 x i64> @shrs_qi_int64_t(<2 x i64> %src1) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov r1, s1 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov r3, s3 ; CHECK-NEXT: asrl r0, r1, #4 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: asrl r2, r3, #4 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov.32 q1[1], r1 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: asrl r0, r1, #4 +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov.32 q1[3], r1 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %0 = ashr <2 x i64> %src1, @@ -345,13 +361,16 @@ define arm_aapcs_vfpcc <2 x i64> @shl_qr_int64_t(<2 x i64> %src1, i64 %src2) { ; CHECK-LABEL: shl_qr_int64_t: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: lsll r12, r1, r0 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: lsll r2, r1, r0 +; CHECK-NEXT: vmov.32 q1[0], r2 ; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: lsll r2, r3, r0 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r12 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 +; CHECK-NEXT: vmov.32 q1[1], r1 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: lsll r2, r1, r0 +; CHECK-NEXT: vmov.32 q1[2], r2 +; CHECK-NEXT: vmov.32 q1[3], r1 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %i = insertelement <2 x i64> undef, i64 %src2, i32 0 @@ -403,15 +422,18 @@ entry: define arm_aapcs_vfpcc <2 x i64> @shru_qr_int64_t(<2 x i64> %src1, i64 %src2) { ; CHECK-LABEL: shru_qr_int64_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: rsb.w r12, r0, #0 +; CHECK-NEXT: rsbs r0, r0, #0 ; CHECK-NEXT: vmov r1, s1 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: lsll r2, r1, r12 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: lsll r0, r3, r12 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 +; CHECK-NEXT: lsll r2, r1, r0 +; CHECK-NEXT: vmov.32 q1[0], r2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov.32 q1[1], r1 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: lsll r2, r1, r0 +; CHECK-NEXT: vmov.32 q1[2], r2 +; CHECK-NEXT: vmov.32 q1[3], r1 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %i = insertelement <2 x i64> undef, i64 %src2, i32 0 @@ -464,13 +486,16 @@ define arm_aapcs_vfpcc <2 x i64> @shrs_qr_int64_t(<2 x i64> %src1, i64 %src2) { ; CHECK-LABEL: shrs_qr_int64_t: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: asrl r12, r1, r0 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: asrl r2, r1, r0 +; CHECK-NEXT: vmov.32 q1[0], r2 ; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: asrl r2, r3, r0 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r12 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 +; CHECK-NEXT: vmov.32 q1[1], r1 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: asrl r2, r1, r0 +; CHECK-NEXT: vmov.32 q1[2], r2 +; CHECK-NEXT: vmov.32 q1[3], r1 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %i = insertelement <2 x i64> undef, i64 %src2, i32 0 diff --git a/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll b/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll index 087fa5d..dc04c5e 100644 --- a/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll +++ b/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll @@ -37,20 +37,22 @@ define arm_aapcs_vfpcc <2 x i64> @add_int64_t(<2 x i64> %src1, <2 x i64> %src2) ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; 
CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmov r1, s3 ; CHECK-NEXT: adds.w lr, r3, r2 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s3 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r3, s1 ; CHECK-NEXT: adc.w r12, r1, r0 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s7 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r1, s5 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov q0[2], q0[0], r0, lr -; CHECK-NEXT: vmov q0[3], q0[1], r1, r12 +; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: vmov.32 q0[1], r1 +; CHECK-NEXT: vmov.32 q0[2], lr +; CHECK-NEXT: vmov.32 q0[3], r12 ; CHECK-NEXT: pop {r7, pc} entry: %0 = add nsw <2 x i64> %src1, %src2 @@ -186,20 +188,22 @@ define arm_aapcs_vfpcc <2 x i64> @sub_int64_t(<2 x i64> %src1, <2 x i64> %src2) ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: vmov r1, s7 ; CHECK-NEXT: subs.w lr, r3, r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov r3, s7 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov r3, s5 ; CHECK-NEXT: sbc.w r12, r1, r0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov r1, s1 ; CHECK-NEXT: subs r0, r2, r0 ; CHECK-NEXT: sbc.w r1, r3, r1 -; CHECK-NEXT: vmov q0[2], q0[0], r0, lr -; CHECK-NEXT: vmov q0[3], q0[1], r1, r12 +; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: vmov.32 q0[1], r1 +; CHECK-NEXT: vmov.32 q0[2], lr +; CHECK-NEXT: vmov.32 q0[3], r12 ; CHECK-NEXT: pop {r7, pc} entry: %0 = sub nsw <2 x i64> %src2, %src1 @@ -348,9 +352,11 @@ define arm_aapcs_vfpcc <2 x i64> @mul_int64_t(<2 x i64> %src1, <2 x i64> %src2) ; CHECK-NEXT: vmov r2, s1 ; CHECK-NEXT: mla r0, r2, r0, lr ; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r12 +; CHECK-NEXT: vmov.32 q0[0], r12 +; CHECK-NEXT: vmov.32 q0[1], r0 +; CHECK-NEXT: vmov.32 q0[2], r4 ; CHECK-NEXT: mla r1, r2, r3, r1 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 +; CHECK-NEXT: vmov.32 q0[3], r1 ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %0 = mul nsw <2 x i64> %src1, %src2 diff --git a/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll b/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll index fa1be8e..37ca5a2 100644 --- a/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll +++ b/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll @@ -59,20 +59,22 @@ define <2 x i64> @vector_add_i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; CHECK-FP-NEXT: vmov d0, r0, r1 ; CHECK-FP-NEXT: add r0, sp, #8 ; CHECK-FP-NEXT: vldrw.u32 q1, [r0] -; CHECK-FP-NEXT: vmov r1, s0 -; CHECK-FP-NEXT: vmov r3, s4 -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vmov r2, s5 +; CHECK-FP-NEXT: vmov r1, s2 +; CHECK-FP-NEXT: vmov r0, s3 +; CHECK-FP-NEXT: vmov r3, s6 +; CHECK-FP-NEXT: vmov r2, s7 ; CHECK-FP-NEXT: adds.w lr, r1, r3 -; CHECK-FP-NEXT: vmov r3, s2 -; CHECK-FP-NEXT: vmov r1, s6 +; CHECK-FP-NEXT: vmov r3, s0 +; CHECK-FP-NEXT: vmov r1, s4 ; CHECK-FP-NEXT: adc.w r12, r0, r2 -; CHECK-FP-NEXT: vmov r2, s3 -; CHECK-FP-NEXT: vmov r0, s7 +; CHECK-FP-NEXT: vmov r2, s1 +; CHECK-FP-NEXT: vmov r0, s5 ; CHECK-FP-NEXT: adds r1, r1, r3 -; CHECK-FP-NEXT: vmov q0[2], q0[0], r1, lr +; CHECK-FP-NEXT: vmov.32 q0[0], r1 ; CHECK-FP-NEXT: adcs r0, r2 -; CHECK-FP-NEXT: vmov 
q0[3], q0[1], r0, r12 +; CHECK-FP-NEXT: vmov.32 q0[1], r0 +; CHECK-FP-NEXT: vmov.32 q0[2], lr +; CHECK-FP-NEXT: vmov.32 q0[3], r12 ; CHECK-FP-NEXT: vmov r0, r1, d0 ; CHECK-FP-NEXT: vmov r2, r3, d1 ; CHECK-FP-NEXT: pop {r7, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll index 7cc51df..cb82f90 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll @@ -124,19 +124,22 @@ define arm_aapcs_vfpcc <8 x i16> @vabd_s16(<8 x i16> %src1, <8 x i16> %src2) { ; CHECK-LABEL: vabd_s16: ; CHECK: @ %bb.0: ; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.u16 r1, q1[2] -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 +; CHECK-NEXT: vmov.32 q2[0], r0 ; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.u16 r1, q1[3] -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov.u16 r0, q1[2] +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov.u16 r0, q1[3] +; CHECK-NEXT: vmov.32 q2[3], r0 ; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: vmovlb.s16 q2, q2 -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 +; CHECK-NEXT: vmov.32 q3[0], r0 ; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 -; CHECK-NEXT: vmov.u16 r1, q1[6] +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov.u16 r0, q0[2] +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: vmov.32 q3[3], r0 +; CHECK-NEXT: vmovlb.s16 q2, q2 ; CHECK-NEXT: vmovlb.s16 q3, q3 ; CHECK-NEXT: vsub.i32 q2, q3, q2 ; CHECK-NEXT: vabs.s32 q3, q2 @@ -149,17 +152,22 @@ define arm_aapcs_vfpcc <8 x i16> @vabd_s16(<8 x i16> %src1, <8 x i16> %src2) { ; CHECK-NEXT: vmov r0, s15 ; CHECK-NEXT: vmov.16 q2[3], r0 ; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 +; CHECK-NEXT: vmov.32 q3[0], r0 ; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.u16 r1, q1[7] -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov.u16 r0, q1[6] +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.u16 r0, q1[7] +; CHECK-NEXT: vmov.32 q3[3], r0 ; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.u16 r1, q0[6] ; CHECK-NEXT: vmovlb.s16 q1, q3 -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 +; CHECK-NEXT: vmov.32 q3[0], r0 ; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.u16 r1, q0[7] -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vmov.32 q3[3], r0 ; CHECK-NEXT: vmovlb.s16 q0, q3 ; CHECK-NEXT: vsub.i32 q0, q0, q1 ; CHECK-NEXT: vabs.s32 q0, q0 @@ -186,47 +194,47 @@ define arm_aapcs_vfpcc <8 x i16> @vabd_s16(<8 x i16> %src1, <8 x i16> %src2) { define arm_aapcs_vfpcc <4 x i32> @vabd_s32(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-LABEL: vabd_s32: ; CHECK: @ %bb.0: -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov.f32 s8, s0 ; CHECK-NEXT: vmov.f32 s12, s4 ; CHECK-NEXT: vmov.f32 s10, s1 -; CHECK-NEXT: vmov r0, s8 ; CHECK-NEXT: vmov.f32 s14, s5 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov.f32 s16, s2 -; CHECK-NEXT: vmov.f32 s20, s6 -; CHECK-NEXT: vmov.f32 s18, s3 -; CHECK-NEXT: vmov.f32 s22, s7 -; CHECK-NEXT: vmov r3, s20 +; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: vmov.f32 s12, s2 +; CHECK-NEXT: vmov.f32 s14, s3 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.f32 s2, s7 ; 
CHECK-NEXT: asrs r1, r0, #31 ; CHECK-NEXT: subs r0, r0, r2 ; CHECK-NEXT: sbc.w r1, r1, r2, asr #31 ; CHECK-NEXT: add.w r0, r0, r1, asr #31 ; CHECK-NEXT: eor.w r0, r0, r1, asr #31 -; CHECK-NEXT: vmov r1, s16 +; CHECK-NEXT: vmov r1, s8 ; CHECK-NEXT: asrs r2, r1, #31 ; CHECK-NEXT: subs r1, r1, r3 ; CHECK-NEXT: sbc.w r2, r2, r3, asr #31 -; CHECK-NEXT: vmov r3, s22 ; CHECK-NEXT: add.w r1, r1, r2, asr #31 ; CHECK-NEXT: eor.w r1, r1, r2, asr #31 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.32 q2[0], r1 +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov r0, s12 ; CHECK-NEXT: asrs r1, r0, #31 ; CHECK-NEXT: subs r0, r0, r2 ; CHECK-NEXT: sbc.w r1, r1, r2, asr #31 +; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: add.w r0, r0, r1, asr #31 ; CHECK-NEXT: eor.w r0, r0, r1, asr #31 -; CHECK-NEXT: vmov r1, s18 -; CHECK-NEXT: asrs r2, r1, #31 -; CHECK-NEXT: subs r1, r1, r3 -; CHECK-NEXT: sbc.w r2, r2, r3, asr #31 -; CHECK-NEXT: add.w r1, r1, r2, asr #31 -; CHECK-NEXT: eor.w r1, r1, r2, asr #31 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: asrs r1, r0, #31 +; CHECK-NEXT: subs r0, r0, r2 +; CHECK-NEXT: sbc.w r1, r1, r2, asr #31 +; CHECK-NEXT: add.w r0, r0, r1, asr #31 +; CHECK-NEXT: eor.w r0, r0, r1, asr #31 +; CHECK-NEXT: vmov.32 q2[3], r0 +; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: bx lr %sextsrc1 = sext <4 x i32> %src1 to <4 x i64> %sextsrc2 = sext <4 x i32> %src2 to <4 x i64> @@ -361,19 +369,22 @@ define arm_aapcs_vfpcc <8 x i16> @vabd_u16(<8 x i16> %src1, <8 x i16> %src2) { ; CHECK-LABEL: vabd_u16: ; CHECK: @ %bb.0: ; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.u16 r1, q1[2] -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 +; CHECK-NEXT: vmov.32 q2[0], r0 ; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.u16 r1, q1[3] -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov.u16 r0, q1[2] +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov.u16 r0, q1[3] +; CHECK-NEXT: vmov.32 q2[3], r0 ; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: vmovlb.u16 q2, q2 -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 +; CHECK-NEXT: vmov.32 q3[0], r0 ; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 -; CHECK-NEXT: vmov.u16 r1, q1[6] +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov.u16 r0, q0[2] +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: vmov.32 q3[3], r0 +; CHECK-NEXT: vmovlb.u16 q2, q2 ; CHECK-NEXT: vmovlb.u16 q3, q3 ; CHECK-NEXT: vsub.i32 q2, q3, q2 ; CHECK-NEXT: vabs.s32 q3, q2 @@ -386,17 +397,22 @@ define arm_aapcs_vfpcc <8 x i16> @vabd_u16(<8 x i16> %src1, <8 x i16> %src2) { ; CHECK-NEXT: vmov r0, s15 ; CHECK-NEXT: vmov.16 q2[3], r0 ; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 +; CHECK-NEXT: vmov.32 q3[0], r0 ; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.u16 r1, q1[7] -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov.u16 r0, q1[6] +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.u16 r0, q1[7] +; CHECK-NEXT: vmov.32 q3[3], r0 ; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.u16 r1, q0[6] ; CHECK-NEXT: vmovlb.u16 q1, q3 -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 +; CHECK-NEXT: vmov.32 q3[0], r0 ; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.u16 r1, q0[7] -; CHECK-NEXT: 
vmov q3[3], q3[1], r1, r0 +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vmov.32 q3[3], r0 ; CHECK-NEXT: vmovlb.u16 q0, q3 ; CHECK-NEXT: vsub.i32 q0, q0, q1 ; CHECK-NEXT: vabs.s32 q0, q0 @@ -423,56 +439,59 @@ define arm_aapcs_vfpcc <8 x i16> @vabd_u16(<8 x i16> %src1, <8 x i16> %src2) { define arm_aapcs_vfpcc <4 x i32> @vabd_u32(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-LABEL: vabd_u32: ; CHECK: @ %bb.0: -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov.f32 s8, s4 -; CHECK-NEXT: vmov.i64 q4, #0xffffffff -; CHECK-NEXT: vmov.f32 s12, s0 +; CHECK-NEXT: vmov.i64 q3, #0xffffffff +; CHECK-NEXT: vmov.f32 s16, s0 ; CHECK-NEXT: vmov.f32 s10, s5 -; CHECK-NEXT: vmov.f32 s14, s1 -; CHECK-NEXT: vand q2, q2, q4 -; CHECK-NEXT: vand q3, q3, q4 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov r1, s13 -; CHECK-NEXT: vmov.f32 s20, s6 -; CHECK-NEXT: vmov.f32 s22, s7 -; CHECK-NEXT: vand q1, q5, q4 -; CHECK-NEXT: vmov.f32 s20, s2 -; CHECK-NEXT: vmov.f32 s22, s3 -; CHECK-NEXT: vand q4, q5, q4 +; CHECK-NEXT: vmov.f32 s18, s1 +; CHECK-NEXT: vand q2, q2, q3 +; CHECK-NEXT: vand q4, q4, q3 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: vmov r3, s18 +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: vmov r1, s19 ; CHECK-NEXT: subs r2, r3, r2 -; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r3, s8 ; CHECK-NEXT: sbc.w r0, r1, r0 ; CHECK-NEXT: add.w r1, r2, r0, asr #31 ; CHECK-NEXT: vmov r2, s17 ; CHECK-NEXT: eor.w r12, r1, r0, asr #31 ; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: vmov r1, s9 +; CHECK-NEXT: vmov.f32 s16, s6 +; CHECK-NEXT: vmov.f32 s18, s7 +; CHECK-NEXT: vand q1, q4, q3 +; CHECK-NEXT: vmov.f32 s16, s2 +; CHECK-NEXT: vmov.f32 s18, s3 +; CHECK-NEXT: vand q0, q4, q3 ; CHECK-NEXT: subs r0, r0, r3 -; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: sbc.w r1, r2, r1 -; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: vmov r2, s4 ; CHECK-NEXT: add.w r0, r0, r1, asr #31 ; CHECK-NEXT: eor.w r0, r0, r1, asr #31 -; CHECK-NEXT: vmov r1, s15 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r12 -; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.32 q2[1], r12 ; CHECK-NEXT: subs r2, r3, r2 -; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: sbc.w r0, r1, r0 ; CHECK-NEXT: add.w r1, r2, r0, asr #31 -; CHECK-NEXT: vmov r2, s19 -; CHECK-NEXT: eor.w r12, r1, r0, asr #31 -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov r1, s7 -; CHECK-NEXT: subs r0, r0, r3 -; CHECK-NEXT: sbc.w r1, r2, r1 -; CHECK-NEXT: add.w r0, r0, r1, asr #31 -; CHECK-NEXT: eor.w r0, r0, r1, asr #31 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: eor.w r0, r1, r0, asr #31 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: subs r2, r3, r2 +; CHECK-NEXT: sbc.w r0, r1, r0 +; CHECK-NEXT: add.w r1, r2, r0, asr #31 +; CHECK-NEXT: eor.w r0, r1, r0, asr #31 +; CHECK-NEXT: vmov.32 q2[3], r0 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr %zextsrc1 = zext <4 x i32> %src1 to <4 x i64> %zextsrc2 = zext <4 x i32> %src2 to <4 x i64> @@ -599,8 +618,10 @@ for.cond.cleanup: ; preds = %vector.body define void @vabd_loop_s32(i32* nocapture readonly %x, i32* 
nocapture readonly %y, i32* noalias nocapture %z, i32 %n) { ; CHECK-LABEL: vabd_loop_s32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: mov.w lr, #256 @@ -632,31 +653,34 @@ define void @vabd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly % ; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: subs.w r9, r5, r7 ; CHECK-NEXT: asr.w r6, r5, #31 +; CHECK-NEXT: vmov r5, s6 ; CHECK-NEXT: sbc.w r6, r6, r7, asr #31 +; CHECK-NEXT: vmov.32 q1[0], r8 +; CHECK-NEXT: vmov.32 q1[1], r9 ; CHECK-NEXT: and.w r6, r12, r6, asr #31 ; CHECK-NEXT: rsbs r6, r6, #0 ; CHECK-NEXT: bfi r4, r6, #4, #4 ; CHECK-NEXT: vmov r6, s12 -; CHECK-NEXT: subs.w r10, r6, r3 -; CHECK-NEXT: asr.w r7, r6, #31 +; CHECK-NEXT: asrs r7, r6, #31 +; CHECK-NEXT: subs r6, r6, r3 ; CHECK-NEXT: sbc.w r3, r7, r3, asr #31 -; CHECK-NEXT: vmov r7, s14 -; CHECK-NEXT: vmov r6, s6 +; CHECK-NEXT: vmov.32 q1[2], r6 ; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: asrs r5, r7, #31 -; CHECK-NEXT: subs r7, r7, r6 -; CHECK-NEXT: sbc.w r5, r5, r6, asr #31 -; CHECK-NEXT: asrs r6, r5, #31 -; CHECK-NEXT: vmov q1[2], q1[0], r6, r3 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov q1[2], q1[0], r10, r8 -; CHECK-NEXT: vmov q1[3], q1[1], r7, r9 -; CHECK-NEXT: and r3, r3, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r4, r3, #8, #4 -; CHECK-NEXT: and.w r3, r12, r5, asr #31 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r4, r3, #12, #4 +; CHECK-NEXT: vmov.32 q2[0], r3 +; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: asrs r7, r3, #31 +; CHECK-NEXT: subs r3, r3, r5 +; CHECK-NEXT: sbc.w r5, r7, r5, asr #31 +; CHECK-NEXT: vmov.32 q1[3], r3 +; CHECK-NEXT: asrs r7, r5, #31 +; CHECK-NEXT: and.w r5, r12, r5, asr #31 +; CHECK-NEXT: vmov.32 q2[2], r7 +; CHECK-NEXT: rsbs r5, r5, #0 +; CHECK-NEXT: vmov r7, s8 +; CHECK-NEXT: and r7, r7, #1 +; CHECK-NEXT: rsbs r7, r7, #0 +; CHECK-NEXT: bfi r4, r7, #8, #4 +; CHECK-NEXT: bfi r4, r5, #12, #4 ; CHECK-NEXT: vmsr p0, r4 ; CHECK-NEXT: vpst ; CHECK-NEXT: vsubt.i32 q1, q0, q1 @@ -664,7 +688,8 @@ define void @vabd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly % ; CHECK-NEXT: le lr, .LBB8_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} entry: br label %vector.body @@ -809,10 +834,8 @@ for.cond.cleanup: ; preds = %vector.body define void @vabd_loop_u32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) { ; CHECK-LABEL: vabd_loop_u32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: mov.w lr, #256 @@ -859,25 +882,28 @@ define void @vabd_loop_u32(i32* nocapture readonly %x, i32* nocapture readonly % ; CHECK-NEXT: bfi r4, r3, #4, #4 ; CHECK-NEXT: vmov r3, s9 ; CHECK-NEXT: subs.w r10, r5, r7 -; CHECK-NEXT: vmov r5, s10 -; CHECK-NEXT: vmov r7, s15 +; CHECK-NEXT: vmov r7, s10 +; 
CHECK-NEXT: vmov r5, s14 ; CHECK-NEXT: sbc.w r3, r6, r3 -; CHECK-NEXT: vmov r6, s11 -; CHECK-NEXT: asr.w r11, r3, #31 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: subs r3, r3, r5 -; CHECK-NEXT: sbc.w r5, r7, r6 -; CHECK-NEXT: asrs r6, r5, #31 -; CHECK-NEXT: and.w r5, r12, r5, asr #31 -; CHECK-NEXT: vmov q2[2], q2[0], r6, r11 -; CHECK-NEXT: rsbs r5, r5, #0 -; CHECK-NEXT: vmov r6, s8 -; CHECK-NEXT: vmov q2[2], q2[0], r10, r8 -; CHECK-NEXT: vmov q2[3], q2[1], r3, r9 +; CHECK-NEXT: vmov r6, s15 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov.32 q4[0], r3 +; CHECK-NEXT: vmov r3, s11 +; CHECK-NEXT: vmov.32 q2[0], r8 +; CHECK-NEXT: vmov.32 q2[1], r9 +; CHECK-NEXT: vmov.32 q2[2], r10 +; CHECK-NEXT: subs r5, r5, r7 +; CHECK-NEXT: vmov.32 q2[3], r5 +; CHECK-NEXT: sbc.w r3, r6, r3 +; CHECK-NEXT: asrs r6, r3, #31 +; CHECK-NEXT: and.w r3, r12, r3, asr #31 +; CHECK-NEXT: vmov.32 q4[2], r6 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: vmov r6, s16 ; CHECK-NEXT: and r6, r6, #1 ; CHECK-NEXT: rsbs r6, r6, #0 ; CHECK-NEXT: bfi r4, r6, #8, #4 -; CHECK-NEXT: bfi r4, r5, #12, #4 +; CHECK-NEXT: bfi r4, r3, #12, #4 ; CHECK-NEXT: vmsr p0, r4 ; CHECK-NEXT: vpst ; CHECK-NEXT: vsubt.i32 q2, q1, q2 @@ -885,8 +911,7 @@ define void @vabd_loop_u32(i32* nocapture readonly %x, i32* nocapture readonly % ; CHECK-NEXT: le lr, .LBB11_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} entry: br label %vector.body diff --git a/llvm/test/CodeGen/Thumb2/mve-vcmp.ll b/llvm/test/CodeGen/Thumb2/mve-vcmp.ll index 56f9f56..a40beb4 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vcmp.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vcmp.ll @@ -367,31 +367,36 @@ entry: define arm_aapcs_vfpcc <2 x i64> @vcmp_eq_v2i64(<2 x i64> %src, <2 x i64> %srcb, <2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: vcmp_eq_v2i64: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov r0, s5 ; CHECK-NEXT: vmov r1, s1 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: vmov r1, s4 ; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s7 +; CHECK-NEXT: vmov r1, s3 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q4[0], r0 +; CHECK-NEXT: vmov.32 q4[1], r0 +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: eors r0, r1 +; CHECK-NEXT: vmov r1, s6 ; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: eors r2, r3 -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: vbic q1, q3, q0 -; CHECK-NEXT: vand q0, q2, q0 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: orrs r0, r1 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q4[2], r0 +; CHECK-NEXT: vmov.32 q4[3], r0 +; CHECK-NEXT: vbic q0, q3, q4 +; CHECK-NEXT: vand q1, q2, q4 +; CHECK-NEXT: vorr q0, q1, q0 +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i64> %src, %srcb @@ -402,31 +407,36 @@ entry: define arm_aapcs_vfpcc <2 x i32> @vcmp_eq_v2i32(<2 x i64> %src, <2 x i64> %srcb, <2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: vcmp_eq_v2i32: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush 
{d8, d9} ; CHECK-NEXT: vmov r0, s5 ; CHECK-NEXT: vmov r1, s1 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: vmov r1, s4 ; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s7 +; CHECK-NEXT: vmov r1, s3 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q4[0], r0 +; CHECK-NEXT: vmov.32 q4[1], r0 +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: eors r0, r1 +; CHECK-NEXT: vmov r1, s6 ; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: eors r2, r3 -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: vbic q1, q3, q0 -; CHECK-NEXT: vand q0, q2, q0 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: orrs r0, r1 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q4[2], r0 +; CHECK-NEXT: vmov.32 q4[3], r0 +; CHECK-NEXT: vbic q0, q3, q4 +; CHECK-NEXT: vand q1, q2, q4 +; CHECK-NEXT: vorr q0, q1, q0 +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i64> %src, %srcb @@ -437,76 +447,84 @@ entry: define arm_aapcs_vfpcc <2 x i32> @vcmp_multi_v2i32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) { ; CHECK-LABEL: vcmp_multi_v2i32: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov r0, s1 ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r2, s8 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: vbic q0, q2, q0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: subs r1, r0, r2 -; CHECK-NEXT: asr.w r12, r0, #31 +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: orrs r0, r1 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.32 q3[3], r0 +; CHECK-NEXT: vbic q0, q2, q3 +; CHECK-NEXT: vmov lr, s0 +; CHECK-NEXT: subs.w r1, lr, r2 +; CHECK-NEXT: asr.w r12, lr, #31 ; CHECK-NEXT: sbcs.w r1, r12, r2, asr #31 ; CHECK-NEXT: mov.w r1, #0 -; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r2, s10 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #1 ; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: vmov r1, s10 -; CHECK-NEXT: csetm lr, ne -; CHECK-NEXT: asr.w r12, r2, #31 -; CHECK-NEXT: subs r4, r2, r1 -; CHECK-NEXT: sbcs.w r1, r12, r1, asr #31 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: vmov.32 q3[0], r1 +; CHECK-NEXT: vmov.32 q3[1], r1 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: subs r0, r1, r2 +; CHECK-NEXT: asr.w r12, r1, #31 +; CHECK-NEXT: sbcs.w r0, r12, r2, asr #31 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r3, #1 ; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: cmp.w lr, #0 +; CHECK-NEXT: vmov.32 q3[2], r0 +; 
CHECK-NEXT: vmov.32 q3[3], r0 ; CHECK-NEXT: cset r0, ne -; CHECK-NEXT: vmov q3[2], q3[0], r1, lr ; CHECK-NEXT: tst.w r0, #1 -; CHECK-NEXT: vmov q3[3], q3[1], r1, lr ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: cset r1, ne -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q4[2], q4[0], r1, r0 -; CHECK-NEXT: vmov q4[3], q4[1], r1, r0 +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: vmov.32 q4[0], r0 +; CHECK-NEXT: vmov.32 q4[1], r0 +; CHECK-NEXT: cset r0, ne +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q4[2], r0 +; CHECK-NEXT: vmov.32 q4[3], r0 ; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s6 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: cset r0, ne ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r1, ne -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vand q1, q1, q4 +; CHECK-NEXT: vmov.32 q5[0], r0 +; CHECK-NEXT: vmov.32 q5[1], r0 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cset r0, ne +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q5[2], r0 +; CHECK-NEXT: vmov.32 q5[3], r0 +; CHECK-NEXT: vand q1, q5, q4 ; CHECK-NEXT: vand q1, q3, q1 ; CHECK-NEXT: vbic q0, q0, q1 ; CHECK-NEXT: vand q1, q2, q1 ; CHECK-NEXT: vorr q0, q1, q0 -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: pop {r7, pc} %a4 = icmp eq <2 x i64> %a, zeroinitializer %a5 = select <2 x i1> %a4, <2 x i32> zeroinitializer, <2 x i32> %c %a6 = icmp ne <2 x i32> %b, zeroinitializer diff --git a/llvm/test/CodeGen/Thumb2/mve-vcmpr.ll b/llvm/test/CodeGen/Thumb2/mve-vcmpr.ll index 818b65d..06361d9 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vcmpr.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vcmpr.ll @@ -438,22 +438,24 @@ define arm_aapcs_vfpcc <2 x i64> @vcmp_eq_v2i64(<2 x i64> %src, i64 %src2, <2 x ; CHECK-NEXT: eors r2, r1 ; CHECK-NEXT: eors r3, r0 ; CHECK-NEXT: orrs r2, r3 -; CHECK-NEXT: vmov r3, s3 ; CHECK-NEXT: cset r2, eq ; CHECK-NEXT: tst.w r2, #1 ; CHECK-NEXT: csetm r2, ne -; CHECK-NEXT: eors r1, r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: eors r0, r3 +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: vmov.32 q3[1], r2 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: eors r1, r2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: eors r0, r2 ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r2 -; CHECK-NEXT: vbic q2, q2, q0 -; CHECK-NEXT: vand q0, q1, q0 -; CHECK-NEXT: vorr q0, q0, q2 +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.32 q3[3], r0 +; CHECK-NEXT: vbic q0, q2, q3 +; CHECK-NEXT: vand q1, q1, q3 +; CHECK-NEXT: vorr q0, q1, q0 ; CHECK-NEXT: bx lr entry: %i = insertelement <2 x i64> undef, i64 %src2, i32 0 @@ -471,22 +473,24 @@ define arm_aapcs_vfpcc <2 x i32> @vcmp_eq_v2i32(<2 x i64> %src, i64 %src2, <2 x ; CHECK-NEXT: eors r2, r1 ; CHECK-NEXT: eors r3, r0 ; CHECK-NEXT: orrs r2, r3 -; CHECK-NEXT: vmov r3, s3 ; CHECK-NEXT: cset r2, eq ; CHECK-NEXT: tst.w r2, #1 ; CHECK-NEXT: csetm r2, ne -; CHECK-NEXT: eors r1, r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: eors r0, r3 +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: vmov.32 q3[1], r2 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: eors r1, r2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: eors r0, r2 ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: 
cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r2 -; CHECK-NEXT: vbic q2, q2, q0 -; CHECK-NEXT: vand q0, q1, q0 -; CHECK-NEXT: vorr q0, q0, q2 +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.32 q3[3], r0 +; CHECK-NEXT: vbic q0, q2, q3 +; CHECK-NEXT: vand q1, q1, q3 +; CHECK-NEXT: vorr q0, q1, q0 ; CHECK-NEXT: bx lr entry: %i = insertelement <2 x i64> undef, i64 %src2, i32 0 @@ -499,76 +503,84 @@ entry: define arm_aapcs_vfpcc <2 x i32> @vcmp_multi_v2i32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) { ; CHECK-LABEL: vcmp_multi_v2i32: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov r0, s1 ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r2, s8 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: vbic q0, q2, q0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: subs r1, r0, r2 -; CHECK-NEXT: asr.w r12, r0, #31 +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: orrs r0, r1 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.32 q3[3], r0 +; CHECK-NEXT: vbic q0, q2, q3 +; CHECK-NEXT: vmov lr, s0 +; CHECK-NEXT: subs.w r1, lr, r2 +; CHECK-NEXT: asr.w r12, lr, #31 ; CHECK-NEXT: sbcs.w r1, r12, r2, asr #31 ; CHECK-NEXT: mov.w r1, #0 -; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r2, s10 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #1 ; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: vmov r1, s10 -; CHECK-NEXT: csetm lr, ne -; CHECK-NEXT: asr.w r12, r2, #31 -; CHECK-NEXT: subs r4, r2, r1 -; CHECK-NEXT: sbcs.w r1, r12, r1, asr #31 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: vmov.32 q3[0], r1 +; CHECK-NEXT: vmov.32 q3[1], r1 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: subs r0, r1, r2 +; CHECK-NEXT: asr.w r12, r1, #31 +; CHECK-NEXT: sbcs.w r0, r12, r2, asr #31 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r3, #1 ; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: cmp.w lr, #0 +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.32 q3[3], r0 ; CHECK-NEXT: cset r0, ne -; CHECK-NEXT: vmov q3[2], q3[0], r1, lr ; CHECK-NEXT: tst.w r0, #1 -; CHECK-NEXT: vmov q3[3], q3[1], r1, lr ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: cset r1, ne -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q4[2], q4[0], r1, r0 -; CHECK-NEXT: vmov q4[3], q4[1], r1, r0 +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: vmov.32 q4[0], r0 +; CHECK-NEXT: vmov.32 q4[1], r0 +; CHECK-NEXT: cset r0, ne +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q4[2], r0 +; CHECK-NEXT: vmov.32 q4[3], r0 ; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s6 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: cset r0, ne ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: cmp r1, 
#0 -; CHECK-NEXT: cset r1, ne -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vand q1, q1, q4 +; CHECK-NEXT: vmov.32 q5[0], r0 +; CHECK-NEXT: vmov.32 q5[1], r0 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cset r0, ne +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q5[2], r0 +; CHECK-NEXT: vmov.32 q5[3], r0 +; CHECK-NEXT: vand q1, q5, q4 ; CHECK-NEXT: vand q1, q3, q1 ; CHECK-NEXT: vbic q0, q0, q1 ; CHECK-NEXT: vand q1, q2, q1 ; CHECK-NEXT: vorr q0, q1, q0 -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: pop {r7, pc} %a4 = icmp eq <2 x i64> %a, zeroinitializer %a5 = select <2 x i1> %a4, <2 x i32> zeroinitializer, <2 x i32> %c %a6 = icmp ne <2 x i32> %b, zeroinitializer @@ -1019,22 +1031,24 @@ define arm_aapcs_vfpcc <2 x i64> @vcmp_r_eq_v2i64(<2 x i64> %src, i64 %src2, <2 ; CHECK-NEXT: eors r2, r1 ; CHECK-NEXT: eors r3, r0 ; CHECK-NEXT: orrs r2, r3 -; CHECK-NEXT: vmov r3, s3 ; CHECK-NEXT: cset r2, eq ; CHECK-NEXT: tst.w r2, #1 ; CHECK-NEXT: csetm r2, ne -; CHECK-NEXT: eors r1, r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: eors r0, r3 +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: vmov.32 q3[1], r2 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: eors r1, r2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: eors r0, r2 ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r2 -; CHECK-NEXT: vbic q2, q2, q0 -; CHECK-NEXT: vand q0, q1, q0 -; CHECK-NEXT: vorr q0, q0, q2 +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.32 q3[3], r0 +; CHECK-NEXT: vbic q0, q2, q3 +; CHECK-NEXT: vand q1, q1, q3 +; CHECK-NEXT: vorr q0, q1, q0 ; CHECK-NEXT: bx lr entry: %i = insertelement <2 x i64> undef, i64 %src2, i32 0 @@ -1052,22 +1066,24 @@ define arm_aapcs_vfpcc <2 x i32> @vcmp_r_eq_v2i32(<2 x i64> %src, i64 %src2, <2 ; CHECK-NEXT: eors r2, r1 ; CHECK-NEXT: eors r3, r0 ; CHECK-NEXT: orrs r2, r3 -; CHECK-NEXT: vmov r3, s3 ; CHECK-NEXT: cset r2, eq ; CHECK-NEXT: tst.w r2, #1 ; CHECK-NEXT: csetm r2, ne -; CHECK-NEXT: eors r1, r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: eors r0, r3 +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: vmov.32 q3[1], r2 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: eors r1, r2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: eors r0, r2 ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r2 -; CHECK-NEXT: vbic q2, q2, q0 -; CHECK-NEXT: vand q0, q1, q0 -; CHECK-NEXT: vorr q0, q0, q2 +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.32 q3[3], r0 +; CHECK-NEXT: vbic q0, q2, q3 +; CHECK-NEXT: vand q1, q1, q3 +; CHECK-NEXT: vorr q0, q1, q0 ; CHECK-NEXT: bx lr entry: %i = insertelement <2 x i64> undef, i64 %src2, i32 0 @@ -1080,76 +1096,84 @@ entry: define arm_aapcs_vfpcc <2 x i32> @vcmp_r_multi_v2i32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) { ; CHECK-LABEL: vcmp_r_multi_v2i32: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov r0, s1 ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vmov r1, s0 -; 
CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r2, s8 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: vbic q0, q2, q0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: subs r1, r0, r2 -; CHECK-NEXT: asr.w r12, r0, #31 +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: orrs r0, r1 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.32 q3[3], r0 +; CHECK-NEXT: vbic q0, q2, q3 +; CHECK-NEXT: vmov lr, s0 +; CHECK-NEXT: subs.w r1, lr, r2 +; CHECK-NEXT: asr.w r12, lr, #31 ; CHECK-NEXT: sbcs.w r1, r12, r2, asr #31 ; CHECK-NEXT: mov.w r1, #0 -; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r2, s10 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #1 ; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: vmov r1, s10 -; CHECK-NEXT: csetm lr, ne -; CHECK-NEXT: asr.w r12, r2, #31 -; CHECK-NEXT: subs r4, r2, r1 -; CHECK-NEXT: sbcs.w r1, r12, r1, asr #31 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: vmov.32 q3[0], r1 +; CHECK-NEXT: vmov.32 q3[1], r1 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: subs r0, r1, r2 +; CHECK-NEXT: asr.w r12, r1, #31 +; CHECK-NEXT: sbcs.w r0, r12, r2, asr #31 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r3, #1 ; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: cmp.w lr, #0 +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.32 q3[3], r0 ; CHECK-NEXT: cset r0, ne -; CHECK-NEXT: vmov q3[2], q3[0], r1, lr ; CHECK-NEXT: tst.w r0, #1 -; CHECK-NEXT: vmov q3[3], q3[1], r1, lr ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: cset r1, ne -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q4[2], q4[0], r1, r0 -; CHECK-NEXT: vmov q4[3], q4[1], r1, r0 +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: vmov.32 q4[0], r0 +; CHECK-NEXT: vmov.32 q4[1], r0 +; CHECK-NEXT: cset r0, ne +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q4[2], r0 +; CHECK-NEXT: vmov.32 q4[3], r0 ; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s6 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: cset r0, ne ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r1, ne -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vand q1, q1, q4 +; CHECK-NEXT: vmov.32 q5[0], r0 +; CHECK-NEXT: vmov.32 q5[1], r0 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cset r0, ne +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q5[2], r0 +; CHECK-NEXT: vmov.32 q5[3], r0 +; CHECK-NEXT: vand q1, q5, q4 ; CHECK-NEXT: vand q1, q3, q1 ; CHECK-NEXT: vbic q0, q0, q1 ; CHECK-NEXT: vand q1, q2, q1 ; CHECK-NEXT: vorr q0, q1, q0 -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: pop {r7, pc} %a4 = icmp eq <2 x i64> %a, zeroinitializer %a5 = select <2 x i1> %a4, <2 x i32> zeroinitializer, <2 x i32> %c %a6 = icmp ne <2 x i32> %b, zeroinitializer diff --git a/llvm/test/CodeGen/Thumb2/mve-vcmpz.ll b/llvm/test/CodeGen/Thumb2/mve-vcmpz.ll index 
759f4dd..e9b7174 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vcmpz.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vcmpz.ll @@ -363,21 +363,23 @@ define arm_aapcs_vfpcc <2 x i64> @vcmp_eqz_v2i64(<2 x i64> %src, <2 x i64> %a, < ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov r0, s1 ; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: vbic q2, q2, q0 -; CHECK-NEXT: vand q0, q1, q0 -; CHECK-NEXT: vorr q0, q0, q2 +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: orrs r0, r1 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.32 q3[3], r0 +; CHECK-NEXT: vbic q0, q2, q3 +; CHECK-NEXT: vand q1, q1, q3 +; CHECK-NEXT: vorr q0, q1, q0 ; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i64> %src, zeroinitializer @@ -390,21 +392,23 @@ define arm_aapcs_vfpcc <2 x i32> @vcmp_eqz_v2i32(<2 x i64> %src, <2 x i32> %a, < ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov r0, s1 ; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: vbic q2, q2, q0 -; CHECK-NEXT: vand q0, q1, q0 -; CHECK-NEXT: vorr q0, q0, q2 +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.32 q3[3], r0 +; CHECK-NEXT: vbic q0, q2, q3 +; CHECK-NEXT: vand q1, q1, q3 +; CHECK-NEXT: vorr q0, q1, q0 ; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i64> %src, zeroinitializer @@ -777,21 +781,23 @@ define arm_aapcs_vfpcc <2 x i64> @vcmp_r_eqz_v2i64(<2 x i64> %src, <2 x i64> %a, ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov r0, s1 ; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: vbic q2, q2, q0 -; CHECK-NEXT: vand q0, q1, q0 -; CHECK-NEXT: vorr q0, q0, q2 +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: orrs r0, r1 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.32 q3[3], r0 +; CHECK-NEXT: vbic q0, q2, q3 +; CHECK-NEXT: vand q1, q1, q3 +; CHECK-NEXT: vorr q0, q1, q0 ; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i64> zeroinitializer, %src @@ -804,21 +810,23 @@ define arm_aapcs_vfpcc <2 x i32> @vcmp_r_eqz_v2i32(<2 x i64> %src, <2 x i32> %a, ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov r0, s1 ; CHECK-NEXT: vmov r1, 
s0 -; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: vbic q2, q2, q0 -; CHECK-NEXT: vand q0, q1, q0 -; CHECK-NEXT: vorr q0, q0, q2 +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.32 q3[3], r0 +; CHECK-NEXT: vbic q0, q2, q3 +; CHECK-NEXT: vand q1, q1, q3 +; CHECK-NEXT: vorr q0, q1, q0 ; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i64> %src, zeroinitializer diff --git a/llvm/test/CodeGen/Thumb2/mve-vcreate.ll b/llvm/test/CodeGen/Thumb2/mve-vcreate.ll index 953dafe..e408bc4 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vcreate.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vcreate.ll @@ -4,8 +4,10 @@ define arm_aapcs_vfpcc <4 x i32> @vcreate_i32(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-LABEL: vcreate_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q0[2], q0[0], r3, r1 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r0 +; CHECK-NEXT: vmov.32 q0[0], r1 +; CHECK-NEXT: vmov.32 q0[1], r0 +; CHECK-NEXT: vmov.32 q0[2], r3 +; CHECK-NEXT: vmov.32 q0[3], r2 ; CHECK-NEXT: bx lr entry: %conv = zext i32 %a to i64 @@ -25,8 +27,10 @@ entry: define arm_aapcs_vfpcc <4 x i32> @insert_0123(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-LABEL: insert_0123: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 +; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: vmov.32 q0[1], r1 +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: vmov.32 q0[3], r3 ; CHECK-NEXT: bx lr entry: %v1 = insertelement <4 x i32> undef, i32 %a, i32 0 @@ -39,8 +43,10 @@ entry: define arm_aapcs_vfpcc <4 x i32> @insert_3210(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-LABEL: insert_3210: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q0[2], q0[0], r1, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r2 +; CHECK-NEXT: vmov.32 q0[0], r3 +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: vmov.32 q0[2], r1 +; CHECK-NEXT: vmov.32 q0[3], r0 ; CHECK-NEXT: bx lr entry: %v1 = insertelement <4 x i32> undef, i32 %a, i32 3 @@ -53,8 +59,10 @@ entry: define arm_aapcs_vfpcc <4 x i32> @insert_0213(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-LABEL: insert_0213: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: vmov.32 q0[2], r1 +; CHECK-NEXT: vmov.32 q0[3], r3 ; CHECK-NEXT: bx lr entry: %v1 = insertelement <4 x i32> undef, i32 %a, i32 0 @@ -67,7 +75,8 @@ entry: define arm_aapcs_vfpcc <4 x i32> @insert_0220(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-LABEL: insert_0220: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 +; CHECK-NEXT: vmov.32 q0[0], r3 +; CHECK-NEXT: vmov.32 q0[2], r2 ; CHECK-NEXT: bx lr entry: %v1 = insertelement <4 x i32> undef, i32 %a, i32 0 @@ -80,8 +89,9 @@ entry: define arm_aapcs_vfpcc <4 x i32> @insert_321(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-LABEL: insert_321: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.32 q0[1], r2 ; CHECK-NEXT: vmov.32 q0[2], r1 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r2 +; CHECK-NEXT: vmov.32 q0[3], r0 ; 
CHECK-NEXT: bx lr entry: %v1 = insertelement <4 x i32> undef, i32 %a, i32 3 @@ -94,7 +104,8 @@ define arm_aapcs_vfpcc <4 x i32> @insert_310(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-LABEL: insert_310: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r1 +; CHECK-NEXT: vmov.32 q0[1], r1 +; CHECK-NEXT: vmov.32 q0[3], r0 ; CHECK-NEXT: bx lr entry: %v1 = insertelement <4 x i32> undef, i32 %a, i32 3 @@ -106,7 +117,8 @@ entry: define arm_aapcs_vfpcc <4 x i32> @insert_320(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-LABEL: insert_320: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q0[2], q0[0], r1, r2 +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmov.32 q0[2], r1 ; CHECK-NEXT: vmov.32 q0[3], r0 ; CHECK-NEXT: bx lr entry: @@ -119,7 +131,8 @@ entry: define arm_aapcs_vfpcc <4 x i32> @insert_31(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-LABEL: insert_31: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q0[3], q0[1], r0, r1 +; CHECK-NEXT: vmov.32 q0[1], r1 +; CHECK-NEXT: vmov.32 q0[3], r0 ; CHECK-NEXT: bx lr entry: %v1 = insertelement <4 x i32> undef, i32 %a, i32 3 @@ -152,8 +165,9 @@ entry: define arm_aapcs_vfpcc <4 x i32> @insert_210(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-LABEL: insert_210: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.32 q0[0], r2 ; CHECK-NEXT: vmov.32 q0[1], r1 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 +; CHECK-NEXT: vmov.32 q0[2], r0 ; CHECK-NEXT: bx lr entry: %v1 = insertelement <4 x i32> undef, i32 %a, i32 2 @@ -165,7 +179,8 @@ entry: define arm_aapcs_vfpcc <4 x i32> @insert_20(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-LABEL: insert_20: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q0[2], q0[0], r0, r1 +; CHECK-NEXT: vmov.32 q0[0], r1 +; CHECK-NEXT: vmov.32 q0[2], r0 ; CHECK-NEXT: bx lr entry: %v1 = insertelement <4 x i32> undef, i32 %a, i32 2 @@ -230,26 +245,28 @@ entry: define hidden <8 x i16> @create_i16(i16 zeroext %a, i16 zeroext %b, i16 zeroext %c, i16 zeroext %d, i16 zeroext %a2, i16 zeroext %b2, i16 zeroext %c2, i16 zeroext %d2) local_unnamed_addr #0 { ; CHECK-LABEL: create_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r5, r6, r7, lr} -; CHECK-NEXT: push {r5, r6, r7, lr} -; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: lsll r2, r7, #16 ; CHECK-NEXT: orr.w r0, r1, r0, lsl #16 -; CHECK-NEXT: orr.w r12, r2, r3 -; CHECK-NEXT: ldr r2, [sp, #24] -; CHECK-NEXT: ldr r3, [sp, #28] -; CHECK-NEXT: orrs r0, r7 ; CHECK-NEXT: lsll r2, r5, #16 -; CHECK-NEXT: orrs r2, r3 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r12 -; CHECK-NEXT: ldrd r1, r2, [sp, #16] -; CHECK-NEXT: orr.w r1, r2, r1, lsl #16 -; CHECK-NEXT: orrs r1, r5 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 +; CHECK-NEXT: ldrd lr, r4, [sp, #16] +; CHECK-NEXT: orr.w r1, r2, r3 +; CHECK-NEXT: ldr.w r12, [sp, #24] +; CHECK-NEXT: orrs r0, r5 +; CHECK-NEXT: vmov.32 q0[0], r1 +; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: vmov.32 q0[1], r0 +; CHECK-NEXT: ldr r0, [sp, #28] +; CHECK-NEXT: lsll r12, r7, #16 +; CHECK-NEXT: orr.w r4, r4, lr, lsl #16 +; CHECK-NEXT: orr.w r0, r0, r12 +; CHECK-NEXT: orrs r7, r4 +; CHECK-NEXT: vmov.32 q0[2], r0 +; CHECK-NEXT: vmov.32 q0[3], r7 ; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: pop {r5, r6, r7, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %conv = zext i16 %a to i64 %shl = shl nuw i64 %conv, 48 @@ -308,59 +325,59 @@ entry: define hidden <16 x i8> @create_i8(i8 zeroext %a1, i8 zeroext %b1, i8 zeroext %c1, i8 zeroext %d1, i8 zeroext %a2, 
i8 zeroext %b2, i8 zeroext %c2, i8 zeroext %d2, i8 zeroext %a3, i8 zeroext %b3, i8 zeroext %c3, i8 zeroext %d3, i8 zeroext %a4, i8 zeroext %b4, i8 zeroext %c4, i8 zeroext %d4) local_unnamed_addr #0 { ; CHECK-LABEL: create_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r11, lr} -; CHECK-NEXT: ldr r4, [sp, #36] +; CHECK-NEXT: .save {r4, r5, r7, r9, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r7, r9, r11, lr} +; CHECK-NEXT: ldr.w r12, [sp, #28] ; CHECK-NEXT: mov.w r11, #0 -; CHECK-NEXT: ldr r6, [sp, #32] +; CHECK-NEXT: ldr r4, [sp, #24] +; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: lsll r12, r11, #16 +; CHECK-NEXT: lsls r1, r1, #16 +; CHECK-NEXT: lsll r4, r5, #24 +; CHECK-NEXT: orr.w r0, r1, r0, lsl #22 +; CHECK-NEXT: orr.w r12, r12, r4 +; CHECK-NEXT: ldr r4, [sp, #32] ; CHECK-NEXT: movs r7, #0 -; CHECK-NEXT: lsll r4, r11, #16 -; CHECK-NEXT: mov lr, r1 -; CHECK-NEXT: lsll r6, r7, #24 -; CHECK-NEXT: mov r12, r3 -; CHECK-NEXT: orr.w r1, r6, r4 -; CHECK-NEXT: ldr r4, [sp, #40] +; CHECK-NEXT: orr.w r0, r0, r2, lsl #8 +; CHECK-NEXT: lsll r4, r7, #8 +; CHECK-NEXT: add r0, r3 +; CHECK-NEXT: orr.w r12, r12, r4 +; CHECK-NEXT: ldr r4, [sp, #36] +; CHECK-NEXT: orrs r0, r5 +; CHECK-NEXT: ldr r2, [sp, #56] +; CHECK-NEXT: orr.w r0, r0, r11 +; CHECK-NEXT: orr.w r4, r4, r12 +; CHECK-NEXT: vmov.32 q0[0], r4 +; CHECK-NEXT: orrs r0, r7 +; CHECK-NEXT: vmov.32 q0[1], r0 +; CHECK-NEXT: ldr r0, [sp, #60] +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: ldr r6, [sp, #68] -; CHECK-NEXT: lsll r4, r3, #8 -; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: orrs r1, r4 -; CHECK-NEXT: ldr r4, [sp, #44] -; CHECK-NEXT: lsll r6, r5, #16 +; CHECK-NEXT: lsll r0, r1, #16 +; CHECK-NEXT: lsll r2, r3, #24 +; CHECK-NEXT: orrs r0, r2 +; CHECK-NEXT: ldr r2, [sp, #64] ; CHECK-NEXT: mov.w r9, #0 -; CHECK-NEXT: orr.w r8, r1, r4 -; CHECK-NEXT: ldr r4, [sp, #64] -; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: lsll r4, r1, #24 -; CHECK-NEXT: orrs r4, r6 -; CHECK-NEXT: ldr r6, [sp, #72] -; CHECK-NEXT: lsll r6, r9, #8 -; CHECK-NEXT: orrs r4, r6 -; CHECK-NEXT: ldr r6, [sp, #76] -; CHECK-NEXT: orrs r4, r6 -; CHECK-NEXT: lsl.w r6, lr, #16 -; CHECK-NEXT: orr.w r0, r6, r0, lsl #22 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r8 +; CHECK-NEXT: lsll r2, r9, #8 +; CHECK-NEXT: orrs r0, r2 +; CHECK-NEXT: ldr r2, [sp, #68] +; CHECK-NEXT: orrs r0, r2 +; CHECK-NEXT: ldr r2, [sp, #40] +; CHECK-NEXT: vmov.32 q0[2], r0 +; CHECK-NEXT: ldr r0, [sp, #44] +; CHECK-NEXT: lsls r0, r0, #16 +; CHECK-NEXT: orr.w r0, r0, r2, lsl #22 +; CHECK-NEXT: ldr r2, [sp, #48] ; CHECK-NEXT: orr.w r0, r0, r2, lsl #8 ; CHECK-NEXT: ldr r2, [sp, #52] -; CHECK-NEXT: add r0, r12 -; CHECK-NEXT: orrs r0, r7 -; CHECK-NEXT: orr.w r0, r0, r11 -; CHECK-NEXT: lsls r2, r2, #16 +; CHECK-NEXT: add r0, r2 ; CHECK-NEXT: orrs r0, r3 -; CHECK-NEXT: ldr r3, [sp, #48] -; CHECK-NEXT: orr.w r2, r2, r3, lsl #22 -; CHECK-NEXT: ldr r3, [sp, #56] -; CHECK-NEXT: orr.w r2, r2, r3, lsl #8 -; CHECK-NEXT: ldr r3, [sp, #60] -; CHECK-NEXT: add r2, r3 -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: orrs r1, r5 -; CHECK-NEXT: orr.w r1, r1, r9 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 +; CHECK-NEXT: orrs r0, r1 +; CHECK-NEXT: orr.w r0, r0, r9 +; CHECK-NEXT: vmov.32 q0[3], r0 ; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r11, pc} +; CHECK-NEXT: pop.w {r4, r5, r7, r9, r11, pc} entry: %conv = zext i8 %a1 to i64 %shl = shl nuw nsw i64 %conv, 54 diff --git a/llvm/test/CodeGen/Thumb2/mve-vcvt.ll 
b/llvm/test/CodeGen/Thumb2/mve-vcvt.ll index 5fa04f8a3..831ca04 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vcvt.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vcvt.ll @@ -44,15 +44,17 @@ define arm_aapcs_vfpcc <4 x i32> @foo_int32_float(<4 x float> %src) { ; CHECK-MVE-LABEL: foo_int32_float: ; CHECK-MVE: @ %bb.0: @ %entry ; CHECK-MVE-NEXT: vcvt.s32.f32 s4, s0 -; CHECK-MVE-NEXT: vcvt.s32.f32 s6, s2 -; CHECK-MVE-NEXT: vcvt.s32.f32 s8, s1 -; CHECK-MVE-NEXT: vcvt.s32.f32 s10, s3 +; CHECK-MVE-NEXT: vcvt.s32.f32 s6, s1 +; CHECK-MVE-NEXT: vcvt.s32.f32 s10, s2 +; CHECK-MVE-NEXT: vcvt.s32.f32 s8, s3 ; CHECK-MVE-NEXT: vmov r0, s4 -; CHECK-MVE-NEXT: vmov r1, s6 -; CHECK-MVE-NEXT: vmov q0[2], q0[0], r1, r0 +; CHECK-MVE-NEXT: vmov.32 q0[0], r0 +; CHECK-MVE-NEXT: vmov r0, s6 +; CHECK-MVE-NEXT: vmov.32 q0[1], r0 +; CHECK-MVE-NEXT: vmov r0, s10 +; CHECK-MVE-NEXT: vmov.32 q0[2], r0 ; CHECK-MVE-NEXT: vmov r0, s8 -; CHECK-MVE-NEXT: vmov r1, s10 -; CHECK-MVE-NEXT: vmov q0[3], q0[1], r1, r0 +; CHECK-MVE-NEXT: vmov.32 q0[3], r0 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: foo_int32_float: @@ -68,15 +70,17 @@ define arm_aapcs_vfpcc <4 x i32> @foo_uint32_float(<4 x float> %src) { ; CHECK-MVE-LABEL: foo_uint32_float: ; CHECK-MVE: @ %bb.0: @ %entry ; CHECK-MVE-NEXT: vcvt.u32.f32 s4, s0 -; CHECK-MVE-NEXT: vcvt.u32.f32 s6, s2 -; CHECK-MVE-NEXT: vcvt.u32.f32 s8, s1 -; CHECK-MVE-NEXT: vcvt.u32.f32 s10, s3 +; CHECK-MVE-NEXT: vcvt.u32.f32 s6, s1 +; CHECK-MVE-NEXT: vcvt.u32.f32 s10, s2 +; CHECK-MVE-NEXT: vcvt.u32.f32 s8, s3 ; CHECK-MVE-NEXT: vmov r0, s4 -; CHECK-MVE-NEXT: vmov r1, s6 -; CHECK-MVE-NEXT: vmov q0[2], q0[0], r1, r0 +; CHECK-MVE-NEXT: vmov.32 q0[0], r0 +; CHECK-MVE-NEXT: vmov r0, s6 +; CHECK-MVE-NEXT: vmov.32 q0[1], r0 +; CHECK-MVE-NEXT: vmov r0, s10 +; CHECK-MVE-NEXT: vmov.32 q0[2], r0 ; CHECK-MVE-NEXT: vmov r0, s8 -; CHECK-MVE-NEXT: vmov r1, s10 -; CHECK-MVE-NEXT: vmov q0[3], q0[1], r1, r0 +; CHECK-MVE-NEXT: vmov.32 q0[3], r0 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: foo_uint32_float: @@ -345,21 +349,24 @@ entry: define arm_aapcs_vfpcc <2 x i64> @foo_int64_float(<2 x double> %src) { ; CHECK-LABEL: foo_int64_float: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vmov r0, r1, d8 ; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r5, r1 -; CHECK-NEXT: vmov r0, r1, d9 +; CHECK-NEXT: vmov r2, r3, d9 +; CHECK-NEXT: vmov.32 q4[0], r0 +; CHECK-NEXT: vmov.32 q4[1], r1 +; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: mov r1, r3 ; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: vmov q0[2], q0[0], r0, r4 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r5 +; CHECK-NEXT: vmov.32 q4[2], r0 +; CHECK-NEXT: vmov.32 q4[3], r1 +; CHECK-NEXT: vmov q0, q4 ; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r7, pc} entry: %out = fptosi <2 x double> %src to <2 x i64> ret <2 x i64> %out @@ -368,21 +375,24 @@ entry: define arm_aapcs_vfpcc <2 x i64> @foo_uint64_float(<2 x double> %src) { ; CHECK-LABEL: foo_uint64_float: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vmov r0, r1, d8 ; CHECK-NEXT: bl __aeabi_d2ulz -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r5, r1 -; CHECK-NEXT: vmov r0, 
r1, d9 +; CHECK-NEXT: vmov r2, r3, d9 +; CHECK-NEXT: vmov.32 q4[0], r0 +; CHECK-NEXT: vmov.32 q4[1], r1 +; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: mov r1, r3 ; CHECK-NEXT: bl __aeabi_d2ulz -; CHECK-NEXT: vmov q0[2], q0[0], r0, r4 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r5 +; CHECK-NEXT: vmov.32 q4[2], r0 +; CHECK-NEXT: vmov.32 q4[3], r1 +; CHECK-NEXT: vmov q0, q4 ; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r7, pc} entry: %out = fptoui <2 x double> %src to <2 x i64> ret <2 x i64> %out diff --git a/llvm/test/CodeGen/Thumb2/mve-vdup.ll b/llvm/test/CodeGen/Thumb2/mve-vdup.ll index 7514722..bce76f0 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vdup.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vdup.ll @@ -38,8 +38,10 @@ entry: define arm_aapcs_vfpcc <2 x i64> @vdup_i64(i64 %src) { ; CHECK-LABEL: vdup_i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q0[2], q0[0], r0, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r1 +; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: vmov.32 q0[1], r1 +; CHECK-NEXT: vmov.32 q0[2], r0 +; CHECK-NEXT: vmov.32 q0[3], r1 ; CHECK-NEXT: bx lr entry: %0 = insertelement <2 x i64> undef, i64 %src, i32 0 diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll index d64a4c9..df2cb43 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll @@ -130,36 +130,40 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_zext(<8 x i16> %x) { ; CHECK-LABEL: add_v8i16_v8i64_zext: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 ; CHECK-NEXT: vmov.i64 q1, #0xffff +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: vmov.u16 r0, q0[1] +; CHECK-NEXT: vmov.32 q2[2], r0 ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov.u16 r2, q0[3] ; CHECK-NEXT: vmov r0, s10 ; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: vmov r2, s11 ; CHECK-NEXT: add r0, r1 ; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: vmov q3[2], q3[0], r2, r1 -; CHECK-NEXT: vmov.u16 r2, q0[5] +; CHECK-NEXT: vmov.32 q3[0], r1 +; CHECK-NEXT: vmov.u16 r1, q0[3] +; CHECK-NEXT: vmov.32 q3[2], r1 ; CHECK-NEXT: vand q3, q3, q1 ; CHECK-NEXT: vmov r1, s12 ; CHECK-NEXT: add r0, r1 ; CHECK-NEXT: vmov r1, s14 ; CHECK-NEXT: add r0, r1 ; CHECK-NEXT: vmov.u16 r1, q0[4] -; CHECK-NEXT: vmov q3[2], q3[0], r2, r1 -; CHECK-NEXT: vmov r2, s11 +; CHECK-NEXT: vmov.32 q3[0], r1 +; CHECK-NEXT: vmov.u16 r1, q0[5] +; CHECK-NEXT: vmov.32 q3[2], r1 ; CHECK-NEXT: vand q3, q3, q1 ; CHECK-NEXT: vmov r1, s12 ; CHECK-NEXT: vmov r3, s14 ; CHECK-NEXT: add r0, r1 ; CHECK-NEXT: vmov r1, s15 ; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov.u16 r3, q0[7] ; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: vmov.u16 r2, q0[6] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.u16 r2, q0[7] +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vand q0, q2, q1 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r2, s1 ; CHECK-NEXT: adds r0, r0, r3 @@ -179,56 +183,62 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_sext(<8 x i16> %x) { ; CHECK-LABEL: add_v8i16_v8i64_sext: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: sxth r1, r1 ; CHECK-NEXT: sxth r0, r0 -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 +; CHECK-NEXT: vmov.32 q1[0], r0 ; CHECK-NEXT: asrs r0, r0, #31 -; CHECK-NEXT: asrs r2, r1, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r0 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov.u16 
r0, q0[1] +; CHECK-NEXT: sxth r0, r0 +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: asrs r1, r0, #31 +; CHECK-NEXT: vmov.32 q1[3], r1 ; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov r1, s5 ; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov.u16 r3, q0[3] -; CHECK-NEXT: adc.w r12, r0, r1, asr #31 +; CHECK-NEXT: adc.w r12, r1, r0, asr #31 ; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: sxth r3, r3 ; CHECK-NEXT: sxth r1, r1 -; CHECK-NEXT: vmov q1[2], q1[0], r3, r1 +; CHECK-NEXT: vmov.32 q1[0], r1 ; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: asrs r0, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r0, r1 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: adds r1, r1, r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds r1, r1, r2 +; CHECK-NEXT: vmov.32 q1[1], r1 +; CHECK-NEXT: vmov.u16 r1, q0[3] +; CHECK-NEXT: sxth r1, r1 +; CHECK-NEXT: vmov.32 q1[2], r1 +; CHECK-NEXT: asrs r3, r1, #31 +; CHECK-NEXT: vmov.32 q1[3], r3 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r3, s5 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r2, r12, r3 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: adds.w r12, r0, r3 +; CHECK-NEXT: adc.w r1, r2, r1, asr #31 ; CHECK-NEXT: vmov.u16 r2, q0[4] -; CHECK-NEXT: adc.w r12, r0, r3, asr #31 -; CHECK-NEXT: vmov.u16 r3, q0[5] -; CHECK-NEXT: sxth r3, r3 ; CHECK-NEXT: sxth r2, r2 -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 +; CHECK-NEXT: vmov.32 q1[0], r2 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r0, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r0, r2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: adds r1, r1, r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds r1, r1, r2 +; CHECK-NEXT: vmov.32 q1[1], r2 +; CHECK-NEXT: vmov.u16 r2, q0[5] +; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: vmov.32 q1[2], r2 +; CHECK-NEXT: asrs r3, r2, #31 +; CHECK-NEXT: vmov.32 q1[3], r3 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r3, s5 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 ; CHECK-NEXT: vmov.u16 r2, q0[6] -; CHECK-NEXT: adc.w r0, r0, r3, asr #31 ; CHECK-NEXT: sxth r2, r2 -; CHECK-NEXT: adds r1, r1, r2 -; CHECK-NEXT: adc.w r2, r0, r2, asr #31 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: sxth r3, r0 -; CHECK-NEXT: adds r0, r1, r3 -; CHECK-NEXT: adc.w r1, r2, r3, asr #31 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 +; CHECK-NEXT: vmov.u16 r2, q0[7] +; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 ; CHECK-NEXT: bx lr entry: %xx = sext <8 x i16> %x to <8 x i64> @@ -385,35 +395,39 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x) { ; CHECK-LABEL: add_v16i8_v16i64_zext: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.u8 r0, q0[0] -; CHECK-NEXT: vmov.u8 r1, q0[1] -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 ; CHECK-NEXT: vmov.i64 q1, #0xff +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: vmov.u8 r0, q0[1] +; CHECK-NEXT: vmov.32 q2[2], r0 ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov.u8 r2, q0[3] ; CHECK-NEXT: vmov r0, s10 ; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: vmov r2, s11 ; CHECK-NEXT: add r0, r1 ; CHECK-NEXT: vmov.u8 r1, q0[2] -; CHECK-NEXT: vmov q3[2], q3[0], r2, r1 -; CHECK-NEXT: vmov.u8 r2, q0[5] +; CHECK-NEXT: vmov.32 q3[0], r1 +; CHECK-NEXT: vmov.u8 r1, q0[3] +; CHECK-NEXT: vmov.32 q3[2], r1 ; CHECK-NEXT: vand q3, q3, q1 ; CHECK-NEXT: vmov r1, 
s12 ; CHECK-NEXT: add r0, r1 ; CHECK-NEXT: vmov r1, s14 ; CHECK-NEXT: add r0, r1 ; CHECK-NEXT: vmov.u8 r1, q0[4] -; CHECK-NEXT: vmov q3[2], q3[0], r2, r1 -; CHECK-NEXT: vmov r2, s11 +; CHECK-NEXT: vmov.32 q3[0], r1 +; CHECK-NEXT: vmov.u8 r1, q0[5] +; CHECK-NEXT: vmov.32 q3[2], r1 ; CHECK-NEXT: vand q3, q3, q1 ; CHECK-NEXT: vmov r1, s12 ; CHECK-NEXT: vmov r3, s14 ; CHECK-NEXT: add r0, r1 ; CHECK-NEXT: vmov r1, s15 ; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov.u8 r3, q0[7] ; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: vmov.u8 r2, q0[6] -; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.u8 r2, q0[7] +; CHECK-NEXT: vmov.32 q2[2], r2 ; CHECK-NEXT: vand q2, q2, q1 ; CHECK-NEXT: vmov r3, s8 ; CHECK-NEXT: vmov r2, s9 @@ -422,10 +436,11 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x) { ; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: vmov r2, s11 ; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov.u8 r3, q0[9] ; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: vmov.u8 r2, q0[8] -; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.u8 r2, q0[9] +; CHECK-NEXT: vmov.32 q2[2], r2 ; CHECK-NEXT: vand q2, q2, q1 ; CHECK-NEXT: vmov r3, s8 ; CHECK-NEXT: vmov r2, s9 @@ -434,10 +449,11 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x) { ; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: vmov r2, s11 ; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov.u8 r3, q0[11] ; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: vmov.u8 r2, q0[10] -; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.u8 r2, q0[11] +; CHECK-NEXT: vmov.32 q2[2], r2 ; CHECK-NEXT: vand q2, q2, q1 ; CHECK-NEXT: vmov r3, s8 ; CHECK-NEXT: vmov r2, s9 @@ -446,10 +462,11 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x) { ; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: vmov r2, s11 ; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov.u8 r3, q0[13] ; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: vmov.u8 r2, q0[12] -; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.u8 r2, q0[13] +; CHECK-NEXT: vmov.32 q2[2], r2 ; CHECK-NEXT: vand q2, q2, q1 ; CHECK-NEXT: vmov r3, s8 ; CHECK-NEXT: vmov r2, s9 @@ -458,11 +475,12 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x) { ; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: vmov r2, s11 ; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov.u8 r3, q0[15] ; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: vmov.u8 r2, q0[14] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.u8 r2, q0[15] +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vand q0, q2, q1 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r2, s1 ; CHECK-NEXT: adds r0, r0, r3 @@ -482,116 +500,130 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x) { ; CHECK-LABEL: add_v16i8_v16i64_sext: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.u8 r0, q0[0] -; CHECK-NEXT: vmov.u8 r1, q0[1] -; CHECK-NEXT: sxtb r1, r1 ; CHECK-NEXT: sxtb r0, r0 -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 +; CHECK-NEXT: vmov.32 q1[0], r0 ; CHECK-NEXT: asrs r0, r0, #31 -; CHECK-NEXT: asrs r2, r1, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r0 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov.u8 r0, q0[1] +; CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: asrs r1, r0, #31 +; CHECK-NEXT: vmov.32 q1[3], r1 ; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov r1, s5 ; CHECK-NEXT: adds r2, r2, r3 -; 
CHECK-NEXT: vmov.u8 r3, q0[3] -; CHECK-NEXT: adc.w r12, r0, r1, asr #31 +; CHECK-NEXT: adc.w r12, r1, r0, asr #31 ; CHECK-NEXT: vmov.u8 r1, q0[2] -; CHECK-NEXT: sxtb r3, r3 ; CHECK-NEXT: sxtb r1, r1 -; CHECK-NEXT: vmov q1[2], q1[0], r3, r1 +; CHECK-NEXT: vmov.32 q1[0], r1 ; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: asrs r0, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r0, r1 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: adds r1, r1, r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds r1, r1, r2 +; CHECK-NEXT: vmov.32 q1[1], r1 +; CHECK-NEXT: vmov.u8 r1, q0[3] +; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: vmov.32 q1[2], r1 +; CHECK-NEXT: asrs r3, r1, #31 +; CHECK-NEXT: vmov.32 q1[3], r3 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r3, s5 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r2, r12, r3 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: adds.w r12, r0, r3 +; CHECK-NEXT: adc.w r1, r2, r1, asr #31 ; CHECK-NEXT: vmov.u8 r2, q0[4] -; CHECK-NEXT: adc.w r12, r0, r3, asr #31 -; CHECK-NEXT: vmov.u8 r3, q0[5] -; CHECK-NEXT: sxtb r3, r3 ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 +; CHECK-NEXT: vmov.32 q1[0], r2 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r0, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r0, r2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: adds r1, r1, r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds r1, r1, r2 +; CHECK-NEXT: vmov.32 q1[1], r2 +; CHECK-NEXT: vmov.u8 r2, q0[5] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: vmov.32 q1[2], r2 +; CHECK-NEXT: asrs r3, r2, #31 +; CHECK-NEXT: vmov.32 q1[3], r3 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r3, s5 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: adds.w r12, r0, r3 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 ; CHECK-NEXT: vmov.u8 r2, q0[6] -; CHECK-NEXT: adc.w r12, r0, r3, asr #31 -; CHECK-NEXT: vmov.u8 r3, q0[7] -; CHECK-NEXT: sxtb r3, r3 ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 +; CHECK-NEXT: vmov.32 q1[0], r2 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r0, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r0, r2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: adds r1, r1, r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds r1, r1, r2 +; CHECK-NEXT: vmov.32 q1[1], r2 +; CHECK-NEXT: vmov.u8 r2, q0[7] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: vmov.32 q1[2], r2 +; CHECK-NEXT: asrs r3, r2, #31 +; CHECK-NEXT: vmov.32 q1[3], r3 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r3, s5 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: adds.w r12, r0, r3 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 ; CHECK-NEXT: vmov.u8 r2, q0[8] -; CHECK-NEXT: adc.w r12, r0, r3, asr #31 -; CHECK-NEXT: vmov.u8 r3, q0[9] -; CHECK-NEXT: sxtb r3, r3 ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 +; CHECK-NEXT: vmov.32 q1[0], r2 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r0, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r0, r2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: adds r1, r1, r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds r1, r1, r2 +; CHECK-NEXT: vmov.32 q1[1], r2 +; CHECK-NEXT: vmov.u8 r2, q0[9] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: vmov.32 q1[2], r2 +; CHECK-NEXT: asrs r3, r2, #31 +; CHECK-NEXT: vmov.32 q1[3], r3 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: 
vmov r3, s5 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: adds.w r12, r0, r3 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 ; CHECK-NEXT: vmov.u8 r2, q0[10] -; CHECK-NEXT: adc.w r12, r0, r3, asr #31 -; CHECK-NEXT: vmov.u8 r3, q0[11] -; CHECK-NEXT: sxtb r3, r3 ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 +; CHECK-NEXT: vmov.32 q1[0], r2 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r0, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r0, r2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: adds r1, r1, r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds r1, r1, r2 +; CHECK-NEXT: vmov.32 q1[1], r2 +; CHECK-NEXT: vmov.u8 r2, q0[11] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: vmov.32 q1[2], r2 +; CHECK-NEXT: asrs r3, r2, #31 +; CHECK-NEXT: vmov.32 q1[3], r3 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r3, s5 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: adds.w r12, r0, r3 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 ; CHECK-NEXT: vmov.u8 r2, q0[12] -; CHECK-NEXT: adc.w r12, r0, r3, asr #31 -; CHECK-NEXT: vmov.u8 r3, q0[13] -; CHECK-NEXT: sxtb r3, r3 ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 +; CHECK-NEXT: vmov.32 q1[0], r2 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r0, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r0, r2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: adds r1, r1, r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds r1, r1, r2 +; CHECK-NEXT: vmov.32 q1[1], r2 +; CHECK-NEXT: vmov.u8 r2, q0[13] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: vmov.32 q1[2], r2 +; CHECK-NEXT: asrs r3, r2, #31 +; CHECK-NEXT: vmov.32 q1[3], r3 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r3, s5 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 ; CHECK-NEXT: vmov.u8 r2, q0[14] -; CHECK-NEXT: adc.w r0, r0, r3, asr #31 ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: adds r1, r1, r2 -; CHECK-NEXT: adc.w r2, r0, r2, asr #31 -; CHECK-NEXT: vmov.u8 r0, q0[15] -; CHECK-NEXT: sxtb r3, r0 -; CHECK-NEXT: adds r0, r1, r3 -; CHECK-NEXT: adc.w r1, r2, r3, asr #31 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 +; CHECK-NEXT: vmov.u8 r2, q0[15] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 ; CHECK-NEXT: bx lr entry: %xx = sext <16 x i8> %x to <16 x i64> @@ -797,36 +829,40 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, i64 %a) { ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: vmov.u16 r2, q0[0] -; CHECK-NEXT: vmov.u16 r3, q0[1] -; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 ; CHECK-NEXT: vmov.i64 q1, #0xffff +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.u16 r2, q0[1] +; CHECK-NEXT: vmov.32 q2[2], r2 ; CHECK-NEXT: vand q2, q2, q1 ; CHECK-NEXT: vmov r2, s10 ; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: add.w r12, r3, r2 +; CHECK-NEXT: add r2, r3 ; CHECK-NEXT: vmov.u16 r3, q0[2] -; CHECK-NEXT: vmov.u16 r2, q0[3] -; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 +; CHECK-NEXT: vmov.32 q3[0], r3 +; CHECK-NEXT: vmov.u16 r3, q0[3] +; CHECK-NEXT: vmov.32 q3[2], r3 ; CHECK-NEXT: vand q3, q3, q1 -; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: add r2, r3 ; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: add r2, r12 -; CHECK-NEXT: add.w r12, r2, r3 +; 
CHECK-NEXT: add r2, r3 ; CHECK-NEXT: vmov.u16 r3, q0[4] -; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 +; CHECK-NEXT: vmov.32 q3[0], r3 +; CHECK-NEXT: vmov.u16 r3, q0[5] +; CHECK-NEXT: vmov.32 q3[2], r3 ; CHECK-NEXT: vand q3, q3, q1 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: vmov r3, s12 ; CHECK-NEXT: vmov lr, s15 -; CHECK-NEXT: add r12, r2 +; CHECK-NEXT: add.w r12, r2, r3 +; CHECK-NEXT: vmov r3, s14 ; CHECK-NEXT: vmov r2, s11 ; CHECK-NEXT: adds.w r4, r12, r3 -; CHECK-NEXT: vmov.u16 r3, q0[7] ; CHECK-NEXT: adc.w r12, r2, lr ; CHECK-NEXT: vmov.u16 r2, q0[6] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.u16 r2, q0[7] +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vand q0, q2, q1 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r2, s1 ; CHECK-NEXT: adds r3, r3, r4 @@ -851,58 +887,64 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, i64 %a) { ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: vmov.u16 r2, q0[0] -; CHECK-NEXT: vmov.u16 r3, q0[1] ; CHECK-NEXT: sxth r2, r2 -; CHECK-NEXT: sxth r3, r3 -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: asr.w r12, r2, #31 -; CHECK-NEXT: asrs r2, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r12 +; CHECK-NEXT: vmov.32 q1[0], r2 +; CHECK-NEXT: asrs r2, r2, #31 +; CHECK-NEXT: vmov.32 q1[1], r2 +; CHECK-NEXT: vmov.u16 r2, q0[1] +; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: vmov.32 q1[2], r2 +; CHECK-NEXT: asrs r3, r2, #31 +; CHECK-NEXT: vmov.32 q1[3], r3 ; CHECK-NEXT: vmov lr, s6 -; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: vmov r12, s5 -; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: adds.w lr, lr, r3 +; CHECK-NEXT: adc.w r12, r12, r2, asr #31 ; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: adc.w r12, r12, r3, asr #31 -; CHECK-NEXT: vmov.u16 r3, q0[3] -; CHECK-NEXT: sxth r3, r3 ; CHECK-NEXT: sxth r2, r2 -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 +; CHECK-NEXT: vmov.32 q1[0], r2 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r4, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r4, r2 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: adds.w r4, r4, lr -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: adds.w lr, r4, r2 -; CHECK-NEXT: vmov.u16 r4, q0[4] -; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: sxth r4, r4 +; CHECK-NEXT: vmov.32 q1[1], r2 +; CHECK-NEXT: vmov.u16 r2, q0[3] ; CHECK-NEXT: sxth r2, r2 -; CHECK-NEXT: adc.w r12, r12, r3, asr #31 -; CHECK-NEXT: vmov q1[2], q1[0], r2, r4 -; CHECK-NEXT: asrs r4, r4, #31 +; CHECK-NEXT: vmov.32 q1[2], r2 ; CHECK-NEXT: asrs r3, r2, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r4 +; CHECK-NEXT: vmov.32 q1[3], r3 ; CHECK-NEXT: vmov r4, s4 ; CHECK-NEXT: vmov r3, s5 ; CHECK-NEXT: adds.w r4, r4, lr ; CHECK-NEXT: adc.w r12, r12, r3 ; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: adds.w lr, r4, r3 +; CHECK-NEXT: vmov.u16 r4, q0[4] +; CHECK-NEXT: sxth r4, r4 +; CHECK-NEXT: adc.w r12, r12, r2, asr #31 +; CHECK-NEXT: vmov.32 q1[0], r4 +; CHECK-NEXT: asrs r4, r4, #31 +; CHECK-NEXT: vmov.32 q1[1], r4 +; CHECK-NEXT: vmov.u16 r4, q0[5] +; CHECK-NEXT: sxth r4, r4 +; CHECK-NEXT: vmov.32 q1[2], r4 +; CHECK-NEXT: asrs r2, r4, #31 +; CHECK-NEXT: vmov.32 q1[3], r2 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: adds.w r3, r3, lr +; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc.w r3, r12, r4, asr #31 
; CHECK-NEXT: vmov.u16 r4, q0[6] -; CHECK-NEXT: adc.w r2, r12, r2, asr #31 ; CHECK-NEXT: sxth r4, r4 -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: adc.w r2, r2, r4, asr #31 +; CHECK-NEXT: adds r2, r2, r4 +; CHECK-NEXT: adc.w r3, r3, r4, asr #31 ; CHECK-NEXT: vmov.u16 r4, q0[7] ; CHECK-NEXT: sxth r4, r4 -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: adc.w r2, r2, r4, asr #31 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: adds r2, r2, r4 +; CHECK-NEXT: adc.w r3, r3, r4, asr #31 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: pop {r4, pc} entry: %xx = sext <8 x i16> %x to <8 x i64> @@ -1077,35 +1119,39 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, i64 %a) { ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: vmov.u8 r2, q0[0] -; CHECK-NEXT: vmov.u8 r3, q0[1] -; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 ; CHECK-NEXT: vmov.i64 q1, #0xff +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.u8 r2, q0[1] +; CHECK-NEXT: vmov.32 q2[2], r2 ; CHECK-NEXT: vand q2, q2, q1 ; CHECK-NEXT: vmov r2, s10 ; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: add.w r12, r3, r2 +; CHECK-NEXT: add r2, r3 ; CHECK-NEXT: vmov.u8 r3, q0[2] -; CHECK-NEXT: vmov.u8 r2, q0[3] -; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 +; CHECK-NEXT: vmov.32 q3[0], r3 +; CHECK-NEXT: vmov.u8 r3, q0[3] +; CHECK-NEXT: vmov.32 q3[2], r3 ; CHECK-NEXT: vand q3, q3, q1 -; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: add r2, r3 ; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: add r2, r12 -; CHECK-NEXT: add.w r12, r2, r3 +; CHECK-NEXT: add r2, r3 ; CHECK-NEXT: vmov.u8 r3, q0[4] -; CHECK-NEXT: vmov.u8 r2, q0[5] -; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 +; CHECK-NEXT: vmov.32 q3[0], r3 +; CHECK-NEXT: vmov.u8 r3, q0[5] +; CHECK-NEXT: vmov.32 q3[2], r3 ; CHECK-NEXT: vand q3, q3, q1 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: vmov r3, s12 ; CHECK-NEXT: vmov lr, s15 -; CHECK-NEXT: add r12, r2 +; CHECK-NEXT: add.w r12, r2, r3 +; CHECK-NEXT: vmov r3, s14 ; CHECK-NEXT: vmov r2, s11 ; CHECK-NEXT: adds.w r4, r12, r3 -; CHECK-NEXT: vmov.u8 r3, q0[7] ; CHECK-NEXT: adc.w r12, r2, lr ; CHECK-NEXT: vmov.u8 r2, q0[6] -; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.u8 r2, q0[7] +; CHECK-NEXT: vmov.32 q2[2], r2 ; CHECK-NEXT: vand q2, q2, q1 ; CHECK-NEXT: vmov r3, s8 ; CHECK-NEXT: vmov r2, s9 @@ -1114,55 +1160,59 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, i64 %a) { ; CHECK-NEXT: adc.w r12, r12, r2 ; CHECK-NEXT: vmov r2, s10 ; CHECK-NEXT: adds.w lr, r3, r2 -; CHECK-NEXT: vmov.u8 r2, q0[9] ; CHECK-NEXT: adc.w r3, r12, r4 ; CHECK-NEXT: vmov.u8 r4, q0[8] -; CHECK-NEXT: vmov q2[2], q2[0], r2, r4 +; CHECK-NEXT: vmov.32 q2[0], r4 +; CHECK-NEXT: vmov.u8 r4, q0[9] +; CHECK-NEXT: vmov.32 q2[2], r4 ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds.w r4, r4, lr -; CHECK-NEXT: adc.w r12, r3, r2 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmov r4, s9 +; CHECK-NEXT: adds.w r12, lr, r2 ; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: vmov r3, s11 -; CHECK-NEXT: adds.w lr, r4, r2 +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: vmov r4, s11 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: adcs r3, r4 ; CHECK-NEXT: vmov.u8 r4, q0[10] -; CHECK-NEXT: vmov.u8 r2, q0[11] -; CHECK-NEXT: adc.w r3, r3, r12 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r4 +; CHECK-NEXT: vmov.32 q2[0], r4 +; CHECK-NEXT: vmov.u8 r4, q0[11] +; CHECK-NEXT: vmov.32 q2[2], r4 ; CHECK-NEXT: vand q2, 
q2, q1 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds.w r4, r4, lr -; CHECK-NEXT: adc.w r12, r3, r2 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmov r4, s9 +; CHECK-NEXT: adds.w r12, r12, r2 ; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: vmov r3, s11 -; CHECK-NEXT: adds.w lr, r4, r2 +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: vmov r4, s11 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: adcs r3, r4 ; CHECK-NEXT: vmov.u8 r4, q0[12] -; CHECK-NEXT: vmov.u8 r2, q0[13] -; CHECK-NEXT: adc.w r3, r3, r12 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r4 +; CHECK-NEXT: vmov.32 q2[0], r4 +; CHECK-NEXT: vmov.u8 r4, q0[13] +; CHECK-NEXT: vmov.32 q2[2], r4 ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds.w r4, r4, lr -; CHECK-NEXT: adc.w r12, r3, r2 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmov r4, s9 +; CHECK-NEXT: adds.w r12, r12, r2 ; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: vmov r3, s11 -; CHECK-NEXT: adds.w lr, r4, r2 +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: vmov r4, s11 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: adcs r3, r4 ; CHECK-NEXT: vmov.u8 r4, q0[14] -; CHECK-NEXT: vmov.u8 r2, q0[15] -; CHECK-NEXT: adc.w r3, r3, r12 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r4 -; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds.w r4, r4, lr -; CHECK-NEXT: adc.w r12, r3, r2 +; CHECK-NEXT: vmov.32 q2[0], r4 +; CHECK-NEXT: vmov.u8 r4, q0[15] +; CHECK-NEXT: vmov.32 q2[2], r4 +; CHECK-NEXT: vand q0, q2, q1 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r4, s1 +; CHECK-NEXT: adds.w r12, r12, r2 ; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: vmov r4, s3 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adcs r3, r4 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: pop {r4, pc} @@ -1179,118 +1229,132 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, i64 %a) { ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: vmov.u8 r2, q0[0] -; CHECK-NEXT: vmov.u8 r3, q0[1] ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: asr.w r12, r2, #31 -; CHECK-NEXT: asrs r2, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r12 +; CHECK-NEXT: vmov.32 q1[0], r2 +; CHECK-NEXT: asrs r2, r2, #31 +; CHECK-NEXT: vmov.32 q1[1], r2 +; CHECK-NEXT: vmov.u8 r2, q0[1] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: vmov.32 q1[2], r2 +; CHECK-NEXT: asrs r3, r2, #31 +; CHECK-NEXT: vmov.32 q1[3], r3 ; CHECK-NEXT: vmov lr, s6 -; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: vmov r12, s5 -; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: adds.w lr, lr, r3 +; CHECK-NEXT: adc.w r12, r12, r2, asr #31 ; CHECK-NEXT: vmov.u8 r2, q0[2] -; CHECK-NEXT: adc.w r12, r12, r3, asr #31 -; CHECK-NEXT: vmov.u8 r3, q0[3] -; CHECK-NEXT: sxtb r3, r3 ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 +; CHECK-NEXT: vmov.32 q1[0], r2 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r4, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r4, r2 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: adds.w r4, r4, lr -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: adds.w lr, r4, r2 -; CHECK-NEXT: vmov.u8 r4, q0[4] -; CHECK-NEXT: vmov.u8 r2, q0[5] -; CHECK-NEXT: sxtb r4, r4 +; CHECK-NEXT: vmov.32 q1[1], r2 +; CHECK-NEXT: vmov.u8 r2, q0[3] ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: 
adc.w r12, r12, r3, asr #31 -; CHECK-NEXT: vmov q1[2], q1[0], r2, r4 -; CHECK-NEXT: asrs r4, r4, #31 +; CHECK-NEXT: vmov.32 q1[2], r2 ; CHECK-NEXT: asrs r3, r2, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r4 +; CHECK-NEXT: vmov.32 q1[3], r3 ; CHECK-NEXT: vmov r4, s4 ; CHECK-NEXT: vmov r3, s5 ; CHECK-NEXT: adds.w r4, r4, lr ; CHECK-NEXT: adc.w r12, r12, r3 ; CHECK-NEXT: vmov r3, s6 ; CHECK-NEXT: adds.w lr, r4, r3 -; CHECK-NEXT: vmov.u8 r4, q0[6] +; CHECK-NEXT: vmov.u8 r4, q0[4] +; CHECK-NEXT: sxtb r4, r4 ; CHECK-NEXT: adc.w r12, r12, r2, asr #31 -; CHECK-NEXT: vmov.u8 r2, q0[7] -; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: vmov.32 q1[0], r4 +; CHECK-NEXT: asrs r4, r4, #31 +; CHECK-NEXT: vmov.32 q1[1], r4 +; CHECK-NEXT: vmov.u8 r4, q0[5] ; CHECK-NEXT: sxtb r4, r4 -; CHECK-NEXT: vmov q1[2], q1[0], r2, r4 +; CHECK-NEXT: vmov.32 q1[2], r4 +; CHECK-NEXT: asrs r2, r4, #31 +; CHECK-NEXT: vmov.32 q1[3], r2 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: adds.w r3, r3, lr +; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: adds.w lr, r3, r2 +; CHECK-NEXT: adc.w r12, r12, r4, asr #31 +; CHECK-NEXT: vmov.u8 r4, q0[6] +; CHECK-NEXT: sxtb r4, r4 +; CHECK-NEXT: vmov.32 q1[0], r4 ; CHECK-NEXT: asrs r4, r4, #31 -; CHECK-NEXT: asrs r3, r2, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r4 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: adds.w r4, r4, lr -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: adds.w lr, r4, r3 +; CHECK-NEXT: vmov.32 q1[1], r4 +; CHECK-NEXT: vmov.u8 r4, q0[7] +; CHECK-NEXT: sxtb r4, r4 +; CHECK-NEXT: vmov.32 q1[2], r4 +; CHECK-NEXT: asrs r2, r4, #31 +; CHECK-NEXT: vmov.32 q1[3], r2 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: adds.w r3, r3, lr +; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: adds.w lr, r3, r2 +; CHECK-NEXT: adc.w r12, r12, r4, asr #31 ; CHECK-NEXT: vmov.u8 r4, q0[8] -; CHECK-NEXT: adc.w r12, r12, r2, asr #31 -; CHECK-NEXT: vmov.u8 r2, q0[9] -; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r4, r4 -; CHECK-NEXT: vmov q1[2], q1[0], r2, r4 +; CHECK-NEXT: vmov.32 q1[0], r4 ; CHECK-NEXT: asrs r4, r4, #31 -; CHECK-NEXT: asrs r3, r2, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r4 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: adds.w r4, r4, lr -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: adds.w lr, r4, r3 +; CHECK-NEXT: vmov.32 q1[1], r4 +; CHECK-NEXT: vmov.u8 r4, q0[9] +; CHECK-NEXT: sxtb r4, r4 +; CHECK-NEXT: vmov.32 q1[2], r4 +; CHECK-NEXT: asrs r2, r4, #31 +; CHECK-NEXT: vmov.32 q1[3], r2 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: adds.w r3, r3, lr +; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: adds.w lr, r3, r2 +; CHECK-NEXT: adc.w r12, r12, r4, asr #31 ; CHECK-NEXT: vmov.u8 r4, q0[10] -; CHECK-NEXT: adc.w r12, r12, r2, asr #31 -; CHECK-NEXT: vmov.u8 r2, q0[11] -; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r4, r4 -; CHECK-NEXT: vmov q1[2], q1[0], r2, r4 +; CHECK-NEXT: vmov.32 q1[0], r4 ; CHECK-NEXT: asrs r4, r4, #31 -; CHECK-NEXT: asrs r3, r2, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r4 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: adds.w r4, r4, lr -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: adds.w lr, r4, r3 +; CHECK-NEXT: vmov.32 q1[1], r4 +; CHECK-NEXT: vmov.u8 r4, q0[11] +; CHECK-NEXT: sxtb r4, r4 +; CHECK-NEXT: vmov.32 q1[2], r4 +; CHECK-NEXT: asrs r2, r4, #31 +; CHECK-NEXT: 
vmov.32 q1[3], r2 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: adds.w r3, r3, lr +; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: adds.w lr, r3, r2 +; CHECK-NEXT: adc.w r12, r12, r4, asr #31 ; CHECK-NEXT: vmov.u8 r4, q0[12] -; CHECK-NEXT: adc.w r12, r12, r2, asr #31 -; CHECK-NEXT: vmov.u8 r2, q0[13] -; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r4, r4 -; CHECK-NEXT: vmov q1[2], q1[0], r2, r4 +; CHECK-NEXT: vmov.32 q1[0], r4 ; CHECK-NEXT: asrs r4, r4, #31 -; CHECK-NEXT: asrs r3, r2, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r4 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: adds.w r4, r4, lr -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: vmov.32 q1[1], r4 +; CHECK-NEXT: vmov.u8 r4, q0[13] +; CHECK-NEXT: sxtb r4, r4 +; CHECK-NEXT: vmov.32 q1[2], r4 +; CHECK-NEXT: asrs r2, r4, #31 +; CHECK-NEXT: vmov.32 q1[3], r2 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: adds.w r3, r3, lr +; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc.w r3, r12, r4, asr #31 ; CHECK-NEXT: vmov.u8 r4, q0[14] -; CHECK-NEXT: adc.w r2, r12, r2, asr #31 ; CHECK-NEXT: sxtb r4, r4 -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: adc.w r2, r2, r4, asr #31 +; CHECK-NEXT: adds r2, r2, r4 +; CHECK-NEXT: adc.w r3, r3, r4, asr #31 ; CHECK-NEXT: vmov.u8 r4, q0[15] ; CHECK-NEXT: sxtb r4, r4 -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: adc.w r2, r2, r4, asr #31 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: adds r2, r2, r4 +; CHECK-NEXT: adc.w r3, r3, r4, asr #31 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: pop {r4, pc} entry: %xx = sext <16 x i8> %x to <16 x i64> diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll index 628e0d6..e59fb0b 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll @@ -47,19 +47,21 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %b) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov r0, s4 ; CHECK-NEXT: vmov.i64 q2, #0xffffffff -; CHECK-NEXT: vmov r1, s6 ; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov.32 q2[3], r0 +; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r1, s3 @@ -79,24 +81,28 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %b) { ; CHECK-LABEL: add_v2i32_v2i64_sext: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 +; CHECK-NEXT: vmov.32 q2[0], r0 ; CHECK-NEXT: asrs r0, r0, #31 -; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: asrs r0, r0, #31 +; CHECK-NEXT: 
vmov.32 q2[3], r0 ; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s6 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: vmov.32 q0[1], r0 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q0[2], r0 +; CHECK-NEXT: vmov.32 q0[3], r0 +; CHECK-NEXT: vand q0, q2, q0 ; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r1, s3 @@ -196,23 +202,28 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_zext(<8 x i16> %x, <8 x i16> %b) { ; CHECK-NEXT: vcmp.i16 eq, q1, zr ; CHECK-NEXT: vpsel q2, q3, q2 ; CHECK-NEXT: vmov.u16 r0, q2[0] -; CHECK-NEXT: vmov.u16 r1, q2[2] -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 +; CHECK-NEXT: vmov.32 q1[0], r0 ; CHECK-NEXT: vmov.u16 r0, q2[1] -; CHECK-NEXT: vmov.u16 r1, q2[3] -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov.u16 r0, q2[2] +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov.u16 r0, q2[3] +; CHECK-NEXT: vmov.32 q1[3], r0 ; CHECK-NEXT: vcmp.i32 ne, q1, zr ; CHECK-NEXT: vmov.i64 q1, #0xffff ; CHECK-NEXT: vmrs r0, p0 ; CHECK-NEXT: and r1, r0, #1 -; CHECK-NEXT: ubfx r2, r0, #4, #1 ; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r1 -; CHECK-NEXT: vmov q3[3], q3[1], r2, r1 +; CHECK-NEXT: vmov.32 q3[0], r1 +; CHECK-NEXT: vmov.32 q3[1], r1 +; CHECK-NEXT: ubfx r1, r0, #4, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: vmov.32 q3[2], r1 +; CHECK-NEXT: vmov.32 q3[3], r1 ; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: vmov.u16 r2, q0[1] -; CHECK-NEXT: vmov q4[2], q4[0], r2, r1 +; CHECK-NEXT: vmov.32 q4[0], r1 +; CHECK-NEXT: vmov.u16 r1, q0[1] +; CHECK-NEXT: vmov.32 q4[2], r1 ; CHECK-NEXT: vand q4, q4, q1 ; CHECK-NEXT: vand q3, q4, q3 ; CHECK-NEXT: vmov r1, s15 @@ -222,64 +233,75 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_zext(<8 x i16> %x, <8 x i16> %b) { ; CHECK-NEXT: vmov r2, s14 ; CHECK-NEXT: add r2, r3 ; CHECK-NEXT: ubfx r3, r0, #8, #1 -; CHECK-NEXT: ubfx r0, r0, #12, #1 ; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: ubfx r0, r0, #12, #1 +; CHECK-NEXT: vmov.32 q3[0], r3 ; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: vmov q3[2], q3[0], r0, r3 -; CHECK-NEXT: vmov q3[3], q3[1], r0, r3 +; CHECK-NEXT: vmov.32 q3[1], r3 +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.32 q3[3], r0 ; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.u16 r3, q0[3] -; CHECK-NEXT: vmov q4[2], q4[0], r3, r0 +; CHECK-NEXT: vmov.32 q4[0], r0 +; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: vmov.32 q4[2], r0 ; CHECK-NEXT: vand q4, q4, q1 ; CHECK-NEXT: vand q3, q4, q3 ; CHECK-NEXT: vmov r3, s12 ; CHECK-NEXT: vmov r0, s13 ; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: adcs r0, r1 -; CHECK-NEXT: vmov r1, s15 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov.u16 r3, q2[6] -; CHECK-NEXT: adc.w r12, r0, r1 -; CHECK-NEXT: vmov.u16 r1, q2[4] -; CHECK-NEXT: vmov q3[2], q3[0], r3, r1 -; CHECK-NEXT: vmov.u16 r1, q2[5] -; CHECK-NEXT: vmov.u16 r3, q2[7] -; CHECK-NEXT: vmov q3[3], q3[1], r3, r1 +; CHECK-NEXT: vmov r3, s15 +; CHECK-NEXT: adcs r1, r0 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: adds.w r12, r2, r0 +; CHECK-NEXT: vmov.u16 r2, q2[4] 
+; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: vmov.u16 r2, q2[5] +; CHECK-NEXT: vmov.32 q3[1], r2 +; CHECK-NEXT: vmov.u16 r2, q2[6] +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vmov.u16 r2, q2[7] +; CHECK-NEXT: vmov.32 q3[3], r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vcmp.i32 ne, q3, zr -; CHECK-NEXT: vmrs r1, p0 -; CHECK-NEXT: and r3, r1, #1 -; CHECK-NEXT: ubfx r0, r1, #4, #1 +; CHECK-NEXT: vmrs r2, p0 +; CHECK-NEXT: and r3, r2, #1 ; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r0, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r0, r3 -; CHECK-NEXT: vmov.u16 r0, q0[4] +; CHECK-NEXT: vmov.32 q2[0], r3 +; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: ubfx r3, r2, #4, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: vmov.32 q2[2], r3 +; CHECK-NEXT: vmov.32 q2[3], r3 +; CHECK-NEXT: vmov.u16 r3, q0[4] +; CHECK-NEXT: vmov.32 q3[0], r3 ; CHECK-NEXT: vmov.u16 r3, q0[5] -; CHECK-NEXT: vmov q3[2], q3[0], r3, r0 +; CHECK-NEXT: vmov.32 q3[2], r3 ; CHECK-NEXT: vand q3, q3, q1 ; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, s11 -; CHECK-NEXT: adc.w r12, r12, r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov r3, s9 +; CHECK-NEXT: adds.w r12, r12, r0 ; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r2, r12, r3 -; CHECK-NEXT: ubfx r3, r1, #8, #1 -; CHECK-NEXT: ubfx r1, r1, #12, #1 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r3, s11 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: ubfx r3, r2, #8, #1 ; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r1, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r3 -; CHECK-NEXT: vmov.u16 r1, q0[6] -; CHECK-NEXT: vmov.u16 r3, q0[7] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r1 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: ubfx r2, r2, #12, #1 +; CHECK-NEXT: vmov.32 q2[0], r3 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.32 q2[3], r2 +; CHECK-NEXT: vmov.u16 r2, q0[6] +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: vmov.u16 r2, q0[7] +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vand q0, q3, q1 ; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov r2, s1 ; CHECK-NEXT: adds r0, r0, r3 ; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: adcs r1, r2 @@ -304,48 +326,58 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_sext(<8 x i16> %x, <8 x i16> %b) { ; CHECK-NEXT: vcmp.i16 eq, q1, zr ; CHECK-NEXT: vpsel q1, q3, q2 ; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.u16 r1, q1[2] -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 +; CHECK-NEXT: vmov.32 q2[0], r0 ; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.u16 r1, q1[3] -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov.u16 r0, q1[2] +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov.u16 r0, q1[3] +; CHECK-NEXT: vmov.32 q2[3], r0 ; CHECK-NEXT: vcmp.i32 ne, q2, zr ; CHECK-NEXT: vmrs r0, p0 ; CHECK-NEXT: and r1, r0, #1 -; CHECK-NEXT: ubfx r2, r0, #4, #1 ; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r1 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r1 +; CHECK-NEXT: vmov.32 q2[0], r1 +; CHECK-NEXT: vmov.32 q2[1], r1 +; CHECK-NEXT: ubfx r1, r0, #4, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: vmov.32 q2[2], r1 +; CHECK-NEXT: vmov.32 q2[3], r1 ; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: vmov.u16 r2, q0[1] 
; CHECK-NEXT: sxth r1, r1 -; CHECK-NEXT: sxth r2, r2 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r1 +; CHECK-NEXT: vmov.32 q3[0], r1 ; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: vmov q3[3], q3[1], r2, r1 +; CHECK-NEXT: vmov.32 q3[1], r1 +; CHECK-NEXT: vmov.u16 r1, q0[1] +; CHECK-NEXT: sxth r1, r1 +; CHECK-NEXT: vmov.32 q3[2], r1 +; CHECK-NEXT: asrs r1, r1, #31 +; CHECK-NEXT: vmov.32 q3[3], r1 ; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: vmov r1, s10 +; CHECK-NEXT: vmov r2, s8 ; CHECK-NEXT: vmov r12, s11 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds r1, r1, r3 +; CHECK-NEXT: vmov r3, s9 +; CHECK-NEXT: adds r1, r1, r2 +; CHECK-NEXT: adc.w r2, r3, r12 ; CHECK-NEXT: ubfx r3, r0, #8, #1 +; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: ubfx r0, r0, #12, #1 -; CHECK-NEXT: rsb.w r3, r3, #0 -; CHECK-NEXT: rsb.w r0, r0, #0 -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: vmov q2[2], q2[0], r0, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r0, r3 +; CHECK-NEXT: vmov.32 q2[0], r3 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov.32 q2[3], r0 ; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.u16 r3, q0[3] ; CHECK-NEXT: sxth r0, r0 -; CHECK-NEXT: sxth r3, r3 -; CHECK-NEXT: vmov q3[2], q3[0], r3, r0 +; CHECK-NEXT: vmov.32 q3[0], r0 ; CHECK-NEXT: asrs r0, r0, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q3[3], q3[1], r3, r0 +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: sxth r0, r0 +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: asrs r0, r0, #31 +; CHECK-NEXT: vmov.32 q3[3], r0 ; CHECK-NEXT: vand q2, q3, q2 ; CHECK-NEXT: vmov r3, s8 ; CHECK-NEXT: vmov r0, s9 @@ -356,51 +388,61 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_sext(<8 x i16> %x, <8 x i16> %b) { ; CHECK-NEXT: adds.w r12, r1, r0 ; CHECK-NEXT: adc.w r1, r2, r3 ; CHECK-NEXT: vmov.u16 r2, q1[4] -; CHECK-NEXT: vmov.u16 r3, q1[6] -; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 +; CHECK-NEXT: vmov.32 q2[0], r2 ; CHECK-NEXT: vmov.u16 r2, q1[5] -; CHECK-NEXT: vmov.u16 r3, q1[7] -; CHECK-NEXT: vmov q2[3], q2[1], r3, r2 +; CHECK-NEXT: vmov.32 q2[1], r2 +; CHECK-NEXT: vmov.u16 r2, q1[6] +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.u16 r2, q1[7] +; CHECK-NEXT: vmov.32 q2[3], r2 ; CHECK-NEXT: vcmp.i32 ne, q2, zr ; CHECK-NEXT: vmrs r2, p0 ; CHECK-NEXT: and r3, r2, #1 -; CHECK-NEXT: ubfx r0, r2, #4, #1 ; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: vmov q1[2], q1[0], r0, r3 -; CHECK-NEXT: vmov q1[3], q1[1], r0, r3 -; CHECK-NEXT: vmov.u16 r0, q0[4] +; CHECK-NEXT: vmov.32 q1[0], r3 +; CHECK-NEXT: vmov.32 q1[1], r3 +; CHECK-NEXT: ubfx r3, r2, #4, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: vmov.32 q1[2], r3 +; CHECK-NEXT: vmov.32 q1[3], r3 +; CHECK-NEXT: vmov.u16 r3, q0[4] +; CHECK-NEXT: sxth r3, r3 +; CHECK-NEXT: vmov.32 q2[0], r3 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov.32 q2[1], r3 ; CHECK-NEXT: vmov.u16 r3, q0[5] -; CHECK-NEXT: sxth r0, r0 ; CHECK-NEXT: sxth r3, r3 -; CHECK-NEXT: vmov q2[2], q2[0], r3, r0 -; CHECK-NEXT: asrs r0, r0, #31 +; CHECK-NEXT: vmov.32 q2[2], r3 ; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q2[3], q2[1], r3, r0 +; CHECK-NEXT: vmov.32 q2[3], r3 ; CHECK-NEXT: vand q1, q2, q1 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: adds.w r3, r3, r12 -; CHECK-NEXT: adc.w r12, r1, r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r3, s5 +; CHECK-NEXT: adds.w r12, r12, r0 ; CHECK-NEXT: vmov r0, 
s6 -; CHECK-NEXT: vmov r1, s7 -; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r3, s7 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: ubfx r3, r2, #8, #1 +; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: ubfx r2, r2, #12, #1 -; CHECK-NEXT: rsb.w r3, r3, #0 -; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: adc.w r1, r1, r12 -; CHECK-NEXT: vmov q1[2], q1[0], r2, r3 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r3 +; CHECK-NEXT: vmov.32 q1[0], r3 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: vmov.32 q1[1], r3 +; CHECK-NEXT: vmov.32 q1[2], r2 +; CHECK-NEXT: vmov.32 q1[3], r2 ; CHECK-NEXT: vmov.u16 r2, q0[6] -; CHECK-NEXT: vmov.u16 r3, q0[7] ; CHECK-NEXT: sxth r2, r2 -; CHECK-NEXT: sxth r3, r3 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov.32 q2[0], r2 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov.32 q2[1], r2 +; CHECK-NEXT: vmov.u16 r2, q0[7] +; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: asrs r2, r2, #31 +; CHECK-NEXT: vmov.32 q2[3], r2 +; CHECK-NEXT: vand q0, q2, q1 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r2, s1 ; CHECK-NEXT: adds r0, r0, r3 @@ -425,18 +467,20 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %b) { ; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s6 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.32 q3[3], r0 +; CHECK-NEXT: vand q0, q0, q3 ; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: vmov r2, s1 @@ -456,28 +500,32 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %b) { ; CHECK-LABEL: add_v2i16_v2i64_sext: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.i32 q2, #0xffff -; CHECK-NEXT: vand q1, q1, q2 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vand q2, q1, q2 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov r0, s10 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov.32 q1[3], r0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: sxth r0, r0 -; CHECK-NEXT: sxth r1, r1 -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 +; CHECK-NEXT: vmov.32 q2[0], r0 ; CHECK-NEXT: asrs r0, r0, #31 -; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: sxth r0, r0 +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: asrs r0, r0, #31 +; 
CHECK-NEXT: vmov.32 q2[3], r0 +; CHECK-NEXT: vand q0, q2, q1 ; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r1, s3 @@ -662,23 +710,28 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) { ; CHECK-NEXT: vcmp.i16 ne, q1, zr ; CHECK-NEXT: vpsel q5, q3, q2 ; CHECK-NEXT: vmov.u16 r0, q5[0] -; CHECK-NEXT: vmov.u16 r1, q5[2] -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 +; CHECK-NEXT: vmov.32 q1[0], r0 ; CHECK-NEXT: vmov.u16 r0, q5[1] -; CHECK-NEXT: vmov.u16 r1, q5[3] -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov.u16 r0, q5[2] +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov.u16 r0, q5[3] +; CHECK-NEXT: vmov.32 q1[3], r0 ; CHECK-NEXT: vcmp.i32 ne, q1, zr ; CHECK-NEXT: vmov.i64 q1, #0xff ; CHECK-NEXT: vmrs r0, p0 ; CHECK-NEXT: and r1, r0, #1 -; CHECK-NEXT: ubfx r2, r0, #4, #1 ; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov q6[2], q6[0], r2, r1 -; CHECK-NEXT: vmov q6[3], q6[1], r2, r1 +; CHECK-NEXT: vmov.32 q6[0], r1 +; CHECK-NEXT: vmov.32 q6[1], r1 +; CHECK-NEXT: ubfx r1, r0, #4, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: vmov.32 q6[2], r1 +; CHECK-NEXT: vmov.32 q6[3], r1 ; CHECK-NEXT: vmov.u8 r1, q0[0] -; CHECK-NEXT: vmov.u8 r2, q0[1] -; CHECK-NEXT: vmov q7[2], q7[0], r2, r1 +; CHECK-NEXT: vmov.32 q7[0], r1 +; CHECK-NEXT: vmov.u8 r1, q0[1] +; CHECK-NEXT: vmov.32 q7[2], r1 ; CHECK-NEXT: vand q7, q7, q1 ; CHECK-NEXT: vand q6, q7, q6 ; CHECK-NEXT: vmov r1, s27 @@ -688,64 +741,75 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) { ; CHECK-NEXT: vmov r2, s26 ; CHECK-NEXT: add r2, r3 ; CHECK-NEXT: ubfx r3, r0, #8, #1 -; CHECK-NEXT: ubfx r0, r0, #12, #1 ; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: ubfx r0, r0, #12, #1 +; CHECK-NEXT: vmov.32 q6[0], r3 ; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: vmov q6[2], q6[0], r0, r3 -; CHECK-NEXT: vmov q6[3], q6[1], r0, r3 +; CHECK-NEXT: vmov.32 q6[1], r3 +; CHECK-NEXT: vmov.32 q6[2], r0 +; CHECK-NEXT: vmov.32 q6[3], r0 ; CHECK-NEXT: vmov.u8 r0, q0[2] -; CHECK-NEXT: vmov.u8 r3, q0[3] -; CHECK-NEXT: vmov q7[2], q7[0], r3, r0 +; CHECK-NEXT: vmov.32 q7[0], r0 +; CHECK-NEXT: vmov.u8 r0, q0[3] +; CHECK-NEXT: vmov.32 q7[2], r0 ; CHECK-NEXT: vand q7, q7, q1 ; CHECK-NEXT: vand q6, q7, q6 ; CHECK-NEXT: vmov r3, s24 ; CHECK-NEXT: vmov r0, s25 ; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, s26 -; CHECK-NEXT: adcs r0, r1 -; CHECK-NEXT: vmov r1, s27 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov.u16 r3, q5[6] -; CHECK-NEXT: adc.w r12, r0, r1 -; CHECK-NEXT: vmov.u16 r1, q5[4] -; CHECK-NEXT: vmov q6[2], q6[0], r3, r1 -; CHECK-NEXT: vmov.u16 r1, q5[5] -; CHECK-NEXT: vmov.u16 r3, q5[7] -; CHECK-NEXT: vmov q6[3], q6[1], r3, r1 +; CHECK-NEXT: vmov r3, s27 +; CHECK-NEXT: adcs r1, r0 +; CHECK-NEXT: vmov r0, s26 +; CHECK-NEXT: adds.w r12, r2, r0 +; CHECK-NEXT: vmov.u16 r2, q5[4] +; CHECK-NEXT: vmov.32 q6[0], r2 +; CHECK-NEXT: vmov.u16 r2, q5[5] +; CHECK-NEXT: vmov.32 q6[1], r2 +; CHECK-NEXT: vmov.u16 r2, q5[6] +; CHECK-NEXT: vmov.32 q6[2], r2 +; CHECK-NEXT: vmov.u16 r2, q5[7] +; CHECK-NEXT: vmov.32 q6[3], r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vcmp.i32 ne, q6, zr -; CHECK-NEXT: vmrs r1, p0 -; CHECK-NEXT: and r3, r1, #1 -; CHECK-NEXT: ubfx r0, r1, #4, #1 +; CHECK-NEXT: vmrs r2, p0 +; CHECK-NEXT: and r3, r2, #1 ; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: vmov q5[2], q5[0], r0, r3 -; CHECK-NEXT: vmov q5[3], q5[1], r0, r3 -; CHECK-NEXT: vmov.u8 r0, q0[4] +; CHECK-NEXT: vmov.32 
q5[0], r3 +; CHECK-NEXT: vmov.32 q5[1], r3 +; CHECK-NEXT: ubfx r3, r2, #4, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: vmov.32 q5[2], r3 +; CHECK-NEXT: vmov.32 q5[3], r3 +; CHECK-NEXT: vmov.u8 r3, q0[4] +; CHECK-NEXT: vmov.32 q6[0], r3 ; CHECK-NEXT: vmov.u8 r3, q0[5] -; CHECK-NEXT: vmov q6[2], q6[0], r3, r0 +; CHECK-NEXT: vmov.32 q6[2], r3 ; CHECK-NEXT: vand q6, q6, q1 ; CHECK-NEXT: vand q5, q6, q5 -; CHECK-NEXT: vmov r3, s20 -; CHECK-NEXT: vmov r0, s21 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, s23 -; CHECK-NEXT: adc.w r12, r12, r0 +; CHECK-NEXT: vmov r0, s20 +; CHECK-NEXT: vmov r3, s21 +; CHECK-NEXT: adds.w r12, r12, r0 ; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r2, r12, r3 -; CHECK-NEXT: ubfx r3, r1, #8, #1 -; CHECK-NEXT: ubfx r1, r1, #12, #1 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r3, s23 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: ubfx r3, r2, #8, #1 ; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: vmov q5[2], q5[0], r1, r3 -; CHECK-NEXT: vmov q5[3], q5[1], r1, r3 -; CHECK-NEXT: vmov.u8 r1, q0[6] -; CHECK-NEXT: vmov.u8 r3, q0[7] -; CHECK-NEXT: vmov q6[2], q6[0], r3, r1 +; CHECK-NEXT: ubfx r2, r2, #12, #1 +; CHECK-NEXT: vmov.32 q5[0], r3 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: vmov.32 q5[1], r3 +; CHECK-NEXT: vmov.32 q5[2], r2 +; CHECK-NEXT: vmov.32 q5[3], r2 +; CHECK-NEXT: vmov.u8 r2, q0[6] +; CHECK-NEXT: vmov.32 q6[0], r2 +; CHECK-NEXT: vmov.u8 r2, q0[7] +; CHECK-NEXT: vmov.32 q6[2], r2 ; CHECK-NEXT: vand q6, q6, q1 ; CHECK-NEXT: vand q5, q6, q5 ; CHECK-NEXT: vmov r3, s20 -; CHECK-NEXT: vmov r1, s21 +; CHECK-NEXT: vmov r2, s21 ; CHECK-NEXT: adds r0, r0, r3 ; CHECK-NEXT: vmov r3, s22 ; CHECK-NEXT: adcs r1, r2 @@ -771,41 +835,49 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) { ; CHECK-NEXT: vcmp.i16 ne, q5, zr ; CHECK-NEXT: vpsel q2, q3, q2 ; CHECK-NEXT: vmov.u16 r2, q2[0] -; CHECK-NEXT: vmov.u16 r3, q2[2] -; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 +; CHECK-NEXT: vmov.32 q3[0], r2 ; CHECK-NEXT: vmov.u16 r2, q2[1] -; CHECK-NEXT: vmov.u16 r3, q2[3] -; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 +; CHECK-NEXT: vmov.32 q3[1], r2 +; CHECK-NEXT: vmov.u16 r2, q2[2] +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vmov.u16 r2, q2[3] +; CHECK-NEXT: vmov.32 q3[3], r2 ; CHECK-NEXT: vcmp.i32 ne, q3, zr ; CHECK-NEXT: vmrs r2, p0 ; CHECK-NEXT: and r3, r2, #1 -; CHECK-NEXT: ubfx r0, r2, #4, #1 ; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: vmov q3[2], q3[0], r0, r3 -; CHECK-NEXT: vmov q3[3], q3[1], r0, r3 -; CHECK-NEXT: vmov.u8 r0, q0[8] +; CHECK-NEXT: vmov.32 q3[0], r3 +; CHECK-NEXT: vmov.32 q3[1], r3 +; CHECK-NEXT: ubfx r3, r2, #4, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: vmov.32 q3[2], r3 +; CHECK-NEXT: vmov.32 q3[3], r3 +; CHECK-NEXT: vmov.u8 r3, q0[8] +; CHECK-NEXT: vmov.32 q4[0], r3 ; CHECK-NEXT: vmov.u8 r3, q0[9] -; CHECK-NEXT: vmov q4[2], q4[0], r3, r0 +; CHECK-NEXT: vmov.32 q4[2], r3 ; CHECK-NEXT: vand q4, q4, q1 ; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: adds.w r3, r3, r12 -; CHECK-NEXT: adc.w r12, r1, r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmov r3, s13 +; CHECK-NEXT: adds.w r12, r12, r0 ; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov r1, s15 -; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r3, s15 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: ubfx r3, r2, #8, #1 +; CHECK-NEXT: rsbs r3, 
r3, #0 ; CHECK-NEXT: ubfx r2, r2, #12, #1 -; CHECK-NEXT: rsb.w r3, r3, #0 -; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: adc.w r1, r1, r12 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 -; CHECK-NEXT: vmov q3[3], q3[1], r2, r3 +; CHECK-NEXT: vmov.32 q3[0], r3 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: vmov.32 q3[1], r3 +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vmov.32 q3[3], r2 ; CHECK-NEXT: vmov.u8 r2, q0[10] -; CHECK-NEXT: vmov.u8 r3, q0[11] -; CHECK-NEXT: vmov q4[2], q4[0], r3, r2 +; CHECK-NEXT: vmov.32 q4[0], r2 +; CHECK-NEXT: vmov.u8 r2, q0[11] +; CHECK-NEXT: vmov.32 q4[2], r2 ; CHECK-NEXT: vand q4, q4, q1 ; CHECK-NEXT: vand q3, q4, q3 ; CHECK-NEXT: vmov r3, s12 @@ -815,44 +887,52 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) { ; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: vmov r2, s15 ; CHECK-NEXT: adds.w r12, r0, r3 -; CHECK-NEXT: vmov.u16 r3, q2[6] ; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: vmov.u16 r2, q2[4] -; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 +; CHECK-NEXT: vmov.32 q3[0], r2 ; CHECK-NEXT: vmov.u16 r2, q2[5] -; CHECK-NEXT: vmov.u16 r3, q2[7] -; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 +; CHECK-NEXT: vmov.32 q3[1], r2 +; CHECK-NEXT: vmov.u16 r2, q2[6] +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vmov.u16 r2, q2[7] +; CHECK-NEXT: vmov.32 q3[3], r2 ; CHECK-NEXT: vcmp.i32 ne, q3, zr ; CHECK-NEXT: vmrs r2, p0 ; CHECK-NEXT: and r3, r2, #1 -; CHECK-NEXT: ubfx r0, r2, #4, #1 ; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r0, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r0, r3 -; CHECK-NEXT: vmov.u8 r0, q0[12] +; CHECK-NEXT: vmov.32 q2[0], r3 +; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: ubfx r3, r2, #4, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: vmov.32 q2[2], r3 +; CHECK-NEXT: vmov.32 q2[3], r3 +; CHECK-NEXT: vmov.u8 r3, q0[12] +; CHECK-NEXT: vmov.32 q3[0], r3 ; CHECK-NEXT: vmov.u8 r3, q0[13] -; CHECK-NEXT: vmov q3[2], q3[0], r3, r0 +; CHECK-NEXT: vmov.32 q3[2], r3 ; CHECK-NEXT: vand q3, q3, q1 ; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: adds.w r3, r3, r12 -; CHECK-NEXT: adc.w r12, r1, r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov r3, s9 +; CHECK-NEXT: adds.w r12, r12, r0 ; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov r1, s11 -; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r3, s11 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: ubfx r3, r2, #8, #1 +; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: ubfx r2, r2, #12, #1 -; CHECK-NEXT: rsb.w r3, r3, #0 -; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: adc.w r1, r1, r12 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r3 +; CHECK-NEXT: vmov.32 q2[0], r3 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.32 q2[3], r2 ; CHECK-NEXT: vmov.u8 r2, q0[14] -; CHECK-NEXT: vmov.u8 r3, q0[15] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: vmov.u8 r2, q0[15] +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vand q0, q3, q1 ; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r2, s1 @@ -900,48 +980,58 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %b) { ; CHECK-NEXT: vcmp.i16 ne, q4, zr ; CHECK-NEXT: vpsel q4, q2, q1 ; CHECK-NEXT: vmov.u16 r0, q4[0] -; CHECK-NEXT: vmov.u16 r1, q4[2] -; CHECK-NEXT: vmov q5[2], q5[0], r1, r0 +; CHECK-NEXT: vmov.32 q5[0], 
r0 ; CHECK-NEXT: vmov.u16 r0, q4[1] -; CHECK-NEXT: vmov.u16 r1, q4[3] -; CHECK-NEXT: vmov q5[3], q5[1], r1, r0 +; CHECK-NEXT: vmov.32 q5[1], r0 +; CHECK-NEXT: vmov.u16 r0, q4[2] +; CHECK-NEXT: vmov.32 q5[2], r0 +; CHECK-NEXT: vmov.u16 r0, q4[3] +; CHECK-NEXT: vmov.32 q5[3], r0 ; CHECK-NEXT: vcmp.i32 ne, q5, zr ; CHECK-NEXT: vmrs r0, p0 ; CHECK-NEXT: and r1, r0, #1 -; CHECK-NEXT: ubfx r2, r0, #4, #1 ; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov q5[2], q5[0], r2, r1 -; CHECK-NEXT: vmov q5[3], q5[1], r2, r1 +; CHECK-NEXT: vmov.32 q5[0], r1 +; CHECK-NEXT: vmov.32 q5[1], r1 +; CHECK-NEXT: ubfx r1, r0, #4, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: vmov.32 q5[2], r1 +; CHECK-NEXT: vmov.32 q5[3], r1 ; CHECK-NEXT: vmov.u8 r1, q0[0] -; CHECK-NEXT: vmov.u8 r2, q0[1] ; CHECK-NEXT: sxtb r1, r1 -; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: vmov q6[2], q6[0], r2, r1 +; CHECK-NEXT: vmov.32 q6[0], r1 ; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: vmov q6[3], q6[1], r2, r1 +; CHECK-NEXT: vmov.32 q6[1], r1 +; CHECK-NEXT: vmov.u8 r1, q0[1] +; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: vmov.32 q6[2], r1 +; CHECK-NEXT: asrs r1, r1, #31 +; CHECK-NEXT: vmov.32 q6[3], r1 ; CHECK-NEXT: vand q5, q6, q5 -; CHECK-NEXT: vmov r3, s22 -; CHECK-NEXT: vmov r1, s20 +; CHECK-NEXT: vmov r1, s22 +; CHECK-NEXT: vmov r2, s20 ; CHECK-NEXT: vmov r12, s23 -; CHECK-NEXT: vmov r2, s21 -; CHECK-NEXT: adds r1, r1, r3 +; CHECK-NEXT: vmov r3, s21 +; CHECK-NEXT: adds r1, r1, r2 +; CHECK-NEXT: adc.w r2, r3, r12 ; CHECK-NEXT: ubfx r3, r0, #8, #1 +; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: ubfx r0, r0, #12, #1 -; CHECK-NEXT: rsb.w r3, r3, #0 -; CHECK-NEXT: rsb.w r0, r0, #0 -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: vmov q5[2], q5[0], r0, r3 -; CHECK-NEXT: vmov q5[3], q5[1], r0, r3 +; CHECK-NEXT: vmov.32 q5[0], r3 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: vmov.32 q5[1], r3 +; CHECK-NEXT: vmov.32 q5[2], r0 +; CHECK-NEXT: vmov.32 q5[3], r0 ; CHECK-NEXT: vmov.u8 r0, q0[2] -; CHECK-NEXT: vmov.u8 r3, q0[3] ; CHECK-NEXT: sxtb r0, r0 -; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: vmov q6[2], q6[0], r3, r0 +; CHECK-NEXT: vmov.32 q6[0], r0 ; CHECK-NEXT: asrs r0, r0, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q6[3], q6[1], r3, r0 +; CHECK-NEXT: vmov.32 q6[1], r0 +; CHECK-NEXT: vmov.u8 r0, q0[3] +; CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: vmov.32 q6[2], r0 +; CHECK-NEXT: asrs r0, r0, #31 +; CHECK-NEXT: vmov.32 q6[3], r0 ; CHECK-NEXT: vand q5, q6, q5 ; CHECK-NEXT: vmov r3, s20 ; CHECK-NEXT: vmov r0, s21 @@ -952,50 +1042,60 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %b) { ; CHECK-NEXT: adds.w r12, r1, r0 ; CHECK-NEXT: adc.w r1, r2, r3 ; CHECK-NEXT: vmov.u16 r2, q4[4] -; CHECK-NEXT: vmov.u16 r3, q4[6] -; CHECK-NEXT: vmov q5[2], q5[0], r3, r2 +; CHECK-NEXT: vmov.32 q5[0], r2 ; CHECK-NEXT: vmov.u16 r2, q4[5] -; CHECK-NEXT: vmov.u16 r3, q4[7] -; CHECK-NEXT: vmov q5[3], q5[1], r3, r2 +; CHECK-NEXT: vmov.32 q5[1], r2 +; CHECK-NEXT: vmov.u16 r2, q4[6] +; CHECK-NEXT: vmov.32 q5[2], r2 +; CHECK-NEXT: vmov.u16 r2, q4[7] +; CHECK-NEXT: vmov.32 q5[3], r2 ; CHECK-NEXT: vcmp.i32 ne, q5, zr ; CHECK-NEXT: vmrs r2, p0 ; CHECK-NEXT: and r3, r2, #1 -; CHECK-NEXT: ubfx r0, r2, #4, #1 ; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: vmov q4[2], q4[0], r0, r3 -; CHECK-NEXT: vmov q4[3], q4[1], r0, r3 -; CHECK-NEXT: vmov.u8 r0, q0[4] +; CHECK-NEXT: vmov.32 q4[0], r3 +; CHECK-NEXT: vmov.32 q4[1], r3 +; CHECK-NEXT: ubfx r3, r2, #4, #1 +; 
CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: vmov.32 q4[2], r3 +; CHECK-NEXT: vmov.32 q4[3], r3 +; CHECK-NEXT: vmov.u8 r3, q0[4] +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: vmov.32 q5[0], r3 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov.32 q5[1], r3 ; CHECK-NEXT: vmov.u8 r3, q0[5] -; CHECK-NEXT: sxtb r0, r0 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: vmov q5[2], q5[0], r3, r0 -; CHECK-NEXT: asrs r0, r0, #31 +; CHECK-NEXT: vmov.32 q5[2], r3 ; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q5[3], q5[1], r3, r0 +; CHECK-NEXT: vmov.32 q5[3], r3 ; CHECK-NEXT: vand q4, q5, q4 -; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: adds.w r3, r3, r12 -; CHECK-NEXT: adc.w r12, r1, r0 +; CHECK-NEXT: vmov r0, s16 +; CHECK-NEXT: vmov r3, s17 +; CHECK-NEXT: adds.w r12, r12, r0 ; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov r1, s19 -; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r3, s19 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: ubfx r3, r2, #8, #1 +; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: ubfx r2, r2, #12, #1 -; CHECK-NEXT: rsb.w r3, r3, #0 -; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: adc.w r1, r1, r12 -; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 -; CHECK-NEXT: vmov q4[3], q4[1], r2, r3 +; CHECK-NEXT: vmov.32 q4[0], r3 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: vmov.32 q4[1], r3 +; CHECK-NEXT: vmov.32 q4[2], r2 +; CHECK-NEXT: vmov.32 q4[3], r2 ; CHECK-NEXT: vmov.u8 r2, q0[6] -; CHECK-NEXT: vmov.u8 r3, q0[7] ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: vmov q5[2], q5[0], r3, r2 +; CHECK-NEXT: vmov.32 q5[0], r2 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q5[3], q5[1], r3, r2 +; CHECK-NEXT: vmov.32 q5[1], r2 +; CHECK-NEXT: vmov.u8 r2, q0[7] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: vmov.32 q5[2], r2 +; CHECK-NEXT: asrs r2, r2, #31 +; CHECK-NEXT: vmov.32 q5[3], r2 ; CHECK-NEXT: vand q4, q5, q4 ; CHECK-NEXT: vmov r3, s16 ; CHECK-NEXT: vmov r2, s17 @@ -1024,50 +1124,60 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %b) { ; CHECK-NEXT: vcmp.i16 ne, q4, zr ; CHECK-NEXT: vpsel q1, q2, q1 ; CHECK-NEXT: vmov.u16 r2, q1[0] -; CHECK-NEXT: vmov.u16 r3, q1[2] -; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 +; CHECK-NEXT: vmov.32 q2[0], r2 ; CHECK-NEXT: vmov.u16 r2, q1[1] -; CHECK-NEXT: vmov.u16 r3, q1[3] -; CHECK-NEXT: vmov q2[3], q2[1], r3, r2 +; CHECK-NEXT: vmov.32 q2[1], r2 +; CHECK-NEXT: vmov.u16 r2, q1[2] +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.u16 r2, q1[3] +; CHECK-NEXT: vmov.32 q2[3], r2 ; CHECK-NEXT: vcmp.i32 ne, q2, zr ; CHECK-NEXT: vmrs r2, p0 ; CHECK-NEXT: and r3, r2, #1 -; CHECK-NEXT: ubfx r0, r2, #4, #1 ; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r0, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r0, r3 -; CHECK-NEXT: vmov.u8 r0, q0[8] +; CHECK-NEXT: vmov.32 q2[0], r3 +; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: ubfx r3, r2, #4, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: vmov.32 q2[2], r3 +; CHECK-NEXT: vmov.32 q2[3], r3 +; CHECK-NEXT: vmov.u8 r3, q0[8] +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: vmov.32 q3[0], r3 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov.32 q3[1], r3 ; CHECK-NEXT: vmov.u8 r3, q0[9] -; CHECK-NEXT: sxtb r0, r0 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: vmov q3[2], q3[0], r3, r0 -; CHECK-NEXT: asrs r0, r0, #31 +; CHECK-NEXT: vmov.32 q3[2], r3 ; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q3[3], q3[1], r3, r0 +; CHECK-NEXT: vmov.32 q3[3], r3 ; 
CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: adds.w r3, r3, r12 -; CHECK-NEXT: adc.w r12, r1, r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov r3, s9 +; CHECK-NEXT: adds.w r12, r12, r0 ; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov r1, s11 -; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r3, s11 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: ubfx r3, r2, #8, #1 +; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: ubfx r2, r2, #12, #1 -; CHECK-NEXT: rsb.w r3, r3, #0 -; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: adc.w r1, r1, r12 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r3 +; CHECK-NEXT: vmov.32 q2[0], r3 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.32 q2[3], r2 ; CHECK-NEXT: vmov.u8 r2, q0[10] -; CHECK-NEXT: vmov.u8 r3, q0[11] ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 +; CHECK-NEXT: vmov.32 q3[0], r2 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 +; CHECK-NEXT: vmov.32 q3[1], r2 +; CHECK-NEXT: vmov.u8 r2, q0[11] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: asrs r2, r2, #31 +; CHECK-NEXT: vmov.32 q3[3], r2 ; CHECK-NEXT: vand q2, q3, q2 ; CHECK-NEXT: vmov r3, s8 ; CHECK-NEXT: vmov r2, s9 @@ -1076,53 +1186,63 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %b) { ; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: vmov r2, s11 ; CHECK-NEXT: adds.w r12, r0, r3 -; CHECK-NEXT: vmov.u16 r3, q1[6] ; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: vmov.u16 r2, q1[4] -; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 +; CHECK-NEXT: vmov.32 q2[0], r2 ; CHECK-NEXT: vmov.u16 r2, q1[5] -; CHECK-NEXT: vmov.u16 r3, q1[7] -; CHECK-NEXT: vmov q2[3], q2[1], r3, r2 +; CHECK-NEXT: vmov.32 q2[1], r2 +; CHECK-NEXT: vmov.u16 r2, q1[6] +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.u16 r2, q1[7] +; CHECK-NEXT: vmov.32 q2[3], r2 ; CHECK-NEXT: vcmp.i32 ne, q2, zr ; CHECK-NEXT: vmrs r2, p0 ; CHECK-NEXT: and r3, r2, #1 -; CHECK-NEXT: ubfx r0, r2, #4, #1 ; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: vmov q1[2], q1[0], r0, r3 -; CHECK-NEXT: vmov q1[3], q1[1], r0, r3 -; CHECK-NEXT: vmov.u8 r0, q0[12] +; CHECK-NEXT: vmov.32 q1[0], r3 +; CHECK-NEXT: vmov.32 q1[1], r3 +; CHECK-NEXT: ubfx r3, r2, #4, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: vmov.32 q1[2], r3 +; CHECK-NEXT: vmov.32 q1[3], r3 +; CHECK-NEXT: vmov.u8 r3, q0[12] +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: vmov.32 q2[0], r3 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov.32 q2[1], r3 ; CHECK-NEXT: vmov.u8 r3, q0[13] -; CHECK-NEXT: sxtb r0, r0 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: vmov q2[2], q2[0], r3, r0 -; CHECK-NEXT: asrs r0, r0, #31 +; CHECK-NEXT: vmov.32 q2[2], r3 ; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q2[3], q2[1], r3, r0 +; CHECK-NEXT: vmov.32 q2[3], r3 ; CHECK-NEXT: vand q1, q2, q1 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: adds.w r3, r3, r12 -; CHECK-NEXT: adc.w r12, r1, r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r3, s5 +; CHECK-NEXT: adds.w r12, r12, r0 ; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s7 -; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r3, s7 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: ubfx r3, r2, #8, #1 +; CHECK-NEXT: rsbs r3, r3, #0 ; 
CHECK-NEXT: ubfx r2, r2, #12, #1 -; CHECK-NEXT: rsb.w r3, r3, #0 -; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: adc.w r1, r1, r12 -; CHECK-NEXT: vmov q1[2], q1[0], r2, r3 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r3 +; CHECK-NEXT: vmov.32 q1[0], r3 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: vmov.32 q1[1], r3 +; CHECK-NEXT: vmov.32 q1[2], r2 +; CHECK-NEXT: vmov.32 q1[3], r2 ; CHECK-NEXT: vmov.u8 r2, q0[14] -; CHECK-NEXT: vmov.u8 r3, q0[15] ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov.32 q2[0], r2 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov.32 q2[1], r2 +; CHECK-NEXT: vmov.u8 r2, q0[15] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: asrs r2, r2, #31 +; CHECK-NEXT: vmov.32 q2[3], r2 +; CHECK-NEXT: vand q0, q2, q1 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r2, s1 ; CHECK-NEXT: adds r0, r0, r3 @@ -1148,18 +1268,20 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %b) { ; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s6 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.32 q3[3], r0 +; CHECK-NEXT: vand q0, q0, q3 ; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: vmov r2, s1 @@ -1179,28 +1301,32 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %b) { ; CHECK-LABEL: add_v2i8_v2i64_sext: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.i32 q2, #0xff -; CHECK-NEXT: vand q1, q1, q2 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vand q2, q1, q2 +; CHECK-NEXT: vmov r0, s8 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov.32 q1[3], r0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: sxtb r0, r0 -; CHECK-NEXT: sxtb r1, r1 -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 +; CHECK-NEXT: vmov.32 q2[0], r0 ; CHECK-NEXT: asrs r0, r0, #31 -; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: asrs r0, r0, #31 +; CHECK-NEXT: vmov.32 q2[3], r0 +; CHECK-NEXT: vand q0, q2, q1 ; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r1, s3 @@ -1221,19 +1347,21 @@ define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x, <2 x i64> %b) { ; CHECK: @ %bb.0: 
@ %entry ; CHECK-NEXT: vmov r0, s5 ; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s7 +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov.32 q2[3], r0 +; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r1, s3 @@ -1299,19 +1427,21 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, <2 x i32> %b, ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov r2, s4 ; CHECK-NEXT: vmov.i64 q2, #0xffffffff -; CHECK-NEXT: vmov r3, s6 ; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: cset r2, eq ; CHECK-NEXT: tst.w r2, #1 ; CHECK-NEXT: csetm r2, ne -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: cset r3, eq -; CHECK-NEXT: tst.w r3, #1 -; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.32 q2[1], r2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: tst.w r2, #1 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.32 q2[3], r2 +; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r12, s3 @@ -1336,24 +1466,28 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, <2 x i32> %b, ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov.32 q2[0], r2 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: vmov.32 q2[1], r2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: asrs r2, r2, #31 +; CHECK-NEXT: vmov.32 q2[3], r2 ; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s6 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: cset r2, eq ; CHECK-NEXT: tst.w r2, #1 ; CHECK-NEXT: csetm r2, ne -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: cset r3, eq -; CHECK-NEXT: tst.w r3, #1 -; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: tst.w r2, #1 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vand q0, q2, q0 ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r12, s3 @@ -1463,23 +1597,28 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, <8 x i16> %b, ; CHECK-NEXT: vcmp.i16 eq, q1, zr ; CHECK-NEXT: vpsel q2, q3, q2 ; CHECK-NEXT: vmov.u16 r2, q2[0] -; CHECK-NEXT: vmov.u16 r3, q2[2] -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 +; CHECK-NEXT: vmov.32 q1[0], r2 ; CHECK-NEXT: vmov.u16 r2, q2[1] -; CHECK-NEXT: vmov.u16 r3, q2[3] -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 +; CHECK-NEXT: 
vmov.32 q1[1], r2 +; CHECK-NEXT: vmov.u16 r2, q2[2] +; CHECK-NEXT: vmov.32 q1[2], r2 +; CHECK-NEXT: vmov.u16 r2, q2[3] +; CHECK-NEXT: vmov.32 q1[3], r2 ; CHECK-NEXT: vcmp.i32 ne, q1, zr ; CHECK-NEXT: vmov.i64 q1, #0xffff ; CHECK-NEXT: vmrs r2, p0 ; CHECK-NEXT: and r3, r2, #1 -; CHECK-NEXT: rsb.w r12, r3, #0 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: vmov.32 q3[0], r3 +; CHECK-NEXT: vmov.32 q3[1], r3 ; CHECK-NEXT: ubfx r3, r2, #4, #1 ; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: vmov q3[2], q3[0], r3, r12 -; CHECK-NEXT: vmov q3[3], q3[1], r3, r12 -; CHECK-NEXT: vmov.u16 r12, q0[0] +; CHECK-NEXT: vmov.32 q3[2], r3 +; CHECK-NEXT: vmov.32 q3[3], r3 +; CHECK-NEXT: vmov.u16 r3, q0[0] +; CHECK-NEXT: vmov.32 q4[0], r3 ; CHECK-NEXT: vmov.u16 r3, q0[1] -; CHECK-NEXT: vmov q4[2], q4[0], r3, r12 +; CHECK-NEXT: vmov.32 q4[2], r3 ; CHECK-NEXT: vand q4, q4, q1 ; CHECK-NEXT: vand q3, q4, q3 ; CHECK-NEXT: vmov r12, s15 @@ -1489,61 +1628,72 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, <8 x i16> %b, ; CHECK-NEXT: vmov r3, s12 ; CHECK-NEXT: add lr, r3 ; CHECK-NEXT: ubfx r3, r2, #8, #1 -; CHECK-NEXT: ubfx r2, r2, #12, #1 ; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: ubfx r2, r2, #12, #1 +; CHECK-NEXT: vmov.32 q3[0], r3 ; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 -; CHECK-NEXT: vmov q3[3], q3[1], r2, r3 +; CHECK-NEXT: vmov.32 q3[1], r3 +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vmov.32 q3[3], r2 ; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: vmov.u16 r3, q0[3] -; CHECK-NEXT: vmov q4[2], q4[0], r3, r2 +; CHECK-NEXT: vmov.32 q4[0], r2 +; CHECK-NEXT: vmov.u16 r2, q0[3] +; CHECK-NEXT: vmov.32 q4[2], r2 ; CHECK-NEXT: vand q4, q4, q1 ; CHECK-NEXT: vand q3, q4, q3 ; CHECK-NEXT: vmov r3, s12 ; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: adds.w lr, lr, r3 +; CHECK-NEXT: adds.w r4, lr, r3 ; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: adc.w lr, r12, r2 ; CHECK-NEXT: vmov r2, s15 -; CHECK-NEXT: adds.w lr, lr, r3 -; CHECK-NEXT: vmov.u16 r3, q2[6] -; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: adds.w r12, r4, r3 +; CHECK-NEXT: adc.w lr, lr, r2 ; CHECK-NEXT: vmov.u16 r2, q2[4] -; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 +; CHECK-NEXT: vmov.32 q3[0], r2 ; CHECK-NEXT: vmov.u16 r2, q2[5] -; CHECK-NEXT: vmov.u16 r3, q2[7] -; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 +; CHECK-NEXT: vmov.32 q3[1], r2 +; CHECK-NEXT: vmov.u16 r2, q2[6] +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vmov.u16 r2, q2[7] +; CHECK-NEXT: vmov.32 q3[3], r2 ; CHECK-NEXT: vcmp.i32 ne, q3, zr ; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r3, r2, #1 +; CHECK-NEXT: and r4, r2, #1 +; CHECK-NEXT: rsbs r4, r4, #0 +; CHECK-NEXT: vmov.32 q2[0], r4 +; CHECK-NEXT: vmov.32 q2[1], r4 ; CHECK-NEXT: ubfx r4, r2, #4, #1 -; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r4, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r4, r3 -; CHECK-NEXT: vmov.u16 r3, q0[4] +; CHECK-NEXT: vmov.32 q2[2], r4 +; CHECK-NEXT: vmov.32 q2[3], r4 +; CHECK-NEXT: vmov.u16 r4, q0[4] +; CHECK-NEXT: vmov.32 q3[0], r4 ; CHECK-NEXT: vmov.u16 r4, q0[5] -; CHECK-NEXT: vmov q3[2], q3[0], r4, r3 +; CHECK-NEXT: vmov.32 q3[2], r4 ; CHECK-NEXT: vand q3, q3, q1 ; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vmov r3, s9 -; CHECK-NEXT: adds.w lr, lr, r4 -; CHECK-NEXT: vmov r4, s10 -; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: vmov r4, s9 +; CHECK-NEXT: adds.w r12, r12, r3 ; CHECK-NEXT: vmov r3, s11 -; CHECK-NEXT: adds.w r4, r4, lr -; CHECK-NEXT: 
adc.w r12, r12, r3 +; CHECK-NEXT: adc.w lr, lr, r4 +; CHECK-NEXT: vmov r4, s10 +; CHECK-NEXT: adds.w r4, r4, r12 +; CHECK-NEXT: adc.w r12, lr, r3 ; CHECK-NEXT: ubfx r3, r2, #8, #1 -; CHECK-NEXT: ubfx r2, r2, #12, #1 ; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: ubfx r2, r2, #12, #1 +; CHECK-NEXT: vmov.32 q2[0], r3 ; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r3 +; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.32 q2[3], r2 ; CHECK-NEXT: vmov.u16 r2, q0[6] -; CHECK-NEXT: vmov.u16 r3, q0[7] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: vmov.u16 r2, q0[7] +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vand q0, q3, q1 ; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r2, s1 @@ -1576,113 +1726,133 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, <8 x i16> %b, ; CHECK-NEXT: vcmp.i16 eq, q1, zr ; CHECK-NEXT: vpsel q1, q3, q2 ; CHECK-NEXT: vmov.u16 r2, q1[0] -; CHECK-NEXT: vmov.u16 r3, q1[2] -; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 +; CHECK-NEXT: vmov.32 q2[0], r2 ; CHECK-NEXT: vmov.u16 r2, q1[1] -; CHECK-NEXT: vmov.u16 r3, q1[3] -; CHECK-NEXT: vmov q2[3], q2[1], r3, r2 +; CHECK-NEXT: vmov.32 q2[1], r2 +; CHECK-NEXT: vmov.u16 r2, q1[2] +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.u16 r2, q1[3] +; CHECK-NEXT: vmov.32 q2[3], r2 ; CHECK-NEXT: vcmp.i32 ne, q2, zr -; CHECK-NEXT: vmrs r12, p0 -; CHECK-NEXT: and r3, r12, #1 -; CHECK-NEXT: ubfx r2, r12, #4, #1 +; CHECK-NEXT: vmrs r2, p0 +; CHECK-NEXT: and r3, r2, #1 ; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r3 -; CHECK-NEXT: vmov.u16 r2, q0[0] +; CHECK-NEXT: vmov.32 q2[0], r3 +; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: ubfx r3, r2, #4, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: vmov.32 q2[2], r3 +; CHECK-NEXT: vmov.32 q2[3], r3 +; CHECK-NEXT: vmov.u16 r3, q0[0] +; CHECK-NEXT: sxth r3, r3 +; CHECK-NEXT: vmov.32 q3[0], r3 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov.32 q3[1], r3 ; CHECK-NEXT: vmov.u16 r3, q0[1] -; CHECK-NEXT: sxth r2, r2 ; CHECK-NEXT: sxth r3, r3 -; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 -; CHECK-NEXT: asrs r2, r2, #31 +; CHECK-NEXT: vmov.32 q3[2], r3 ; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 +; CHECK-NEXT: vmov.32 q3[3], r3 ; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: vmov r3, s10 ; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vmov lr, s11 -; CHECK-NEXT: vmov r3, s9 -; CHECK-NEXT: adds r5, r4, r2 -; CHECK-NEXT: ubfx r4, r12, #8, #1 -; CHECK-NEXT: ubfx r2, r12, #12, #1 -; CHECK-NEXT: rsb.w r4, r4, #0 +; CHECK-NEXT: vmov r12, s11 +; CHECK-NEXT: vmov r5, s9 +; CHECK-NEXT: adds.w lr, r4, r3 +; CHECK-NEXT: ubfx r3, r2, #8, #1 +; CHECK-NEXT: rsb.w r3, r3, #0 +; CHECK-NEXT: ubfx r2, r2, #12, #1 +; CHECK-NEXT: vmov.32 q2[0], r3 ; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: adc.w r3, r3, lr -; CHECK-NEXT: vmov q2[2], q2[0], r2, r4 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r4 +; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: adc.w r12, r12, r5 +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.32 q2[3], r2 ; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: vmov.u16 r4, q0[3] ; CHECK-NEXT: sxth r2, r2 -; CHECK-NEXT: sxth r4, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r4, r2 +; CHECK-NEXT: vmov.32 q3[0], r2 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r4, r4, #31 -; 
CHECK-NEXT: vmov q3[3], q3[1], r4, r2 +; CHECK-NEXT: vmov.32 q3[1], r2 +; CHECK-NEXT: vmov.u16 r2, q0[3] +; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: asrs r2, r2, #31 +; CHECK-NEXT: vmov.32 q3[3], r2 ; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r4, s8 +; CHECK-NEXT: vmov r3, s8 ; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds r5, r5, r4 -; CHECK-NEXT: vmov r4, s11 -; CHECK-NEXT: adcs r3, r2 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: adds.w r12, r5, r2 -; CHECK-NEXT: vmov.u16 r5, q1[4] -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov.u16 r4, q1[6] -; CHECK-NEXT: vmov q2[2], q2[0], r4, r5 -; CHECK-NEXT: vmov.u16 r5, q1[5] -; CHECK-NEXT: vmov.u16 r4, q1[7] -; CHECK-NEXT: vmov q2[3], q2[1], r4, r5 +; CHECK-NEXT: vmov r4, s10 +; CHECK-NEXT: vmov r5, s11 +; CHECK-NEXT: adds.w r3, r3, lr +; CHECK-NEXT: adc.w r2, r2, r12 +; CHECK-NEXT: adds.w r12, r3, r4 +; CHECK-NEXT: adc.w r3, r2, r5 +; CHECK-NEXT: vmov.u16 r2, q1[4] +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.u16 r2, q1[5] +; CHECK-NEXT: vmov.32 q2[1], r2 +; CHECK-NEXT: vmov.u16 r2, q1[6] +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.u16 r2, q1[7] +; CHECK-NEXT: vmov.32 q2[3], r2 ; CHECK-NEXT: vcmp.i32 ne, q2, zr -; CHECK-NEXT: vmrs r5, p0 -; CHECK-NEXT: and r4, r5, #1 -; CHECK-NEXT: ubfx r2, r5, #4, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov q1[2], q1[0], r2, r4 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r4 -; CHECK-NEXT: vmov.u16 r2, q0[4] -; CHECK-NEXT: vmov.u16 r4, q0[5] -; CHECK-NEXT: sxth r2, r2 -; CHECK-NEXT: sxth r4, r4 -; CHECK-NEXT: vmov q2[2], q2[0], r4, r2 -; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r4, r4, #31 -; CHECK-NEXT: vmov q2[3], q2[1], r4, r2 +; CHECK-NEXT: vmrs r2, p0 +; CHECK-NEXT: and r5, r2, #1 +; CHECK-NEXT: rsbs r5, r5, #0 +; CHECK-NEXT: vmov.32 q1[0], r5 +; CHECK-NEXT: vmov.32 q1[1], r5 +; CHECK-NEXT: ubfx r5, r2, #4, #1 +; CHECK-NEXT: rsbs r5, r5, #0 +; CHECK-NEXT: vmov.32 q1[2], r5 +; CHECK-NEXT: vmov.32 q1[3], r5 +; CHECK-NEXT: vmov.u16 r5, q0[4] +; CHECK-NEXT: sxth r5, r5 +; CHECK-NEXT: vmov.32 q2[0], r5 +; CHECK-NEXT: asrs r5, r5, #31 +; CHECK-NEXT: vmov.32 q2[1], r5 +; CHECK-NEXT: vmov.u16 r5, q0[5] +; CHECK-NEXT: sxth r5, r5 +; CHECK-NEXT: vmov.32 q2[2], r5 +; CHECK-NEXT: asrs r5, r5, #31 +; CHECK-NEXT: vmov.32 q2[3], r5 ; CHECK-NEXT: vand q1, q2, q1 ; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: adds.w r4, r4, r12 -; CHECK-NEXT: adc.w r12, r3, r2 -; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov r5, s5 +; CHECK-NEXT: adds.w r12, r12, r4 +; CHECK-NEXT: vmov r4, s6 +; CHECK-NEXT: adcs r5, r3 ; CHECK-NEXT: vmov r3, s7 -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: ubfx r4, r5, #8, #1 -; CHECK-NEXT: ubfx r5, r5, #12, #1 -; CHECK-NEXT: rsb.w r4, r4, #0 -; CHECK-NEXT: rsb.w r5, r5, #0 -; CHECK-NEXT: adc.w r3, r3, r12 -; CHECK-NEXT: vmov q1[2], q1[0], r5, r4 -; CHECK-NEXT: vmov q1[3], q1[1], r5, r4 -; CHECK-NEXT: vmov.u16 r5, q0[6] -; CHECK-NEXT: vmov.u16 r4, q0[7] -; CHECK-NEXT: sxth r5, r5 -; CHECK-NEXT: sxth r4, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r5 -; CHECK-NEXT: asrs r5, r5, #31 -; CHECK-NEXT: asrs r4, r4, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 -; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov r5, s1 -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: adcs r3, r5 +; CHECK-NEXT: adds.w r4, r4, r12 +; CHECK-NEXT: adc.w r12, r5, r3 +; CHECK-NEXT: ubfx r3, r2, #8, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: ubfx r2, r2, #12, #1 +; CHECK-NEXT: 
vmov.32 q1[0], r3 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: vmov.32 q1[1], r3 +; CHECK-NEXT: vmov.32 q1[2], r2 +; CHECK-NEXT: vmov.32 q1[3], r2 +; CHECK-NEXT: vmov.u16 r2, q0[6] +; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: asrs r2, r2, #31 +; CHECK-NEXT: vmov.32 q2[1], r2 +; CHECK-NEXT: vmov.u16 r2, q0[7] +; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: asrs r2, r2, #31 +; CHECK-NEXT: vmov.32 q2[3], r2 +; CHECK-NEXT: vand q0, q2, q1 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r2, s1 ; CHECK-NEXT: vmov r5, s3 -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: adcs r3, r5 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: vmov r4, s2 +; CHECK-NEXT: adc.w r2, r2, r12 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: adcs r2, r5 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %c = icmp eq <8 x i16> %b, zeroinitializer @@ -1700,18 +1870,20 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, <2 x i16> %b, ; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s6 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: cset r2, eq ; CHECK-NEXT: tst.w r2, #1 ; CHECK-NEXT: csetm r2, ne -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: cset r3, eq -; CHECK-NEXT: tst.w r3, #1 -; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: vmov.32 q3[1], r2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: tst.w r2, #1 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vmov.32 q3[3], r2 +; CHECK-NEXT: vand q0, q0, q3 ; CHECK-NEXT: vmov r2, s3 ; CHECK-NEXT: vmov r3, s1 ; CHECK-NEXT: orr.w r12, r3, r2 @@ -1736,28 +1908,32 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, <2 x i16> %b, ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.i32 q2, #0xffff -; CHECK-NEXT: vand q1, q1, q2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: vand q2, q1, q2 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: tst.w r2, #1 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: vmov.32 q1[0], r2 +; CHECK-NEXT: vmov.32 q1[1], r2 +; CHECK-NEXT: vmov r2, s10 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: cset r2, eq ; CHECK-NEXT: tst.w r2, #1 ; CHECK-NEXT: csetm r2, ne -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: cset r3, eq -; CHECK-NEXT: tst.w r3, #1 -; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 +; CHECK-NEXT: vmov.32 q1[2], r2 +; CHECK-NEXT: vmov.32 q1[3], r2 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: sxth r2, r2 -; CHECK-NEXT: sxth r3, r3 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov.32 q2[0], r2 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov.32 q2[1], r2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: asrs r2, r2, #31 +; CHECK-NEXT: vmov.32 q2[3], r2 +; CHECK-NEXT: vand q0, q2, q1 ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r12, s3 @@ -1929,8 +2105,8 @@ entry: define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b, i64 %a) { ; 
CHECK-LABEL: add_v16i8_v16i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vcmp.i8 eq, q1, zr @@ -1956,23 +2132,28 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b ; CHECK-NEXT: vcmp.i16 ne, q1, zr ; CHECK-NEXT: vpsel q5, q3, q2 ; CHECK-NEXT: vmov.u16 r2, q5[0] -; CHECK-NEXT: vmov.u16 r3, q5[2] -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 +; CHECK-NEXT: vmov.32 q1[0], r2 ; CHECK-NEXT: vmov.u16 r2, q5[1] -; CHECK-NEXT: vmov.u16 r3, q5[3] -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 +; CHECK-NEXT: vmov.32 q1[1], r2 +; CHECK-NEXT: vmov.u16 r2, q5[2] +; CHECK-NEXT: vmov.32 q1[2], r2 +; CHECK-NEXT: vmov.u16 r2, q5[3] +; CHECK-NEXT: vmov.32 q1[3], r2 ; CHECK-NEXT: vcmp.i32 ne, q1, zr ; CHECK-NEXT: vmov.i64 q1, #0xff ; CHECK-NEXT: vmrs r2, p0 ; CHECK-NEXT: and r3, r2, #1 -; CHECK-NEXT: rsb.w r12, r3, #0 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: vmov.32 q6[0], r3 +; CHECK-NEXT: vmov.32 q6[1], r3 ; CHECK-NEXT: ubfx r3, r2, #4, #1 ; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: vmov q6[2], q6[0], r3, r12 -; CHECK-NEXT: vmov q6[3], q6[1], r3, r12 -; CHECK-NEXT: vmov.u8 r12, q0[0] +; CHECK-NEXT: vmov.32 q6[2], r3 +; CHECK-NEXT: vmov.32 q6[3], r3 +; CHECK-NEXT: vmov.u8 r3, q0[0] +; CHECK-NEXT: vmov.32 q7[0], r3 ; CHECK-NEXT: vmov.u8 r3, q0[1] -; CHECK-NEXT: vmov q7[2], q7[0], r3, r12 +; CHECK-NEXT: vmov.32 q7[2], r3 ; CHECK-NEXT: vand q7, q7, q1 ; CHECK-NEXT: vand q6, q7, q6 ; CHECK-NEXT: vmov r12, s27 @@ -1982,70 +2163,80 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b ; CHECK-NEXT: vmov r3, s24 ; CHECK-NEXT: add lr, r3 ; CHECK-NEXT: ubfx r3, r2, #8, #1 -; CHECK-NEXT: ubfx r2, r2, #12, #1 ; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: ubfx r2, r2, #12, #1 +; CHECK-NEXT: vmov.32 q6[0], r3 ; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov q6[2], q6[0], r2, r3 -; CHECK-NEXT: vmov q6[3], q6[1], r2, r3 +; CHECK-NEXT: vmov.32 q6[1], r3 +; CHECK-NEXT: vmov.32 q6[2], r2 +; CHECK-NEXT: vmov.32 q6[3], r2 ; CHECK-NEXT: vmov.u8 r2, q0[2] -; CHECK-NEXT: vmov.u8 r3, q0[3] -; CHECK-NEXT: vmov q7[2], q7[0], r3, r2 +; CHECK-NEXT: vmov.32 q7[0], r2 +; CHECK-NEXT: vmov.u8 r2, q0[3] +; CHECK-NEXT: vmov.32 q7[2], r2 ; CHECK-NEXT: vand q7, q7, q1 ; CHECK-NEXT: vand q6, q7, q6 ; CHECK-NEXT: vmov r3, s24 ; CHECK-NEXT: vmov r2, s25 -; CHECK-NEXT: adds.w lr, lr, r3 +; CHECK-NEXT: adds.w r4, lr, r3 ; CHECK-NEXT: vmov r3, s26 -; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: adc.w lr, r12, r2 ; CHECK-NEXT: vmov r2, s27 -; CHECK-NEXT: adds.w lr, lr, r3 -; CHECK-NEXT: vmov.u16 r3, q5[6] -; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: adds.w r12, r4, r3 +; CHECK-NEXT: adc.w lr, lr, r2 ; CHECK-NEXT: vmov.u16 r2, q5[4] -; CHECK-NEXT: vmov q6[2], q6[0], r3, r2 +; CHECK-NEXT: vmov.32 q6[0], r2 ; CHECK-NEXT: vmov.u16 r2, q5[5] -; CHECK-NEXT: vmov.u16 r3, q5[7] -; CHECK-NEXT: vmov q6[3], q6[1], r3, r2 +; CHECK-NEXT: vmov.32 q6[1], r2 +; CHECK-NEXT: vmov.u16 r2, q5[6] +; CHECK-NEXT: vmov.32 q6[2], r2 +; CHECK-NEXT: vmov.u16 r2, q5[7] +; CHECK-NEXT: vmov.32 q6[3], r2 ; CHECK-NEXT: vcmp.i32 ne, q6, zr ; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r3, r2, #1 +; CHECK-NEXT: and r4, r2, #1 +; CHECK-NEXT: rsbs r4, r4, #0 +; CHECK-NEXT: vmov.32 q5[0], r4 +; CHECK-NEXT: vmov.32 q5[1], r4 ; CHECK-NEXT: ubfx r4, r2, #4, #1 -; 
CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: vmov q5[2], q5[0], r4, r3 -; CHECK-NEXT: vmov q5[3], q5[1], r4, r3 -; CHECK-NEXT: vmov.u8 r3, q0[4] +; CHECK-NEXT: vmov.32 q5[2], r4 +; CHECK-NEXT: vmov.32 q5[3], r4 +; CHECK-NEXT: vmov.u8 r4, q0[4] +; CHECK-NEXT: vmov.32 q6[0], r4 ; CHECK-NEXT: vmov.u8 r4, q0[5] -; CHECK-NEXT: vmov q6[2], q6[0], r4, r3 +; CHECK-NEXT: vmov.32 q6[2], r4 ; CHECK-NEXT: vand q6, q6, q1 ; CHECK-NEXT: vand q5, q6, q5 -; CHECK-NEXT: vmov r4, s20 -; CHECK-NEXT: vmov r3, s21 -; CHECK-NEXT: adds.w lr, lr, r4 -; CHECK-NEXT: vmov r4, s22 -; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r3, s20 +; CHECK-NEXT: vmov r4, s21 +; CHECK-NEXT: adds.w r12, r12, r3 ; CHECK-NEXT: vmov r3, s23 -; CHECK-NEXT: adds.w r4, r4, lr -; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: adc.w lr, lr, r4 +; CHECK-NEXT: vmov r4, s22 +; CHECK-NEXT: adds.w r4, r4, r12 +; CHECK-NEXT: adc.w r12, lr, r3 ; CHECK-NEXT: ubfx r3, r2, #8, #1 -; CHECK-NEXT: ubfx r2, r2, #12, #1 ; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: ubfx r2, r2, #12, #1 +; CHECK-NEXT: vmov.32 q5[0], r3 ; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov q5[2], q5[0], r2, r3 -; CHECK-NEXT: vmov q5[3], q5[1], r2, r3 +; CHECK-NEXT: vmov.32 q5[1], r3 +; CHECK-NEXT: vmov.32 q5[2], r2 +; CHECK-NEXT: vmov.32 q5[3], r2 ; CHECK-NEXT: vmov.u8 r2, q0[6] -; CHECK-NEXT: vmov.u8 r3, q0[7] -; CHECK-NEXT: vmov q6[2], q6[0], r3, r2 +; CHECK-NEXT: vmov.32 q6[0], r2 +; CHECK-NEXT: vmov.u8 r2, q0[7] +; CHECK-NEXT: vmov.32 q6[2], r2 ; CHECK-NEXT: vand q6, q6, q1 ; CHECK-NEXT: vand q5, q6, q5 ; CHECK-NEXT: vmov r3, s20 ; CHECK-NEXT: vmov r2, s21 -; CHECK-NEXT: adds.w lr, r4, r3 -; CHECK-NEXT: vmov r3, s22 -; CHECK-NEXT: adc.w r4, r12, r2 -; CHECK-NEXT: vmov r2, s23 -; CHECK-NEXT: adds.w r12, lr, r3 -; CHECK-NEXT: adc.w lr, r4, r2 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: vmov r4, s23 +; CHECK-NEXT: adc.w lr, r12, r2 +; CHECK-NEXT: vmov r2, s22 +; CHECK-NEXT: adds.w r12, r3, r2 ; CHECK-NEXT: vmov.u8 r2, q4[8] ; CHECK-NEXT: vmov.16 q5[0], r2 ; CHECK-NEXT: vmov.u8 r2, q4[9] @@ -2062,104 +2253,121 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b ; CHECK-NEXT: vmov.16 q5[6], r2 ; CHECK-NEXT: vmov.u8 r2, q4[15] ; CHECK-NEXT: vmov.16 q5[7], r2 +; CHECK-NEXT: adc.w lr, lr, r4 ; CHECK-NEXT: vcmp.i16 ne, q5, zr ; CHECK-NEXT: vpsel q2, q3, q2 ; CHECK-NEXT: vmov.u16 r2, q2[0] -; CHECK-NEXT: vmov.u16 r4, q2[2] -; CHECK-NEXT: vmov q3[2], q3[0], r4, r2 +; CHECK-NEXT: vmov.32 q3[0], r2 ; CHECK-NEXT: vmov.u16 r2, q2[1] -; CHECK-NEXT: vmov.u16 r4, q2[3] -; CHECK-NEXT: vmov q3[3], q3[1], r4, r2 +; CHECK-NEXT: vmov.32 q3[1], r2 +; CHECK-NEXT: vmov.u16 r2, q2[2] +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vmov.u16 r2, q2[3] +; CHECK-NEXT: vmov.32 q3[3], r2 ; CHECK-NEXT: vcmp.i32 ne, q3, zr ; CHECK-NEXT: vmrs r2, p0 ; CHECK-NEXT: and r4, r2, #1 -; CHECK-NEXT: ubfx r3, r2, #4, #1 ; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: vmov q3[2], q3[0], r3, r4 -; CHECK-NEXT: vmov q3[3], q3[1], r3, r4 -; CHECK-NEXT: vmov.u8 r3, q0[8] +; CHECK-NEXT: vmov.32 q3[0], r4 +; CHECK-NEXT: vmov.32 q3[1], r4 +; CHECK-NEXT: ubfx r4, r2, #4, #1 +; CHECK-NEXT: rsbs r4, r4, #0 +; CHECK-NEXT: vmov.32 q3[2], r4 +; CHECK-NEXT: vmov.32 q3[3], r4 +; CHECK-NEXT: vmov.u8 r4, q0[8] +; CHECK-NEXT: vmov.32 q4[0], r4 ; CHECK-NEXT: vmov.u8 r4, q0[9] -; CHECK-NEXT: vmov q4[2], q4[0], r4, r3 +; CHECK-NEXT: vmov.32 q4[2], r4 ; CHECK-NEXT: vand q4, q4, q1 ; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r4, s12 -; 
CHECK-NEXT: vmov r3, s13 -; CHECK-NEXT: adds.w r5, r12, r4 +; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: vmov r4, s13 +; CHECK-NEXT: adds.w r12, r12, r3 +; CHECK-NEXT: vmov r3, s15 +; CHECK-NEXT: adc.w lr, lr, r4 ; CHECK-NEXT: vmov r4, s14 +; CHECK-NEXT: adds.w r4, r4, r12 ; CHECK-NEXT: adc.w r12, lr, r3 -; CHECK-NEXT: vmov r3, s15 -; CHECK-NEXT: adds r5, r5, r4 -; CHECK-NEXT: ubfx r4, r2, #8, #1 +; CHECK-NEXT: ubfx r3, r2, #8, #1 +; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: ubfx r2, r2, #12, #1 -; CHECK-NEXT: rsb.w r4, r4, #0 -; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: adc.w r3, r3, r12 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r4 -; CHECK-NEXT: vmov q3[3], q3[1], r2, r4 +; CHECK-NEXT: vmov.32 q3[0], r3 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: vmov.32 q3[1], r3 +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vmov.32 q3[3], r2 ; CHECK-NEXT: vmov.u8 r2, q0[10] -; CHECK-NEXT: vmov.u8 r4, q0[11] -; CHECK-NEXT: vmov q4[2], q4[0], r4, r2 +; CHECK-NEXT: vmov.32 q4[0], r2 +; CHECK-NEXT: vmov.u8 r2, q0[11] +; CHECK-NEXT: vmov.32 q4[2], r2 ; CHECK-NEXT: vand q4, q4, q1 ; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: vmov r3, s12 ; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: adds r5, r5, r4 -; CHECK-NEXT: vmov r4, s14 -; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: vmov r3, s15 -; CHECK-NEXT: adds r5, r5, r4 -; CHECK-NEXT: vmov.u16 r4, q2[6] -; CHECK-NEXT: adc.w r12, r2, r3 -; CHECK-NEXT: vmov.u16 r3, q2[4] -; CHECK-NEXT: vmov q3[2], q3[0], r4, r3 -; CHECK-NEXT: vmov.u16 r3, q2[5] -; CHECK-NEXT: vmov.u16 r4, q2[7] -; CHECK-NEXT: vmov q3[3], q3[1], r4, r3 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: vmov r4, s15 +; CHECK-NEXT: adc.w lr, r12, r2 +; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: adds.w r12, r3, r2 +; CHECK-NEXT: vmov.u16 r2, q2[4] +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: vmov.u16 r2, q2[5] +; CHECK-NEXT: vmov.32 q3[1], r2 +; CHECK-NEXT: vmov.u16 r2, q2[6] +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vmov.u16 r2, q2[7] +; CHECK-NEXT: vmov.32 q3[3], r2 +; CHECK-NEXT: adc.w lr, lr, r4 ; CHECK-NEXT: vcmp.i32 ne, q3, zr -; CHECK-NEXT: vmrs r3, p0 -; CHECK-NEXT: and r4, r3, #1 -; CHECK-NEXT: ubfx r2, r3, #4, #1 +; CHECK-NEXT: vmrs r2, p0 +; CHECK-NEXT: and r4, r2, #1 ; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r4 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r4 -; CHECK-NEXT: vmov.u8 r2, q0[12] +; CHECK-NEXT: vmov.32 q2[0], r4 +; CHECK-NEXT: vmov.32 q2[1], r4 +; CHECK-NEXT: ubfx r4, r2, #4, #1 +; CHECK-NEXT: rsbs r4, r4, #0 +; CHECK-NEXT: vmov.32 q2[2], r4 +; CHECK-NEXT: vmov.32 q2[3], r4 +; CHECK-NEXT: vmov.u8 r4, q0[12] +; CHECK-NEXT: vmov.32 q3[0], r4 ; CHECK-NEXT: vmov.u8 r4, q0[13] -; CHECK-NEXT: vmov q3[2], q3[0], r4, r2 +; CHECK-NEXT: vmov.32 q3[2], r4 ; CHECK-NEXT: vand q3, q3, q1 ; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds r5, r5, r4 -; CHECK-NEXT: vmov r4, s11 -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: adds r2, r2, r5 -; CHECK-NEXT: adc.w r5, r12, r4 -; CHECK-NEXT: ubfx r4, r3, #8, #1 -; CHECK-NEXT: ubfx r3, r3, #12, #1 -; CHECK-NEXT: rsbs r4, r4, #0 +; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: vmov r4, s9 +; CHECK-NEXT: adds.w r12, r12, r3 +; CHECK-NEXT: vmov r3, s11 +; CHECK-NEXT: adc.w lr, lr, r4 +; CHECK-NEXT: vmov r4, s10 +; CHECK-NEXT: adds.w r4, r4, r12 +; CHECK-NEXT: adc.w r12, lr, r3 +; CHECK-NEXT: ubfx r3, r2, #8, #1 ; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r3, r4 -; CHECK-NEXT: vmov q2[3], 
q2[1], r3, r4 -; CHECK-NEXT: vmov.u8 r3, q0[14] -; CHECK-NEXT: vmov.u8 r4, q0[15] -; CHECK-NEXT: vmov q0[2], q0[0], r4, r3 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: ubfx r2, r2, #12, #1 +; CHECK-NEXT: vmov.32 q2[0], r3 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.32 q2[3], r2 +; CHECK-NEXT: vmov.u8 r2, q0[14] +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: vmov.u8 r2, q0[15] +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vand q0, q3, q1 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: adcs r3, r5 -; CHECK-NEXT: vmov r5, s3 -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: adcs r3, r5 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: vmov r4, s3 +; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc.w r3, r12, r4 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %c = icmp eq <16 x i8> %b, zeroinitializer %xx = zext <16 x i8> %x to <16 x i64> @@ -2199,239 +2407,279 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %b ; CHECK-NEXT: vcmp.i16 ne, q4, zr ; CHECK-NEXT: vpsel q4, q2, q1 ; CHECK-NEXT: vmov.u16 r2, q4[0] -; CHECK-NEXT: vmov.u16 r3, q4[2] -; CHECK-NEXT: vmov q5[2], q5[0], r3, r2 +; CHECK-NEXT: vmov.32 q5[0], r2 ; CHECK-NEXT: vmov.u16 r2, q4[1] -; CHECK-NEXT: vmov.u16 r3, q4[3] -; CHECK-NEXT: vmov q5[3], q5[1], r3, r2 +; CHECK-NEXT: vmov.32 q5[1], r2 +; CHECK-NEXT: vmov.u16 r2, q4[2] +; CHECK-NEXT: vmov.32 q5[2], r2 +; CHECK-NEXT: vmov.u16 r2, q4[3] +; CHECK-NEXT: vmov.32 q5[3], r2 ; CHECK-NEXT: vcmp.i32 ne, q5, zr -; CHECK-NEXT: vmrs r12, p0 -; CHECK-NEXT: and r3, r12, #1 -; CHECK-NEXT: ubfx r2, r12, #4, #1 +; CHECK-NEXT: vmrs r2, p0 +; CHECK-NEXT: and r3, r2, #1 ; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov q5[2], q5[0], r2, r3 -; CHECK-NEXT: vmov q5[3], q5[1], r2, r3 -; CHECK-NEXT: vmov.u8 r2, q0[0] +; CHECK-NEXT: vmov.32 q5[0], r3 +; CHECK-NEXT: vmov.32 q5[1], r3 +; CHECK-NEXT: ubfx r3, r2, #4, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: vmov.32 q5[2], r3 +; CHECK-NEXT: vmov.32 q5[3], r3 +; CHECK-NEXT: vmov.u8 r3, q0[0] +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: vmov.32 q6[0], r3 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov.32 q6[1], r3 ; CHECK-NEXT: vmov.u8 r3, q0[1] -; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: vmov q6[2], q6[0], r3, r2 -; CHECK-NEXT: asrs r2, r2, #31 +; CHECK-NEXT: vmov.32 q6[2], r3 ; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q6[3], q6[1], r3, r2 +; CHECK-NEXT: vmov.32 q6[3], r3 ; CHECK-NEXT: vand q5, q6, q5 -; CHECK-NEXT: vmov r2, s22 +; CHECK-NEXT: vmov r3, s22 ; CHECK-NEXT: vmov r4, s20 -; CHECK-NEXT: vmov lr, s23 -; CHECK-NEXT: vmov r3, s21 -; CHECK-NEXT: adds r5, r4, r2 -; CHECK-NEXT: ubfx r4, r12, #8, #1 -; CHECK-NEXT: ubfx r2, r12, #12, #1 -; CHECK-NEXT: rsb.w r4, r4, #0 +; CHECK-NEXT: vmov r12, s23 +; CHECK-NEXT: vmov r5, s21 +; CHECK-NEXT: adds.w lr, r4, r3 +; CHECK-NEXT: ubfx r3, r2, #8, #1 +; CHECK-NEXT: rsb.w r3, r3, #0 +; CHECK-NEXT: ubfx r2, r2, #12, #1 +; CHECK-NEXT: vmov.32 q5[0], r3 ; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: adc.w r3, r3, lr -; CHECK-NEXT: vmov q5[2], q5[0], r2, r4 -; CHECK-NEXT: vmov q5[3], q5[1], r2, r4 
+; CHECK-NEXT: vmov.32 q5[1], r3 +; CHECK-NEXT: adc.w r12, r12, r5 +; CHECK-NEXT: vmov.32 q5[2], r2 +; CHECK-NEXT: vmov.32 q5[3], r2 ; CHECK-NEXT: vmov.u8 r2, q0[2] -; CHECK-NEXT: vmov.u8 r4, q0[3] ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r4, r4 -; CHECK-NEXT: vmov q6[2], q6[0], r4, r2 +; CHECK-NEXT: vmov.32 q6[0], r2 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r4, r4, #31 -; CHECK-NEXT: vmov q6[3], q6[1], r4, r2 +; CHECK-NEXT: vmov.32 q6[1], r2 +; CHECK-NEXT: vmov.u8 r2, q0[3] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: vmov.32 q6[2], r2 +; CHECK-NEXT: asrs r2, r2, #31 +; CHECK-NEXT: vmov.32 q6[3], r2 ; CHECK-NEXT: vand q5, q6, q5 -; CHECK-NEXT: vmov r4, s20 +; CHECK-NEXT: vmov r3, s20 ; CHECK-NEXT: vmov r2, s21 -; CHECK-NEXT: adds r5, r5, r4 -; CHECK-NEXT: vmov r4, s23 -; CHECK-NEXT: adcs r3, r2 -; CHECK-NEXT: vmov r2, s22 -; CHECK-NEXT: adds.w r12, r5, r2 -; CHECK-NEXT: vmov.u16 r5, q4[4] -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov.u16 r4, q4[6] -; CHECK-NEXT: vmov q5[2], q5[0], r4, r5 -; CHECK-NEXT: vmov.u16 r5, q4[5] -; CHECK-NEXT: vmov.u16 r4, q4[7] -; CHECK-NEXT: vmov q5[3], q5[1], r4, r5 +; CHECK-NEXT: vmov r4, s22 +; CHECK-NEXT: vmov r5, s23 +; CHECK-NEXT: adds.w r3, r3, lr +; CHECK-NEXT: adc.w r2, r2, r12 +; CHECK-NEXT: adds.w r12, r3, r4 +; CHECK-NEXT: adc.w r3, r2, r5 +; CHECK-NEXT: vmov.u16 r2, q4[4] +; CHECK-NEXT: vmov.32 q5[0], r2 +; CHECK-NEXT: vmov.u16 r2, q4[5] +; CHECK-NEXT: vmov.32 q5[1], r2 +; CHECK-NEXT: vmov.u16 r2, q4[6] +; CHECK-NEXT: vmov.32 q5[2], r2 +; CHECK-NEXT: vmov.u16 r2, q4[7] +; CHECK-NEXT: vmov.32 q5[3], r2 ; CHECK-NEXT: vcmp.i32 ne, q5, zr -; CHECK-NEXT: vmrs r5, p0 -; CHECK-NEXT: and r4, r5, #1 -; CHECK-NEXT: ubfx r2, r5, #4, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov q4[2], q4[0], r2, r4 -; CHECK-NEXT: vmov q4[3], q4[1], r2, r4 -; CHECK-NEXT: vmov.u8 r2, q0[4] -; CHECK-NEXT: vmov.u8 r4, q0[5] -; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r4, r4 -; CHECK-NEXT: vmov q5[2], q5[0], r4, r2 -; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r4, r4, #31 -; CHECK-NEXT: vmov q5[3], q5[1], r4, r2 -; CHECK-NEXT: vand q4, q5, q4 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: vmov r2, s17 -; CHECK-NEXT: adds.w r4, r4, r12 -; CHECK-NEXT: adc.w r12, r3, r2 -; CHECK-NEXT: vmov r2, s18 -; CHECK-NEXT: vmov r3, s19 -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: ubfx r4, r5, #8, #1 -; CHECK-NEXT: ubfx r5, r5, #12, #1 -; CHECK-NEXT: rsb.w r4, r4, #0 -; CHECK-NEXT: rsb.w r5, r5, #0 -; CHECK-NEXT: adc.w r3, r3, r12 -; CHECK-NEXT: vmov q4[2], q4[0], r5, r4 -; CHECK-NEXT: vmov q4[3], q4[1], r5, r4 -; CHECK-NEXT: vmov.u8 r5, q0[6] -; CHECK-NEXT: vmov.u8 r4, q0[7] +; CHECK-NEXT: vmrs r2, p0 +; CHECK-NEXT: and r5, r2, #1 +; CHECK-NEXT: rsbs r5, r5, #0 +; CHECK-NEXT: vmov.32 q4[0], r5 +; CHECK-NEXT: vmov.32 q4[1], r5 +; CHECK-NEXT: ubfx r5, r2, #4, #1 +; CHECK-NEXT: rsbs r5, r5, #0 +; CHECK-NEXT: vmov.32 q4[2], r5 +; CHECK-NEXT: vmov.32 q4[3], r5 +; CHECK-NEXT: vmov.u8 r5, q0[4] +; CHECK-NEXT: sxtb r5, r5 +; CHECK-NEXT: vmov.32 q5[0], r5 +; CHECK-NEXT: asrs r5, r5, #31 +; CHECK-NEXT: vmov.32 q5[1], r5 +; CHECK-NEXT: vmov.u8 r5, q0[5] ; CHECK-NEXT: sxtb r5, r5 -; CHECK-NEXT: sxtb r4, r4 -; CHECK-NEXT: vmov q5[2], q5[0], r4, r5 +; CHECK-NEXT: vmov.32 q5[2], r5 ; CHECK-NEXT: asrs r5, r5, #31 -; CHECK-NEXT: asrs r4, r4, #31 -; CHECK-NEXT: vmov q5[3], q5[1], r4, r5 +; CHECK-NEXT: vmov.32 q5[3], r5 ; CHECK-NEXT: vand q4, q5, q4 ; CHECK-NEXT: vmov r4, s16 ; CHECK-NEXT: vmov r5, s17 -; CHECK-NEXT: adds r2, r2, r4 +; CHECK-NEXT: adds.w 
r12, r12, r4 ; CHECK-NEXT: vmov r4, s18 -; CHECK-NEXT: adcs r3, r5 +; CHECK-NEXT: adcs r5, r3 +; CHECK-NEXT: vmov r3, s19 +; CHECK-NEXT: adds.w r4, r4, r12 +; CHECK-NEXT: adc.w r12, r5, r3 +; CHECK-NEXT: ubfx r3, r2, #8, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: ubfx r2, r2, #12, #1 +; CHECK-NEXT: vmov.32 q4[0], r3 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: vmov.32 q4[1], r3 +; CHECK-NEXT: vmov.32 q4[2], r2 +; CHECK-NEXT: vmov.32 q4[3], r2 +; CHECK-NEXT: vmov.u8 r2, q0[6] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: vmov.32 q5[0], r2 +; CHECK-NEXT: asrs r2, r2, #31 +; CHECK-NEXT: vmov.32 q5[1], r2 +; CHECK-NEXT: vmov.u8 r2, q0[7] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: vmov.32 q5[2], r2 +; CHECK-NEXT: asrs r2, r2, #31 +; CHECK-NEXT: vmov.32 q5[3], r2 +; CHECK-NEXT: vand q4, q5, q4 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: vmov r2, s17 ; CHECK-NEXT: vmov r5, s19 -; CHECK-NEXT: adds.w r12, r2, r4 -; CHECK-NEXT: adcs r3, r5 -; CHECK-NEXT: vmov.u8 r5, q3[8] -; CHECK-NEXT: vmov.16 q4[0], r5 -; CHECK-NEXT: vmov.u8 r5, q3[9] -; CHECK-NEXT: vmov.16 q4[1], r5 -; CHECK-NEXT: vmov.u8 r5, q3[10] -; CHECK-NEXT: vmov.16 q4[2], r5 -; CHECK-NEXT: vmov.u8 r5, q3[11] -; CHECK-NEXT: vmov.16 q4[3], r5 -; CHECK-NEXT: vmov.u8 r5, q3[12] -; CHECK-NEXT: vmov.16 q4[4], r5 -; CHECK-NEXT: vmov.u8 r5, q3[13] -; CHECK-NEXT: vmov.16 q4[5], r5 -; CHECK-NEXT: vmov.u8 r5, q3[14] -; CHECK-NEXT: vmov.16 q4[6], r5 -; CHECK-NEXT: vmov.u8 r5, q3[15] -; CHECK-NEXT: vmov.16 q4[7], r5 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: vmov r4, s18 +; CHECK-NEXT: adc.w r2, r2, r12 +; CHECK-NEXT: adds.w r12, r3, r4 +; CHECK-NEXT: adc.w r3, r2, r5 +; CHECK-NEXT: vmov.u8 r2, q3[8] +; CHECK-NEXT: vmov.16 q4[0], r2 +; CHECK-NEXT: vmov.u8 r2, q3[9] +; CHECK-NEXT: vmov.16 q4[1], r2 +; CHECK-NEXT: vmov.u8 r2, q3[10] +; CHECK-NEXT: vmov.16 q4[2], r2 +; CHECK-NEXT: vmov.u8 r2, q3[11] +; CHECK-NEXT: vmov.16 q4[3], r2 +; CHECK-NEXT: vmov.u8 r2, q3[12] +; CHECK-NEXT: vmov.16 q4[4], r2 +; CHECK-NEXT: vmov.u8 r2, q3[13] +; CHECK-NEXT: vmov.16 q4[5], r2 +; CHECK-NEXT: vmov.u8 r2, q3[14] +; CHECK-NEXT: vmov.16 q4[6], r2 +; CHECK-NEXT: vmov.u8 r2, q3[15] +; CHECK-NEXT: vmov.16 q4[7], r2 ; CHECK-NEXT: vcmp.i16 ne, q4, zr ; CHECK-NEXT: vpsel q1, q2, q1 -; CHECK-NEXT: vmov.u16 r5, q1[0] -; CHECK-NEXT: vmov.u16 r4, q1[2] -; CHECK-NEXT: vmov q2[2], q2[0], r4, r5 -; CHECK-NEXT: vmov.u16 r5, q1[1] -; CHECK-NEXT: vmov.u16 r4, q1[3] -; CHECK-NEXT: vmov q2[3], q2[1], r4, r5 +; CHECK-NEXT: vmov.u16 r2, q1[0] +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.u16 r2, q1[1] +; CHECK-NEXT: vmov.32 q2[1], r2 +; CHECK-NEXT: vmov.u16 r2, q1[2] +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.u16 r2, q1[3] +; CHECK-NEXT: vmov.32 q2[3], r2 ; CHECK-NEXT: vcmp.i32 ne, q2, zr -; CHECK-NEXT: vmrs r5, p0 -; CHECK-NEXT: and r4, r5, #1 -; CHECK-NEXT: ubfx r2, r5, #4, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r4 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r4 -; CHECK-NEXT: vmov.u8 r2, q0[8] -; CHECK-NEXT: vmov.u8 r4, q0[9] -; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r4, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r4, r2 -; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r4, r4, #31 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r2 -; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds.w r4, r4, r12 -; CHECK-NEXT: adc.w r12, r3, r2 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: vmov r3, s11 -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: ubfx r4, r5, #8, #1 -; CHECK-NEXT: ubfx 
r5, r5, #12, #1 -; CHECK-NEXT: rsb.w r4, r4, #0 -; CHECK-NEXT: rsb.w r5, r5, #0 -; CHECK-NEXT: adc.w r3, r3, r12 -; CHECK-NEXT: vmov q2[2], q2[0], r5, r4 -; CHECK-NEXT: vmov q2[3], q2[1], r5, r4 -; CHECK-NEXT: vmov.u8 r5, q0[10] -; CHECK-NEXT: vmov.u8 r4, q0[11] +; CHECK-NEXT: vmrs r2, p0 +; CHECK-NEXT: and r5, r2, #1 +; CHECK-NEXT: rsbs r5, r5, #0 +; CHECK-NEXT: vmov.32 q2[0], r5 +; CHECK-NEXT: vmov.32 q2[1], r5 +; CHECK-NEXT: ubfx r5, r2, #4, #1 +; CHECK-NEXT: rsbs r5, r5, #0 +; CHECK-NEXT: vmov.32 q2[2], r5 +; CHECK-NEXT: vmov.32 q2[3], r5 +; CHECK-NEXT: vmov.u8 r5, q0[8] +; CHECK-NEXT: sxtb r5, r5 +; CHECK-NEXT: vmov.32 q3[0], r5 +; CHECK-NEXT: asrs r5, r5, #31 +; CHECK-NEXT: vmov.32 q3[1], r5 +; CHECK-NEXT: vmov.u8 r5, q0[9] ; CHECK-NEXT: sxtb r5, r5 -; CHECK-NEXT: sxtb r4, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r4, r5 +; CHECK-NEXT: vmov.32 q3[2], r5 ; CHECK-NEXT: asrs r5, r5, #31 -; CHECK-NEXT: asrs r4, r4, #31 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r5 +; CHECK-NEXT: vmov.32 q3[3], r5 ; CHECK-NEXT: vand q2, q3, q2 ; CHECK-NEXT: vmov r4, s8 ; CHECK-NEXT: vmov r5, s9 -; CHECK-NEXT: adds r2, r2, r4 +; CHECK-NEXT: adds.w r12, r12, r4 ; CHECK-NEXT: vmov r4, s10 -; CHECK-NEXT: adcs r3, r5 -; CHECK-NEXT: vmov r5, s11 -; CHECK-NEXT: adds.w r12, r2, r4 -; CHECK-NEXT: vmov.u16 r4, q1[6] -; CHECK-NEXT: adcs r3, r5 -; CHECK-NEXT: vmov.u16 r5, q1[4] -; CHECK-NEXT: vmov q2[2], q2[0], r4, r5 -; CHECK-NEXT: vmov.u16 r5, q1[5] -; CHECK-NEXT: vmov.u16 r4, q1[7] -; CHECK-NEXT: vmov q2[3], q2[1], r4, r5 -; CHECK-NEXT: vcmp.i32 ne, q2, zr -; CHECK-NEXT: vmrs r5, p0 -; CHECK-NEXT: and r4, r5, #1 -; CHECK-NEXT: ubfx r2, r5, #4, #1 -; CHECK-NEXT: rsbs r4, r4, #0 +; CHECK-NEXT: adcs r5, r3 +; CHECK-NEXT: vmov r3, s11 +; CHECK-NEXT: adds.w r4, r4, r12 +; CHECK-NEXT: adc.w r12, r5, r3 +; CHECK-NEXT: ubfx r3, r2, #8, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: ubfx r2, r2, #12, #1 +; CHECK-NEXT: vmov.32 q2[0], r3 ; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov q1[2], q1[0], r2, r4 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r4 -; CHECK-NEXT: vmov.u8 r2, q0[12] -; CHECK-NEXT: vmov.u8 r4, q0[13] +; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.32 q2[3], r2 +; CHECK-NEXT: vmov.u8 r2, q0[10] ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r4, r4 -; CHECK-NEXT: vmov q2[2], q2[0], r4, r2 +; CHECK-NEXT: vmov.32 q3[0], r2 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r4, r4, #31 -; CHECK-NEXT: vmov q2[3], q2[1], r4, r2 +; CHECK-NEXT: vmov.32 q3[1], r2 +; CHECK-NEXT: vmov.u8 r2, q0[11] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: asrs r2, r2, #31 +; CHECK-NEXT: vmov.32 q3[3], r2 +; CHECK-NEXT: vand q2, q3, q2 +; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: vmov r2, s9 +; CHECK-NEXT: vmov r5, s11 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: vmov r4, s10 +; CHECK-NEXT: adc.w r2, r2, r12 +; CHECK-NEXT: adds.w r12, r3, r4 +; CHECK-NEXT: adc.w r3, r2, r5 +; CHECK-NEXT: vmov.u16 r2, q1[4] +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.u16 r2, q1[5] +; CHECK-NEXT: vmov.32 q2[1], r2 +; CHECK-NEXT: vmov.u16 r2, q1[6] +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.u16 r2, q1[7] +; CHECK-NEXT: vmov.32 q2[3], r2 +; CHECK-NEXT: vcmp.i32 ne, q2, zr +; CHECK-NEXT: vmrs r2, p0 +; CHECK-NEXT: and r5, r2, #1 +; CHECK-NEXT: rsbs r5, r5, #0 +; CHECK-NEXT: vmov.32 q1[0], r5 +; CHECK-NEXT: vmov.32 q1[1], r5 +; CHECK-NEXT: ubfx r5, r2, #4, #1 +; CHECK-NEXT: rsbs r5, r5, #0 +; CHECK-NEXT: vmov.32 q1[2], r5 +; CHECK-NEXT: vmov.32 q1[3], r5 +; CHECK-NEXT: vmov.u8 r5, 
q0[12] +; CHECK-NEXT: sxtb r5, r5 +; CHECK-NEXT: vmov.32 q2[0], r5 +; CHECK-NEXT: asrs r5, r5, #31 +; CHECK-NEXT: vmov.32 q2[1], r5 +; CHECK-NEXT: vmov.u8 r5, q0[13] +; CHECK-NEXT: sxtb r5, r5 +; CHECK-NEXT: vmov.32 q2[2], r5 +; CHECK-NEXT: asrs r5, r5, #31 +; CHECK-NEXT: vmov.32 q2[3], r5 ; CHECK-NEXT: vand q1, q2, q1 ; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: adds.w r4, r4, r12 -; CHECK-NEXT: adc.w r12, r3, r2 -; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov r5, s5 +; CHECK-NEXT: adds.w r12, r12, r4 +; CHECK-NEXT: vmov r4, s6 +; CHECK-NEXT: adcs r5, r3 ; CHECK-NEXT: vmov r3, s7 -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: ubfx r4, r5, #8, #1 -; CHECK-NEXT: ubfx r5, r5, #12, #1 -; CHECK-NEXT: rsb.w r4, r4, #0 -; CHECK-NEXT: rsb.w r5, r5, #0 -; CHECK-NEXT: adc.w r3, r3, r12 -; CHECK-NEXT: vmov q1[2], q1[0], r5, r4 -; CHECK-NEXT: vmov q1[3], q1[1], r5, r4 -; CHECK-NEXT: vmov.u8 r5, q0[14] -; CHECK-NEXT: vmov.u8 r4, q0[15] -; CHECK-NEXT: sxtb r5, r5 -; CHECK-NEXT: sxtb r4, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r5 -; CHECK-NEXT: asrs r5, r5, #31 -; CHECK-NEXT: asrs r4, r4, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 -; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov r5, s1 -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: adcs r3, r5 +; CHECK-NEXT: adds.w r4, r4, r12 +; CHECK-NEXT: adc.w r12, r5, r3 +; CHECK-NEXT: ubfx r3, r2, #8, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: ubfx r2, r2, #12, #1 +; CHECK-NEXT: vmov.32 q1[0], r3 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: vmov.32 q1[1], r3 +; CHECK-NEXT: vmov.32 q1[2], r2 +; CHECK-NEXT: vmov.32 q1[3], r2 +; CHECK-NEXT: vmov.u8 r2, q0[14] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: asrs r2, r2, #31 +; CHECK-NEXT: vmov.32 q2[1], r2 +; CHECK-NEXT: vmov.u8 r2, q0[15] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: asrs r2, r2, #31 +; CHECK-NEXT: vmov.32 q2[3], r2 +; CHECK-NEXT: vand q0, q2, q1 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r2, s1 ; CHECK-NEXT: vmov r5, s3 -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: adcs r3, r5 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: vmov r4, s2 +; CHECK-NEXT: adc.w r2, r2, r12 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: adcs r2, r5 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: @@ -2450,18 +2698,20 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, <2 x i8> %b, i6 ; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s6 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: cset r2, eq ; CHECK-NEXT: tst.w r2, #1 ; CHECK-NEXT: csetm r2, ne -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: cset r3, eq -; CHECK-NEXT: tst.w r3, #1 -; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: vmov.32 q3[1], r2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: tst.w r2, #1 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vmov.32 q3[3], r2 +; CHECK-NEXT: vand q0, q0, q3 ; CHECK-NEXT: vmov r2, s3 ; CHECK-NEXT: vmov r3, s1 ; CHECK-NEXT: orr.w r12, r3, r2 @@ -2486,28 +2736,32 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, <2 x i8> %b, i6 ; CHECK-NEXT: 
.save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.i32 q2, #0xff -; CHECK-NEXT: vand q1, q1, q2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: vand q2, q1, q2 +; CHECK-NEXT: vmov r2, s8 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: cset r2, eq ; CHECK-NEXT: tst.w r2, #1 ; CHECK-NEXT: csetm r2, ne -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: cset r3, eq -; CHECK-NEXT: tst.w r3, #1 -; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 +; CHECK-NEXT: vmov.32 q1[0], r2 +; CHECK-NEXT: vmov.32 q1[1], r2 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: tst.w r2, #1 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: vmov.32 q1[2], r2 +; CHECK-NEXT: vmov.32 q1[3], r2 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov.32 q2[0], r2 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov.32 q2[1], r2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: asrs r2, r2, #31 +; CHECK-NEXT: vmov.32 q2[3], r2 +; CHECK-NEXT: vand q0, q2, q1 ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r12, s3 @@ -2533,19 +2787,21 @@ define arm_aapcs_vfpcc i64 @add_v2i64_v2i64_acc(<2 x i64> %x, <2 x i64> %b, i64 ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov r2, s5 ; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r12, s7 ; CHECK-NEXT: orrs r2, r3 ; CHECK-NEXT: vmov r3, s6 ; CHECK-NEXT: cset r2, eq ; CHECK-NEXT: tst.w r2, #1 ; CHECK-NEXT: csetm r2, ne -; CHECK-NEXT: orrs.w r3, r3, r12 -; CHECK-NEXT: cset r3, eq -; CHECK-NEXT: tst.w r3, #1 -; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.32 q2[1], r2 +; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: orrs r2, r3 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: tst.w r2, #1 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.32 q2[3], r2 +; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r12, s3 diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll index 5906fbc..ee15f82 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll @@ -429,17 +429,19 @@ entry: define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) { ; CHECK-LABEL: add_v16i8_v16i64_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov.u8 r0, q1[0] -; CHECK-NEXT: vmov.u8 r1, q1[1] -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 ; CHECK-NEXT: vmov.u8 r1, q0[0] -; CHECK-NEXT: vmov.u8 r2, q0[1] +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov.u8 r0, q1[1] +; CHECK-NEXT: vmov.32 q4[0], r1 +; CHECK-NEXT: vmov.u8 r1, q0[1] +; CHECK-NEXT: vmov.32 q3[2], r0 ; CHECK-NEXT: vmov.i64 q2, #0xff -; CHECK-NEXT: vmov q4[2], q4[0], r2, r1 +; CHECK-NEXT: vmov.32 q4[2], r1 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 ; CHECK-NEXT: vmov r0, s14 @@ 
-447,163 +449,189 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) { ; CHECK-NEXT: vmov r2, s12 ; CHECK-NEXT: vmov r3, s16 ; CHECK-NEXT: umull r12, r1, r1, r0 -; CHECK-NEXT: vmov.u8 r0, q1[3] +; CHECK-NEXT: vmov.u8 r0, q0[2] +; CHECK-NEXT: vmov.32 q4[0], r0 +; CHECK-NEXT: vmov.u8 r0, q0[3] +; CHECK-NEXT: vmov.32 q4[2], r0 ; CHECK-NEXT: umull r2, r3, r3, r2 +; CHECK-NEXT: vand q4, q4, q2 +; CHECK-NEXT: vmov r0, s16 ; CHECK-NEXT: orr.w lr, r3, r1 ; CHECK-NEXT: vmov.u8 r3, q1[2] -; CHECK-NEXT: vmov q3[2], q3[0], r0, r3 -; CHECK-NEXT: vmov.u8 r3, q0[2] -; CHECK-NEXT: vmov.u8 r1, q0[3] +; CHECK-NEXT: vmov.32 q3[0], r3 +; CHECK-NEXT: vmov.u8 r3, q1[3] +; CHECK-NEXT: vmov.32 q3[2], r3 +; CHECK-NEXT: add r2, r12 ; CHECK-NEXT: vand q3, q3, q2 -; CHECK-NEXT: vmov q4[2], q4[0], r1, r3 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: vmov r1, s16 -; CHECK-NEXT: vmov r4, s18 -; CHECK-NEXT: umull r0, r1, r1, r0 -; CHECK-NEXT: umull r3, r4, r4, r3 -; CHECK-NEXT: vmov q3[2], q3[0], r3, r0 -; CHECK-NEXT: vmov.u8 r3, q1[5] -; CHECK-NEXT: vmov q3[3], q3[1], r4, r1 -; CHECK-NEXT: add.w r1, r2, r12 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: adds r1, r1, r2 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: adc.w r0, r0, lr +; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: umull r0, r3, r0, r3 +; CHECK-NEXT: vmov.32 q5[0], r0 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov.32 q5[1], r3 +; CHECK-NEXT: vmov r3, s18 +; CHECK-NEXT: umull r0, r3, r3, r0 +; CHECK-NEXT: vmov.32 q5[2], r0 +; CHECK-NEXT: vmov.32 q5[3], r3 +; CHECK-NEXT: vmov r1, s20 +; CHECK-NEXT: vmov r0, s21 ; CHECK-NEXT: adds r1, r1, r2 +; CHECK-NEXT: adc.w r2, lr, r0 +; CHECK-NEXT: vmov r0, s22 +; CHECK-NEXT: adds.w r12, r1, r0 +; CHECK-NEXT: adc.w r1, r2, r3 ; CHECK-NEXT: vmov.u8 r2, q1[4] -; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 -; CHECK-NEXT: adc.w r12, r0, r4 ; CHECK-NEXT: vmov.u8 r3, q0[4] -; CHECK-NEXT: vmov.u8 r4, q0[5] -; CHECK-NEXT: vmov q4[2], q4[0], r4, r3 +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: vmov.u8 r2, q1[5] +; CHECK-NEXT: vmov.32 q4[0], r3 +; CHECK-NEXT: vmov.u8 r3, q0[5] +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vmov.32 q4[2], r3 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 ; CHECK-NEXT: vmov r2, s12 ; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: vmov r4, s14 -; CHECK-NEXT: vmov r0, s18 ; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: umull r0, r4, r0, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r0, r2 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r3 -; CHECK-NEXT: vmov.u8 r3, q1[7] -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: adds r1, r1, r2 +; CHECK-NEXT: vmov.32 q5[0], r2 ; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds r1, r1, r2 +; CHECK-NEXT: vmov.32 q5[1], r3 +; CHECK-NEXT: vmov r3, s18 +; CHECK-NEXT: umull r2, r3, r3, r2 +; CHECK-NEXT: vmov.32 q5[2], r2 +; CHECK-NEXT: vmov.32 q5[3], r3 +; CHECK-NEXT: vmov r0, s20 +; CHECK-NEXT: vmov r2, s21 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, s22 +; CHECK-NEXT: adds.w r12, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q1[6] -; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 -; CHECK-NEXT: adc.w r12, r0, r4 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.u8 r3, q0[6] -; CHECK-NEXT: vmov.u8 r4, q0[7] -; CHECK-NEXT: vmov q4[2], q4[0], r4, r3 +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: vmov.u8 r2, q1[7] +; CHECK-NEXT: vmov.32 q4[0], r3 +; CHECK-NEXT: vmov.u8 r3, q0[7] +; CHECK-NEXT: vmov.32 q3[2], r2 +; 
CHECK-NEXT: vmov.32 q4[2], r3 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 ; CHECK-NEXT: vmov r2, s12 ; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: vmov r4, s14 -; CHECK-NEXT: vmov r0, s18 ; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: umull r0, r4, r0, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r0, r2 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r3 -; CHECK-NEXT: vmov.u8 r3, q1[9] -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: adds r1, r1, r2 +; CHECK-NEXT: vmov.32 q5[0], r2 ; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds r1, r1, r2 +; CHECK-NEXT: vmov.32 q5[1], r3 +; CHECK-NEXT: vmov r3, s18 +; CHECK-NEXT: umull r2, r3, r3, r2 +; CHECK-NEXT: vmov.32 q5[2], r2 +; CHECK-NEXT: vmov.32 q5[3], r3 +; CHECK-NEXT: vmov r0, s20 +; CHECK-NEXT: vmov r2, s21 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, s22 +; CHECK-NEXT: adds.w r12, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q1[8] -; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 -; CHECK-NEXT: adc.w r12, r0, r4 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.u8 r3, q0[8] -; CHECK-NEXT: vmov.u8 r4, q0[9] -; CHECK-NEXT: vmov q4[2], q4[0], r4, r3 +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: vmov.u8 r2, q1[9] +; CHECK-NEXT: vmov.32 q4[0], r3 +; CHECK-NEXT: vmov.u8 r3, q0[9] +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vmov.32 q4[2], r3 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 ; CHECK-NEXT: vmov r2, s12 ; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: vmov r4, s14 -; CHECK-NEXT: vmov r0, s18 ; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: umull r0, r4, r0, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r0, r2 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r3 -; CHECK-NEXT: vmov.u8 r3, q1[11] -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: adds r1, r1, r2 +; CHECK-NEXT: vmov.32 q5[0], r2 ; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds r1, r1, r2 +; CHECK-NEXT: vmov.32 q5[1], r3 +; CHECK-NEXT: vmov r3, s18 +; CHECK-NEXT: umull r2, r3, r3, r2 +; CHECK-NEXT: vmov.32 q5[2], r2 +; CHECK-NEXT: vmov.32 q5[3], r3 +; CHECK-NEXT: vmov r0, s20 +; CHECK-NEXT: vmov r2, s21 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, s22 +; CHECK-NEXT: adds.w r12, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q1[10] -; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 -; CHECK-NEXT: adc.w r12, r0, r4 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.u8 r3, q0[10] -; CHECK-NEXT: vmov.u8 r4, q0[11] -; CHECK-NEXT: vmov q4[2], q4[0], r4, r3 +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: vmov.u8 r2, q1[11] +; CHECK-NEXT: vmov.32 q4[0], r3 +; CHECK-NEXT: vmov.u8 r3, q0[11] +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vmov.32 q4[2], r3 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 ; CHECK-NEXT: vmov r2, s12 ; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: vmov r4, s14 -; CHECK-NEXT: vmov r0, s18 ; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: umull r0, r4, r0, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r0, r2 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r3 -; CHECK-NEXT: vmov.u8 r3, q1[13] -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: adds r1, r1, r2 +; CHECK-NEXT: vmov.32 q5[0], r2 ; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds r1, r1, r2 +; CHECK-NEXT: vmov.32 q5[1], r3 +; CHECK-NEXT: vmov r3, s18 +; CHECK-NEXT: umull r2, r3, r3, r2 +; CHECK-NEXT: vmov.32 q5[2], r2 +; CHECK-NEXT: vmov.32 q5[3], r3 +; CHECK-NEXT: vmov r0, s20 +; CHECK-NEXT: vmov r2, s21 +; CHECK-NEXT: adds.w r0, r0, r12 +; 
CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, s22 +; CHECK-NEXT: adds.w r12, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q1[12] -; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 -; CHECK-NEXT: adc.w r12, r0, r4 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.u8 r3, q0[12] -; CHECK-NEXT: vmov.u8 r4, q0[13] -; CHECK-NEXT: vmov q4[2], q4[0], r4, r3 +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: vmov.u8 r2, q1[13] +; CHECK-NEXT: vmov.32 q4[0], r3 +; CHECK-NEXT: vmov.u8 r3, q0[13] +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vmov.32 q4[2], r3 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 ; CHECK-NEXT: vmov r2, s12 ; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: vmov r4, s14 -; CHECK-NEXT: vmov r0, s18 ; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: umull r0, r4, r0, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r0, r2 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r3 -; CHECK-NEXT: vmov.u8 r3, q1[15] -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: adds r1, r1, r2 -; CHECK-NEXT: adc.w r2, r12, r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: adds r0, r0, r1 -; CHECK-NEXT: adc.w r1, r2, r4 +; CHECK-NEXT: vmov.32 q5[0], r2 +; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: vmov.32 q5[1], r3 +; CHECK-NEXT: vmov r3, s18 +; CHECK-NEXT: umull r2, r3, r3, r2 +; CHECK-NEXT: vmov.32 q5[2], r2 +; CHECK-NEXT: vmov.32 q5[3], r3 +; CHECK-NEXT: vmov r0, s20 +; CHECK-NEXT: vmov r2, s21 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, s22 +; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q1[14] -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: vmov.u8 r2, q1[15] +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov.32 q3[2], r2 ; CHECK-NEXT: vmov.u8 r3, q0[14] -; CHECK-NEXT: vmov.u8 r4, q0[15] -; CHECK-NEXT: vand q1, q1, q2 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r3 +; CHECK-NEXT: vand q1, q3, q2 +; CHECK-NEXT: vmov.32 q3[0], r3 +; CHECK-NEXT: vmov.u8 r3, q0[15] +; CHECK-NEXT: vmov.32 q3[2], r3 ; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vand q0, q3, q2 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: umlal r0, r1, r3, r2 ; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: umlal r0, r1, r3, r2 -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: pop {r7, pc} entry: %xx = zext <16 x i8> %x to <16 x i64> %yy = zext <16 x i8> %y to <16 x i64> @@ -615,140 +643,152 @@ entry: define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) { ; CHECK-LABEL: add_v16i8_v16i64_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.u8 r0, q1[0] ; CHECK-NEXT: vmov.u8 r1, q0[0] -; CHECK-NEXT: vmov.u8 r2, q1[1] -; CHECK-NEXT: vmov.u8 r3, q0[1] ; CHECK-NEXT: sxtb r0, r0 ; CHECK-NEXT: sxtb r1, r1 -; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r3, r3 ; CHECK-NEXT: smull r0, r1, r1, r0 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r0 -; CHECK-NEXT: vmov q2[3], q2[1], r3, r1 -; CHECK-NEXT: vmov r1, s10 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: adds.w lr, r2, r1 -; CHECK-NEXT: vmov.u8 r2, q1[2] -; CHECK-NEXT: adc.w r12, r0, r3 -; CHECK-NEXT: vmov.u8 r3, q0[2] -; CHECK-NEXT: vmov.u8 r0, q1[3] -; CHECK-NEXT: vmov.u8 r1, q0[3] -; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: vmov.u8 r0, q1[1] +; CHECK-NEXT: vmov.32 q2[1], r1 +; CHECK-NEXT: vmov.u8 r1, q0[1] ; CHECK-NEXT: sxtb r0, r0 ; CHECK-NEXT: sxtb r1, r1 -; 
CHECK-NEXT: smull r2, r3, r3, r2 ; CHECK-NEXT: smull r0, r1, r1, r0 -; CHECK-NEXT: vmov q2[2], q2[0], r0, r2 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r3 -; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov.32 q2[3], r1 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: vmov r3, s8 ; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: adds.w r2, r2, lr -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds.w lr, r2, r3 -; CHECK-NEXT: vmov.u8 r3, q0[4] +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: vmov.u8 r3, q0[2] ; CHECK-NEXT: adc.w r12, r0, r1 -; CHECK-NEXT: vmov.u8 r1, q1[4] -; CHECK-NEXT: vmov.u8 r0, q1[5] -; CHECK-NEXT: vmov.u8 r2, q0[5] +; CHECK-NEXT: vmov.u8 r1, q1[2] ; CHECK-NEXT: sxtb r1, r1 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: sxtb r0, r0 -; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: smull r1, r3, r3, r1 -; CHECK-NEXT: smull r0, r2, r2, r0 -; CHECK-NEXT: vmov q2[2], q2[0], r0, r1 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r3 -; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: adds.w r1, r1, lr -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds.w lr, r1, r3 -; CHECK-NEXT: vmov.u8 r3, q0[6] -; CHECK-NEXT: adc.w r12, r0, r2 +; CHECK-NEXT: vmov.32 q2[0], r1 +; CHECK-NEXT: vmov.u8 r1, q1[3] +; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: vmov.u8 r3, q0[3] +; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smull r1, r3, r3, r1 +; CHECK-NEXT: vmov.32 q2[2], r1 +; CHECK-NEXT: vmov.32 q2[3], r3 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov r1, s9 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: adc.w r1, r1, r12 +; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: vmov.u8 r2, q1[4] +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov.u8 r3, q0[4] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.u8 r2, q1[5] +; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: vmov.u8 r3, q0[5] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.32 q2[3], r3 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov r2, s9 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: adds.w r12, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q1[6] -; CHECK-NEXT: vmov.u8 r0, q1[7] -; CHECK-NEXT: vmov.u8 r1, q0[7] +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov.u8 r3, q0[6] ; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: sxtb r0, r0 -; CHECK-NEXT: sxtb r1, r1 ; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: smull r0, r1, r1, r0 -; CHECK-NEXT: vmov q2[2], q2[0], r0, r2 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r3 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: adds.w r2, r2, lr -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds.w lr, r2, r3 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.u8 r2, q1[7] +; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: vmov.u8 r3, q0[7] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.32 q2[3], r3 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov r2, s9 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: vmov.u8 r2, q1[8] +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.u8 r3, q0[8] -; CHECK-NEXT: adc.w r12, r0, r1 -; CHECK-NEXT: vmov.u8 r1, q1[8] 
-; CHECK-NEXT: vmov.u8 r0, q1[9] -; CHECK-NEXT: vmov.u8 r2, q0[9] -; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.u8 r2, q1[9] +; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: vmov.u8 r3, q0[9] ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: smull r1, r3, r3, r1 -; CHECK-NEXT: smull r0, r2, r2, r0 -; CHECK-NEXT: vmov q2[2], q2[0], r0, r1 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r3 -; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: adds.w r1, r1, lr -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds.w lr, r1, r3 -; CHECK-NEXT: vmov.u8 r3, q0[10] -; CHECK-NEXT: adc.w r12, r0, r2 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.32 q2[3], r3 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov r2, s9 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: adds.w r12, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q1[10] -; CHECK-NEXT: vmov.u8 r0, q1[11] -; CHECK-NEXT: vmov.u8 r1, q0[11] +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov.u8 r3, q0[10] ; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: sxtb r0, r0 -; CHECK-NEXT: sxtb r1, r1 ; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: smull r0, r1, r1, r0 -; CHECK-NEXT: vmov q2[2], q2[0], r0, r2 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r3 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: adds.w r2, r2, lr -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds.w lr, r2, r3 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.u8 r2, q1[11] +; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: vmov.u8 r3, q0[11] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.32 q2[3], r3 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov r2, s9 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: vmov.u8 r2, q1[12] +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.u8 r3, q0[12] -; CHECK-NEXT: adc.w r12, r0, r1 -; CHECK-NEXT: vmov.u8 r1, q1[12] -; CHECK-NEXT: vmov.u8 r0, q1[13] -; CHECK-NEXT: vmov.u8 r2, q0[13] -; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.u8 r2, q1[13] +; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: vmov.u8 r3, q0[13] ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: smull r1, r3, r3, r1 -; CHECK-NEXT: smull r0, r2, r2, r0 -; CHECK-NEXT: vmov q2[2], q2[0], r0, r1 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r3 -; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: adds.w r1, r1, lr -; CHECK-NEXT: adc.w r3, r12, r0 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: adds r0, r0, r1 -; CHECK-NEXT: adc.w r1, r3, r2 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.32 q2[3], r3 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov r2, s9 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q1[14] +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.u8 r3, q0[14] ; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 @@ -758,7 +798,7 @@ define arm_aapcs_vfpcc i64 
@add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) { ; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 ; CHECK-NEXT: smlal r0, r1, r3, r2 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %xx = sext <16 x i8> %x to <16 x i64> %yy = sext <16 x i8> %y to <16 x i64> @@ -1316,183 +1356,211 @@ entry: define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y, i64 %a) { ; CHECK-LABEL: add_v16i8_v16i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov.u8 r2, q1[0] -; CHECK-NEXT: vmov.u8 r3, q1[1] -; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 ; CHECK-NEXT: vmov.u8 r3, q0[0] -; CHECK-NEXT: vmov.u8 r2, q0[1] +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: vmov.u8 r2, q1[1] +; CHECK-NEXT: vmov.32 q4[0], r3 +; CHECK-NEXT: vmov.u8 r3, q0[1] +; CHECK-NEXT: vmov.32 q3[2], r2 ; CHECK-NEXT: vmov.i64 q2, #0xff -; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 +; CHECK-NEXT: vmov.32 q4[2], r3 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r12, s14 -; CHECK-NEXT: vmov r2, s18 -; CHECK-NEXT: vmov.u8 r4, q1[3] +; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: vmov r3, s18 +; CHECK-NEXT: vmov.u8 r4, q0[2] +; CHECK-NEXT: umull r12, lr, r3, r2 ; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: vmov.u8 r5, q0[3] -; CHECK-NEXT: umull r12, lr, r2, r12 ; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmov.32 q4[0], r4 +; CHECK-NEXT: vmov.u8 r4, q0[3] +; CHECK-NEXT: vmov.32 q4[2], r4 +; CHECK-NEXT: vand q4, q4, q2 +; CHECK-NEXT: vmov r4, s16 ; CHECK-NEXT: umull r2, r3, r3, r2 ; CHECK-NEXT: orr.w lr, lr, r3 ; CHECK-NEXT: vmov.u8 r3, q1[2] -; CHECK-NEXT: vmov q3[2], q3[0], r4, r3 -; CHECK-NEXT: vmov.u8 r4, q0[2] -; CHECK-NEXT: vmov q4[2], q4[0], r5, r4 +; CHECK-NEXT: vmov.32 q3[0], r3 +; CHECK-NEXT: vmov.u8 r3, q1[3] +; CHECK-NEXT: vmov.32 q3[2], r3 +; CHECK-NEXT: add r2, r12 ; CHECK-NEXT: vand q3, q3, q2 -; CHECK-NEXT: vand q4, q4, q2 ; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: add r2, r12 -; CHECK-NEXT: vmov r5, s14 -; CHECK-NEXT: vmov r6, s18 ; CHECK-NEXT: umull r3, r4, r4, r3 -; CHECK-NEXT: umull r5, r6, r6, r5 -; CHECK-NEXT: vmov q3[2], q3[0], r5, r3 -; CHECK-NEXT: vmov q3[3], q3[1], r6, r4 -; CHECK-NEXT: vmov.u8 r4, q0[5] -; CHECK-NEXT: vmov r5, s12 -; CHECK-NEXT: vmov r3, s13 -; CHECK-NEXT: adds r2, r2, r5 -; CHECK-NEXT: vmov r5, s14 -; CHECK-NEXT: adc.w r3, r3, lr +; CHECK-NEXT: vmov.32 q5[0], r3 +; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: vmov.32 q5[1], r4 +; CHECK-NEXT: vmov r4, s18 +; CHECK-NEXT: umull r3, r4, r4, r3 +; CHECK-NEXT: vmov.32 q5[2], r3 +; CHECK-NEXT: vmov.32 q5[3], r4 +; CHECK-NEXT: vmov r3, s20 +; CHECK-NEXT: vmov r5, s21 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc.w r3, lr, r5 +; CHECK-NEXT: vmov r5, s22 ; CHECK-NEXT: adds.w r12, r2, r5 +; CHECK-NEXT: vmov.u8 r5, q1[4] +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: vmov.u8 r4, q0[4] +; CHECK-NEXT: vmov.32 q3[0], r5 ; CHECK-NEXT: vmov.u8 r5, q1[5] -; CHECK-NEXT: adcs r3, r6 -; CHECK-NEXT: vmov.u8 r6, q1[4] -; CHECK-NEXT: vmov q3[2], q3[0], r5, r6 -; CHECK-NEXT: vmov.u8 r5, q0[4] -; CHECK-NEXT: vmov q4[2], q4[0], r4, r5 +; CHECK-NEXT: vmov.32 q4[0], r4 +; CHECK-NEXT: vmov.u8 r4, q0[5] +; CHECK-NEXT: vmov.32 q3[2], r5 +; CHECK-NEXT: vmov.32 q4[2], r4 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, 
q4, q2 -; CHECK-NEXT: vmov r6, s12 -; CHECK-NEXT: vmov r5, s16 -; CHECK-NEXT: vmov r4, s14 -; CHECK-NEXT: vmov r2, s18 -; CHECK-NEXT: umull r6, r5, r5, r6 -; CHECK-NEXT: umull r2, r4, r2, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r6 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r5 +; CHECK-NEXT: vmov r5, s12 +; CHECK-NEXT: vmov r4, s16 +; CHECK-NEXT: umull r5, r4, r4, r5 +; CHECK-NEXT: vmov.32 q5[0], r5 +; CHECK-NEXT: vmov r5, s14 +; CHECK-NEXT: vmov.32 q5[1], r4 +; CHECK-NEXT: vmov r4, s18 +; CHECK-NEXT: umull r5, r4, r4, r5 +; CHECK-NEXT: vmov.32 q5[2], r5 +; CHECK-NEXT: vmov.32 q5[3], r4 +; CHECK-NEXT: vmov r2, s20 +; CHECK-NEXT: vmov r5, s21 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adcs r3, r5 +; CHECK-NEXT: vmov r5, s22 +; CHECK-NEXT: adds.w r12, r2, r5 +; CHECK-NEXT: vmov.u8 r5, q1[6] +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: vmov.u8 r4, q0[6] +; CHECK-NEXT: vmov.32 q3[0], r5 ; CHECK-NEXT: vmov.u8 r5, q1[7] -; CHECK-NEXT: vmov r6, s12 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: adds.w r6, r6, r12 -; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: vmov.u8 r6, q1[6] -; CHECK-NEXT: vmov q3[2], q3[0], r5, r6 -; CHECK-NEXT: adc.w r12, r2, r4 -; CHECK-NEXT: vmov.u8 r5, q0[6] +; CHECK-NEXT: vmov.32 q4[0], r4 ; CHECK-NEXT: vmov.u8 r4, q0[7] -; CHECK-NEXT: vmov q4[2], q4[0], r4, r5 +; CHECK-NEXT: vmov.32 q3[2], r5 +; CHECK-NEXT: vmov.32 q4[2], r4 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r6, s12 -; CHECK-NEXT: vmov r5, s16 -; CHECK-NEXT: vmov r4, s14 -; CHECK-NEXT: vmov r2, s18 -; CHECK-NEXT: umull r6, r5, r5, r6 -; CHECK-NEXT: umull r2, r4, r2, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r6 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r5 +; CHECK-NEXT: vmov r5, s12 +; CHECK-NEXT: vmov r4, s16 +; CHECK-NEXT: umull r5, r4, r4, r5 +; CHECK-NEXT: vmov.32 q5[0], r5 +; CHECK-NEXT: vmov r5, s14 +; CHECK-NEXT: vmov.32 q5[1], r4 +; CHECK-NEXT: vmov r4, s18 +; CHECK-NEXT: umull r5, r4, r4, r5 +; CHECK-NEXT: vmov.32 q5[2], r5 +; CHECK-NEXT: vmov.32 q5[3], r4 +; CHECK-NEXT: vmov r2, s20 +; CHECK-NEXT: vmov r5, s21 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adcs r3, r5 +; CHECK-NEXT: vmov r5, s22 +; CHECK-NEXT: adds.w r12, r2, r5 +; CHECK-NEXT: vmov.u8 r5, q1[8] +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: vmov.u8 r4, q0[8] +; CHECK-NEXT: vmov.32 q3[0], r5 ; CHECK-NEXT: vmov.u8 r5, q1[9] -; CHECK-NEXT: vmov r6, s12 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: vmov r6, s14 -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: vmov.u8 r6, q1[8] -; CHECK-NEXT: vmov q3[2], q3[0], r5, r6 -; CHECK-NEXT: adc.w r12, r2, r4 -; CHECK-NEXT: vmov.u8 r5, q0[8] +; CHECK-NEXT: vmov.32 q4[0], r4 ; CHECK-NEXT: vmov.u8 r4, q0[9] -; CHECK-NEXT: vmov q4[2], q4[0], r4, r5 +; CHECK-NEXT: vmov.32 q3[2], r5 +; CHECK-NEXT: vmov.32 q4[2], r4 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r6, s12 -; CHECK-NEXT: vmov r5, s16 -; CHECK-NEXT: vmov r4, s14 -; CHECK-NEXT: vmov r2, s18 -; CHECK-NEXT: umull r6, r5, r5, r6 -; CHECK-NEXT: umull r2, r4, r2, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r6 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r5 +; CHECK-NEXT: vmov r5, s12 +; CHECK-NEXT: vmov r4, s16 +; CHECK-NEXT: umull r5, r4, r4, r5 +; CHECK-NEXT: vmov.32 q5[0], r5 +; CHECK-NEXT: vmov r5, s14 +; CHECK-NEXT: vmov.32 q5[1], r4 +; CHECK-NEXT: vmov r4, s18 +; CHECK-NEXT: umull r5, r4, r4, r5 +; CHECK-NEXT: vmov.32 q5[2], r5 +; CHECK-NEXT: vmov.32 q5[3], r4 +; CHECK-NEXT: vmov r2, s20 
+; CHECK-NEXT: vmov r5, s21 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adcs r3, r5 +; CHECK-NEXT: vmov r5, s22 +; CHECK-NEXT: adds.w r12, r2, r5 +; CHECK-NEXT: vmov.u8 r5, q1[10] +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: vmov.u8 r4, q0[10] +; CHECK-NEXT: vmov.32 q3[0], r5 ; CHECK-NEXT: vmov.u8 r5, q1[11] -; CHECK-NEXT: vmov r6, s12 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: vmov r6, s14 -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: vmov.u8 r6, q1[10] -; CHECK-NEXT: vmov q3[2], q3[0], r5, r6 -; CHECK-NEXT: adc.w r12, r2, r4 -; CHECK-NEXT: vmov.u8 r5, q0[10] +; CHECK-NEXT: vmov.32 q4[0], r4 ; CHECK-NEXT: vmov.u8 r4, q0[11] -; CHECK-NEXT: vmov q4[2], q4[0], r4, r5 +; CHECK-NEXT: vmov.32 q3[2], r5 +; CHECK-NEXT: vmov.32 q4[2], r4 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r6, s12 -; CHECK-NEXT: vmov r5, s16 -; CHECK-NEXT: vmov r4, s14 -; CHECK-NEXT: vmov r2, s18 -; CHECK-NEXT: umull r6, r5, r5, r6 -; CHECK-NEXT: umull r2, r4, r2, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r6 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r5 +; CHECK-NEXT: vmov r5, s12 +; CHECK-NEXT: vmov r4, s16 +; CHECK-NEXT: umull r5, r4, r4, r5 +; CHECK-NEXT: vmov.32 q5[0], r5 +; CHECK-NEXT: vmov r5, s14 +; CHECK-NEXT: vmov.32 q5[1], r4 +; CHECK-NEXT: vmov r4, s18 +; CHECK-NEXT: umull r5, r4, r4, r5 +; CHECK-NEXT: vmov.32 q5[2], r5 +; CHECK-NEXT: vmov.32 q5[3], r4 +; CHECK-NEXT: vmov r2, s20 +; CHECK-NEXT: vmov r5, s21 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adcs r3, r5 +; CHECK-NEXT: vmov r5, s22 +; CHECK-NEXT: adds.w r12, r2, r5 +; CHECK-NEXT: vmov.u8 r5, q1[12] +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: vmov.u8 r4, q0[12] +; CHECK-NEXT: vmov.32 q3[0], r5 ; CHECK-NEXT: vmov.u8 r5, q1[13] -; CHECK-NEXT: vmov r6, s12 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: vmov r6, s14 -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: vmov.u8 r6, q1[12] -; CHECK-NEXT: vmov q3[2], q3[0], r5, r6 -; CHECK-NEXT: adc.w r12, r2, r4 -; CHECK-NEXT: vmov.u8 r5, q0[12] +; CHECK-NEXT: vmov.32 q4[0], r4 ; CHECK-NEXT: vmov.u8 r4, q0[13] -; CHECK-NEXT: vmov q4[2], q4[0], r4, r5 +; CHECK-NEXT: vmov.32 q3[2], r5 +; CHECK-NEXT: vmov.32 q4[2], r4 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r6, s12 -; CHECK-NEXT: vmov r5, s16 -; CHECK-NEXT: vmov r4, s14 -; CHECK-NEXT: vmov r2, s18 -; CHECK-NEXT: umull r6, r5, r5, r6 -; CHECK-NEXT: umull r2, r4, r2, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r6 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r5 +; CHECK-NEXT: vmov r5, s12 +; CHECK-NEXT: vmov r4, s16 +; CHECK-NEXT: umull r5, r4, r4, r5 +; CHECK-NEXT: vmov.32 q5[0], r5 +; CHECK-NEXT: vmov r5, s14 +; CHECK-NEXT: vmov.32 q5[1], r4 +; CHECK-NEXT: vmov r4, s18 +; CHECK-NEXT: umull r5, r4, r4, r5 +; CHECK-NEXT: vmov.32 q5[2], r5 +; CHECK-NEXT: vmov.32 q5[3], r4 +; CHECK-NEXT: vmov r2, s20 +; CHECK-NEXT: vmov r5, s21 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adcs r3, r5 +; CHECK-NEXT: vmov r5, s22 +; CHECK-NEXT: adds r2, r2, r5 +; CHECK-NEXT: vmov.u8 r5, q1[14] +; CHECK-NEXT: vmov.32 q3[0], r5 ; CHECK-NEXT: vmov.u8 r5, q1[15] -; CHECK-NEXT: vmov r6, s12 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: vmov r6, s14 -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: vmov.u8 r6, q1[14] -; CHECK-NEXT: vmov q1[2], q1[0], r5, r6 -; CHECK-NEXT: adcs r2, r4 -; CHECK-NEXT: vmov.u8 r5, q0[14] +; CHECK-NEXT: adcs r3, r4 +; 
CHECK-NEXT: vmov.32 q3[2], r5 +; CHECK-NEXT: vmov.u8 r4, q0[14] +; CHECK-NEXT: vand q1, q3, q2 +; CHECK-NEXT: vmov.32 q3[0], r4 ; CHECK-NEXT: vmov.u8 r4, q0[15] -; CHECK-NEXT: vmov q0[2], q0[0], r4, r5 -; CHECK-NEXT: vand q1, q1, q2 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r6, s4 -; CHECK-NEXT: vmov r5, s0 -; CHECK-NEXT: umlal r3, r2, r5, r6 -; CHECK-NEXT: vmov r6, s6 -; CHECK-NEXT: vmov r5, s2 -; CHECK-NEXT: umlal r3, r2, r5, r6 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: vmov.32 q3[2], r4 +; CHECK-NEXT: vmov r5, s4 +; CHECK-NEXT: vand q0, q3, q2 +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: umlal r2, r3, r4, r5 +; CHECK-NEXT: vmov r5, s6 +; CHECK-NEXT: vmov r4, s2 +; CHECK-NEXT: umlal r2, r3, r4, r5 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %xx = zext <16 x i8> %x to <16 x i64> %yy = zext <16 x i8> %y to <16 x i64> @@ -1505,25 +1573,22 @@ entry: define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y, i64 %a) { ; CHECK-LABEL: add_v16i8_v16i64_acc_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: vmov.u8 r2, q1[0] ; CHECK-NEXT: vmov.u8 r3, q0[0] ; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smull lr, r12, r3, r2 -; CHECK-NEXT: vmov.u8 r3, q1[1] -; CHECK-NEXT: vmov.u8 r2, q0[1] -; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.u8 r2, q1[1] +; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: vmov.u8 r3, q0[1] ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: vmov.u8 r4, q1[3] -; CHECK-NEXT: smull r2, r3, r2, r3 -; CHECK-NEXT: vmov.u8 r5, q0[3] -; CHECK-NEXT: sxtb r4, r4 -; CHECK-NEXT: sxtb r5, r5 -; CHECK-NEXT: vmov q2[2], q2[0], r2, lr -; CHECK-NEXT: smull r4, r5, r5, r4 -; CHECK-NEXT: vmov q2[3], q2[1], r3, r12 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.32 q2[3], r3 ; CHECK-NEXT: vmov lr, s10 ; CHECK-NEXT: vmov r2, s8 ; CHECK-NEXT: vmov r12, s9 @@ -1534,123 +1599,140 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 ; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov q2[2], q2[0], r4, r2 -; CHECK-NEXT: vmov q2[3], q2[1], r5, r3 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vmov r4, s10 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds.w lr, r3, r4 -; CHECK-NEXT: vmov.u8 r4, q0[4] -; CHECK-NEXT: adc.w r12, r2, r5 -; CHECK-NEXT: vmov.u8 r5, q1[4] -; CHECK-NEXT: vmov.u8 r2, q1[5] -; CHECK-NEXT: vmov.u8 r3, q0[5] -; CHECK-NEXT: sxtb r5, r5 -; CHECK-NEXT: sxtb r4, r4 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.u8 r2, q1[3] +; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: vmov.u8 r3, q0[3] ; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smull r5, r4, r4, r5 ; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r5 -; CHECK-NEXT: vmov q2[3], q2[1], r3, r4 +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.32 q2[3], r3 ; CHECK-NEXT: vmov r4, s8 ; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds.w r5, lr, r4 -; CHECK-NEXT: vmov r4, s10 -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds.w lr, r5, r4 -; CHECK-NEXT: 
vmov.u8 r4, q0[6] -; CHECK-NEXT: adc.w r12, r2, r3 -; CHECK-NEXT: vmov.u8 r3, q1[6] -; CHECK-NEXT: vmov.u8 r2, q1[7] -; CHECK-NEXT: vmov.u8 r5, q0[7] -; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: adds.w r4, r4, lr +; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: adds.w lr, r4, r2 +; CHECK-NEXT: vmov.u8 r4, q1[4] +; CHECK-NEXT: vmov.u8 r2, q0[4] ; CHECK-NEXT: sxtb r4, r4 ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r5, r5 -; CHECK-NEXT: smull r3, r4, r4, r3 -; CHECK-NEXT: smull r2, r5, r5, r2 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r5, r4 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: smull r2, r4, r2, r4 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.u8 r2, q1[5] +; CHECK-NEXT: vmov.32 q2[1], r4 +; CHECK-NEXT: vmov.u8 r4, q0[5] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sxtb r4, r4 +; CHECK-NEXT: smull r2, r4, r4, r2 +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.32 q2[3], r4 ; CHECK-NEXT: vmov r3, s8 ; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vmov r4, s10 ; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds.w lr, r3, r4 -; CHECK-NEXT: vmov.u8 r4, q0[8] -; CHECK-NEXT: adc.w r12, r2, r5 -; CHECK-NEXT: vmov.u8 r5, q1[8] -; CHECK-NEXT: vmov.u8 r2, q1[9] -; CHECK-NEXT: vmov.u8 r3, q0[9] -; CHECK-NEXT: sxtb r5, r5 +; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: adds.w lr, r3, r2 +; CHECK-NEXT: vmov.u8 r2, q0[6] +; CHECK-NEXT: adc.w r12, r12, r4 +; CHECK-NEXT: vmov.u8 r4, q1[6] ; CHECK-NEXT: sxtb r4, r4 ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smull r5, r4, r4, r5 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r5 -; CHECK-NEXT: vmov q2[3], q2[1], r3, r4 -; CHECK-NEXT: vmov r4, s8 +; CHECK-NEXT: smull r2, r4, r2, r4 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.u8 r2, q1[7] +; CHECK-NEXT: vmov.32 q2[1], r4 +; CHECK-NEXT: vmov.u8 r4, q0[7] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sxtb r4, r4 +; CHECK-NEXT: smull r2, r4, r4, r2 +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.32 q2[3], r4 +; CHECK-NEXT: vmov r3, s8 ; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds.w r5, lr, r4 -; CHECK-NEXT: vmov r4, s10 -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds.w lr, r5, r4 -; CHECK-NEXT: vmov.u8 r4, q0[10] -; CHECK-NEXT: adc.w r12, r2, r3 -; CHECK-NEXT: vmov.u8 r3, q1[10] -; CHECK-NEXT: vmov.u8 r2, q1[11] -; CHECK-NEXT: vmov.u8 r5, q0[11] -; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: adds.w r3, r3, lr +; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: adds.w lr, r3, r2 +; CHECK-NEXT: vmov.u8 r2, q0[8] +; CHECK-NEXT: adc.w r12, r12, r4 +; CHECK-NEXT: vmov.u8 r4, q1[8] ; CHECK-NEXT: sxtb r4, r4 ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r5, r5 -; CHECK-NEXT: smull r3, r4, r4, r3 -; CHECK-NEXT: smull r2, r5, r5, r2 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r5, r4 +; CHECK-NEXT: smull r2, r4, r2, r4 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.u8 r2, q1[9] +; CHECK-NEXT: vmov.32 q2[1], r4 +; CHECK-NEXT: vmov.u8 r4, q0[9] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sxtb r4, r4 +; CHECK-NEXT: smull r2, r4, r4, r2 +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.32 q2[3], r4 ; CHECK-NEXT: vmov r3, s8 ; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vmov r4, s10 ; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds.w lr, r3, r4 -; CHECK-NEXT: vmov.u8 r4, q0[12] -; CHECK-NEXT: adc.w r12, r2, r5 -; CHECK-NEXT: vmov.u8 r5, q1[12] -; 
CHECK-NEXT: vmov.u8 r2, q1[13] -; CHECK-NEXT: vmov.u8 r3, q0[13] -; CHECK-NEXT: sxtb r5, r5 +; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: adds.w lr, r3, r2 +; CHECK-NEXT: vmov.u8 r2, q0[10] +; CHECK-NEXT: adc.w r12, r12, r4 +; CHECK-NEXT: vmov.u8 r4, q1[10] ; CHECK-NEXT: sxtb r4, r4 ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smull r5, r4, r4, r5 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r5 -; CHECK-NEXT: vmov q2[3], q2[1], r3, r4 -; CHECK-NEXT: vmov r4, s8 +; CHECK-NEXT: smull r2, r4, r2, r4 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.u8 r2, q1[11] +; CHECK-NEXT: vmov.32 q2[1], r4 +; CHECK-NEXT: vmov.u8 r4, q0[11] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sxtb r4, r4 +; CHECK-NEXT: smull r2, r4, r4, r2 +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.32 q2[3], r4 +; CHECK-NEXT: vmov r3, s8 ; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds.w r5, lr, r4 -; CHECK-NEXT: vmov r4, s10 -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds r5, r5, r4 +; CHECK-NEXT: adds.w r3, r3, lr +; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: adds.w lr, r3, r2 +; CHECK-NEXT: vmov.u8 r2, q0[12] +; CHECK-NEXT: adc.w r12, r12, r4 +; CHECK-NEXT: vmov.u8 r4, q1[12] +; CHECK-NEXT: sxtb r4, r4 +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: smull r2, r4, r2, r4 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.u8 r2, q1[13] +; CHECK-NEXT: vmov.32 q2[1], r4 +; CHECK-NEXT: vmov.u8 r4, q0[13] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sxtb r4, r4 +; CHECK-NEXT: smull r2, r4, r4, r2 +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.32 q2[3], r4 +; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: vmov r2, s9 +; CHECK-NEXT: adds.w r3, r3, lr +; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc.w r3, r12, r4 +; CHECK-NEXT: vmov.u8 r4, q1[14] +; CHECK-NEXT: sxtb.w r12, r4 ; CHECK-NEXT: vmov.u8 r4, q0[14] -; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: vmov.u8 r3, q1[14] -; CHECK-NEXT: sxtb r3, r3 ; CHECK-NEXT: sxtb r4, r4 -; CHECK-NEXT: smlal r5, r2, r4, r3 -; CHECK-NEXT: vmov.u8 r3, q1[15] +; CHECK-NEXT: smlal r2, r3, r4, r12 +; CHECK-NEXT: vmov.u8 r4, q1[15] +; CHECK-NEXT: sxtb.w r12, r4 ; CHECK-NEXT: vmov.u8 r4, q0[15] -; CHECK-NEXT: sxtb r3, r3 ; CHECK-NEXT: sxtb r4, r4 -; CHECK-NEXT: smlal r5, r2, r4, r3 -; CHECK-NEXT: adds r0, r0, r5 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: smlal r2, r3, r4, r12 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: pop {r4, pc} entry: %xx = sext <16 x i8> %x to <16 x i64> %yy = sext <16 x i8> %y to <16 x i64> diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll index 74f2b2c..72462bb 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll @@ -52,17 +52,19 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %y, <2 ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov r0, s8 ; CHECK-NEXT: vmullb.u32 q3, q0, q1 -; CHECK-NEXT: vmov r1, s10 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 +; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: vmov.32 q0[1], r0 +; CHECK-NEXT: vmov r0, s10 +; 
CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q0[2], r0 +; CHECK-NEXT: vmov.32 q0[3], r0 ; CHECK-NEXT: vand q0, q3, q0 ; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: vmov r3, s0 @@ -86,17 +88,19 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %y, <2 ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov r0, s8 ; CHECK-NEXT: vmullb.s32 q3, q0, q1 -; CHECK-NEXT: vmov r1, s10 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 +; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: vmov.32 q0[1], r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q0[2], r0 +; CHECK-NEXT: vmov.32 q0[3], r0 ; CHECK-NEXT: vand q0, q3, q0 ; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: vmov r3, s0 @@ -285,37 +289,44 @@ entry: define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y, <2 x i16> %b) { ; CHECK-LABEL: add_v2i16_v2i64_zext: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov.i64 q3, #0xffff ; CHECK-NEXT: vand q1, q1, q3 -; CHECK-NEXT: vand q0, q0, q3 +; CHECK-NEXT: vand q4, q0, q3 ; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov r1, s16 +; CHECK-NEXT: umull r0, r1, r1, r0 +; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.32 q0[1], r1 +; CHECK-NEXT: vmov r1, s18 ; CHECK-NEXT: vand q1, q2, q3 -; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: umull r0, r1, r1, r0 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 +; CHECK-NEXT: vmov.32 q0[2], r0 ; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 -; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vmov.32 q0[3], r1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov r0, s6 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov.32 q2[3], r0 +; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r1, s3 ; CHECK-NEXT: vmov r2, s1 ; CHECK-NEXT: adds r0, r0, r3 ; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i16> %b, zeroinitializer @@ -331,32 +342,36 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %y, <2 ; CHECK-LABEL: add_v2i16_v2i64_sext: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.i32 q3, #0xffff -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vand q2, q2, q3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov r1, s10 -; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vand q3, q2, q3 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: sxth r1, r1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm 
r0, ne +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov r0, s14 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: sxth r3, r3 ; CHECK-NEXT: cset r0, eq -; CHECK-NEXT: smull r2, r3, r3, r2 ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov.32 q2[3], r0 ; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: sxth r0, r0 +; CHECK-NEXT: smull r0, r1, r1, r0 +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.32 q3[1], r1 +; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: sxth r0, r0 ; CHECK-NEXT: sxth r1, r1 ; CHECK-NEXT: smull r0, r1, r1, r0 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 -; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.32 q3[3], r1 +; CHECK-NEXT: vand q0, q3, q2 ; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r1, s3 @@ -590,20 +605,22 @@ entry: define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) { ; CHECK-LABEL: add_v16i8_v16i64_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #40 -; CHECK-NEXT: sub sp, #40 -; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vcmp.i8 eq, q2, zr -; CHECK-NEXT: vmov.i8 q2, #0xff +; CHECK-NEXT: .pad #88 +; CHECK-NEXT: sub sp, #88 +; CHECK-NEXT: vmov q3, q1 +; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill ; CHECK-NEXT: vmov.i8 q0, #0x0 -; CHECK-NEXT: vpsel q5, q2, q0 -; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vmov.i8 q1, #0xff +; CHECK-NEXT: vcmp.i8 eq, q2, zr +; CHECK-NEXT: vstrw.32 q1, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vpsel q5, q1, q0 +; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill ; CHECK-NEXT: vmov.u8 r0, q5[0] -; CHECK-NEXT: vstrw.32 q2, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.i64 q4, #0xff ; CHECK-NEXT: vmov.16 q2[0], r0 ; CHECK-NEXT: vmov.u8 r0, q5[1] ; CHECK-NEXT: vmov.16 q2[1], r0 @@ -619,149 +636,181 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: vmov.16 q2[6], r0 ; CHECK-NEXT: vmov.u8 r0, q5[7] ; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vcmp.i16 ne, q2, zr -; CHECK-NEXT: vmov.u8 r3, q4[1] -; CHECK-NEXT: vpsel q6, q3, q0 +; CHECK-NEXT: vpsel q6, q1, q0 ; CHECK-NEXT: vmov.u16 r0, q6[0] -; CHECK-NEXT: vmov.u16 r1, q6[2] -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 +; CHECK-NEXT: vmov.32 q2[0], r0 ; CHECK-NEXT: vmov.u16 r0, q6[1] -; CHECK-NEXT: vmov.u16 r1, q6[3] -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov.u16 r0, q6[2] +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov.u16 r0, q6[3] +; CHECK-NEXT: vmov.32 q2[3], r0 ; CHECK-NEXT: vcmp.i32 ne, q2, zr -; CHECK-NEXT: vmov.i64 q2, #0xff ; CHECK-NEXT: vmrs r0, p0 +; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vmov.u8 r2, q1[0] ; CHECK-NEXT: and r1, r0, #1 -; CHECK-NEXT: ubfx r2, r0, #4, #1 ; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov q7[2], q7[0], r2, r1 -; CHECK-NEXT: vmov q7[3], q7[1], r2, r1 -; CHECK-NEXT: 
vmov.u8 r1, q1[0] +; CHECK-NEXT: vmov.32 q7[0], r1 +; CHECK-NEXT: vmov.32 q7[1], r1 +; CHECK-NEXT: ubfx r1, r0, #4, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: vmov.32 q7[2], r1 +; CHECK-NEXT: vmov.32 q7[3], r1 +; CHECK-NEXT: vmov.u8 r1, q3[0] +; CHECK-NEXT: vmov.32 q0[0], r1 +; CHECK-NEXT: vmov.u8 r1, q3[1] +; CHECK-NEXT: vmov.32 q0[2], r1 +; CHECK-NEXT: vand q2, q0, q4 +; CHECK-NEXT: vmov.32 q0[0], r2 ; CHECK-NEXT: vmov.u8 r2, q1[1] -; CHECK-NEXT: vmov q0[2], q0[0], r2, r1 -; CHECK-NEXT: vmov.u8 r2, q4[0] -; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vand q3, q3, q2 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: umull r1, r12, r2, r1 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: umull r2, r3, r2, r3 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r1 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r12 +; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: vand q1, q0, q4 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: umull r1, r2, r2, r1 +; CHECK-NEXT: vmov.32 q0[0], r1 +; CHECK-NEXT: vmov r1, s10 +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vldrw.u32 q2, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: umull r1, r2, r2, r1 +; CHECK-NEXT: vmov.32 q0[2], r1 +; CHECK-NEXT: vmov.32 q0[3], r2 ; CHECK-NEXT: vand q0, q0, q7 ; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmov r12, s3 ; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: adds.w lr, r2, r1 -; CHECK-NEXT: vmov.u8 r1, q4[3] -; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: adds r1, r1, r2 +; CHECK-NEXT: adc.w r2, r3, r12 ; CHECK-NEXT: ubfx r3, r0, #8, #1 -; CHECK-NEXT: ubfx r0, r0, #12, #1 ; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: ubfx r0, r0, #12, #1 +; CHECK-NEXT: vmov.32 q7[0], r3 ; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: vmov q7[2], q7[0], r0, r3 -; CHECK-NEXT: vmov q7[3], q7[1], r0, r3 -; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: vmov.u8 r3, q1[3] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r0 -; CHECK-NEXT: vmov.u8 r3, q4[2] -; CHECK-NEXT: vmov q3[2], q3[0], r1, r3 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vand q3, q3, q2 +; CHECK-NEXT: vmov.32 q7[1], r3 +; CHECK-NEXT: vmov.u8 r3, q2[2] +; CHECK-NEXT: vmov.32 q7[2], r0 +; CHECK-NEXT: vmov.32 q1[0], r3 +; CHECK-NEXT: vmov.32 q7[3], r0 +; CHECK-NEXT: vmov.u8 r0, q3[2] +; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: vmov.u8 r0, q3[3] +; CHECK-NEXT: vmov.u8 r3, q2[3] +; CHECK-NEXT: vmov.32 q0[2], r0 +; CHECK-NEXT: vmov.32 q1[2], r3 +; CHECK-NEXT: vand q0, q0, q4 +; CHECK-NEXT: vand q1, q1, q4 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov r1, s12 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: umull r0, r1, r1, r0 -; CHECK-NEXT: umull r2, r3, r2, r3 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 -; CHECK-NEXT: vand q0, q0, q7 -; CHECK-NEXT: vmov q7, q4 -; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: umull r0, r3, r3, r0 +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: umull r0, r3, r3, r0 +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov.32 q2[3], r3 +; CHECK-NEXT: vand q0, q2, q7 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: adds r1, r1, r3 ; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: adds.w r1, r1, lr -; CHECK-NEXT: adc.w r2, r12, r0 +; CHECK-NEXT: adcs r2, r0 ; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: adds.w r12, r1, r0 -; CHECK-NEXT: vmov.u8 r1, q4[5] -; CHECK-NEXT: adc.w lr, 
r2, r3 +; CHECK-NEXT: adc.w r1, r2, r3 ; CHECK-NEXT: vmov.u16 r2, q6[4] -; CHECK-NEXT: vmov.u16 r3, q6[6] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov.32 q0[0], r2 ; CHECK-NEXT: vmov.u16 r2, q6[5] -; CHECK-NEXT: vmov.u16 r3, q6[7] -; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: vmov.u16 r2, q6[6] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: vmov.u16 r2, q6[7] +; CHECK-NEXT: vmov.32 q0[3], r2 ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r3, r2, #1 -; CHECK-NEXT: ubfx r0, r2, #4, #1 +; CHECK-NEXT: vmrs lr, p0 +; CHECK-NEXT: vstrw.32 q3, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q4, [sp] @ 16-byte Spill +; CHECK-NEXT: and r3, lr, #1 ; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: vmov.32 q6[0], r3 +; CHECK-NEXT: vmov.32 q6[1], r3 +; CHECK-NEXT: ubfx r3, lr, #4, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: vmov.32 q6[2], r3 +; CHECK-NEXT: vmov.32 q6[3], r3 +; CHECK-NEXT: vmov.u8 r3, q3[4] +; CHECK-NEXT: vmov.32 q0[0], r3 +; CHECK-NEXT: vmov.u8 r3, q3[5] +; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vmov.32 q0[2], r3 +; CHECK-NEXT: vand q0, q0, q4 +; CHECK-NEXT: vmov.u8 r0, q3[4] +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vmov.u8 r0, q3[5] +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov q7, q3 +; CHECK-NEXT: vand q1, q1, q4 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: umull r0, r3, r0, r3 +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: umull r0, r3, r3, r0 +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov.32 q2[3], r3 +; CHECK-NEXT: vand q0, q2, q6 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: adds.w r3, r3, r12 +; CHECK-NEXT: adcs r1, r0 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: adds r3, r3, r2 +; CHECK-NEXT: vmov.u8 r2, q3[6] +; CHECK-NEXT: adcs r1, r0 +; CHECK-NEXT: ubfx r0, lr, #8, #1 ; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: vmov q6[2], q6[0], r0, r3 -; CHECK-NEXT: vmov q6[3], q6[1], r0, r3 -; CHECK-NEXT: vmov.u8 r0, q1[4] -; CHECK-NEXT: vmov.u8 r3, q1[5] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r0 -; CHECK-NEXT: vmov.u8 r3, q4[4] -; CHECK-NEXT: vmov q3[2], q3[0], r1, r3 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vand q3, q3, q2 +; CHECK-NEXT: vmov.32 q6[0], r0 +; CHECK-NEXT: vmov.32 q6[1], r0 +; CHECK-NEXT: ubfx r0, lr, #12, #1 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: vmov.32 q6[2], r0 +; CHECK-NEXT: vmov.32 q6[3], r0 +; CHECK-NEXT: vmov.u8 r0, q1[6] +; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: vmov.u8 r0, q1[7] +; CHECK-NEXT: vmov.32 q1[0], r2 +; CHECK-NEXT: vmov.u8 r2, q3[7] +; CHECK-NEXT: vmov.32 q0[2], r0 +; CHECK-NEXT: vmov.32 q1[2], r2 +; CHECK-NEXT: vand q0, q0, q4 +; CHECK-NEXT: vand q1, q1, q4 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov r1, s12 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov r4, s14 -; CHECK-NEXT: umull r0, r1, r1, r0 -; CHECK-NEXT: umull r3, r4, r4, r3 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r1 -; CHECK-NEXT: vand q0, q0, q6 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov r4, s3 -; CHECK-NEXT: adds.w r1, r1, r12 -; CHECK-NEXT: adc.w r3, lr, r0 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: umull r0, r2, r2, r0 +; CHECK-NEXT: vmov.32 q2[0], r0 ; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: adds.w r12, r1, r0 -; CHECK-NEXT: adc.w r1, r3, r4 -; 
CHECK-NEXT: ubfx r3, r2, #8, #1 -; CHECK-NEXT: ubfx r2, r2, #12, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov.u8 r4, q4[7] -; CHECK-NEXT: vmov q6[2], q6[0], r2, r3 -; CHECK-NEXT: vmov q6[3], q6[1], r2, r3 -; CHECK-NEXT: vmov.u8 r2, q1[6] -; CHECK-NEXT: vmov.u8 r3, q1[7] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vmov.u8 r3, q4[6] -; CHECK-NEXT: vmov q3[2], q3[0], r4, r3 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vand q3, q3, q2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: umull r0, r4, r0, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r3 -; CHECK-NEXT: vand q0, q0, q6 +; CHECK-NEXT: vmov.32 q2[1], r2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: umull r0, r2, r2, r0 +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov.32 q2[3], r2 +; CHECK-NEXT: vand q0, q2, q6 ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: adds r2, r2, r3 ; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: adds.w r2, r2, r12 ; CHECK-NEXT: adcs r1, r0 ; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: adds.w r12, r2, r0 ; CHECK-NEXT: vmov.u8 r2, q5[8] ; CHECK-NEXT: vmov.16 q6[0], r2 @@ -779,152 +828,182 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: vmov.16 q6[6], r2 ; CHECK-NEXT: vmov.u8 r2, q5[15] ; CHECK-NEXT: vmov.16 q6[7], r2 -; CHECK-NEXT: adc.w lr, r1, r3 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vcmp.i16 ne, q6, zr -; CHECK-NEXT: vmov.u8 r0, q7[9] -; CHECK-NEXT: vpsel q3, q3, q0 +; CHECK-NEXT: vmov.u8 r0, q7[8] +; CHECK-NEXT: vpsel q3, q1, q0 +; CHECK-NEXT: vmov.32 q1[0], r0 ; CHECK-NEXT: vmov.u16 r2, q3[0] -; CHECK-NEXT: vmov.u16 r3, q3[2] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov.u8 r0, q7[9] +; CHECK-NEXT: vmov.32 q0[0], r2 ; CHECK-NEXT: vmov.u16 r2, q3[1] -; CHECK-NEXT: vmov.u16 r3, q3[3] -; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: vmov.u16 r2, q3[2] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: vmov.u16 r2, q3[3] +; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vmov.32 q1[2], r0 ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r3, r2, #1 -; CHECK-NEXT: ubfx r4, r2, #4, #1 +; CHECK-NEXT: vmrs lr, p0 +; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q5, [sp] @ 16-byte Reload +; CHECK-NEXT: vand q1, q1, q5 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: and r3, lr, #1 ; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: vmov q4[2], q4[0], r4, r3 -; CHECK-NEXT: vmov q4[3], q4[1], r4, r3 -; CHECK-NEXT: vmov.u8 r3, q1[8] -; CHECK-NEXT: vmov.u8 r4, q1[9] -; CHECK-NEXT: vmov q0[2], q0[0], r4, r3 -; CHECK-NEXT: vmov.u8 r4, q7[8] -; CHECK-NEXT: vmov q5[2], q5[0], r0, r4 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vand q5, q5, q2 +; CHECK-NEXT: vmov.32 q4[0], r3 +; CHECK-NEXT: vmov.32 q4[1], r3 +; CHECK-NEXT: ubfx r3, lr, #4, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: vmov.32 q4[2], r3 +; CHECK-NEXT: vmov.32 q4[3], r3 +; CHECK-NEXT: vmov.u8 r3, q6[8] +; CHECK-NEXT: vmov.32 q0[0], r3 +; CHECK-NEXT: vmov.u8 r3, q6[9] +; CHECK-NEXT: vmov.32 q0[2], r3 +; CHECK-NEXT: vand q0, q0, q5 ; CHECK-NEXT: vmov r3, s0 -; 
CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: vmov r1, s22 ; CHECK-NEXT: umull r0, r3, r0, r3 -; CHECK-NEXT: umull r1, r4, r1, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r3 -; CHECK-NEXT: vand q0, q0, q4 -; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: umull r0, r3, r3, r0 +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov.32 q2[3], r3 +; CHECK-NEXT: vand q0, q2, q4 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov r4, s3 -; CHECK-NEXT: adds.w r1, r1, r12 -; CHECK-NEXT: adc.w r3, lr, r0 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: adds.w r3, r3, r12 +; CHECK-NEXT: adcs r1, r0 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: adds r3, r3, r2 +; CHECK-NEXT: vmov.u8 r2, q7[10] +; CHECK-NEXT: vmov.32 q1[0], r2 +; CHECK-NEXT: vmov.u8 r2, q7[11] +; CHECK-NEXT: vmov.32 q1[2], r2 +; CHECK-NEXT: vand q1, q1, q5 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: adcs r1, r0 +; CHECK-NEXT: ubfx r0, lr, #8, #1 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: vmov.32 q4[0], r0 +; CHECK-NEXT: vmov.32 q4[1], r0 +; CHECK-NEXT: ubfx r0, lr, #12, #1 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: vmov.32 q4[2], r0 +; CHECK-NEXT: vmov.32 q4[3], r0 +; CHECK-NEXT: vmov.u8 r0, q6[10] +; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: vmov.u8 r0, q6[11] +; CHECK-NEXT: vmov.32 q0[2], r0 +; CHECK-NEXT: vand q0, q0, q5 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: umull r0, r2, r2, r0 +; CHECK-NEXT: vmov.32 q2[0], r0 ; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: adds.w r12, r1, r0 -; CHECK-NEXT: adc.w r1, r3, r4 -; CHECK-NEXT: ubfx r3, r2, #8, #1 -; CHECK-NEXT: ubfx r2, r2, #12, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov.u8 r4, q7[11] -; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 -; CHECK-NEXT: vmov q4[3], q4[1], r2, r3 -; CHECK-NEXT: vmov.u8 r2, q1[10] -; CHECK-NEXT: vmov.u8 r3, q1[11] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vmov.u8 r3, q7[10] -; CHECK-NEXT: vmov q5[2], q5[0], r4, r3 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vand q5, q5, q2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r3, s20 -; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: umull r0, r4, r0, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r3 -; CHECK-NEXT: vand q0, q0, q4 +; CHECK-NEXT: vmov.32 q2[1], r2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: umull r0, r2, r2, r0 +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov.32 q2[3], r2 +; CHECK-NEXT: vand q0, q2, q4 ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: adds r2, r2, r3 ; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: adds.w r2, r2, r12 ; CHECK-NEXT: adcs r1, r0 ; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: adds.w r12, r2, r0 ; CHECK-NEXT: vmov.u16 r2, q3[4] -; CHECK-NEXT: adc.w lr, r1, r3 -; CHECK-NEXT: vmov.u16 r3, q3[6] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov.32 q0[0], r2 ; CHECK-NEXT: vmov.u16 r2, q3[5] -; CHECK-NEXT: vmov.u16 r3, q3[7] -; CHECK-NEXT: vmov.u8 r0, q7[13] -; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: vmov.u16 r2, q3[6] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: vmov.u16 r2, q3[7] +; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r3, r2, #1 -; CHECK-NEXT: ubfx r4, r2, #4, #1 +; CHECK-NEXT: vmov.u8 r0, q7[12] 
+; CHECK-NEXT: vmrs lr, p0 +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vmov.u8 r0, q7[13] +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vand q1, q1, q5 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: and r3, lr, #1 ; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: vmov q3[2], q3[0], r4, r3 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r3 -; CHECK-NEXT: vmov.u8 r3, q1[12] -; CHECK-NEXT: vmov.u8 r4, q1[13] -; CHECK-NEXT: vmov q0[2], q0[0], r4, r3 -; CHECK-NEXT: vmov.u8 r4, q7[12] -; CHECK-NEXT: vmov q4[2], q4[0], r0, r4 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vand q4, q4, q2 +; CHECK-NEXT: vmov.32 q3[0], r3 +; CHECK-NEXT: vmov.32 q3[1], r3 +; CHECK-NEXT: ubfx r3, lr, #4, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: vmov.32 q3[2], r3 +; CHECK-NEXT: vmov.32 q3[3], r3 +; CHECK-NEXT: vmov.u8 r3, q6[12] +; CHECK-NEXT: vmov.32 q0[0], r3 +; CHECK-NEXT: vmov.u8 r3, q6[13] +; CHECK-NEXT: vmov.32 q0[2], r3 +; CHECK-NEXT: vand q0, q0, q5 ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: vmov r1, s18 ; CHECK-NEXT: umull r0, r3, r0, r3 -; CHECK-NEXT: umull r1, r4, r1, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r3 -; CHECK-NEXT: vand q0, q0, q3 -; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: umull r0, r3, r3, r0 +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov.32 q2[3], r3 +; CHECK-NEXT: vand q0, q2, q3 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov r4, s3 -; CHECK-NEXT: adds.w r1, r1, r12 -; CHECK-NEXT: adc.w r3, lr, r0 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: adds.w r3, r3, r12 +; CHECK-NEXT: adcs r1, r0 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: adds r3, r3, r2 +; CHECK-NEXT: vmov.u8 r2, q7[14] +; CHECK-NEXT: vmov.32 q1[0], r2 +; CHECK-NEXT: vmov.u8 r2, q7[15] +; CHECK-NEXT: vmov.32 q1[2], r2 +; CHECK-NEXT: vand q1, q1, q5 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: adcs r1, r0 +; CHECK-NEXT: ubfx r0, lr, #8, #1 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: ubfx r0, lr, #12, #1 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.32 q3[3], r0 +; CHECK-NEXT: vmov.u8 r0, q6[14] +; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: vmov.u8 r0, q6[15] +; CHECK-NEXT: vmov.32 q0[2], r0 +; CHECK-NEXT: vand q0, q0, q5 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: umull r0, r2, r2, r0 +; CHECK-NEXT: vmov.32 q2[0], r0 ; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: adds.w r12, r1, r0 -; CHECK-NEXT: adc.w r1, r3, r4 -; CHECK-NEXT: ubfx r3, r2, #8, #1 -; CHECK-NEXT: ubfx r2, r2, #12, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov.u8 r4, q7[15] -; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 -; CHECK-NEXT: vmov q3[3], q3[1], r2, r3 -; CHECK-NEXT: vmov.u8 r2, q1[14] -; CHECK-NEXT: vmov.u8 r3, q1[15] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vmov.u8 r3, q7[14] -; CHECK-NEXT: vmov q1[2], q1[0], r4, r3 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vand q1, q1, q2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: umull r0, r4, r0, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r3 -; CHECK-NEXT: vand q0, q0, q3 +; CHECK-NEXT: vmov.32 q2[1], r2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: umull r0, r2, r2, r0 
+; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov.32 q2[3], r2 +; CHECK-NEXT: vand q0, q2, q3 ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: adds r2, r2, r3 ; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: adds.w r2, r2, r12 ; CHECK-NEXT: adcs r1, r0 ; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: add sp, #40 +; CHECK-NEXT: add sp, #88 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <16 x i8> %b, zeroinitializer %xx = zext <16 x i8> %x to <16 x i64> @@ -938,18 +1017,15 @@ entry: define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) { ; CHECK-LABEL: add_v16i8_v16i64_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vcmp.i8 eq, q2, zr ; CHECK-NEXT: vmov.i8 q2, #0x0 ; CHECK-NEXT: vmov.i8 q3, #0xff -; CHECK-NEXT: vmov.u8 r3, q1[1] +; CHECK-NEXT: vmov.u8 r2, q0[0] ; CHECK-NEXT: vpsel q4, q3, q2 -; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: vmov.u8 r0, q4[0] -; CHECK-NEXT: vmov.u8 r4, q0[5] ; CHECK-NEXT: vmov.16 q5[0], r0 ; CHECK-NEXT: vmov.u8 r0, q4[1] ; CHECK-NEXT: vmov.16 q5[1], r0 @@ -965,128 +1041,147 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: vmov.16 q5[6], r0 ; CHECK-NEXT: vmov.u8 r0, q4[7] ; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: sxtb r4, r4 ; CHECK-NEXT: vcmp.i16 ne, q5, zr ; CHECK-NEXT: vpsel q5, q3, q2 ; CHECK-NEXT: vmov.u16 r0, q5[0] -; CHECK-NEXT: vmov.u16 r1, q5[2] -; CHECK-NEXT: vmov q6[2], q6[0], r1, r0 +; CHECK-NEXT: vmov.32 q6[0], r0 ; CHECK-NEXT: vmov.u16 r0, q5[1] -; CHECK-NEXT: vmov.u16 r1, q5[3] -; CHECK-NEXT: vmov q6[3], q6[1], r1, r0 +; CHECK-NEXT: vmov.32 q6[1], r0 +; CHECK-NEXT: vmov.u16 r0, q5[2] +; CHECK-NEXT: vmov.32 q6[2], r0 +; CHECK-NEXT: vmov.u16 r0, q5[3] +; CHECK-NEXT: vmov.32 q6[3], r0 ; CHECK-NEXT: vcmp.i32 ne, q6, zr ; CHECK-NEXT: vmrs r0, p0 ; CHECK-NEXT: and r1, r0, #1 -; CHECK-NEXT: ubfx r2, r0, #4, #1 ; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov q6[2], q6[0], r2, r1 -; CHECK-NEXT: vmov q6[3], q6[1], r2, r1 +; CHECK-NEXT: vmov.32 q6[0], r1 +; CHECK-NEXT: vmov.32 q6[1], r1 +; CHECK-NEXT: ubfx r1, r0, #4, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: vmov.32 q6[2], r1 +; CHECK-NEXT: vmov.32 q6[3], r1 ; CHECK-NEXT: vmov.u8 r1, q1[0] -; CHECK-NEXT: vmov.u8 r2, q0[0] ; CHECK-NEXT: sxtb r1, r1 -; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: smull r1, r12, r2, r1 +; CHECK-NEXT: smull r1, r2, r2, r1 +; CHECK-NEXT: vmov.32 q7[0], r1 +; CHECK-NEXT: vmov.u8 r1, q1[1] +; CHECK-NEXT: vmov.32 q7[1], r2 ; CHECK-NEXT: vmov.u8 r2, q0[1] +; CHECK-NEXT: sxtb r1, r1 ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: smull r2, r3, r2, r3 -; CHECK-NEXT: vmov q7[2], q7[0], r2, r1 -; CHECK-NEXT: vmov q7[3], q7[1], r3, r12 +; CHECK-NEXT: smull r1, r2, r2, r1 +; CHECK-NEXT: vmov.32 q7[2], r1 +; CHECK-NEXT: vmov.32 q7[3], r2 ; CHECK-NEXT: vand q6, q7, q6 -; CHECK-NEXT: vmov r3, s26 -; CHECK-NEXT: vmov r1, s24 +; CHECK-NEXT: vmov r1, s26 +; CHECK-NEXT: vmov r2, s24 ; CHECK-NEXT: vmov r12, s27 -; CHECK-NEXT: vmov r2, s25 -; CHECK-NEXT: adds.w lr, r1, r3 +; CHECK-NEXT: vmov r3, s25 +; CHECK-NEXT: adds r1, r1, r2 +; CHECK-NEXT: adc.w r2, r3, r12 ; CHECK-NEXT: ubfx r3, r0, #8, #1 +; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: 
ubfx r0, r0, #12, #1 -; CHECK-NEXT: rsb.w r3, r3, #0 -; CHECK-NEXT: rsb.w r0, r0, #0 -; CHECK-NEXT: vmov.u8 r1, q1[3] -; CHECK-NEXT: vmov q6[2], q6[0], r0, r3 -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov q6[3], q6[1], r0, r3 -; CHECK-NEXT: vmov.u8 r0, q1[2] +; CHECK-NEXT: vmov.32 q6[0], r3 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: vmov.32 q6[1], r3 ; CHECK-NEXT: vmov.u8 r3, q0[2] -; CHECK-NEXT: vmov.u8 r2, q0[3] +; CHECK-NEXT: vmov.32 q6[2], r0 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: vmov.32 q6[3], r0 +; CHECK-NEXT: vmov.u8 r0, q1[2] +; CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: smull r0, r3, r3, r0 +; CHECK-NEXT: vmov.32 q7[0], r0 +; CHECK-NEXT: vmov.u8 r0, q1[3] +; CHECK-NEXT: vmov.32 q7[1], r3 +; CHECK-NEXT: vmov.u8 r3, q0[3] ; CHECK-NEXT: sxtb r0, r0 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: sxtb r1, r1 -; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: smull r0, r3, r3, r0 -; CHECK-NEXT: smull r1, r2, r2, r1 -; CHECK-NEXT: vmov q7[2], q7[0], r1, r0 -; CHECK-NEXT: vmov q7[3], q7[1], r2, r3 +; CHECK-NEXT: vmov.32 q7[2], r0 +; CHECK-NEXT: vmov.32 q7[3], r3 ; CHECK-NEXT: vand q6, q7, q6 -; CHECK-NEXT: vmov r1, s24 +; CHECK-NEXT: vmov r3, s24 ; CHECK-NEXT: vmov r0, s25 +; CHECK-NEXT: adds r1, r1, r3 ; CHECK-NEXT: vmov r3, s27 -; CHECK-NEXT: adds.w r1, r1, lr -; CHECK-NEXT: adc.w r2, r12, r0 +; CHECK-NEXT: adcs r2, r0 ; CHECK-NEXT: vmov r0, s26 ; CHECK-NEXT: adds.w r12, r1, r0 -; CHECK-NEXT: vmov.u8 r1, q1[5] -; CHECK-NEXT: adc.w lr, r2, r3 +; CHECK-NEXT: vmov.u8 r0, q0[4] +; CHECK-NEXT: adc.w r1, r2, r3 ; CHECK-NEXT: vmov.u16 r2, q5[4] -; CHECK-NEXT: vmov.u16 r3, q5[6] -; CHECK-NEXT: sxtb r1, r1 -; CHECK-NEXT: vmov q6[2], q6[0], r3, r2 +; CHECK-NEXT: vmov.32 q6[0], r2 ; CHECK-NEXT: vmov.u16 r2, q5[5] -; CHECK-NEXT: vmov.u16 r3, q5[7] -; CHECK-NEXT: smull r1, r4, r4, r1 -; CHECK-NEXT: vmov q6[3], q6[1], r3, r2 +; CHECK-NEXT: vmov.32 q6[1], r2 +; CHECK-NEXT: vmov.u16 r2, q5[6] +; CHECK-NEXT: vmov.32 q6[2], r2 +; CHECK-NEXT: vmov.u16 r2, q5[7] +; CHECK-NEXT: vmov.32 q6[3], r2 +; CHECK-NEXT: sxtb r0, r0 ; CHECK-NEXT: vcmp.i32 ne, q6, zr ; CHECK-NEXT: vmrs r2, p0 ; CHECK-NEXT: and r3, r2, #1 -; CHECK-NEXT: ubfx r0, r2, #4, #1 ; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: vmov q5[2], q5[0], r0, r3 -; CHECK-NEXT: vmov q5[3], q5[1], r0, r3 -; CHECK-NEXT: vmov.u8 r0, q1[4] -; CHECK-NEXT: vmov.u8 r3, q0[4] +; CHECK-NEXT: vmov.32 q5[0], r3 +; CHECK-NEXT: vmov.32 q5[1], r3 +; CHECK-NEXT: ubfx r3, r2, #4, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: vmov.32 q5[2], r3 +; CHECK-NEXT: vmov.32 q5[3], r3 +; CHECK-NEXT: vmov.u8 r3, q1[4] +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smull r0, r3, r0, r3 +; CHECK-NEXT: vmov.32 q6[0], r0 +; CHECK-NEXT: vmov.u8 r0, q1[5] +; CHECK-NEXT: vmov.32 q6[1], r3 +; CHECK-NEXT: vmov.u8 r3, q0[5] ; CHECK-NEXT: sxtb r0, r0 ; CHECK-NEXT: sxtb r3, r3 ; CHECK-NEXT: smull r0, r3, r3, r0 -; CHECK-NEXT: vmov q6[2], q6[0], r1, r0 -; CHECK-NEXT: vmov q6[3], q6[1], r4, r3 +; CHECK-NEXT: vmov.32 q6[2], r0 +; CHECK-NEXT: vmov.32 q6[3], r3 ; CHECK-NEXT: vand q5, q6, q5 -; CHECK-NEXT: vmov r1, s20 +; CHECK-NEXT: vmov r3, s20 ; CHECK-NEXT: vmov r0, s21 -; CHECK-NEXT: vmov r4, s22 -; CHECK-NEXT: vmov r3, s23 -; CHECK-NEXT: adds.w r1, r1, r12 -; CHECK-NEXT: adc.w r0, r0, lr -; CHECK-NEXT: adds r1, r1, r4 -; CHECK-NEXT: vmov.u8 r4, q1[7] -; CHECK-NEXT: adc.w r12, r0, r3 -; CHECK-NEXT: ubfx r3, r2, #8, #1 -; CHECK-NEXT: ubfx r2, r2, #12, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov.u8 r0, q0[7] -; CHECK-NEXT: vmov q5[2], 
q5[0], r2, r3 -; CHECK-NEXT: sxtb r4, r4 -; CHECK-NEXT: vmov q5[3], q5[1], r2, r3 -; CHECK-NEXT: vmov.u8 r2, q1[6] -; CHECK-NEXT: vmov.u8 r3, q0[6] +; CHECK-NEXT: adds.w r3, r3, r12 +; CHECK-NEXT: adc.w r12, r1, r0 +; CHECK-NEXT: vmov r1, s22 +; CHECK-NEXT: vmov r0, s23 +; CHECK-NEXT: adds r3, r3, r1 +; CHECK-NEXT: adc.w r1, r12, r0 +; CHECK-NEXT: ubfx r0, r2, #8, #1 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: vmov.32 q5[0], r0 +; CHECK-NEXT: vmov.32 q5[1], r0 +; CHECK-NEXT: ubfx r0, r2, #12, #1 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: vmov.u8 r2, q0[6] +; CHECK-NEXT: vmov.32 q5[2], r0 +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: vmov.32 q5[3], r0 +; CHECK-NEXT: vmov.u8 r0, q1[6] +; CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: smull r0, r2, r2, r0 +; CHECK-NEXT: vmov.32 q6[0], r0 +; CHECK-NEXT: vmov.u8 r0, q1[7] +; CHECK-NEXT: vmov.32 q6[1], r2 +; CHECK-NEXT: vmov.u8 r2, q0[7] ; CHECK-NEXT: sxtb r0, r0 ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: smull r0, r4, r0, r4 -; CHECK-NEXT: vmov q6[2], q6[0], r0, r2 -; CHECK-NEXT: vmov q6[3], q6[1], r4, r3 +; CHECK-NEXT: smull r0, r2, r2, r0 +; CHECK-NEXT: vmov.32 q6[2], r0 +; CHECK-NEXT: vmov.32 q6[3], r2 ; CHECK-NEXT: vand q5, q6, q5 ; CHECK-NEXT: vmov r2, s20 ; CHECK-NEXT: vmov r0, s21 +; CHECK-NEXT: adds r2, r2, r3 ; CHECK-NEXT: vmov r3, s23 -; CHECK-NEXT: adds r1, r1, r2 -; CHECK-NEXT: adc.w r2, r12, r0 +; CHECK-NEXT: adcs r1, r0 ; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: adds.w r12, r1, r0 -; CHECK-NEXT: vmov.u8 r0, q1[9] -; CHECK-NEXT: adc.w lr, r2, r3 +; CHECK-NEXT: adds.w r12, r2, r0 ; CHECK-NEXT: vmov.u8 r2, q4[8] ; CHECK-NEXT: vmov.16 q5[0], r2 ; CHECK-NEXT: vmov.u8 r2, q4[9] @@ -1103,133 +1198,155 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: vmov.16 q5[6], r2 ; CHECK-NEXT: vmov.u8 r2, q4[15] ; CHECK-NEXT: vmov.16 q5[7], r2 -; CHECK-NEXT: vmov.u8 r1, q0[9] +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vcmp.i16 ne, q5, zr -; CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: vmov.u8 r0, q0[8] ; CHECK-NEXT: vpsel q2, q3, q2 -; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: sxtb r0, r0 ; CHECK-NEXT: vmov.u16 r2, q2[0] -; CHECK-NEXT: vmov.u16 r3, q2[2] -; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 +; CHECK-NEXT: vmov.32 q3[0], r2 ; CHECK-NEXT: vmov.u16 r2, q2[1] -; CHECK-NEXT: vmov.u16 r3, q2[3] -; CHECK-NEXT: smull r0, r1, r1, r0 -; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 +; CHECK-NEXT: vmov.32 q3[1], r2 +; CHECK-NEXT: vmov.u16 r2, q2[2] +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vmov.u16 r2, q2[3] +; CHECK-NEXT: vmov.32 q3[3], r2 ; CHECK-NEXT: vcmp.i32 ne, q3, zr ; CHECK-NEXT: vmrs r2, p0 ; CHECK-NEXT: and r3, r2, #1 -; CHECK-NEXT: ubfx r4, r2, #4, #1 ; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: vmov q3[2], q3[0], r4, r3 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r3 +; CHECK-NEXT: vmov.32 q3[0], r3 +; CHECK-NEXT: vmov.32 q3[1], r3 +; CHECK-NEXT: ubfx r3, r2, #4, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: vmov.32 q3[2], r3 +; CHECK-NEXT: vmov.32 q3[3], r3 ; CHECK-NEXT: vmov.u8 r3, q1[8] -; CHECK-NEXT: vmov.u8 r4, q0[8] ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: sxtb r4, r4 -; CHECK-NEXT: smull r3, r4, r4, r3 -; CHECK-NEXT: vmov q4[2], q4[0], r0, r3 -; CHECK-NEXT: vmov q4[3], q4[1], r1, r4 +; CHECK-NEXT: smull r0, r3, r0, r3 +; CHECK-NEXT: vmov.32 q4[0], r0 +; CHECK-NEXT: vmov.u8 r0, q1[9] +; CHECK-NEXT: vmov.32 q4[1], r3 +; CHECK-NEXT: vmov.u8 r3, q0[9] +; CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smull r0, 
r3, r3, r0 +; CHECK-NEXT: vmov.32 q4[2], r0 +; CHECK-NEXT: vmov.32 q4[3], r3 ; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r1, s12 +; CHECK-NEXT: vmov r3, s12 ; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: vmov r4, s14 -; CHECK-NEXT: vmov r3, s15 -; CHECK-NEXT: adds.w r1, r1, r12 -; CHECK-NEXT: adc.w r0, r0, lr -; CHECK-NEXT: adds r1, r1, r4 -; CHECK-NEXT: vmov.u8 r4, q1[11] -; CHECK-NEXT: adc.w r12, r0, r3 -; CHECK-NEXT: ubfx r3, r2, #8, #1 -; CHECK-NEXT: ubfx r2, r2, #12, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov.u8 r0, q0[11] -; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 -; CHECK-NEXT: sxtb r4, r4 -; CHECK-NEXT: vmov q3[3], q3[1], r2, r3 -; CHECK-NEXT: vmov.u8 r2, q1[10] -; CHECK-NEXT: vmov.u8 r3, q0[10] +; CHECK-NEXT: adds.w r3, r3, r12 +; CHECK-NEXT: adc.w r12, r1, r0 +; CHECK-NEXT: vmov r1, s14 +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: adds r3, r3, r1 +; CHECK-NEXT: adc.w r1, r12, r0 +; CHECK-NEXT: ubfx r0, r2, #8, #1 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: ubfx r0, r2, #12, #1 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: vmov.u8 r2, q0[10] +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: vmov.32 q3[3], r0 +; CHECK-NEXT: vmov.u8 r0, q1[10] +; CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: smull r0, r2, r2, r0 +; CHECK-NEXT: vmov.32 q4[0], r0 +; CHECK-NEXT: vmov.u8 r0, q1[11] +; CHECK-NEXT: vmov.32 q4[1], r2 +; CHECK-NEXT: vmov.u8 r2, q0[11] ; CHECK-NEXT: sxtb r0, r0 ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: smull r0, r4, r0, r4 -; CHECK-NEXT: vmov q4[2], q4[0], r0, r2 -; CHECK-NEXT: vmov q4[3], q4[1], r4, r3 +; CHECK-NEXT: smull r0, r2, r2, r0 +; CHECK-NEXT: vmov.32 q4[2], r0 +; CHECK-NEXT: vmov.32 q4[3], r2 ; CHECK-NEXT: vand q3, q4, q3 ; CHECK-NEXT: vmov r2, s12 ; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: adds r2, r2, r3 ; CHECK-NEXT: vmov r3, s15 -; CHECK-NEXT: adds r1, r1, r2 -; CHECK-NEXT: adc.w r2, r12, r0 +; CHECK-NEXT: adcs r1, r0 ; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: adds.w r12, r1, r0 -; CHECK-NEXT: vmov.u8 r0, q1[13] -; CHECK-NEXT: adc.w lr, r2, r3 +; CHECK-NEXT: adds.w r12, r2, r0 ; CHECK-NEXT: vmov.u16 r2, q2[4] -; CHECK-NEXT: vmov.u16 r3, q2[6] -; CHECK-NEXT: vmov.u8 r1, q0[13] -; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 +; CHECK-NEXT: vmov.32 q3[0], r2 ; CHECK-NEXT: vmov.u16 r2, q2[5] -; CHECK-NEXT: vmov.u16 r3, q2[7] -; CHECK-NEXT: sxtb r0, r0 -; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 -; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: vmov.32 q3[1], r2 +; CHECK-NEXT: vmov.u16 r2, q2[6] +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vmov.u16 r2, q2[7] +; CHECK-NEXT: vmov.32 q3[3], r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vcmp.i32 ne, q3, zr -; CHECK-NEXT: smull r0, r1, r1, r0 +; CHECK-NEXT: vmov.u8 r0, q0[12] ; CHECK-NEXT: vmrs r2, p0 +; CHECK-NEXT: sxtb r0, r0 ; CHECK-NEXT: and r3, r2, #1 -; CHECK-NEXT: ubfx r4, r2, #4, #1 ; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r4, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r4, r3 +; CHECK-NEXT: vmov.32 q2[0], r3 +; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: ubfx r3, r2, #4, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: vmov.32 q2[2], r3 +; CHECK-NEXT: vmov.32 q2[3], r3 ; CHECK-NEXT: vmov.u8 r3, q1[12] -; CHECK-NEXT: vmov.u8 r4, q0[12] ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: sxtb r4, r4 -; CHECK-NEXT: smull r3, r4, r4, r3 -; CHECK-NEXT: vmov q3[2], q3[0], r0, r3 -; CHECK-NEXT: vmov q3[3], q3[1], r1, 
r4 +; CHECK-NEXT: smull r0, r3, r0, r3 +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov.u8 r0, q1[13] +; CHECK-NEXT: vmov.32 q3[1], r3 +; CHECK-NEXT: vmov.u8 r3, q0[13] +; CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smull r0, r3, r3, r0 +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.32 q3[3], r3 ; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: vmov r3, s8 ; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov r4, s10 -; CHECK-NEXT: vmov r3, s11 -; CHECK-NEXT: adds.w r1, r1, r12 -; CHECK-NEXT: adc.w r0, r0, lr -; CHECK-NEXT: adds r1, r1, r4 -; CHECK-NEXT: vmov.u8 r4, q1[15] -; CHECK-NEXT: adc.w r12, r0, r3 -; CHECK-NEXT: ubfx r3, r2, #8, #1 -; CHECK-NEXT: ubfx r2, r2, #12, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov.u8 r0, q0[15] -; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 -; CHECK-NEXT: sxtb r4, r4 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r3 -; CHECK-NEXT: vmov.u8 r2, q1[14] -; CHECK-NEXT: vmov.u8 r3, q0[14] +; CHECK-NEXT: adds.w r3, r3, r12 +; CHECK-NEXT: adc.w r12, r1, r0 +; CHECK-NEXT: vmov r1, s10 +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: adds r3, r3, r1 +; CHECK-NEXT: adc.w r1, r12, r0 +; CHECK-NEXT: ubfx r0, r2, #8, #1 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: ubfx r0, r2, #12, #1 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: vmov.u8 r2, q0[14] +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: vmov.32 q2[3], r0 +; CHECK-NEXT: vmov.u8 r0, q1[14] +; CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: smull r0, r2, r2, r0 +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov.u8 r0, q1[15] +; CHECK-NEXT: vmov.32 q3[1], r2 +; CHECK-NEXT: vmov.u8 r2, q0[15] ; CHECK-NEXT: sxtb r0, r0 ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: smull r0, r4, r0, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r3 -; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: smull r0, r2, r2, r0 +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.32 q3[3], r2 +; CHECK-NEXT: vand q0, q3, q2 ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: adds r2, r2, r3 ; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: adds r1, r1, r2 -; CHECK-NEXT: adc.w r2, r12, r0 +; CHECK-NEXT: adcs r1, r0 ; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: adds r0, r0, r1 -; CHECK-NEXT: adc.w r1, r2, r3 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: bx lr entry: %c = icmp eq <16 x i8> %b, zeroinitializer %xx = sext <16 x i8> %x to <16 x i64> @@ -1243,37 +1360,44 @@ entry: define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y, <2 x i8> %b) { ; CHECK-LABEL: add_v2i8_v2i64_zext: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov.i64 q3, #0xff ; CHECK-NEXT: vand q1, q1, q3 -; CHECK-NEXT: vand q0, q0, q3 +; CHECK-NEXT: vand q4, q0, q3 ; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov r1, s16 +; CHECK-NEXT: umull r0, r1, r1, r0 +; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.32 q0[1], r1 +; CHECK-NEXT: vmov r1, s18 ; CHECK-NEXT: vand q1, q2, q3 -; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: umull r0, r1, r1, r0 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 +; CHECK-NEXT: vmov.32 q0[2], r0 ; CHECK-NEXT: vmov r0, s4 -; 
CHECK-NEXT: vmov q0[3], q0[1], r3, r1 -; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vmov.32 q0[3], r1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov r0, s6 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov.32 q2[3], r0 +; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r1, s3 ; CHECK-NEXT: vmov r2, s1 ; CHECK-NEXT: adds r0, r0, r3 ; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i8> %b, zeroinitializer @@ -1289,32 +1413,36 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %y, <2 x i ; CHECK-LABEL: add_v2i8_v2i64_sext: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.i32 q3, #0xff -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vand q2, q2, q3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov r1, s10 -; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vand q3, q2, q3 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov r0, s14 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: sxtb r3, r3 ; CHECK-NEXT: cset r0, eq -; CHECK-NEXT: smull r2, r3, r3, r2 ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov.32 q2[3], r0 ; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: smull r0, r1, r1, r0 +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.32 q3[1], r1 +; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: sxtb r0, r0 ; CHECK-NEXT: sxtb r1, r1 ; CHECK-NEXT: smull r0, r1, r1, r0 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 -; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.32 q3[3], r1 +; CHECK-NEXT: vand q0, q3, q2 ; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r1, s3 @@ -1335,38 +1463,40 @@ entry: define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %b) { ; CHECK-LABEL: add_v2i64_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: vmov r0, s4 ; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: umull lr, r12, r1, r0 -; CHECK-NEXT: umull r4, r5, r2, r3 -; CHECK-NEXT: vmov q3[2], q3[0], r4, lr -; CHECK-NEXT: vmov r4, s5 -; CHECK-NEXT: mla r1, r1, r4, r12 -; CHECK-NEXT: vmov r4, s1 -; CHECK-NEXT: mla r0, r4, r0, r1 -; CHECK-NEXT: vmov r1, s7 -; CHECK-NEXT: mla r1, r2, r1, r5 +; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: umull r12, r3, r1, r0 +; CHECK-NEXT: mla r1, r1, r2, r3 +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vmov.32 q3[0], r12 +; CHECK-NEXT: mla r0, r2, r0, r1 +; 
CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: umull r2, r3, r1, r0 +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: mla r1, r1, r2, r3 ; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: mla r1, r2, r3, r1 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 -; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: mla r0, r2, r0, r1 ; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: vmov.32 q3[3], r0 +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: orrs r0, r1 +; CHECK-NEXT: vmov r1, s10 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: vmov.32 q0[1], r0 +; CHECK-NEXT: vmov r0, s11 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s11 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: tst.w r1, #1 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 +; CHECK-NEXT: vmov.32 q0[2], r0 +; CHECK-NEXT: vmov.32 q0[3], r0 ; CHECK-NEXT: vand q0, q3, q0 ; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: vmov r3, s0 @@ -1374,7 +1504,7 @@ define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64 ; CHECK-NEXT: vmov r2, s1 ; CHECK-NEXT: adds r0, r0, r3 ; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i64> %b, zeroinitializer %m = mul <2 x i64> %x, %y @@ -1439,17 +1569,19 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, <2 x i32> %y, ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov r2, s8 ; CHECK-NEXT: vmullb.u32 q3, q0, q1 -; CHECK-NEXT: vmov r3, s10 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: cset r2, eq ; CHECK-NEXT: tst.w r2, #1 ; CHECK-NEXT: csetm r2, ne -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: cset r3, eq -; CHECK-NEXT: tst.w r3, #1 -; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: tst.w r2, #1 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: vmov.32 q0[3], r2 ; CHECK-NEXT: vand q0, q3, q0 ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmov r3, s0 @@ -1478,17 +1610,19 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, <2 x i32> %y, ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov r2, s8 ; CHECK-NEXT: vmullb.s32 q3, q0, q1 -; CHECK-NEXT: vmov r3, s10 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: cset r2, eq ; CHECK-NEXT: tst.w r2, #1 ; CHECK-NEXT: csetm r2, ne -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: cset r3, eq -; CHECK-NEXT: tst.w r3, #1 -; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: tst.w r2, #1 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: vmov.32 q0[3], r2 ; CHECK-NEXT: vand q0, q3, q0 ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmov r3, s0 @@ -1692,31 +1826,37 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, <2 x i16> %y, ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov.i64 q3, #0xffff ; CHECK-NEXT: 
vand q1, q1, q3 -; CHECK-NEXT: vand q0, q0, q3 +; CHECK-NEXT: vand q4, q0, q3 ; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: umull lr, r12, r3, r2 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: umull r2, r3, r3, r2 +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: vmov r3, s18 ; CHECK-NEXT: vand q1, q2, q3 -; CHECK-NEXT: umull r2, r3, r2, r3 -; CHECK-NEXT: vmov q0[2], q0[0], r2, lr +; CHECK-NEXT: umull r2, r3, r3, r2 +; CHECK-NEXT: vmov.32 q0[2], r2 ; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r12 -; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: vmov.32 q0[3], r3 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: tst.w r2, #1 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.32 q2[1], r2 +; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: cset r2, eq ; CHECK-NEXT: tst.w r2, #1 ; CHECK-NEXT: csetm r2, ne -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: cset r3, eq -; CHECK-NEXT: tst.w r3, #1 -; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.32 q2[3], r2 +; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r12, s3 @@ -1725,6 +1865,7 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, <2 x i16> %y, ; CHECK-NEXT: adc.w r3, lr, r12 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <2 x i16> %b, zeroinitializer @@ -1743,32 +1884,36 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, <2 x i16> %y, ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.i32 q3, #0xffff -; CHECK-NEXT: vand q2, q2, q3 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vand q3, q2, q3 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: sxth r3, r3 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: tst.w r2, #1 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.32 q2[1], r2 +; CHECK-NEXT: vmov r2, s14 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: cset r2, eq ; CHECK-NEXT: tst.w r2, #1 ; CHECK-NEXT: csetm r2, ne -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: cset r3, eq -; CHECK-NEXT: tst.w r3, #1 -; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 -; CHECK-NEXT: vmov q2[3], q2[1], r3, r2 +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.32 q2[3], r2 ; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: sxth r2, r2 -; CHECK-NEXT: sxth r3, r3 -; CHECK-NEXT: smull lr, r12, r3, r2 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: sxth r3, r3 +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov.32 q3[1], r3 +; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: sxth r2, r2 -; CHECK-NEXT: smull r2, r3, r2, r3 -; CHECK-NEXT: vmov q0[2], q0[0], r2, lr -; CHECK-NEXT: vmov q0[3], q0[1], r3, r12 -; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: sxth r3, r3 +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vmov.32 q3[3], r3 +; CHECK-NEXT: vand q0, q3, q2 ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r12, s3 @@ -2021,16 +2166,18 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x 
i8> %x, <16 x i8> %y ; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #32 -; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vcmp.i8 eq, q2, zr -; CHECK-NEXT: vmov.i8 q2, #0xff +; CHECK-NEXT: .pad #80 +; CHECK-NEXT: sub sp, #80 +; CHECK-NEXT: vmov q3, q1 +; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill ; CHECK-NEXT: vmov.i8 q0, #0x0 -; CHECK-NEXT: vpsel q5, q2, q0 -; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vmov.i8 q1, #0xff +; CHECK-NEXT: vcmp.i8 eq, q2, zr +; CHECK-NEXT: vstrw.32 q1, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vpsel q5, q1, q0 +; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill ; CHECK-NEXT: vmov.u8 r2, q5[0] -; CHECK-NEXT: vstrw.32 q2, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.i64 q4, #0xff ; CHECK-NEXT: vmov.16 q2[0], r2 ; CHECK-NEXT: vmov.u8 r2, q5[1] ; CHECK-NEXT: vmov.16 q2[1], r2 @@ -2046,152 +2193,184 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: vmov.16 q2[6], r2 ; CHECK-NEXT: vmov.u8 r2, q5[7] ; CHECK-NEXT: vmov.16 q2[7], r2 -; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vcmp.i16 ne, q2, zr -; CHECK-NEXT: vpsel q6, q3, q0 +; CHECK-NEXT: vpsel q6, q1, q0 ; CHECK-NEXT: vmov.u16 r2, q6[0] -; CHECK-NEXT: vmov.u16 r3, q6[2] -; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 +; CHECK-NEXT: vmov.32 q2[0], r2 ; CHECK-NEXT: vmov.u16 r2, q6[1] -; CHECK-NEXT: vmov.u16 r3, q6[3] -; CHECK-NEXT: vmov q2[3], q2[1], r3, r2 +; CHECK-NEXT: vmov.32 q2[1], r2 +; CHECK-NEXT: vmov.u16 r2, q6[2] +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.u16 r2, q6[3] +; CHECK-NEXT: vmov.32 q2[3], r2 ; CHECK-NEXT: vcmp.i32 ne, q2, zr -; CHECK-NEXT: vmov.i64 q2, #0xff ; CHECK-NEXT: vmrs lr, p0 +; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload ; CHECK-NEXT: and r3, lr, #1 -; CHECK-NEXT: ubfx r2, lr, #4, #1 ; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov q7[2], q7[0], r2, r3 -; CHECK-NEXT: vmov q7[3], q7[1], r2, r3 -; CHECK-NEXT: vmov.u8 r2, q1[0] +; CHECK-NEXT: vmov.32 q7[0], r3 +; CHECK-NEXT: vmov.32 q7[1], r3 +; CHECK-NEXT: ubfx r3, lr, #4, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: vmov.32 q7[2], r3 +; CHECK-NEXT: vmov.32 q7[3], r3 +; CHECK-NEXT: vmov.u8 r3, q3[0] +; CHECK-NEXT: vmov.32 q0[0], r3 +; CHECK-NEXT: vmov.u8 r3, q3[1] +; CHECK-NEXT: vmov.32 q0[2], r3 +; CHECK-NEXT: vmov.u8 r3, q1[0] +; CHECK-NEXT: vand q2, q0, q4 +; CHECK-NEXT: vmov.32 q0[0], r3 ; CHECK-NEXT: vmov.u8 r3, q1[1] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vmov.u8 r3, q4[0] -; CHECK-NEXT: vmov.u8 r2, q4[1] -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 -; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: vand q3, q3, q2 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r4, s14 -; CHECK-NEXT: umull r2, r12, r2, r12 -; CHECK-NEXT: umull r3, r4, r4, r3 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r12 +; CHECK-NEXT: vmov r12, s8 +; CHECK-NEXT: vmov.32 q0[2], r3 +; CHECK-NEXT: vand q1, q0, q4 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: umull r3, r2, r3, r12 +; CHECK-NEXT: vmov.32 q0[0], r3 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: vldrw.u32 q2, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: umull r2, r3, r3, r2 +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: vmov.32 q0[3], r3 ; CHECK-NEXT: vand q0, q0, q7 -; CHECK-NEXT: vmov r3, s2 +; 
CHECK-NEXT: vmov r4, s2 ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmov r12, s3 -; CHECK-NEXT: vmov r4, s1 -; CHECK-NEXT: adds r6, r2, r3 +; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: adds r4, r4, r2 ; CHECK-NEXT: ubfx r2, lr, #8, #1 ; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: vmov.u8 r3, q4[3] -; CHECK-NEXT: adc.w r12, r12, r4 -; CHECK-NEXT: ubfx r4, lr, #12, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: vmov q7[2], q7[0], r4, r2 -; CHECK-NEXT: vmov q7[3], q7[1], r4, r2 -; CHECK-NEXT: vmov.u8 r2, q1[2] -; CHECK-NEXT: vmov.u8 r4, q1[3] -; CHECK-NEXT: vmov q0[2], q0[0], r4, r2 -; CHECK-NEXT: vmov.u8 r4, q4[2] -; CHECK-NEXT: vmov q3[2], q3[0], r3, r4 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vand q3, q3, q2 +; CHECK-NEXT: vmov.32 q7[0], r2 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov.32 q7[1], r2 +; CHECK-NEXT: ubfx r2, lr, #12, #1 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: vmov.u8 r3, q2[2] +; CHECK-NEXT: vmov.32 q7[2], r2 +; CHECK-NEXT: vmov.32 q1[0], r3 +; CHECK-NEXT: vmov.32 q7[3], r2 +; CHECK-NEXT: vmov.u8 r2, q3[2] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmov.u8 r2, q3[3] +; CHECK-NEXT: vmov.u8 r3, q2[3] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: vmov.32 q1[2], r3 +; CHECK-NEXT: vand q0, q0, q4 +; CHECK-NEXT: vand q1, q1, q4 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: vmov r5, s14 +; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: umull r5, r4, r5, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r5, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r3 -; CHECK-NEXT: vmov.u8 r4, q4[5] -; CHECK-NEXT: vand q0, q0, q7 -; CHECK-NEXT: vmov q7, q4 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: umull r2, r3, r3, r2 +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.32 q2[3], r3 +; CHECK-NEXT: vand q0, q2, q7 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: vmov r5, s2 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: vmov r6, s3 -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds.w r12, r3, r5 -; CHECK-NEXT: adc.w lr, r2, r6 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: vmov r4, s3 +; CHECK-NEXT: adc.w lr, r12, r2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: adds.w r12, r3, r2 ; CHECK-NEXT: vmov.u16 r2, q6[4] -; CHECK-NEXT: vmov.u16 r6, q6[6] -; CHECK-NEXT: vmov q0[2], q0[0], r6, r2 +; CHECK-NEXT: vmov.32 q0[0], r2 ; CHECK-NEXT: vmov.u16 r2, q6[5] -; CHECK-NEXT: vmov.u16 r6, q6[7] -; CHECK-NEXT: vmov q0[3], q0[1], r6, r2 +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: vmov.u16 r2, q6[6] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: vmov.u16 r2, q6[7] +; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: adc.w lr, lr, r4 ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r6, r2, #1 -; CHECK-NEXT: ubfx r5, r2, #4, #1 -; CHECK-NEXT: rsbs r6, r6, #0 -; CHECK-NEXT: rsbs r5, r5, #0 -; CHECK-NEXT: vmov q6[2], q6[0], r5, r6 -; CHECK-NEXT: vmov q6[3], q6[1], r5, r6 -; CHECK-NEXT: vmov.u8 r6, q1[4] -; CHECK-NEXT: vmov.u8 r5, q1[5] -; CHECK-NEXT: vmov q0[2], q0[0], r5, r6 -; CHECK-NEXT: vmov.u8 r5, q4[4] -; CHECK-NEXT: vmov q3[2], q3[0], r4, r5 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vand q3, q3, q2 -; CHECK-NEXT: vmov r6, s0 -; CHECK-NEXT: vmov r5, s12 -; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: umull r6, r5, r5, r6 +; CHECK-NEXT: vmrs r6, p0 +; CHECK-NEXT: vstrw.32 q3, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q4, [sp] @ 16-byte Spill +; CHECK-NEXT: and 
r4, r6, #1 +; CHECK-NEXT: rsbs r4, r4, #0 +; CHECK-NEXT: vmov.32 q6[0], r4 +; CHECK-NEXT: vmov.32 q6[1], r4 +; CHECK-NEXT: ubfx r4, r6, #4, #1 +; CHECK-NEXT: rsbs r4, r4, #0 +; CHECK-NEXT: vmov.32 q6[2], r4 +; CHECK-NEXT: vmov.32 q6[3], r4 +; CHECK-NEXT: vmov.u8 r4, q3[4] +; CHECK-NEXT: vmov.32 q0[0], r4 +; CHECK-NEXT: vmov.u8 r4, q3[5] +; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vmov.32 q0[2], r4 +; CHECK-NEXT: vand q0, q0, q4 +; CHECK-NEXT: vmov.u8 r3, q3[4] +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: vmov.32 q1[0], r3 +; CHECK-NEXT: vmov.u8 r3, q3[5] +; CHECK-NEXT: vmov.32 q1[2], r3 +; CHECK-NEXT: vmov q7, q3 +; CHECK-NEXT: vand q1, q1, q4 +; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: umull r3, r4, r3, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r6 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 -; CHECK-NEXT: vand q0, q0, q6 +; CHECK-NEXT: vmov.32 q2[0], r3 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov.32 q2[1], r4 +; CHECK-NEXT: vmov r4, s6 +; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: umull r3, r4, r4, r3 +; CHECK-NEXT: vmov.32 q2[2], r3 +; CHECK-NEXT: vmov.32 q2[3], r4 +; CHECK-NEXT: vand q0, q2, q6 ; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: adds.w r6, r12, r4 -; CHECK-NEXT: vmov r4, s3 -; CHECK-NEXT: adc.w r5, lr, r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: ubfx r6, r2, #8, #1 -; CHECK-NEXT: ubfx r2, r2, #12, #1 -; CHECK-NEXT: rsb.w r6, r6, #0 -; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: adc.w r12, r5, r4 -; CHECK-NEXT: vmov q6[2], q6[0], r2, r6 -; CHECK-NEXT: vmov.u8 r5, q4[7] -; CHECK-NEXT: vmov q6[3], q6[1], r2, r6 +; CHECK-NEXT: vmov r5, s2 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: adds.w r4, r4, r12 +; CHECK-NEXT: adc.w r3, r3, lr +; CHECK-NEXT: adds r4, r4, r5 +; CHECK-NEXT: adc.w r12, r3, r2 +; CHECK-NEXT: ubfx r2, r6, #8, #1 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: vmov.u8 r3, q3[6] +; CHECK-NEXT: vmov.32 q6[0], r2 +; CHECK-NEXT: vmov.32 q6[1], r2 +; CHECK-NEXT: ubfx r2, r6, #12, #1 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: vmov.32 q6[2], r2 +; CHECK-NEXT: vmov.32 q6[3], r2 ; CHECK-NEXT: vmov.u8 r2, q1[6] -; CHECK-NEXT: vmov.u8 r6, q1[7] -; CHECK-NEXT: vmov q0[2], q0[0], r6, r2 -; CHECK-NEXT: vmov.u8 r6, q4[6] -; CHECK-NEXT: vmov q3[2], q3[0], r5, r6 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vand q3, q3, q2 +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmov.u8 r2, q1[7] +; CHECK-NEXT: vmov.32 q1[0], r3 +; CHECK-NEXT: vmov.u8 r3, q3[7] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: vmov.32 q1[2], r3 +; CHECK-NEXT: vand q0, q0, q4 +; CHECK-NEXT: vand q1, q1, q4 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r6, s12 -; CHECK-NEXT: vmov r4, s14 -; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov r5, s2 -; CHECK-NEXT: umull r2, r6, r6, r2 -; CHECK-NEXT: umull r5, r4, r4, r5 -; CHECK-NEXT: vmov q0[2], q0[0], r5, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r6 -; CHECK-NEXT: vmov.u8 r4, q7[9] -; CHECK-NEXT: vand q0, q0, q6 -; CHECK-NEXT: vmov r6, s0 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: umull r2, r3, r3, r2 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: umull r2, r3, r3, r2 +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.32 q2[3], r3 +; CHECK-NEXT: vand q0, q2, q6 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r2, s1 ; CHECK-NEXT: vmov r5, s2 -; CHECK-NEXT: adds r3, r3, r6 ; CHECK-NEXT: vmov r6, s3 +; 
CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: adds r3, r3, r4 ; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: adds.w r12, r3, r5 -; CHECK-NEXT: adc.w lr, r2, r6 +; CHECK-NEXT: vmov.u8 r5, q7[8] +; CHECK-NEXT: adc.w r3, r2, r6 ; CHECK-NEXT: vmov.u8 r2, q5[8] ; CHECK-NEXT: vmov.16 q6[0], r2 ; CHECK-NEXT: vmov.u8 r2, q5[9] @@ -2209,69 +2388,84 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: vmov.u8 r2, q5[15] ; CHECK-NEXT: vmov.16 q6[7], r2 ; CHECK-NEXT: vcmp.i16 ne, q6, zr -; CHECK-NEXT: vpsel q3, q3, q0 +; CHECK-NEXT: vpsel q3, q1, q0 +; CHECK-NEXT: vmov.32 q1[0], r5 ; CHECK-NEXT: vmov.u16 r2, q3[0] -; CHECK-NEXT: vmov.u16 r6, q3[2] -; CHECK-NEXT: vmov q0[2], q0[0], r6, r2 +; CHECK-NEXT: vmov.u8 r5, q7[9] +; CHECK-NEXT: vmov.32 q0[0], r2 ; CHECK-NEXT: vmov.u16 r2, q3[1] -; CHECK-NEXT: vmov.u16 r6, q3[3] -; CHECK-NEXT: vmov q0[3], q0[1], r6, r2 +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: vmov.u16 r2, q3[2] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: vmov.u16 r2, q3[3] +; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vmov.32 q1[2], r5 ; CHECK-NEXT: vcmp.i32 ne, q0, zr ; CHECK-NEXT: vmrs r2, p0 +; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q5, [sp] @ 16-byte Reload +; CHECK-NEXT: vand q1, q1, q5 +; CHECK-NEXT: vmov r5, s4 ; CHECK-NEXT: and r6, r2, #1 -; CHECK-NEXT: ubfx r5, r2, #4, #1 ; CHECK-NEXT: rsbs r6, r6, #0 -; CHECK-NEXT: rsbs r5, r5, #0 -; CHECK-NEXT: vmov q4[2], q4[0], r5, r6 -; CHECK-NEXT: vmov q4[3], q4[1], r5, r6 -; CHECK-NEXT: vmov.u8 r6, q1[8] -; CHECK-NEXT: vmov.u8 r5, q1[9] -; CHECK-NEXT: vmov q0[2], q0[0], r5, r6 -; CHECK-NEXT: vmov.u8 r5, q7[8] -; CHECK-NEXT: vmov q5[2], q5[0], r4, r5 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vand q5, q5, q2 +; CHECK-NEXT: vmov.32 q4[0], r6 +; CHECK-NEXT: vmov.32 q4[1], r6 +; CHECK-NEXT: ubfx r6, r2, #4, #1 +; CHECK-NEXT: rsbs r6, r6, #0 +; CHECK-NEXT: vmov.32 q4[2], r6 +; CHECK-NEXT: vmov.32 q4[3], r6 +; CHECK-NEXT: vmov.u8 r6, q6[8] +; CHECK-NEXT: vmov.32 q0[0], r6 +; CHECK-NEXT: vmov.u8 r6, q6[9] +; CHECK-NEXT: vmov.32 q0[2], r6 +; CHECK-NEXT: vand q0, q0, q5 ; CHECK-NEXT: vmov r6, s0 -; CHECK-NEXT: vmov r5, s20 -; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: vmov r3, s22 ; CHECK-NEXT: umull r6, r5, r5, r6 -; CHECK-NEXT: umull r3, r4, r3, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r6 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 -; CHECK-NEXT: vand q0, q0, q4 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: adds.w r6, r12, r4 +; CHECK-NEXT: vmov.32 q2[0], r6 +; CHECK-NEXT: vmov r6, s2 +; CHECK-NEXT: vmov.32 q2[1], r5 +; CHECK-NEXT: vmov r5, s6 +; CHECK-NEXT: umull r6, r5, r5, r6 +; CHECK-NEXT: vmov.32 q2[2], r6 +; CHECK-NEXT: vmov.32 q2[3], r5 +; CHECK-NEXT: vand q0, q2, q4 +; CHECK-NEXT: vmov r5, s0 +; CHECK-NEXT: vmov r6, s1 ; CHECK-NEXT: vmov r4, s3 -; CHECK-NEXT: adc.w r5, lr, r3 +; CHECK-NEXT: adds.w r5, r5, r12 +; CHECK-NEXT: adcs r6, r3 ; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: adds r3, r3, r6 +; CHECK-NEXT: adds r3, r3, r5 +; CHECK-NEXT: adc.w r12, r6, r4 ; CHECK-NEXT: ubfx r6, r2, #8, #1 +; CHECK-NEXT: rsbs r6, r6, #0 ; CHECK-NEXT: ubfx r2, r2, #12, #1 -; CHECK-NEXT: rsb.w r6, r6, #0 -; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: adc.w r12, r5, r4 -; CHECK-NEXT: vmov q4[2], q4[0], r2, r6 -; CHECK-NEXT: vmov.u8 r5, q7[11] -; CHECK-NEXT: vmov q4[3], q4[1], r2, r6 -; CHECK-NEXT: vmov.u8 r2, q1[10] -; CHECK-NEXT: vmov.u8 r6, q1[11] -; CHECK-NEXT: vmov q0[2], 
q0[0], r6, r2 +; CHECK-NEXT: vmov.32 q4[0], r6 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: vmov.32 q4[1], r6 ; CHECK-NEXT: vmov.u8 r6, q7[10] -; CHECK-NEXT: vmov q5[2], q5[0], r5, r6 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vand q5, q5, q2 +; CHECK-NEXT: vmov.32 q4[2], r2 +; CHECK-NEXT: vmov.32 q1[0], r6 +; CHECK-NEXT: vmov.32 q4[3], r2 +; CHECK-NEXT: vmov.u8 r2, q6[10] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmov.u8 r2, q6[11] +; CHECK-NEXT: vmov.u8 r6, q7[11] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: vmov.32 q1[2], r6 +; CHECK-NEXT: vand q0, q0, q5 +; CHECK-NEXT: vand q1, q1, q5 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r6, s20 -; CHECK-NEXT: vmov r5, s2 -; CHECK-NEXT: vmov r4, s22 +; CHECK-NEXT: vmov r6, s4 ; CHECK-NEXT: umull r2, r6, r6, r2 -; CHECK-NEXT: umull r5, r4, r4, r5 -; CHECK-NEXT: vmov q0[2], q0[0], r5, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r6 -; CHECK-NEXT: vmov.u8 r4, q7[13] -; CHECK-NEXT: vand q0, q0, q4 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov.32 q2[1], r6 +; CHECK-NEXT: vmov r6, s6 +; CHECK-NEXT: umull r2, r6, r6, r2 +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.32 q2[3], r6 +; CHECK-NEXT: vand q0, q2, q4 ; CHECK-NEXT: vmov r6, s0 ; CHECK-NEXT: vmov r2, s1 ; CHECK-NEXT: vmov r5, s2 @@ -2279,68 +2473,83 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: vmov r6, s3 ; CHECK-NEXT: adc.w r2, r2, r12 ; CHECK-NEXT: adds.w r12, r3, r5 -; CHECK-NEXT: adc.w lr, r2, r6 +; CHECK-NEXT: vmov.u8 r5, q7[12] +; CHECK-NEXT: vmov.32 q1[0], r5 +; CHECK-NEXT: vmov.u8 r5, q7[13] +; CHECK-NEXT: vmov.32 q1[2], r5 +; CHECK-NEXT: vand q1, q1, q5 +; CHECK-NEXT: vmov r5, s4 +; CHECK-NEXT: adc.w r3, r2, r6 ; CHECK-NEXT: vmov.u16 r2, q3[4] -; CHECK-NEXT: vmov.u16 r6, q3[6] -; CHECK-NEXT: vmov q0[2], q0[0], r6, r2 +; CHECK-NEXT: vmov.32 q0[0], r2 ; CHECK-NEXT: vmov.u16 r2, q3[5] -; CHECK-NEXT: vmov.u16 r6, q3[7] -; CHECK-NEXT: vmov q0[3], q0[1], r6, r2 +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: vmov.u16 r2, q3[6] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: vmov.u16 r2, q3[7] +; CHECK-NEXT: vmov.32 q0[3], r2 ; CHECK-NEXT: vcmp.i32 ne, q0, zr ; CHECK-NEXT: vmrs r2, p0 ; CHECK-NEXT: and r6, r2, #1 -; CHECK-NEXT: ubfx r5, r2, #4, #1 ; CHECK-NEXT: rsbs r6, r6, #0 -; CHECK-NEXT: rsbs r5, r5, #0 -; CHECK-NEXT: vmov q3[2], q3[0], r5, r6 -; CHECK-NEXT: vmov q3[3], q3[1], r5, r6 -; CHECK-NEXT: vmov.u8 r6, q1[12] -; CHECK-NEXT: vmov.u8 r5, q1[13] -; CHECK-NEXT: vmov q0[2], q0[0], r5, r6 -; CHECK-NEXT: vmov.u8 r5, q7[12] -; CHECK-NEXT: vmov q4[2], q4[0], r4, r5 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vand q4, q4, q2 +; CHECK-NEXT: vmov.32 q3[0], r6 +; CHECK-NEXT: vmov.32 q3[1], r6 +; CHECK-NEXT: ubfx r6, r2, #4, #1 +; CHECK-NEXT: rsbs r6, r6, #0 +; CHECK-NEXT: vmov.32 q3[2], r6 +; CHECK-NEXT: vmov.32 q3[3], r6 +; CHECK-NEXT: vmov.u8 r6, q6[12] +; CHECK-NEXT: vmov.32 q0[0], r6 +; CHECK-NEXT: vmov.u8 r6, q6[13] +; CHECK-NEXT: vmov.32 q0[2], r6 +; CHECK-NEXT: vand q0, q0, q5 ; CHECK-NEXT: vmov r6, s0 -; CHECK-NEXT: vmov r5, s16 -; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: vmov r3, s18 ; CHECK-NEXT: umull r6, r5, r5, r6 -; CHECK-NEXT: umull r3, r4, r3, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r6 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 -; CHECK-NEXT: vand q0, q0, q3 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: adds.w r6, r12, r4 +; CHECK-NEXT: vmov.32 q2[0], r6 +; CHECK-NEXT: vmov r6, s2 +; CHECK-NEXT: vmov.32 q2[1], r5 +; CHECK-NEXT: vmov r5, s6 +; CHECK-NEXT: 
umull r6, r5, r5, r6 +; CHECK-NEXT: vmov.32 q2[2], r6 +; CHECK-NEXT: vmov.32 q2[3], r5 +; CHECK-NEXT: vand q0, q2, q3 +; CHECK-NEXT: vmov r5, s0 +; CHECK-NEXT: vmov r6, s1 ; CHECK-NEXT: vmov r4, s3 -; CHECK-NEXT: adc.w r5, lr, r3 +; CHECK-NEXT: adds.w r5, r5, r12 +; CHECK-NEXT: adcs r6, r3 ; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: adds r3, r3, r6 +; CHECK-NEXT: adds r3, r3, r5 +; CHECK-NEXT: adc.w r12, r6, r4 ; CHECK-NEXT: ubfx r6, r2, #8, #1 +; CHECK-NEXT: rsbs r6, r6, #0 ; CHECK-NEXT: ubfx r2, r2, #12, #1 -; CHECK-NEXT: rsb.w r6, r6, #0 -; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: adc.w r12, r5, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r6 -; CHECK-NEXT: vmov.u8 r5, q7[15] -; CHECK-NEXT: vmov q3[3], q3[1], r2, r6 -; CHECK-NEXT: vmov.u8 r2, q1[14] -; CHECK-NEXT: vmov.u8 r6, q1[15] -; CHECK-NEXT: vmov q0[2], q0[0], r6, r2 +; CHECK-NEXT: vmov.32 q3[0], r6 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: vmov.32 q3[1], r6 ; CHECK-NEXT: vmov.u8 r6, q7[14] -; CHECK-NEXT: vmov q1[2], q1[0], r5, r6 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vand q1, q1, q2 +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vmov.32 q1[0], r6 +; CHECK-NEXT: vmov.32 q3[3], r2 +; CHECK-NEXT: vmov.u8 r2, q6[14] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmov.u8 r2, q6[15] +; CHECK-NEXT: vmov.u8 r6, q7[15] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: vmov.32 q1[2], r6 +; CHECK-NEXT: vand q0, q0, q5 +; CHECK-NEXT: vand q1, q1, q5 ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmov r6, s4 -; CHECK-NEXT: vmov r5, s2 -; CHECK-NEXT: vmov r4, s6 ; CHECK-NEXT: umull r2, r6, r6, r2 -; CHECK-NEXT: umull r5, r4, r4, r5 -; CHECK-NEXT: vmov q0[2], q0[0], r5, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r6 -; CHECK-NEXT: vand q0, q0, q3 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov.32 q2[1], r6 +; CHECK-NEXT: vmov r6, s6 +; CHECK-NEXT: umull r2, r6, r6, r2 +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.32 q2[3], r6 +; CHECK-NEXT: vand q0, q2, q3 ; CHECK-NEXT: vmov r6, s0 ; CHECK-NEXT: vmov r2, s1 ; CHECK-NEXT: vmov r5, s2 @@ -2351,7 +2560,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: adcs r2, r6 ; CHECK-NEXT: adds r0, r0, r3 ; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: add sp, #32 +; CHECK-NEXT: add sp, #80 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r4, r5, r6, pc} entry: @@ -2368,18 +2577,15 @@ entry: define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i64 %a) { ; CHECK-LABEL: add_v16i8_v16i64_acc_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vcmp.i8 eq, q2, zr ; CHECK-NEXT: vmov.i8 q2, #0x0 ; CHECK-NEXT: vmov.i8 q3, #0xff -; CHECK-NEXT: vmov.u8 r4, q0[1] ; CHECK-NEXT: vpsel q4, q3, q2 -; CHECK-NEXT: sxtb r4, r4 ; CHECK-NEXT: vmov.u8 r2, q4[0] -; CHECK-NEXT: vmov.u8 r5, q0[3] ; CHECK-NEXT: vmov.16 q5[0], r2 ; CHECK-NEXT: vmov.u8 r2, q4[1] ; CHECK-NEXT: vmov.16 q5[1], r2 @@ -2395,132 +2601,151 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: vmov.16 q5[6], r2 ; CHECK-NEXT: vmov.u8 r2, q4[7] ; CHECK-NEXT: vmov.16 q5[7], r2 -; CHECK-NEXT: sxtb r5, r5 ; CHECK-NEXT: vcmp.i16 ne, q5, zr ; CHECK-NEXT: vpsel q5, q3, q2 ; CHECK-NEXT: vmov.u16 r2, q5[0] -; CHECK-NEXT: 
vmov.u16 r3, q5[2] -; CHECK-NEXT: vmov q6[2], q6[0], r3, r2 +; CHECK-NEXT: vmov.32 q6[0], r2 ; CHECK-NEXT: vmov.u16 r2, q5[1] -; CHECK-NEXT: vmov.u16 r3, q5[3] -; CHECK-NEXT: vmov q6[3], q6[1], r3, r2 +; CHECK-NEXT: vmov.32 q6[1], r2 +; CHECK-NEXT: vmov.u16 r2, q5[2] +; CHECK-NEXT: vmov.32 q6[2], r2 +; CHECK-NEXT: vmov.u16 r2, q5[3] +; CHECK-NEXT: vmov.32 q6[3], r2 ; CHECK-NEXT: vcmp.i32 ne, q6, zr -; CHECK-NEXT: vmrs r12, p0 -; CHECK-NEXT: and r3, r12, #1 -; CHECK-NEXT: ubfx r2, r12, #4, #1 +; CHECK-NEXT: vmrs r2, p0 +; CHECK-NEXT: and r3, r2, #1 ; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov q6[2], q6[0], r2, r3 -; CHECK-NEXT: vmov q6[3], q6[1], r2, r3 -; CHECK-NEXT: vmov.u8 r2, q1[0] +; CHECK-NEXT: vmov.32 q6[0], r3 +; CHECK-NEXT: vmov.32 q6[1], r3 +; CHECK-NEXT: ubfx r3, r2, #4, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: vmov.32 q6[2], r3 +; CHECK-NEXT: vmov.32 q6[3], r3 +; CHECK-NEXT: vmov.u8 r3, q1[0] +; CHECK-NEXT: sxtb.w r12, r3 ; CHECK-NEXT: vmov.u8 r3, q0[0] -; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smull r2, lr, r3, r2 +; CHECK-NEXT: smull r3, r12, r3, r12 +; CHECK-NEXT: vmov.32 q7[0], r3 ; CHECK-NEXT: vmov.u8 r3, q1[1] +; CHECK-NEXT: vmov.32 q7[1], r12 +; CHECK-NEXT: sxtb.w r12, r3 +; CHECK-NEXT: vmov.u8 r3, q0[1] ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smull r3, r4, r4, r3 -; CHECK-NEXT: vmov q7[2], q7[0], r3, r2 -; CHECK-NEXT: vmov q7[3], q7[1], r4, lr +; CHECK-NEXT: smull r3, r12, r3, r12 +; CHECK-NEXT: vmov.32 q7[2], r3 +; CHECK-NEXT: vmov.32 q7[3], r12 ; CHECK-NEXT: vand q6, q7, q6 -; CHECK-NEXT: vmov r4, s26 -; CHECK-NEXT: vmov r2, s24 -; CHECK-NEXT: vmov lr, s27 -; CHECK-NEXT: vmov r3, s25 -; CHECK-NEXT: adds r6, r2, r4 -; CHECK-NEXT: ubfx r4, r12, #8, #1 -; CHECK-NEXT: ubfx r2, r12, #12, #1 -; CHECK-NEXT: rsb.w r4, r4, #0 +; CHECK-NEXT: vmov r3, s26 +; CHECK-NEXT: vmov r4, s24 +; CHECK-NEXT: vmov r12, s27 +; CHECK-NEXT: vmov r5, s25 +; CHECK-NEXT: adds.w lr, r4, r3 +; CHECK-NEXT: ubfx r3, r2, #8, #1 +; CHECK-NEXT: rsb.w r3, r3, #0 +; CHECK-NEXT: ubfx r2, r2, #12, #1 +; CHECK-NEXT: vmov.32 q6[0], r3 ; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: adc.w lr, lr, r3 -; CHECK-NEXT: vmov q6[2], q6[0], r2, r4 -; CHECK-NEXT: vmov.u8 r3, q1[3] -; CHECK-NEXT: vmov q6[3], q6[1], r2, r4 -; CHECK-NEXT: vmov.u8 r2, q1[2] -; CHECK-NEXT: vmov.u8 r4, q0[2] +; CHECK-NEXT: vmov.32 q6[1], r3 +; CHECK-NEXT: vmov.u8 r3, q0[2] +; CHECK-NEXT: vmov.32 q6[2], r2 ; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: vmov.32 q6[3], r2 +; CHECK-NEXT: vmov.u8 r2, q1[2] ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r4, r4 -; CHECK-NEXT: smull r2, r4, r4, r2 -; CHECK-NEXT: smull r3, r5, r5, r3 -; CHECK-NEXT: vmov q7[2], q7[0], r3, r2 -; CHECK-NEXT: vmov q7[3], q7[1], r5, r4 -; CHECK-NEXT: vmov.u8 r4, q1[5] +; CHECK-NEXT: adc.w r12, r12, r5 +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov.32 q7[0], r2 +; CHECK-NEXT: vmov.u8 r2, q1[3] +; CHECK-NEXT: vmov.32 q7[1], r3 +; CHECK-NEXT: vmov.u8 r3, q0[3] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov.32 q7[2], r2 +; CHECK-NEXT: vmov.32 q7[3], r3 ; CHECK-NEXT: vand q6, q7, q6 -; CHECK-NEXT: sxtb r4, r4 ; CHECK-NEXT: vmov r3, s24 ; CHECK-NEXT: vmov r2, s25 -; CHECK-NEXT: vmov r5, s26 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: vmov r6, s27 -; CHECK-NEXT: adc.w r2, r2, lr -; CHECK-NEXT: adds.w r12, r3, r5 -; CHECK-NEXT: vmov.u8 r3, q0[5] -; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smull r3, r4, r3, r4 -; CHECK-NEXT: adc.w lr, r2, r6 +; 
CHECK-NEXT: vmov r4, s26 +; CHECK-NEXT: vmov r5, s27 +; CHECK-NEXT: adds.w r3, r3, lr +; CHECK-NEXT: adc.w r2, r2, r12 +; CHECK-NEXT: adds.w r12, r3, r4 +; CHECK-NEXT: vmov.u8 r4, q0[4] +; CHECK-NEXT: adc.w r3, r2, r5 ; CHECK-NEXT: vmov.u16 r2, q5[4] -; CHECK-NEXT: vmov.u16 r6, q5[6] -; CHECK-NEXT: vmov q6[2], q6[0], r6, r2 +; CHECK-NEXT: vmov.32 q6[0], r2 ; CHECK-NEXT: vmov.u16 r2, q5[5] -; CHECK-NEXT: vmov.u16 r6, q5[7] -; CHECK-NEXT: vmov q6[3], q6[1], r6, r2 +; CHECK-NEXT: vmov.32 q6[1], r2 +; CHECK-NEXT: vmov.u16 r2, q5[6] +; CHECK-NEXT: vmov.32 q6[2], r2 +; CHECK-NEXT: vmov.u16 r2, q5[7] +; CHECK-NEXT: vmov.32 q6[3], r2 +; CHECK-NEXT: sxtb r4, r4 ; CHECK-NEXT: vcmp.i32 ne, q6, zr ; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r6, r2, #1 +; CHECK-NEXT: and r5, r2, #1 +; CHECK-NEXT: rsbs r5, r5, #0 +; CHECK-NEXT: vmov.32 q5[0], r5 +; CHECK-NEXT: vmov.32 q5[1], r5 ; CHECK-NEXT: ubfx r5, r2, #4, #1 -; CHECK-NEXT: rsbs r6, r6, #0 ; CHECK-NEXT: rsbs r5, r5, #0 -; CHECK-NEXT: vmov q5[2], q5[0], r5, r6 -; CHECK-NEXT: vmov q5[3], q5[1], r5, r6 -; CHECK-NEXT: vmov.u8 r6, q1[4] -; CHECK-NEXT: vmov.u8 r5, q0[4] -; CHECK-NEXT: sxtb r6, r6 +; CHECK-NEXT: vmov.32 q5[2], r5 +; CHECK-NEXT: vmov.32 q5[3], r5 +; CHECK-NEXT: vmov.u8 r5, q1[4] +; CHECK-NEXT: sxtb r5, r5 +; CHECK-NEXT: smull r5, r4, r4, r5 +; CHECK-NEXT: vmov.32 q6[0], r5 +; CHECK-NEXT: vmov.u8 r5, q1[5] +; CHECK-NEXT: vmov.32 q6[1], r4 +; CHECK-NEXT: vmov.u8 r4, q0[5] ; CHECK-NEXT: sxtb r5, r5 -; CHECK-NEXT: smull r6, r5, r5, r6 -; CHECK-NEXT: vmov q6[2], q6[0], r3, r6 -; CHECK-NEXT: vmov q6[3], q6[1], r4, r5 +; CHECK-NEXT: sxtb r4, r4 +; CHECK-NEXT: smull r5, r4, r4, r5 +; CHECK-NEXT: vmov.32 q6[2], r5 +; CHECK-NEXT: vmov.32 q6[3], r4 ; CHECK-NEXT: vand q5, q6, q5 ; CHECK-NEXT: vmov r4, s20 -; CHECK-NEXT: vmov r3, s21 -; CHECK-NEXT: vmov r5, s23 -; CHECK-NEXT: adds.w r6, r12, r4 +; CHECK-NEXT: vmov r5, s21 +; CHECK-NEXT: adds.w r12, r12, r4 ; CHECK-NEXT: vmov r4, s22 -; CHECK-NEXT: adc.w r3, r3, lr -; CHECK-NEXT: adds r6, r6, r4 -; CHECK-NEXT: vmov.u8 r4, q1[7] -; CHECK-NEXT: adc.w r12, r3, r5 -; CHECK-NEXT: ubfx r5, r2, #8, #1 +; CHECK-NEXT: adcs r5, r3 +; CHECK-NEXT: vmov r3, s23 +; CHECK-NEXT: adds.w r4, r4, r12 +; CHECK-NEXT: adc.w r12, r5, r3 +; CHECK-NEXT: ubfx r3, r2, #8, #1 +; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: ubfx r2, r2, #12, #1 -; CHECK-NEXT: rsbs r5, r5, #0 +; CHECK-NEXT: vmov.32 q5[0], r3 ; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov.u8 r3, q0[7] -; CHECK-NEXT: vmov q5[2], q5[0], r2, r5 -; CHECK-NEXT: sxtb r4, r4 -; CHECK-NEXT: vmov q5[3], q5[1], r2, r5 -; CHECK-NEXT: vmov.u8 r2, q1[6] -; CHECK-NEXT: vmov.u8 r5, q0[6] +; CHECK-NEXT: vmov.32 q5[1], r3 +; CHECK-NEXT: vmov.u8 r3, q0[6] +; CHECK-NEXT: vmov.32 q5[2], r2 ; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: vmov.32 q5[3], r2 +; CHECK-NEXT: vmov.u8 r2, q1[6] ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r5, r5 -; CHECK-NEXT: smull r2, r5, r5, r2 -; CHECK-NEXT: smull r3, r4, r3, r4 -; CHECK-NEXT: vmov q6[2], q6[0], r3, r2 -; CHECK-NEXT: vmov q6[3], q6[1], r4, r5 -; CHECK-NEXT: vmov.u8 r4, q1[9] +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov.32 q6[0], r2 +; CHECK-NEXT: vmov.u8 r2, q1[7] +; CHECK-NEXT: vmov.32 q6[1], r3 +; CHECK-NEXT: vmov.u8 r3, q0[7] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov.32 q6[2], r2 +; CHECK-NEXT: vmov.32 q6[3], r3 ; CHECK-NEXT: vand q5, q6, q5 -; CHECK-NEXT: sxtb r4, r4 ; CHECK-NEXT: vmov r3, s20 ; CHECK-NEXT: vmov r2, s21 -; CHECK-NEXT: vmov r5, s22 -; CHECK-NEXT: adds 
r3, r3, r6 -; CHECK-NEXT: vmov r6, s23 +; CHECK-NEXT: vmov r5, s23 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: vmov r4, s22 ; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds.w r12, r3, r5 -; CHECK-NEXT: vmov.u8 r3, q0[9] -; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smull r3, r4, r3, r4 -; CHECK-NEXT: adc.w lr, r2, r6 +; CHECK-NEXT: adds.w r12, r3, r4 +; CHECK-NEXT: vmov.u8 r4, q0[8] +; CHECK-NEXT: adc.w r3, r2, r5 ; CHECK-NEXT: vmov.u8 r2, q4[8] ; CHECK-NEXT: vmov.16 q5[0], r2 ; CHECK-NEXT: vmov.u8 r2, q4[9] @@ -2537,131 +2762,155 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: vmov.16 q5[6], r2 ; CHECK-NEXT: vmov.u8 r2, q4[15] ; CHECK-NEXT: vmov.16 q5[7], r2 +; CHECK-NEXT: sxtb r4, r4 ; CHECK-NEXT: vcmp.i16 ne, q5, zr ; CHECK-NEXT: vpsel q2, q3, q2 ; CHECK-NEXT: vmov.u16 r2, q2[0] -; CHECK-NEXT: vmov.u16 r6, q2[2] -; CHECK-NEXT: vmov q3[2], q3[0], r6, r2 +; CHECK-NEXT: vmov.32 q3[0], r2 ; CHECK-NEXT: vmov.u16 r2, q2[1] -; CHECK-NEXT: vmov.u16 r6, q2[3] -; CHECK-NEXT: vmov q3[3], q3[1], r6, r2 +; CHECK-NEXT: vmov.32 q3[1], r2 +; CHECK-NEXT: vmov.u16 r2, q2[2] +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vmov.u16 r2, q2[3] +; CHECK-NEXT: vmov.32 q3[3], r2 ; CHECK-NEXT: vcmp.i32 ne, q3, zr ; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r6, r2, #1 +; CHECK-NEXT: and r5, r2, #1 +; CHECK-NEXT: rsbs r5, r5, #0 +; CHECK-NEXT: vmov.32 q3[0], r5 +; CHECK-NEXT: vmov.32 q3[1], r5 ; CHECK-NEXT: ubfx r5, r2, #4, #1 -; CHECK-NEXT: rsbs r6, r6, #0 ; CHECK-NEXT: rsbs r5, r5, #0 -; CHECK-NEXT: vmov q3[2], q3[0], r5, r6 -; CHECK-NEXT: vmov q3[3], q3[1], r5, r6 -; CHECK-NEXT: vmov.u8 r6, q1[8] -; CHECK-NEXT: vmov.u8 r5, q0[8] -; CHECK-NEXT: sxtb r6, r6 +; CHECK-NEXT: vmov.32 q3[2], r5 +; CHECK-NEXT: vmov.32 q3[3], r5 +; CHECK-NEXT: vmov.u8 r5, q1[8] +; CHECK-NEXT: sxtb r5, r5 +; CHECK-NEXT: smull r5, r4, r4, r5 +; CHECK-NEXT: vmov.32 q4[0], r5 +; CHECK-NEXT: vmov.u8 r5, q1[9] +; CHECK-NEXT: vmov.32 q4[1], r4 +; CHECK-NEXT: vmov.u8 r4, q0[9] ; CHECK-NEXT: sxtb r5, r5 -; CHECK-NEXT: smull r6, r5, r5, r6 -; CHECK-NEXT: vmov q4[2], q4[0], r3, r6 -; CHECK-NEXT: vmov q4[3], q4[1], r4, r5 +; CHECK-NEXT: sxtb r4, r4 +; CHECK-NEXT: smull r5, r4, r4, r5 +; CHECK-NEXT: vmov.32 q4[2], r5 +; CHECK-NEXT: vmov.32 q4[3], r4 ; CHECK-NEXT: vand q3, q4, q3 ; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: vmov r3, s13 -; CHECK-NEXT: vmov r5, s15 -; CHECK-NEXT: adds.w r6, r12, r4 +; CHECK-NEXT: vmov r5, s13 +; CHECK-NEXT: adds.w r12, r12, r4 ; CHECK-NEXT: vmov r4, s14 -; CHECK-NEXT: adc.w r3, r3, lr -; CHECK-NEXT: adds r6, r6, r4 -; CHECK-NEXT: vmov.u8 r4, q1[11] -; CHECK-NEXT: adc.w r12, r3, r5 -; CHECK-NEXT: ubfx r5, r2, #8, #1 +; CHECK-NEXT: adcs r5, r3 +; CHECK-NEXT: vmov r3, s15 +; CHECK-NEXT: adds.w r4, r4, r12 +; CHECK-NEXT: adc.w r12, r5, r3 +; CHECK-NEXT: ubfx r3, r2, #8, #1 +; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: ubfx r2, r2, #12, #1 -; CHECK-NEXT: rsbs r5, r5, #0 +; CHECK-NEXT: vmov.32 q3[0], r3 ; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov.u8 r3, q0[11] -; CHECK-NEXT: vmov q3[2], q3[0], r2, r5 -; CHECK-NEXT: sxtb r4, r4 -; CHECK-NEXT: vmov q3[3], q3[1], r2, r5 -; CHECK-NEXT: vmov.u8 r2, q1[10] -; CHECK-NEXT: vmov.u8 r5, q0[10] +; CHECK-NEXT: vmov.32 q3[1], r3 +; CHECK-NEXT: vmov.u8 r3, q0[10] +; CHECK-NEXT: vmov.32 q3[2], r2 ; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: vmov.32 q3[3], r2 +; CHECK-NEXT: vmov.u8 r2, q1[10] ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r5, r5 -; CHECK-NEXT: smull r2, r5, r5, r2 -; CHECK-NEXT: smull r3, r4, r3, r4 -; CHECK-NEXT: vmov 
q4[2], q4[0], r3, r2 -; CHECK-NEXT: vmov q4[3], q4[1], r4, r5 -; CHECK-NEXT: vmov.u8 r4, q1[13] +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov.32 q4[0], r2 +; CHECK-NEXT: vmov.u8 r2, q1[11] +; CHECK-NEXT: vmov.32 q4[1], r3 +; CHECK-NEXT: vmov.u8 r3, q0[11] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov.32 q4[2], r2 +; CHECK-NEXT: vmov.32 q4[3], r3 ; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: sxtb r4, r4 ; CHECK-NEXT: vmov r3, s12 ; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: vmov r5, s14 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: vmov r6, s15 +; CHECK-NEXT: vmov r5, s15 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: vmov r4, s14 ; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds.w r12, r3, r5 -; CHECK-NEXT: vmov.u8 r3, q0[13] -; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smull r3, r4, r3, r4 -; CHECK-NEXT: adc.w lr, r2, r6 +; CHECK-NEXT: adds.w r12, r3, r4 +; CHECK-NEXT: vmov.u8 r4, q0[12] +; CHECK-NEXT: adc.w r3, r2, r5 ; CHECK-NEXT: vmov.u16 r2, q2[4] -; CHECK-NEXT: vmov.u16 r6, q2[6] -; CHECK-NEXT: vmov q3[2], q3[0], r6, r2 +; CHECK-NEXT: vmov.32 q3[0], r2 ; CHECK-NEXT: vmov.u16 r2, q2[5] -; CHECK-NEXT: vmov.u16 r6, q2[7] -; CHECK-NEXT: vmov q3[3], q3[1], r6, r2 +; CHECK-NEXT: vmov.32 q3[1], r2 +; CHECK-NEXT: vmov.u16 r2, q2[6] +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vmov.u16 r2, q2[7] +; CHECK-NEXT: vmov.32 q3[3], r2 +; CHECK-NEXT: sxtb r4, r4 ; CHECK-NEXT: vcmp.i32 ne, q3, zr ; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r6, r2, #1 +; CHECK-NEXT: and r5, r2, #1 +; CHECK-NEXT: rsbs r5, r5, #0 +; CHECK-NEXT: vmov.32 q2[0], r5 +; CHECK-NEXT: vmov.32 q2[1], r5 ; CHECK-NEXT: ubfx r5, r2, #4, #1 -; CHECK-NEXT: rsbs r6, r6, #0 ; CHECK-NEXT: rsbs r5, r5, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r5, r6 -; CHECK-NEXT: vmov q2[3], q2[1], r5, r6 -; CHECK-NEXT: vmov.u8 r6, q1[12] -; CHECK-NEXT: vmov.u8 r5, q0[12] -; CHECK-NEXT: sxtb r6, r6 +; CHECK-NEXT: vmov.32 q2[2], r5 +; CHECK-NEXT: vmov.32 q2[3], r5 +; CHECK-NEXT: vmov.u8 r5, q1[12] ; CHECK-NEXT: sxtb r5, r5 -; CHECK-NEXT: smull r6, r5, r5, r6 -; CHECK-NEXT: vmov q3[2], q3[0], r3, r6 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r5 +; CHECK-NEXT: smull r5, r4, r4, r5 +; CHECK-NEXT: vmov.32 q3[0], r5 +; CHECK-NEXT: vmov.u8 r5, q1[13] +; CHECK-NEXT: vmov.32 q3[1], r4 +; CHECK-NEXT: vmov.u8 r4, q0[13] +; CHECK-NEXT: sxtb r5, r5 +; CHECK-NEXT: sxtb r4, r4 +; CHECK-NEXT: smull r5, r4, r4, r5 +; CHECK-NEXT: vmov.32 q3[2], r5 +; CHECK-NEXT: vmov.32 q3[3], r4 ; CHECK-NEXT: vand q2, q3, q2 ; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vmov r3, s9 -; CHECK-NEXT: vmov r5, s11 -; CHECK-NEXT: adds.w r6, r12, r4 +; CHECK-NEXT: vmov r5, s9 +; CHECK-NEXT: adds.w r12, r12, r4 ; CHECK-NEXT: vmov r4, s10 -; CHECK-NEXT: adc.w r3, r3, lr -; CHECK-NEXT: adds r6, r6, r4 -; CHECK-NEXT: vmov.u8 r4, q1[15] -; CHECK-NEXT: adc.w r12, r3, r5 -; CHECK-NEXT: ubfx r5, r2, #8, #1 +; CHECK-NEXT: adcs r5, r3 +; CHECK-NEXT: vmov r3, s11 +; CHECK-NEXT: adds.w r4, r4, r12 +; CHECK-NEXT: adc.w r12, r5, r3 +; CHECK-NEXT: ubfx r3, r2, #8, #1 +; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: ubfx r2, r2, #12, #1 -; CHECK-NEXT: rsbs r5, r5, #0 +; CHECK-NEXT: vmov.32 q2[0], r3 ; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov.u8 r3, q0[15] -; CHECK-NEXT: vmov q2[2], q2[0], r2, r5 -; CHECK-NEXT: sxtb r4, r4 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r5 -; CHECK-NEXT: vmov.u8 r2, q1[14] -; CHECK-NEXT: vmov.u8 r5, q0[14] +; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: vmov.u8 r3, q0[14] +; CHECK-NEXT: vmov.32 q2[2], r2 ; CHECK-NEXT: sxtb r3, 
r3 +; CHECK-NEXT: vmov.32 q2[3], r2 +; CHECK-NEXT: vmov.u8 r2, q1[14] ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r5, r5 -; CHECK-NEXT: smull r2, r5, r5, r2 -; CHECK-NEXT: smull r3, r4, r3, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 -; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: vmov.u8 r2, q1[15] +; CHECK-NEXT: vmov.32 q3[1], r3 +; CHECK-NEXT: vmov.u8 r3, q0[15] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vmov.32 q3[3], r3 +; CHECK-NEXT: vand q0, q3, q2 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: vmov r5, s2 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: vmov r6, s3 +; CHECK-NEXT: vmov r5, s3 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: vmov r4, s2 ; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds r3, r3, r5 -; CHECK-NEXT: adcs r2, r6 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: adcs r2, r5 ; CHECK-NEXT: adds r0, r0, r3 ; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %c = icmp eq <16 x i8> %b, zeroinitializer %xx = sext <16 x i8> %x to <16 x i64> @@ -2678,31 +2927,37 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, <2 x i8> %y, <2 ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov.i64 q3, #0xff ; CHECK-NEXT: vand q1, q1, q3 -; CHECK-NEXT: vand q0, q0, q3 +; CHECK-NEXT: vand q4, q0, q3 ; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: umull lr, r12, r3, r2 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: umull r2, r3, r3, r2 +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: vmov r3, s18 ; CHECK-NEXT: vand q1, q2, q3 -; CHECK-NEXT: umull r2, r3, r2, r3 -; CHECK-NEXT: vmov q0[2], q0[0], r2, lr +; CHECK-NEXT: umull r2, r3, r3, r2 +; CHECK-NEXT: vmov.32 q0[2], r2 ; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r12 -; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: vmov.32 q0[3], r3 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: tst.w r2, #1 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.32 q2[1], r2 +; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: cset r2, eq ; CHECK-NEXT: tst.w r2, #1 ; CHECK-NEXT: csetm r2, ne -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: cset r3, eq -; CHECK-NEXT: tst.w r3, #1 -; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.32 q2[3], r2 +; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r12, s3 @@ -2711,6 +2966,7 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, <2 x i8> %y, <2 ; CHECK-NEXT: adc.w r3, lr, r12 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <2 x i8> %b, zeroinitializer @@ -2729,32 +2985,36 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, <2 x i8> %y, <2 ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.i32 q3, #0xff -; CHECK-NEXT: vand q2, q2, q3 -; 
CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vand q3, q2, q3 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: sxtb r3, r3 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: cset r2, eq ; CHECK-NEXT: tst.w r2, #1 ; CHECK-NEXT: csetm r2, ne -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: cset r3, eq -; CHECK-NEXT: tst.w r3, #1 -; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 -; CHECK-NEXT: vmov q2[3], q2[1], r3, r2 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.32 q2[1], r2 +; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: tst.w r2, #1 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.32 q2[3], r2 ; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smull lr, r12, r3, r2 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov.32 q3[1], r3 +; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: smull r2, r3, r2, r3 -; CHECK-NEXT: vmov q0[2], q0[0], r2, lr -; CHECK-NEXT: vmov q0[3], q0[1], r3, r12 -; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vmov.32 q3[3], r3 +; CHECK-NEXT: vand q0, q3, q2 ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r12, s3 @@ -2778,48 +3038,52 @@ entry: define arm_aapcs_vfpcc i64 @add_v2i64_v2i64_acc(<2 x i64> %x, <2 x i64> %y, <2 x i64> %b, i64 %a) { ; CHECK-LABEL: add_v2i64_v2i64_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: vmov r2, s4 ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r4, s6 -; CHECK-NEXT: vmov r5, s2 +; CHECK-NEXT: vmov r4, s5 ; CHECK-NEXT: umull r12, lr, r3, r2 -; CHECK-NEXT: umull r6, r7, r5, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r6, r12 -; CHECK-NEXT: vmov r6, s5 -; CHECK-NEXT: mla r3, r3, r6, lr -; CHECK-NEXT: vmov r6, s1 -; CHECK-NEXT: mla r2, r6, r2, r3 -; CHECK-NEXT: vmov r3, s7 -; CHECK-NEXT: mla r3, r5, r3, r7 -; CHECK-NEXT: vmov r7, s3 -; CHECK-NEXT: mla r3, r7, r4, r3 -; CHECK-NEXT: vmov r7, s10 -; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 -; CHECK-NEXT: vmov r2, s9 +; CHECK-NEXT: mla r3, r3, r4, lr +; CHECK-NEXT: vmov r4, s1 +; CHECK-NEXT: vmov.32 q3[0], r12 +; CHECK-NEXT: mla r2, r4, r2, r3 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov.32 q3[1], r2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: umull r4, r12, r3, r2 +; CHECK-NEXT: vmov.32 q3[2], r4 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: mla r3, r3, r4, r12 +; CHECK-NEXT: vmov r4, s3 +; CHECK-NEXT: mla r2, r4, r2, r3 ; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: vmov.32 q3[3], r2 +; CHECK-NEXT: vmov r2, s9 +; CHECK-NEXT: orrs r2, r3 +; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: tst.w r2, #1 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: vmov r2, s11 ; CHECK-NEXT: orrs r2, r3 -; CHECK-NEXT: vmov r3, s11 ; CHECK-NEXT: cset r2, eq ; CHECK-NEXT: tst.w r2, #1 ; CHECK-NEXT: csetm r2, ne -; CHECK-NEXT: orrs r3, r7 -; CHECK-NEXT: cset r3, eq -; CHECK-NEXT: tst.w r3, #1 -; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: 
vmov.32 q0[3], r2 ; CHECK-NEXT: vand q0, q3, q0 -; CHECK-NEXT: vmov r7, s2 -; CHECK-NEXT: vmov r6, s0 -; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r4, s2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r12, s3 ; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: adds r7, r7, r6 -; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: adds r0, r0, r7 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: adds r2, r2, r4 +; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: pop {r4, pc} entry: %c = icmp eq <2 x i64> %b, zeroinitializer %m = mul <2 x i64> %x, %y diff --git a/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll b/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll index 29136b0..0a5f210 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll @@ -76,20 +76,22 @@ define <4 x i64> *@vld2_v2i64(<4 x i64> *%src, <2 x i64> *%dst) { ; CHECK-NEXT: vmov.f32 s2, s8 ; CHECK-NEXT: vmov.f32 s7, s11 ; CHECK-NEXT: vmov.f32 s3, s9 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r12, s5 -; CHECK-NEXT: vmov lr, s1 -; CHECK-NEXT: vmov r4, s6 -; CHECK-NEXT: vmov r5, s2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r4, s4 +; CHECK-NEXT: vmov r5, s0 +; CHECK-NEXT: vmov r12, s7 +; CHECK-NEXT: vmov lr, s3 ; CHECK-NEXT: adds r6, r3, r2 -; CHECK-NEXT: vmov r3, s7 -; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r3, s5 +; CHECK-NEXT: vmov r2, s1 ; CHECK-NEXT: adc.w r12, r12, lr ; CHECK-NEXT: adds r5, r5, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r5, r6 +; CHECK-NEXT: vmov.32 q0[0], r5 ; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r12 +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: vmov.32 q0[2], r6 +; CHECK-NEXT: vmov.32 q0[3], r12 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: pop {r4, r5, r6, pc} entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-vld2.ll b/llvm/test/CodeGen/Thumb2/mve-vld2.ll index 26aa66b..7c8c0ba 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vld2.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld2.ll @@ -269,20 +269,22 @@ define void @vld2_v2i64(<4 x i64> *%src, <2 x i64> *%dst) { ; CHECK-NEXT: vmov.f32 s2, s8 ; CHECK-NEXT: vmov.f32 s3, s9 ; CHECK-NEXT: vmov.f32 s7, s11 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov r12, s5 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: vmov r4, s2 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r12, s7 ; CHECK-NEXT: adds.w lr, r0, r3 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r3, s7 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r3, s5 ; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r2, s1 ; CHECK-NEXT: adds r0, r0, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r0, lr +; CHECK-NEXT: vmov.32 q0[0], r0 ; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r12 +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: vmov.32 q0[2], lr +; CHECK-NEXT: vmov.32 q0[3], r12 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: pop {r4, pc} entry: @@ -297,58 +299,62 @@ entry: define void @vld2_v4i64(<8 x i64> *%src, <4 x i64> *%dst) { ; CHECK-LABEL: vld2_v4i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: vldrw.u32 q5, [r0, #48] ; CHECK-NEXT: vldrw.u32 q0, [r0] -; 
CHECK-NEXT: vldrw.u32 q5, [r0, #16] -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vldrw.u32 q4, [r0, #48] -; CHECK-NEXT: vmov.f64 d4, d1 -; CHECK-NEXT: vmov.f32 s9, s3 -; CHECK-NEXT: vmov.f32 s10, s22 -; CHECK-NEXT: vmov.f32 s2, s20 -; CHECK-NEXT: vmov.f32 s11, s23 -; CHECK-NEXT: vmov.f32 s3, s21 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov r12, s9 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: vmov.f64 d6, d3 -; CHECK-NEXT: vmov.f32 s13, s7 -; CHECK-NEXT: vmov.f32 s14, s18 -; CHECK-NEXT: vmov.f32 s6, s16 -; CHECK-NEXT: vmov.f32 s7, s17 -; CHECK-NEXT: vmov.f32 s15, s19 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vmov r5, s14 -; CHECK-NEXT: vmov r6, s6 +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vmov.f64 d8, d7 +; CHECK-NEXT: vmov.f32 s17, s15 +; CHECK-NEXT: vmov.f32 s18, s22 +; CHECK-NEXT: vmov.f32 s14, s20 +; CHECK-NEXT: vmov.f32 s15, s21 +; CHECK-NEXT: vmov.f32 s19, s23 +; CHECK-NEXT: vmov r3, s18 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: vmov.f64 d2, d1 +; CHECK-NEXT: vmov r12, s19 +; CHECK-NEXT: vmov r2, s15 +; CHECK-NEXT: vmov.f32 s5, s3 +; CHECK-NEXT: vmov.f32 s6, s10 +; CHECK-NEXT: vmov.f32 s2, s8 +; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: vmov.f32 s3, s9 ; CHECK-NEXT: adds.w lr, r0, r3 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov r0, s16 +; CHECK-NEXT: vmov r3, s17 ; CHECK-NEXT: adc.w r12, r12, r2 ; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: adcs r0, r2 -; CHECK-NEXT: vmov r2, s15 -; CHECK-NEXT: adds r5, r5, r6 -; CHECK-NEXT: vmov r6, s2 -; CHECK-NEXT: vmov q3[2], q3[0], r5, r3 -; CHECK-NEXT: adcs r2, r4 -; CHECK-NEXT: vmov r4, s10 -; CHECK-NEXT: vmov q3[3], q3[1], r2, r0 -; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: adds r0, r0, r4 +; CHECK-NEXT: vmov r4, s2 +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: adcs r2, r3 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: vmov.32 q3[1], r2 ; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov.32 q3[2], lr +; CHECK-NEXT: vmov.32 q3[3], r12 ; CHECK-NEXT: vstrw.32 q3, [r1, #16] -; CHECK-NEXT: adds r4, r4, r6 -; CHECK-NEXT: vmov q1[2], q1[0], r4, lr -; CHECK-NEXT: adcs r0, r2 -; CHECK-NEXT: vmov q1[3], q1[1], r0, r12 -; CHECK-NEXT: vstrw.32 q1, [r1] +; CHECK-NEXT: adds.w lr, r4, r3 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: adc.w r12, r2, r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: vmov r4, s1 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: adcs r2, r4 +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: vmov.32 q0[2], lr +; CHECK-NEXT: vmov.32 q0[3], r12 +; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: pop {r4, pc} entry: %l1 = load <8 x i64>, <8 x i64>* %src, align 4 %s1 = shufflevector <8 x i64> %l1, <8 x i64> undef, <4 x i32> diff --git a/llvm/test/CodeGen/Thumb2/mve-vld3.ll b/llvm/test/CodeGen/Thumb2/mve-vld3.ll index 46a02a6..ad7f805 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vld3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld3.ll @@ -293,24 +293,30 @@ define void @vld3_v4i16(<12 x i16> *%src, <4 x i16> *%dst) { ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vldrh.u32 q3, [r0, #16] ; CHECK-NEXT: vmov.u16 r2, q0[0] -; CHECK-NEXT: vmov.u16 r3, q0[6] -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: vmov.u16 r2, q0[1] -; CHECK-NEXT: vmov.u16 r3, q0[7] ; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 +; CHECK-NEXT: vmov.32 q1[0], r2 
+; CHECK-NEXT: vmov.u16 r2, q0[3] +; CHECK-NEXT: vmov.32 q1[1], r2 +; CHECK-NEXT: vmov.u16 r2, q0[6] +; CHECK-NEXT: vmov.32 q1[2], r2 +; CHECK-NEXT: vmov.u16 r2, q0[1] +; CHECK-NEXT: vmov.32 q2[0], r2 ; CHECK-NEXT: vmov.u16 r2, q0[4] -; CHECK-NEXT: vmov q2[3], q2[1], r0, r2 -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: vmov.32 q2[1], r2 +; CHECK-NEXT: vmov.u16 r2, q0[7] +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.32 q2[3], r0 +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: vmov.32 q1[3], r0 +; CHECK-NEXT: vmov.u16 r0, q0[2] ; CHECK-NEXT: vadd.i32 q1, q1, q2 -; CHECK-NEXT: vmov q2[2], q2[0], r0, r2 +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmov.32 q2[2], r0 ; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: vmov q2[3], q2[1], r0, r2 +; CHECK-NEXT: vmov.32 q2[3], r0 ; CHECK-NEXT: vadd.i32 q0, q1, q2 ; CHECK-NEXT: vstrh.32 q0, [r1] ; CHECK-NEXT: bx lr @@ -328,78 +334,82 @@ entry: define void @vld3_v8i16(<24 x i16> *%src, <8 x i16> *%dst) { ; CHECK-LABEL: vld3_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: vmov.u16 r0, q2[4] +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vmov.u16 r2, q1[2] +; CHECK-NEXT: vmov.u16 r0, q3[4] ; CHECK-NEXT: vmov.16 q4[0], r2 -; CHECK-NEXT: vmov.u16 r2, q0[5] +; CHECK-NEXT: vmov.u16 r2, q1[5] ; CHECK-NEXT: vmov.16 q4[1], r2 -; CHECK-NEXT: vmov.u16 r2, q1[0] +; CHECK-NEXT: vmov.u16 r2, q2[0] ; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u16 r0, q2[7] +; CHECK-NEXT: vmov.u16 r0, q3[7] ; CHECK-NEXT: vmov.16 q5[7], r0 ; CHECK-NEXT: vmov.16 q4[2], r2 -; CHECK-NEXT: vmov.u16 r2, q1[3] -; CHECK-NEXT: vmov.f32 s22, s8 +; CHECK-NEXT: vmov.u16 r2, q2[3] +; CHECK-NEXT: vmov.f32 s22, s12 ; CHECK-NEXT: vmov.16 q4[3], r2 -; CHECK-NEXT: vmov q3, q5 -; CHECK-NEXT: vmov.f32 s18, s7 -; CHECK-NEXT: vmovnb.i32 q3, q4 +; CHECK-NEXT: vmov q6, q5 +; CHECK-NEXT: vmov.f32 s18, s11 ; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov q3[2], q3[0], r0, r2 -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: vmov r2, s23 -; CHECK-NEXT: vmov q3[3], q3[1], r2, r0 -; CHECK-NEXT: vmov.u16 r0, q0[0] +; CHECK-NEXT: vmovnb.i32 q6, q4 +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmov r2, s17 +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: vmov r0, s26 +; CHECK-NEXT: vmov.32 q0[2], r0 +; CHECK-NEXT: vmov r0, s23 +; CHECK-NEXT: vmov.32 q0[3], r0 +; CHECK-NEXT: vmov.u16 r0, q1[0] ; CHECK-NEXT: vmov.16 q4[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: vmov.u16 r0, q1[3] ; CHECK-NEXT: vmov.16 q4[1], r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: vmov.u16 r0, q1[6] ; CHECK-NEXT: vmov.16 q4[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] +; CHECK-NEXT: vmov.u16 r0, q2[1] ; CHECK-NEXT: vmov.16 q4[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] +; CHECK-NEXT: vmov.u16 r0, q2[4] ; CHECK-NEXT: vmov.16 q4[4], r0 -; CHECK-NEXT: vmov.u16 r0, q2[2] +; CHECK-NEXT: vmov.u16 r0, q3[2] ; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: 
vmov.u16 r0, q2[5] +; CHECK-NEXT: vmov.u16 r0, q3[5] ; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] +; CHECK-NEXT: vmov.u16 r0, q2[7] ; CHECK-NEXT: vmov.16 q4[5], r0 -; CHECK-NEXT: vmov.u16 r0, q0[1] +; CHECK-NEXT: vmov.u16 r0, q1[1] ; CHECK-NEXT: vmov.f32 s19, s23 ; CHECK-NEXT: vmov.16 q5[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[4] +; CHECK-NEXT: vmov.u16 r0, q1[4] ; CHECK-NEXT: vmov.16 q5[1], r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vmov.u16 r0, q1[7] ; CHECK-NEXT: vmov.16 q5[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] +; CHECK-NEXT: vmov.u16 r0, q2[2] ; CHECK-NEXT: vmov.16 q5[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] +; CHECK-NEXT: vmov.u16 r0, q2[5] ; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov.u16 r0, q2[0] -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.u16 r0, q2[3] -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q2[6] -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmovnb.i32 q1, q5 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov q1[2], q1[0], r0, r2 +; CHECK-NEXT: vmov.u16 r0, q3[0] +; CHECK-NEXT: vmov.16 q1[5], r0 +; CHECK-NEXT: vmov.u16 r0, q3[3] +; CHECK-NEXT: vmov.16 q1[6], r0 +; CHECK-NEXT: vmov.u16 r0, q3[6] +; CHECK-NEXT: vmov.16 q1[7], r0 +; CHECK-NEXT: vmov r0, s20 +; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vmov.32 q3[0], r0 ; CHECK-NEXT: vmov r0, s21 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r0 -; CHECK-NEXT: vadd.i16 q0, q4, q1 -; CHECK-NEXT: vadd.i16 q0, q0, q3 +; CHECK-NEXT: vmovnb.i32 q2, q5 +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmov.32 q3[3], r0 +; CHECK-NEXT: vadd.i16 q1, q4, q3 +; CHECK-NEXT: vadd.i16 q0, q1, q0 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: bx lr entry: %l1 = load <24 x i16>, <24 x i16>* %src, align 4 @@ -417,143 +427,151 @@ define void @vld3_v16i16(<48 x i16> *%src, <16 x i16> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrw.u32 q0, [r0, #48] -; CHECK-NEXT: vldrw.u32 q1, [r0, #64] -; CHECK-NEXT: vldrw.u32 q2, [r0, #80] -; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vldrw.u32 q2, [r0, #64] +; CHECK-NEXT: vldrw.u32 q3, [r0, #80] +; CHECK-NEXT: vmov.u16 r2, q1[2] ; CHECK-NEXT: vmov.16 q4[0], r2 -; CHECK-NEXT: vmov.u16 r2, q0[5] +; CHECK-NEXT: vmov.u16 r2, q1[5] ; CHECK-NEXT: vmov.16 q4[1], r2 -; CHECK-NEXT: vmov.u16 r2, q1[0] +; CHECK-NEXT: vmov.u16 r2, q2[0] ; CHECK-NEXT: vmov.16 q4[2], r2 -; CHECK-NEXT: vmov.u16 r2, q1[3] +; CHECK-NEXT: vmov.u16 r2, q2[3] ; CHECK-NEXT: vmov.16 q4[3], r2 -; CHECK-NEXT: vmov.u16 r2, q2[4] +; CHECK-NEXT: vmov.f32 s18, s11 +; CHECK-NEXT: vmov r2, s16 +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmov r2, s17 +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: vmov.u16 r2, q3[4] ; CHECK-NEXT: vmov.16 q5[6], r2 -; CHECK-NEXT: vmov.u16 r2, q2[7] +; CHECK-NEXT: vmov.u16 r2, q3[7] ; CHECK-NEXT: vmov.16 q5[7], r2 -; CHECK-NEXT: vmov.f32 s18, s7 -; CHECK-NEXT: vmov.f32 s22, s8 -; CHECK-NEXT: vmov q3, q5 -; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: vmovnb.i32 q3, q4 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 -; CHECK-NEXT: vmov r2, s17 -; CHECK-NEXT: vmov r3, s23 -; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 -; CHECK-NEXT: vmov.u16 r2, q0[0] 
+; CHECK-NEXT: vmov.f32 s22, s12 +; CHECK-NEXT: vmov q6, q5 +; CHECK-NEXT: vmovnb.i32 q6, q4 +; CHECK-NEXT: vmov r2, s26 +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: vmov r2, s23 +; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vmov.u16 r2, q1[0] ; CHECK-NEXT: vmov.16 q4[0], r2 -; CHECK-NEXT: vmov.u16 r2, q0[3] +; CHECK-NEXT: vmov.u16 r2, q1[3] ; CHECK-NEXT: vmov.16 q4[1], r2 -; CHECK-NEXT: vmov.u16 r2, q0[6] +; CHECK-NEXT: vmov.u16 r2, q1[6] ; CHECK-NEXT: vmov.16 q4[2], r2 -; CHECK-NEXT: vmov.u16 r2, q1[1] +; CHECK-NEXT: vmov.u16 r2, q2[1] ; CHECK-NEXT: vmov.16 q4[3], r2 -; CHECK-NEXT: vmov.u16 r2, q1[4] +; CHECK-NEXT: vmov.u16 r2, q2[4] ; CHECK-NEXT: vmov.16 q4[4], r2 -; CHECK-NEXT: vmov.u16 r2, q2[2] +; CHECK-NEXT: vmov.u16 r2, q3[2] ; CHECK-NEXT: vmov.16 q5[6], r2 -; CHECK-NEXT: vmov.u16 r2, q2[5] +; CHECK-NEXT: vmov.u16 r2, q3[5] ; CHECK-NEXT: vmov.16 q5[7], r2 -; CHECK-NEXT: vmov.u16 r2, q1[7] +; CHECK-NEXT: vmov.u16 r2, q2[7] ; CHECK-NEXT: vmov.16 q4[5], r2 -; CHECK-NEXT: vmov.u16 r2, q0[1] +; CHECK-NEXT: vmov.u16 r2, q1[1] ; CHECK-NEXT: vmov.f32 s19, s23 ; CHECK-NEXT: vmov.16 q5[0], r2 -; CHECK-NEXT: vmov.u16 r2, q0[4] +; CHECK-NEXT: vmov.u16 r2, q1[4] ; CHECK-NEXT: vmov.16 q5[1], r2 -; CHECK-NEXT: vmov.u16 r2, q0[7] +; CHECK-NEXT: vmov.u16 r2, q1[7] ; CHECK-NEXT: vmov.16 q5[2], r2 -; CHECK-NEXT: vmov.u16 r2, q1[2] +; CHECK-NEXT: vmov.u16 r2, q2[2] ; CHECK-NEXT: vmov.16 q5[3], r2 -; CHECK-NEXT: vmov.u16 r2, q1[5] +; CHECK-NEXT: vmov.u16 r2, q2[5] ; CHECK-NEXT: vmov.16 q5[4], r2 -; CHECK-NEXT: vmov.u16 r2, q2[0] -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: vmov.u16 r2, q2[6] -; CHECK-NEXT: vmov.16 q0[7], r2 -; CHECK-NEXT: vmov r3, s20 -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmovnb.i32 q1, q5 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov q1[2], q1[0], r2, r3 +; CHECK-NEXT: vmov.u16 r2, q3[0] +; CHECK-NEXT: vmov.16 q1[5], r2 +; CHECK-NEXT: vmov.u16 r2, q3[3] +; CHECK-NEXT: vmov.16 q1[6], r2 +; CHECK-NEXT: vmov.u16 r2, q3[6] +; CHECK-NEXT: vmov.16 q1[7], r2 +; CHECK-NEXT: vmov r2, s20 +; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vmov.32 q3[0], r2 ; CHECK-NEXT: vmov r2, s21 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-NEXT: vmov.u16 r2, q2[0] -; CHECK-NEXT: vadd.i16 q0, q4, q1 -; CHECK-NEXT: vmov.16 q1[0], r2 -; CHECK-NEXT: vadd.i16 q0, q0, q3 -; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vmov.16 q1[1], r2 -; CHECK-NEXT: vmov.u16 r2, q2[6] -; CHECK-NEXT: vmov.16 q1[2], r2 -; CHECK-NEXT: vmov.u16 r2, q3[1] -; CHECK-NEXT: vmov.u16 r0, q4[2] -; CHECK-NEXT: vmov.16 q1[3], r2 -; CHECK-NEXT: vmov.u16 r2, q3[4] -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u16 r0, q4[5] -; CHECK-NEXT: vmov.16 q1[4], r2 -; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vmov.u16 r0, q3[7] -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov.u16 r0, q2[2] -; CHECK-NEXT: vmov.16 q6[0], r0 -; CHECK-NEXT: vmov.u16 r0, q2[5] -; CHECK-NEXT: vmov.16 q6[1], r0 -; CHECK-NEXT: vmov.u16 r0, q3[0] -; CHECK-NEXT: vmov.16 q6[2], r0 -; CHECK-NEXT: vmov.u16 r0, q3[3] -; CHECK-NEXT: vmov.16 q6[3], r0 -; CHECK-NEXT: vmov.u16 r0, q4[4] -; CHECK-NEXT: vmov.16 q7[6], r0 -; CHECK-NEXT: vmov.u16 r0, q4[7] -; CHECK-NEXT: vmov.16 q7[7], r0 -; CHECK-NEXT: vmov.f32 s26, s15 -; CHECK-NEXT: vmov.f32 s30, s16 +; CHECK-NEXT: vmovnb.i32 q2, q5 +; CHECK-NEXT: vmov.32 q3[1], r2 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: 
vmov.32 q3[2], r2 +; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: vmov.32 q3[3], r2 +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vadd.i16 q1, q4, q3 +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: vadd.i16 q0, q1, q0 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmov.u16 r0, q3[4] ; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.f32 s7, s23 -; CHECK-NEXT: vmov q5, q7 -; CHECK-NEXT: vmovnb.i32 q5, q6 -; CHECK-NEXT: vmov r2, s24 -; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: vmov q5[2], q5[0], r0, r2 -; CHECK-NEXT: vmov r0, s25 -; CHECK-NEXT: vmov r2, s31 -; CHECK-NEXT: vmov q5[3], q5[1], r2, r0 -; CHECK-NEXT: vmov.u16 r0, q4[0] -; CHECK-NEXT: vmov.16 q6[5], r0 -; CHECK-NEXT: vmov.u16 r0, q4[3] +; CHECK-NEXT: vmov.u16 r2, q1[2] ; CHECK-NEXT: vmov.16 q6[6], r0 +; CHECK-NEXT: vmov.16 q5[0], r2 +; CHECK-NEXT: vmov.u16 r2, q1[5] +; CHECK-NEXT: vmov.16 q5[1], r2 +; CHECK-NEXT: vmov.u16 r2, q2[0] +; CHECK-NEXT: vmov.u16 r0, q3[7] +; CHECK-NEXT: vmov.16 q5[2], r2 +; CHECK-NEXT: vmov.16 q6[7], r0 +; CHECK-NEXT: vmov.u16 r2, q2[3] +; CHECK-NEXT: vmov.16 q5[3], r2 +; CHECK-NEXT: vmov.f32 s26, s12 +; CHECK-NEXT: vmov.f32 s22, s11 +; CHECK-NEXT: vmov q7, q6 +; CHECK-NEXT: vmov r0, s20 +; CHECK-NEXT: vmovnb.i32 q7, q5 +; CHECK-NEXT: vmov.32 q4[0], r0 +; CHECK-NEXT: vmov r0, s21 +; CHECK-NEXT: vmov.32 q4[1], r0 +; CHECK-NEXT: vmov r0, s30 +; CHECK-NEXT: vmov.32 q4[2], r0 +; CHECK-NEXT: vmov r0, s27 +; CHECK-NEXT: vmov.32 q4[3], r0 +; CHECK-NEXT: vmov.u16 r0, q1[0] +; CHECK-NEXT: vmov.16 q5[0], r0 +; CHECK-NEXT: vmov.u16 r0, q1[3] +; CHECK-NEXT: vmov.16 q5[1], r0 +; CHECK-NEXT: vmov.u16 r0, q1[6] +; CHECK-NEXT: vmov.16 q5[2], r0 ; CHECK-NEXT: vmov.u16 r0, q2[1] -; CHECK-NEXT: vmov.16 q7[0], r0 +; CHECK-NEXT: vmov.16 q5[3], r0 ; CHECK-NEXT: vmov.u16 r0, q2[4] -; CHECK-NEXT: vmov.16 q7[1], r0 -; CHECK-NEXT: vmov.u16 r0, q2[7] -; CHECK-NEXT: vmov.16 q7[2], r0 +; CHECK-NEXT: vmov.16 q5[4], r0 ; CHECK-NEXT: vmov.u16 r0, q3[2] -; CHECK-NEXT: vmov.16 q7[3], r0 +; CHECK-NEXT: vmov.16 q6[6], r0 ; CHECK-NEXT: vmov.u16 r0, q3[5] -; CHECK-NEXT: vmov.16 q7[4], r0 -; CHECK-NEXT: vmov.u16 r0, q4[6] ; CHECK-NEXT: vmov.16 q6[7], r0 -; CHECK-NEXT: vmov r2, s28 -; CHECK-NEXT: vmov q2, q6 -; CHECK-NEXT: vmovnb.i32 q2, q7 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov q2[2], q2[0], r0, r2 -; CHECK-NEXT: vmov r0, s29 -; CHECK-NEXT: vmov r2, s27 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r0 -; CHECK-NEXT: vadd.i16 q1, q1, q2 -; CHECK-NEXT: vadd.i16 q1, q1, q5 +; CHECK-NEXT: vmov.u16 r0, q2[7] +; CHECK-NEXT: vmov.16 q5[5], r0 +; CHECK-NEXT: vmov.u16 r0, q1[1] +; CHECK-NEXT: vmov.f32 s23, s27 +; CHECK-NEXT: vmov.16 q6[0], r0 +; CHECK-NEXT: vmov.u16 r0, q1[4] +; CHECK-NEXT: vmov.16 q6[1], r0 +; CHECK-NEXT: vmov.u16 r0, q1[7] +; CHECK-NEXT: vmov.16 q6[2], r0 +; CHECK-NEXT: vmov.u16 r0, q2[2] +; CHECK-NEXT: vmov.16 q6[3], r0 +; CHECK-NEXT: vmov.u16 r0, q2[5] +; CHECK-NEXT: vmov.16 q6[4], r0 +; CHECK-NEXT: vmov r0, s24 +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vmov r0, s25 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov.u16 r0, q3[0] +; CHECK-NEXT: vmov.16 q2[5], r0 +; CHECK-NEXT: vmov.u16 r0, q3[3] +; CHECK-NEXT: vmov.16 q2[6], r0 +; CHECK-NEXT: vmov.u16 r0, q3[6] +; CHECK-NEXT: vmov.16 q2[7], r0 +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vmovnb.i32 q3, q6 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: vmov.32 q1[3], r0 +; CHECK-NEXT: vadd.i16 q1, q5, q1 +; CHECK-NEXT: vadd.i16 q1, q1, q4 ; CHECK-NEXT: vstrw.32 q1, [r1] ; CHECK-NEXT: vpop {d8, d9, d10, 
d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr @@ -607,38 +625,42 @@ entry: define void @vld3_v4i8(<12 x i8> *%src, <4 x i8> *%dst) { ; CHECK-LABEL: vld3_v4i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: .pad #8 ; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: vldrb.u16 q0, [r0] -; CHECK-NEXT: ldr r0, [r0, #8] -; CHECK-NEXT: str r0, [sp] -; CHECK-NEXT: vmov.u16 r3, q0[0] -; CHECK-NEXT: vmov.u16 r4, q0[6] -; CHECK-NEXT: vmov q1[2], q1[0], r4, r3 -; CHECK-NEXT: vmov.u16 r3, q0[1] -; CHECK-NEXT: vmov.u16 r4, q0[7] -; CHECK-NEXT: vmov.u16 r12, q0[5] -; CHECK-NEXT: vmov q2[2], q2[0], r4, r3 -; CHECK-NEXT: mov r3, sp -; CHECK-NEXT: vmov.u16 lr, q0[2] -; CHECK-NEXT: vmov.u16 r2, q0[3] -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vldrb.u16 q0, [r3] -; CHECK-NEXT: vmov.u16 r3, q0[2] -; CHECK-NEXT: vmov q2[3], q2[1], r3, r0 -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov q1[3], q1[1], r0, r2 -; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vadd.i32 q1, q1, q2 -; CHECK-NEXT: vmov q2[2], q2[0], r0, lr -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vmov q2[3], q2[1], r0, r12 -; CHECK-NEXT: vadd.i32 q0, q1, q2 +; CHECK-NEXT: vldrb.u16 q2, [r0] +; CHECK-NEXT: ldr r3, [r0, #8] +; CHECK-NEXT: mov r2, sp +; CHECK-NEXT: str r3, [sp] +; CHECK-NEXT: vmov.u16 r0, q2[2] +; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: vmov.u16 r0, q2[5] +; CHECK-NEXT: vmov.32 q0[1], r0 +; CHECK-NEXT: vmov.u16 r0, q2[0] +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vmov.u16 r0, q2[3] +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov.u16 r0, q2[6] +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov.u16 r0, q2[1] +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov.u16 r0, q2[4] +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov.u16 r0, q2[7] +; CHECK-NEXT: vldrb.u16 q2, [r2] +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.u16 r0, q2[2] +; CHECK-NEXT: vmov.32 q3[3], r0 +; CHECK-NEXT: vmov.u16 r0, q2[1] +; CHECK-NEXT: vmov.32 q1[3], r0 +; CHECK-NEXT: vmov.u16 r0, q2[0] +; CHECK-NEXT: vmov.32 q0[2], r0 +; CHECK-NEXT: vmov.u16 r0, q2[3] +; CHECK-NEXT: vadd.i32 q1, q1, q3 +; CHECK-NEXT: vmov.32 q0[3], r0 +; CHECK-NEXT: vadd.i32 q0, q1, q0 ; CHECK-NEXT: vstrb.32 q0, [r1] ; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: bx lr entry: %l1 = load <12 x i8>, <12 x i8>* %src, align 4 %s1 = shufflevector <12 x i8> %l1, <12 x i8> undef, <4 x i32> @@ -721,38 +743,41 @@ entry: define void @vld3_v16i8(<48 x i8> *%src, <16 x i8> *%dst) { ; CHECK-LABEL: vld3_v16i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vmov.u8 r2, q0[8] -; CHECK-NEXT: vmov.8 q4[8], r2 -; CHECK-NEXT: vmov.u8 r2, q0[11] -; CHECK-NEXT: vmov.8 q4[9], r2 -; CHECK-NEXT: vmov.u8 r2, q0[14] -; CHECK-NEXT: vmov.8 q4[10], r2 +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: vmov.u8 r2, q2[0] -; CHECK-NEXT: vmov.8 q3[0], r2 +; CHECK-NEXT: vmov.8 q1[0], r2 ; CHECK-NEXT: vmov.u8 r2, q2[3] -; CHECK-NEXT: vmov.8 q3[1], r2 +; CHECK-NEXT: vmov.8 q1[1], r2 ; CHECK-NEXT: vmov.u8 r2, q2[6] -; CHECK-NEXT: vmov.8 q3[2], r2 +; CHECK-NEXT: vmov.8 q1[2], r2 ; CHECK-NEXT: vmov.u8 r2, q2[9] -; CHECK-NEXT: vmov.8 q3[3], r2 +; CHECK-NEXT: vmov.8 q1[3], r2 ; CHECK-NEXT: vmov.u8 r2, q2[12] -; CHECK-NEXT: vmov.8 
q3[4], r2 +; CHECK-NEXT: vmov.8 q1[4], r2 ; CHECK-NEXT: vmov.u8 r2, q2[15] -; CHECK-NEXT: vmov.8 q3[5], r2 +; CHECK-NEXT: vmov.8 q1[5], r2 ; CHECK-NEXT: vmov.u8 r2, q0[2] -; CHECK-NEXT: vmov.8 q3[6], r2 +; CHECK-NEXT: vmov.8 q1[6], r2 ; CHECK-NEXT: vmov.u8 r2, q0[5] +; CHECK-NEXT: vmov.8 q1[7], r2 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: vmov.32 q3[1], r2 +; CHECK-NEXT: vmov.u8 r2, q0[8] +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vmov.8 q4[8], r2 +; CHECK-NEXT: vmov.u8 r2, q0[11] +; CHECK-NEXT: vmov.8 q4[9], r2 +; CHECK-NEXT: vmov.u8 r2, q0[14] +; CHECK-NEXT: vmov.8 q4[10], r2 ; CHECK-NEXT: vmov.u8 r0, q1[1] -; CHECK-NEXT: vmov.8 q3[7], r2 ; CHECK-NEXT: vmov.8 q4[11], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov r2, s18 -; CHECK-NEXT: vmov q4[2], q4[0], r2, r0 +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmov.32 q3[2], r0 ; CHECK-NEXT: vmov.u8 r0, q2[1] ; CHECK-NEXT: vmov.8 q5[0], r0 ; CHECK-NEXT: vmov.u8 r0, q2[4] @@ -762,51 +787,51 @@ define void @vld3_v16i8(<48 x i8> *%src, <16 x i8> *%dst) { ; CHECK-NEXT: vmov.u8 r0, q2[10] ; CHECK-NEXT: vmov.8 q5[3], r0 ; CHECK-NEXT: vmov.u8 r0, q2[13] -; CHECK-NEXT: vmov.u8 r2, q0[9] ; CHECK-NEXT: vmov.8 q5[4], r0 ; CHECK-NEXT: vmov.u8 r0, q0[0] -; CHECK-NEXT: vmov.8 q6[8], r2 -; CHECK-NEXT: vmov.u8 r2, q0[12] ; CHECK-NEXT: vmov.8 q5[5], r0 ; CHECK-NEXT: vmov.u8 r0, q0[3] -; CHECK-NEXT: vmov.8 q6[9], r2 -; CHECK-NEXT: vmov.u8 r2, q0[15] ; CHECK-NEXT: vmov.8 q5[6], r0 ; CHECK-NEXT: vmov.u8 r0, q0[6] -; CHECK-NEXT: vmov.8 q6[10], r2 -; CHECK-NEXT: vmov.u8 r2, q1[2] ; CHECK-NEXT: vmov.8 q5[7], r0 -; CHECK-NEXT: vmov.8 q6[11], r2 ; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov r2, s26 -; CHECK-NEXT: vmov q6[2], q6[0], r2, r0 +; CHECK-NEXT: vmov.32 q4[0], r0 +; CHECK-NEXT: vmov r0, s21 +; CHECK-NEXT: vmov.32 q4[1], r0 +; CHECK-NEXT: vmov.u8 r0, q0[9] +; CHECK-NEXT: vmov.8 q5[8], r0 +; CHECK-NEXT: vmov.u8 r0, q0[12] +; CHECK-NEXT: vmov.8 q5[9], r0 +; CHECK-NEXT: vmov.u8 r0, q0[15] +; CHECK-NEXT: vmov.8 q5[10], r0 +; CHECK-NEXT: vmov.u8 r0, q1[2] +; CHECK-NEXT: vmov.8 q5[11], r0 +; CHECK-NEXT: vmov r0, s22 +; CHECK-NEXT: vmov.32 q4[2], r0 ; CHECK-NEXT: vmov.u8 r0, q1[5] -; CHECK-NEXT: vmov.8 q7[12], r0 +; CHECK-NEXT: vmov.8 q5[12], r0 ; CHECK-NEXT: vmov.u8 r0, q1[8] -; CHECK-NEXT: vmov.8 q7[13], r0 +; CHECK-NEXT: vmov.8 q5[13], r0 ; CHECK-NEXT: vmov.u8 r0, q1[11] -; CHECK-NEXT: vmov.8 q7[14], r0 +; CHECK-NEXT: vmov.8 q5[14], r0 ; CHECK-NEXT: vmov.u8 r0, q1[14] -; CHECK-NEXT: vmov.8 q7[15], r0 -; CHECK-NEXT: vmov r0, s21 -; CHECK-NEXT: vmov r2, s31 -; CHECK-NEXT: vmov q6[3], q6[1], r2, r0 -; CHECK-NEXT: vmov.u8 r2, q1[4] -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: vmov.8 q3[12], r2 -; CHECK-NEXT: vmov.u8 r2, q1[7] -; CHECK-NEXT: vmov.8 q3[13], r2 -; CHECK-NEXT: vmov.u8 r2, q1[10] -; CHECK-NEXT: vmov.8 q3[14], r2 -; CHECK-NEXT: vmov.u8 r2, q1[13] -; CHECK-NEXT: vmov.8 q3[15], r2 -; CHECK-NEXT: vmov r2, s15 -; CHECK-NEXT: vmov q4[3], q4[1], r2, r0 +; CHECK-NEXT: vmov.8 q5[15], r0 +; CHECK-NEXT: vmov r0, s23 +; CHECK-NEXT: vmov.32 q4[3], r0 +; CHECK-NEXT: vmov.u8 r0, q1[4] +; CHECK-NEXT: vmov.8 q5[12], r0 +; CHECK-NEXT: vmov.u8 r0, q1[7] +; CHECK-NEXT: vmov.8 q5[13], r0 +; CHECK-NEXT: vmov.u8 r0, q1[10] +; CHECK-NEXT: vmov.8 q5[14], r0 +; CHECK-NEXT: vmov.u8 r0, q1[13] +; CHECK-NEXT: vmov.8 q5[15], r0 +; CHECK-NEXT: vmov r0, s23 +; CHECK-NEXT: vmov.32 q3[3], r0 ; CHECK-NEXT: vmov.u8 r0, q2[2] -; CHECK-NEXT: vadd.i8 q3, q4, q6 +; CHECK-NEXT: vadd.i8 q3, q3, q4 ; CHECK-NEXT: vmov.8 q4[0], 
r0 ; CHECK-NEXT: vmov.u8 r0, q2[5] -; CHECK-NEXT: vmov.u8 r2, q0[10] ; CHECK-NEXT: vmov.8 q4[1], r0 ; CHECK-NEXT: vmov.u8 r0, q2[8] ; CHECK-NEXT: vmov.8 q4[2], r0 @@ -815,35 +840,38 @@ define void @vld3_v16i8(<48 x i8> *%src, <16 x i8> *%dst) { ; CHECK-NEXT: vmov.u8 r0, q2[14] ; CHECK-NEXT: vmov.8 q4[4], r0 ; CHECK-NEXT: vmov.u8 r0, q0[1] -; CHECK-NEXT: vmov.8 q2[8], r2 -; CHECK-NEXT: vmov.u8 r2, q0[13] ; CHECK-NEXT: vmov.8 q4[5], r0 ; CHECK-NEXT: vmov.u8 r0, q0[4] -; CHECK-NEXT: vmov.8 q2[9], r2 -; CHECK-NEXT: vmov.u8 r2, q1[0] ; CHECK-NEXT: vmov.8 q4[6], r0 ; CHECK-NEXT: vmov.u8 r0, q0[7] -; CHECK-NEXT: vmov.8 q2[10], r2 -; CHECK-NEXT: vmov.u8 r2, q1[3] -; CHECK-NEXT: vmov.8 q2[11], r2 ; CHECK-NEXT: vmov.8 q4[7], r0 ; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: vmov r0, s17 +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov.u8 r0, q0[10] +; CHECK-NEXT: vmov.8 q4[8], r0 +; CHECK-NEXT: vmov.u8 r0, q0[13] +; CHECK-NEXT: vmov.8 q4[9], r0 +; CHECK-NEXT: vmov.u8 r0, q1[0] +; CHECK-NEXT: vmov.8 q4[10], r0 +; CHECK-NEXT: vmov.u8 r0, q1[3] +; CHECK-NEXT: vmov.8 q4[11], r0 +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmov.32 q2[2], r0 ; CHECK-NEXT: vmov.u8 r0, q1[6] -; CHECK-NEXT: vmov.8 q2[12], r0 +; CHECK-NEXT: vmov.8 q0[12], r0 ; CHECK-NEXT: vmov.u8 r0, q1[9] -; CHECK-NEXT: vmov.8 q2[13], r0 +; CHECK-NEXT: vmov.8 q0[13], r0 ; CHECK-NEXT: vmov.u8 r0, q1[12] -; CHECK-NEXT: vmov.8 q2[14], r0 +; CHECK-NEXT: vmov.8 q0[14], r0 ; CHECK-NEXT: vmov.u8 r0, q1[15] -; CHECK-NEXT: vmov.8 q2[15], r0 -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r0 -; CHECK-NEXT: vadd.i8 q0, q3, q0 +; CHECK-NEXT: vmov.8 q0[15], r0 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: vmov.32 q2[3], r0 +; CHECK-NEXT: vadd.i8 q0, q3, q2 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: %l1 = load <48 x i8>, <48 x i8>* %src, align 4 @@ -874,32 +902,34 @@ define void @vld3_v2i64(<6 x i64> *%src, <2 x i64> *%dst) { ; CHECK-NEXT: vmov.f32 s6, s10 ; CHECK-NEXT: vmov.f32 s7, s11 ; CHECK-NEXT: vmov.f32 s15, s17 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: vmov r0, s6 ; CHECK-NEXT: vmov.f64 d0, d4 ; CHECK-NEXT: vmov.f32 s1, s9 ; CHECK-NEXT: vmov.f32 s2, s18 ; CHECK-NEXT: vmov.f32 s3, s19 -; CHECK-NEXT: vmov r12, s13 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vmov r4, s6 +; CHECK-NEXT: vmov r12, s15 +; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: vmov r4, s4 ; CHECK-NEXT: adds.w lr, r0, r3 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov r3, s3 ; CHECK-NEXT: adc.w r2, r2, r12 ; CHECK-NEXT: adds.w lr, lr, r0 -; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov r0, s12 ; CHECK-NEXT: adc.w r12, r2, r3 -; CHECK-NEXT: vmov r3, s15 -; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: vmov r3, s13 +; CHECK-NEXT: vmov r2, s5 ; CHECK-NEXT: adds r0, r0, r4 -; CHECK-NEXT: vmov r4, s2 +; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: vmov r3, s3 +; CHECK-NEXT: vmov r3, s1 ; CHECK-NEXT: adds r0, r0, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r0, lr +; CHECK-NEXT: vmov.32 q0[0], r0 ; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r12 +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: vmov.32 q0[2], lr +; CHECK-NEXT: vmov.32 q0[3], r12 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: 
pop {r4, pc} @@ -917,89 +947,93 @@ entry: define void @vld3_v4i64(<12 x i64> *%src, <4 x i64> *%dst) { ; CHECK-LABEL: vld3_v4i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #24 ; CHECK-NEXT: sub sp, #24 -; CHECK-NEXT: vldrw.u32 q2, [r0, #64] -; CHECK-NEXT: vldrw.u32 q1, [r0, #80] -; CHECK-NEXT: vldrw.u32 q5, [r0] -; CHECK-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-NEXT: vmov.f64 d0, d4 -; CHECK-NEXT: vstrw.32 q2, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s1, s9 -; CHECK-NEXT: vldrw.u32 q2, [r0, #48] -; CHECK-NEXT: vmov.f32 s2, s6 -; CHECK-NEXT: vmov.f64 d8, d5 -; CHECK-NEXT: vmov.f32 s17, s11 -; CHECK-NEXT: vmov.f32 s18, s4 -; CHECK-NEXT: vmov.f32 s19, s5 -; CHECK-NEXT: vmov.f64 d12, d11 -; CHECK-NEXT: vmov.f32 s3, s7 -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vmov.f32 s25, s23 -; CHECK-NEXT: vmov.f32 s26, s4 -; CHECK-NEXT: vmov.f32 s22, s14 -; CHECK-NEXT: vmov.f32 s27, s5 -; CHECK-NEXT: vmov.f32 s23, s15 -; CHECK-NEXT: vmov r3, s24 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov r12, s25 -; CHECK-NEXT: vmov r2, s21 -; CHECK-NEXT: vmov.f64 d14, d6 -; CHECK-NEXT: vmov.f32 s29, s13 -; CHECK-NEXT: vmov.f32 s30, s6 -; CHECK-NEXT: vmov.f32 s31, s7 -; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov r5, s18 -; CHECK-NEXT: vmov.f32 s10, s6 -; CHECK-NEXT: vmov.f32 s11, s7 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vmov r6, s10 -; CHECK-NEXT: vmov r7, s26 +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vldrw.u32 q5, [r0, #48] +; CHECK-NEXT: vldrw.u32 q6, [r0, #64] +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmov.f64 d4, d0 +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q4, [r0, #32] +; CHECK-NEXT: vmov.f32 s9, s1 +; CHECK-NEXT: vldrw.u32 q0, [r0, #80] +; CHECK-NEXT: vmov.f64 d14, d11 +; CHECK-NEXT: vmov.f32 s29, s23 +; CHECK-NEXT: vmov.f32 s30, s0 +; CHECK-NEXT: vmov.f32 s22, s26 +; CHECK-NEXT: vmov.f32 s23, s27 +; CHECK-NEXT: vmov.f32 s31, s1 +; CHECK-NEXT: vmov r3, s30 +; CHECK-NEXT: vmov r0, s22 +; CHECK-NEXT: vmov.f64 d6, d3 +; CHECK-NEXT: vmov.f32 s13, s7 +; CHECK-NEXT: vmov.f32 s10, s18 +; CHECK-NEXT: vmov.f32 s14, s16 +; CHECK-NEXT: vmov.f32 s11, s19 +; CHECK-NEXT: vmov.f32 s15, s17 +; CHECK-NEXT: vmov.f64 d8, d12 +; CHECK-NEXT: vmov.f32 s17, s25 +; CHECK-NEXT: vmov.f32 s18, s2 +; CHECK-NEXT: vmov.f32 s19, s3 +; CHECK-NEXT: vmov r12, s31 +; CHECK-NEXT: vmov r2, s23 ; CHECK-NEXT: adds.w lr, r0, r3 -; CHECK-NEXT: vmov r0, s28 -; CHECK-NEXT: adc.w r3, r2, r12 -; CHECK-NEXT: vmov r2, s29 +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmov r4, s20 +; CHECK-NEXT: vmov r3, s19 +; CHECK-NEXT: adc.w r2, r2, r12 ; CHECK-NEXT: adds.w lr, lr, r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: adc.w r12, r3, r2 +; CHECK-NEXT: vmov r0, s28 +; CHECK-NEXT: adc.w r12, r2, r3 +; CHECK-NEXT: vmov r3, s29 +; CHECK-NEXT: vmov r2, s21 +; CHECK-NEXT: adds r0, r0, r4 +; CHECK-NEXT: vmov r4, s16 +; CHECK-NEXT: adcs r2, r3 ; CHECK-NEXT: vmov r3, s17 -; CHECK-NEXT: vmov r2, s9 ; CHECK-NEXT: adds r0, r0, r4 -; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: vmov r0, s15 ; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: vmov.32 q0[2], lr +; CHECK-NEXT: vmov.32 q0[3], r12 +; CHECK-NEXT: vstrw.32 
q0, [r1, #16] +; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s6, s2 +; CHECK-NEXT: vmov.f32 s7, s3 +; CHECK-NEXT: vmov r4, s6 +; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: vmov r4, s10 +; CHECK-NEXT: adcs r0, r2 +; CHECK-NEXT: vmov r2, s11 +; CHECK-NEXT: adds.w lr, r3, r4 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r4, s5 +; CHECK-NEXT: adc.w r12, r0, r2 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmov r2, s13 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: vmov r3, s9 +; CHECK-NEXT: adcs r2, r4 +; CHECK-NEXT: vmov r4, s8 ; CHECK-NEXT: adds r0, r0, r4 -; CHECK-NEXT: vmov r4, s11 -; CHECK-NEXT: adc.w r8, r2, r3 -; CHECK-NEXT: vmov r3, s19 -; CHECK-NEXT: adds r5, r5, r6 -; CHECK-NEXT: vmov r6, s2 -; CHECK-NEXT: vmov r2, s22 -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov r4, s3 -; CHECK-NEXT: adds r5, r5, r6 -; CHECK-NEXT: vmov r6, s23 -; CHECK-NEXT: vmov q1[2], q1[0], r5, r0 -; CHECK-NEXT: vmov r0, s31 -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov r4, s27 -; CHECK-NEXT: adds r2, r2, r7 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r8 -; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: adcs r4, r6 -; CHECK-NEXT: vmov r6, s30 -; CHECK-NEXT: adds r2, r2, r6 -; CHECK-NEXT: vmov q0[2], q0[0], r2, lr -; CHECK-NEXT: adcs r0, r4 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 +; CHECK-NEXT: adcs r2, r3 +; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: vmov.32 q0[2], lr +; CHECK-NEXT: vmov.32 q0[3], r12 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; CHECK-NEXT: pop {r4, pc} entry: %l1 = load <12 x i64>, <12 x i64>* %src, align 4 %s1 = shufflevector <12 x i64> %l1, <12 x i64> undef, <4 x i32> @@ -1319,93 +1353,97 @@ entry: define void @vld3_v8f16(<24 x half> *%src, <8 x half> *%dst) { ; CHECK-LABEL: vld3_v8f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12} -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vmovx.f16 s0, s10 -; CHECK-NEXT: vmov r3, s9 +; CHECK-NEXT: vmovx.f16 s0, s6 +; CHECK-NEXT: vmov r3, s5 ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmov.16 q0[0], r3 ; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmovx.f16 s12, s5 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmovx.f16 s12, s9 ; CHECK-NEXT: vmov.16 q0[2], r2 ; CHECK-NEXT: vmov r2, s12 ; CHECK-NEXT: vmovx.f16 s12, s19 ; CHECK-NEXT: vmov.16 q0[3], r2 ; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: vmov.f32 s2, s7 +; CHECK-NEXT: vmov.f32 s2, s11 ; CHECK-NEXT: vmovx.f16 s20, s16 ; CHECK-NEXT: vmov r0, s12 ; CHECK-NEXT: vmov.16 q3[6], r3 ; CHECK-NEXT: vmov.16 q3[7], r0 ; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmovx.f16 s24, s11 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: vmov r5, s8 ; CHECK-NEXT: vmov.f32 s14, s16 +; CHECK-NEXT: vmovx.f16 s24, s8 +; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmov.16 q5[4], r2 +; CHECK-NEXT: vmov r2, s17 ; CHECK-NEXT: vmov.16 q5[5], r0 ; CHECK-NEXT: vmov r0, s19 -; CHECK-NEXT: vmov r12, s22 +; CHECK-NEXT: vmov lr, s22 ; CHECK-NEXT: 
vmovx.f16 s20, s17 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmov.16 q5[6], r2 -; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: vmov r3, s20 +; CHECK-NEXT: vmov.16 q5[6], r3 ; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vmov lr, s23 -; CHECK-NEXT: vmovx.f16 s20, s8 +; CHECK-NEXT: vmov r0, s16 +; CHECK-NEXT: vmov r12, s23 +; CHECK-NEXT: vmovx.f16 s20, s10 +; CHECK-NEXT: vmov r3, s20 +; CHECK-NEXT: vmov.16 q5[4], r3 +; CHECK-NEXT: vmov.16 q5[5], r0 +; CHECK-NEXT: vmov r3, s22 +; CHECK-NEXT: vmovx.f16 s20, s18 ; CHECK-NEXT: vmov r0, s20 +; CHECK-NEXT: vmov.16 q4[6], r2 +; CHECK-NEXT: vmov.16 q4[7], r0 +; CHECK-NEXT: vmovx.f16 s20, s5 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r2, s20 ; CHECK-NEXT: vmov.16 q5[0], r0 -; CHECK-NEXT: vmov r0, s24 ; CHECK-NEXT: vmov.16 q5[1], r2 +; CHECK-NEXT: vmov r0, s7 ; CHECK-NEXT: vmov.16 q5[2], r0 -; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov r0, s24 ; CHECK-NEXT: vmov.16 q5[3], r0 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmovx.f16 s20, s6 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmov r3, s21 -; CHECK-NEXT: vmov.16 q5[4], r2 -; CHECK-NEXT: vmov.16 q5[5], r4 -; CHECK-NEXT: vmov r2, s22 -; CHECK-NEXT: vmovx.f16 s20, s9 -; CHECK-NEXT: vmov r4, s20 -; CHECK-NEXT: vmov.16 q5[0], r5 -; CHECK-NEXT: vmov.16 q5[1], r4 -; CHECK-NEXT: vmov r4, s11 -; CHECK-NEXT: vmovx.f16 s8, s4 -; CHECK-NEXT: vmov.16 q5[2], r4 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vmovx.f16 s8, s18 -; CHECK-NEXT: vmov.16 q5[3], r4 -; CHECK-NEXT: vmov r4, s6 -; CHECK-NEXT: vmov.16 q5[4], r4 -; CHECK-NEXT: vmov r4, s17 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmovx.f16 s24, s11 +; CHECK-NEXT: vmov.16 q5[4], r0 +; CHECK-NEXT: vmov r0, s24 +; CHECK-NEXT: vmovx.f16 s24, s4 +; CHECK-NEXT: vmov r2, s24 +; CHECK-NEXT: vmov.16 q5[5], r0 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.16 q6[0], r2 ; CHECK-NEXT: vmovx.f16 s4, s7 -; CHECK-NEXT: vmov r5, s8 -; CHECK-NEXT: vmov.16 q2[6], r4 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vmov.16 q2[7], r5 -; CHECK-NEXT: vmov.16 q5[5], r4 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov r5, s1 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 -; CHECK-NEXT: vmov q1[2], q1[0], r12, r4 +; CHECK-NEXT: vmov.16 q6[1], r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.32 q1[0], r4 +; CHECK-NEXT: vmov.16 q6[2], r0 +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vmov.16 q6[3], r0 +; CHECK-NEXT: vmov r4, s1 +; CHECK-NEXT: vmov r2, s24 +; CHECK-NEXT: vmov.32 q1[1], r4 +; CHECK-NEXT: vmov r0, s25 +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmov.32 q0[1], r0 +; CHECK-NEXT: vmov.32 q1[2], lr +; CHECK-NEXT: vmov.32 q0[2], r3 ; CHECK-NEXT: vmov r4, s15 -; CHECK-NEXT: vmov.f32 s23, s11 -; CHECK-NEXT: vmov q0[3], q0[1], lr, r3 -; CHECK-NEXT: vmov q1[3], q1[1], r4, r5 +; CHECK-NEXT: vmov.f32 s23, s19 +; CHECK-NEXT: vmov.32 q0[3], r12 +; CHECK-NEXT: vmov.32 q1[3], r4 ; CHECK-NEXT: vadd.f16 q0, q5, q0 ; CHECK-NEXT: vadd.f16 q0, q0, q1 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12} -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: pop {r4, pc} entry: %l1 = load <24 x half>, <24 x half>* %src, align 4 %s1 = shufflevector <24 x half> %l1, <24 x half> undef, <8 x i32> @@ -1420,174 +1458,182 @@ entry: define void @vld3_v16f16(<48 x half> *%src, <16 x half> *%dst) { ; CHECK-LABEL: vld3_v16f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; 
CHECK-NEXT: vldrw.u32 q1, [r0, #80] -; CHECK-NEXT: vldrw.u32 q3, [r0, #48] -; CHECK-NEXT: vldrw.u32 q4, [r0, #64] -; CHECK-NEXT: vmovx.f16 s0, s7 -; CHECK-NEXT: vmovx.f16 s8, s4 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmovx.f16 s20, s17 +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vldrw.u32 q4, [r0, #80] +; CHECK-NEXT: vldrw.u32 q2, [r0, #48] +; CHECK-NEXT: vldrw.u32 q3, [r0, #64] +; CHECK-NEXT: vmovx.f16 s0, s19 +; CHECK-NEXT: vmovx.f16 s4, s16 +; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: vmovx.f16 s20, s13 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: vmov r12, s8 -; CHECK-NEXT: vmovx.f16 s8, s14 -; CHECK-NEXT: vmov r2, s13 +; CHECK-NEXT: vmov r12, s4 +; CHECK-NEXT: vmovx.f16 s4, s10 +; CHECK-NEXT: vmov r2, s9 ; CHECK-NEXT: vmov.16 q0[7], r3 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov.16 q2[0], r2 -; CHECK-NEXT: vmov.16 q2[1], r3 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov.16 q2[2], r2 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov.16 q1[0], r2 +; CHECK-NEXT: vmov.16 q1[1], r3 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmov.16 q1[2], r2 ; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmov.16 q2[3], r2 -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: vmov.f32 s10, s19 -; CHECK-NEXT: vmovx.f16 s24, s13 -; CHECK-NEXT: vmovx.f16 s28, s16 -; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: vmov.16 q1[3], r2 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: vmov.f32 s6, s15 +; CHECK-NEXT: vmovx.f16 s24, s12 +; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vmovx.f16 s16, s18 +; CHECK-NEXT: vmov r4, s16 +; CHECK-NEXT: vmov r5, s4 +; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: vmov.16 q5[4], r2 ; CHECK-NEXT: vmov.16 q5[5], r12 -; CHECK-NEXT: vmov r12, s22 -; CHECK-NEXT: vmovx.f16 s20, s6 +; CHECK-NEXT: vmov lr, s22 +; CHECK-NEXT: vmovx.f16 s20, s14 ; CHECK-NEXT: vmov r2, s20 +; CHECK-NEXT: vmov.16 q5[4], r2 +; CHECK-NEXT: vmov r2, s19 +; CHECK-NEXT: vmov.16 q5[5], r3 +; CHECK-NEXT: vmov r12, s22 +; CHECK-NEXT: vmovx.f16 s20, s17 +; CHECK-NEXT: vmov r3, s20 ; CHECK-NEXT: vmov.16 q5[6], r3 -; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: vmov r3, s17 ; CHECK-NEXT: vmov.16 q5[7], r2 -; CHECK-NEXT: vmov r2, s24 +; CHECK-NEXT: vmov.16 q4[6], r3 +; CHECK-NEXT: vmov r2, s23 +; CHECK-NEXT: vmov.16 q4[7], r4 +; CHECK-NEXT: vmovx.f16 s20, s9 +; CHECK-NEXT: vmov r4, s8 +; CHECK-NEXT: vmov r3, s20 +; CHECK-NEXT: vmov.16 q5[0], r4 +; CHECK-NEXT: vmov.16 q5[1], r3 +; CHECK-NEXT: vmov r3, s11 +; CHECK-NEXT: vmov.16 q5[2], r3 +; CHECK-NEXT: vmov r3, s24 +; CHECK-NEXT: vmov.16 q5[3], r3 +; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: vmovx.f16 s24, s15 +; CHECK-NEXT: vmov.16 q5[4], r3 +; CHECK-NEXT: vmov r3, s24 +; CHECK-NEXT: vmovx.f16 s24, s8 +; CHECK-NEXT: vmov.16 q5[5], r3 +; CHECK-NEXT: vmov r3, s24 +; CHECK-NEXT: vmov r4, s10 ; CHECK-NEXT: vmov.16 q6[0], r3 -; CHECK-NEXT: vmov.16 q6[1], r2 -; CHECK-NEXT: vmov r2, s15 -; CHECK-NEXT: vmov.16 q6[2], r2 -; CHECK-NEXT: vmov r2, s28 -; CHECK-NEXT: vmov.16 q6[3], r2 -; CHECK-NEXT: vmov r2, s18 -; CHECK-NEXT: vmovx.f16 s28, s19 -; CHECK-NEXT: vmov.16 q6[4], r2 -; CHECK-NEXT: vmov r2, s28 -; CHECK-NEXT: vmovx.f16 s28, s18 -; CHECK-NEXT: vmov.16 q6[5], r2 -; CHECK-NEXT: vmov r2, s28 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov.16 q7[4], r2 -; CHECK-NEXT: vmov.16 q7[5], r3 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vmov lr, s30 -; CHECK-NEXT: vmovx.f16 s28, s12 -; CHECK-NEXT: 
vmov r3, s28 -; CHECK-NEXT: vmovx.f16 s12, s15 -; CHECK-NEXT: vmov.16 q7[0], r3 -; CHECK-NEXT: vmovx.f16 s4, s5 -; CHECK-NEXT: vmov.16 q7[1], r2 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r5, s4 -; CHECK-NEXT: vmov.16 q7[2], r2 -; CHECK-NEXT: vmov r2, s17 -; CHECK-NEXT: vmov.16 q1[6], r5 -; CHECK-NEXT: vmov.16 q7[3], r2 -; CHECK-NEXT: vmov.16 q1[7], r4 -; CHECK-NEXT: vmov r2, s28 -; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: vmov q0[2], q0[0], lr, r2 -; CHECK-NEXT: vmov r6, s29 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: vmov r5, s8 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r6 -; CHECK-NEXT: vmov q1[2], q1[0], r12, r5 -; CHECK-NEXT: vmov r5, s9 -; CHECK-NEXT: vmov.f32 s27, s23 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r5 -; CHECK-NEXT: vadd.f16 q0, q6, q0 -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vadd.f16 q0, q0, q1 -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vmovx.f16 s8, s11 +; CHECK-NEXT: vmov.16 q6[1], r4 +; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: vmov.32 q2[0], r5 +; CHECK-NEXT: vmov.16 q6[2], r3 +; CHECK-NEXT: vmov r3, s13 +; CHECK-NEXT: vmov.16 q6[3], r3 +; CHECK-NEXT: vmov r5, s5 +; CHECK-NEXT: vmov r3, s24 +; CHECK-NEXT: vmov.32 q2[1], r5 +; CHECK-NEXT: vmov r5, s3 +; CHECK-NEXT: vmov.32 q0[0], r3 +; CHECK-NEXT: vmov r4, s25 +; CHECK-NEXT: vmov.32 q2[2], lr +; CHECK-NEXT: vmov.32 q0[1], r4 +; CHECK-NEXT: vmov.f32 s23, s19 +; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: vldrw.u32 q4, [r0, #32] +; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vmov.32 q2[3], r5 +; CHECK-NEXT: vadd.f16 q0, q5, q0 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vadd.f16 q0, q0, q2 +; CHECK-NEXT: vldrw.u32 q2, [r0] ; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmovx.f16 s20, s9 -; CHECK-NEXT: vmovx.f16 s0, s7 -; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmovx.f16 s0, s19 +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmovx.f16 s12, s16 ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmovx.f16 s16, s4 ; CHECK-NEXT: vmov.16 q0[7], r2 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmovx.f16 s16, s14 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: vmovx.f16 s24, s13 -; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: vmov.16 q4[0], r2 -; CHECK-NEXT: vmov.16 q4[1], r3 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov.16 q4[2], r2 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmovx.f16 s12, s10 +; CHECK-NEXT: vmov r2, s9 +; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: vmov.16 q3[0], r2 +; CHECK-NEXT: vmov.16 q3[1], r3 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmovx.f16 s20, s5 +; CHECK-NEXT: vmov.16 q3[2], r2 ; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmov.16 q4[3], r2 -; CHECK-NEXT: vmovx.f16 s28, s8 -; CHECK-NEXT: vmov.f32 s18, s11 -; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vmov r5, s7 -; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: vmovx.f16 s24, s4 +; CHECK-NEXT: vmov.16 q3[3], r2 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: vmov.f32 s14, s7 +; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vmovx.f16 s16, s18 +; CHECK-NEXT: vmov r4, s16 +; CHECK-NEXT: vmov r2, s14 ; CHECK-NEXT: vmov.16 q5[4], r2 -; CHECK-NEXT: vmov r2, s5 ; CHECK-NEXT: vmov.16 q5[5], r0 -; CHECK-NEXT: vmov r0, s22 +; CHECK-NEXT: vmov r2, s22 ; CHECK-NEXT: vmovx.f16 s20, s6 -; CHECK-NEXT: vmov r3, s20 -; CHECK-NEXT: vmov.16 q5[6], r2 +; CHECK-NEXT: vmov r0, s20 +; CHECK-NEXT: vmov.16 q5[4], r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmov.16 q5[5], r3 +; CHECK-NEXT: vmov r3, s19 +; CHECK-NEXT: vmov r12, s22 +; CHECK-NEXT: vmovx.f16 s20, s17 +; CHECK-NEXT: vmov r5, s20 +; CHECK-NEXT: vmov.16 q5[6], 
r5 +; CHECK-NEXT: vmov r5, s17 ; CHECK-NEXT: vmov.16 q5[7], r3 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov r2, s24 -; CHECK-NEXT: vmov.16 q6[0], r3 -; CHECK-NEXT: vmov.16 q6[1], r2 -; CHECK-NEXT: vmov r2, s15 -; CHECK-NEXT: vmov.16 q6[2], r2 -; CHECK-NEXT: vmov r2, s28 -; CHECK-NEXT: vmov.16 q6[3], r2 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: vmovx.f16 s28, s11 -; CHECK-NEXT: vmov.16 q6[4], r2 -; CHECK-NEXT: vmov r2, s28 -; CHECK-NEXT: vmovx.f16 s28, s10 -; CHECK-NEXT: vmov.16 q6[5], r2 -; CHECK-NEXT: vmov r2, s28 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov.16 q7[4], r2 -; CHECK-NEXT: vmov.16 q7[5], r3 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: vmov r2, s30 -; CHECK-NEXT: vmovx.f16 s28, s12 -; CHECK-NEXT: vmov r6, s28 -; CHECK-NEXT: vmovx.f16 s12, s15 -; CHECK-NEXT: vmov.16 q7[0], r6 -; CHECK-NEXT: vmovx.f16 s4, s5 -; CHECK-NEXT: vmov.16 q7[1], r3 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vmov.16 q7[2], r3 -; CHECK-NEXT: vmov r3, s9 -; CHECK-NEXT: vmov.16 q1[6], r4 -; CHECK-NEXT: vmov.16 q7[3], r3 -; CHECK-NEXT: vmov.16 q1[7], r5 -; CHECK-NEXT: vmov r3, s28 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: vmov r5, s7 -; CHECK-NEXT: vmov q1[2], q1[0], r0, r4 -; CHECK-NEXT: vmov r4, s3 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 -; CHECK-NEXT: vmov r6, s29 -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: vmov q0[3], q0[1], r5, r6 -; CHECK-NEXT: vmov.f32 s27, s23 -; CHECK-NEXT: vmov q1[3], q1[1], r4, r0 -; CHECK-NEXT: vadd.f16 q0, q6, q0 +; CHECK-NEXT: vmov.16 q4[6], r5 +; CHECK-NEXT: vmov r3, s23 +; CHECK-NEXT: vmov.16 q4[7], r4 +; CHECK-NEXT: vmovx.f16 s20, s9 +; CHECK-NEXT: vmov r4, s8 +; CHECK-NEXT: vmov r5, s20 +; CHECK-NEXT: vmov.16 q5[0], r4 +; CHECK-NEXT: vmov.16 q5[1], r5 +; CHECK-NEXT: vmov r5, s11 +; CHECK-NEXT: vmov.16 q5[2], r5 +; CHECK-NEXT: vmov r5, s24 +; CHECK-NEXT: vmov.16 q5[3], r5 +; CHECK-NEXT: vmov r5, s6 +; CHECK-NEXT: vmovx.f16 s24, s7 +; CHECK-NEXT: vmov.16 q5[4], r5 +; CHECK-NEXT: vmov r5, s24 +; CHECK-NEXT: vmovx.f16 s24, s8 +; CHECK-NEXT: vmov.16 q5[5], r5 +; CHECK-NEXT: vmov r5, s24 +; CHECK-NEXT: vmov r4, s10 +; CHECK-NEXT: vmov.16 q6[0], r5 +; CHECK-NEXT: vmovx.f16 s8, s11 +; CHECK-NEXT: vmov.16 q6[1], r4 +; CHECK-NEXT: vmov r5, s8 +; CHECK-NEXT: vmov.16 q6[2], r5 +; CHECK-NEXT: vmov r5, s5 +; CHECK-NEXT: vmov.16 q6[3], r5 +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: vmov r5, s24 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: vmov.32 q0[0], r5 +; CHECK-NEXT: vmov r4, s25 +; CHECK-NEXT: vmov.32 q1[2], r2 +; CHECK-NEXT: vmov.32 q0[1], r4 +; CHECK-NEXT: vmov.f32 s23, s19 +; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: vmov.32 q1[3], r0 +; CHECK-NEXT: vmov.32 q0[3], r3 +; CHECK-NEXT: vadd.f16 q0, q5, q0 ; CHECK-NEXT: vadd.f16 q0, q0, q1 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %l1 = load <48 x half>, <48 x half>* %src, align 4 %s1 = shufflevector <48 x half> %l1, <48 x half> undef, <16 x i32> diff --git a/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll b/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll index 548a188..e8f3368 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll @@ -119,41 +119,43 @@ define <8 x i64> *@vld4_v2i64(<8 x i64> *%src, <2 x i64> *%dst) { ; CHECK-NEXT: vmov.f32 s14, s20 ; CHECK-NEXT: vmov.f32 s19, s23 ; CHECK-NEXT: vmov.f32 s15, s21 -; CHECK-NEXT: 
vmov r2, s16 -; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: vmov r3, s14 ; CHECK-NEXT: vmov.f32 s5, s3 ; CHECK-NEXT: vmov.f32 s6, s10 ; CHECK-NEXT: vmov.f32 s2, s8 ; CHECK-NEXT: vmov.f32 s3, s9 ; CHECK-NEXT: vmov.f32 s7, s11 -; CHECK-NEXT: vmov r12, s17 -; CHECK-NEXT: vmov lr, s13 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vmov r5, s0 -; CHECK-NEXT: vmov r7, s2 +; CHECK-NEXT: vmov r12, s19 +; CHECK-NEXT: vmov lr, s15 +; CHECK-NEXT: vmov r4, s6 +; CHECK-NEXT: vmov r5, s2 +; CHECK-NEXT: vmov r7, s0 ; CHECK-NEXT: adds r6, r3, r2 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: vmov r3, s3 ; CHECK-NEXT: adc.w r12, r12, lr ; CHECK-NEXT: adds r5, r5, r4 -; CHECK-NEXT: vmov r4, s18 +; CHECK-NEXT: vmov r4, s16 ; CHECK-NEXT: adcs r2, r3 ; CHECK-NEXT: adds.w lr, r5, r6 ; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vmov r6, s19 -; CHECK-NEXT: vmov r5, s15 -; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmov r6, s17 +; CHECK-NEXT: vmov r5, s13 +; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: vmov r4, s3 +; CHECK-NEXT: vmov r4, s1 ; CHECK-NEXT: adcs r6, r5 -; CHECK-NEXT: vmov r5, s7 +; CHECK-NEXT: vmov r5, s5 ; CHECK-NEXT: adds r3, r3, r7 ; CHECK-NEXT: adcs r4, r5 ; CHECK-NEXT: adds r2, r2, r3 ; CHECK-NEXT: adc.w r3, r4, r6 -; CHECK-NEXT: vmov q0[2], q0[0], r2, lr -; CHECK-NEXT: vmov q0[3], q0[1], r3, r12 +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: vmov.32 q0[2], lr +; CHECK-NEXT: vmov.32 q0[3], r12 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: add sp, #4 diff --git a/llvm/test/CodeGen/Thumb2/mve-vld4.ll b/llvm/test/CodeGen/Thumb2/mve-vld4.ll index cb49257..776598d 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vld4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld4.ll @@ -229,33 +229,41 @@ define void @vld4_v4i16(<16 x i16> *%src, <4 x i16> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.u16 r2, q0[3] -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov q2[2], q2[0], r0, r2 -; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vmov.u16 r2, q1[3] +; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: vmov.32 q2[0], r2 ; CHECK-NEXT: vmov.u16 r2, q1[7] -; CHECK-NEXT: vmov q2[3], q2[1], r2, r0 +; CHECK-NEXT: vmov.32 q2[1], r2 +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vmov.32 q2[3], r0 +; CHECK-NEXT: vmov.u16 r0, q1[2] +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov.u16 r0, q1[6] +; CHECK-NEXT: vmov.32 q3[1], r0 ; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.u16 r2, q1[2] -; CHECK-NEXT: vmov q3[2], q3[0], r2, r0 +; CHECK-NEXT: vmov.32 q3[2], r0 ; CHECK-NEXT: vmov.u16 r0, q0[6] -; CHECK-NEXT: vmov.u16 r2, q1[6] -; CHECK-NEXT: vmov q3[3], q3[1], r2, r0 -; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmov.u16 r2, q1[0] +; CHECK-NEXT: vmov.32 q3[3], r0 +; CHECK-NEXT: vmov.u16 r0, q1[0] ; CHECK-NEXT: vadd.i32 q2, q3, q2 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r0 +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov.u16 r0, q1[4] +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov.u16 r0, q0[0] +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.u16 r0, q1[1] +; CHECK-NEXT: vmov.32 q4[0], r0 +; CHECK-NEXT: vmov.u16 r0, q1[5] +; CHECK-NEXT: vmov.32 q4[1], r0 ; 
CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.u16 r2, q1[1] -; CHECK-NEXT: vmov q4[2], q4[0], r2, r0 +; CHECK-NEXT: vmov.32 q4[2], r0 ; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.u16 r2, q1[5] -; CHECK-NEXT: vmov q4[3], q4[1], r2, r0 +; CHECK-NEXT: vmov.32 q4[3], r0 ; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.u16 r2, q1[4] -; CHECK-NEXT: vmov q3[3], q3[1], r2, r0 +; CHECK-NEXT: vmov.32 q3[3], r0 ; CHECK-NEXT: vadd.i32 q0, q3, q4 ; CHECK-NEXT: vadd.i32 q0, q0, q2 ; CHECK-NEXT: vstrh.32 q0, [r1] @@ -382,12 +390,14 @@ define void @vld4_v4i8(<16 x i8> *%src, <4 x i8> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vmov.u8 r0, q0[2] -; CHECK-NEXT: vmov.u8 r2, q0[10] -; CHECK-NEXT: vmov q1[2], q1[0], r2, r0 -; CHECK-NEXT: vmov.u8 r0, q0[6] -; CHECK-NEXT: vmov.u8 r2, q0[14] ; CHECK-NEXT: vrev32.8 q2, q0 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r0 +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vmov.u8 r0, q0[6] +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov.u8 r0, q0[10] +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov.u8 r0, q0[14] +; CHECK-NEXT: vmov.32 q1[3], r0 ; CHECK-NEXT: vadd.i32 q1, q1, q2 ; CHECK-NEXT: vrev16.8 q2, q0 ; CHECK-NEXT: vadd.i32 q0, q0, q2 @@ -545,42 +555,44 @@ define void @vld4_v2i64(<8 x i64> *%src, <2 x i64> *%dst) { ; CHECK-NEXT: vmov.f32 s14, s20 ; CHECK-NEXT: vmov.f32 s19, s23 ; CHECK-NEXT: vmov.f32 s15, s21 -; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov r12, s17 -; CHECK-NEXT: vmov r2, s13 +; CHECK-NEXT: vmov r3, s18 +; CHECK-NEXT: vmov r0, s14 ; CHECK-NEXT: vmov.f64 d2, d1 +; CHECK-NEXT: vmov r12, s19 +; CHECK-NEXT: vmov r2, s15 ; CHECK-NEXT: vmov.f32 s5, s3 ; CHECK-NEXT: vmov.f32 s6, s10 ; CHECK-NEXT: vmov.f32 s2, s8 ; CHECK-NEXT: vmov.f32 s3, s9 ; CHECK-NEXT: vmov.f32 s7, s11 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov r5, s6 -; CHECK-NEXT: vmov r6, s2 +; CHECK-NEXT: vmov r4, s2 +; CHECK-NEXT: vmov r5, s4 +; CHECK-NEXT: vmov r6, s0 ; CHECK-NEXT: adds.w lr, r0, r3 -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: vmov r3, s7 +; CHECK-NEXT: vmov r0, s3 ; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: vmov r4, s15 +; CHECK-NEXT: vmov r4, s13 ; CHECK-NEXT: adcs r0, r3 ; CHECK-NEXT: adds.w lr, lr, r2 ; CHECK-NEXT: adc.w r12, r12, r0 -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vmov r3, s19 +; CHECK-NEXT: vmov r0, s16 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmov r3, s17 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adc.w r2, r4, r3 -; CHECK-NEXT: vmov r3, s7 -; CHECK-NEXT: vmov r4, s3 +; CHECK-NEXT: vmov r3, s5 +; CHECK-NEXT: vmov r4, s1 ; CHECK-NEXT: adds r5, r5, r6 ; CHECK-NEXT: adcs r3, r4 ; CHECK-NEXT: adds r0, r0, r5 ; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: vmov q0[2], q0[0], r0, lr -; CHECK-NEXT: vmov q0[3], q0[1], r2, r12 +; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: vmov.32 q0[2], lr +; CHECK-NEXT: vmov.32 q0[3], r12 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: pop {r4, r5, r6, pc} @@ -600,118 +612,123 @@ entry: define void @vld4_v4i64(<16 x i64> *%src, <4 x i64> *%dst) { ; CHECK-LABEL: vld4_v4i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, 
d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #64 -; CHECK-NEXT: sub sp, #64 +; CHECK-NEXT: .pad #72 +; CHECK-NEXT: sub sp, #72 ; CHECK-NEXT: vldrw.u32 q1, [r0, #64] ; CHECK-NEXT: vldrw.u32 q0, [r0, #96] -; CHECK-NEXT: vldrw.u32 q4, [r0, #80] -; CHECK-NEXT: vldrw.u32 q6, [r0, #16] -; CHECK-NEXT: vmov.f64 d4, d3 -; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vmov.f32 s10, s2 -; CHECK-NEXT: vmov.f32 s11, s3 +; CHECK-NEXT: vldrw.u32 q5, [r0, #80] +; CHECK-NEXT: vldrw.u32 q7, [r0, #16] +; CHECK-NEXT: vmov.f64 d8, d3 +; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q1, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s17, s7 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmov.f32 s18, s2 +; CHECK-NEXT: vmov.f32 s19, s3 ; CHECK-NEXT: vldrw.u32 q0, [r0, #112] -; CHECK-NEXT: vmov.f64 d14, d9 -; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vmov.f64 d12, d11 +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s25, s23 +; CHECK-NEXT: vmov.f32 s26, s2 +; CHECK-NEXT: vmov.f64 d6, d3 +; CHECK-NEXT: vmov.f32 s27, s3 +; CHECK-NEXT: vldrw.u32 q0, [r0, #32] +; CHECK-NEXT: vmov.f32 s13, s7 +; CHECK-NEXT: vmov.f32 s14, s2 ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s29, s19 -; CHECK-NEXT: vmov.f32 s30, s2 -; CHECK-NEXT: vmov.f64 d4, d13 -; CHECK-NEXT: vmov.f32 s31, s3 +; CHECK-NEXT: vmov.f64 d4, d15 +; CHECK-NEXT: vmov.f32 s15, s3 ; CHECK-NEXT: vldrw.u32 q0, [r0, #48] -; CHECK-NEXT: vmov.f32 s9, s27 +; CHECK-NEXT: vmov.f32 s9, s31 ; CHECK-NEXT: vmov.f32 s10, s2 -; CHECK-NEXT: vmov.f32 s26, s0 +; CHECK-NEXT: vmov.f32 s30, s0 ; CHECK-NEXT: vmov.f32 s11, s3 -; CHECK-NEXT: vmov.f32 s27, s1 +; CHECK-NEXT: vmov.f32 s31, s1 ; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r0, s24 -; CHECK-NEXT: vmov r12, s9 -; CHECK-NEXT: vmov r2, s25 -; CHECK-NEXT: vmov.f64 d10, d7 -; CHECK-NEXT: vmov.f32 s21, s15 -; CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vmov.f32 s14, s4 -; CHECK-NEXT: vmov.f32 s15, s5 -; CHECK-NEXT: vmov.f32 s23, s7 -; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: vmov.f32 s18, s0 -; CHECK-NEXT: vmov.f32 s19, s1 +; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: vmov r0, s30 +; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vmov.f32 s7, s1 +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov r4, s6 +; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov r12, s11 +; CHECK-NEXT: vmov r2, s31 +; CHECK-NEXT: vmov.f32 s22, s0 +; CHECK-NEXT: vmov.f32 s23, s1 ; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmov r5, s18 +; CHECK-NEXT: vmov r7, s16 ; CHECK-NEXT: adds.w lr, r0, r3 -; CHECK-NEXT: vmov r3, s20 -; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload ; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s21 +; CHECK-NEXT: vmov r2, s15 +; CHECK-NEXT: vmov r6, s2 ; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: vmov r4, s30 +; CHECK-NEXT: vmov r4, s23 ; CHECK-NEXT: adcs r0, r2 ; CHECK-NEXT: adds.w lr, lr, r3 -; CHECK-NEXT: vmov r2, s18 ; 
CHECK-NEXT: adc.w r12, r12, r0 -; CHECK-NEXT: vmov r0, s31 +; CHECK-NEXT: vmov r0, s26 +; CHECK-NEXT: vmov r2, s22 +; CHECK-NEXT: vmov r3, s27 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r2, r4, r3 ; CHECK-NEXT: vmov r3, s19 -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: adcs r3, r0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vmov r6, s2 -; CHECK-NEXT: vmov r5, s6 -; CHECK-NEXT: vmov r7, s4 ; CHECK-NEXT: vmov r4, s3 ; CHECK-NEXT: adds r5, r5, r6 -; CHECK-NEXT: vmov r6, s16 -; CHECK-NEXT: adcs r4, r0 -; CHECK-NEXT: adds.w r9, r5, r2 -; CHECK-NEXT: vmov r5, s28 -; CHECK-NEXT: adc.w r8, r4, r3 -; CHECK-NEXT: vmov r2, s29 -; CHECK-NEXT: vmov r4, s17 +; CHECK-NEXT: vmov r6, s20 +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: adds r0, r0, r5 +; CHECK-NEXT: vmov r5, s24 +; CHECK-NEXT: adc.w r8, r3, r2 +; CHECK-NEXT: vmov r2, s25 +; CHECK-NEXT: vmov r4, s21 ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r0, s10 ; CHECK-NEXT: adds r5, r5, r6 ; CHECK-NEXT: vmov r6, s1 ; CHECK-NEXT: adcs r2, r4 -; CHECK-NEXT: vmov r4, s5 +; CHECK-NEXT: vmov r4, s17 ; CHECK-NEXT: adds r3, r3, r7 -; CHECK-NEXT: vmov r7, s14 +; CHECK-NEXT: vmov r7, s28 ; CHECK-NEXT: adcs r4, r6 ; CHECK-NEXT: adds r3, r3, r5 -; CHECK-NEXT: vmov r6, s22 -; CHECK-NEXT: adc.w r10, r4, r2 -; CHECK-NEXT: vmov r4, s23 -; CHECK-NEXT: vmov q1[2], q1[0], r9, r3 -; CHECK-NEXT: vmov r5, s15 -; CHECK-NEXT: vmov q1[3], q1[1], r8, r10 -; CHECK-NEXT: vmov r2, s26 -; CHECK-NEXT: vstrw.32 q1, [r1, #16] +; CHECK-NEXT: vmov r6, s8 +; CHECK-NEXT: adcs r2, r4 +; CHECK-NEXT: vmov r4, s9 +; CHECK-NEXT: vmov.32 q0[0], r3 +; CHECK-NEXT: vmov r5, s29 +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: vmov.32 q0[2], r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmov r3, s13 +; CHECK-NEXT: vmov.32 q0[3], r8 +; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: vstrw.32 q0, [r1, #16] ; CHECK-NEXT: adds r6, r6, r7 -; CHECK-NEXT: vmov r7, s27 ; CHECK-NEXT: adcs r4, r5 -; CHECK-NEXT: vmov r5, s11 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r2, r7, r5 +; CHECK-NEXT: vmov r5, s4 +; CHECK-NEXT: adds r0, r0, r5 +; CHECK-NEXT: adcs r2, r3 ; CHECK-NEXT: adds r0, r0, r6 -; CHECK-NEXT: vmov q0[2], q0[0], r0, lr -; CHECK-NEXT: adc.w r0, r4, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 +; CHECK-NEXT: adcs r2, r4 +; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: vmov.32 q0[2], lr +; CHECK-NEXT: vmov.32 q0[3], r12 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: add sp, #64 +; CHECK-NEXT: add sp, #72 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %l1 = load <16 x i64>, <16 x i64>* %src, align 4 %s1 = shufflevector <16 x i64> %l1, <16 x i64> undef, <4 x i32> diff --git a/llvm/test/CodeGen/Thumb2/mve-vmulh.ll b/llvm/test/CodeGen/Thumb2/mve-vmulh.ll index 0ea0bd3..2796656 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vmulh.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmulh.ll @@ -6,11 +6,13 @@ define arm_aapcs_vfpcc <2 x i32> @vmulhs_v2i32(<2 x i32> %s0, <2 x i32> %s1) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmullb.s32 q2, q0, q1 ; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov r1, s11 -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 +; CHECK-NEXT: vmov.32 q0[0], r0 ; CHECK-NEXT: asrs r0, r0, #31 -; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 +; CHECK-NEXT: vmov.32 
q0[1], r0 +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: vmov.32 q0[2], r0 +; CHECK-NEXT: asrs r0, r0, #31 +; CHECK-NEXT: vmov.32 q0[3], r0 ; CHECK-NEXT: bx lr entry: %s0s = sext <2 x i32> %s0 to <2 x i64> @@ -46,28 +48,28 @@ entry: define arm_aapcs_vfpcc <4 x i32> @vmulhs_v4i32(<4 x i32> %s0, <4 x i32> %s1) { ; CHECK-LABEL: vmulhs_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov.f32 s8, s4 ; CHECK-NEXT: vmov.f32 s12, s0 -; CHECK-NEXT: vmov.f32 s10, s5 ; CHECK-NEXT: vmov.f32 s14, s1 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov r1, s12 -; CHECK-NEXT: vmov.f32 s16, s6 -; CHECK-NEXT: vmov.f32 s18, s7 +; CHECK-NEXT: vmov.f32 s10, s5 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmov r1, s14 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.f32 s12, s6 +; CHECK-NEXT: vmov.f32 s14, s7 ; CHECK-NEXT: vmov.f32 s4, s2 ; CHECK-NEXT: vmov.f32 s6, s3 -; CHECK-NEXT: vmullb.s32 q5, q1, q4 +; CHECK-NEXT: vmullb.s32 q0, q1, q3 ; CHECK-NEXT: smmul r0, r1, r0 -; CHECK-NEXT: vmov r1, s21 -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov r1, s14 -; CHECK-NEXT: smmul r0, r1, r0 -; CHECK-NEXT: vmov r1, s23 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: smmul r1, r2, r1 +; CHECK-NEXT: vmov.32 q2[0], r1 +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: vmov.32 q2[3], r0 +; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: bx lr entry: %s0s = sext <4 x i32> %s0 to <4 x i64> @@ -140,18 +142,21 @@ define arm_aapcs_vfpcc <8 x i16> @vmulhs_v8i16(<8 x i16> %s0, <8 x i16> %s1) { ; CHECK-LABEL: vmulhs_v8i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.u16 r1, q1[2] -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 +; CHECK-NEXT: vmov.32 q2[0], r0 ; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.u16 r1, q1[3] -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov.u16 r0, q1[2] +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov.u16 r0, q1[3] +; CHECK-NEXT: vmov.32 q2[3], r0 ; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 +; CHECK-NEXT: vmov.32 q3[0], r0 ; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 -; CHECK-NEXT: vmov.u16 r1, q1[6] +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov.u16 r0, q0[2] +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: vmov.32 q3[3], r0 ; CHECK-NEXT: vmullb.s16 q2, q3, q2 ; CHECK-NEXT: vshr.s32 q3, q2, #16 ; CHECK-NEXT: vmov r0, s12 @@ -163,16 +168,21 @@ define arm_aapcs_vfpcc <8 x i16> @vmulhs_v8i16(<8 x i16> %s0, <8 x i16> %s1) { ; CHECK-NEXT: vmov r0, s15 ; CHECK-NEXT: vmov.16 q2[3], r0 ; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 +; CHECK-NEXT: vmov.32 q3[0], r0 ; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.u16 r1, q1[7] -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov.u16 r0, q1[6] +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.u16 r0, q1[7] +; CHECK-NEXT: vmov.32 q3[3], r0 ; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.u16 r1, q0[6] -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 +; CHECK-NEXT: vmov.32 q1[0], r0 ; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.u16 r1, q0[7] -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; 
CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vmov.32 q1[3], r0 ; CHECK-NEXT: vmullb.s16 q0, q1, q3 ; CHECK-NEXT: vshr.s32 q0, q0, #16 ; CHECK-NEXT: vmov r0, s0 @@ -198,18 +208,21 @@ define arm_aapcs_vfpcc <8 x i16> @vmulhu_v8i16(<8 x i16> %s0, <8 x i16> %s1) { ; CHECK-LABEL: vmulhu_v8i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.u16 r1, q1[2] -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 +; CHECK-NEXT: vmov.32 q2[0], r0 ; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.u16 r1, q1[3] -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov.u16 r0, q1[2] +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov.u16 r0, q1[3] +; CHECK-NEXT: vmov.32 q2[3], r0 ; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 +; CHECK-NEXT: vmov.32 q3[0], r0 ; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 -; CHECK-NEXT: vmov.u16 r1, q1[6] +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov.u16 r0, q0[2] +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: vmov.32 q3[3], r0 ; CHECK-NEXT: vmullb.u16 q2, q3, q2 ; CHECK-NEXT: vshr.u32 q3, q2, #16 ; CHECK-NEXT: vmov r0, s12 @@ -221,16 +234,21 @@ define arm_aapcs_vfpcc <8 x i16> @vmulhu_v8i16(<8 x i16> %s0, <8 x i16> %s1) { ; CHECK-NEXT: vmov r0, s15 ; CHECK-NEXT: vmov.16 q2[3], r0 ; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 +; CHECK-NEXT: vmov.32 q3[0], r0 ; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.u16 r1, q1[7] -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov.u16 r0, q1[6] +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.u16 r0, q1[7] +; CHECK-NEXT: vmov.32 q3[3], r0 ; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.u16 r1, q0[6] -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 +; CHECK-NEXT: vmov.32 q1[0], r0 ; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.u16 r1, q0[7] -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vmov.32 q1[3], r0 ; CHECK-NEXT: vmullb.u16 q0, q1, q3 ; CHECK-NEXT: vshr.u32 q0, q0, #16 ; CHECK-NEXT: vmov r0, s0 diff --git a/llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll b/llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll index d17e4f0..ecb1198 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll @@ -4,33 +4,38 @@ define arm_aapcs_vfpcc void @test32(i32* noalias nocapture readonly %x, i32* noalias nocapture readonly %y, i32* nocapture %z, i32 %n) { ; CHECK-LABEL: test32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r5, lr} +; CHECK-NEXT: push {r5, lr} ; CHECK-NEXT: cmp r3, #1 -; CHECK-NEXT: blt .LBB0_2 +; CHECK-NEXT: it lt +; CHECK-NEXT: poplt {r5, pc} ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 ; CHECK-NEXT: vldrw.u32 q2, [r1], #16 ; CHECK-NEXT: subs r3, #4 -; CHECK-NEXT: vmullt.s32 q0, q2, q1 -; CHECK-NEXT: vmullb.s32 q3, q2, q1 -; CHECK-NEXT: vmov r5, s1 -; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: vmov r7, s3 +; CHECK-NEXT: vmullt.s32 q3, q2, q1 +; CHECK-NEXT: vmov r5, s13 +; CHECK-NEXT: vmov r12, s12 ; 
CHECK-NEXT: lsrl r12, r5, #31 -; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: lsrl r4, r7, #31 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r12 +; CHECK-NEXT: vmov.32 q0[0], r12 +; CHECK-NEXT: vmov r12, s14 +; CHECK-NEXT: vmov.32 q0[1], r5 +; CHECK-NEXT: vmov r5, s15 +; CHECK-NEXT: lsrl r12, r5, #31 +; CHECK-NEXT: vmullb.s32 q3, q2, q1 +; CHECK-NEXT: vmov.32 q0[2], r12 ; CHECK-NEXT: vmov r12, s12 -; CHECK-NEXT: vmov q0[3], q0[1], r7, r5 +; CHECK-NEXT: vmov.32 q0[3], r5 ; CHECK-NEXT: vmov r5, s13 -; CHECK-NEXT: vmov r7, s15 ; CHECK-NEXT: lsrl r12, r5, #31 -; CHECK-NEXT: vmov r4, s14 -; CHECK-NEXT: lsrl r4, r7, #31 -; CHECK-NEXT: vmov q1[2], q1[0], r4, r12 -; CHECK-NEXT: vmov q1[3], q1[1], r7, r5 +; CHECK-NEXT: vmov.32 q1[0], r12 +; CHECK-NEXT: vmov r12, s14 +; CHECK-NEXT: vmov.32 q1[1], r5 +; CHECK-NEXT: vmov r5, s15 +; CHECK-NEXT: lsrl r12, r5, #31 +; CHECK-NEXT: vmov.32 q1[2], r12 +; CHECK-NEXT: vmov.32 q1[3], r5 ; CHECK-NEXT: vmov.f32 s8, s6 ; CHECK-NEXT: vmov.f32 s9, s7 ; CHECK-NEXT: vmov.f32 s6, s0 @@ -42,8 +47,8 @@ define arm_aapcs_vfpcc void @test32(i32* noalias nocapture readonly %x, i32* noa ; CHECK-NEXT: vmov.f32 s7, s10 ; CHECK-NEXT: vstrb.8 q1, [r2], #16 ; CHECK-NEXT: bne .LBB0_1 -; CHECK-NEXT: .LBB0_2: @ %for.cond.cleanup -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: pop {r5, pc} entry: %0 = and i32 %n, 3 %cmp = icmp eq i32 %0, 0 diff --git a/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll b/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll index 0a96bd65..995ac7d 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll @@ -74,18 +74,21 @@ define arm_aapcs_vfpcc <8 x i16> @vqdmulh_i16_c(<8 x i16> %s0, <8 x i16> %s1) { ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov q2, q0 ; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 +; CHECK-NEXT: vmov.32 q0[0], r0 ; CHECK-NEXT: vmov.u16 r0, q2[1] -; CHECK-NEXT: vmov.u16 r1, q2[3] -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 +; CHECK-NEXT: vmov.32 q0[1], r0 +; CHECK-NEXT: vmov.u16 r0, q2[2] +; CHECK-NEXT: vmov.32 q0[2], r0 +; CHECK-NEXT: vmov.u16 r0, q2[3] +; CHECK-NEXT: vmov.32 q0[3], r0 ; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.u16 r1, q1[2] -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 +; CHECK-NEXT: vmov.32 q3[0], r0 ; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.u16 r1, q1[3] -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 -; CHECK-NEXT: vmov.u16 r1, q2[6] +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov.u16 r0, q1[2] +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.u16 r0, q1[3] +; CHECK-NEXT: vmov.32 q3[3], r0 ; CHECK-NEXT: vmullb.s16 q0, q3, q0 ; CHECK-NEXT: vmov.i32 q3, #0x7fff ; CHECK-NEXT: vshl.i32 q0, q0, #10 @@ -101,16 +104,21 @@ define arm_aapcs_vfpcc <8 x i16> @vqdmulh_i16_c(<8 x i16> %s0, <8 x i16> %s1) { ; CHECK-NEXT: vmov r0, s19 ; CHECK-NEXT: vmov.16 q0[3], r0 ; CHECK-NEXT: vmov.u16 r0, q2[4] -; CHECK-NEXT: vmov q4[2], q4[0], r1, r0 +; CHECK-NEXT: vmov.32 q4[0], r0 ; CHECK-NEXT: vmov.u16 r0, q2[5] -; CHECK-NEXT: vmov.u16 r1, q2[7] -; CHECK-NEXT: vmov q4[3], q4[1], r1, r0 +; CHECK-NEXT: vmov.32 q4[1], r0 +; CHECK-NEXT: vmov.u16 r0, q2[6] +; CHECK-NEXT: vmov.32 q4[2], r0 +; CHECK-NEXT: vmov.u16 r0, q2[7] +; CHECK-NEXT: vmov.32 q4[3], r0 ; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vmov.u16 r1, q1[6] -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 +; CHECK-NEXT: vmov.32 q2[0], r0 ; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.u16 r1, q1[7] -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 +; CHECK-NEXT: 
vmov.32 q2[1], r0 +; CHECK-NEXT: vmov.u16 r0, q1[6] +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov.u16 r0, q1[7] +; CHECK-NEXT: vmov.32 q2[3], r0 ; CHECK-NEXT: vmullb.s16 q1, q2, q4 ; CHECK-NEXT: vshl.i32 q1, q1, #10 ; CHECK-NEXT: vshr.s32 q1, q1, #10 diff --git a/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll b/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll index 22faf84..b8304cf 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll @@ -165,28 +165,30 @@ define arm_aapcs_vfpcc <2 x i64> @vqmovni64_smaxmin(<2 x i64> %s0) { ; CHECK-LABEL: vqmovni64_smaxmin: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: mvn r12, #-2147483648 +; CHECK-NEXT: mvn r3, #-2147483648 ; CHECK-NEXT: vmov r1, s1 ; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: subs.w r2, r2, r12 -; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: subs r2, r2, r3 ; CHECK-NEXT: sbcs r1, r1, #0 +; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: mov.w r1, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #1 ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: subs.w r3, r3, r12 -; CHECK-NEXT: mov.w r12, #-1 -; CHECK-NEXT: sbcs r2, r2, #0 -; CHECK-NEXT: mov.w r2, #0 +; CHECK-NEXT: vmov.32 q1[0], r1 +; CHECK-NEXT: vmov.32 q1[1], r1 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: subs r2, r2, r3 +; CHECK-NEXT: mov.w r3, #-1 +; CHECK-NEXT: sbcs r1, r1, #0 +; CHECK-NEXT: mov.w r1, #0 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #1 -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, ne -; CHECK-NEXT: vmov q1[2], q1[0], r2, r1 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r1 +; CHECK-NEXT: movlt r1, #1 +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: vmov.32 q1[2], r1 +; CHECK-NEXT: vmov.32 q1[3], r1 ; CHECK-NEXT: adr r1, .LCPI12_0 ; CHECK-NEXT: vldrw.u32 q2, [r1] ; CHECK-NEXT: vand q0, q0, q1 @@ -194,23 +196,25 @@ define arm_aapcs_vfpcc <2 x i64> @vqmovni64_smaxmin(<2 x i64> %s0) { ; CHECK-NEXT: vorr q0, q0, q2 ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: rsbs.w r2, r2, #-2147483648 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: sbcs.w r1, r12, r1 +; CHECK-NEXT: sbcs.w r1, r3, r1 +; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: mov.w r1, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #1 ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: rsbs.w r3, r3, #-2147483648 -; CHECK-NEXT: sbcs.w r2, r12, r2 +; CHECK-NEXT: vmov.32 q1[0], r1 +; CHECK-NEXT: vmov.32 q1[1], r1 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: rsbs.w r2, r2, #-2147483648 +; CHECK-NEXT: sbcs.w r1, r3, r1 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vmov q1[2], q1[0], r0, r1 -; CHECK-NEXT: vmov q1[3], q1[1], r0, r1 +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov.32 q1[3], r0 ; CHECK-NEXT: adr r0, .LCPI12_1 ; CHECK-NEXT: vldrw.u32 q2, [r0] ; CHECK-NEXT: vand q0, q0, q1 @@ -241,28 +245,30 @@ define arm_aapcs_vfpcc <2 x i64> @vqmovni64_sminmax(<2 x i64> %s0) { ; CHECK-LABEL: vqmovni64_sminmax: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: mov.w r12, #-1 +; CHECK-NEXT: mov.w r3, #-1 ; CHECK-NEXT: vmov r1, s1 ; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: rsbs.w r2, r2, #-2147483648 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: sbcs.w r1, r12, r1 +; CHECK-NEXT: sbcs.w r1, r3, r1 +; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: mov.w r1, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #1 ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: rsbs.w r3, r3, 
#-2147483648 -; CHECK-NEXT: sbcs.w r2, r12, r2 -; CHECK-NEXT: mvn r12, #-2147483648 -; CHECK-NEXT: mov.w r2, #0 +; CHECK-NEXT: vmov.32 q1[0], r1 +; CHECK-NEXT: vmov.32 q1[1], r1 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: rsbs.w r2, r2, #-2147483648 +; CHECK-NEXT: sbcs.w r1, r3, r1 +; CHECK-NEXT: mvn r3, #-2147483648 +; CHECK-NEXT: mov.w r1, #0 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #1 -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, ne -; CHECK-NEXT: vmov q1[2], q1[0], r2, r1 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r1 +; CHECK-NEXT: movlt r1, #1 +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: vmov.32 q1[2], r1 +; CHECK-NEXT: vmov.32 q1[3], r1 ; CHECK-NEXT: adr r1, .LCPI13_0 ; CHECK-NEXT: vldrw.u32 q2, [r1] ; CHECK-NEXT: vand q0, q0, q1 @@ -270,23 +276,25 @@ define arm_aapcs_vfpcc <2 x i64> @vqmovni64_sminmax(<2 x i64> %s0) { ; CHECK-NEXT: vorr q0, q0, q2 ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: subs.w r2, r2, r12 -; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: subs r2, r2, r3 ; CHECK-NEXT: sbcs r1, r1, #0 +; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: mov.w r1, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #1 ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: subs.w r3, r3, r12 -; CHECK-NEXT: sbcs r2, r2, #0 +; CHECK-NEXT: vmov.32 q1[0], r1 +; CHECK-NEXT: vmov.32 q1[1], r1 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: subs r2, r2, r3 +; CHECK-NEXT: sbcs r1, r1, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vmov q1[2], q1[0], r0, r1 -; CHECK-NEXT: vmov q1[3], q1[1], r0, r1 +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov.32 q1[3], r0 ; CHECK-NEXT: adr r0, .LCPI13_1 ; CHECK-NEXT: vldrw.u32 q2, [r0] ; CHECK-NEXT: vand q0, q0, q1 @@ -320,23 +328,25 @@ define arm_aapcs_vfpcc <2 x i64> @vqmovni64_umaxmin(<2 x i64> %s0) { ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: vmov r0, s1 ; CHECK-NEXT: vmov.i64 q2, #0xffffffff -; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: subs.w r1, r1, #-1 -; CHECK-NEXT: vmov r1, s3 ; CHECK-NEXT: sbcs r0, r0, #0 +; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: mov.w r0, #0 ; CHECK-NEXT: it lo ; CHECK-NEXT: movlo r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: subs.w r3, r3, #-1 -; CHECK-NEXT: sbcs r1, r1, #0 +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: subs.w r1, r1, #-1 +; CHECK-NEXT: sbcs r0, r0, #0 ; CHECK-NEXT: it lo ; CHECK-NEXT: movlo r2, #1 ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov.32 q1[3], r0 ; CHECK-NEXT: vbic q2, q2, q1 ; CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: vorr q0, q0, q2 @@ -354,23 +364,25 @@ define arm_aapcs_vfpcc <2 x i64> @vqmovni64_uminmax(<2 x i64> %s0) { ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: vmov r0, s1 ; CHECK-NEXT: vmov.i64 q2, #0xffffffff -; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: subs.w r1, r1, #-1 -; CHECK-NEXT: vmov r1, s3 ; CHECK-NEXT: sbcs r0, r0, #0 +; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: mov.w r0, #0 ; CHECK-NEXT: it lo ; CHECK-NEXT: movlo r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: subs.w r3, r3, #-1 -; CHECK-NEXT: sbcs r1, r1, #0 +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: subs.w r1, r1, #-1 +; CHECK-NEXT: sbcs r0, r0, #0 ; CHECK-NEXT: it lo ; 
CHECK-NEXT: movlo r2, #1 ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov.32 q1[3], r0 ; CHECK-NEXT: vbic q2, q2, q1 ; CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: vorr q0, q0, q2 diff --git a/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll b/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll index 39d9cf2..c221092 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll @@ -180,65 +180,71 @@ entry: define arm_aapcs_vfpcc <2 x i64> @vqshrni64_smaxmin(<2 x i64> %so) { ; CHECK-LABEL: vqshrni64_smaxmin: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: mvn r12, #-2147483648 +; CHECK-NEXT: .save {r5, lr} +; CHECK-NEXT: push {r5, lr} +; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: mvn lr, #-2147483648 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: mov.w lr, #0 -; CHECK-NEXT: asrl r2, r1, #3 -; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: subs.w r3, r2, r12 -; CHECK-NEXT: sbcs r3, r1, #0 -; CHECK-NEXT: mov.w r3, #0 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r3, #1 -; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: asrl r2, r5, #3 ; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: csetm r5, ne -; CHECK-NEXT: asrl r4, r3, #3 -; CHECK-NEXT: subs.w r0, r4, r12 -; CHECK-NEXT: vmov q2[2], q2[0], r4, r2 -; CHECK-NEXT: sbcs r0, r3, #0 -; CHECK-NEXT: vmov q2[3], q2[1], r3, r1 +; CHECK-NEXT: subs.w r0, r2, lr +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: sbcs r0, r5, #0 +; CHECK-NEXT: vmov.32 q2[1], r5 ; CHECK-NEXT: mov.w r0, #0 ; CHECK-NEXT: mov.w r2, #-1 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vmov q0[2], q0[0], r0, r5 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r5 -; CHECK-NEXT: adr r0, .LCPI12_0 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vbic q1, q1, q0 -; CHECK-NEXT: vand q0, q2, q0 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: asrl r0, r3, #3 +; CHECK-NEXT: subs.w r1, r0, lr +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: sbcs r1, r3, #0 +; CHECK-NEXT: vmov.32 q2[3], r3 +; CHECK-NEXT: mov.w r1, #0 +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r1, #1 +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: vmov.32 q1[2], r1 +; CHECK-NEXT: vmov.32 q1[3], r1 +; CHECK-NEXT: adr r1, .LCPI12_0 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vbic q0, q0, q1 +; CHECK-NEXT: vand q1, q2, q1 +; CHECK-NEXT: vorr q0, q1, q0 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: rsbs.w r1, r1, #-2147483648 -; CHECK-NEXT: vmov r1, s3 ; CHECK-NEXT: sbcs.w r0, r2, r0 +; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: mov.w r0, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: rsbs.w r3, r3, #-2147483648 -; CHECK-NEXT: sbcs.w r1, r2, r1 +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: rsbs.w r1, r1, #-2147483648 +; CHECK-NEXT: sbcs.w r0, r2, r0 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w lr, #1 -; CHECK-NEXT: cmp.w lr, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: movlt.w r12, #1 +; CHECK-NEXT: cmp.w r12, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: 
vmov.32 q1[2], r0 +; CHECK-NEXT: vmov.32 q1[3], r0 ; CHECK-NEXT: adr r0, .LCPI12_1 ; CHECK-NEXT: vldrw.u32 q2, [r0] ; CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: vbic q2, q2, q1 ; CHECK-NEXT: vorr q0, q0, q2 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r5, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI12_0: @@ -263,65 +269,71 @@ entry: define arm_aapcs_vfpcc <2 x i64> @vqshrni64_sminmax(<2 x i64> %so) { ; CHECK-LABEL: vqshrni64_sminmax: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: mov.w r12, #-1 +; CHECK-NEXT: mov.w lr, #-1 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: mov.w lr, #0 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: asrl r2, r1, #3 -; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: rsbs.w r3, r2, #-2147483648 -; CHECK-NEXT: sbcs.w r3, r12, r1 -; CHECK-NEXT: mov.w r3, #0 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r3, #1 -; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: asrl r4, r3, #3 -; CHECK-NEXT: rsbs.w r5, r4, #-2147483648 -; CHECK-NEXT: vmov q2[2], q2[0], r4, r2 -; CHECK-NEXT: sbcs.w r5, r12, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r3, r1 -; CHECK-NEXT: mov.w r5, #0 +; CHECK-NEXT: rsbs.w r0, r2, #-2147483648 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: sbcs.w r0, lr, r1 +; CHECK-NEXT: vmov.32 q2[1], r1 +; CHECK-NEXT: mov.w r0, #0 ; CHECK-NEXT: mvn r2, #-2147483648 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r5, #1 -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: csetm r5, ne -; CHECK-NEXT: vmov q0[2], q0[0], r5, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r5, r0 -; CHECK-NEXT: adr r0, .LCPI13_0 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vbic q1, q1, q0 -; CHECK-NEXT: vand q0, q2, q0 -; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: movlt r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: asrl r0, r3, #3 +; CHECK-NEXT: rsbs.w r4, r0, #-2147483648 +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: sbcs.w r4, lr, r3 +; CHECK-NEXT: vmov.32 q2[3], r3 +; CHECK-NEXT: mov.w r4, #0 +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r4, #1 +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: csetm r4, ne +; CHECK-NEXT: vmov.32 q1[2], r4 +; CHECK-NEXT: vmov.32 q1[3], r4 +; CHECK-NEXT: adr r4, .LCPI13_0 +; CHECK-NEXT: vldrw.u32 q0, [r4] +; CHECK-NEXT: vbic q0, q0, q1 +; CHECK-NEXT: vand q1, q2, q1 +; CHECK-NEXT: vorr q0, q1, q0 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: subs r1, r1, r2 -; CHECK-NEXT: vmov r1, s3 ; CHECK-NEXT: sbcs r0, r0, #0 +; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: mov.w r0, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: subs r2, r3, r2 -; CHECK-NEXT: sbcs r1, r1, #0 +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: subs r1, r1, r2 +; CHECK-NEXT: sbcs r0, r0, #0 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w lr, #1 -; CHECK-NEXT: cmp.w lr, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: movlt.w r12, #1 +; CHECK-NEXT: cmp.w r12, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov.32 q1[3], r0 ; CHECK-NEXT: adr r0, .LCPI13_1 ; CHECK-NEXT: vldrw.u32 q2, [r0] ; CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: 
vbic q2, q2, q1 ; CHECK-NEXT: vorr q0, q0, q2 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI13_0: @@ -346,37 +358,41 @@ entry: define arm_aapcs_vfpcc <2 x i64> @vqshrni64_umaxmin(<2 x i64> %so) { ; CHECK-LABEL: vqshrni64_umaxmin: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov r5, s1 -; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: .save {r5, r6, r7, lr} +; CHECK-NEXT: push {r5, r6, r7, lr} +; CHECK-NEXT: vmov r7, s1 +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.i64 q1, #0xffffffff -; CHECK-NEXT: lsrl r0, r5, #3 -; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: subs.w r3, r0, #-1 -; CHECK-NEXT: sbcs r3, r5, #0 -; CHECK-NEXT: mov.w r3, #0 -; CHECK-NEXT: it lo -; CHECK-NEXT: movlo r3, #1 -; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: lsrl r0, r7, #3 ; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: csetm r12, ne -; CHECK-NEXT: lsrl r4, r3, #3 -; CHECK-NEXT: subs.w r1, r4, #-1 -; CHECK-NEXT: vmov q2[2], q2[0], r4, r0 -; CHECK-NEXT: sbcs r1, r3, #0 +; CHECK-NEXT: subs.w r2, r0, #-1 +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: sbcs r2, r7, #0 +; CHECK-NEXT: vmov.32 q2[1], r7 +; CHECK-NEXT: mov.w r2, #0 ; CHECK-NEXT: it lo ; CHECK-NEXT: movlo r2, #1 ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: vmov q2[3], q2[1], r3, r5 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: vmov.32 q1[0], r2 +; CHECK-NEXT: vmov.32 q1[1], r2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: lsrl r2, r3, #3 +; CHECK-NEXT: vmov.i64 q0, #0xffffffff +; CHECK-NEXT: subs.w r5, r2, #-1 +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: sbcs r5, r3, #0 +; CHECK-NEXT: it lo +; CHECK-NEXT: movlo r1, #1 +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: vmov.32 q2[3], r3 ; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q0[2], q0[0], r1, r12 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r12 -; CHECK-NEXT: vbic q1, q1, q0 -; CHECK-NEXT: vand q0, q2, q0 -; CHECK-NEXT: vorr q0, q0, q1 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: vmov.32 q1[2], r1 +; CHECK-NEXT: vmov.32 q1[3], r1 +; CHECK-NEXT: vbic q0, q0, q1 +; CHECK-NEXT: vand q1, q2, q1 +; CHECK-NEXT: vorr q0, q1, q0 +; CHECK-NEXT: pop {r5, r6, r7, pc} entry: %s0 = lshr <2 x i64> %so, %c1 = icmp ult <2 x i64> %s0, @@ -387,37 +403,41 @@ entry: define arm_aapcs_vfpcc <2 x i64> @vqshrni64_uminmax(<2 x i64> %so) { ; CHECK-LABEL: vqshrni64_uminmax: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov r5, s1 -; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: .save {r5, r6, r7, lr} +; CHECK-NEXT: push {r5, r6, r7, lr} +; CHECK-NEXT: vmov r7, s1 +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.i64 q1, #0xffffffff -; CHECK-NEXT: lsrl r0, r5, #3 -; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: subs.w r3, r0, #-1 -; CHECK-NEXT: sbcs r3, r5, #0 -; CHECK-NEXT: mov.w r3, #0 -; CHECK-NEXT: it lo -; CHECK-NEXT: movlo r3, #1 -; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: lsrl r0, r7, #3 ; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: csetm r12, ne -; CHECK-NEXT: lsrl r4, r3, #3 -; CHECK-NEXT: subs.w r1, r4, #-1 -; CHECK-NEXT: vmov q2[2], q2[0], r4, r0 -; CHECK-NEXT: sbcs r1, r3, #0 +; CHECK-NEXT: subs.w r2, r0, #-1 +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: sbcs r2, r7, #0 +; CHECK-NEXT: vmov.32 q2[1], r7 +; CHECK-NEXT: mov.w r2, #0 ; CHECK-NEXT: it lo ; CHECK-NEXT: movlo r2, #1 ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: vmov q2[3], q2[1], r3, r5 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: vmov.32 q1[0], r2 +; 
CHECK-NEXT: vmov.32 q1[1], r2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: lsrl r2, r3, #3 +; CHECK-NEXT: vmov.i64 q0, #0xffffffff +; CHECK-NEXT: subs.w r5, r2, #-1 +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: sbcs r5, r3, #0 +; CHECK-NEXT: it lo +; CHECK-NEXT: movlo r1, #1 +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: vmov.32 q2[3], r3 ; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q0[2], q0[0], r1, r12 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r12 -; CHECK-NEXT: vbic q1, q1, q0 -; CHECK-NEXT: vand q0, q2, q0 -; CHECK-NEXT: vorr q0, q0, q1 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: vmov.32 q1[2], r1 +; CHECK-NEXT: vmov.32 q1[3], r1 +; CHECK-NEXT: vbic q0, q0, q1 +; CHECK-NEXT: vand q1, q2, q1 +; CHECK-NEXT: vorr q0, q1, q0 +; CHECK-NEXT: pop {r5, r6, r7, pc} entry: %s0 = lshr <2 x i64> %so, %c2 = icmp ult <2 x i64> %s0, diff --git a/llvm/test/CodeGen/Thumb2/mve-vst2.ll b/llvm/test/CodeGen/Thumb2/mve-vst2.ll index 2ee7bbe..b815ed2 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vst2.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst2.ll @@ -6,19 +6,21 @@ define void @vst2_v2i32(<2 x i32> *%src, <4 x i32> *%dst) { ; CHECK-LABEL: vst2_v2i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r12, r3, [r0] -; CHECK-NEXT: ldrd r2, r0, [r0, #8] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r12 -; CHECK-NEXT: vmov.f64 d2, d1 -; CHECK-NEXT: vmov q2[2], q2[0], r0, r2 -; CHECK-NEXT: vmov.f32 s5, s3 -; CHECK-NEXT: vmov.f32 s2, s8 -; CHECK-NEXT: vmov.f32 s3, s9 -; CHECK-NEXT: vmov.f32 s6, s10 -; CHECK-NEXT: vmov.f32 s1, s2 -; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: ldrd r2, r12, [r0] +; CHECK-NEXT: ldrd r3, r0, [r0, #8] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmov.32 q1[0], r3 +; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: vmov.f64 d4, d1 +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov.f32 s9, s3 ; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vmov.f32 s3, s6 +; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vmov.f32 s10, s6 +; CHECK-NEXT: vmov.f32 s1, s2 +; CHECK-NEXT: vmov.f32 s11, s7 +; CHECK-NEXT: vmov.f32 s2, s8 +; CHECK-NEXT: vmov.f32 s3, s10 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -111,12 +113,14 @@ entry: define void @vst2_v2i16(<2 x i16> *%src, <4 x i16> *%dst) { ; CHECK-LABEL: vst2_v2i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrh r2, [r0] -; CHECK-NEXT: ldrh r3, [r0, #2] -; CHECK-NEXT: ldrh.w r12, [r0, #4] -; CHECK-NEXT: ldrh r0, [r0, #6] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 +; CHECK-NEXT: ldrh r3, [r0] +; CHECK-NEXT: ldrh r2, [r0, #4] +; CHECK-NEXT: vmov.32 q0[0], r3 +; CHECK-NEXT: ldrh.w r12, [r0, #6] +; CHECK-NEXT: ldrh r0, [r0, #2] +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: vmov.32 q0[2], r0 +; CHECK-NEXT: vmov.32 q0[3], r12 ; CHECK-NEXT: vstrh.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -193,11 +197,13 @@ define void @vst2_v2i8(<2 x i8> *%src, <4 x i8> *%dst) { ; CHECK-LABEL: vst2_v2i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: ldrb r2, [r0] -; CHECK-NEXT: ldrb r3, [r0, #1] -; CHECK-NEXT: ldrb.w r12, [r0, #2] +; CHECK-NEXT: ldrb r3, [r0, #2] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrb.w r12, [r0, #1] +; CHECK-NEXT: vmov.32 q0[1], r3 ; CHECK-NEXT: ldrb r0, [r0, #3] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 +; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: vmov.32 q0[3], r0 ; CHECK-NEXT: vstrb.32 q0, [r1] ; CHECK-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll index a3fe349..1ae74c1 100644 --- 
a/llvm/test/CodeGen/Thumb2/mve-vst3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll @@ -8,13 +8,14 @@ define void @vst3_v2i32(<2 x i32> *%src, <6 x i32> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: ldrd lr, r12, [r0] -; CHECK-NEXT: ldrd r3, r2, [r0, #8] +; CHECK-NEXT: ldm.w r0, {r2, r3, r12, lr} ; CHECK-NEXT: ldrd r4, r0, [r0, #16] -; CHECK-NEXT: vmov q1[2], q1[0], r3, lr +; CHECK-NEXT: vmov.32 q1[0], r2 +; CHECK-NEXT: vmov.32 q1[1], r3 ; CHECK-NEXT: vmov.32 q0[0], r4 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r12 +; CHECK-NEXT: vmov.32 q1[2], r12 ; CHECK-NEXT: vmov.32 q0[1], r0 +; CHECK-NEXT: vmov.32 q1[3], lr ; CHECK-NEXT: vmov.f32 s8, s7 ; CHECK-NEXT: vmov.f32 s10, s1 ; CHECK-NEXT: vmov r2, s8 @@ -301,16 +302,18 @@ define void @vst3_v2i16(<2 x i16> *%src, <6 x i16> *%dst) { ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: ldrh r2, [r0, #6] -; CHECK-NEXT: ldrh.w lr, [r0, #4] +; CHECK-NEXT: ldrh r3, [r0, #4] ; CHECK-NEXT: ldrh.w r12, [r0, #8] ; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: ldrh r3, [r0, #2] -; CHECK-NEXT: vmov q1[2], q1[0], r2, lr +; CHECK-NEXT: ldrh.w lr, [r0, #2] +; CHECK-NEXT: vmov.32 q1[0], r3 ; CHECK-NEXT: ldrh r4, [r0] +; CHECK-NEXT: vmov.32 q1[2], r2 ; CHECK-NEXT: ldrh r0, [r0, #10] ; CHECK-NEXT: vmov.16 q0[5], r0 ; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r4 +; CHECK-NEXT: vmov.32 q0[0], r4 +; CHECK-NEXT: vmov.32 q0[2], lr ; CHECK-NEXT: vmov.f32 s1, s4 ; CHECK-NEXT: vdup.32 q1, r12 ; CHECK-NEXT: vmov.f32 s3, s2 @@ -686,8 +689,9 @@ define void @vst3_v2i8(<2 x i8> *%src, <6 x i8> *%dst) { ; CHECK-NEXT: ldrb r2, [r0] ; CHECK-NEXT: movs r6, #0 ; CHECK-NEXT: ldrb r3, [r0, #1] +; CHECK-NEXT: vmov.32 q0[0], r2 ; CHECK-NEXT: ldrb.w r12, [r0, #2] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov.32 q0[2], r3 ; CHECK-NEXT: ldrb.w lr, [r0, #3] ; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: ldrb r5, [r0, #5] @@ -1457,21 +1461,23 @@ entry: define void @vst3_v4f16(<4 x half> *%src, <12 x half> *%dst) { ; CHECK-LABEL: vst3_v4f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: ldrd lr, r12, [r0] -; CHECK-NEXT: ldrd r3, r2, [r0, #8] -; CHECK-NEXT: ldrd r4, r0, [r0, #16] -; CHECK-NEXT: vmov q0[2], q0[0], r3, lr -; CHECK-NEXT: vmov q0[3], q0[1], r2, r12 -; CHECK-NEXT: vmov.32 q1[0], r4 +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: ldm.w r0, {r2, r3, r12, lr} +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: vmov.32 q0[3], lr ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmovx.f16 s12, s0 ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmov.16 q2[0], r3 ; CHECK-NEXT: vmov.16 q2[1], r2 +; CHECK-NEXT: ldrd r2, r0, [r0, #16] +; CHECK-NEXT: vmovx.f16 s0, s3 +; CHECK-NEXT: vmov.32 q1[0], r2 +; CHECK-NEXT: vmov.32 q1[1], r0 ; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s12, s0 ; CHECK-NEXT: vmov.16 q2[2], r0 ; CHECK-NEXT: vmov r0, s12 ; CHECK-NEXT: vmovx.f16 s12, s2 @@ -1480,7 +1486,6 @@ define void @vst3_v4f16(<4 x half> *%src, <12 x half> *%dst) { ; CHECK-NEXT: vmovx.f16 s12, s4 ; CHECK-NEXT: vmov.16 q2[4], r0 ; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s0, s3 ; CHECK-NEXT: vmov.16 q2[5], r0 ; CHECK-NEXT: vmov r0, s1 ; CHECK-NEXT: vmov.16 q2[6], r0 @@ -1500,7 +1505,7 @@ define void @vst3_v4f16(<4 x half> *%src, <12 x half> *%dst) { ; CHECK-NEXT: vmov r0, s9 ; CHECK-NEXT: vmov r2, s8 ; CHECK-NEXT: strd r2, r0, 
[r1, #16] -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r7, pc} entry: %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0 %l1 = load <4 x half>, <4 x half>* %s1, align 4 diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll index 0f089cb..bc54155 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll @@ -8,16 +8,18 @@ define void @vst4_v2i32(<2 x i32> *%src, <8 x i32> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: ldrd lr, r12, [r0] -; CHECK-NEXT: ldrd r3, r2, [r0, #8] +; CHECK-NEXT: ldm.w r0, {r2, r3, r12, lr} ; CHECK-NEXT: ldrd r4, r0, [r0, #16] -; CHECK-NEXT: vmov q1[2], q1[0], r0, r4 +; CHECK-NEXT: vmov.32 q1[0], r4 +; CHECK-NEXT: vmov.32 q1[2], r0 ; CHECK-NEXT: vmov.f64 d0, d2 ; CHECK-NEXT: vmov.f32 s1, s6 ; CHECK-NEXT: vmov.f32 s2, s4 ; CHECK-NEXT: vmov.f32 s3, s6 -; CHECK-NEXT: vmov q1[2], q1[0], r3, lr -; CHECK-NEXT: vmov q1[3], q1[1], r2, r12 +; CHECK-NEXT: vmov.32 q1[0], r2 +; CHECK-NEXT: vmov.32 q1[1], r3 +; CHECK-NEXT: vmov.32 q1[2], r12 +; CHECK-NEXT: vmov.32 q1[3], lr ; CHECK-NEXT: vmov.f64 d4, d2 ; CHECK-NEXT: vmov.f32 s9, s6 ; CHECK-NEXT: vmov.f32 s10, s0 @@ -207,22 +209,23 @@ define void @vst4_v2i16(<2 x i16> *%src, <8 x i16> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: ldrh r2, [r0] -; CHECK-NEXT: ldrh.w r12, [r0, #4] +; CHECK-NEXT: ldrh r4, [r0] +; CHECK-NEXT: ldrh.w lr, [r0, #4] ; CHECK-NEXT: ldrh r3, [r0, #8] -; CHECK-NEXT: ldrh.w lr, [r0, #6] -; CHECK-NEXT: ldrh r4, [r0, #10] +; CHECK-NEXT: vmov.32 q0[0], r4 +; CHECK-NEXT: ldrh.w r12, [r0, #6] +; CHECK-NEXT: ldrh r2, [r0, #10] ; CHECK-NEXT: ldrh r0, [r0, #2] -; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmov.16 q0[1], r12 +; CHECK-NEXT: vmov.32 q0[2], r0 +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: vmov.16 q0[0], r4 +; CHECK-NEXT: vmov.16 q0[1], lr ; CHECK-NEXT: vmov.16 q0[2], r3 ; CHECK-NEXT: vmov.16 q0[3], r3 ; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.16 q0[5], lr -; CHECK-NEXT: vmov.16 q0[6], r4 -; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: vmov.16 q0[5], r12 +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: vmov.16 q0[7], r2 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: pop {r4, pc} entry: @@ -373,8 +376,9 @@ define void @vst4_v2i8(<2 x i8> *%src, <8 x i8> *%dst) { ; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: ldrb r2, [r0] ; CHECK-NEXT: ldrb r3, [r0, #1] +; CHECK-NEXT: vmov.32 q0[0], r2 ; CHECK-NEXT: ldrb.w r12, [r0, #2] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov.32 q0[2], r3 ; CHECK-NEXT: ldrb.w lr, [r0, #3] ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: ldrb r4, [r0, #5] @@ -907,58 +911,61 @@ entry: define void @vst4_v4f16(<4 x half> *%src, <16 x half> *%dst) { ; CHECK-LABEL: vst4_v4f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: ldrd lr, r12, [r0] -; CHECK-NEXT: ldrd r3, r2, [r0, #8] -; CHECK-NEXT: ldrd r4, r0, [r0, #16] -; CHECK-NEXT: vmov q1[2], q1[0], r3, lr -; CHECK-NEXT: vmov q1[3], q1[1], r2, r12 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r4 -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r0 -; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: ldm.w r0, {r2, r3, r12, lr} +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: vmov.32 q0[3], lr +; 
CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmovx.f16 s12, s1 +; CHECK-NEXT: vmov r2, s3 ; CHECK-NEXT: vmov.16 q2[0], r3 ; CHECK-NEXT: vmov.16 q2[1], r2 -; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: ldrd r2, r0, [r0, #16] +; CHECK-NEXT: vmov.32 q1[0], r2 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov.32 q1[2], r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.32 q1[3], r0 +; CHECK-NEXT: vmov r0, s5 ; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmovx.f16 s12, s5 +; CHECK-NEXT: vmov r0, s7 ; CHECK-NEXT: vmov.16 q2[3], r0 ; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s7 +; CHECK-NEXT: vmovx.f16 s12, s3 ; CHECK-NEXT: vmov.16 q2[4], r0 ; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s1 +; CHECK-NEXT: vmovx.f16 s12, s5 ; CHECK-NEXT: vmov.16 q2[5], r0 ; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s3 +; CHECK-NEXT: vmovx.f16 s12, s7 ; CHECK-NEXT: vmov.16 q2[6], r0 ; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmovx.f16 s12, s0 ; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: vstrw.32 q2, [r1, #16] -; CHECK-NEXT: vmov r0, s6 ; CHECK-NEXT: vmov.16 q2[0], r2 -; CHECK-NEXT: vmovx.f16 s12, s4 ; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov r0, s4 ; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov r0, s6 ; CHECK-NEXT: vmov.16 q2[3], r0 ; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s4, s6 +; CHECK-NEXT: vmovx.f16 s0, s2 ; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s4, s0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmovx.f16 s0, s4 ; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s0, s2 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmovx.f16 s0, s6 ; CHECK-NEXT: vmov.16 q2[6], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vmov.16 q2[7], r0 ; CHECK-NEXT: vstrw.32 q2, [r1] -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r7, pc} entry: %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0 %l1 = load <4 x half>, <4 x half>* %s1, align 4 diff --git a/llvm/test/CodeGen/Thumb2/mve-widen-narrow.ll b/llvm/test/CodeGen/Thumb2/mve-widen-narrow.ll index 6fb2e44..6d7fb9e 100644 --- a/llvm/test/CodeGen/Thumb2/mve-widen-narrow.ll +++ b/llvm/test/CodeGen/Thumb2/mve-widen-narrow.ll @@ -276,36 +276,44 @@ define void @foo_int32_int8_both(<16 x i32>* %dest, <16 x i8>* readonly %src, i3 ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.s16 q1, [r1, #8] ; CHECK-NEXT: vmov.u16 r2, q1[4] -; CHECK-NEXT: vmov.u16 r3, q1[6] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov.32 q0[0], r2 ; CHECK-NEXT: vmov.u16 r2, q1[5] -; CHECK-NEXT: vmov.u16 r3, q1[7] -; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 -; CHECK-NEXT: vmov.u16 r2, q1[2] +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: vmov.u16 r2, q1[6] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: vmov.u16 r2, q1[7] +; CHECK-NEXT: vmov.32 q0[3], r2 ; CHECK-NEXT: vmovlb.u16 q2, q0 ; CHECK-NEXT: vldrb.s16 q0, [r1] ; CHECK-NEXT: vmov.u16 r1, q1[0] ; CHECK-NEXT: vstrw.32 q2, [r0, #48] -; CHECK-NEXT: vmov q2[2], q2[0], r2, r1 +; CHECK-NEXT: vmov.32 q2[0], r1 ; CHECK-NEXT: vmov.u16 r1, q1[1] -; CHECK-NEXT: vmov.u16 r2, q1[3] -; CHECK-NEXT: vmov q2[3], q2[1], r2, r1 +; CHECK-NEXT: vmov.32 q2[1], r1 +; CHECK-NEXT: vmov.u16 r1, q1[2] +; CHECK-NEXT: vmov.32 q2[2], r1 +; CHECK-NEXT: vmov.u16 r1, q1[3] +; CHECK-NEXT: vmov.32 q2[3], r1 ; CHECK-NEXT: vmov.u16 r1, q0[4] ; CHECK-NEXT: vmovlb.u16 q1, q2 -; CHECK-NEXT: vmov.u16 r2, q0[6] ; CHECK-NEXT: vstrw.32 q1, [r0, 
#32] -; CHECK-NEXT: vmov q1[2], q1[0], r2, r1 +; CHECK-NEXT: vmov.32 q1[0], r1 ; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: vmov.u16 r2, q0[7] -; CHECK-NEXT: vmov q1[3], q1[1], r2, r1 +; CHECK-NEXT: vmov.32 q1[1], r1 +; CHECK-NEXT: vmov.u16 r1, q0[6] +; CHECK-NEXT: vmov.32 q1[2], r1 +; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: vmov.32 q1[3], r1 ; CHECK-NEXT: vmov.u16 r1, q0[0] ; CHECK-NEXT: vmovlb.u16 q1, q1 -; CHECK-NEXT: vmov.u16 r2, q0[2] ; CHECK-NEXT: vstrw.32 q1, [r0, #16] -; CHECK-NEXT: vmov q1[2], q1[0], r2, r1 +; CHECK-NEXT: vmov.32 q1[0], r1 ; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: vmov.u16 r2, q0[3] -; CHECK-NEXT: vmov q1[3], q1[1], r2, r1 +; CHECK-NEXT: vmov.32 q1[1], r1 +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: vmov.32 q1[2], r1 +; CHECK-NEXT: vmov.u16 r1, q0[3] +; CHECK-NEXT: vmov.32 q1[3], r1 ; CHECK-NEXT: vmovlb.u16 q0, q1 ; CHECK-NEXT: vstrw.32 q0, [r0] ; CHECK-NEXT: bx lr