; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB9_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrh.u16 q2, [r0], #16
-; CHECK-NEXT: vcvtb.f32.f16 s7, s11
-; CHECK-NEXT: vmovx.f16 s13, s11
-; CHECK-NEXT: vcvtb.f32.f16 s6, s10
-; CHECK-NEXT: vmovx.f16 s14, s8
-; CHECK-NEXT: vcvtb.f32.f16 s5, s9
-; CHECK-NEXT: vcvtb.f32.f16 s4, s8
-; CHECK-NEXT: vmovx.f16 s8, s10
-; CHECK-NEXT: vmul.f32 q1, q1, q0
-; CHECK-NEXT: vcvtb.f16.f32 s12, s4
-; CHECK-NEXT: vcvtb.f32.f16 s19, s13
-; CHECK-NEXT: vmov r2, s12
-; CHECK-NEXT: vmovx.f16 s12, s9
-; CHECK-NEXT: vcvtb.f32.f16 s18, s8
-; CHECK-NEXT: vcvtb.f32.f16 s17, s12
-; CHECK-NEXT: vcvtb.f32.f16 s16, s14
-; CHECK-NEXT: vmul.f32 q2, q4, q0
-; CHECK-NEXT: vcvtb.f16.f32 s12, s8
-; CHECK-NEXT: vcvtb.f16.f32 s16, s5
-; CHECK-NEXT: vmov r3, s12
-; CHECK-NEXT: vmov.16 q3[0], r2
-; CHECK-NEXT: vmov r2, s16
-; CHECK-NEXT: vmov.16 q3[1], r3
-; CHECK-NEXT: vcvtb.f16.f32 s16, s9
-; CHECK-NEXT: vmov.16 q3[2], r2
-; CHECK-NEXT: vmov r2, s16
-; CHECK-NEXT: vcvtb.f16.f32 s16, s6
-; CHECK-NEXT: vmov.16 q3[3], r2
-; CHECK-NEXT: vmov r2, s16
-; CHECK-NEXT: vcvtb.f16.f32 s16, s10
-; CHECK-NEXT: vmov.16 q3[4], r2
-; CHECK-NEXT: vmov r2, s16
-; CHECK-NEXT: vcvtb.f16.f32 s4, s7
-; CHECK-NEXT: vmov.16 q3[5], r2
-; CHECK-NEXT: vmov r2, s4
-; CHECK-NEXT: vcvtb.f16.f32 s4, s11
-; CHECK-NEXT: vmov.16 q3[6], r2
-; CHECK-NEXT: vmov r2, s4
-; CHECK-NEXT: vmov.16 q3[7], r2
-; CHECK-NEXT: vstrb.8 q3, [r1], #16
+; CHECK-NEXT: vldrh.u16 q1, [r0], #16
+; CHECK-NEXT: vmovx.f16 s8, s7
+; CHECK-NEXT: vmovx.f16 s14, s6
+; CHECK-NEXT: vcvtb.f32.f16 s11, s8
+; CHECK-NEXT: vmovx.f16 s13, s5
+; CHECK-NEXT: vcvtb.f32.f16 s10, s14
+; CHECK-NEXT: vmovx.f16 s12, s4
+; CHECK-NEXT: vcvtb.f32.f16 s9, s13
+; CHECK-NEXT: vcvtb.f32.f16 s19, s7
+; CHECK-NEXT: vcvtb.f32.f16 s18, s6
+; CHECK-NEXT: vcvtb.f32.f16 s17, s5
+; CHECK-NEXT: vcvtb.f32.f16 s16, s4
+; CHECK-NEXT: vcvtb.f32.f16 s8, s12
+; CHECK-NEXT: vmul.f32 q1, q4, q0
+; CHECK-NEXT: vmul.f32 q2, q2, q0
+; CHECK-NEXT: vcvtb.f16.f32 q1, q1
+; CHECK-NEXT: vcvtt.f16.f32 q1, q2
+; CHECK-NEXT: vstrb.8 q1, [r1], #16
; CHECK-NEXT: le lr, .LBB9_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB10_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrh.u16 q2, [r0]
-; CHECK-NEXT: vcvtb.f32.f16 s7, s11
-; CHECK-NEXT: vmovx.f16 s13, s11
-; CHECK-NEXT: vcvtb.f32.f16 s6, s10
-; CHECK-NEXT: vmovx.f16 s14, s8
-; CHECK-NEXT: vcvtb.f32.f16 s5, s9
-; CHECK-NEXT: vcvtb.f32.f16 s4, s8
-; CHECK-NEXT: vmovx.f16 s8, s10
-; CHECK-NEXT: vmul.f32 q1, q1, q0
-; CHECK-NEXT: vcvtb.f16.f32 s12, s4
-; CHECK-NEXT: vcvtb.f32.f16 s19, s13
-; CHECK-NEXT: vmov r2, s12
-; CHECK-NEXT: vmovx.f16 s12, s9
-; CHECK-NEXT: vcvtb.f32.f16 s18, s8
-; CHECK-NEXT: vcvtb.f32.f16 s17, s12
-; CHECK-NEXT: vcvtb.f32.f16 s16, s14
-; CHECK-NEXT: vmul.f32 q2, q4, q0
-; CHECK-NEXT: vcvtb.f16.f32 s12, s8
-; CHECK-NEXT: vcvtb.f16.f32 s16, s5
-; CHECK-NEXT: vmov r3, s12
-; CHECK-NEXT: vmov.16 q3[0], r2
-; CHECK-NEXT: vmov r2, s16
-; CHECK-NEXT: vmov.16 q3[1], r3
-; CHECK-NEXT: vcvtb.f16.f32 s16, s9
-; CHECK-NEXT: vmov.16 q3[2], r2
-; CHECK-NEXT: vmov r2, s16
-; CHECK-NEXT: vcvtb.f16.f32 s16, s6
-; CHECK-NEXT: vmov.16 q3[3], r2
-; CHECK-NEXT: vmov r2, s16
-; CHECK-NEXT: vcvtb.f16.f32 s16, s10
-; CHECK-NEXT: vmov.16 q3[4], r2
-; CHECK-NEXT: vmov r2, s16
-; CHECK-NEXT: vcvtb.f16.f32 s4, s7
-; CHECK-NEXT: vmov.16 q3[5], r2
-; CHECK-NEXT: vmov r2, s4
-; CHECK-NEXT: vcvtb.f16.f32 s4, s11
-; CHECK-NEXT: vmov.16 q3[6], r2
-; CHECK-NEXT: vmov r2, s4
-; CHECK-NEXT: vldrh.u16 q2, [r0, #16]!
-; CHECK-NEXT: vmov.16 q3[7], r2
-; CHECK-NEXT: vstrh.16 q3, [r1]
-; CHECK-NEXT: vmovx.f16 s12, s11
-; CHECK-NEXT: vmovx.f16 s14, s10
+; CHECK-NEXT: vldrh.u16 q1, [r0]
+; CHECK-NEXT: vmovx.f16 s8, s7
+; CHECK-NEXT: vmovx.f16 s14, s6
+; CHECK-NEXT: vcvtb.f32.f16 s11, s8
+; CHECK-NEXT: vmovx.f16 s13, s5
+; CHECK-NEXT: vcvtb.f32.f16 s10, s14
+; CHECK-NEXT: vmovx.f16 s12, s4
+; CHECK-NEXT: vcvtb.f32.f16 s9, s13
+; CHECK-NEXT: vcvtb.f32.f16 s19, s7
+; CHECK-NEXT: vcvtb.f32.f16 s18, s6
+; CHECK-NEXT: vcvtb.f32.f16 s17, s5
+; CHECK-NEXT: vcvtb.f32.f16 s16, s4
+; CHECK-NEXT: vcvtb.f32.f16 s8, s12
+; CHECK-NEXT: vmul.f32 q1, q4, q0
+; CHECK-NEXT: vmul.f32 q2, q2, q0
+; CHECK-NEXT: vcvtb.f16.f32 q1, q1
+; CHECK-NEXT: vcvtt.f16.f32 q1, q2
+; CHECK-NEXT: vstrh.16 q1, [r1]
+; CHECK-NEXT: vldrh.u16 q1, [r0, #16]!
+; CHECK-NEXT: vmovx.f16 s12, s7
+; CHECK-NEXT: vmovx.f16 s14, s6
; CHECK-NEXT: vcvtb.f32.f16 s19, s12
-; CHECK-NEXT: vmovx.f16 s4, s9
+; CHECK-NEXT: vmovx.f16 s8, s5
; CHECK-NEXT: vcvtb.f32.f16 s18, s14
-; CHECK-NEXT: vmovx.f16 s6, s8
-; CHECK-NEXT: vcvtb.f32.f16 s17, s4
-; CHECK-NEXT: vcvtb.f32.f16 s16, s6
+; CHECK-NEXT: vmovx.f16 s10, s4
+; CHECK-NEXT: vcvtb.f32.f16 s17, s8
+; CHECK-NEXT: vcvtb.f32.f16 s16, s10
+; CHECK-NEXT: vcvtb.f32.f16 s11, s7
+; CHECK-NEXT: vcvtb.f32.f16 s10, s6
+; CHECK-NEXT: vcvtb.f32.f16 s9, s5
+; CHECK-NEXT: vcvtb.f32.f16 s8, s4
; CHECK-NEXT: vmul.f32 q1, q4, q0
-; CHECK-NEXT: vcvtb.f16.f32 s12, s4
-; CHECK-NEXT: vmov r2, s12
-; CHECK-NEXT: vcvtb.f32.f16 s15, s11
-; CHECK-NEXT: vcvtb.f32.f16 s14, s10
-; CHECK-NEXT: vcvtb.f32.f16 s13, s9
-; CHECK-NEXT: vcvtb.f32.f16 s12, s8
-; CHECK-NEXT: vmul.f32 q3, q3, q0
-; CHECK-NEXT: vcvtb.f16.f32 s8, s12
-; CHECK-NEXT: vcvtb.f16.f32 s16, s13
-; CHECK-NEXT: vmov r3, s8
-; CHECK-NEXT: vmov.16 q2[0], r3
-; CHECK-NEXT: vmov.16 q2[1], r2
-; CHECK-NEXT: vmov r2, s16
-; CHECK-NEXT: vcvtb.f16.f32 s16, s5
-; CHECK-NEXT: vmov.16 q2[2], r2
-; CHECK-NEXT: vmov r2, s16
-; CHECK-NEXT: vcvtb.f16.f32 s16, s14
-; CHECK-NEXT: vmov.16 q2[3], r2
-; CHECK-NEXT: vmov r2, s16
-; CHECK-NEXT: vcvtb.f16.f32 s16, s6
-; CHECK-NEXT: vmov.16 q2[4], r2
-; CHECK-NEXT: vmov r2, s16
-; CHECK-NEXT: vcvtb.f16.f32 s12, s15
-; CHECK-NEXT: vmov.16 q2[5], r2
-; CHECK-NEXT: vmov r2, s12
-; CHECK-NEXT: vcvtb.f16.f32 s4, s7
-; CHECK-NEXT: vmov.16 q2[6], r2
-; CHECK-NEXT: vmov r2, s4
-; CHECK-NEXT: vmov.16 q2[7], r2
+; CHECK-NEXT: vmul.f32 q2, q2, q0
+; CHECK-NEXT: vcvtb.f16.f32 q2, q2
+; CHECK-NEXT: vcvtt.f16.f32 q2, q1
; CHECK-NEXT: vstrb.8 q2, [r1, #16]!
; CHECK-NEXT: le lr, .LBB10_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
define arm_aapcs_vfpcc <8 x half> @shuffle_trunc1(<4 x float> %src1, <4 x float> %src2) {
; CHECK-LABEL: shuffle_trunc1:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov q2, q0
-; CHECK-NEXT: vcvtb.f16.f32 s0, s8
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vcvtb.f16.f32 s0, s4
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: vmov.16 q0[0], r0
-; CHECK-NEXT: vcvtb.f16.f32 s12, s9
-; CHECK-NEXT: vmov.16 q0[1], r1
-; CHECK-NEXT: vmov r0, s12
-; CHECK-NEXT: vcvtb.f16.f32 s12, s5
-; CHECK-NEXT: vmov.16 q0[2], r0
-; CHECK-NEXT: vmov r0, s12
-; CHECK-NEXT: vcvtb.f16.f32 s12, s10
-; CHECK-NEXT: vmov.16 q0[3], r0
-; CHECK-NEXT: vmov r0, s12
-; CHECK-NEXT: vcvtb.f16.f32 s12, s6
-; CHECK-NEXT: vmov.16 q0[4], r0
-; CHECK-NEXT: vmov r0, s12
-; CHECK-NEXT: vcvtb.f16.f32 s8, s11
-; CHECK-NEXT: vmov.16 q0[5], r0
-; CHECK-NEXT: vmov r0, s8
-; CHECK-NEXT: vcvtb.f16.f32 s4, s7
-; CHECK-NEXT: vmov.16 q0[6], r0
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov.16 q0[7], r0
+; CHECK-NEXT: vcvtb.f16.f32 q0, q0
+; CHECK-NEXT: vcvtt.f16.f32 q0, q1
; CHECK-NEXT: bx lr
entry:
%strided.vec = shufflevector <4 x float> %src1, <4 x float> %src2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
define arm_aapcs_vfpcc <8 x half> @shuffle_trunc2(<4 x float> %src1, <4 x float> %src2) {
; CHECK-LABEL: shuffle_trunc2:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vcvtb.f16.f32 s8, s4
-; CHECK-NEXT: vmov r0, s8
-; CHECK-NEXT: vcvtb.f16.f32 s8, s0
-; CHECK-NEXT: vmov r1, s8
-; CHECK-NEXT: vmov.16 q2[0], r0
-; CHECK-NEXT: vcvtb.f16.f32 s12, s5
-; CHECK-NEXT: vmov.16 q2[1], r1
-; CHECK-NEXT: vmov r0, s12
-; CHECK-NEXT: vcvtb.f16.f32 s12, s1
-; CHECK-NEXT: vmov.16 q2[2], r0
-; CHECK-NEXT: vmov r0, s12
-; CHECK-NEXT: vcvtb.f16.f32 s12, s6
-; CHECK-NEXT: vmov.16 q2[3], r0
-; CHECK-NEXT: vmov r0, s12
-; CHECK-NEXT: vcvtb.f16.f32 s12, s2
-; CHECK-NEXT: vmov.16 q2[4], r0
-; CHECK-NEXT: vmov r0, s12
-; CHECK-NEXT: vcvtb.f16.f32 s4, s7
-; CHECK-NEXT: vmov.16 q2[5], r0
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vcvtb.f16.f32 s0, s3
-; CHECK-NEXT: vmov.16 q2[6], r0
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vmov.16 q2[7], r0
-; CHECK-NEXT: vmov q0, q2
+; CHECK-NEXT: vcvtb.f16.f32 q1, q1
+; CHECK-NEXT: vcvtt.f16.f32 q1, q0
+; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
%strided.vec = shufflevector <4 x float> %src1, <4 x float> %src2, <8 x i32> <i32 4, i32 0, i32 5, i32 1, i32 6, i32 2, i32 7, i32 3>
define arm_aapcs_vfpcc <16 x half> @shuffle_trunc3(<8 x float> %src1, <8 x float> %src2) {
; CHECK-LABEL: shuffle_trunc3:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9, d10}
-; CHECK-NEXT: vpush {d8, d9, d10}
-; CHECK-NEXT: vmov q4, q0
-; CHECK-NEXT: vcvtb.f16.f32 s0, s16
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vcvtb.f16.f32 s0, s8
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: vmov.16 q0[0], r0
-; CHECK-NEXT: vcvtb.f16.f32 s20, s17
-; CHECK-NEXT: vmov.16 q0[1], r1
-; CHECK-NEXT: vmov r0, s20
-; CHECK-NEXT: vcvtb.f16.f32 s20, s9
-; CHECK-NEXT: vmov.16 q0[2], r0
-; CHECK-NEXT: vmov r0, s20
-; CHECK-NEXT: vcvtb.f16.f32 s20, s18
-; CHECK-NEXT: vmov.16 q0[3], r0
-; CHECK-NEXT: vmov r0, s20
-; CHECK-NEXT: vcvtb.f16.f32 s20, s10
-; CHECK-NEXT: vmov.16 q0[4], r0
-; CHECK-NEXT: vmov r0, s20
-; CHECK-NEXT: vcvtb.f16.f32 s16, s19
-; CHECK-NEXT: vmov.16 q0[5], r0
-; CHECK-NEXT: vmov r0, s16
-; CHECK-NEXT: vcvtb.f16.f32 s8, s11
-; CHECK-NEXT: vmov.16 q0[6], r0
-; CHECK-NEXT: vmov r0, s8
-; CHECK-NEXT: vcvtb.f16.f32 s8, s4
-; CHECK-NEXT: vmov.16 q0[7], r0
-; CHECK-NEXT: vmov r0, s8
-; CHECK-NEXT: vcvtb.f16.f32 s8, s12
-; CHECK-NEXT: vmov r1, s8
-; CHECK-NEXT: vmov.16 q2[0], r0
-; CHECK-NEXT: vcvtb.f16.f32 s16, s5
-; CHECK-NEXT: vmov.16 q2[1], r1
-; CHECK-NEXT: vmov r0, s16
-; CHECK-NEXT: vcvtb.f16.f32 s16, s13
-; CHECK-NEXT: vmov.16 q2[2], r0
-; CHECK-NEXT: vmov r0, s16
-; CHECK-NEXT: vcvtb.f16.f32 s16, s6
-; CHECK-NEXT: vmov.16 q2[3], r0
-; CHECK-NEXT: vmov r0, s16
-; CHECK-NEXT: vcvtb.f16.f32 s16, s14
-; CHECK-NEXT: vmov.16 q2[4], r0
-; CHECK-NEXT: vmov r0, s16
-; CHECK-NEXT: vcvtb.f16.f32 s4, s7
-; CHECK-NEXT: vmov.16 q2[5], r0
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vcvtb.f16.f32 s4, s15
-; CHECK-NEXT: vmov.16 q2[6], r0
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov.16 q2[7], r0
-; CHECK-NEXT: vmov q1, q2
-; CHECK-NEXT: vpop {d8, d9, d10}
+; CHECK-NEXT: vcvtb.f16.f32 q0, q0
+; CHECK-NEXT: vcvtb.f16.f32 q1, q1
+; CHECK-NEXT: vcvtt.f16.f32 q0, q2
+; CHECK-NEXT: vcvtt.f16.f32 q1, q3
; CHECK-NEXT: bx lr
entry:
%strided.vec = shufflevector <8 x float> %src1, <8 x float> %src2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
define arm_aapcs_vfpcc <16 x half> @shuffle_trunc4(<8 x float> %src1, <8 x float> %src2) {
; CHECK-LABEL: shuffle_trunc4:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9, d10}
-; CHECK-NEXT: vpush {d8, d9, d10}
-; CHECK-NEXT: vmov q4, q0
-; CHECK-NEXT: vcvtb.f16.f32 s0, s8
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vcvtb.f16.f32 s0, s16
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: vmov.16 q0[0], r0
-; CHECK-NEXT: vcvtb.f16.f32 s20, s9
-; CHECK-NEXT: vmov.16 q0[1], r1
-; CHECK-NEXT: vmov r0, s20
-; CHECK-NEXT: vcvtb.f16.f32 s20, s17
-; CHECK-NEXT: vmov.16 q0[2], r0
-; CHECK-NEXT: vmov r0, s20
-; CHECK-NEXT: vcvtb.f16.f32 s20, s10
-; CHECK-NEXT: vmov.16 q0[3], r0
-; CHECK-NEXT: vmov r0, s20
-; CHECK-NEXT: vcvtb.f16.f32 s20, s18
-; CHECK-NEXT: vmov.16 q0[4], r0
-; CHECK-NEXT: vmov r0, s20
-; CHECK-NEXT: vcvtb.f16.f32 s8, s11
-; CHECK-NEXT: vmov.16 q0[5], r0
-; CHECK-NEXT: vmov r0, s8
-; CHECK-NEXT: vcvtb.f16.f32 s8, s19
-; CHECK-NEXT: vmov.16 q0[6], r0
-; CHECK-NEXT: vmov r0, s8
-; CHECK-NEXT: vcvtb.f16.f32 s8, s12
-; CHECK-NEXT: vmov.16 q0[7], r0
-; CHECK-NEXT: vmov r0, s8
-; CHECK-NEXT: vcvtb.f16.f32 s8, s4
-; CHECK-NEXT: vmov r1, s8
-; CHECK-NEXT: vmov.16 q2[0], r0
-; CHECK-NEXT: vcvtb.f16.f32 s16, s13
-; CHECK-NEXT: vmov.16 q2[1], r1
-; CHECK-NEXT: vmov r0, s16
-; CHECK-NEXT: vcvtb.f16.f32 s16, s5
-; CHECK-NEXT: vmov.16 q2[2], r0
-; CHECK-NEXT: vmov r0, s16
-; CHECK-NEXT: vcvtb.f16.f32 s16, s14
-; CHECK-NEXT: vmov.16 q2[3], r0
-; CHECK-NEXT: vmov r0, s16
-; CHECK-NEXT: vcvtb.f16.f32 s16, s6
-; CHECK-NEXT: vmov.16 q2[4], r0
-; CHECK-NEXT: vmov r0, s16
-; CHECK-NEXT: vcvtb.f16.f32 s12, s15
-; CHECK-NEXT: vmov.16 q2[5], r0
-; CHECK-NEXT: vmov r0, s12
-; CHECK-NEXT: vcvtb.f16.f32 s4, s7
-; CHECK-NEXT: vmov.16 q2[6], r0
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov.16 q2[7], r0
-; CHECK-NEXT: vmov q1, q2
-; CHECK-NEXT: vpop {d8, d9, d10}
+; CHECK-NEXT: vcvtb.f16.f32 q2, q2
+; CHECK-NEXT: vcvtb.f16.f32 q3, q3
+; CHECK-NEXT: vcvtt.f16.f32 q2, q0
+; CHECK-NEXT: vcvtt.f16.f32 q3, q1
+; CHECK-NEXT: vmov q0, q2
+; CHECK-NEXT: vmov q1, q3
; CHECK-NEXT: bx lr
entry:
%strided.vec = shufflevector <8 x float> %src1, <8 x float> %src2, <16 x i32> <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3, i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 7>
define arm_aapcs_vfpcc <8 x half> @shuffle_trunc5(<4 x float> %src1, <4 x float> %src2) {
; CHECK-LABEL: shuffle_trunc5:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov q2, q0
-; CHECK-NEXT: vcvtb.f16.f32 s0, s8
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vcvtb.f16.f32 s0, s4
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: vmov.16 q0[0], r0
-; CHECK-NEXT: vcvtb.f16.f32 s12, s9
-; CHECK-NEXT: vmov.16 q0[1], r1
-; CHECK-NEXT: vmov r0, s12
-; CHECK-NEXT: vcvtb.f16.f32 s12, s5
-; CHECK-NEXT: vmov.16 q0[2], r0
-; CHECK-NEXT: vmov r0, s12
-; CHECK-NEXT: vcvtb.f16.f32 s12, s10
-; CHECK-NEXT: vmov.16 q0[3], r0
-; CHECK-NEXT: vmov r0, s12
-; CHECK-NEXT: vcvtb.f16.f32 s12, s6
-; CHECK-NEXT: vmov.16 q0[4], r0
-; CHECK-NEXT: vmov r0, s12
-; CHECK-NEXT: vcvtb.f16.f32 s8, s11
-; CHECK-NEXT: vmov.16 q0[5], r0
-; CHECK-NEXT: vmov r0, s8
-; CHECK-NEXT: vcvtb.f16.f32 s4, s7
-; CHECK-NEXT: vmov.16 q0[6], r0
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov.16 q0[7], r0
+; CHECK-NEXT: vcvtb.f16.f32 q0, q0
+; CHECK-NEXT: vcvtt.f16.f32 q0, q1
; CHECK-NEXT: bx lr
entry:
%out1 = fptrunc <4 x float> %src1 to <4 x half>
define arm_aapcs_vfpcc <8 x half> @shuffle_trunc6(<4 x float> %src1, <4 x float> %src2) {
; CHECK-LABEL: shuffle_trunc6:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vcvtb.f16.f32 s8, s4
-; CHECK-NEXT: vmov r0, s8
-; CHECK-NEXT: vcvtb.f16.f32 s8, s0
-; CHECK-NEXT: vmov r1, s8
-; CHECK-NEXT: vmov.16 q2[0], r0
-; CHECK-NEXT: vcvtb.f16.f32 s12, s5
-; CHECK-NEXT: vmov.16 q2[1], r1
-; CHECK-NEXT: vmov r0, s12
-; CHECK-NEXT: vcvtb.f16.f32 s12, s1
-; CHECK-NEXT: vmov.16 q2[2], r0
-; CHECK-NEXT: vmov r0, s12
-; CHECK-NEXT: vcvtb.f16.f32 s12, s6
-; CHECK-NEXT: vmov.16 q2[3], r0
-; CHECK-NEXT: vmov r0, s12
-; CHECK-NEXT: vcvtb.f16.f32 s12, s2
-; CHECK-NEXT: vmov.16 q2[4], r0
-; CHECK-NEXT: vmov r0, s12
-; CHECK-NEXT: vcvtb.f16.f32 s4, s7
-; CHECK-NEXT: vmov.16 q2[5], r0
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vcvtb.f16.f32 s0, s3
-; CHECK-NEXT: vmov.16 q2[6], r0
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vmov.16 q2[7], r0
-; CHECK-NEXT: vmov q0, q2
+; CHECK-NEXT: vcvtb.f16.f32 q1, q1
+; CHECK-NEXT: vcvtt.f16.f32 q1, q0
+; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
%out1 = fptrunc <4 x float> %src1 to <4 x half>
define arm_aapcs_vfpcc <16 x half> @shuffle_trunc7(<8 x float> %src1, <8 x float> %src2) {
; CHECK-LABEL: shuffle_trunc7:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9, d10}
-; CHECK-NEXT: vpush {d8, d9, d10}
-; CHECK-NEXT: vmov q4, q0
-; CHECK-NEXT: vcvtb.f16.f32 s0, s16
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vcvtb.f16.f32 s0, s8
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: vmov.16 q0[0], r0
-; CHECK-NEXT: vcvtb.f16.f32 s20, s17
-; CHECK-NEXT: vmov.16 q0[1], r1
-; CHECK-NEXT: vmov r0, s20
-; CHECK-NEXT: vcvtb.f16.f32 s20, s9
-; CHECK-NEXT: vmov.16 q0[2], r0
-; CHECK-NEXT: vmov r0, s20
-; CHECK-NEXT: vcvtb.f16.f32 s20, s18
-; CHECK-NEXT: vmov.16 q0[3], r0
-; CHECK-NEXT: vmov r0, s20
-; CHECK-NEXT: vcvtb.f16.f32 s20, s10
-; CHECK-NEXT: vmov.16 q0[4], r0
-; CHECK-NEXT: vmov r0, s20
-; CHECK-NEXT: vcvtb.f16.f32 s16, s19
-; CHECK-NEXT: vmov.16 q0[5], r0
-; CHECK-NEXT: vmov r0, s16
-; CHECK-NEXT: vcvtb.f16.f32 s8, s11
-; CHECK-NEXT: vmov.16 q0[6], r0
-; CHECK-NEXT: vmov r0, s8
-; CHECK-NEXT: vcvtb.f16.f32 s8, s4
-; CHECK-NEXT: vmov.16 q0[7], r0
-; CHECK-NEXT: vmov r0, s8
-; CHECK-NEXT: vcvtb.f16.f32 s8, s12
-; CHECK-NEXT: vmov r1, s8
-; CHECK-NEXT: vmov.16 q2[0], r0
-; CHECK-NEXT: vcvtb.f16.f32 s16, s5
-; CHECK-NEXT: vmov.16 q2[1], r1
-; CHECK-NEXT: vmov r0, s16
-; CHECK-NEXT: vcvtb.f16.f32 s16, s13
-; CHECK-NEXT: vmov.16 q2[2], r0
-; CHECK-NEXT: vmov r0, s16
-; CHECK-NEXT: vcvtb.f16.f32 s16, s6
-; CHECK-NEXT: vmov.16 q2[3], r0
-; CHECK-NEXT: vmov r0, s16
-; CHECK-NEXT: vcvtb.f16.f32 s16, s14
-; CHECK-NEXT: vmov.16 q2[4], r0
-; CHECK-NEXT: vmov r0, s16
-; CHECK-NEXT: vcvtb.f16.f32 s4, s7
-; CHECK-NEXT: vmov.16 q2[5], r0
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vcvtb.f16.f32 s4, s15
-; CHECK-NEXT: vmov.16 q2[6], r0
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov.16 q2[7], r0
-; CHECK-NEXT: vmov q1, q2
-; CHECK-NEXT: vpop {d8, d9, d10}
+; CHECK-NEXT: vcvtb.f16.f32 q0, q0
+; CHECK-NEXT: vcvtb.f16.f32 q1, q1
+; CHECK-NEXT: vcvtt.f16.f32 q0, q2
+; CHECK-NEXT: vcvtt.f16.f32 q1, q3
; CHECK-NEXT: bx lr
entry:
%out1 = fptrunc <8 x float> %src1 to <8 x half>
define arm_aapcs_vfpcc <16 x half> @shuffle_trunc8(<8 x float> %src1, <8 x float> %src2) {
; CHECK-LABEL: shuffle_trunc8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9, d10}
-; CHECK-NEXT: vpush {d8, d9, d10}
-; CHECK-NEXT: vmov q4, q0
-; CHECK-NEXT: vcvtb.f16.f32 s0, s8
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vcvtb.f16.f32 s0, s16
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: vmov.16 q0[0], r0
-; CHECK-NEXT: vcvtb.f16.f32 s20, s9
-; CHECK-NEXT: vmov.16 q0[1], r1
-; CHECK-NEXT: vmov r0, s20
-; CHECK-NEXT: vcvtb.f16.f32 s20, s17
-; CHECK-NEXT: vmov.16 q0[2], r0
-; CHECK-NEXT: vmov r0, s20
-; CHECK-NEXT: vcvtb.f16.f32 s20, s10
-; CHECK-NEXT: vmov.16 q0[3], r0
-; CHECK-NEXT: vmov r0, s20
-; CHECK-NEXT: vcvtb.f16.f32 s20, s18
-; CHECK-NEXT: vmov.16 q0[4], r0
-; CHECK-NEXT: vmov r0, s20
-; CHECK-NEXT: vcvtb.f16.f32 s8, s11
-; CHECK-NEXT: vmov.16 q0[5], r0
-; CHECK-NEXT: vmov r0, s8
-; CHECK-NEXT: vcvtb.f16.f32 s8, s19
-; CHECK-NEXT: vmov.16 q0[6], r0
-; CHECK-NEXT: vmov r0, s8
-; CHECK-NEXT: vcvtb.f16.f32 s8, s12
-; CHECK-NEXT: vmov.16 q0[7], r0
-; CHECK-NEXT: vmov r0, s8
-; CHECK-NEXT: vcvtb.f16.f32 s8, s4
-; CHECK-NEXT: vmov r1, s8
-; CHECK-NEXT: vmov.16 q2[0], r0
-; CHECK-NEXT: vcvtb.f16.f32 s16, s13
-; CHECK-NEXT: vmov.16 q2[1], r1
-; CHECK-NEXT: vmov r0, s16
-; CHECK-NEXT: vcvtb.f16.f32 s16, s5
-; CHECK-NEXT: vmov.16 q2[2], r0
-; CHECK-NEXT: vmov r0, s16
-; CHECK-NEXT: vcvtb.f16.f32 s16, s14
-; CHECK-NEXT: vmov.16 q2[3], r0
-; CHECK-NEXT: vmov r0, s16
-; CHECK-NEXT: vcvtb.f16.f32 s16, s6
-; CHECK-NEXT: vmov.16 q2[4], r0
-; CHECK-NEXT: vmov r0, s16
-; CHECK-NEXT: vcvtb.f16.f32 s12, s15
-; CHECK-NEXT: vmov.16 q2[5], r0
-; CHECK-NEXT: vmov r0, s12
-; CHECK-NEXT: vcvtb.f16.f32 s4, s7
-; CHECK-NEXT: vmov.16 q2[6], r0
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov.16 q2[7], r0
-; CHECK-NEXT: vmov q1, q2
-; CHECK-NEXT: vpop {d8, d9, d10}
+; CHECK-NEXT: vcvtb.f16.f32 q2, q2
+; CHECK-NEXT: vcvtb.f16.f32 q3, q3
+; CHECK-NEXT: vcvtt.f16.f32 q2, q0
+; CHECK-NEXT: vcvtt.f16.f32 q3, q1
+; CHECK-NEXT: vmov q0, q2
+; CHECK-NEXT: vmov q1, q3
; CHECK-NEXT: bx lr
entry:
%out1 = fptrunc <8 x float> %src1 to <8 x half>
define arm_aapcs_vfpcc void @store_shuffletrunc_8(<8 x half>* %src, <4 x float> %val1, <4 x float> %val2) {
; CHECK-LABEL: store_shuffletrunc_8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vcvtb.f16.f32 s8, s0
-; CHECK-NEXT: vmov r1, s8
-; CHECK-NEXT: vcvtb.f16.f32 s8, s4
-; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: vmov.16 q2[0], r1
-; CHECK-NEXT: vcvtb.f16.f32 s12, s1
-; CHECK-NEXT: vmov.16 q2[1], r2
-; CHECK-NEXT: vmov r1, s12
-; CHECK-NEXT: vcvtb.f16.f32 s12, s5
-; CHECK-NEXT: vmov.16 q2[2], r1
-; CHECK-NEXT: vmov r1, s12
-; CHECK-NEXT: vcvtb.f16.f32 s12, s2
-; CHECK-NEXT: vmov.16 q2[3], r1
-; CHECK-NEXT: vmov r1, s12
-; CHECK-NEXT: vcvtb.f16.f32 s12, s6
-; CHECK-NEXT: vmov.16 q2[4], r1
-; CHECK-NEXT: vmov r1, s12
-; CHECK-NEXT: vcvtb.f16.f32 s0, s3
-; CHECK-NEXT: vmov.16 q2[5], r1
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: vcvtb.f16.f32 s0, s7
-; CHECK-NEXT: vmov.16 q2[6], r1
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: vmov.16 q2[7], r1
-; CHECK-NEXT: vstrw.32 q2, [r0]
+; CHECK-NEXT: vcvtb.f16.f32 q0, q0
+; CHECK-NEXT: vcvtt.f16.f32 q0, q1
+; CHECK-NEXT: vstrw.32 q0, [r0]
; CHECK-NEXT: bx lr
entry:
%strided.vec = shufflevector <4 x float> %val1, <4 x float> %val2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
define arm_aapcs_vfpcc void @store_shuffletrunc_16(<16 x half>* %src, <8 x float> %val1, <8 x float> %val2) {
; CHECK-LABEL: store_shuffletrunc_16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9, d10}
-; CHECK-NEXT: vpush {d8, d9, d10}
-; CHECK-NEXT: vcvtb.f16.f32 s16, s4
-; CHECK-NEXT: vmov r1, s16
-; CHECK-NEXT: vcvtb.f16.f32 s16, s12
-; CHECK-NEXT: vmov r2, s16
-; CHECK-NEXT: vmov.16 q4[0], r1
-; CHECK-NEXT: vcvtb.f16.f32 s20, s5
-; CHECK-NEXT: vmov.16 q4[1], r2
-; CHECK-NEXT: vmov r1, s20
-; CHECK-NEXT: vcvtb.f16.f32 s20, s13
-; CHECK-NEXT: vmov.16 q4[2], r1
-; CHECK-NEXT: vmov r1, s20
-; CHECK-NEXT: vcvtb.f16.f32 s20, s6
-; CHECK-NEXT: vmov.16 q4[3], r1
-; CHECK-NEXT: vmov r1, s20
-; CHECK-NEXT: vcvtb.f16.f32 s20, s14
-; CHECK-NEXT: vmov.16 q4[4], r1
-; CHECK-NEXT: vmov r1, s20
-; CHECK-NEXT: vcvtb.f16.f32 s4, s7
-; CHECK-NEXT: vmov.16 q4[5], r1
-; CHECK-NEXT: vmov r1, s4
-; CHECK-NEXT: vcvtb.f16.f32 s4, s15
-; CHECK-NEXT: vmov.16 q4[6], r1
-; CHECK-NEXT: vmov r1, s4
-; CHECK-NEXT: vmov.16 q4[7], r1
-; CHECK-NEXT: vstrw.32 q4, [r0, #16]
-; CHECK-NEXT: vcvtb.f16.f32 s4, s8
-; CHECK-NEXT: vmov r1, s4
-; CHECK-NEXT: vcvtb.f16.f32 s4, s0
-; CHECK-NEXT: vmov r2, s4
-; CHECK-NEXT: vcvtb.f16.f32 s12, s1
-; CHECK-NEXT: vmov.16 q1[0], r2
-; CHECK-NEXT: vmov.16 q1[1], r1
-; CHECK-NEXT: vmov r1, s12
-; CHECK-NEXT: vcvtb.f16.f32 s12, s9
-; CHECK-NEXT: vmov.16 q1[2], r1
-; CHECK-NEXT: vmov r1, s12
-; CHECK-NEXT: vcvtb.f16.f32 s12, s2
-; CHECK-NEXT: vmov.16 q1[3], r1
-; CHECK-NEXT: vmov r1, s12
-; CHECK-NEXT: vcvtb.f16.f32 s12, s10
-; CHECK-NEXT: vmov.16 q1[4], r1
-; CHECK-NEXT: vmov r1, s12
-; CHECK-NEXT: vcvtb.f16.f32 s0, s3
-; CHECK-NEXT: vmov.16 q1[5], r1
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: vcvtb.f16.f32 s0, s11
-; CHECK-NEXT: vmov.16 q1[6], r1
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: vmov.16 q1[7], r1
-; CHECK-NEXT: vstrw.32 q1, [r0]
-; CHECK-NEXT: vpop {d8, d9, d10}
+; CHECK-NEXT: vcvtb.f16.f32 q1, q1
+; CHECK-NEXT: vcvtb.f16.f32 q0, q0
+; CHECK-NEXT: vcvtt.f16.f32 q1, q3
+; CHECK-NEXT: vcvtt.f16.f32 q0, q2
+; CHECK-NEXT: vstrw.32 q1, [r0, #16]
+; CHECK-NEXT: vstrw.32 q0, [r0]
; CHECK-NEXT: bx lr
entry:
%strided.vec = shufflevector <8 x float> %val1, <8 x float> %val2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>