define arm_aapcs_vfpcc <16 x i8> @vabd_s8(<16 x i8> %src1, <16 x i8> %src2) {
; CHECK-LABEL: vabd_s8:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vmovlt.s8 q2, q1
-; CHECK-NEXT: vmovlt.s8 q3, q0
-; CHECK-NEXT: vmovlb.s8 q1, q1
-; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: vsub.i16 q2, q3, q2
-; CHECK-NEXT: vsub.i16 q0, q0, q1
-; CHECK-NEXT: vabs.s16 q2, q2
-; CHECK-NEXT: vabs.s16 q0, q0
-; CHECK-NEXT: vmovnt.i16 q0, q2
+; CHECK-NEXT: vabd.s8 q0, q0, q1
; CHECK-NEXT: bx lr
%sextsrc1 = sext <16 x i8> %src1 to <16 x i16>
%sextsrc2 = sext <16 x i8> %src2 to <16 x i16>
define arm_aapcs_vfpcc <8 x i16> @vabd_s16(<8 x i16> %src1, <8 x i16> %src2) {
; CHECK-LABEL: vabd_s16:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vmovlt.s16 q2, q1
-; CHECK-NEXT: vmovlt.s16 q3, q0
-; CHECK-NEXT: vmovlb.s16 q1, q1
-; CHECK-NEXT: vmovlb.s16 q0, q0
-; CHECK-NEXT: vsub.i32 q2, q3, q2
-; CHECK-NEXT: vsub.i32 q0, q0, q1
-; CHECK-NEXT: vabs.s32 q2, q2
-; CHECK-NEXT: vabs.s32 q0, q0
-; CHECK-NEXT: vmovnt.i32 q0, q2
+; CHECK-NEXT: vabd.s16 q0, q0, q1
; CHECK-NEXT: bx lr
%sextsrc1 = sext <8 x i16> %src1 to <8 x i32>
%sextsrc2 = sext <8 x i16> %src2 to <8 x i32>
define arm_aapcs_vfpcc <4 x i32> @vabd_s32(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LABEL: vabd_s32:
; CHECK: @ %bb.0:
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
-; CHECK-NEXT: vmov.f32 s12, s2
-; CHECK-NEXT: vmov.f32 s14, s3
-; CHECK-NEXT: vmov.f32 s16, s6
-; CHECK-NEXT: vmov r0, s12
-; CHECK-NEXT: vmov.f32 s18, s7
-; CHECK-NEXT: vmov r2, s16
-; CHECK-NEXT: vmov.f32 s2, s1
-; CHECK-NEXT: vmov.f32 s6, s5
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: asrs r1, r0, #31
-; CHECK-NEXT: subs r0, r0, r2
-; CHECK-NEXT: sbc.w r1, r1, r2, asr #31
-; CHECK-NEXT: add.w r0, r0, r1, asr #31
-; CHECK-NEXT: eor.w r0, r0, r1, asr #31
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: asrs r2, r1, #31
-; CHECK-NEXT: subs r1, r1, r3
-; CHECK-NEXT: sbc.w r2, r2, r3, asr #31
-; CHECK-NEXT: vmov r3, s6
-; CHECK-NEXT: add.w r1, r1, r2, asr #31
-; CHECK-NEXT: eor.w r1, r1, r2, asr #31
-; CHECK-NEXT: vmov r2, s18
-; CHECK-NEXT: vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT: vmov r0, s14
-; CHECK-NEXT: asrs r1, r0, #31
-; CHECK-NEXT: subs r0, r0, r2
-; CHECK-NEXT: sbc.w r1, r1, r2, asr #31
-; CHECK-NEXT: add.w r0, r0, r1, asr #31
-; CHECK-NEXT: eor.w r0, r0, r1, asr #31
-; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: asrs r2, r1, #31
-; CHECK-NEXT: subs r1, r1, r3
-; CHECK-NEXT: sbc.w r2, r2, r3, asr #31
-; CHECK-NEXT: add.w r1, r1, r2, asr #31
-; CHECK-NEXT: eor.w r1, r1, r2, asr #31
-; CHECK-NEXT: vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT: vmov q0, q2
-; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: vabd.s32 q0, q0, q1
; CHECK-NEXT: bx lr
%sextsrc1 = sext <4 x i32> %src1 to <4 x i64>
%sextsrc2 = sext <4 x i32> %src2 to <4 x i64>
define arm_aapcs_vfpcc <16 x i8> @vabd_u8(<16 x i8> %src1, <16 x i8> %src2) {
; CHECK-LABEL: vabd_u8:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vmovlt.u8 q2, q1
-; CHECK-NEXT: vmovlt.u8 q3, q0
-; CHECK-NEXT: vmovlb.u8 q1, q1
-; CHECK-NEXT: vmovlb.u8 q0, q0
-; CHECK-NEXT: vsub.i16 q2, q3, q2
-; CHECK-NEXT: vsub.i16 q0, q0, q1
-; CHECK-NEXT: vabs.s16 q2, q2
-; CHECK-NEXT: vabs.s16 q0, q0
-; CHECK-NEXT: vmovnt.i16 q0, q2
+; CHECK-NEXT: vabd.u8 q0, q0, q1
; CHECK-NEXT: bx lr
%zextsrc1 = zext <16 x i8> %src1 to <16 x i16>
%zextsrc2 = zext <16 x i8> %src2 to <16 x i16>
define arm_aapcs_vfpcc <8 x i16> @vabd_u16(<8 x i16> %src1, <8 x i16> %src2) {
; CHECK-LABEL: vabd_u16:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vmovlt.u16 q2, q1
-; CHECK-NEXT: vmovlt.u16 q3, q0
-; CHECK-NEXT: vmovlb.u16 q1, q1
-; CHECK-NEXT: vmovlb.u16 q0, q0
-; CHECK-NEXT: vsub.i32 q2, q3, q2
-; CHECK-NEXT: vsub.i32 q0, q0, q1
-; CHECK-NEXT: vabs.s32 q2, q2
-; CHECK-NEXT: vabs.s32 q0, q0
-; CHECK-NEXT: vmovnt.i32 q0, q2
+; CHECK-NEXT: vabd.u16 q0, q0, q1
; CHECK-NEXT: bx lr
%zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
%zextsrc2 = zext <8 x i16> %src2 to <8 x i32>
define arm_aapcs_vfpcc <4 x i32> @vabd_u32(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LABEL: vabd_u32:
; CHECK: @ %bb.0:
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
-; CHECK-NEXT: vmov.f32 s8, s6
-; CHECK-NEXT: vmov.i64 q4, #0xffffffff
-; CHECK-NEXT: vmov.f32 s12, s2
-; CHECK-NEXT: vmov.f32 s10, s7
-; CHECK-NEXT: vmov.f32 s14, s3
-; CHECK-NEXT: vand q2, q2, q4
-; CHECK-NEXT: vand q3, q3, q4
-; CHECK-NEXT: vmov r0, r1, d4
-; CHECK-NEXT: vmov r2, r3, d6
-; CHECK-NEXT: vmov.f32 s6, s5
-; CHECK-NEXT: vmov.f32 s2, s1
-; CHECK-NEXT: vand q1, q1, q4
-; CHECK-NEXT: vand q4, q0, q4
-; CHECK-NEXT: subs r0, r2, r0
-; CHECK-NEXT: sbc.w r1, r3, r1
-; CHECK-NEXT: add.w r0, r0, r1, asr #31
-; CHECK-NEXT: eor.w r12, r0, r1, asr #31
-; CHECK-NEXT: vmov r1, r2, d2
-; CHECK-NEXT: vmov r3, r0, d8
-; CHECK-NEXT: subs r1, r3, r1
-; CHECK-NEXT: sbcs r0, r2
-; CHECK-NEXT: vmov r2, r3, d7
-; CHECK-NEXT: add.w r1, r1, r0, asr #31
-; CHECK-NEXT: eor.w r0, r1, r0, asr #31
-; CHECK-NEXT: vmov q0[2], q0[0], r0, r12
-; CHECK-NEXT: vmov r0, r1, d5
-; CHECK-NEXT: subs r0, r2, r0
-; CHECK-NEXT: sbc.w r1, r3, r1
-; CHECK-NEXT: add.w r0, r0, r1, asr #31
-; CHECK-NEXT: eor.w r12, r0, r1, asr #31
-; CHECK-NEXT: vmov r1, r2, d3
-; CHECK-NEXT: vmov r3, r0, d9
-; CHECK-NEXT: subs r1, r3, r1
-; CHECK-NEXT: sbcs r0, r2
-; CHECK-NEXT: add.w r1, r1, r0, asr #31
-; CHECK-NEXT: eor.w r0, r1, r0, asr #31
-; CHECK-NEXT: vmov q0[3], q0[1], r0, r12
-; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: vabd.u32 q0, q0, q1
; CHECK-NEXT: bx lr
%zextsrc1 = zext <4 x i32> %src1 to <4 x i64>
%zextsrc2 = zext <4 x i32> %src2 to <4 x i64>
; CHECK-NEXT: mov.w lr, #64
; CHECK-NEXT: .LBB6_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrb.s32 q0, [r1, #12]
-; CHECK-NEXT: vldrb.s32 q1, [r0, #12]
-; CHECK-NEXT: vsub.i32 q0, q1, q0
-; CHECK-NEXT: vldrb.s32 q1, [r0, #8]
-; CHECK-NEXT: vabs.s32 q0, q0
-; CHECK-NEXT: vstrb.32 q0, [r2, #12]
-; CHECK-NEXT: vldrb.s32 q0, [r1, #8]
-; CHECK-NEXT: vsub.i32 q0, q1, q0
-; CHECK-NEXT: vldrb.s32 q1, [r0, #4]
-; CHECK-NEXT: vabs.s32 q0, q0
-; CHECK-NEXT: vstrb.32 q0, [r2, #8]
-; CHECK-NEXT: vldrb.s32 q0, [r1, #4]
-; CHECK-NEXT: vsub.i32 q0, q1, q0
-; CHECK-NEXT: vldrb.s32 q1, [r0], #16
-; CHECK-NEXT: vabs.s32 q0, q0
-; CHECK-NEXT: vstrb.32 q0, [r2, #4]
-; CHECK-NEXT: vldrb.s32 q0, [r1], #16
-; CHECK-NEXT: vsub.i32 q0, q1, q0
-; CHECK-NEXT: vabs.s32 q0, q0
-; CHECK-NEXT: vstrb.32 q0, [r2], #16
+; CHECK-NEXT: vldrb.u8 q0, [r1], #16
+; CHECK-NEXT: vldrb.u8 q1, [r0], #16
+; CHECK-NEXT: vabd.s8 q0, q1, q0
+; CHECK-NEXT: vmov.u8 r12, q0[14]
+; CHECK-NEXT: vmov.u8 r3, q0[12]
+; CHECK-NEXT: vmov q1[2], q1[0], r3, r12
+; CHECK-NEXT: vmov.u8 r12, q0[15]
+; CHECK-NEXT: vmov.u8 r3, q0[13]
+; CHECK-NEXT: vmov q1[3], q1[1], r3, r12
+; CHECK-NEXT: vmov.u8 r12, q0[10]
+; CHECK-NEXT: vmov.u8 r3, q0[8]
+; CHECK-NEXT: vstrb.32 q1, [r2, #12]
+; CHECK-NEXT: vmov q1[2], q1[0], r3, r12
+; CHECK-NEXT: vmov.u8 r12, q0[11]
+; CHECK-NEXT: vmov.u8 r3, q0[9]
+; CHECK-NEXT: vmov q1[3], q1[1], r3, r12
+; CHECK-NEXT: vmov.u8 r12, q0[6]
+; CHECK-NEXT: vmov.u8 r3, q0[4]
+; CHECK-NEXT: vstrb.32 q1, [r2, #8]
+; CHECK-NEXT: vmov q1[2], q1[0], r3, r12
+; CHECK-NEXT: vmov.u8 r12, q0[7]
+; CHECK-NEXT: vmov.u8 r3, q0[5]
+; CHECK-NEXT: vmov q1[3], q1[1], r3, r12
+; CHECK-NEXT: vmov.u8 r12, q0[2]
+; CHECK-NEXT: vmov.u8 r3, q0[0]
+; CHECK-NEXT: vstrb.32 q1, [r2, #4]
+; CHECK-NEXT: vmov q1[2], q1[0], r3, r12
+; CHECK-NEXT: vmov.u8 r12, q0[3]
+; CHECK-NEXT: vmov.u8 r3, q0[1]
+; CHECK-NEXT: vmov q1[3], q1[1], r3, r12
+; CHECK-NEXT: vstrb.32 q1, [r2], #16
; CHECK-NEXT: le lr, .LBB6_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: mov.w lr, #128
; CHECK-NEXT: .LBB7_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrh.s32 q0, [r1, #8]
-; CHECK-NEXT: vldrh.s32 q1, [r0, #8]
-; CHECK-NEXT: vsub.i32 q0, q1, q0
-; CHECK-NEXT: vldrh.s32 q1, [r0], #16
-; CHECK-NEXT: vabs.s32 q0, q0
-; CHECK-NEXT: vstrh.32 q0, [r2, #8]
-; CHECK-NEXT: vldrh.s32 q0, [r1], #16
-; CHECK-NEXT: vsub.i32 q0, q1, q0
-; CHECK-NEXT: vabs.s32 q0, q0
-; CHECK-NEXT: vstrh.32 q0, [r2], #16
+; CHECK-NEXT: vldrh.u16 q0, [r1], #16
+; CHECK-NEXT: vldrh.u16 q1, [r0], #16
+; CHECK-NEXT: vabd.s16 q0, q1, q0
+; CHECK-NEXT: vmov.u16 r12, q0[6]
+; CHECK-NEXT: vmov.u16 r3, q0[4]
+; CHECK-NEXT: vmov q1[2], q1[0], r3, r12
+; CHECK-NEXT: vmov.u16 r12, q0[7]
+; CHECK-NEXT: vmov.u16 r3, q0[5]
+; CHECK-NEXT: vmov q1[3], q1[1], r3, r12
+; CHECK-NEXT: vmov.u16 r12, q0[2]
+; CHECK-NEXT: vmov.u16 r3, q0[0]
+; CHECK-NEXT: vstrh.32 q1, [r2, #8]
+; CHECK-NEXT: vmov q1[2], q1[0], r3, r12
+; CHECK-NEXT: vmov.u16 r12, q0[3]
+; CHECK-NEXT: vmov.u16 r3, q0[1]
+; CHECK-NEXT: vmov q1[3], q1[1], r3, r12
+; CHECK-NEXT: vstrh.32 q1, [r2], #16
; CHECK-NEXT: le lr, .LBB7_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: mov.w lr, #64
; CHECK-NEXT: .LBB9_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrb.u32 q0, [r1, #12]
-; CHECK-NEXT: vldrb.u32 q1, [r0, #12]
-; CHECK-NEXT: vsub.i32 q0, q1, q0
-; CHECK-NEXT: vldrb.u32 q1, [r0, #8]
-; CHECK-NEXT: vabs.s32 q0, q0
-; CHECK-NEXT: vstrb.32 q0, [r2, #12]
-; CHECK-NEXT: vldrb.u32 q0, [r1, #8]
-; CHECK-NEXT: vsub.i32 q0, q1, q0
-; CHECK-NEXT: vldrb.u32 q1, [r0, #4]
-; CHECK-NEXT: vabs.s32 q0, q0
-; CHECK-NEXT: vstrb.32 q0, [r2, #8]
-; CHECK-NEXT: vldrb.u32 q0, [r1, #4]
-; CHECK-NEXT: vsub.i32 q0, q1, q0
-; CHECK-NEXT: vldrb.u32 q1, [r0], #16
-; CHECK-NEXT: vabs.s32 q0, q0
-; CHECK-NEXT: vstrb.32 q0, [r2, #4]
-; CHECK-NEXT: vldrb.u32 q0, [r1], #16
-; CHECK-NEXT: vsub.i32 q0, q1, q0
-; CHECK-NEXT: vabs.s32 q0, q0
-; CHECK-NEXT: vstrb.32 q0, [r2], #16
+; CHECK-NEXT: vldrb.u8 q0, [r1], #16
+; CHECK-NEXT: vldrb.u8 q1, [r0], #16
+; CHECK-NEXT: vabd.u8 q0, q1, q0
+; CHECK-NEXT: vmov.u8 r12, q0[14]
+; CHECK-NEXT: vmov.u8 r3, q0[12]
+; CHECK-NEXT: vmov q1[2], q1[0], r3, r12
+; CHECK-NEXT: vmov.u8 r12, q0[15]
+; CHECK-NEXT: vmov.u8 r3, q0[13]
+; CHECK-NEXT: vmov q1[3], q1[1], r3, r12
+; CHECK-NEXT: vmov.u8 r12, q0[10]
+; CHECK-NEXT: vmov.u8 r3, q0[8]
+; CHECK-NEXT: vstrb.32 q1, [r2, #12]
+; CHECK-NEXT: vmov q1[2], q1[0], r3, r12
+; CHECK-NEXT: vmov.u8 r12, q0[11]
+; CHECK-NEXT: vmov.u8 r3, q0[9]
+; CHECK-NEXT: vmov q1[3], q1[1], r3, r12
+; CHECK-NEXT: vmov.u8 r12, q0[6]
+; CHECK-NEXT: vmov.u8 r3, q0[4]
+; CHECK-NEXT: vstrb.32 q1, [r2, #8]
+; CHECK-NEXT: vmov q1[2], q1[0], r3, r12
+; CHECK-NEXT: vmov.u8 r12, q0[7]
+; CHECK-NEXT: vmov.u8 r3, q0[5]
+; CHECK-NEXT: vmov q1[3], q1[1], r3, r12
+; CHECK-NEXT: vmov.u8 r12, q0[2]
+; CHECK-NEXT: vmov.u8 r3, q0[0]
+; CHECK-NEXT: vstrb.32 q1, [r2, #4]
+; CHECK-NEXT: vmov q1[2], q1[0], r3, r12
+; CHECK-NEXT: vmov.u8 r12, q0[3]
+; CHECK-NEXT: vmov.u8 r3, q0[1]
+; CHECK-NEXT: vmov q1[3], q1[1], r3, r12
+; CHECK-NEXT: vstrb.32 q1, [r2], #16
; CHECK-NEXT: le lr, .LBB9_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: mov.w lr, #128
; CHECK-NEXT: .LBB10_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrh.u32 q0, [r1, #8]
-; CHECK-NEXT: vldrh.u32 q1, [r0, #8]
-; CHECK-NEXT: vsub.i32 q0, q1, q0
-; CHECK-NEXT: vldrh.u32 q1, [r0], #16
-; CHECK-NEXT: vabs.s32 q0, q0
-; CHECK-NEXT: vstrh.32 q0, [r2, #8]
-; CHECK-NEXT: vldrh.u32 q0, [r1], #16
-; CHECK-NEXT: vsub.i32 q0, q1, q0
-; CHECK-NEXT: vabs.s32 q0, q0
-; CHECK-NEXT: vstrh.32 q0, [r2], #16
+; CHECK-NEXT: vldrh.u16 q0, [r1], #16
+; CHECK-NEXT: vldrh.u16 q1, [r0], #16
+; CHECK-NEXT: vabd.u16 q0, q1, q0
+; CHECK-NEXT: vmov.u16 r12, q0[6]
+; CHECK-NEXT: vmov.u16 r3, q0[4]
+; CHECK-NEXT: vmov q1[2], q1[0], r3, r12
+; CHECK-NEXT: vmov.u16 r12, q0[7]
+; CHECK-NEXT: vmov.u16 r3, q0[5]
+; CHECK-NEXT: vmov q1[3], q1[1], r3, r12
+; CHECK-NEXT: vmov.u16 r12, q0[2]
+; CHECK-NEXT: vmov.u16 r3, q0[0]
+; CHECK-NEXT: vstrh.32 q1, [r2, #8]
+; CHECK-NEXT: vmov q1[2], q1[0], r3, r12
+; CHECK-NEXT: vmov.u16 r12, q0[3]
+; CHECK-NEXT: vmov.u16 r3, q0[1]
+; CHECK-NEXT: vmov q1[3], q1[1], r3, r12
+; CHECK-NEXT: vstrh.32 q1, [r2], #16
; CHECK-NEXT: le lr, .LBB10_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}