From dfe11c00212405f1da1f09d7c1125d7661e8da5a Mon Sep 17 00:00:00 2001
From: David Green
Date: Tue, 7 Feb 2023 10:06:58 +0000
Subject: [PATCH] [ARM] Add various tests for reductions of shuffles. NFC

---
 llvm/lib/Target/ARM/ARMISelLowering.h              |   3 +-
 .../Thumb2/mve-complex-deinterleaving-i64-add.ll   |   2 +-
 llvm/test/CodeGen/Thumb2/mve-vabdus.ll             |  37 +++
 .../CodeGen/Thumb2/mve-vecreduce-add-combine.ll    | 369 +++++++++++++++++++++
 llvm/test/CodeGen/Thumb2/mve-vhadd.ll              |  65 ++++
 llvm/test/CodeGen/Thumb2/mve-vmulh.ll              |  34 ++
 6 files changed, 507 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 06da997..86ad9a4 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -244,8 +244,7 @@ class VectorType;
     VADDLVAps, // Same as VADDLVp[su] but with a v4i1 predicate mask
     VADDLVApu,
     VMLAVs, // sign- or zero-extend the elements of two vectors to i32, multiply
-            // them
-    VMLAVu, // and add the results together, returning an i32 of their sum
+    VMLAVu, // them and add the results together, returning an i32 of their sum
     VMLAVps, // Same as VMLAV[su] with a v4i1 predicate mask
     VMLAVpu,
     VMLALVs, // Same as VMLAV but with i64, returning the low and
diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i64-add.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i64-add.ll
index 413dbbd..dea6990 100644
--- a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i64-add.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i64-add.ll
@@ -161,4 +161,4 @@ entry:
   %1 = add <4 x i64> %b.imag, %a.real
   %interleaved.vec = shufflevector <4 x i64> %0, <4 x i64> %1, <8 x i32>
   ret <8 x i64> %interleaved.vec
-}
\ No newline at end of file
+}
diff --git a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
index bdb1168..654f7a3 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
@@ -658,3 +658,40 @@ define arm_aapcs_vfpcc <4 x i32> @vabd_v4u32_shuffle(<4 x i32> %src1, <4 x i32>
   %aresult = trunc <4 x i64> %as to <4 x i32>
   ret <4 x i32> %aresult
 }
+
+
+define arm_aapcs_vfpcc i16 @vabds_reduce_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
+; CHECK-LABEL: vabds_reduce_v16i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vabd.s8 q0, q0, q1
+; CHECK-NEXT: vaddv.u8 r0, q0
+; CHECK-NEXT: bx lr
+entry:
+  %sextsrc1 = sext <16 x i8> %s0 to <16 x i16>
+  %sextsrc2 = sext <16 x i8> %s1 to <16 x i16>
+  %add1 = sub <16 x i16> %sextsrc1, %sextsrc2
+  %add2 = sub <16 x i16> zeroinitializer, %add1
+  %c = icmp sge <16 x i16> %add1, zeroinitializer
+  %s = select <16 x i1> %c, <16 x i16> %add1, <16 x i16> %add2
+  %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
+  ret i16 %result
+}
+
+define arm_aapcs_vfpcc i16 @vabdu_reduce_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
+; CHECK-LABEL: vabdu_reduce_v16i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vabd.u8 q0, q0, q1
+; CHECK-NEXT: vaddv.u8 r0, q0
+; CHECK-NEXT: bx lr
+entry:
+  %sextsrc1 = zext <16 x i8> %s0 to <16 x i16>
+  %sextsrc2 = zext <16 x i8> %s1 to <16 x i16>
+  %add1 = sub <16 x i16> %sextsrc1, %sextsrc2
+  %add2 = sub <16 x i16> zeroinitializer, %add1
+  %c = icmp sge <16 x i16> %add1, zeroinitializer
+  %s = select <16 x i1> %c, <16 x i16> %add1, <16 x i16> %add2
+  %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
+  ret i16 %result
+}
+
+declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-add-combine.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-add-combine.ll
index 16abf16..6454310 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-add-combine.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-add-combine.ll
@@ -100,4 +100,373 @@ entry:
   ret i32 %add2
 }
 
+define arm_aapcs_vfpcc i16 @vaddv_shuffle_v16i8(<16 x i8> %s0) {
+; CHECK-LABEL: vaddv_shuffle_v16i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.u8 r0, q0[0]
+; CHECK-NEXT: vmov.8 q1[0], r0
+; CHECK-NEXT: vmov.u8 r0, q0[2]
+; CHECK-NEXT: vmov.8 q1[1], r0
+; CHECK-NEXT: vmov.u8 r0, q0[4]
+; CHECK-NEXT: vmov.8 q1[2], r0
+; CHECK-NEXT: vmov.u8 r0, q0[6]
+; CHECK-NEXT: vmov.8 q1[3], r0
+; CHECK-NEXT: vmov.u8 r0, q0[8]
+; CHECK-NEXT: vmov.8 q1[4], r0
+; CHECK-NEXT: vmov.u8 r0, q0[10]
+; CHECK-NEXT: vmov.8 q1[5], r0
+; CHECK-NEXT: vmov.u8 r0, q0[12]
+; CHECK-NEXT: vmov.8 q1[6], r0
+; CHECK-NEXT: vmov.u8 r0, q0[14]
+; CHECK-NEXT: vmov.8 q1[7], r0
+; CHECK-NEXT: vmov.u8 r0, q0[1]
+; CHECK-NEXT: vmov.8 q1[8], r0
+; CHECK-NEXT: vmov.u8 r0, q0[3]
+; CHECK-NEXT: vmov.8 q1[9], r0
+; CHECK-NEXT: vmov.u8 r0, q0[5]
+; CHECK-NEXT: vmov.8 q1[10], r0
+; CHECK-NEXT: vmov.u8 r0, q0[7]
+; CHECK-NEXT: vmov.8 q1[11], r0
+; CHECK-NEXT: vmov.u8 r0, q0[9]
+; CHECK-NEXT: vmov.8 q1[12], r0
+; CHECK-NEXT: vmov.u8 r0, q0[11]
+; CHECK-NEXT: vmov.8 q1[13], r0
+; CHECK-NEXT: vmov.u8 r0, q0[13]
+; CHECK-NEXT: vmov.8 q1[14], r0
+; CHECK-NEXT: vmov.u8 r0, q0[15]
+; CHECK-NEXT: vmov.8 q1[15], r0
+; CHECK-NEXT: vaddv.u8 r0, q1
+; CHECK-NEXT: bx lr
+entry:
+  %s2 = shufflevector <16 x i8> %s0, <16 x i8> %s0, <16 x i32>
+  %s1 = zext <16 x i8> %s2 to <16 x i16>
+  %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s1)
+  ret i16 %result
+}
+
+define arm_aapcs_vfpcc i16 @vaddv_shuffle_v16i8_duplicate(<16 x i8> %s0) {
+; CHECK-LABEL: vaddv_shuffle_v16i8_duplicate:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.u8 r0, q0[1]
+; CHECK-NEXT: vmov.u8 r1, q0[2]
+; CHECK-NEXT: vmov.8 q1[0], r0
+; CHECK-NEXT: vmov.8 q1[1], r1
+; CHECK-NEXT: vmov.u8 r1, q0[4]
+; CHECK-NEXT: vmov.8 q1[2], r1
+; CHECK-NEXT: vmov.u8 r1, q0[6]
+; CHECK-NEXT: vmov.8 q1[3], r1
+; CHECK-NEXT: vmov.u8 r1, q0[8]
+; CHECK-NEXT: vmov.8 q1[4], r1
+; CHECK-NEXT: vmov.u8 r1, q0[10]
+; CHECK-NEXT: vmov.8 q1[5], r1
+; CHECK-NEXT: vmov.u8 r1, q0[12]
+; CHECK-NEXT: vmov.8 q1[6], r1
+; CHECK-NEXT: vmov.u8 r1, q0[14]
+; CHECK-NEXT: vmov.8 q1[7], r1
+; CHECK-NEXT: vmov.8 q1[8], r0
+; CHECK-NEXT: vmov.u8 r0, q0[3]
+; CHECK-NEXT: vmov.8 q1[9], r0
+; CHECK-NEXT: vmov.u8 r0, q0[5]
+; CHECK-NEXT: vmov.8 q1[10], r0
+; CHECK-NEXT: vmov.u8 r0, q0[7]
+; CHECK-NEXT: vmov.8 q1[11], r0
+; CHECK-NEXT: vmov.u8 r0, q0[9]
+; CHECK-NEXT: vmov.8 q1[12], r0
+; CHECK-NEXT: vmov.u8 r0, q0[11]
+; CHECK-NEXT: vmov.8 q1[13], r0
+; CHECK-NEXT: vmov.u8 r0, q0[13]
+; CHECK-NEXT: vmov.8 q1[14], r0
+; CHECK-NEXT: vmov.u8 r0, q0[15]
+; CHECK-NEXT: vmov.8 q1[15], r0
+; CHECK-NEXT: vaddv.u8 r0, q1
+; CHECK-NEXT: bx lr
+entry:
+  %s2 = shufflevector <16 x i8> %s0, <16 x i8> %s0, <16 x i32>
+  %s1 = zext <16 x i8> %s2 to <16 x i16>
+  %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s1)
+  ret i16 %result
+}
+
+define arm_aapcs_vfpcc i16 @vaddv_shuffle_v16i8_undef(<16 x i8> %s0) {
+; CHECK-LABEL: vaddv_shuffle_v16i8_undef:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.u8 r0, q0[2]
+; CHECK-NEXT: vmov.8 q1[1], r0
+; CHECK-NEXT: vmov.u8 r0, q0[4]
+; CHECK-NEXT: vmov.8 q1[2], r0
+; CHECK-NEXT: vmov.u8 r0, q0[6]
+; CHECK-NEXT: vmov.8 q1[3], r0
+; CHECK-NEXT: vmov.u8 r0, q0[8]
+; CHECK-NEXT: vmov.8 q1[4], r0
+; CHECK-NEXT: vmov.u8 r0, q0[10]
+; CHECK-NEXT: vmov.8 q1[5], r0
+; CHECK-NEXT: vmov.u8 r0, q0[12]
+; CHECK-NEXT: vmov.8 q1[6], r0
+; CHECK-NEXT: vmov.u8 r0, q0[14]
+; CHECK-NEXT: vmov.8 q1[7], r0
+; CHECK-NEXT: vmov.u8 r0, q0[1]
+; CHECK-NEXT: vmov.8 q1[8], r0
+; CHECK-NEXT: vmov.u8 r0, q0[3]
+; CHECK-NEXT: vmov.8 q1[9], r0
+; CHECK-NEXT: vmov.u8 r0, q0[5]
+; CHECK-NEXT: vmov.8 q1[10], r0
+; CHECK-NEXT: vmov.u8 r0, q0[7]
+; CHECK-NEXT: vmov.8 q1[11], r0
+; CHECK-NEXT: vmov.u8 r0, q0[9]
+; CHECK-NEXT: vmov.8 q1[12], r0
+; CHECK-NEXT: vmov.u8 r0, q0[11]
+; CHECK-NEXT: vmov.8 q1[13], r0
+; CHECK-NEXT: vmov.u8 r0, q0[13]
+; CHECK-NEXT: vmov.8 q1[14], r0
+; CHECK-NEXT: vmov.u8 r0, q0[15]
+; CHECK-NEXT: vmov.8 q1[15], r0
+; CHECK-NEXT: vaddv.u8 r0, q1
+; CHECK-NEXT: bx lr
+entry:
+  %s2 = shufflevector <16 x i8> %s0, <16 x i8> %s0, <16 x i32>
+  %s1 = zext <16 x i8> %s2 to <16 x i16>
+  %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s1)
+  ret i16 %result
+}
+
+define arm_aapcs_vfpcc i64 @vaddv_shuffle_v4i32_long(<4 x i32> %s0) {
+; CHECK-LABEL: vaddv_shuffle_v4i32_long:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.f32 s4, s3
+; CHECK-NEXT: vmov.f32 s5, s2
+; CHECK-NEXT: vmov.f32 s6, s1
+; CHECK-NEXT: vmov.f32 s7, s0
+; CHECK-NEXT: vaddlv.u32 r0, r1, q1
+; CHECK-NEXT: bx lr
+entry:
+  %s2 = shufflevector <4 x i32> %s0, <4 x i32> %s0, <4 x i32>
+  %s1 = zext <4 x i32> %s2 to <4 x i64>
+  %r = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s1)
+  ret i64 %r
+}
+
+define arm_aapcs_vfpcc i64 @vaddv_shuffle_v4i32_long_a(<4 x i32> %s0, i64 %a) {
+; CHECK-LABEL: vaddv_shuffle_v4i32_long_a:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.f32 s4, s3
+; CHECK-NEXT: vmov.f32 s5, s2
+; CHECK-NEXT: vmov.f32 s6, s1
+; CHECK-NEXT: vmov.f32 s7, s0
+; CHECK-NEXT: vaddlva.u32 r0, r1, q1
+; CHECK-NEXT: bx lr
+entry:
+  %s2 = shufflevector <4 x i32> %s0, <4 x i32> %s0, <4 x i32>
+  %s1 = zext <4 x i32> %s2 to <4 x i64>
+  %r = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s1)
+  %r2 = add i64 %r, %a
+  ret i64 %r2
+}
+
+define arm_aapcs_vfpcc i16 @vmla_shuffle_v16i8(<16 x i8> %s0, <16 x i8> %s0b) {
+; CHECK-LABEL: vmla_shuffle_v16i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.u8 r0, q1[0]
+; CHECK-NEXT: vmov.8 q2[0], r0
+; CHECK-NEXT: vmov.u8 r0, q1[2]
+; CHECK-NEXT: vmov.8 q2[1], r0
+; CHECK-NEXT: vmov.u8 r0, q1[4]
+; CHECK-NEXT: vmov.8 q2[2], r0
+; CHECK-NEXT: vmov.u8 r0, q1[6]
+; CHECK-NEXT: vmov.8 q2[3], r0
+; CHECK-NEXT: vmov.u8 r0, q1[8]
+; CHECK-NEXT: vmov.8 q2[4], r0
+; CHECK-NEXT: vmov.u8 r0, q1[10]
+; CHECK-NEXT: vmov.8 q2[5], r0
+; CHECK-NEXT: vmov.u8 r0, q1[12]
+; CHECK-NEXT: vmov.8 q2[6], r0
+; CHECK-NEXT: vmov.u8 r0, q1[14]
+; CHECK-NEXT: vmov.8 q2[7], r0
+; CHECK-NEXT: vmov.u8 r0, q1[1]
+; CHECK-NEXT: vmov.8 q2[8], r0
+; CHECK-NEXT: vmov.u8 r0, q1[3]
+; CHECK-NEXT: vmov.8 q2[9], r0
+; CHECK-NEXT: vmov.u8 r0, q1[5]
+; CHECK-NEXT: vmov.8 q2[10], r0
+; CHECK-NEXT: vmov.u8 r0, q1[7]
+; CHECK-NEXT: vmov.8 q2[11], r0
+; CHECK-NEXT: vmov.u8 r0, q1[9]
+; CHECK-NEXT: vmov.8 q2[12], r0
+; CHECK-NEXT: vmov.u8 r0, q1[11]
+; CHECK-NEXT: vmov.8 q2[13], r0
+; CHECK-NEXT: vmov.u8 r0, q1[13]
+; CHECK-NEXT: vmov.8 q2[14], r0
+; CHECK-NEXT: vmov.u8 r0, q1[15]
+; CHECK-NEXT: vmov.8 q2[15], r0
+; CHECK-NEXT: vmov.u8 r0, q0[0]
+; CHECK-NEXT: vmov.8 q1[0], r0
+; CHECK-NEXT: vmov.u8 r0, q0[2]
+; CHECK-NEXT: vmov.8 q1[1], r0
+; CHECK-NEXT: vmov.u8 r0, q0[4]
+; CHECK-NEXT: vmov.8 q1[2], r0
+; CHECK-NEXT: vmov.u8 r0, q0[6]
+; CHECK-NEXT: vmov.8 q1[3], r0
+; CHECK-NEXT: vmov.u8 r0, q0[8]
+; CHECK-NEXT: vmov.8 q1[4], r0
+; CHECK-NEXT: vmov.u8 r0, q0[10]
+; CHECK-NEXT: vmov.8 q1[5], r0
+; CHECK-NEXT: vmov.u8 r0, q0[12]
+; CHECK-NEXT: vmov.8 q1[6], r0
+; CHECK-NEXT: vmov.u8 r0, q0[14]
+; CHECK-NEXT: vmov.8 q1[7], r0
+; CHECK-NEXT: vmov.u8 r0, q0[1]
+; CHECK-NEXT: vmov.8 q1[8], r0
+; CHECK-NEXT: vmov.u8 r0, q0[3]
+; CHECK-NEXT: vmov.8 q1[9], r0
+; CHECK-NEXT: vmov.u8 r0, q0[5]
+; CHECK-NEXT: vmov.8 q1[10], r0
+; CHECK-NEXT: vmov.u8 r0, q0[7]
+; CHECK-NEXT: vmov.8 q1[11], r0
+; CHECK-NEXT: vmov.u8 r0, q0[9]
+; CHECK-NEXT: vmov.8 q1[12], r0
+; CHECK-NEXT: vmov.u8 r0, q0[11]
+; CHECK-NEXT: vmov.8 q1[13], r0
+; CHECK-NEXT: vmov.u8 r0, q0[13]
+; CHECK-NEXT: vmov.8 q1[14], r0
+; CHECK-NEXT: vmov.u8 r0, q0[15]
+; CHECK-NEXT: vmov.8 q1[15], r0
+; CHECK-NEXT: vmlav.s8 r0, q1, q2
+; CHECK-NEXT: bx lr
+entry:
+  %s2a = shufflevector <16 x i8> %s0, <16 x i8> %s0, <16 x i32>
+  %s1a = sext <16 x i8> %s2a to <16 x i16>
+  %s2b = shufflevector <16 x i8> %s0b, <16 x i8> %s0, <16 x i32>
+  %s1b = sext <16 x i8> %s2b to <16 x i16>
+  %s1 = mul <16 x i16> %s1a, %s1b
+  %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s1)
+  ret i16 %result
+}
+
+define arm_aapcs_vfpcc i16 @vmla_shuffle_v16i8_unequal(<16 x i8> %s0, <16 x i8> %s0b) {
+; CHECK-LABEL: vmla_shuffle_v16i8_unequal:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.u8 r0, q1[0]
+; CHECK-NEXT: vmov.8 q2[0], r0
+; CHECK-NEXT: vmov.u8 r0, q1[2]
+; CHECK-NEXT: vmov.8 q2[1], r0
+; CHECK-NEXT: vmov.u8 r0, q1[4]
+; CHECK-NEXT: vmov.8 q2[2], r0
+; CHECK-NEXT: vmov.u8 r0, q1[6]
+; CHECK-NEXT: vmov.8 q2[3], r0
+; CHECK-NEXT: vmov.u8 r0, q1[8]
+; CHECK-NEXT: vmov.8 q2[4], r0
+; CHECK-NEXT: vmov.u8 r0, q1[10]
+; CHECK-NEXT: vmov.8 q2[5], r0
+; CHECK-NEXT: vmov.u8 r0, q1[12]
+; CHECK-NEXT: vmov.8 q2[6], r0
+; CHECK-NEXT: vmov.u8 r0, q1[15]
+; CHECK-NEXT: vmov.8 q2[7], r0
+; CHECK-NEXT: vmov.u8 r0, q1[1]
+; CHECK-NEXT: vmov.8 q2[8], r0
+; CHECK-NEXT: vmov.u8 r0, q1[3]
+; CHECK-NEXT: vmov.8 q2[9], r0
+; CHECK-NEXT: vmov.u8 r0, q1[5]
+; CHECK-NEXT: vmov.8 q2[10], r0
+; CHECK-NEXT: vmov.u8 r0, q1[7]
+; CHECK-NEXT: vmov.8 q2[11], r0
+; CHECK-NEXT: vmov.u8 r0, q1[9]
+; CHECK-NEXT: vmov.8 q2[12], r0
+; CHECK-NEXT: vmov.u8 r0, q1[11]
+; CHECK-NEXT: vmov.8 q2[13], r0
+; CHECK-NEXT: vmov.u8 r0, q1[13]
+; CHECK-NEXT: vmov.8 q2[14], r0
+; CHECK-NEXT: vmov.u8 r0, q1[14]
+; CHECK-NEXT: vmov.8 q2[15], r0
+; CHECK-NEXT: vmov.u8 r0, q0[0]
+; CHECK-NEXT: vmov.8 q1[0], r0
+; CHECK-NEXT: vmov.u8 r0, q0[2]
+; CHECK-NEXT: vmov.8 q1[1], r0
+; CHECK-NEXT: vmov.u8 r0, q0[4]
+; CHECK-NEXT: vmov.8 q1[2], r0
+; CHECK-NEXT: vmov.u8 r0, q0[6]
+; CHECK-NEXT: vmov.8 q1[3], r0
+; CHECK-NEXT: vmov.u8 r0, q0[8]
+; CHECK-NEXT: vmov.8 q1[4], r0
+; CHECK-NEXT: vmov.u8 r0, q0[10]
+; CHECK-NEXT: vmov.8 q1[5], r0
+; CHECK-NEXT: vmov.u8 r0, q0[12]
+; CHECK-NEXT: vmov.8 q1[6], r0
+; CHECK-NEXT: vmov.u8 r0, q0[14]
+; CHECK-NEXT: vmov.8 q1[7], r0
+; CHECK-NEXT: vmov.u8 r0, q0[1]
+; CHECK-NEXT: vmov.8 q1[8], r0
+; CHECK-NEXT: vmov.u8 r0, q0[3]
+; CHECK-NEXT: vmov.8 q1[9], r0
+; CHECK-NEXT: vmov.u8 r0, q0[5]
+; CHECK-NEXT: vmov.8 q1[10], r0
+; CHECK-NEXT: vmov.u8 r0, q0[7]
+; CHECK-NEXT: vmov.8 q1[11], r0
+; CHECK-NEXT: vmov.u8 r0, q0[9]
+; CHECK-NEXT: vmov.8 q1[12], r0
+; CHECK-NEXT: vmov.u8 r0, q0[11]
+; CHECK-NEXT: vmov.8 q1[13], r0
+; CHECK-NEXT: vmov.u8 r0, q0[13]
+; CHECK-NEXT: vmov.8 q1[14], r0
+; CHECK-NEXT: vmov.u8 r0, q0[15]
+; CHECK-NEXT: vmov.8 q1[15], r0
+; CHECK-NEXT: vmlav.s8 r0, q1, q2
+; CHECK-NEXT: bx lr
+entry:
+  %s2a = shufflevector <16 x i8> %s0, <16 x i8> %s0, <16 x i32>
+  %s1a = sext <16 x i8> %s2a to <16 x i16>
+  %s2b = shufflevector <16 x i8> %s0b, <16 x i8> %s0, <16 x i32>
+  %s1b = sext <16 x i8> %s2b to <16 x i16>
+  %s1 = mul <16 x i16> %s1a, %s1b
+  %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s1)
+  ret i16 %result
+}
+
+define arm_aapcs_vfpcc i64 @vmla_shuffle_v4i32_long(<4 x i32> %s0, <4 x i32> %s0b) {
+; CHECK-LABEL: vmla_shuffle_v4i32_long:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.f32 s8, s7
+; CHECK-NEXT: vmov.f32 s9, s6
+; CHECK-NEXT: vmov.f32 s10, s5
+; CHECK-NEXT: vmov.f32 s11, s4
+; CHECK-NEXT: vmov.f32 s4, s3
+; CHECK-NEXT: vmov.f32 s5, s2
+; CHECK-NEXT: vmov.f32 s6, s1
+; CHECK-NEXT: vmov.f32 s7, s0
+; CHECK-NEXT: vmlalv.u32 r0, r1, q1, q2
+; CHECK-NEXT: bx lr
+entry:
+  %s2a = shufflevector <4 x i32> %s0, <4 x i32> %s0, <4 x i32>
+  %s1a = zext <4 x i32> %s2a to <4 x i64>
+  %s2b = shufflevector <4 x i32> %s0b, <4 x i32> %s0, <4 x i32>
+  %s1b = zext <4 x i32> %s2b to <4 x i64>
+  %s1 = mul <4 x i64> %s1a, %s1b
+  %r = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s1)
+  ret i64 %r
+}
+
+define arm_aapcs_vfpcc i64 @vmla_shuffle_v4i32_long_a(<4 x i32> %s0, <4 x i32> %s0b, i64 %a) {
+; CHECK-LABEL: vmla_shuffle_v4i32_long_a:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.f32 s8, s7
+; CHECK-NEXT: vmov.f32 s9, s6
+; CHECK-NEXT: vmov.f32 s10, s5
+; CHECK-NEXT: vmov.f32 s11, s4
+; CHECK-NEXT: vmov.f32 s4, s3
+; CHECK-NEXT: vmov.f32 s5, s2
+; CHECK-NEXT: vmov.f32 s6, s1
+; CHECK-NEXT: vmov.f32 s7, s0
+; CHECK-NEXT: vmlalva.u32 r0, r1, q1, q2
+; CHECK-NEXT: bx lr
+entry:
+  %s2a = shufflevector <4 x i32> %s0, <4 x i32> %s0, <4 x i32>
+  %s1a = zext <4 x i32> %s2a to <4 x i64>
+  %s2b = shufflevector <4 x i32> %s0b, <4 x i32> %s0, <4 x i32>
+  %s1b = zext <4 x i32> %s2b to <4 x i64>
+  %s1 = mul <4 x i64> %s1a, %s1b
+  %r = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s1)
+  %r2 = add i64 %r, %a
+  ret i64 %r2
+}
+
 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
+declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
+declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
diff --git a/llvm/test/CodeGen/Thumb2/mve-vhadd.ll b/llvm/test/CodeGen/Thumb2/mve-vhadd.ll
index aa4edfc..1c21e71 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vhadd.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vhadd.ll
@@ -869,3 +869,68 @@ vector.body: ; preds = %vector.body, %entry
 for.cond.cleanup: ; preds = %vector.body
   ret void
 }
+
+
+define arm_aapcs_vfpcc i16 @vhadds_reduce_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
+; CHECK-LABEL: vhadds_reduce_v16i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vhadd.s8 q0, q0, q1
+; CHECK-NEXT: vaddv.s8 r0, q0
+; CHECK-NEXT: bx lr
+entry:
+  %s0s = sext <16 x i8> %s0 to <16 x i16>
+  %s1s = sext <16 x i8> %s1 to <16 x i16>
+  %add = add <16 x i16> %s0s, %s1s
+  %s = ashr <16 x i16> %add,
+  %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
+  ret i16 %result
+}
+
+define arm_aapcs_vfpcc i16 @vhaddu_reduce_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
+; CHECK-LABEL: vhaddu_reduce_v16i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vhadd.u8 q0, q0, q1
+; CHECK-NEXT: vaddv.u8 r0, q0
+; CHECK-NEXT: bx lr
+entry:
+  %s0s = zext <16 x i8> %s0 to <16 x i16>
+  %s1s = zext <16 x i8> %s1 to <16 x i16>
+  %add = add <16 x i16> %s0s, %s1s
+  %s = lshr <16 x i16> %add,
+  %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
+  ret i16 %result
+}
+
+define arm_aapcs_vfpcc i16 @vrhadds_reduce_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
+; CHECK-LABEL: vrhadds_reduce_v16i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vrhadd.s8 q0, q0, q1
+; CHECK-NEXT: vaddv.s8 r0, q0
+; CHECK-NEXT: bx lr
+entry:
+  %s0s = sext <16 x i8> %s0 to <16 x i16>
+  %s1s = sext <16 x i8> %s1 to <16 x i16>
+  %add = add <16 x i16> %s0s, %s1s
+  %add2 = add <16 x i16> %add,
+  %s = ashr <16 x i16> %add2,
+  %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
+  ret i16 %result
+}
+
+define arm_aapcs_vfpcc i16 @vrhaddu_reduce_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
+; CHECK-LABEL: vrhaddu_reduce_v16i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vrhadd.u8 q0, q0, q1
+; CHECK-NEXT: vaddv.u8 r0, q0
+; CHECK-NEXT: bx lr
+entry:
+  %s0s = zext <16 x i8> %s0 to <16 x i16>
+  %s1s = zext <16 x i8> %s1 to <16 x i16>
+  %add = add <16 x i16> %s0s, %s1s
+  %add2 = add <16 x i16> %add,
+  %s = lshr <16 x i16> %add2,
+  %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
+  ret i16 %result
+}
+
+declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
diff --git a/llvm/test/CodeGen/Thumb2/mve-vmulh.ll b/llvm/test/CodeGen/Thumb2/mve-vmulh.ll
index 3c65339..eb1527f 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmulh.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmulh.ll
@@ -759,6 +759,40 @@ for.cond.cleanup: ; preds = %vector.body, %entry
   ret void
 }
 
+
+define arm_aapcs_vfpcc i16 @vmulhs_reduce_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
+; CHECK-LABEL: vmulhs_reduce_v16i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmulh.s8 q0, q0, q1
+; CHECK-NEXT: vaddv.s8 r0, q0
+; CHECK-NEXT: bx lr
+entry:
+  %s0s = sext <16 x i8> %s0 to <16 x i16>
+  %s1s = sext <16 x i8> %s1 to <16 x i16>
+  %m = mul <16 x i16> %s0s, %s1s
+  %s = ashr <16 x i16> %m,
+  %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
+  ret i16 %result
+}
+
+define arm_aapcs_vfpcc i16 @vmulhu_reduce_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
+; CHECK-LABEL: vmulhu_reduce_v16i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmulh.u8 q0, q0, q1
+; CHECK-NEXT: vaddv.s8 r0, q0
+; CHECK-NEXT: bx lr
+entry:
+  %s0s = zext <16 x i8> %s0 to <16 x i16>
+  %s1s = zext <16 x i8> %s1 to <16 x i16>
+  %m = mul <16 x i16> %s0s, %s1s
+  %s = ashr <16 x i16> %m,
+  %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
+  ret i16 %result
+}
+
+declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
+
+
 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
 declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>)
 declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>)
-- 
2.7.4