From 1a6306420090409cb397e2e042256eb1905f415f Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Thu, 15 Oct 2020 17:13:59 +0000 Subject: [PATCH] arm: Implement vceqq_p64, vceqz_p64 and vceqzq_p64 intrinsics This patch adds implementations for vceqq_p64, vceqz_p64 and vceqzq_p64 intrinsics. vceqq_p64 uses the existing vceq_p64 after splitting the input vectors into their high and low halves. vceqz[q] simply call the vceq and vceqq with a second argument equal to zero. The added (executable) testcases make sure that the poly64x2_t variants have results with one element of all zeroes (false) and the other element with all bits set to one (true). 2021-01-15 Christophe Lyon gcc/ PR target/71233 * config/arm/arm_neon.h (vceqz_p64, vceqq_p64, vceqzq_p64): New. gcc/testsuite/ PR target/71233 * gcc.target/aarch64/advsimd-intrinsics/p64_p128.c: Add tests for vceqz_p64, vceqq_p64 and vceqzq_p64. --- gcc/config/arm/arm_neon.h | 31 +++++++++++++++ .../aarch64/advsimd-intrinsics/p64_p128.c | 46 +++++++++++++++++++++- 2 files changed, 76 insertions(+), 1 deletion(-) diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index f99b939..dc28b92 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -16912,6 +16912,37 @@ vceq_p64 (poly64x1_t __a, poly64x1_t __b) return vreinterpret_u64_u32 (__m); } +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vceqz_p64 (poly64x1_t __a) +{ + poly64x1_t __b = vreinterpret_p64_u32 (vdup_n_u32 (0)); + return vceq_p64 (__a, __b); +} + +/* For vceqq_p64, we rely on vceq_p64 for each of the two elements. */ +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vceqq_p64 (poly64x2_t __a, poly64x2_t __b) +{ + poly64_t __high_a = vget_high_p64 (__a); + poly64_t __high_b = vget_high_p64 (__b); + uint64x1_t __high = vceq_p64 (__high_a, __high_b); + + poly64_t __low_a = vget_low_p64 (__a); + poly64_t __low_b = vget_low_p64 (__b); + uint64x1_t __low = vceq_p64 (__low_a, __low_b); + return vcombine_u64 (__low, __high); +} + +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vceqzq_p64 (poly64x2_t __a) +{ + poly64x2_t __b = vreinterpretq_p64_u32 (vdupq_n_u32 (0)); + return vceqq_p64 (__a, __b); +} + /* The vtst_p64 intrinsic does not map to a single instruction. We emulate it in way similar to vceq_p64 above but here we do a reduction with max since if any two corresponding bits diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c index a3210a9..6aed096 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c @@ -16,6 +16,11 @@ VECT_VAR_DECL(vbsl_expected,poly,64,2) [] = { 0xfffffff1, /* Expected results: vceq. */ VECT_VAR_DECL(vceq_expected,uint,64,1) [] = { 0x0 }; +VECT_VAR_DECL(vceq_expected,uint,64,2) [] = { 0x0, 0xffffffffffffffff }; + +/* Expected results: vceqz. */ +VECT_VAR_DECL(vceqz_expected,uint,64,1) [] = { 0x0 }; +VECT_VAR_DECL(vceqz_expected,uint,64,2) [] = { 0x0, 0xffffffffffffffff }; /* Expected results: vcombine. */ VECT_VAR_DECL(vcombine_expected,poly,64,2) [] = { 0xfffffffffffffff0, 0x88 }; @@ -213,7 +218,7 @@ int main (void) /* vceq_p64 tests. */ #undef TEST_MSG -#define TEST_MSG "VCEQ" +#define TEST_MSG "VCEQ/VCEQQ" #define TEST_VCOMP1(INSN, Q, T1, T2, T3, W, N) \ VECT_VAR(vceq_vector_res, T3, W, N) = \ @@ -227,16 +232,55 @@ int main (void) DECL_VARIABLE(vceq_vector, poly, 64, 1); DECL_VARIABLE(vceq_vector2, poly, 64, 1); DECL_VARIABLE(vceq_vector_res, uint, 64, 1); + DECL_VARIABLE(vceq_vector, poly, 64, 2); + DECL_VARIABLE(vceq_vector2, poly, 64, 2); + DECL_VARIABLE(vceq_vector_res, uint, 64, 2); CLEAN(result, uint, 64, 1); + CLEAN(result, uint, 64, 2); VLOAD(vceq_vector, buffer, , poly, p, 64, 1); + VLOAD(vceq_vector, buffer, q, poly, p, 64, 2); VDUP(vceq_vector2, , poly, p, 64, 1, 0x88); + VSET_LANE(vceq_vector2, q, poly, p, 64, 2, 0, 0x88); + VSET_LANE(vceq_vector2, q, poly, p, 64, 2, 1, 0xFFFFFFFFFFFFFFF1); TEST_VCOMP(vceq, , poly, p, uint, 64, 1); + TEST_VCOMP(vceq, q, poly, p, uint, 64, 2); CHECK(TEST_MSG, uint, 64, 1, PRIx64, vceq_expected, ""); + CHECK(TEST_MSG, uint, 64, 2, PRIx64, vceq_expected, ""); + + /* vceqz_p64 tests. */ +#undef TEST_MSG +#define TEST_MSG "VCEQZ/VCEQZQ" + +#define TEST_VCOMPZ1(INSN, Q, T1, T2, T3, W, N) \ + VECT_VAR(vceqz_vector_res, T3, W, N) = \ + INSN##Q##_##T2##W(VECT_VAR(vceqz_vector, T1, W, N)); \ + vst1##Q##_u##W(VECT_VAR(result, T3, W, N), VECT_VAR(vceqz_vector_res, T3, W, N)) + +#define TEST_VCOMPZ(INSN, Q, T1, T2, T3, W, N) \ + TEST_VCOMPZ1(INSN, Q, T1, T2, T3, W, N) + + DECL_VARIABLE(vceqz_vector, poly, 64, 1); + DECL_VARIABLE(vceqz_vector_res, uint, 64, 1); + DECL_VARIABLE(vceqz_vector, poly, 64, 2); + DECL_VARIABLE(vceqz_vector_res, uint, 64, 2); + + CLEAN(result, uint, 64, 1); + CLEAN(result, uint, 64, 2); + + VLOAD(vceqz_vector, buffer, , poly, p, 64, 1); + VLOAD(vceqz_vector, buffer, q, poly, p, 64, 2); + VSET_LANE(vceqz_vector, q, poly, p, 64, 2, 1, 0); + + TEST_VCOMPZ(vceqz, , poly, p, uint, 64, 1); + TEST_VCOMPZ(vceqz, q, poly, p, uint, 64, 2); + + CHECK(TEST_MSG, uint, 64, 1, PRIx64, vceqz_expected, ""); + CHECK(TEST_MSG, uint, 64, 2, PRIx64, vceqz_expected, ""); /* vcombine_p64 tests. */ #undef TEST_MSG -- 2.7.4