From 5f65676eba16f38e5e22122e6885c0bd8e504276 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 8 Jul 2021 12:32:45 +0100 Subject: [PATCH] aarch64: Use memcpy to copy vector tables in vqtbx[234] intrinsics Use __builtin_memcpy to copy vector structures instead of building a new opaque structure one vector at a time in each of the vqtbx[234] Neon intrinsics in arm_neon.h. This simplifies the header file and also improves code generation - superfluous move instructions were emitted for every register extraction/set in this additional structure. Add new code generation tests to verify that superfluous move instructions are no longer generated for the vqtbx[234] intrinsics. gcc/ChangeLog: 2021-07-08 Jonathan Wright * config/aarch64/arm_neon.h (vqtbx2_s8): Use __builtin_memcpy instead of constructing __builtin_aarch64_simd_oi one vector at a time. (vqtbx2_u8): Likewise. (vqtbx2_p8): Likewise. (vqtbx2q_s8): Likewise. (vqtbx2q_u8): Likewise. (vqtbx2q_p8): Likewise. (vqtbx3_s8): Use __builtin_memcpy instead of constructing __builtin_aarch64_simd_ci one vector at a time. (vqtbx3_u8): Likewise. (vqtbx3_p8): Likewise. (vqtbx3q_s8): Likewise. (vqtbx3q_u8): Likewise. (vqtbx3q_p8): Likewise. (vqtbx4_s8): Use __builtin_memcpy instead of constructing __builtin_aarch64_simd_xi one vector at a time. (vqtbx4_u8): Likewise. (vqtbx4_p8): Likewise. (vqtbx4q_s8): Likewise. (vqtbx4q_u8): Likewise. (vqtbx4q_p8): Likewise. gcc/testsuite/ChangeLog: * gcc.target/aarch64/vector_structure_intrinsics.c: New tests. --- gcc/config/aarch64/arm_neon.h | 77 ++++++---------------- .../aarch64/vector_structure_intrinsics.c | 44 +++++++++++++ 2 files changed, 65 insertions(+), 56 deletions(-) diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 31ae86e..a7b8449 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -23482,15 +23482,14 @@ vqtbl4q_p8 (poly8x16x4_t __tab, uint8x16_t __idx) return (poly8x16_t)__builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)__idx); } - /* vqtbx2 */ + __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx2_s8 (int8x8_t __r, int8x16x2_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[1], 1); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return __builtin_aarch64_qtbx2v8qi (__r, __o, (int8x8_t)__idx); } @@ -23499,8 +23498,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx2_u8 (uint8x8_t __r, uint8x16x2_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (uint8x8_t)__builtin_aarch64_qtbx2v8qi ((int8x8_t)__r, __o, (int8x8_t)__idx); } @@ -23510,8 +23508,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx2_p8 (poly8x8_t __r, poly8x16x2_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (poly8x8_t)__builtin_aarch64_qtbx2v8qi ((int8x8_t)__r, __o, (int8x8_t)__idx); } @@ -23521,8 +23518,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx2q_s8 (int8x16_t __r, int8x16x2_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[1], 1); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return __builtin_aarch64_qtbx2v16qi (__r, __o, (int8x16_t)__idx); } @@ -23531,10 +23527,9 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx2q_u8 (uint8x16_t __r, uint8x16x2_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (uint8x16_t)__builtin_aarch64_qtbx2v16qi ((int8x16_t)__r, __o, - (int8x16_t)__idx); + (int8x16_t)__idx); } __extension__ extern __inline poly8x16_t @@ -23542,21 +23537,19 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx2q_p8 (poly8x16_t __r, poly8x16x2_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (poly8x16_t)__builtin_aarch64_qtbx2v16qi ((int8x16_t)__r, __o, (int8x16_t)__idx); } /* vqtbx3 */ + __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx3_s8 (int8x8_t __r, int8x16x3_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, __tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, __tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, __tab.val[2], 2); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return __builtin_aarch64_qtbx3v8qi (__r, __o, (int8x8_t)__idx); } @@ -23565,9 +23558,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx3_u8 (uint8x8_t __r, uint8x16x3_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (uint8x8_t)__builtin_aarch64_qtbx3v8qi ((int8x8_t)__r, __o, (int8x8_t)__idx); } @@ -23577,9 +23568,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx3_p8 (poly8x8_t __r, poly8x16x3_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (poly8x8_t)__builtin_aarch64_qtbx3v8qi ((int8x8_t)__r, __o, (int8x8_t)__idx); } @@ -23589,9 +23578,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx3q_s8 (int8x16_t __r, int8x16x3_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, __tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, __tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, __tab.val[2], 2); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return __builtin_aarch64_qtbx3v16qi (__r, __o, (int8x16_t)__idx); } @@ -23600,9 +23587,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx3q_u8 (uint8x16_t __r, uint8x16x3_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (uint8x16_t)__builtin_aarch64_qtbx3v16qi ((int8x16_t)__r, __o, (int8x16_t)__idx); } @@ -23612,9 +23597,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx3q_p8 (poly8x16_t __r, poly8x16x3_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (poly8x16_t)__builtin_aarch64_qtbx3v16qi ((int8x16_t)__r, __o, (int8x16_t)__idx); } @@ -23626,10 +23609,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx4_s8 (int8x8_t __r, int8x16x4_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[3], 3); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return __builtin_aarch64_qtbx4v8qi (__r, __o, (int8x8_t)__idx); } @@ -23638,10 +23618,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx4_u8 (uint8x8_t __r, uint8x16x4_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (uint8x8_t)__builtin_aarch64_qtbx4v8qi ((int8x8_t)__r, __o, (int8x8_t)__idx); } @@ -23651,10 +23628,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx4_p8 (poly8x8_t __r, poly8x16x4_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (poly8x8_t)__builtin_aarch64_qtbx4v8qi ((int8x8_t)__r, __o, (int8x8_t)__idx); } @@ -23664,10 +23638,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx4q_s8 (int8x16_t __r, int8x16x4_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[3], 3); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return __builtin_aarch64_qtbx4v16qi (__r, __o, (int8x16_t)__idx); } @@ -23676,10 +23647,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx4q_u8 (uint8x16_t __r, uint8x16x4_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (uint8x16_t)__builtin_aarch64_qtbx4v16qi ((int8x16_t)__r, __o, (int8x16_t)__idx); } @@ -23689,10 +23657,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx4q_p8 (poly8x16_t __r, poly8x16x4_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); + __builtin_memcpy (&__o, &__tab, sizeof (__tab)); return (poly8x16_t)__builtin_aarch64_qtbx4v16qi ((int8x16_t)__r, __o, (int8x16_t)__idx); } diff --git a/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c b/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c index 0b07e9e..b60e80e 100644 --- a/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c +++ b/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c @@ -39,6 +39,50 @@ TEST_TBL3 (vqtbl3q, int8x16_t, int8x16x3_t, uint8x16_t, s8) TEST_TBL3 (vqtbl3q, uint8x16_t, uint8x16x3_t, uint8x16_t, u8) TEST_TBL3 (vqtbl3q, poly8x16_t, poly8x16x3_t, uint8x16_t, p8) +#define TEST_TBX2(name, rettype, tbltype, idxtype, ts) \ + rettype test_ ## name ## _ ## ts (rettype a, idxtype b, tbltype c) \ + { \ + return name ## _ ## ts (a, c, b); \ + } + +TEST_TBX2 (vqtbx2, int8x8_t, int8x16x2_t, uint8x8_t, s8) +TEST_TBX2 (vqtbx2, uint8x8_t, uint8x16x2_t, uint8x8_t, u8) +TEST_TBX2 (vqtbx2, poly8x8_t, poly8x16x2_t, uint8x8_t, p8) + +TEST_TBX2 (vqtbx2q, int8x16_t, int8x16x2_t, uint8x16_t, s8) +TEST_TBX2 (vqtbx2q, uint8x16_t, uint8x16x2_t, uint8x16_t, u8) +TEST_TBX2 (vqtbx2q, poly8x16_t, poly8x16x2_t, uint8x16_t, p8) + +#define TEST_TBX3(name, rettype, tbltype, idxtype, ts) \ + rettype test_ ## name ## _ ## ts (rettype a, tbltype b, idxtype c) \ + { \ + return name ## _ ## ts (a, b, c); \ + } + +TEST_TBX3 (vqtbx3, int8x8_t, int8x16x3_t, uint8x8_t, s8) +TEST_TBX3 (vqtbx3, uint8x8_t, uint8x16x3_t, uint8x8_t, u8) +TEST_TBX3 (vqtbx3, poly8x8_t, poly8x16x3_t, uint8x8_t, p8) + +TEST_TBX3 (vqtbx3q, int8x16_t, int8x16x3_t, uint8x16_t, s8) +TEST_TBX3 (vqtbx3q, uint8x16_t, uint8x16x3_t, uint8x16_t, u8) +TEST_TBX3 (vqtbx3q, poly8x16_t, poly8x16x3_t, uint8x16_t, p8) + +#define TEST_TBX4(name, rettype, tbltype, idxtype, ts) \ + rettype test_ ## name ## _ ## ts (rettype a, idxtype b, idxtype dummy1, \ + idxtype dummy2, tbltype c) \ + { \ + return name ## _ ## ts (a, c, b); \ + } + +TEST_TBX4 (vqtbx4, int8x8_t, int8x16x4_t, uint8x8_t, s8) +TEST_TBX4 (vqtbx4, uint8x8_t, uint8x16x4_t, uint8x8_t, u8) +TEST_TBX4 (vqtbx4, poly8x8_t, poly8x16x4_t, uint8x8_t, p8) + +TEST_TBX4 (vqtbx4q, int8x16_t, int8x16x4_t, uint8x16_t, s8) +TEST_TBX4 (vqtbx4q, uint8x16_t, uint8x16x4_t, uint8x16_t, u8) +TEST_TBX4 (vqtbx4q, poly8x16_t, poly8x16x4_t, uint8x16_t, p8) + /* { dg-final { scan-assembler-not "mov\\t" } } */ /* { dg-final { scan-assembler-times "tbl\\t" 18} } */ +/* { dg-final { scan-assembler-times "tbx\\t" 18} } */ -- 2.7.4