From 7211512a5f817ad08e333774bd186c0c5ff48533 Mon Sep 17 00:00:00 2001
From: Alan Lawrence
Date: Wed, 30 Apr 2014 17:04:53 +0000
Subject: [PATCH] Rewrite AArch64 UZP Intrinsics using __builtin_shuffle.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/simd/vuzps32_1.c: Expect zip1/2 insn rather than uzp1/2.
	* gcc.target/aarch64/simd/vuzpu32_1.c: Likewise.
	* gcc.target/aarch64/simd/vuzpf32_1.c: Likewise.

gcc/ChangeLog:

	* config/aarch64/arm_neon.h (vuzp1_f32, vuzp1_p8, vuzp1_p16, vuzp1_s8,
	vuzp1_s16, vuzp1_s32, vuzp1_u8, vuzp1_u16, vuzp1_u32, vuzp1q_f32,
	vuzp1q_f64, vuzp1q_p8, vuzp1q_p16, vuzp1q_s8, vuzp1q_s16, vuzp1q_s32,
	vuzp1q_s64, vuzp1q_u8, vuzp1q_u16, vuzp1q_u32, vuzp1q_u64, vuzp2_f32,
	vuzp2_p8, vuzp2_p16, vuzp2_s8, vuzp2_s16, vuzp2_s32, vuzp2_u8,
	vuzp2_u16, vuzp2_u32, vuzp2q_f32, vuzp2q_f64, vuzp2q_p8, vuzp2q_p16,
	vuzp2q_s8, vuzp2q_s16, vuzp2q_s32, vuzp2q_s64, vuzp2q_u8, vuzp2q_u16,
	vuzp2q_u32, vuzp2q_u64): Replace temporary asm with __builtin_shuffle.

From-SVN: r209943
---
 gcc/ChangeLog                                     |   11 +
 gcc/config/aarch64/arm_neon.h                     | 1335 ++++++++++-----------
 gcc/testsuite/ChangeLog                           |    6 +
 gcc/testsuite/gcc.target/aarch64/simd/vuzpf32_1.c |    4 +-
 gcc/testsuite/gcc.target/aarch64/simd/vuzps32_1.c |    4 +-
 gcc/testsuite/gcc.target/aarch64/simd/vuzpu32_1.c |    4 +-
 6 files changed, 676 insertions(+), 688 deletions(-)

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 1ac6d14..e3ae8dc 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,14 @@
+2014-04-30  Alan Lawrence
+
+	* config/aarch64/arm_neon.h (vuzp1_f32, vuzp1_p8, vuzp1_p16, vuzp1_s8,
+	vuzp1_s16, vuzp1_s32, vuzp1_u8, vuzp1_u16, vuzp1_u32, vuzp1q_f32,
+	vuzp1q_f64, vuzp1q_p8, vuzp1q_p16, vuzp1q_s8, vuzp1q_s16, vuzp1q_s32,
+	vuzp1q_s64, vuzp1q_u8, vuzp1q_u16, vuzp1q_u32, vuzp1q_u64, vuzp2_f32,
+	vuzp2_p8, vuzp2_p16, vuzp2_s8, vuzp2_s16, vuzp2_s32, vuzp2_u8,
+	vuzp2_u16, vuzp2_u32, vuzp2q_f32, vuzp2q_f64, vuzp2q_p8, vuzp2q_p16,
+	vuzp2q_s8, vuzp2q_s16, vuzp2q_s32, vuzp2q_s64, vuzp2q_u8, vuzp2q_u16,
+	vuzp2q_u32, vuzp2q_u64): Replace temporary asm with __builtin_shuffle.
+ 2014-04-30 Joern Rennecke * config/arc/arc.opt (mlra): Move comment above option name diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index fa57667..f6213ce 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -13199,467 +13199,6 @@ vtstq_p16 (poly16x8_t a, poly16x8_t b) : /* No clobbers */); return result; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vuzp1_f32 (float32x2_t a, float32x2_t b) -{ - float32x2_t result; - __asm__ ("uzp1 %0.2s,%1.2s,%2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vuzp1_p8 (poly8x8_t a, poly8x8_t b) -{ - poly8x8_t result; - __asm__ ("uzp1 %0.8b,%1.8b,%2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) -vuzp1_p16 (poly16x4_t a, poly16x4_t b) -{ - poly16x4_t result; - __asm__ ("uzp1 %0.4h,%1.4h,%2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vuzp1_s8 (int8x8_t a, int8x8_t b) -{ - int8x8_t result; - __asm__ ("uzp1 %0.8b,%1.8b,%2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vuzp1_s16 (int16x4_t a, int16x4_t b) -{ - int16x4_t result; - __asm__ ("uzp1 %0.4h,%1.4h,%2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vuzp1_s32 (int32x2_t a, int32x2_t b) -{ - int32x2_t result; - __asm__ ("uzp1 %0.2s,%1.2s,%2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vuzp1_u8 (uint8x8_t a, uint8x8_t b) -{ - uint8x8_t result; - __asm__ ("uzp1 %0.8b,%1.8b,%2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vuzp1_u16 (uint16x4_t a, uint16x4_t b) -{ - uint16x4_t result; - __asm__ ("uzp1 %0.4h,%1.4h,%2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vuzp1_u32 (uint32x2_t a, uint32x2_t b) -{ - uint32x2_t result; - __asm__ ("uzp1 %0.2s,%1.2s,%2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vuzp1q_f32 (float32x4_t a, float32x4_t b) -{ - float32x4_t result; - __asm__ ("uzp1 %0.4s,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vuzp1q_f64 (float64x2_t a, float64x2_t b) -{ - float64x2_t result; - __asm__ ("uzp1 %0.2d,%1.2d,%2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vuzp1q_p8 (poly8x16_t a, poly8x16_t b) -{ - poly8x16_t result; - __asm__ ("uzp1 %0.16b,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly16x8_t __attribute__ 
((__always_inline__)) -vuzp1q_p16 (poly16x8_t a, poly16x8_t b) -{ - poly16x8_t result; - __asm__ ("uzp1 %0.8h,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vuzp1q_s8 (int8x16_t a, int8x16_t b) -{ - int8x16_t result; - __asm__ ("uzp1 %0.16b,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vuzp1q_s16 (int16x8_t a, int16x8_t b) -{ - int16x8_t result; - __asm__ ("uzp1 %0.8h,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vuzp1q_s32 (int32x4_t a, int32x4_t b) -{ - int32x4_t result; - __asm__ ("uzp1 %0.4s,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vuzp1q_s64 (int64x2_t a, int64x2_t b) -{ - int64x2_t result; - __asm__ ("uzp1 %0.2d,%1.2d,%2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vuzp1q_u8 (uint8x16_t a, uint8x16_t b) -{ - uint8x16_t result; - __asm__ ("uzp1 %0.16b,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vuzp1q_u16 (uint16x8_t a, uint16x8_t b) -{ - uint16x8_t result; - __asm__ ("uzp1 %0.8h,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vuzp1q_u32 (uint32x4_t a, uint32x4_t b) -{ - uint32x4_t result; - __asm__ ("uzp1 %0.4s,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vuzp1q_u64 (uint64x2_t a, uint64x2_t b) -{ - uint64x2_t result; - __asm__ ("uzp1 %0.2d,%1.2d,%2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vuzp2_f32 (float32x2_t a, float32x2_t b) -{ - float32x2_t result; - __asm__ ("uzp2 %0.2s,%1.2s,%2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vuzp2_p8 (poly8x8_t a, poly8x8_t b) -{ - poly8x8_t result; - __asm__ ("uzp2 %0.8b,%1.8b,%2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) -vuzp2_p16 (poly16x4_t a, poly16x4_t b) -{ - poly16x4_t result; - __asm__ ("uzp2 %0.4h,%1.4h,%2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vuzp2_s8 (int8x8_t a, int8x8_t b) -{ - int8x8_t result; - __asm__ ("uzp2 %0.8b,%1.8b,%2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vuzp2_s16 (int16x4_t a, int16x4_t b) -{ - int16x4_t result; - __asm__ ("uzp2 %0.4h,%1.4h,%2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return 
result; -} - -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vuzp2_s32 (int32x2_t a, int32x2_t b) -{ - int32x2_t result; - __asm__ ("uzp2 %0.2s,%1.2s,%2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vuzp2_u8 (uint8x8_t a, uint8x8_t b) -{ - uint8x8_t result; - __asm__ ("uzp2 %0.8b,%1.8b,%2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vuzp2_u16 (uint16x4_t a, uint16x4_t b) -{ - uint16x4_t result; - __asm__ ("uzp2 %0.4h,%1.4h,%2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vuzp2_u32 (uint32x2_t a, uint32x2_t b) -{ - uint32x2_t result; - __asm__ ("uzp2 %0.2s,%1.2s,%2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vuzp2q_f32 (float32x4_t a, float32x4_t b) -{ - float32x4_t result; - __asm__ ("uzp2 %0.4s,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vuzp2q_f64 (float64x2_t a, float64x2_t b) -{ - float64x2_t result; - __asm__ ("uzp2 %0.2d,%1.2d,%2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vuzp2q_p8 (poly8x16_t a, poly8x16_t b) -{ - poly8x16_t result; - __asm__ ("uzp2 %0.16b,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) -vuzp2q_p16 (poly16x8_t a, poly16x8_t b) -{ - poly16x8_t result; - __asm__ ("uzp2 %0.8h,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vuzp2q_s8 (int8x16_t a, int8x16_t b) -{ - int8x16_t result; - __asm__ ("uzp2 %0.16b,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vuzp2q_s16 (int16x8_t a, int16x8_t b) -{ - int16x8_t result; - __asm__ ("uzp2 %0.8h,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vuzp2q_s32 (int32x4_t a, int32x4_t b) -{ - int32x4_t result; - __asm__ ("uzp2 %0.4s,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vuzp2q_s64 (int64x2_t a, int64x2_t b) -{ - int64x2_t result; - __asm__ ("uzp2 %0.2d,%1.2d,%2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vuzp2q_u8 (uint8x16_t a, uint8x16_t b) -{ - uint8x16_t result; - __asm__ ("uzp2 %0.16b,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vuzp2q_u16 (uint16x8_t a, uint16x8_t b) -{ - uint16x8_t result; - __asm__ ("uzp2 
%0.8h,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vuzp2q_u32 (uint32x4_t a, uint32x4_t b) -{ - uint32x4_t result; - __asm__ ("uzp2 %0.4s,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vuzp2q_u64 (uint64x2_t a, uint64x2_t b) -{ - uint64x2_t result; - __asm__ ("uzp2 %0.2d,%1.2d,%2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} /* End of temporary inline asm implementations. */ @@ -24844,407 +24383,839 @@ vst4q_f64 (float64_t * __a, float64x2x4_t val) __extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) vsubd_s64 (int64x1_t __a, int64x1_t __b) { - return __a - __b; + return __a - __b; +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vsubd_u64 (uint64x1_t __a, uint64x1_t __b) +{ + return __a - __b; +} + +/* vtbx1 */ + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vtbx1_s8 (int8x8_t __r, int8x8_t __tab, int8x8_t __idx) +{ + uint8x8_t __mask = vclt_u8 (vreinterpret_u8_s8 (__idx), + vmov_n_u8 (8)); + int8x8_t __tbl = vtbl1_s8 (__tab, __idx); + + return vbsl_s8 (__mask, __tbl, __r); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vtbx1_u8 (uint8x8_t __r, uint8x8_t __tab, uint8x8_t __idx) +{ + uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (8)); + uint8x8_t __tbl = vtbl1_u8 (__tab, __idx); + + return vbsl_u8 (__mask, __tbl, __r); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vtbx1_p8 (poly8x8_t __r, poly8x8_t __tab, uint8x8_t __idx) +{ + uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (8)); + poly8x8_t __tbl = vtbl1_p8 (__tab, __idx); + + return vbsl_p8 (__mask, __tbl, __r); +} + +/* vtbx3 */ + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vtbx3_s8 (int8x8_t __r, int8x8x3_t __tab, int8x8_t __idx) +{ + uint8x8_t __mask = vclt_u8 (vreinterpret_u8_s8 (__idx), + vmov_n_u8 (24)); + int8x8_t __tbl = vtbl3_s8 (__tab, __idx); + + return vbsl_s8 (__mask, __tbl, __r); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vtbx3_u8 (uint8x8_t __r, uint8x8x3_t __tab, uint8x8_t __idx) +{ + uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (24)); + uint8x8_t __tbl = vtbl3_u8 (__tab, __idx); + + return vbsl_u8 (__mask, __tbl, __r); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vtbx3_p8 (poly8x8_t __r, poly8x8x3_t __tab, uint8x8_t __idx) +{ + uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (24)); + poly8x8_t __tbl = vtbl3_p8 (__tab, __idx); + + return vbsl_p8 (__mask, __tbl, __r); +} + +/* vtrn */ + +__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__)) +vtrn_f32 (float32x2_t a, float32x2_t b) +{ + return (float32x2x2_t) {vtrn1_f32 (a, b), vtrn2_f32 (a, b)}; +} + +__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__)) +vtrn_p8 (poly8x8_t a, poly8x8_t b) +{ + return (poly8x8x2_t) {vtrn1_p8 (a, b), vtrn2_p8 (a, b)}; +} + +__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__)) +vtrn_p16 (poly16x4_t a, poly16x4_t b) +{ + return (poly16x4x2_t) {vtrn1_p16 (a, b), vtrn2_p16 (a, b)}; +} + +__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__)) +vtrn_s8 (int8x8_t a, int8x8_t b) +{ + return (int8x8x2_t) 
{vtrn1_s8 (a, b), vtrn2_s8 (a, b)}; +} + +__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__)) +vtrn_s16 (int16x4_t a, int16x4_t b) +{ + return (int16x4x2_t) {vtrn1_s16 (a, b), vtrn2_s16 (a, b)}; +} + +__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__)) +vtrn_s32 (int32x2_t a, int32x2_t b) +{ + return (int32x2x2_t) {vtrn1_s32 (a, b), vtrn2_s32 (a, b)}; +} + +__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__)) +vtrn_u8 (uint8x8_t a, uint8x8_t b) +{ + return (uint8x8x2_t) {vtrn1_u8 (a, b), vtrn2_u8 (a, b)}; +} + +__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__)) +vtrn_u16 (uint16x4_t a, uint16x4_t b) +{ + return (uint16x4x2_t) {vtrn1_u16 (a, b), vtrn2_u16 (a, b)}; +} + +__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__)) +vtrn_u32 (uint32x2_t a, uint32x2_t b) +{ + return (uint32x2x2_t) {vtrn1_u32 (a, b), vtrn2_u32 (a, b)}; +} + +__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__)) +vtrnq_f32 (float32x4_t a, float32x4_t b) +{ + return (float32x4x2_t) {vtrn1q_f32 (a, b), vtrn2q_f32 (a, b)}; +} + +__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__)) +vtrnq_p8 (poly8x16_t a, poly8x16_t b) +{ + return (poly8x16x2_t) {vtrn1q_p8 (a, b), vtrn2q_p8 (a, b)}; +} + +__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__)) +vtrnq_p16 (poly16x8_t a, poly16x8_t b) +{ + return (poly16x8x2_t) {vtrn1q_p16 (a, b), vtrn2q_p16 (a, b)}; +} + +__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__)) +vtrnq_s8 (int8x16_t a, int8x16_t b) +{ + return (int8x16x2_t) {vtrn1q_s8 (a, b), vtrn2q_s8 (a, b)}; +} + +__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__)) +vtrnq_s16 (int16x8_t a, int16x8_t b) +{ + return (int16x8x2_t) {vtrn1q_s16 (a, b), vtrn2q_s16 (a, b)}; +} + +__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__)) +vtrnq_s32 (int32x4_t a, int32x4_t b) +{ + return (int32x4x2_t) {vtrn1q_s32 (a, b), vtrn2q_s32 (a, b)}; +} + +__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__)) +vtrnq_u8 (uint8x16_t a, uint8x16_t b) +{ + return (uint8x16x2_t) {vtrn1q_u8 (a, b), vtrn2q_u8 (a, b)}; +} + +__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__)) +vtrnq_u16 (uint16x8_t a, uint16x8_t b) +{ + return (uint16x8x2_t) {vtrn1q_u16 (a, b), vtrn2q_u16 (a, b)}; +} + +__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__)) +vtrnq_u32 (uint32x4_t a, uint32x4_t b) +{ + return (uint32x4x2_t) {vtrn1q_u32 (a, b), vtrn2q_u32 (a, b)}; +} + +/* vtst */ + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vtst_s8 (int8x8_t __a, int8x8_t __b) +{ + return (uint8x8_t) __builtin_aarch64_cmtstv8qi (__a, __b); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vtst_s16 (int16x4_t __a, int16x4_t __b) +{ + return (uint16x4_t) __builtin_aarch64_cmtstv4hi (__a, __b); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vtst_s32 (int32x2_t __a, int32x2_t __b) +{ + return (uint32x2_t) __builtin_aarch64_cmtstv2si (__a, __b); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vtst_s64 (int64x1_t __a, int64x1_t __b) +{ + return (__a & __b) ? 
-1ll : 0ll; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vtst_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint8x8_t) __builtin_aarch64_cmtstv8qi ((int8x8_t) __a, + (int8x8_t) __b); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vtst_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return (uint16x4_t) __builtin_aarch64_cmtstv4hi ((int16x4_t) __a, + (int16x4_t) __b); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vtst_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint32x2_t) __builtin_aarch64_cmtstv2si ((int32x2_t) __a, + (int32x2_t) __b); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vtst_u64 (uint64x1_t __a, uint64x1_t __b) +{ + return (__a & __b) ? -1ll : 0ll; +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vtstq_s8 (int8x16_t __a, int8x16_t __b) +{ + return (uint8x16_t) __builtin_aarch64_cmtstv16qi (__a, __b); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vtstq_s16 (int16x8_t __a, int16x8_t __b) +{ + return (uint16x8_t) __builtin_aarch64_cmtstv8hi (__a, __b); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vtstq_s32 (int32x4_t __a, int32x4_t __b) +{ + return (uint32x4_t) __builtin_aarch64_cmtstv4si (__a, __b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vtstq_s64 (int64x2_t __a, int64x2_t __b) +{ + return (uint64x2_t) __builtin_aarch64_cmtstv2di (__a, __b); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vsubd_u64 (uint64x1_t __a, uint64x1_t __b) +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vtstq_u8 (uint8x16_t __a, uint8x16_t __b) { - return __a - __b; + return (uint8x16_t) __builtin_aarch64_cmtstv16qi ((int8x16_t) __a, + (int8x16_t) __b); } -/* vtbx1 */ - -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vtbx1_s8 (int8x8_t __r, int8x8_t __tab, int8x8_t __idx) +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vtstq_u16 (uint16x8_t __a, uint16x8_t __b) { - uint8x8_t __mask = vclt_u8 (vreinterpret_u8_s8 (__idx), - vmov_n_u8 (8)); - int8x8_t __tbl = vtbl1_s8 (__tab, __idx); - - return vbsl_s8 (__mask, __tbl, __r); + return (uint16x8_t) __builtin_aarch64_cmtstv8hi ((int16x8_t) __a, + (int16x8_t) __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vtbx1_u8 (uint8x8_t __r, uint8x8_t __tab, uint8x8_t __idx) +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vtstq_u32 (uint32x4_t __a, uint32x4_t __b) { - uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (8)); - uint8x8_t __tbl = vtbl1_u8 (__tab, __idx); + return (uint32x4_t) __builtin_aarch64_cmtstv4si ((int32x4_t) __a, + (int32x4_t) __b); +} - return vbsl_u8 (__mask, __tbl, __r); +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vtstq_u64 (uint64x2_t __a, uint64x2_t __b) +{ + return (uint64x2_t) __builtin_aarch64_cmtstv2di ((int64x2_t) __a, + (int64x2_t) __b); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vtbx1_p8 (poly8x8_t __r, poly8x8_t __tab, uint8x8_t __idx) +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vtstd_s64 (int64x1_t __a, int64x1_t __b) { - uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (8)); - poly8x8_t __tbl = vtbl1_p8 (__tab, __idx); + return (__a & __b) ? 
-1ll : 0ll; +} - return vbsl_p8 (__mask, __tbl, __r); +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vtstd_u64 (uint64x1_t __a, uint64x1_t __b) +{ + return (__a & __b) ? -1ll : 0ll; } -/* vtbx3 */ +/* vuqadd */ __extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vtbx3_s8 (int8x8_t __r, int8x8x3_t __tab, int8x8_t __idx) +vuqadd_s8 (int8x8_t __a, uint8x8_t __b) { - uint8x8_t __mask = vclt_u8 (vreinterpret_u8_s8 (__idx), - vmov_n_u8 (24)); - int8x8_t __tbl = vtbl3_s8 (__tab, __idx); - - return vbsl_s8 (__mask, __tbl, __r); + return (int8x8_t) __builtin_aarch64_suqaddv8qi (__a, (int8x8_t) __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vtbx3_u8 (uint8x8_t __r, uint8x8x3_t __tab, uint8x8_t __idx) +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vuqadd_s16 (int16x4_t __a, uint16x4_t __b) { - uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (24)); - uint8x8_t __tbl = vtbl3_u8 (__tab, __idx); - - return vbsl_u8 (__mask, __tbl, __r); + return (int16x4_t) __builtin_aarch64_suqaddv4hi (__a, (int16x4_t) __b); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vtbx3_p8 (poly8x8_t __r, poly8x8x3_t __tab, uint8x8_t __idx) +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vuqadd_s32 (int32x2_t __a, uint32x2_t __b) { - uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (24)); - poly8x8_t __tbl = vtbl3_p8 (__tab, __idx); - - return vbsl_p8 (__mask, __tbl, __r); + return (int32x2_t) __builtin_aarch64_suqaddv2si (__a, (int32x2_t) __b); } -/* vtrn */ - -__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__)) -vtrn_f32 (float32x2_t a, float32x2_t b) +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vuqadd_s64 (int64x1_t __a, uint64x1_t __b) { - return (float32x2x2_t) {vtrn1_f32 (a, b), vtrn2_f32 (a, b)}; + return (int64x1_t) __builtin_aarch64_suqadddi (__a, (int64x1_t) __b); } -__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__)) -vtrn_p8 (poly8x8_t a, poly8x8_t b) +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vuqaddq_s8 (int8x16_t __a, uint8x16_t __b) { - return (poly8x8x2_t) {vtrn1_p8 (a, b), vtrn2_p8 (a, b)}; + return (int8x16_t) __builtin_aarch64_suqaddv16qi (__a, (int8x16_t) __b); } -__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__)) -vtrn_p16 (poly16x4_t a, poly16x4_t b) +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vuqaddq_s16 (int16x8_t __a, uint16x8_t __b) { - return (poly16x4x2_t) {vtrn1_p16 (a, b), vtrn2_p16 (a, b)}; + return (int16x8_t) __builtin_aarch64_suqaddv8hi (__a, (int16x8_t) __b); } -__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__)) -vtrn_s8 (int8x8_t a, int8x8_t b) +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vuqaddq_s32 (int32x4_t __a, uint32x4_t __b) { - return (int8x8x2_t) {vtrn1_s8 (a, b), vtrn2_s8 (a, b)}; + return (int32x4_t) __builtin_aarch64_suqaddv4si (__a, (int32x4_t) __b); } -__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__)) -vtrn_s16 (int16x4_t a, int16x4_t b) +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vuqaddq_s64 (int64x2_t __a, uint64x2_t __b) { - return (int16x4x2_t) {vtrn1_s16 (a, b), vtrn2_s16 (a, b)}; + return (int64x2_t) __builtin_aarch64_suqaddv2di (__a, (int64x2_t) __b); } -__extension__ static __inline int32x2x2_t 
__attribute__ ((__always_inline__)) -vtrn_s32 (int32x2_t a, int32x2_t b) +__extension__ static __inline int8x1_t __attribute__ ((__always_inline__)) +vuqaddb_s8 (int8x1_t __a, uint8x1_t __b) { - return (int32x2x2_t) {vtrn1_s32 (a, b), vtrn2_s32 (a, b)}; + return (int8x1_t) __builtin_aarch64_suqaddqi (__a, (int8x1_t) __b); } -__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__)) -vtrn_u8 (uint8x8_t a, uint8x8_t b) +__extension__ static __inline int16x1_t __attribute__ ((__always_inline__)) +vuqaddh_s16 (int16x1_t __a, uint16x1_t __b) { - return (uint8x8x2_t) {vtrn1_u8 (a, b), vtrn2_u8 (a, b)}; + return (int16x1_t) __builtin_aarch64_suqaddhi (__a, (int16x1_t) __b); } -__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__)) -vtrn_u16 (uint16x4_t a, uint16x4_t b) +__extension__ static __inline int32x1_t __attribute__ ((__always_inline__)) +vuqadds_s32 (int32x1_t __a, uint32x1_t __b) { - return (uint16x4x2_t) {vtrn1_u16 (a, b), vtrn2_u16 (a, b)}; + return (int32x1_t) __builtin_aarch64_suqaddsi (__a, (int32x1_t) __b); } -__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__)) -vtrn_u32 (uint32x2_t a, uint32x2_t b) +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vuqaddd_s64 (int64x1_t __a, uint64x1_t __b) { - return (uint32x2x2_t) {vtrn1_u32 (a, b), vtrn2_u32 (a, b)}; + return (int64x1_t) __builtin_aarch64_suqadddi (__a, (int64x1_t) __b); } -__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__)) -vtrnq_f32 (float32x4_t a, float32x4_t b) +#define __DEFINTERLEAVE(op, rettype, intype, funcsuffix, Q) \ + __extension__ static __inline rettype \ + __attribute__ ((__always_inline__)) \ + v ## op ## Q ## _ ## funcsuffix (intype a, intype b) \ + { \ + return (rettype) {v ## op ## 1 ## Q ## _ ## funcsuffix (a, b), \ + v ## op ## 2 ## Q ## _ ## funcsuffix (a, b)}; \ + } + +#define __INTERLEAVE_LIST(op) \ + __DEFINTERLEAVE (op, float32x2x2_t, float32x2_t, f32,) \ + __DEFINTERLEAVE (op, poly8x8x2_t, poly8x8_t, p8,) \ + __DEFINTERLEAVE (op, poly16x4x2_t, poly16x4_t, p16,) \ + __DEFINTERLEAVE (op, int8x8x2_t, int8x8_t, s8,) \ + __DEFINTERLEAVE (op, int16x4x2_t, int16x4_t, s16,) \ + __DEFINTERLEAVE (op, int32x2x2_t, int32x2_t, s32,) \ + __DEFINTERLEAVE (op, uint8x8x2_t, uint8x8_t, u8,) \ + __DEFINTERLEAVE (op, uint16x4x2_t, uint16x4_t, u16,) \ + __DEFINTERLEAVE (op, uint32x2x2_t, uint32x2_t, u32,) \ + __DEFINTERLEAVE (op, float32x4x2_t, float32x4_t, f32, q) \ + __DEFINTERLEAVE (op, poly8x16x2_t, poly8x16_t, p8, q) \ + __DEFINTERLEAVE (op, poly16x8x2_t, poly16x8_t, p16, q) \ + __DEFINTERLEAVE (op, int8x16x2_t, int8x16_t, s8, q) \ + __DEFINTERLEAVE (op, int16x8x2_t, int16x8_t, s16, q) \ + __DEFINTERLEAVE (op, int32x4x2_t, int32x4_t, s32, q) \ + __DEFINTERLEAVE (op, uint8x16x2_t, uint8x16_t, u8, q) \ + __DEFINTERLEAVE (op, uint16x8x2_t, uint16x8_t, u16, q) \ + __DEFINTERLEAVE (op, uint32x4x2_t, uint32x4_t, u32, q) + +/* vuzp */ + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vuzp1_f32 (float32x2_t __a, float32x2_t __b) { - return (float32x4x2_t) {vtrn1q_f32 (a, b), vtrn2q_f32 (a, b)}; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2}); +#endif } -__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__)) -vtrnq_p8 (poly8x16_t a, poly8x16_t b) +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vuzp1_p8 (poly8x8_t __a, 
poly8x8_t __b) { - return (poly8x16x2_t) {vtrn1q_p8 (a, b), vtrn2q_p8 (a, b)}; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 11, 13, 15, 1, 3, 5, 7}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 2, 4, 6, 8, 10, 12, 14}); +#endif } -__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__)) -vtrnq_p16 (poly16x8_t a, poly16x8_t b) +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vuzp1_p16 (poly16x4_t __a, poly16x4_t __b) { - return (poly16x8x2_t) {vtrn1q_p16 (a, b), vtrn2q_p16 (a, b)}; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 7, 1, 3}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 2, 4, 6}); +#endif } -__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__)) -vtrnq_s8 (int8x16_t a, int8x16_t b) +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vuzp1_s8 (int8x8_t __a, int8x8_t __b) { - return (int8x16x2_t) {vtrn1q_s8 (a, b), vtrn2q_s8 (a, b)}; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 11, 13, 15, 1, 3, 5, 7}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 2, 4, 6, 8, 10, 12, 14}); +#endif } -__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__)) -vtrnq_s16 (int16x8_t a, int16x8_t b) +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vuzp1_s16 (int16x4_t __a, int16x4_t __b) { - return (int16x8x2_t) {vtrn1q_s16 (a, b), vtrn2q_s16 (a, b)}; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 7, 1, 3}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 2, 4, 6}); +#endif } -__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__)) -vtrnq_s32 (int32x4_t a, int32x4_t b) +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vuzp1_s32 (int32x2_t __a, int32x2_t __b) { - return (int32x4x2_t) {vtrn1q_s32 (a, b), vtrn2q_s32 (a, b)}; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2}); +#endif } -__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__)) -vtrnq_u8 (uint8x16_t a, uint8x16_t b) +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vuzp1_u8 (uint8x8_t __a, uint8x8_t __b) { - return (uint8x16x2_t) {vtrn1q_u8 (a, b), vtrn2q_u8 (a, b)}; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 11, 13, 15, 1, 3, 5, 7}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 2, 4, 6, 8, 10, 12, 14}); +#endif } -__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__)) -vtrnq_u16 (uint16x8_t a, uint16x8_t b) +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vuzp1_u16 (uint16x4_t __a, uint16x4_t __b) { - return (uint16x8x2_t) {vtrn1q_u16 (a, b), vtrn2q_u16 (a, b)}; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 7, 1, 3}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 2, 4, 6}); +#endif } -__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__)) -vtrnq_u32 (uint32x4_t a, uint32x4_t b) +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vuzp1_u32 (uint32x2_t __a, uint32x2_t __b) { - return (uint32x4x2_t) {vtrn1q_u32 (a, b), vtrn2q_u32 (a, b)}; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1}); 
+#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2}); +#endif } -/* vtst */ - -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vtst_s8 (int8x8_t __a, int8x8_t __b) +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vuzp1q_f32 (float32x4_t __a, float32x4_t __b) { - return (uint8x8_t) __builtin_aarch64_cmtstv8qi (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 7, 1, 3}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 2, 4, 6}); +#endif } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vtst_s16 (int16x4_t __a, int16x4_t __b) +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vuzp1q_f64 (float64x2_t __a, float64x2_t __b) { - return (uint16x4_t) __builtin_aarch64_cmtstv4hi (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2}); +#endif } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vtst_s32 (int32x2_t __a, int32x2_t __b) +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vuzp1q_p8 (poly8x16_t __a, poly8x16_t __b) { - return (uint32x2_t) __builtin_aarch64_cmtstv2si (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x16_t) + {17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15}); +#else + return __builtin_shuffle (__a, __b, (uint8x16_t) + {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}); +#endif } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vtst_s64 (int64x1_t __a, int64x1_t __b) +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vuzp1q_p16 (poly16x8_t __a, poly16x8_t __b) { - return (__a & __b) ? 
-1ll : 0ll; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 11, 13, 15, 1, 3, 5, 7}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 2, 4, 6, 8, 10, 12, 14}); +#endif } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vtst_u8 (uint8x8_t __a, uint8x8_t __b) +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vuzp1q_s8 (int8x16_t __a, int8x16_t __b) { - return (uint8x8_t) __builtin_aarch64_cmtstv8qi ((int8x8_t) __a, - (int8x8_t) __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, + (uint8x16_t) {17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15}); +#else + return __builtin_shuffle (__a, __b, + (uint8x16_t) {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}); +#endif } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vtst_u16 (uint16x4_t __a, uint16x4_t __b) +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vuzp1q_s16 (int16x8_t __a, int16x8_t __b) { - return (uint16x4_t) __builtin_aarch64_cmtstv4hi ((int16x4_t) __a, - (int16x4_t) __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 11, 13, 15, 1, 3, 5, 7}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 2, 4, 6, 8, 10, 12, 14}); +#endif } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vtst_u32 (uint32x2_t __a, uint32x2_t __b) +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vuzp1q_s32 (int32x4_t __a, int32x4_t __b) { - return (uint32x2_t) __builtin_aarch64_cmtstv2si ((int32x2_t) __a, - (int32x2_t) __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 7, 1, 3}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 2, 4, 6}); +#endif } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vtst_u64 (uint64x1_t __a, uint64x1_t __b) +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vuzp1q_s64 (int64x2_t __a, int64x2_t __b) { - return (__a & __b) ? 
-1ll : 0ll; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2}); +#endif } __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vtstq_s8 (int8x16_t __a, int8x16_t __b) +vuzp1q_u8 (uint8x16_t __a, uint8x16_t __b) { - return (uint8x16_t) __builtin_aarch64_cmtstv16qi (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, + (uint8x16_t) {17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15}); +#else + return __builtin_shuffle (__a, __b, + (uint8x16_t) {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}); +#endif } __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vtstq_s16 (int16x8_t __a, int16x8_t __b) +vuzp1q_u16 (uint16x8_t __a, uint16x8_t __b) { - return (uint16x8_t) __builtin_aarch64_cmtstv8hi (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 11, 13, 15, 1, 3, 5, 7}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 2, 4, 6, 8, 10, 12, 14}); +#endif } __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vtstq_s32 (int32x4_t __a, int32x4_t __b) +vuzp1q_u32 (uint32x4_t __a, uint32x4_t __b) { - return (uint32x4_t) __builtin_aarch64_cmtstv4si (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 7, 1, 3}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 2, 4, 6}); +#endif } __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vtstq_s64 (int64x2_t __a, int64x2_t __b) +vuzp1q_u64 (uint64x2_t __a, uint64x2_t __b) { - return (uint64x2_t) __builtin_aarch64_cmtstv2di (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2}); +#endif } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vtstq_u8 (uint8x16_t __a, uint8x16_t __b) +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vuzp2_f32 (float32x2_t __a, float32x2_t __b) { - return (uint8x16_t) __builtin_aarch64_cmtstv16qi ((int8x16_t) __a, - (int8x16_t) __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3}); +#endif } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vtstq_u16 (uint16x8_t __a, uint16x8_t __b) +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vuzp2_p8 (poly8x8_t __a, poly8x8_t __b) { - return (uint16x8_t) __builtin_aarch64_cmtstv8hi ((int16x8_t) __a, - (int16x8_t) __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 10, 12, 14, 0, 2, 4, 6}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 3, 5, 7, 9, 11, 13, 15}); +#endif } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vtstq_u32 (uint32x4_t __a, uint32x4_t __b) +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vuzp2_p16 (poly16x4_t __a, poly16x4_t __b) { - return (uint32x4_t) __builtin_aarch64_cmtstv4si ((int32x4_t) __a, - (int32x4_t) __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 6, 0, 2}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 3, 5, 7}); +#endif } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vtstq_u64 (uint64x2_t __a, uint64x2_t __b) +__extension__ 
static __inline int8x8_t __attribute__ ((__always_inline__)) +vuzp2_s8 (int8x8_t __a, int8x8_t __b) { - return (uint64x2_t) __builtin_aarch64_cmtstv2di ((int64x2_t) __a, - (int64x2_t) __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 10, 12, 14, 0, 2, 4, 6}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 3, 5, 7, 9, 11, 13, 15}); +#endif } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vtstd_s64 (int64x1_t __a, int64x1_t __b) +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vuzp2_s16 (int16x4_t __a, int16x4_t __b) { - return (__a & __b) ? -1ll : 0ll; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 6, 0, 2}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 3, 5, 7}); +#endif } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vtstd_u64 (uint64x1_t __a, uint64x1_t __b) +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vuzp2_s32 (int32x2_t __a, int32x2_t __b) { - return (__a & __b) ? -1ll : 0ll; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3}); +#endif } -/* vuqadd */ +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vuzp2_u8 (uint8x8_t __a, uint8x8_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 10, 12, 14, 0, 2, 4, 6}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 3, 5, 7, 9, 11, 13, 15}); +#endif +} -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vuqadd_s8 (int8x8_t __a, uint8x8_t __b) +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vuzp2_u16 (uint16x4_t __a, uint16x4_t __b) { - return (int8x8_t) __builtin_aarch64_suqaddv8qi (__a, (int8x8_t) __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 6, 0, 2}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 3, 5, 7}); +#endif } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vuqadd_s16 (int16x4_t __a, uint16x4_t __b) +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vuzp2_u32 (uint32x2_t __a, uint32x2_t __b) { - return (int16x4_t) __builtin_aarch64_suqaddv4hi (__a, (int16x4_t) __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3}); +#endif } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vuqadd_s32 (int32x2_t __a, uint32x2_t __b) +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vuzp2q_f32 (float32x4_t __a, float32x4_t __b) { - return (int32x2_t) __builtin_aarch64_suqaddv2si (__a, (int32x2_t) __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 6, 0, 2}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 3, 5, 7}); +#endif } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) -vuqadd_s64 (int64x1_t __a, uint64x1_t __b) +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vuzp2q_f64 (float64x2_t __a, float64x2_t __b) { - return (int64x1_t) __builtin_aarch64_suqadddi (__a, (int64x1_t) __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3}); +#endif +} + 
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vuzp2q_p8 (poly8x16_t __a, poly8x16_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, + (uint8x16_t) {16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14}); +#else + return __builtin_shuffle (__a, __b, + (uint8x16_t) {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}); +#endif +} + +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vuzp2q_p16 (poly16x8_t __a, poly16x8_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 10, 12, 14, 0, 2, 4, 6}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 3, 5, 7, 9, 11, 13, 15}); +#endif } __extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vuqaddq_s8 (int8x16_t __a, uint8x16_t __b) +vuzp2q_s8 (int8x16_t __a, int8x16_t __b) { - return (int8x16_t) __builtin_aarch64_suqaddv16qi (__a, (int8x16_t) __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, + (uint8x16_t) {16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14}); +#else + return __builtin_shuffle (__a, __b, + (uint8x16_t) {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}); +#endif } __extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vuqaddq_s16 (int16x8_t __a, uint16x8_t __b) +vuzp2q_s16 (int16x8_t __a, int16x8_t __b) { - return (int16x8_t) __builtin_aarch64_suqaddv8hi (__a, (int16x8_t) __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 10, 12, 14, 0, 2, 4, 6}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 3, 5, 7, 9, 11, 13, 15}); +#endif } __extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vuqaddq_s32 (int32x4_t __a, uint32x4_t __b) +vuzp2q_s32 (int32x4_t __a, int32x4_t __b) { - return (int32x4_t) __builtin_aarch64_suqaddv4si (__a, (int32x4_t) __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 6, 0, 2}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 3, 5, 7}); +#endif } __extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vuqaddq_s64 (int64x2_t __a, uint64x2_t __b) +vuzp2q_s64 (int64x2_t __a, int64x2_t __b) { - return (int64x2_t) __builtin_aarch64_suqaddv2di (__a, (int64x2_t) __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3}); +#endif } -__extension__ static __inline int8x1_t __attribute__ ((__always_inline__)) -vuqaddb_s8 (int8x1_t __a, uint8x1_t __b) +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vuzp2q_u8 (uint8x16_t __a, uint8x16_t __b) { - return (int8x1_t) __builtin_aarch64_suqaddqi (__a, (int8x1_t) __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x16_t) + {16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14}); +#else + return __builtin_shuffle (__a, __b, (uint8x16_t) + {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}); +#endif } -__extension__ static __inline int16x1_t __attribute__ ((__always_inline__)) -vuqaddh_s16 (int16x1_t __a, uint16x1_t __b) +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vuzp2q_u16 (uint16x8_t __a, uint16x8_t __b) { - return (int16x1_t) __builtin_aarch64_suqaddhi (__a, (int16x1_t) __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 10, 12, 14, 0, 2, 4, 6}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 3, 5, 
7, 9, 11, 13, 15});
+#endif
 }
 
-__extension__ static __inline int32x1_t __attribute__ ((__always_inline__))
-vuqadds_s32 (int32x1_t __a, uint32x1_t __b)
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vuzp2q_u32 (uint32x4_t __a, uint32x4_t __b)
 {
-  return (int32x1_t) __builtin_aarch64_suqaddsi (__a, (int32x1_t) __b);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 6, 0, 2});
+#else
+  return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 3, 5, 7});
+#endif
 }
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vuqaddd_s64 (int64x1_t __a, uint64x1_t __b)
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vuzp2q_u64 (uint64x2_t __a, uint64x2_t __b)
 {
-  return (int64x1_t) __builtin_aarch64_suqadddi (__a, (int64x1_t) __b);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0});
+#else
+  return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3});
+#endif
 }
 
-#define __DEFINTERLEAVE(op, rettype, intype, funcsuffix, Q) \
-  __extension__ static __inline rettype \
-  __attribute__ ((__always_inline__)) \
-  v ## op ## Q ## _ ## funcsuffix (intype a, intype b) \
-  { \
-    return (rettype) {v ## op ## 1 ## Q ## _ ## funcsuffix (a, b), \
-                      v ## op ## 2 ## Q ## _ ## funcsuffix (a, b)}; \
-  }
-
-#define __INTERLEAVE_LIST(op) \
-  __DEFINTERLEAVE (op, float32x2x2_t, float32x2_t, f32,) \
-  __DEFINTERLEAVE (op, poly8x8x2_t, poly8x8_t, p8,) \
-  __DEFINTERLEAVE (op, poly16x4x2_t, poly16x4_t, p16,) \
-  __DEFINTERLEAVE (op, int8x8x2_t, int8x8_t, s8,) \
-  __DEFINTERLEAVE (op, int16x4x2_t, int16x4_t, s16,) \
-  __DEFINTERLEAVE (op, int32x2x2_t, int32x2_t, s32,) \
-  __DEFINTERLEAVE (op, uint8x8x2_t, uint8x8_t, u8,) \
-  __DEFINTERLEAVE (op, uint16x4x2_t, uint16x4_t, u16,) \
-  __DEFINTERLEAVE (op, uint32x2x2_t, uint32x2_t, u32,) \
-  __DEFINTERLEAVE (op, float32x4x2_t, float32x4_t, f32, q) \
-  __DEFINTERLEAVE (op, poly8x16x2_t, poly8x16_t, p8, q) \
-  __DEFINTERLEAVE (op, poly16x8x2_t, poly16x8_t, p16, q) \
-  __DEFINTERLEAVE (op, int8x16x2_t, int8x16_t, s8, q) \
-  __DEFINTERLEAVE (op, int16x8x2_t, int16x8_t, s16, q) \
-  __DEFINTERLEAVE (op, int32x4x2_t, int32x4_t, s32, q) \
-  __DEFINTERLEAVE (op, uint8x16x2_t, uint8x16_t, u8, q) \
-  __DEFINTERLEAVE (op, uint16x8x2_t, uint16x8_t, u16, q) \
-  __DEFINTERLEAVE (op, uint32x4x2_t, uint32x4_t, u32, q)
-
-/* vuzp */
-
 __INTERLEAVE_LIST (uzp)
 
 /* vzip */
 
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index b409f94..00ae0d1 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,5 +1,11 @@
 2014-04-30  Alan Lawrence
 
+	* gcc.target/aarch64/simd/vuzps32_1.c: Expect zip1/2 insn rather than uzp1/2.
+	* gcc.target/aarch64/simd/vuzpu32_1.c: Likewise.
+	* gcc.target/aarch64/simd/vuzpf32_1.c: Likewise.
+
+2014-04-30  Alan Lawrence
+
 	* gcc.target/aarch64/simd/vuzpf32_1.c: New file.
 	* gcc.target/aarch64/simd/vuzpf32.x: New file.
 	* gcc.target/aarch64/simd/vuzpp16_1.c: New file.
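A note on the testsuite change: on two-element vectors the uzp and zip permutes coincide, since uzp1 and zip1 both produce {a[0], b[0]} and uzp2 and zip2 both produce {a[1], b[1]}. Once the intrinsics are expressed as __builtin_shuffle the compiler is free to pick either mnemonic for that permutation, and it emits zip1/zip2 for the .2s cases, hence the revised scan-assembler patterns below. A minimal runnable sketch of the equivalence (illustration only, not part of the patch; assumes a little-endian AArch64 target and GCC's arm_neon.h):

#include <arm_neon.h>
#include <assert.h>

int
main (void)
{
  int32x2_t a = {10, 11}, b = {20, 21};
  /* On two lanes, uzp1 selects {a[0], b[0]}, exactly as zip1 does.  */
  int32x2_t u1 = vuzp1_s32 (a, b);
  int32x2_t z1 = vzip1_s32 (a, b);
  assert (vget_lane_s32 (u1, 0) == vget_lane_s32 (z1, 0)
          && vget_lane_s32 (u1, 1) == vget_lane_s32 (z1, 1));
  /* Likewise uzp2 and zip2 both select {a[1], b[1]}.  */
  int32x2_t u2 = vuzp2_s32 (a, b);
  int32x2_t z2 = vzip2_s32 (a, b);
  assert (vget_lane_s32 (u2, 0) == vget_lane_s32 (z2, 0)
          && vget_lane_s32 (u2, 1) == vget_lane_s32 (z2, 1));
  return 0;
}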
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vuzpf32_1.c b/gcc/testsuite/gcc.target/aarch64/simd/vuzpf32_1.c
index fedee93..0daba1c 100644
--- a/gcc/testsuite/gcc.target/aarch64/simd/vuzpf32_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vuzpf32_1.c
@@ -6,6 +6,6 @@
 #include <arm_neon.h>
 #include "vuzpf32.x"
 
-/* { dg-final { scan-assembler-times "uzp1\[ \t\]+v\[0-9\]+\.2s, ?v\[0-9\]+\.2s, ?v\[0-9\]+\.2s!?\(?:\[ \t\]+@\[a-zA-Z0-9 \]+\)?\n" 1 } } */
-/* { dg-final { scan-assembler-times "uzp2\[ \t\]+v\[0-9\]+\.2s, ?v\[0-9\]+\.2s, ?v\[0-9\]+\.2s!?\(?:\[ \t\]+@\[a-zA-Z0-9 \]+\)?\n" 1 } } */
+/* { dg-final { scan-assembler-times "zip1\[ \t\]+v\[0-9\]+\.2s, ?v\[0-9\]+\.2s, ?v\[0-9\]+\.2s!?\(?:\[ \t\]+@\[a-zA-Z0-9 \]+\)?\n" 1 } } */
+/* { dg-final { scan-assembler-times "zip2\[ \t\]+v\[0-9\]+\.2s, ?v\[0-9\]+\.2s, ?v\[0-9\]+\.2s!?\(?:\[ \t\]+@\[a-zA-Z0-9 \]+\)?\n" 1 } } */
 /* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vuzps32_1.c b/gcc/testsuite/gcc.target/aarch64/simd/vuzps32_1.c
index c9de7c8..af48d63 100644
--- a/gcc/testsuite/gcc.target/aarch64/simd/vuzps32_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vuzps32_1.c
@@ -6,6 +6,6 @@
 #include <arm_neon.h>
 #include "vuzps32.x"
 
-/* { dg-final { scan-assembler-times "uzp1\[ \t\]+v\[0-9\]+\.2s, ?v\[0-9\]+\.2s, ?v\[0-9\]+\.2s!?\(?:\[ \t\]+@\[a-zA-Z0-9 \]+\)?\n" 1 } } */
-/* { dg-final { scan-assembler-times "uzp2\[ \t\]+v\[0-9\]+\.2s, ?v\[0-9\]+\.2s, ?v\[0-9\]+\.2s!?\(?:\[ \t\]+@\[a-zA-Z0-9 \]+\)?\n" 1 } } */
+/* { dg-final { scan-assembler-times "zip1\[ \t\]+v\[0-9\]+\.2s, ?v\[0-9\]+\.2s, ?v\[0-9\]+\.2s!?\(?:\[ \t\]+@\[a-zA-Z0-9 \]+\)?\n" 1 } } */
+/* { dg-final { scan-assembler-times "zip2\[ \t\]+v\[0-9\]+\.2s, ?v\[0-9\]+\.2s, ?v\[0-9\]+\.2s!?\(?:\[ \t\]+@\[a-zA-Z0-9 \]+\)?\n" 1 } } */
 /* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vuzpu32_1.c b/gcc/testsuite/gcc.target/aarch64/simd/vuzpu32_1.c
index daae84b..05e1c95 100644
--- a/gcc/testsuite/gcc.target/aarch64/simd/vuzpu32_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vuzpu32_1.c
@@ -6,6 +6,6 @@
 #include <arm_neon.h>
 #include "vuzpu32.x"
 
-/* { dg-final { scan-assembler-times "uzp1\[ \t\]+v\[0-9\]+\.2s, ?v\[0-9\]+\.2s, ?v\[0-9\]+\.2s!?\(?:\[ \t\]+@\[a-zA-Z0-9 \]+\)?\n" 1 } } */
-/* { dg-final { scan-assembler-times "uzp2\[ \t\]+v\[0-9\]+\.2s, ?v\[0-9\]+\.2s, ?v\[0-9\]+\.2s!?\(?:\[ \t\]+@\[a-zA-Z0-9 \]+\)?\n" 1 } } */
+/* { dg-final { scan-assembler-times "zip1\[ \t\]+v\[0-9\]+\.2s, ?v\[0-9\]+\.2s, ?v\[0-9\]+\.2s!?\(?:\[ \t\]+@\[a-zA-Z0-9 \]+\)?\n" 1 } } */
+/* { dg-final { scan-assembler-times "zip2\[ \t\]+v\[0-9\]+\.2s, ?v\[0-9\]+\.2s, ?v\[0-9\]+\.2s!?\(?:\[ \t\]+@\[a-zA-Z0-9 \]+\)?\n" 1 } } */
 /* { dg-final { cleanup-saved-temps } } */
-- 
2.7.4
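For reference, the idiom the patch adopts, pulled out as a standalone sketch (my_uzp1_s32 is a made-up name for illustration; it mirrors the new vuzp1_s32 above): __builtin_shuffle selects lanes from the concatenation of its two operands under a constant index vector, so a two-lane uzp1 takes lanes {0, 2} of a:b. The mask differs under __AARCH64EB__ because, roughly speaking, lane 0 of a GCC generic vector corresponds to the most significant architectural lane on big-endian AArch64, so the indices into both operands and the order of the result lanes are renumbered; for the two-lane case, {0, 2} becomes {3, 1}, matching the definitions in the diff.

#include <arm_neon.h>

/* Sketch of the rewritten intrinsics' shape.  Compiled for little-endian
   AArch64 this should collapse to a single two-operand permute (uzp1, or
   the equivalent zip1 that the tests above now expect).  */
__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
my_uzp1_s32 (int32x2_t a, int32x2_t b)
{
#ifdef __AARCH64EB__
  return __builtin_shuffle (a, b, (uint32x2_t) {3, 1});
#else
  return __builtin_shuffle (a, b, (uint32x2_t) {0, 2});
#endif
}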