From 7211512a5f817ad08e333774bd186c0c5ff48533 Mon Sep 17 00:00:00 2001
From: Alan Lawrence
Date: Wed, 30 Apr 2014 17:04:53 +0000
Subject: [PATCH] Rewrite AArch64 UZP Intrinsics using __builtin_shuffle.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/simd/vuzps32_1.c: Expect zip1/2 insn rather than uzp1/2.
	* gcc.target/aarch64/simd/vuzpu32_1.c: Likewise.
	* gcc.target/aarch64/simd/vuzpf32_1.c: Likewise.

gcc/ChangeLog:

	* config/aarch64/arm_neon.h (vuzp1_f32, vuzp1_p8, vuzp1_p16, vuzp1_s8,
	vuzp1_s16, vuzp1_s32, vuzp1_u8, vuzp1_u16, vuzp1_u32, vuzp1q_f32,
	vuzp1q_f64, vuzp1q_p8, vuzp1q_p16, vuzp1q_s8, vuzp1q_s16, vuzp1q_s32,
	vuzp1q_s64, vuzp1q_u8, vuzp1q_u16, vuzp1q_u32, vuzp1q_u64, vuzp2_f32,
	vuzp2_p8, vuzp2_p16, vuzp2_s8, vuzp2_s16, vuzp2_s32, vuzp2_u8,
	vuzp2_u16, vuzp2_u32, vuzp2q_f32, vuzp2q_f64, vuzp2q_p8, vuzp2q_p16,
	vuzp2q_s8, vuzp2q_s16, vuzp2q_s32, vuzp2q_s64, vuzp2q_u8, vuzp2q_u16,
	vuzp2q_u32, vuzp2q_u64): Replace temporary asm with __builtin_shuffle.

From-SVN: r209943
---
 gcc/ChangeLog                                     |   11 +
 gcc/config/aarch64/arm_neon.h                     | 1335 ++++++++++-----------
 gcc/testsuite/ChangeLog                           |    6 +
 gcc/testsuite/gcc.target/aarch64/simd/vuzpf32_1.c |    4 +-
 gcc/testsuite/gcc.target/aarch64/simd/vuzps32_1.c |    4 +-
 gcc/testsuite/gcc.target/aarch64/simd/vuzpu32_1.c |    4 +-
 6 files changed, 676 insertions(+), 688 deletions(-)

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 1ac6d14..e3ae8dc 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,14 @@
+2014-04-30  Alan Lawrence
+
+	* config/aarch64/arm_neon.h (vuzp1_f32, vuzp1_p8, vuzp1_p16, vuzp1_s8,
+	vuzp1_s16, vuzp1_s32, vuzp1_u8, vuzp1_u16, vuzp1_u32, vuzp1q_f32,
+	vuzp1q_f64, vuzp1q_p8, vuzp1q_p16, vuzp1q_s8, vuzp1q_s16, vuzp1q_s32,
+	vuzp1q_s64, vuzp1q_u8, vuzp1q_u16, vuzp1q_u32, vuzp1q_u64, vuzp2_f32,
+	vuzp2_p8, vuzp2_p16, vuzp2_s8, vuzp2_s16, vuzp2_s32, vuzp2_u8,
+	vuzp2_u16, vuzp2_u32, vuzp2q_f32, vuzp2q_f64, vuzp2q_p8, vuzp2q_p16,
+	vuzp2q_s8, vuzp2q_s16, vuzp2q_s32, vuzp2q_s64, vuzp2q_u8, vuzp2q_u16,
+	vuzp2q_u32, vuzp2q_u64): Replace temporary asm with __builtin_shuffle.
+ 2014-04-30 Joern Rennecke * config/arc/arc.opt (mlra): Move comment above option name diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index fa57667..f6213ce 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -13199,467 +13199,6 @@ vtstq_p16 (poly16x8_t a, poly16x8_t b) : /* No clobbers */); return result; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vuzp1_f32 (float32x2_t a, float32x2_t b) -{ - float32x2_t result; - __asm__ ("uzp1 %0.2s,%1.2s,%2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vuzp1_p8 (poly8x8_t a, poly8x8_t b) -{ - poly8x8_t result; - __asm__ ("uzp1 %0.8b,%1.8b,%2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) -vuzp1_p16 (poly16x4_t a, poly16x4_t b) -{ - poly16x4_t result; - __asm__ ("uzp1 %0.4h,%1.4h,%2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vuzp1_s8 (int8x8_t a, int8x8_t b) -{ - int8x8_t result; - __asm__ ("uzp1 %0.8b,%1.8b,%2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vuzp1_s16 (int16x4_t a, int16x4_t b) -{ - int16x4_t result; - __asm__ ("uzp1 %0.4h,%1.4h,%2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vuzp1_s32 (int32x2_t a, int32x2_t b) -{ - int32x2_t result; - __asm__ ("uzp1 %0.2s,%1.2s,%2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vuzp1_u8 (uint8x8_t a, uint8x8_t b) -{ - uint8x8_t result; - __asm__ ("uzp1 %0.8b,%1.8b,%2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vuzp1_u16 (uint16x4_t a, uint16x4_t b) -{ - uint16x4_t result; - __asm__ ("uzp1 %0.4h,%1.4h,%2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vuzp1_u32 (uint32x2_t a, uint32x2_t b) -{ - uint32x2_t result; - __asm__ ("uzp1 %0.2s,%1.2s,%2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vuzp1q_f32 (float32x4_t a, float32x4_t b) -{ - float32x4_t result; - __asm__ ("uzp1 %0.4s,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vuzp1q_f64 (float64x2_t a, float64x2_t b) -{ - float64x2_t result; - __asm__ ("uzp1 %0.2d,%1.2d,%2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vuzp1q_p8 (poly8x16_t a, poly8x16_t b) -{ - poly8x16_t result; - __asm__ ("uzp1 %0.16b,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly16x8_t __attribute__ 
((__always_inline__)) -vuzp1q_p16 (poly16x8_t a, poly16x8_t b) -{ - poly16x8_t result; - __asm__ ("uzp1 %0.8h,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vuzp1q_s8 (int8x16_t a, int8x16_t b) -{ - int8x16_t result; - __asm__ ("uzp1 %0.16b,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vuzp1q_s16 (int16x8_t a, int16x8_t b) -{ - int16x8_t result; - __asm__ ("uzp1 %0.8h,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vuzp1q_s32 (int32x4_t a, int32x4_t b) -{ - int32x4_t result; - __asm__ ("uzp1 %0.4s,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vuzp1q_s64 (int64x2_t a, int64x2_t b) -{ - int64x2_t result; - __asm__ ("uzp1 %0.2d,%1.2d,%2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vuzp1q_u8 (uint8x16_t a, uint8x16_t b) -{ - uint8x16_t result; - __asm__ ("uzp1 %0.16b,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vuzp1q_u16 (uint16x8_t a, uint16x8_t b) -{ - uint16x8_t result; - __asm__ ("uzp1 %0.8h,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vuzp1q_u32 (uint32x4_t a, uint32x4_t b) -{ - uint32x4_t result; - __asm__ ("uzp1 %0.4s,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vuzp1q_u64 (uint64x2_t a, uint64x2_t b) -{ - uint64x2_t result; - __asm__ ("uzp1 %0.2d,%1.2d,%2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vuzp2_f32 (float32x2_t a, float32x2_t b) -{ - float32x2_t result; - __asm__ ("uzp2 %0.2s,%1.2s,%2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vuzp2_p8 (poly8x8_t a, poly8x8_t b) -{ - poly8x8_t result; - __asm__ ("uzp2 %0.8b,%1.8b,%2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) -vuzp2_p16 (poly16x4_t a, poly16x4_t b) -{ - poly16x4_t result; - __asm__ ("uzp2 %0.4h,%1.4h,%2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vuzp2_s8 (int8x8_t a, int8x8_t b) -{ - int8x8_t result; - __asm__ ("uzp2 %0.8b,%1.8b,%2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vuzp2_s16 (int16x4_t a, int16x4_t b) -{ - int16x4_t result; - __asm__ ("uzp2 %0.4h,%1.4h,%2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return 
result; -} - -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vuzp2_s32 (int32x2_t a, int32x2_t b) -{ - int32x2_t result; - __asm__ ("uzp2 %0.2s,%1.2s,%2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vuzp2_u8 (uint8x8_t a, uint8x8_t b) -{ - uint8x8_t result; - __asm__ ("uzp2 %0.8b,%1.8b,%2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vuzp2_u16 (uint16x4_t a, uint16x4_t b) -{ - uint16x4_t result; - __asm__ ("uzp2 %0.4h,%1.4h,%2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vuzp2_u32 (uint32x2_t a, uint32x2_t b) -{ - uint32x2_t result; - __asm__ ("uzp2 %0.2s,%1.2s,%2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vuzp2q_f32 (float32x4_t a, float32x4_t b) -{ - float32x4_t result; - __asm__ ("uzp2 %0.4s,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vuzp2q_f64 (float64x2_t a, float64x2_t b) -{ - float64x2_t result; - __asm__ ("uzp2 %0.2d,%1.2d,%2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vuzp2q_p8 (poly8x16_t a, poly8x16_t b) -{ - poly8x16_t result; - __asm__ ("uzp2 %0.16b,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) -vuzp2q_p16 (poly16x8_t a, poly16x8_t b) -{ - poly16x8_t result; - __asm__ ("uzp2 %0.8h,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vuzp2q_s8 (int8x16_t a, int8x16_t b) -{ - int8x16_t result; - __asm__ ("uzp2 %0.16b,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vuzp2q_s16 (int16x8_t a, int16x8_t b) -{ - int16x8_t result; - __asm__ ("uzp2 %0.8h,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vuzp2q_s32 (int32x4_t a, int32x4_t b) -{ - int32x4_t result; - __asm__ ("uzp2 %0.4s,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vuzp2q_s64 (int64x2_t a, int64x2_t b) -{ - int64x2_t result; - __asm__ ("uzp2 %0.2d,%1.2d,%2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vuzp2q_u8 (uint8x16_t a, uint8x16_t b) -{ - uint8x16_t result; - __asm__ ("uzp2 %0.16b,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vuzp2q_u16 (uint16x8_t a, uint16x8_t b) -{ - uint16x8_t result; - __asm__ ("uzp2 
%0.8h,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vuzp2q_u32 (uint32x4_t a, uint32x4_t b) -{ - uint32x4_t result; - __asm__ ("uzp2 %0.4s,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vuzp2q_u64 (uint64x2_t a, uint64x2_t b) -{ - uint64x2_t result; - __asm__ ("uzp2 %0.2d,%1.2d,%2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} /* End of temporary inline asm implementations. */ @@ -24844,407 +24383,839 @@ vst4q_f64 (float64_t * __a, float64x2x4_t val) __extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) vsubd_s64 (int64x1_t __a, int64x1_t __b) { - return __a - __b; + return __a - __b; +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vsubd_u64 (uint64x1_t __a, uint64x1_t __b) +{ + return __a - __b; +} + +/* vtbx1 */ + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vtbx1_s8 (int8x8_t __r, int8x8_t __tab, int8x8_t __idx) +{ + uint8x8_t __mask = vclt_u8 (vreinterpret_u8_s8 (__idx), + vmov_n_u8 (8)); + int8x8_t __tbl = vtbl1_s8 (__tab, __idx); + + return vbsl_s8 (__mask, __tbl, __r); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vtbx1_u8 (uint8x8_t __r, uint8x8_t __tab, uint8x8_t __idx) +{ + uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (8)); + uint8x8_t __tbl = vtbl1_u8 (__tab, __idx); + + return vbsl_u8 (__mask, __tbl, __r); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vtbx1_p8 (poly8x8_t __r, poly8x8_t __tab, uint8x8_t __idx) +{ + uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (8)); + poly8x8_t __tbl = vtbl1_p8 (__tab, __idx); + + return vbsl_p8 (__mask, __tbl, __r); +} + +/* vtbx3 */ + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vtbx3_s8 (int8x8_t __r, int8x8x3_t __tab, int8x8_t __idx) +{ + uint8x8_t __mask = vclt_u8 (vreinterpret_u8_s8 (__idx), + vmov_n_u8 (24)); + int8x8_t __tbl = vtbl3_s8 (__tab, __idx); + + return vbsl_s8 (__mask, __tbl, __r); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vtbx3_u8 (uint8x8_t __r, uint8x8x3_t __tab, uint8x8_t __idx) +{ + uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (24)); + uint8x8_t __tbl = vtbl3_u8 (__tab, __idx); + + return vbsl_u8 (__mask, __tbl, __r); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vtbx3_p8 (poly8x8_t __r, poly8x8x3_t __tab, uint8x8_t __idx) +{ + uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (24)); + poly8x8_t __tbl = vtbl3_p8 (__tab, __idx); + + return vbsl_p8 (__mask, __tbl, __r); +} + +/* vtrn */ + +__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__)) +vtrn_f32 (float32x2_t a, float32x2_t b) +{ + return (float32x2x2_t) {vtrn1_f32 (a, b), vtrn2_f32 (a, b)}; +} + +__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__)) +vtrn_p8 (poly8x8_t a, poly8x8_t b) +{ + return (poly8x8x2_t) {vtrn1_p8 (a, b), vtrn2_p8 (a, b)}; +} + +__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__)) +vtrn_p16 (poly16x4_t a, poly16x4_t b) +{ + return (poly16x4x2_t) {vtrn1_p16 (a, b), vtrn2_p16 (a, b)}; +} + +__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__)) +vtrn_s8 (int8x8_t a, int8x8_t b) +{ + return (int8x8x2_t) 
{vtrn1_s8 (a, b), vtrn2_s8 (a, b)}; +} + +__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__)) +vtrn_s16 (int16x4_t a, int16x4_t b) +{ + return (int16x4x2_t) {vtrn1_s16 (a, b), vtrn2_s16 (a, b)}; +} + +__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__)) +vtrn_s32 (int32x2_t a, int32x2_t b) +{ + return (int32x2x2_t) {vtrn1_s32 (a, b), vtrn2_s32 (a, b)}; +} + +__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__)) +vtrn_u8 (uint8x8_t a, uint8x8_t b) +{ + return (uint8x8x2_t) {vtrn1_u8 (a, b), vtrn2_u8 (a, b)}; +} + +__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__)) +vtrn_u16 (uint16x4_t a, uint16x4_t b) +{ + return (uint16x4x2_t) {vtrn1_u16 (a, b), vtrn2_u16 (a, b)}; +} + +__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__)) +vtrn_u32 (uint32x2_t a, uint32x2_t b) +{ + return (uint32x2x2_t) {vtrn1_u32 (a, b), vtrn2_u32 (a, b)}; +} + +__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__)) +vtrnq_f32 (float32x4_t a, float32x4_t b) +{ + return (float32x4x2_t) {vtrn1q_f32 (a, b), vtrn2q_f32 (a, b)}; +} + +__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__)) +vtrnq_p8 (poly8x16_t a, poly8x16_t b) +{ + return (poly8x16x2_t) {vtrn1q_p8 (a, b), vtrn2q_p8 (a, b)}; +} + +__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__)) +vtrnq_p16 (poly16x8_t a, poly16x8_t b) +{ + return (poly16x8x2_t) {vtrn1q_p16 (a, b), vtrn2q_p16 (a, b)}; +} + +__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__)) +vtrnq_s8 (int8x16_t a, int8x16_t b) +{ + return (int8x16x2_t) {vtrn1q_s8 (a, b), vtrn2q_s8 (a, b)}; +} + +__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__)) +vtrnq_s16 (int16x8_t a, int16x8_t b) +{ + return (int16x8x2_t) {vtrn1q_s16 (a, b), vtrn2q_s16 (a, b)}; +} + +__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__)) +vtrnq_s32 (int32x4_t a, int32x4_t b) +{ + return (int32x4x2_t) {vtrn1q_s32 (a, b), vtrn2q_s32 (a, b)}; +} + +__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__)) +vtrnq_u8 (uint8x16_t a, uint8x16_t b) +{ + return (uint8x16x2_t) {vtrn1q_u8 (a, b), vtrn2q_u8 (a, b)}; +} + +__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__)) +vtrnq_u16 (uint16x8_t a, uint16x8_t b) +{ + return (uint16x8x2_t) {vtrn1q_u16 (a, b), vtrn2q_u16 (a, b)}; +} + +__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__)) +vtrnq_u32 (uint32x4_t a, uint32x4_t b) +{ + return (uint32x4x2_t) {vtrn1q_u32 (a, b), vtrn2q_u32 (a, b)}; +} + +/* vtst */ + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vtst_s8 (int8x8_t __a, int8x8_t __b) +{ + return (uint8x8_t) __builtin_aarch64_cmtstv8qi (__a, __b); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vtst_s16 (int16x4_t __a, int16x4_t __b) +{ + return (uint16x4_t) __builtin_aarch64_cmtstv4hi (__a, __b); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vtst_s32 (int32x2_t __a, int32x2_t __b) +{ + return (uint32x2_t) __builtin_aarch64_cmtstv2si (__a, __b); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vtst_s64 (int64x1_t __a, int64x1_t __b) +{ + return (__a & __b) ? 
-1ll : 0ll; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vtst_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint8x8_t) __builtin_aarch64_cmtstv8qi ((int8x8_t) __a, + (int8x8_t) __b); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vtst_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return (uint16x4_t) __builtin_aarch64_cmtstv4hi ((int16x4_t) __a, + (int16x4_t) __b); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vtst_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint32x2_t) __builtin_aarch64_cmtstv2si ((int32x2_t) __a, + (int32x2_t) __b); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vtst_u64 (uint64x1_t __a, uint64x1_t __b) +{ + return (__a & __b) ? -1ll : 0ll; +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vtstq_s8 (int8x16_t __a, int8x16_t __b) +{ + return (uint8x16_t) __builtin_aarch64_cmtstv16qi (__a, __b); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vtstq_s16 (int16x8_t __a, int16x8_t __b) +{ + return (uint16x8_t) __builtin_aarch64_cmtstv8hi (__a, __b); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vtstq_s32 (int32x4_t __a, int32x4_t __b) +{ + return (uint32x4_t) __builtin_aarch64_cmtstv4si (__a, __b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vtstq_s64 (int64x2_t __a, int64x2_t __b) +{ + return (uint64x2_t) __builtin_aarch64_cmtstv2di (__a, __b); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vsubd_u64 (uint64x1_t __a, uint64x1_t __b) +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vtstq_u8 (uint8x16_t __a, uint8x16_t __b) { - return __a - __b; + return (uint8x16_t) __builtin_aarch64_cmtstv16qi ((int8x16_t) __a, + (int8x16_t) __b); } -/* vtbx1 */ - -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vtbx1_s8 (int8x8_t __r, int8x8_t __tab, int8x8_t __idx) +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vtstq_u16 (uint16x8_t __a, uint16x8_t __b) { - uint8x8_t __mask = vclt_u8 (vreinterpret_u8_s8 (__idx), - vmov_n_u8 (8)); - int8x8_t __tbl = vtbl1_s8 (__tab, __idx); - - return vbsl_s8 (__mask, __tbl, __r); + return (uint16x8_t) __builtin_aarch64_cmtstv8hi ((int16x8_t) __a, + (int16x8_t) __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vtbx1_u8 (uint8x8_t __r, uint8x8_t __tab, uint8x8_t __idx) +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vtstq_u32 (uint32x4_t __a, uint32x4_t __b) { - uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (8)); - uint8x8_t __tbl = vtbl1_u8 (__tab, __idx); + return (uint32x4_t) __builtin_aarch64_cmtstv4si ((int32x4_t) __a, + (int32x4_t) __b); +} - return vbsl_u8 (__mask, __tbl, __r); +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vtstq_u64 (uint64x2_t __a, uint64x2_t __b) +{ + return (uint64x2_t) __builtin_aarch64_cmtstv2di ((int64x2_t) __a, + (int64x2_t) __b); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vtbx1_p8 (poly8x8_t __r, poly8x8_t __tab, uint8x8_t __idx) +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vtstd_s64 (int64x1_t __a, int64x1_t __b) { - uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (8)); - poly8x8_t __tbl = vtbl1_p8 (__tab, __idx); + return (__a & __b) ? 
-1ll : 0ll; +} - return vbsl_p8 (__mask, __tbl, __r); +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vtstd_u64 (uint64x1_t __a, uint64x1_t __b) +{ + return (__a & __b) ? -1ll : 0ll; } -/* vtbx3 */ +/* vuqadd */ __extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vtbx3_s8 (int8x8_t __r, int8x8x3_t __tab, int8x8_t __idx) +vuqadd_s8 (int8x8_t __a, uint8x8_t __b) { - uint8x8_t __mask = vclt_u8 (vreinterpret_u8_s8 (__idx), - vmov_n_u8 (24)); - int8x8_t __tbl = vtbl3_s8 (__tab, __idx); - - return vbsl_s8 (__mask, __tbl, __r); + return (int8x8_t) __builtin_aarch64_suqaddv8qi (__a, (int8x8_t) __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vtbx3_u8 (uint8x8_t __r, uint8x8x3_t __tab, uint8x8_t __idx) +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vuqadd_s16 (int16x4_t __a, uint16x4_t __b) { - uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (24)); - uint8x8_t __tbl = vtbl3_u8 (__tab, __idx); - - return vbsl_u8 (__mask, __tbl, __r); + return (int16x4_t) __builtin_aarch64_suqaddv4hi (__a, (int16x4_t) __b); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vtbx3_p8 (poly8x8_t __r, poly8x8x3_t __tab, uint8x8_t __idx) +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vuqadd_s32 (int32x2_t __a, uint32x2_t __b) { - uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (24)); - poly8x8_t __tbl = vtbl3_p8 (__tab, __idx); - - return vbsl_p8 (__mask, __tbl, __r); + return (int32x2_t) __builtin_aarch64_suqaddv2si (__a, (int32x2_t) __b); } -/* vtrn */ - -__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__)) -vtrn_f32 (float32x2_t a, float32x2_t b) +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vuqadd_s64 (int64x1_t __a, uint64x1_t __b) { - return (float32x2x2_t) {vtrn1_f32 (a, b), vtrn2_f32 (a, b)}; + return (int64x1_t) __builtin_aarch64_suqadddi (__a, (int64x1_t) __b); } -__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__)) -vtrn_p8 (poly8x8_t a, poly8x8_t b) +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vuqaddq_s8 (int8x16_t __a, uint8x16_t __b) { - return (poly8x8x2_t) {vtrn1_p8 (a, b), vtrn2_p8 (a, b)}; + return (int8x16_t) __builtin_aarch64_suqaddv16qi (__a, (int8x16_t) __b); } -__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__)) -vtrn_p16 (poly16x4_t a, poly16x4_t b) +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vuqaddq_s16 (int16x8_t __a, uint16x8_t __b) { - return (poly16x4x2_t) {vtrn1_p16 (a, b), vtrn2_p16 (a, b)}; + return (int16x8_t) __builtin_aarch64_suqaddv8hi (__a, (int16x8_t) __b); } -__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__)) -vtrn_s8 (int8x8_t a, int8x8_t b) +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vuqaddq_s32 (int32x4_t __a, uint32x4_t __b) { - return (int8x8x2_t) {vtrn1_s8 (a, b), vtrn2_s8 (a, b)}; + return (int32x4_t) __builtin_aarch64_suqaddv4si (__a, (int32x4_t) __b); } -__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__)) -vtrn_s16 (int16x4_t a, int16x4_t b) +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vuqaddq_s64 (int64x2_t __a, uint64x2_t __b) { - return (int16x4x2_t) {vtrn1_s16 (a, b), vtrn2_s16 (a, b)}; + return (int64x2_t) __builtin_aarch64_suqaddv2di (__a, (int64x2_t) __b); } -__extension__ static __inline int32x2x2_t 
__attribute__ ((__always_inline__)) -vtrn_s32 (int32x2_t a, int32x2_t b) +__extension__ static __inline int8x1_t __attribute__ ((__always_inline__)) +vuqaddb_s8 (int8x1_t __a, uint8x1_t __b) { - return (int32x2x2_t) {vtrn1_s32 (a, b), vtrn2_s32 (a, b)}; + return (int8x1_t) __builtin_aarch64_suqaddqi (__a, (int8x1_t) __b); } -__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__)) -vtrn_u8 (uint8x8_t a, uint8x8_t b) +__extension__ static __inline int16x1_t __attribute__ ((__always_inline__)) +vuqaddh_s16 (int16x1_t __a, uint16x1_t __b) { - return (uint8x8x2_t) {vtrn1_u8 (a, b), vtrn2_u8 (a, b)}; + return (int16x1_t) __builtin_aarch64_suqaddhi (__a, (int16x1_t) __b); } -__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__)) -vtrn_u16 (uint16x4_t a, uint16x4_t b) +__extension__ static __inline int32x1_t __attribute__ ((__always_inline__)) +vuqadds_s32 (int32x1_t __a, uint32x1_t __b) { - return (uint16x4x2_t) {vtrn1_u16 (a, b), vtrn2_u16 (a, b)}; + return (int32x1_t) __builtin_aarch64_suqaddsi (__a, (int32x1_t) __b); } -__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__)) -vtrn_u32 (uint32x2_t a, uint32x2_t b) +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vuqaddd_s64 (int64x1_t __a, uint64x1_t __b) { - return (uint32x2x2_t) {vtrn1_u32 (a, b), vtrn2_u32 (a, b)}; + return (int64x1_t) __builtin_aarch64_suqadddi (__a, (int64x1_t) __b); } -__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__)) -vtrnq_f32 (float32x4_t a, float32x4_t b) +#define __DEFINTERLEAVE(op, rettype, intype, funcsuffix, Q) \ + __extension__ static __inline rettype \ + __attribute__ ((__always_inline__)) \ + v ## op ## Q ## _ ## funcsuffix (intype a, intype b) \ + { \ + return (rettype) {v ## op ## 1 ## Q ## _ ## funcsuffix (a, b), \ + v ## op ## 2 ## Q ## _ ## funcsuffix (a, b)}; \ + } + +#define __INTERLEAVE_LIST(op) \ + __DEFINTERLEAVE (op, float32x2x2_t, float32x2_t, f32,) \ + __DEFINTERLEAVE (op, poly8x8x2_t, poly8x8_t, p8,) \ + __DEFINTERLEAVE (op, poly16x4x2_t, poly16x4_t, p16,) \ + __DEFINTERLEAVE (op, int8x8x2_t, int8x8_t, s8,) \ + __DEFINTERLEAVE (op, int16x4x2_t, int16x4_t, s16,) \ + __DEFINTERLEAVE (op, int32x2x2_t, int32x2_t, s32,) \ + __DEFINTERLEAVE (op, uint8x8x2_t, uint8x8_t, u8,) \ + __DEFINTERLEAVE (op, uint16x4x2_t, uint16x4_t, u16,) \ + __DEFINTERLEAVE (op, uint32x2x2_t, uint32x2_t, u32,) \ + __DEFINTERLEAVE (op, float32x4x2_t, float32x4_t, f32, q) \ + __DEFINTERLEAVE (op, poly8x16x2_t, poly8x16_t, p8, q) \ + __DEFINTERLEAVE (op, poly16x8x2_t, poly16x8_t, p16, q) \ + __DEFINTERLEAVE (op, int8x16x2_t, int8x16_t, s8, q) \ + __DEFINTERLEAVE (op, int16x8x2_t, int16x8_t, s16, q) \ + __DEFINTERLEAVE (op, int32x4x2_t, int32x4_t, s32, q) \ + __DEFINTERLEAVE (op, uint8x16x2_t, uint8x16_t, u8, q) \ + __DEFINTERLEAVE (op, uint16x8x2_t, uint16x8_t, u16, q) \ + __DEFINTERLEAVE (op, uint32x4x2_t, uint32x4_t, u32, q) + +/* vuzp */ + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vuzp1_f32 (float32x2_t __a, float32x2_t __b) { - return (float32x4x2_t) {vtrn1q_f32 (a, b), vtrn2q_f32 (a, b)}; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2}); +#endif } -__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__)) -vtrnq_p8 (poly8x16_t a, poly8x16_t b) +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vuzp1_p8 (poly8x8_t __a, 
poly8x8_t __b) { - return (poly8x16x2_t) {vtrn1q_p8 (a, b), vtrn2q_p8 (a, b)}; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 11, 13, 15, 1, 3, 5, 7}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 2, 4, 6, 8, 10, 12, 14}); +#endif } -__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__)) -vtrnq_p16 (poly16x8_t a, poly16x8_t b) +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vuzp1_p16 (poly16x4_t __a, poly16x4_t __b) { - return (poly16x8x2_t) {vtrn1q_p16 (a, b), vtrn2q_p16 (a, b)}; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 7, 1, 3}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 2, 4, 6}); +#endif } -__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__)) -vtrnq_s8 (int8x16_t a, int8x16_t b) +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vuzp1_s8 (int8x8_t __a, int8x8_t __b) { - return (int8x16x2_t) {vtrn1q_s8 (a, b), vtrn2q_s8 (a, b)}; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 11, 13, 15, 1, 3, 5, 7}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 2, 4, 6, 8, 10, 12, 14}); +#endif } -__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__)) -vtrnq_s16 (int16x8_t a, int16x8_t b) +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vuzp1_s16 (int16x4_t __a, int16x4_t __b) { - return (int16x8x2_t) {vtrn1q_s16 (a, b), vtrn2q_s16 (a, b)}; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 7, 1, 3}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 2, 4, 6}); +#endif } -__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__)) -vtrnq_s32 (int32x4_t a, int32x4_t b) +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vuzp1_s32 (int32x2_t __a, int32x2_t __b) { - return (int32x4x2_t) {vtrn1q_s32 (a, b), vtrn2q_s32 (a, b)}; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2}); +#endif } -__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__)) -vtrnq_u8 (uint8x16_t a, uint8x16_t b) +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vuzp1_u8 (uint8x8_t __a, uint8x8_t __b) { - return (uint8x16x2_t) {vtrn1q_u8 (a, b), vtrn2q_u8 (a, b)}; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 11, 13, 15, 1, 3, 5, 7}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 2, 4, 6, 8, 10, 12, 14}); +#endif } -__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__)) -vtrnq_u16 (uint16x8_t a, uint16x8_t b) +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vuzp1_u16 (uint16x4_t __a, uint16x4_t __b) { - return (uint16x8x2_t) {vtrn1q_u16 (a, b), vtrn2q_u16 (a, b)}; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 7, 1, 3}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 2, 4, 6}); +#endif } -__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__)) -vtrnq_u32 (uint32x4_t a, uint32x4_t b) +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vuzp1_u32 (uint32x2_t __a, uint32x2_t __b) { - return (uint32x4x2_t) {vtrn1q_u32 (a, b), vtrn2q_u32 (a, b)}; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1}); 
+#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2}); +#endif } -/* vtst */ - -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vtst_s8 (int8x8_t __a, int8x8_t __b) +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vuzp1q_f32 (float32x4_t __a, float32x4_t __b) { - return (uint8x8_t) __builtin_aarch64_cmtstv8qi (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 7, 1, 3}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 2, 4, 6}); +#endif } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vtst_s16 (int16x4_t __a, int16x4_t __b) +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vuzp1q_f64 (float64x2_t __a, float64x2_t __b) { - return (uint16x4_t) __builtin_aarch64_cmtstv4hi (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2}); +#endif } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vtst_s32 (int32x2_t __a, int32x2_t __b) +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vuzp1q_p8 (poly8x16_t __a, poly8x16_t __b) { - return (uint32x2_t) __builtin_aarch64_cmtstv2si (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x16_t) + {17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15}); +#else + return __builtin_shuffle (__a, __b, (uint8x16_t) + {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}); +#endif } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vtst_s64 (int64x1_t __a, int64x1_t __b) +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vuzp1q_p16 (poly16x8_t __a, poly16x8_t __b) { - return (__a & __b) ? 
-1ll : 0ll; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 11, 13, 15, 1, 3, 5, 7}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 2, 4, 6, 8, 10, 12, 14}); +#endif } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vtst_u8 (uint8x8_t __a, uint8x8_t __b) +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vuzp1q_s8 (int8x16_t __a, int8x16_t __b) { - return (uint8x8_t) __builtin_aarch64_cmtstv8qi ((int8x8_t) __a, - (int8x8_t) __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, + (uint8x16_t) {17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15}); +#else + return __builtin_shuffle (__a, __b, + (uint8x16_t) {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}); +#endif } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vtst_u16 (uint16x4_t __a, uint16x4_t __b) +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vuzp1q_s16 (int16x8_t __a, int16x8_t __b) { - return (uint16x4_t) __builtin_aarch64_cmtstv4hi ((int16x4_t) __a, - (int16x4_t) __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 11, 13, 15, 1, 3, 5, 7}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 2, 4, 6, 8, 10, 12, 14}); +#endif } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vtst_u32 (uint32x2_t __a, uint32x2_t __b) +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vuzp1q_s32 (int32x4_t __a, int32x4_t __b) { - return (uint32x2_t) __builtin_aarch64_cmtstv2si ((int32x2_t) __a, - (int32x2_t) __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 7, 1, 3}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 2, 4, 6}); +#endif } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vtst_u64 (uint64x1_t __a, uint64x1_t __b) +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vuzp1q_s64 (int64x2_t __a, int64x2_t __b) { - return (__a & __b) ? 
-1ll : 0ll; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2}); +#endif } __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vtstq_s8 (int8x16_t __a, int8x16_t __b) +vuzp1q_u8 (uint8x16_t __a, uint8x16_t __b) { - return (uint8x16_t) __builtin_aarch64_cmtstv16qi (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, + (uint8x16_t) {17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15}); +#else + return __builtin_shuffle (__a, __b, + (uint8x16_t) {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}); +#endif } __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vtstq_s16 (int16x8_t __a, int16x8_t __b) +vuzp1q_u16 (uint16x8_t __a, uint16x8_t __b) { - return (uint16x8_t) __builtin_aarch64_cmtstv8hi (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 11, 13, 15, 1, 3, 5, 7}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 2, 4, 6, 8, 10, 12, 14}); +#endif } __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vtstq_s32 (int32x4_t __a, int32x4_t __b) +vuzp1q_u32 (uint32x4_t __a, uint32x4_t __b) { - return (uint32x4_t) __builtin_aarch64_cmtstv4si (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 7, 1, 3}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 2, 4, 6}); +#endif } __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vtstq_s64 (int64x2_t __a, int64x2_t __b) +vuzp1q_u64 (uint64x2_t __a, uint64x2_t __b) { - return (uint64x2_t) __builtin_aarch64_cmtstv2di (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2}); +#endif } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vtstq_u8 (uint8x16_t __a, uint8x16_t __b) +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vuzp2_f32 (float32x2_t __a, float32x2_t __b) { - return (uint8x16_t) __builtin_aarch64_cmtstv16qi ((int8x16_t) __a, - (int8x16_t) __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3}); +#endif } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vtstq_u16 (uint16x8_t __a, uint16x8_t __b) +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vuzp2_p8 (poly8x8_t __a, poly8x8_t __b) { - return (uint16x8_t) __builtin_aarch64_cmtstv8hi ((int16x8_t) __a, - (int16x8_t) __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 10, 12, 14, 0, 2, 4, 6}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 3, 5, 7, 9, 11, 13, 15}); +#endif } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vtstq_u32 (uint32x4_t __a, uint32x4_t __b) +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vuzp2_p16 (poly16x4_t __a, poly16x4_t __b) { - return (uint32x4_t) __builtin_aarch64_cmtstv4si ((int32x4_t) __a, - (int32x4_t) __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 6, 0, 2}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 3, 5, 7}); +#endif } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vtstq_u64 (uint64x2_t __a, uint64x2_t __b) +__extension__ 
static __inline int8x8_t __attribute__ ((__always_inline__)) +vuzp2_s8 (int8x8_t __a, int8x8_t __b) { - return (uint64x2_t) __builtin_aarch64_cmtstv2di ((int64x2_t) __a, - (int64x2_t) __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 10, 12, 14, 0, 2, 4, 6}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 3, 5, 7, 9, 11, 13, 15}); +#endif } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vtstd_s64 (int64x1_t __a, int64x1_t __b) +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vuzp2_s16 (int16x4_t __a, int16x4_t __b) { - return (__a & __b) ? -1ll : 0ll; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 6, 0, 2}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 3, 5, 7}); +#endif } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vtstd_u64 (uint64x1_t __a, uint64x1_t __b) +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vuzp2_s32 (int32x2_t __a, int32x2_t __b) { - return (__a & __b) ? -1ll : 0ll; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3}); +#endif } -/* vuqadd */ +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vuzp2_u8 (uint8x8_t __a, uint8x8_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 10, 12, 14, 0, 2, 4, 6}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 3, 5, 7, 9, 11, 13, 15}); +#endif +} -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vuqadd_s8 (int8x8_t __a, uint8x8_t __b) +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vuzp2_u16 (uint16x4_t __a, uint16x4_t __b) { - return (int8x8_t) __builtin_aarch64_suqaddv8qi (__a, (int8x8_t) __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 6, 0, 2}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 3, 5, 7}); +#endif } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vuqadd_s16 (int16x4_t __a, uint16x4_t __b) +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vuzp2_u32 (uint32x2_t __a, uint32x2_t __b) { - return (int16x4_t) __builtin_aarch64_suqaddv4hi (__a, (int16x4_t) __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3}); +#endif } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vuqadd_s32 (int32x2_t __a, uint32x2_t __b) +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vuzp2q_f32 (float32x4_t __a, float32x4_t __b) { - return (int32x2_t) __builtin_aarch64_suqaddv2si (__a, (int32x2_t) __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 6, 0, 2}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 3, 5, 7}); +#endif } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) -vuqadd_s64 (int64x1_t __a, uint64x1_t __b) +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vuzp2q_f64 (float64x2_t __a, float64x2_t __b) { - return (int64x1_t) __builtin_aarch64_suqadddi (__a, (int64x1_t) __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3}); +#endif +} + 
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vuzp2q_p8 (poly8x16_t __a, poly8x16_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, + (uint8x16_t) {16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14}); +#else + return __builtin_shuffle (__a, __b, + (uint8x16_t) {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}); +#endif +} + +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vuzp2q_p16 (poly16x8_t __a, poly16x8_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 10, 12, 14, 0, 2, 4, 6}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 3, 5, 7, 9, 11, 13, 15}); +#endif } __extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vuqaddq_s8 (int8x16_t __a, uint8x16_t __b) +vuzp2q_s8 (int8x16_t __a, int8x16_t __b) { - return (int8x16_t) __builtin_aarch64_suqaddv16qi (__a, (int8x16_t) __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, + (uint8x16_t) {16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14}); +#else + return __builtin_shuffle (__a, __b, + (uint8x16_t) {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}); +#endif } __extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vuqaddq_s16 (int16x8_t __a, uint16x8_t __b) +vuzp2q_s16 (int16x8_t __a, int16x8_t __b) { - return (int16x8_t) __builtin_aarch64_suqaddv8hi (__a, (int16x8_t) __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 10, 12, 14, 0, 2, 4, 6}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 3, 5, 7, 9, 11, 13, 15}); +#endif } __extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vuqaddq_s32 (int32x4_t __a, uint32x4_t __b) +vuzp2q_s32 (int32x4_t __a, int32x4_t __b) { - return (int32x4_t) __builtin_aarch64_suqaddv4si (__a, (int32x4_t) __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 6, 0, 2}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 3, 5, 7}); +#endif } __extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vuqaddq_s64 (int64x2_t __a, uint64x2_t __b) +vuzp2q_s64 (int64x2_t __a, int64x2_t __b) { - return (int64x2_t) __builtin_aarch64_suqaddv2di (__a, (int64x2_t) __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3}); +#endif } -__extension__ static __inline int8x1_t __attribute__ ((__always_inline__)) -vuqaddb_s8 (int8x1_t __a, uint8x1_t __b) +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vuzp2q_u8 (uint8x16_t __a, uint8x16_t __b) { - return (int8x1_t) __builtin_aarch64_suqaddqi (__a, (int8x1_t) __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x16_t) + {16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14}); +#else + return __builtin_shuffle (__a, __b, (uint8x16_t) + {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}); +#endif } -__extension__ static __inline int16x1_t __attribute__ ((__always_inline__)) -vuqaddh_s16 (int16x1_t __a, uint16x1_t __b) +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vuzp2q_u16 (uint16x8_t __a, uint16x8_t __b) { - return (int16x1_t) __builtin_aarch64_suqaddhi (__a, (int16x1_t) __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 10, 12, 14, 0, 2, 4, 6}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 3, 5, 
7, 9, 11, 13, 15});
+#endif
 }
 
-__extension__ static __inline int32x1_t __attribute__ ((__always_inline__))
-vuqadds_s32 (int32x1_t __a, uint32x1_t __b)
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vuzp2q_u32 (uint32x4_t __a, uint32x4_t __b)
 {
-  return (int32x1_t) __builtin_aarch64_suqaddsi (__a, (int32x1_t) __b);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 6, 0, 2});
+#else
+  return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 3, 5, 7});
+#endif
 }
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vuqaddd_s64 (int64x1_t __a, uint64x1_t __b)
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vuzp2q_u64 (uint64x2_t __a, uint64x2_t __b)
 {
-  return (int64x1_t) __builtin_aarch64_suqadddi (__a, (int64x1_t) __b);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0});
+#else
+  return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3});
+#endif
 }
 
-#define __DEFINTERLEAVE(op, rettype, intype, funcsuffix, Q) \
-  __extension__ static __inline rettype \
-  __attribute__ ((__always_inline__)) \
-  v ## op ## Q ## _ ## funcsuffix (intype a, intype b) \
-  { \
-    return (rettype) {v ## op ## 1 ## Q ## _ ## funcsuffix (a, b), \
-                      v ## op ## 2 ## Q ## _ ## funcsuffix (a, b)}; \
-  }
-
-#define __INTERLEAVE_LIST(op) \
-  __DEFINTERLEAVE (op, float32x2x2_t, float32x2_t, f32,) \
-  __DEFINTERLEAVE (op, poly8x8x2_t, poly8x8_t, p8,) \
-  __DEFINTERLEAVE (op, poly16x4x2_t, poly16x4_t, p16,) \
-  __DEFINTERLEAVE (op, int8x8x2_t, int8x8_t, s8,) \
-  __DEFINTERLEAVE (op, int16x4x2_t, int16x4_t, s16,) \
-  __DEFINTERLEAVE (op, int32x2x2_t, int32x2_t, s32,) \
-  __DEFINTERLEAVE (op, uint8x8x2_t, uint8x8_t, u8,) \
-  __DEFINTERLEAVE (op, uint16x4x2_t, uint16x4_t, u16,) \
-  __DEFINTERLEAVE (op, uint32x2x2_t, uint32x2_t, u32,) \
-  __DEFINTERLEAVE (op, float32x4x2_t, float32x4_t, f32, q) \
-  __DEFINTERLEAVE (op, poly8x16x2_t, poly8x16_t, p8, q) \
-  __DEFINTERLEAVE (op, poly16x8x2_t, poly16x8_t, p16, q) \
-  __DEFINTERLEAVE (op, int8x16x2_t, int8x16_t, s8, q) \
-  __DEFINTERLEAVE (op, int16x8x2_t, int16x8_t, s16, q) \
-  __DEFINTERLEAVE (op, int32x4x2_t, int32x4_t, s32, q) \
-  __DEFINTERLEAVE (op, uint8x16x2_t, uint8x16_t, u8, q) \
-  __DEFINTERLEAVE (op, uint16x8x2_t, uint16x8_t, u16, q) \
-  __DEFINTERLEAVE (op, uint32x4x2_t, uint32x4_t, u32, q)
-
-/* vuzp */
-
 __INTERLEAVE_LIST (uzp)
 
 /* vzip */
 
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index b409f94..00ae0d1 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,5 +1,11 @@
 2014-04-30  Alan Lawrence
 
+	* gcc.target/aarch64/simd/vuzps32_1.c: Expect zip1/2 insn rather than uzp1/2.
+	* gcc.target/aarch64/simd/vuzpu32_1.c: Likewise.
+	* gcc.target/aarch64/simd/vuzpf32_1.c: Likewise.
+
+2014-04-30  Alan Lawrence
+
 	* gcc.target/aarch64/simd/vuzpf32_1.c: New file.
 	* gcc.target/aarch64/simd/vuzpf32.x: New file.
 	* gcc.target/aarch64/simd/vuzpp16_1.c: New file.
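A note on the testsuite change: on two-element vectors the uzp and zip permutes coincide, since uzp1 and zip1 both produce {a[0], b[0]} and uzp2 and zip2 both produce {a[1], b[1]}. Once the intrinsics are expressed as __builtin_shuffle the compiler is free to pick either mnemonic for that permutation, and it emits zip1/zip2 for the .2s cases, hence the revised scan-assembler patterns below. A minimal runnable sketch of the equivalence (illustration only, not part of the patch; assumes a little-endian AArch64 target and GCC's arm_neon.h):

#include <arm_neon.h>
#include <assert.h>

int
main (void)
{
  int32x2_t a = {10, 11}, b = {20, 21};
  /* On two lanes, uzp1 selects {a[0], b[0]}, exactly as zip1 does.  */
  int32x2_t u1 = vuzp1_s32 (a, b);
  int32x2_t z1 = vzip1_s32 (a, b);
  assert (vget_lane_s32 (u1, 0) == vget_lane_s32 (z1, 0)
          && vget_lane_s32 (u1, 1) == vget_lane_s32 (z1, 1));
  /* Likewise uzp2 and zip2 both select {a[1], b[1]}.  */
  int32x2_t u2 = vuzp2_s32 (a, b);
  int32x2_t z2 = vzip2_s32 (a, b);
  assert (vget_lane_s32 (u2, 0) == vget_lane_s32 (z2, 0)
          && vget_lane_s32 (u2, 1) == vget_lane_s32 (z2, 1));
  return 0;
}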
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vuzpf32_1.c b/gcc/testsuite/gcc.target/aarch64/simd/vuzpf32_1.c
index fedee93..0daba1c 100644
--- a/gcc/testsuite/gcc.target/aarch64/simd/vuzpf32_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vuzpf32_1.c
@@ -6,6 +6,6 @@
 #include <arm_neon.h>
 #include "vuzpf32.x"
 
-/* { dg-final { scan-assembler-times "uzp1\[ \t\]+v\[0-9\]+\.2s, ?v\[0-9\]+\.2s, ?v\[0-9\]+\.2s!?\(?:\[ \t\]+@\[a-zA-Z0-9 \]+\)?\n" 1 } } */
-/* { dg-final { scan-assembler-times "uzp2\[ \t\]+v\[0-9\]+\.2s, ?v\[0-9\]+\.2s, ?v\[0-9\]+\.2s!?\(?:\[ \t\]+@\[a-zA-Z0-9 \]+\)?\n" 1 } } */
+/* { dg-final { scan-assembler-times "zip1\[ \t\]+v\[0-9\]+\.2s, ?v\[0-9\]+\.2s, ?v\[0-9\]+\.2s!?\(?:\[ \t\]+@\[a-zA-Z0-9 \]+\)?\n" 1 } } */
+/* { dg-final { scan-assembler-times "zip2\[ \t\]+v\[0-9\]+\.2s, ?v\[0-9\]+\.2s, ?v\[0-9\]+\.2s!?\(?:\[ \t\]+@\[a-zA-Z0-9 \]+\)?\n" 1 } } */
 /* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vuzps32_1.c b/gcc/testsuite/gcc.target/aarch64/simd/vuzps32_1.c
index c9de7c8..af48d63 100644
--- a/gcc/testsuite/gcc.target/aarch64/simd/vuzps32_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vuzps32_1.c
@@ -6,6 +6,6 @@
 #include <arm_neon.h>
 #include "vuzps32.x"
 
-/* { dg-final { scan-assembler-times "uzp1\[ \t\]+v\[0-9\]+\.2s, ?v\[0-9\]+\.2s, ?v\[0-9\]+\.2s!?\(?:\[ \t\]+@\[a-zA-Z0-9 \]+\)?\n" 1 } } */
-/* { dg-final { scan-assembler-times "uzp2\[ \t\]+v\[0-9\]+\.2s, ?v\[0-9\]+\.2s, ?v\[0-9\]+\.2s!?\(?:\[ \t\]+@\[a-zA-Z0-9 \]+\)?\n" 1 } } */
+/* { dg-final { scan-assembler-times "zip1\[ \t\]+v\[0-9\]+\.2s, ?v\[0-9\]+\.2s, ?v\[0-9\]+\.2s!?\(?:\[ \t\]+@\[a-zA-Z0-9 \]+\)?\n" 1 } } */
+/* { dg-final { scan-assembler-times "zip2\[ \t\]+v\[0-9\]+\.2s, ?v\[0-9\]+\.2s, ?v\[0-9\]+\.2s!?\(?:\[ \t\]+@\[a-zA-Z0-9 \]+\)?\n" 1 } } */
 /* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vuzpu32_1.c b/gcc/testsuite/gcc.target/aarch64/simd/vuzpu32_1.c
index daae84b..05e1c95 100644
--- a/gcc/testsuite/gcc.target/aarch64/simd/vuzpu32_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vuzpu32_1.c
@@ -6,6 +6,6 @@
 #include <arm_neon.h>
 #include "vuzpu32.x"
 
-/* { dg-final { scan-assembler-times "uzp1\[ \t\]+v\[0-9\]+\.2s, ?v\[0-9\]+\.2s, ?v\[0-9\]+\.2s!?\(?:\[ \t\]+@\[a-zA-Z0-9 \]+\)?\n" 1 } } */
-/* { dg-final { scan-assembler-times "uzp2\[ \t\]+v\[0-9\]+\.2s, ?v\[0-9\]+\.2s, ?v\[0-9\]+\.2s!?\(?:\[ \t\]+@\[a-zA-Z0-9 \]+\)?\n" 1 } } */
+/* { dg-final { scan-assembler-times "zip1\[ \t\]+v\[0-9\]+\.2s, ?v\[0-9\]+\.2s, ?v\[0-9\]+\.2s!?\(?:\[ \t\]+@\[a-zA-Z0-9 \]+\)?\n" 1 } } */
+/* { dg-final { scan-assembler-times "zip2\[ \t\]+v\[0-9\]+\.2s, ?v\[0-9\]+\.2s, ?v\[0-9\]+\.2s!?\(?:\[ \t\]+@\[a-zA-Z0-9 \]+\)?\n" 1 } } */
 /* { dg-final { cleanup-saved-temps } } */
-- 
2.7.4
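For reference, the idiom the patch adopts, pulled out as a standalone sketch (my_uzp1_s32 is a made-up name for illustration; it mirrors the new vuzp1_s32 above): __builtin_shuffle selects lanes from the concatenation of its two operands under a constant index vector, so a two-lane uzp1 takes lanes {0, 2} of a:b. The mask differs under __AARCH64EB__ because, roughly speaking, lane 0 of a GCC generic vector corresponds to the most significant architectural lane on big-endian AArch64, so the indices into both operands and the order of the result lanes are renumbered; for the two-lane case, {0, 2} becomes {3, 1}, matching the definitions in the diff.

#include <arm_neon.h>

/* Sketch of the rewritten intrinsics' shape.  Compiled for little-endian
   AArch64 this should collapse to a single two-operand permute (uzp1, or
   the equivalent zip1 that the tests above now expect).  */
__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
my_uzp1_s32 (int32x2_t a, int32x2_t b)
{
#ifdef __AARCH64EB__
  return __builtin_shuffle (a, b, (uint32x2_t) {3, 1});
#else
  return __builtin_shuffle (a, b, (uint32x2_t) {0, 2});
#endif
}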