[AArch64] fcvt instructions - arm_neon.h changes.
authorJames Greenhalgh <james.greenhalgh@arm.com>
Mon, 29 Apr 2013 11:08:30 +0000 (11:08 +0000)
committerJames Greenhalgh <jgreenhalgh@gcc.gnu.org>
Mon, 29 Apr 2013 11:08:30 +0000 (11:08 +0000)
gcc/
* config/aarch64/arm_neon.h
(vcvt<sd>_f<32,64>_s<32,64>): Rewrite in C.
(vcvt<q>_f<32,64>_s<32,64>): Rewrite using builtins.
(vcvt_<high_>_f<32,64>_f<32,64>): Likewise.
(vcvt<qsd>_<su><32,64>_f<32,64>): Likewise.
(vcvta<qsd>_<su><32,64>_f<32,64>): Likewise.
(vcvtm<qsd>_<su><32,64>_f<32,64>): Likewise.
(vcvtn<qsd>_<su><32,64>_f<32,64>): Likewise.
(vcvtp<qsd>_<su><32,64>_f<32,64>): Likewise.

gcc/testsuite/
* gcc.target/aarch64/vect-vcvt.c: New.

From-SVN: r198404

gcc/ChangeLog
gcc/config/aarch64/arm_neon.h
gcc/testsuite/ChangeLog
gcc/testsuite/gcc.target/aarch64/vect-vcvt.c [new file with mode: 0644]

index 3c42d60..dc763cc 100644 (file)
@@ -1,5 +1,17 @@
 2013-04-29  James Greenhalgh  <james.greenhalgh@arm.com>
 
+       * config/aarch64/arm_neon.h
+       (vcvt<sd>_f<32,64>_s<32,64>): Rewrite in C.
+       (vcvt<q>_f<32,64>_s<32,64>): Rewrite using builtins.
+       (vcvt_<high_>_f<32,64>_f<32,64>): Likewise.
+       (vcvt<qsd>_<su><32,64>_f<32,64>): Likewise.
+       (vcvta<qsd>_<su><32,64>_f<32,64>): Likewise.
+       (vcvtm<qsd>_<su><32,64>_f<32,64>): Likewise.
+       (vcvtn<qsd>_<su><32,64>_f<32,64>): Likewise.
+       (vcvtp<qsd>_<su><32,64>_f<32,64>): Likewise.
+
+2013-04-29  James Greenhalgh  <james.greenhalgh@arm.com>
+
        * config/aarch64/aarch64-simd.md
        (<optab><VDQF:mode><fcvt_target>2): New, maps to fix, fixuns.
        (<fix_trunc_optab><VDQF:mode><fcvt_target>2): New, maps to
index c868a46..7d37744 100644 (file)
@@ -5882,100 +5882,12 @@ vcntq_u8 (uint8x16_t a)
 
 /* vcvt_f32_f16 not supported */
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vcvt_f32_f64 (float64x2_t a)
-{
-  float32x2_t result;
-  __asm__ ("fcvtn %0.2s,%1.2d"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vcvt_f32_s32 (int32x2_t a)
-{
-  float32x2_t result;
-  __asm__ ("scvtf %0.2s, %1.2s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vcvt_f32_u32 (uint32x2_t a)
-{
-  float32x2_t result;
-  __asm__ ("ucvtf %0.2s, %1.2s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vcvt_f64_f32 (float32x2_t a)
-{
-  float64x2_t result;
-  __asm__ ("fcvtl %0.2d,%1.2s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vcvt_f64_s64 (uint64x1_t a)
-{
-  float64x1_t result;
-  __asm__ ("scvtf %d0, %d1"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vcvt_f64_u64 (uint64x1_t a)
-{
-  float64x1_t result;
-  __asm__ ("ucvtf %d0, %d1"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
 /* vcvt_high_f16_f32 not supported */
 
 /* vcvt_high_f32_f16 not supported */
 
 static float32x2_t vdup_n_f32 (float32_t);
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vcvt_high_f32_f64 (float32x2_t a, float64x2_t b)
-{
-  float32x4_t result = vcombine_f32 (a, vdup_n_f32 (0.0f));
-  __asm__ ("fcvtn2 %0.4s,%2.2d"
-           : "+w"(result)
-           : "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vcvt_high_f64_f32 (float32x4_t a)
-{
-  float64x2_t result;
-  __asm__ ("fcvtl2 %0.2d,%1.4s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
 #define vcvt_n_f32_s32(a, b)                                            \
   __extension__                                                         \
     ({                                                                  \
@@ -6024,160 +5936,6 @@ vcvt_high_f64_f32 (float32x4_t a)
        result;                                                          \
      })
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vcvt_s32_f32 (float32x2_t a)
-{
-  int32x2_t result;
-  __asm__ ("fcvtzs %0.2s, %1.2s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcvt_u32_f32 (float32x2_t a)
-{
-  uint32x2_t result;
-  __asm__ ("fcvtzu %0.2s, %1.2s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vcvta_s32_f32 (float32x2_t a)
-{
-  int32x2_t result;
-  __asm__ ("fcvtas %0.2s, %1.2s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcvta_u32_f32 (float32x2_t a)
-{
-  uint32x2_t result;
-  __asm__ ("fcvtau %0.2s, %1.2s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vcvtad_s64_f64 (float64_t a)
-{
-  float64_t result;
-  __asm__ ("fcvtas %d0,%d1"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vcvtad_u64_f64 (float64_t a)
-{
-  float64_t result;
-  __asm__ ("fcvtau %d0,%d1"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vcvtaq_s32_f32 (float32x4_t a)
-{
-  int32x4_t result;
-  __asm__ ("fcvtas %0.4s, %1.4s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vcvtaq_s64_f64 (float64x2_t a)
-{
-  int64x2_t result;
-  __asm__ ("fcvtas %0.2d, %1.2d"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcvtaq_u32_f32 (float32x4_t a)
-{
-  uint32x4_t result;
-  __asm__ ("fcvtau %0.4s, %1.4s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcvtaq_u64_f64 (float64x2_t a)
-{
-  uint64x2_t result;
-  __asm__ ("fcvtau %0.2d, %1.2d"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vcvtas_s64_f64 (float32_t a)
-{
-  float32_t result;
-  __asm__ ("fcvtas %s0,%s1"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vcvtas_u64_f64 (float32_t a)
-{
-  float32_t result;
-  __asm__ ("fcvtau %s0,%s1"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-vcvtd_f64_s64 (int64_t a)
-{
-  int64_t result;
-  __asm__ ("scvtf %d0,%d1"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vcvtd_f64_u64 (uint64_t a)
-{
-  uint64_t result;
-  __asm__ ("ucvtf %d0,%d1"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
 #define vcvtd_n_f64_s64(a, b)                                           \
   __extension__                                                         \
     ({                                                                  \
@@ -6226,220 +5984,166 @@ vcvtd_f64_u64 (uint64_t a)
        result;                                                          \
      })
 
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vcvtd_s64_f64 (float64_t a)
-{
-  float64_t result;
-  __asm__ ("fcvtzs %d0,%d1"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
+#define vcvtq_n_f32_s32(a, b)                                           \
+  __extension__                                                         \
+    ({                                                                  \
+       int32x4_t a_ = (a);                                              \
+       float32x4_t result;                                              \
+       __asm__ ("scvtf %0.4s, %1.4s, #%2"                               \
+                : "=w"(result)                                          \
+                : "w"(a_), "i"(b)                                       \
+                : /* No clobbers */);                                   \
+       result;                                                          \
+     })
 
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vcvtd_u64_f64 (float64_t a)
-{
-  float64_t result;
-  __asm__ ("fcvtzu %d0,%d1"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
+#define vcvtq_n_f32_u32(a, b)                                           \
+  __extension__                                                         \
+    ({                                                                  \
+       uint32x4_t a_ = (a);                                             \
+       float32x4_t result;                                              \
+       __asm__ ("ucvtf %0.4s, %1.4s, #%2"                               \
+                : "=w"(result)                                          \
+                : "w"(a_), "i"(b)                                       \
+                : /* No clobbers */);                                   \
+       result;                                                          \
+     })
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vcvtm_s32_f32 (float32x2_t a)
-{
-  int32x2_t result;
-  __asm__ ("fcvtms %0.2s, %1.2s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
+#define vcvtq_n_f64_s64(a, b)                                           \
+  __extension__                                                         \
+    ({                                                                  \
+       int64x2_t a_ = (a);                                              \
+       float64x2_t result;                                              \
+       __asm__ ("scvtf %0.2d, %1.2d, #%2"                               \
+                : "=w"(result)                                          \
+                : "w"(a_), "i"(b)                                       \
+                : /* No clobbers */);                                   \
+       result;                                                          \
+     })
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcvtm_u32_f32 (float32x2_t a)
-{
-  uint32x2_t result;
-  __asm__ ("fcvtmu %0.2s, %1.2s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vcvtmd_s64_f64 (float64_t a)
-{
-  float64_t result;
-  __asm__ ("fcvtms %d0,%d1"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vcvtmd_u64_f64 (float64_t a)
-{
-  float64_t result;
-  __asm__ ("fcvtmu %d0,%d1"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vcvtmq_s32_f32 (float32x4_t a)
-{
-  int32x4_t result;
-  __asm__ ("fcvtms %0.4s, %1.4s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vcvtmq_s64_f64 (float64x2_t a)
-{
-  int64x2_t result;
-  __asm__ ("fcvtms %0.2d, %1.2d"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcvtmq_u32_f32 (float32x4_t a)
-{
-  uint32x4_t result;
-  __asm__ ("fcvtmu %0.4s, %1.4s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcvtmq_u64_f64 (float64x2_t a)
-{
-  uint64x2_t result;
-  __asm__ ("fcvtmu %0.2d, %1.2d"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
+#define vcvtq_n_f64_u64(a, b)                                           \
+  __extension__                                                         \
+    ({                                                                  \
+       uint64x2_t a_ = (a);                                             \
+       float64x2_t result;                                              \
+       __asm__ ("ucvtf %0.2d, %1.2d, #%2"                               \
+                : "=w"(result)                                          \
+                : "w"(a_), "i"(b)                                       \
+                : /* No clobbers */);                                   \
+       result;                                                          \
+     })
 
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vcvtms_s64_f64 (float32_t a)
-{
-  float32_t result;
-  __asm__ ("fcvtms %s0,%s1"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
+#define vcvtq_n_s32_f32(a, b)                                           \
+  __extension__                                                         \
+    ({                                                                  \
+       float32x4_t a_ = (a);                                            \
+       int32x4_t result;                                                \
+       __asm__ ("fcvtzs %0.4s, %1.4s, #%2"                              \
+                : "=w"(result)                                          \
+                : "w"(a_), "i"(b)                                       \
+                : /* No clobbers */);                                   \
+       result;                                                          \
+     })
 
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vcvtms_u64_f64 (float32_t a)
-{
-  float32_t result;
-  __asm__ ("fcvtmu %s0,%s1"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
+#define vcvtq_n_s64_f64(a, b)                                           \
+  __extension__                                                         \
+    ({                                                                  \
+       float64x2_t a_ = (a);                                            \
+       int64x2_t result;                                                \
+       __asm__ ("fcvtzs %0.2d, %1.2d, #%2"                              \
+                : "=w"(result)                                          \
+                : "w"(a_), "i"(b)                                       \
+                : /* No clobbers */);                                   \
+       result;                                                          \
+     })
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vcvtn_s32_f32 (float32x2_t a)
-{
-  int32x2_t result;
-  __asm__ ("fcvtns %0.2s, %1.2s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
+#define vcvtq_n_u32_f32(a, b)                                           \
+  __extension__                                                         \
+    ({                                                                  \
+       float32x4_t a_ = (a);                                            \
+       uint32x4_t result;                                               \
+       __asm__ ("fcvtzu %0.4s, %1.4s, #%2"                              \
+                : "=w"(result)                                          \
+                : "w"(a_), "i"(b)                                       \
+                : /* No clobbers */);                                   \
+       result;                                                          \
+     })
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcvtn_u32_f32 (float32x2_t a)
-{
-  uint32x2_t result;
-  __asm__ ("fcvtnu %0.2s, %1.2s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
+#define vcvtq_n_u64_f64(a, b)                                           \
+  __extension__                                                         \
+    ({                                                                  \
+       float64x2_t a_ = (a);                                            \
+       uint64x2_t result;                                               \
+       __asm__ ("fcvtzu %0.2d, %1.2d, #%2"                              \
+                : "=w"(result)                                          \
+                : "w"(a_), "i"(b)                                       \
+                : /* No clobbers */);                                   \
+       result;                                                          \
+     })
 
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vcvtnd_s64_f64 (float64_t a)
-{
-  float64_t result;
-  __asm__ ("fcvtns %d0,%d1"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
+#define vcvts_n_f32_s32(a, b)                                           \
+  __extension__                                                         \
+    ({                                                                  \
+       int32_t a_ = (a);                                                \
+       int32_t result;                                                  \
+       __asm__ ("scvtf %s0,%s1,%2"                                      \
+                : "=w"(result)                                          \
+                : "w"(a_), "i"(b)                                       \
+                : /* No clobbers */);                                   \
+       result;                                                          \
+     })
 
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vcvtnd_u64_f64 (float64_t a)
-{
-  float64_t result;
-  __asm__ ("fcvtnu %d0,%d1"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
+#define vcvts_n_f32_u32(a, b)                                           \
+  __extension__                                                         \
+    ({                                                                  \
+       uint32_t a_ = (a);                                               \
+       uint32_t result;                                                 \
+       __asm__ ("ucvtf %s0,%s1,%2"                                      \
+                : "=w"(result)                                          \
+                : "w"(a_), "i"(b)                                       \
+                : /* No clobbers */);                                   \
+       result;                                                          \
+     })
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vcvtnq_s32_f32 (float32x4_t a)
-{
-  int32x4_t result;
-  __asm__ ("fcvtns %0.4s, %1.4s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
+#define vcvts_n_s32_f32(a, b)                                           \
+  __extension__                                                         \
+    ({                                                                  \
+       float32_t a_ = (a);                                              \
+       float32_t result;                                                \
+       __asm__ ("fcvtzs %s0,%s1,%2"                                     \
+                : "=w"(result)                                          \
+                : "w"(a_), "i"(b)                                       \
+                : /* No clobbers */);                                   \
+       result;                                                          \
+     })
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vcvtnq_s64_f64 (float64x2_t a)
-{
-  int64x2_t result;
-  __asm__ ("fcvtns %0.2d, %1.2d"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
+#define vcvts_n_u32_f32(a, b)                                           \
+  __extension__                                                         \
+    ({                                                                  \
+       float32_t a_ = (a);                                              \
+       float32_t result;                                                \
+       __asm__ ("fcvtzu %s0,%s1,%2"                                     \
+                : "=w"(result)                                          \
+                : "w"(a_), "i"(b)                                       \
+                : /* No clobbers */);                                   \
+       result;                                                          \
+     })
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcvtnq_u32_f32 (float32x4_t a)
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vcvtx_f32_f64 (float64x2_t a)
 {
-  uint32x4_t result;
-  __asm__ ("fcvtnu %0.4s, %1.4s"
+  float32x2_t result;
+  __asm__ ("fcvtxn %0.2s,%1.2d"
            : "=w"(result)
            : "w"(a)
            : /* No clobbers */);
   return result;
 }
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcvtnq_u64_f64 (float64x2_t a)
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vcvtx_high_f32_f64 (float64x2_t a)
 {
-  uint64x2_t result;
-  __asm__ ("fcvtnu %0.2d, %1.2d"
+  float32x4_t result;
+  __asm__ ("fcvtxn2 %0.4s,%1.2d"
            : "=w"(result)
            : "w"(a)
            : /* No clobbers */);
@@ -6447,1406 +6151,976 @@ vcvtnq_u64_f64 (float64x2_t a)
 }
 
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vcvtns_s64_f64 (float32_t a)
+vcvtxd_f32_f64 (float64_t a)
 {
   float32_t result;
-  __asm__ ("fcvtns %s0,%s1"
+  __asm__ ("fcvtxn %s0,%d1"
            : "=w"(result)
            : "w"(a)
            : /* No clobbers */);
   return result;
 }
 
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vcvtns_u64_f64 (float32_t a)
-{
-  float32_t result;
-  __asm__ ("fcvtnu %s0,%s1"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
+#define vdup_lane_f32(a, b)                                             \
+  __extension__                                                         \
+    ({                                                                  \
+       float32x2_t a_ = (a);                                            \
+       float32x2_t result;                                              \
+       __asm__ ("dup %0.2s,%1.s[%2]"                                    \
+                : "=w"(result)                                          \
+                : "w"(a_), "i"(b)                                       \
+                : /* No clobbers */);                                   \
+       result;                                                          \
+     })
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vcvtp_s32_f32 (float32x2_t a)
-{
-  int32x2_t result;
-  __asm__ ("fcvtps %0.2s, %1.2s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcvtp_u32_f32 (float32x2_t a)
-{
-  uint32x2_t result;
-  __asm__ ("fcvtpu %0.2s, %1.2s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vcvtpd_s64_f64 (float64_t a)
-{
-  float64_t result;
-  __asm__ ("fcvtps %d0,%d1"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vcvtpd_u64_f64 (float64_t a)
-{
-  float64_t result;
-  __asm__ ("fcvtpu %d0,%d1"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vcvtpq_s32_f32 (float32x4_t a)
-{
-  int32x4_t result;
-  __asm__ ("fcvtps %0.4s, %1.4s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vcvtpq_s64_f64 (float64x2_t a)
-{
-  int64x2_t result;
-  __asm__ ("fcvtps %0.2d, %1.2d"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcvtpq_u32_f32 (float32x4_t a)
-{
-  uint32x4_t result;
-  __asm__ ("fcvtpu %0.4s, %1.4s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcvtpq_u64_f64 (float64x2_t a)
-{
-  uint64x2_t result;
-  __asm__ ("fcvtpu %0.2d, %1.2d"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vcvtps_s64_f64 (float32_t a)
-{
-  float32_t result;
-  __asm__ ("fcvtps %s0,%s1"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vcvtps_u64_f64 (float32_t a)
-{
-  float32_t result;
-  __asm__ ("fcvtpu %s0,%s1"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vcvtq_f32_s32 (int32x4_t a)
-{
-  float32x4_t result;
-  __asm__ ("scvtf %0.4s, %1.4s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vcvtq_f32_u32 (uint32x4_t a)
-{
-  float32x4_t result;
-  __asm__ ("ucvtf %0.4s, %1.4s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vcvtq_f64_s64 (int64x2_t a)
-{
-  float64x2_t result;
-  __asm__ ("scvtf %0.2d, %1.2d"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
+#define vdup_lane_p8(a, b)                                              \
+  __extension__                                                         \
+    ({                                                                  \
+       poly8x8_t a_ = (a);                                              \
+       poly8x8_t result;                                                \
+       __asm__ ("dup %0.8b,%1.b[%2]"                                    \
+                : "=w"(result)                                          \
+                : "w"(a_), "i"(b)                                       \
+                : /* No clobbers */);                                   \
+       result;                                                          \
+     })
 
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vcvtq_f64_u64 (uint64x2_t a)
-{
-  float64x2_t result;
-  __asm__ ("ucvtf %0.2d, %1.2d"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
+#define vdup_lane_p16(a, b)                                             \
+  __extension__                                                         \
+    ({                                                                  \
+       poly16x4_t a_ = (a);                                             \
+       poly16x4_t result;                                               \
+       __asm__ ("dup %0.4h,%1.h[%2]"                                    \
+                : "=w"(result)                                          \
+                : "w"(a_), "i"(b)                                       \
+                : /* No clobbers */);                                   \
+       result;                                                          \
+     })
 
-#define vcvtq_n_f32_s32(a, b)                                           \
+#define vdup_lane_s8(a, b)                                              \
   __extension__                                                         \
     ({                                                                  \
-       int32x4_t a_ = (a);                                              \
-       float32x4_t result;                                              \
-       __asm__ ("scvtf %0.4s, %1.4s, #%2"                               \
+       int8x8_t a_ = (a);                                               \
+       int8x8_t result;                                                 \
+       __asm__ ("dup %0.8b,%1.b[%2]"                                    \
                 : "=w"(result)                                          \
                 : "w"(a_), "i"(b)                                       \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vcvtq_n_f32_u32(a, b)                                           \
+#define vdup_lane_s16(a, b)                                             \
   __extension__                                                         \
     ({                                                                  \
-       uint32x4_t a_ = (a);                                             \
-       float32x4_t result;                                              \
-       __asm__ ("ucvtf %0.4s, %1.4s, #%2"                               \
+       int16x4_t a_ = (a);                                              \
+       int16x4_t result;                                                \
+       __asm__ ("dup %0.4h,%1.h[%2]"                                    \
                 : "=w"(result)                                          \
                 : "w"(a_), "i"(b)                                       \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vcvtq_n_f64_s64(a, b)                                           \
+#define vdup_lane_s32(a, b)                                             \
   __extension__                                                         \
     ({                                                                  \
-       int64x2_t a_ = (a);                                              \
-       float64x2_t result;                                              \
-       __asm__ ("scvtf %0.2d, %1.2d, #%2"                               \
+       int32x2_t a_ = (a);                                              \
+       int32x2_t result;                                                \
+       __asm__ ("dup %0.2s,%1.s[%2]"                                    \
                 : "=w"(result)                                          \
                 : "w"(a_), "i"(b)                                       \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vcvtq_n_f64_u64(a, b)                                           \
+#define vdup_lane_s64(a, b)                                             \
   __extension__                                                         \
     ({                                                                  \
-       uint64x2_t a_ = (a);                                             \
-       float64x2_t result;                                              \
-       __asm__ ("ucvtf %0.2d, %1.2d, #%2"                               \
+       int64x1_t a_ = (a);                                              \
+       int64x1_t result;                                                \
+       __asm__ ("ins %0.d[0],%1.d[%2]"                                  \
                 : "=w"(result)                                          \
                 : "w"(a_), "i"(b)                                       \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vcvtq_n_s32_f32(a, b)                                           \
+#define vdup_lane_u8(a, b)                                              \
   __extension__                                                         \
     ({                                                                  \
-       float32x4_t a_ = (a);                                            \
-       int32x4_t result;                                                \
-       __asm__ ("fcvtzs %0.4s, %1.4s, #%2"                              \
+       uint8x8_t a_ = (a);                                              \
+       uint8x8_t result;                                                \
+       __asm__ ("dup %0.8b,%1.b[%2]"                                    \
                 : "=w"(result)                                          \
                 : "w"(a_), "i"(b)                                       \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vcvtq_n_s64_f64(a, b)                                           \
+#define vdup_lane_u16(a, b)                                             \
   __extension__                                                         \
     ({                                                                  \
-       float64x2_t a_ = (a);                                            \
-       int64x2_t result;                                                \
-       __asm__ ("fcvtzs %0.2d, %1.2d, #%2"                              \
+       uint16x4_t a_ = (a);                                             \
+       uint16x4_t result;                                               \
+       __asm__ ("dup %0.4h,%1.h[%2]"                                    \
                 : "=w"(result)                                          \
                 : "w"(a_), "i"(b)                                       \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vcvtq_n_u32_f32(a, b)                                           \
+#define vdup_lane_u32(a, b)                                             \
   __extension__                                                         \
     ({                                                                  \
-       float32x4_t a_ = (a);                                            \
-       uint32x4_t result;                                               \
-       __asm__ ("fcvtzu %0.4s, %1.4s, #%2"                              \
+       uint32x2_t a_ = (a);                                             \
+       uint32x2_t result;                                               \
+       __asm__ ("dup %0.2s,%1.s[%2]"                                    \
                 : "=w"(result)                                          \
                 : "w"(a_), "i"(b)                                       \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vcvtq_n_u64_f64(a, b)                                           \
+#define vdup_lane_u64(a, b)                                             \
   __extension__                                                         \
     ({                                                                  \
-       float64x2_t a_ = (a);                                            \
-       uint64x2_t result;                                               \
-       __asm__ ("fcvtzu %0.2d, %1.2d, #%2"                              \
+       uint64x1_t a_ = (a);                                             \
+       uint64x1_t result;                                               \
+       __asm__ ("ins %0.d[0],%1.d[%2]"                                  \
                 : "=w"(result)                                          \
                 : "w"(a_), "i"(b)                                       \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vcvtq_s32_f32 (float32x4_t a)
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vdup_n_f32 (float32_t a)
 {
-  int32x4_t result;
-  __asm__ ("fcvtzs %0.4s, %1.4s"
+  float32x2_t result;
+  __asm__ ("dup %0.2s, %w1"
            : "=w"(result)
-           : "w"(a)
+           : "r"(a)
            : /* No clobbers */);
   return result;
 }
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vcvtq_s64_f64 (float64x2_t a)
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vdup_n_p8 (uint32_t a)
 {
-  int64x2_t result;
-  __asm__ ("fcvtzs %0.2d, %1.2d"
+  poly8x8_t result;
+  __asm__ ("dup %0.8b,%w1"
            : "=w"(result)
-           : "w"(a)
+           : "r"(a)
            : /* No clobbers */);
   return result;
 }
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcvtq_u32_f32 (float32x4_t a)
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vdup_n_p16 (uint32_t a)
 {
-  uint32x4_t result;
-  __asm__ ("fcvtzu %0.4s, %1.4s"
+  poly16x4_t result;
+  __asm__ ("dup %0.4h,%w1"
            : "=w"(result)
-           : "w"(a)
+           : "r"(a)
            : /* No clobbers */);
   return result;
 }
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcvtq_u64_f64 (float64x2_t a)
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vdup_n_s8 (int32_t a)
 {
-  uint64x2_t result;
-  __asm__ ("fcvtzu %0.2d, %1.2d"
+  int8x8_t result;
+  __asm__ ("dup %0.8b,%w1"
            : "=w"(result)
-           : "w"(a)
+           : "r"(a)
            : /* No clobbers */);
   return result;
 }
 
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vcvts_f64_s32 (int32_t a)
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vdup_n_s16 (int32_t a)
 {
-  int32_t result;
-  __asm__ ("scvtf %s0,%s1"
+  int16x4_t result;
+  __asm__ ("dup %0.4h,%w1"
            : "=w"(result)
-           : "w"(a)
+           : "r"(a)
            : /* No clobbers */);
   return result;
 }
 
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vcvts_f64_u32 (uint32_t a)
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vdup_n_s32 (int32_t a)
 {
-  uint32_t result;
-  __asm__ ("ucvtf %s0,%s1"
+  int32x2_t result;
+  __asm__ ("dup %0.2s,%w1"
            : "=w"(result)
-           : "w"(a)
+           : "r"(a)
            : /* No clobbers */);
   return result;
 }
 
-#define vcvts_n_f32_s32(a, b)                                           \
-  __extension__                                                         \
-    ({                                                                  \
-       int32_t a_ = (a);                                                \
-       int32_t result;                                                  \
-       __asm__ ("scvtf %s0,%s1,%2"                                      \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcvts_n_f32_u32(a, b)                                           \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32_t a_ = (a);                                               \
-       uint32_t result;                                                 \
-       __asm__ ("ucvtf %s0,%s1,%2"                                      \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcvts_n_s32_f32(a, b)                                           \
-  __extension__                                                         \
-    ({                                                                  \
-       float32_t a_ = (a);                                              \
-       float32_t result;                                                \
-       __asm__ ("fcvtzs %s0,%s1,%2"                                     \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcvts_n_u32_f32(a, b)                                           \
-  __extension__                                                         \
-    ({                                                                  \
-       float32_t a_ = (a);                                              \
-       float32_t result;                                                \
-       __asm__ ("fcvtzu %s0,%s1,%2"                                     \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vcvts_s64_f64 (float32_t a)
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vdup_n_s64 (int64_t a)
 {
-  float32_t result;
-  __asm__ ("fcvtzs %s0,%s1"
+  int64x1_t result;
+  __asm__ ("ins %0.d[0],%x1"
            : "=w"(result)
-           : "w"(a)
+           : "r"(a)
            : /* No clobbers */);
   return result;
 }
 
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vcvts_u64_f64 (float32_t a)
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vdup_n_u8 (uint32_t a)
 {
-  float32_t result;
-  __asm__ ("fcvtzu %s0,%s1"
+  uint8x8_t result;
+  __asm__ ("dup %0.8b,%w1"
            : "=w"(result)
-           : "w"(a)
+           : "r"(a)
            : /* No clobbers */);
   return result;
 }
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vcvtx_f32_f64 (float64x2_t a)
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vdup_n_u16 (uint32_t a)
 {
-  float32x2_t result;
-  __asm__ ("fcvtxn %0.2s,%1.2d"
+  uint16x4_t result;
+  __asm__ ("dup %0.4h,%w1"
            : "=w"(result)
-           : "w"(a)
+           : "r"(a)
            : /* No clobbers */);
   return result;
 }
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vcvtx_high_f32_f64 (float64x2_t a)
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vdup_n_u32 (uint32_t a)
 {
-  float32x4_t result;
-  __asm__ ("fcvtxn2 %0.4s,%1.2d"
+  uint32x2_t result;
+  __asm__ ("dup %0.2s,%w1"
            : "=w"(result)
-           : "w"(a)
+           : "r"(a)
            : /* No clobbers */);
   return result;
 }
 
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vcvtxd_f32_f64 (float64_t a)
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vdup_n_u64 (uint64_t a)
 {
-  float32_t result;
-  __asm__ ("fcvtxn %s0,%d1"
+  uint64x1_t result;
+  __asm__ ("ins %0.d[0],%x1"
            : "=w"(result)
-           : "w"(a)
+           : "r"(a)
            : /* No clobbers */);
   return result;
 }
 
-#define vdup_lane_f32(a, b)                                             \
+#define vdupd_lane_f64(a, b)                                            \
+  __extension__                                                         \
+    ({                                                                  \
+       float64x2_t a_ = (a);                                            \
+       float64_t result;                                                \
+       __asm__ ("dup %d0, %1.d[%2]"                                     \
+                : "=w"(result)                                          \
+                : "w"(a_), "i"(b)                                       \
+                : /* No clobbers */);                                   \
+       result;                                                          \
+     })
+
+#define vdupq_lane_f32(a, b)                                            \
   __extension__                                                         \
     ({                                                                  \
        float32x2_t a_ = (a);                                            \
-       float32x2_t result;                                              \
-       __asm__ ("dup %0.2s,%1.s[%2]"                                    \
+       float32x4_t result;                                              \
+       __asm__ ("dup %0.4s,%1.s[%2]"                                    \
                 : "=w"(result)                                          \
                 : "w"(a_), "i"(b)                                       \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vdup_lane_p8(a, b)                                              \
+#define vdupq_lane_f64(a, b)                                            \
+  __extension__                                                         \
+    ({                                                                  \
+       float64x1_t a_ = (a);                                            \
+       float64x2_t result;                                              \
+       __asm__ ("dup %0.2d,%1.d[%2]"                                    \
+                : "=w"(result)                                          \
+                : "w"(a_), "i"(b)                                       \
+                : /* No clobbers */);                                   \
+       result;                                                          \
+     })
+
+#define vdupq_lane_p8(a, b)                                             \
   __extension__                                                         \
     ({                                                                  \
        poly8x8_t a_ = (a);                                              \
-       poly8x8_t result;                                                \
-       __asm__ ("dup %0.8b,%1.b[%2]"                                    \
+       poly8x16_t result;                                               \
+       __asm__ ("dup %0.16b,%1.b[%2]"                                   \
                 : "=w"(result)                                          \
                 : "w"(a_), "i"(b)                                       \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vdup_lane_p16(a, b)                                             \
+#define vdupq_lane_p16(a, b)                                            \
   __extension__                                                         \
     ({                                                                  \
        poly16x4_t a_ = (a);                                             \
-       poly16x4_t result;                                               \
-       __asm__ ("dup %0.4h,%1.h[%2]"                                    \
+       poly16x8_t result;                                               \
+       __asm__ ("dup %0.8h,%1.h[%2]"                                    \
                 : "=w"(result)                                          \
                 : "w"(a_), "i"(b)                                       \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vdup_lane_s8(a, b)                                              \
+#define vdupq_lane_s8(a, b)                                             \
   __extension__                                                         \
     ({                                                                  \
        int8x8_t a_ = (a);                                               \
-       int8x8_t result;                                                 \
-       __asm__ ("dup %0.8b,%1.b[%2]"                                    \
+       int8x16_t result;                                                \
+       __asm__ ("dup %0.16b,%1.b[%2]"                                   \
                 : "=w"(result)                                          \
                 : "w"(a_), "i"(b)                                       \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vdup_lane_s16(a, b)                                             \
+#define vdupq_lane_s16(a, b)                                            \
   __extension__                                                         \
     ({                                                                  \
        int16x4_t a_ = (a);                                              \
-       int16x4_t result;                                                \
-       __asm__ ("dup %0.4h,%1.h[%2]"                                    \
+       int16x8_t result;                                                \
+       __asm__ ("dup %0.8h,%1.h[%2]"                                    \
                 : "=w"(result)                                          \
                 : "w"(a_), "i"(b)                                       \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vdup_lane_s32(a, b)                                             \
+#define vdupq_lane_s32(a, b)                                            \
   __extension__                                                         \
     ({                                                                  \
        int32x2_t a_ = (a);                                              \
-       int32x2_t result;                                                \
-       __asm__ ("dup %0.2s,%1.s[%2]"                                    \
+       int32x4_t result;                                                \
+       __asm__ ("dup %0.4s,%1.s[%2]"                                    \
                 : "=w"(result)                                          \
                 : "w"(a_), "i"(b)                                       \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vdup_lane_s64(a, b)                                             \
+#define vdupq_lane_s64(a, b)                                            \
   __extension__                                                         \
     ({                                                                  \
        int64x1_t a_ = (a);                                              \
-       int64x1_t result;                                                \
-       __asm__ ("ins %0.d[0],%1.d[%2]"                                  \
+       int64x2_t result;                                                \
+       __asm__ ("dup %0.2d,%1.d[%2]"                                    \
                 : "=w"(result)                                          \
                 : "w"(a_), "i"(b)                                       \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vdup_lane_u8(a, b)                                              \
+#define vdupq_lane_u8(a, b)                                             \
   __extension__                                                         \
     ({                                                                  \
        uint8x8_t a_ = (a);                                              \
-       uint8x8_t result;                                                \
-       __asm__ ("dup %0.8b,%1.b[%2]"                                    \
+       uint8x16_t result;                                               \
+       __asm__ ("dup %0.16b,%1.b[%2]"                                   \
                 : "=w"(result)                                          \
                 : "w"(a_), "i"(b)                                       \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vdup_lane_u16(a, b)                                             \
+#define vdupq_lane_u16(a, b)                                            \
   __extension__                                                         \
     ({                                                                  \
        uint16x4_t a_ = (a);                                             \
-       uint16x4_t result;                                               \
-       __asm__ ("dup %0.4h,%1.h[%2]"                                    \
+       uint16x8_t result;                                               \
+       __asm__ ("dup %0.8h,%1.h[%2]"                                    \
                 : "=w"(result)                                          \
                 : "w"(a_), "i"(b)                                       \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vdup_lane_u32(a, b)                                             \
+#define vdupq_lane_u32(a, b)                                            \
   __extension__                                                         \
     ({                                                                  \
        uint32x2_t a_ = (a);                                             \
-       uint32x2_t result;                                               \
-       __asm__ ("dup %0.2s,%1.s[%2]"                                    \
+       uint32x4_t result;                                               \
+       __asm__ ("dup %0.4s,%1.s[%2]"                                    \
                 : "=w"(result)                                          \
                 : "w"(a_), "i"(b)                                       \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vdup_lane_u64(a, b)                                             \
+#define vdupq_lane_u64(a, b)                                            \
   __extension__                                                         \
     ({                                                                  \
        uint64x1_t a_ = (a);                                             \
-       uint64x1_t result;                                               \
-       __asm__ ("ins %0.d[0],%1.d[%2]"                                  \
+       uint64x2_t result;                                               \
+       __asm__ ("dup %0.2d,%1.d[%2]"                                    \
                 : "=w"(result)                                          \
                 : "w"(a_), "i"(b)                                       \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vdup_n_f32 (float32_t a)
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vdupq_n_f32 (float32_t a)
 {
-  float32x2_t result;
-  __asm__ ("dup %0.2s, %w1"
+  float32x4_t result;
+  __asm__ ("dup %0.4s, %w1"
            : "=w"(result)
            : "r"(a)
            : /* No clobbers */);
   return result;
 }
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vdup_n_p8 (uint32_t a)
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vdupq_n_f64 (float64_t a)
 {
-  poly8x8_t result;
-  __asm__ ("dup %0.8b,%w1"
+  float64x2_t result;
+  __asm__ ("dup %0.2d, %x1"
            : "=w"(result)
            : "r"(a)
            : /* No clobbers */);
   return result;
 }
 
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vdup_n_p16 (uint32_t a)
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vdupq_n_p8 (uint32_t a)
 {
-  poly16x4_t result;
-  __asm__ ("dup %0.4h,%w1"
+  poly8x16_t result;
+  __asm__ ("dup %0.16b,%w1"
            : "=w"(result)
            : "r"(a)
            : /* No clobbers */);
   return result;
 }
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vdup_n_s8 (int32_t a)
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vdupq_n_p16 (uint32_t a)
 {
-  int8x8_t result;
-  __asm__ ("dup %0.8b,%w1"
+  poly16x8_t result;
+  __asm__ ("dup %0.8h,%w1"
            : "=w"(result)
            : "r"(a)
            : /* No clobbers */);
   return result;
 }
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vdup_n_s16 (int32_t a)
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vdupq_n_s8 (int32_t a)
 {
-  int16x4_t result;
-  __asm__ ("dup %0.4h,%w1"
+  int8x16_t result;
+  __asm__ ("dup %0.16b,%w1"
            : "=w"(result)
            : "r"(a)
            : /* No clobbers */);
   return result;
 }
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vdup_n_s32 (int32_t a)
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vdupq_n_s16 (int32_t a)
 {
-  int32x2_t result;
-  __asm__ ("dup %0.2s,%w1"
+  int16x8_t result;
+  __asm__ ("dup %0.8h,%w1"
            : "=w"(result)
            : "r"(a)
            : /* No clobbers */);
   return result;
 }
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vdup_n_s64 (int64_t a)
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vdupq_n_s32 (int32_t a)
 {
-  int64x1_t result;
-  __asm__ ("ins %0.d[0],%x1"
+  int32x4_t result;
+  __asm__ ("dup %0.4s,%w1"
            : "=w"(result)
            : "r"(a)
            : /* No clobbers */);
   return result;
 }
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vdup_n_u8 (uint32_t a)
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vdupq_n_s64 (int64_t a)
 {
-  uint8x8_t result;
-  __asm__ ("dup %0.8b,%w1"
+  int64x2_t result;
+  __asm__ ("dup %0.2d,%x1"
            : "=w"(result)
            : "r"(a)
            : /* No clobbers */);
   return result;
 }
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vdup_n_u16 (uint32_t a)
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vdupq_n_u8 (uint32_t a)
 {
-  uint16x4_t result;
-  __asm__ ("dup %0.4h,%w1"
+  uint8x16_t result;
+  __asm__ ("dup %0.16b,%w1"
            : "=w"(result)
            : "r"(a)
            : /* No clobbers */);
   return result;
 }
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vdup_n_u32 (uint32_t a)
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vdupq_n_u16 (uint32_t a)
 {
-  uint32x2_t result;
-  __asm__ ("dup %0.2s,%w1"
+  uint16x8_t result;
+  __asm__ ("dup %0.8h,%w1"
            : "=w"(result)
            : "r"(a)
            : /* No clobbers */);
   return result;
 }
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vdup_n_u64 (uint64_t a)
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vdupq_n_u32 (uint32_t a)
 {
-  uint64x1_t result;
-  __asm__ ("ins %0.d[0],%x1"
+  uint32x4_t result;
+  __asm__ ("dup %0.4s,%w1"
            : "=w"(result)
            : "r"(a)
            : /* No clobbers */);
   return result;
 }
 
-#define vdupd_lane_f64(a, b)                                            \
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vdupq_n_u64 (uint64_t a)
+{
+  uint64x2_t result;
+  __asm__ ("dup %0.2d,%x1"
+           : "=w"(result)
+           : "r"(a)
+           : /* No clobbers */);
+  return result;
+}
+
+#define vdups_lane_f32(a, b)                                            \
   __extension__                                                         \
     ({                                                                  \
-       float64x2_t a_ = (a);                                            \
-       float64_t result;                                                \
-       __asm__ ("dup %d0, %1.d[%2]"                                     \
+       float32x4_t a_ = (a);                                            \
+       float32_t result;                                                \
+       __asm__ ("dup %s0, %1.s[%2]"                                     \
                 : "=w"(result)                                          \
                 : "w"(a_), "i"(b)                                       \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vdupq_lane_f32(a, b)                                            \
+#define vext_f32(a, b, c)                                               \
   __extension__                                                         \
     ({                                                                  \
+       float32x2_t b_ = (b);                                            \
        float32x2_t a_ = (a);                                            \
-       float32x4_t result;                                              \
-       __asm__ ("dup %0.4s,%1.s[%2]"                                    \
+       float32x2_t result;                                              \
+       __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*4"                        \
                 : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
+                : "w"(a_), "w"(b_), "i"(c)                              \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vdupq_lane_f64(a, b)                                            \
+#define vext_f64(a, b, c)                                               \
   __extension__                                                         \
     ({                                                                  \
+       float64x1_t b_ = (b);                                            \
        float64x1_t a_ = (a);                                            \
-       float64x2_t result;                                              \
-       __asm__ ("dup %0.2d,%1.d[%2]"                                    \
+       float64x1_t result;                                              \
+       __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*8"                        \
                 : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
+                : "w"(a_), "w"(b_), "i"(c)                              \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vdupq_lane_p8(a, b)                                             \
+#define vext_p8(a, b, c)                                                \
   __extension__                                                         \
     ({                                                                  \
+       poly8x8_t b_ = (b);                                              \
        poly8x8_t a_ = (a);                                              \
-       poly8x16_t result;                                               \
-       __asm__ ("dup %0.16b,%1.b[%2]"                                   \
+       poly8x8_t result;                                                \
+       __asm__ ("ext %0.8b,%1.8b,%2.8b,%3"                              \
                 : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
+                : "w"(a_), "w"(b_), "i"(c)                              \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vdupq_lane_p16(a, b)                                            \
+#define vext_p16(a, b, c)                                               \
   __extension__                                                         \
     ({                                                                  \
+       poly16x4_t b_ = (b);                                             \
        poly16x4_t a_ = (a);                                             \
-       poly16x8_t result;                                               \
-       __asm__ ("dup %0.8h,%1.h[%2]"                                    \
+       poly16x4_t result;                                               \
+       __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*2"                        \
                 : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
+                : "w"(a_), "w"(b_), "i"(c)                              \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vdupq_lane_s8(a, b)                                             \
+#define vext_s8(a, b, c)                                                \
   __extension__                                                         \
     ({                                                                  \
+       int8x8_t b_ = (b);                                               \
        int8x8_t a_ = (a);                                               \
-       int8x16_t result;                                                \
-       __asm__ ("dup %0.16b,%1.b[%2]"                                   \
+       int8x8_t result;                                                 \
+       __asm__ ("ext %0.8b,%1.8b,%2.8b,%3"                              \
                 : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
+                : "w"(a_), "w"(b_), "i"(c)                              \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vdupq_lane_s16(a, b)                                            \
+#define vext_s16(a, b, c)                                               \
   __extension__                                                         \
     ({                                                                  \
+       int16x4_t b_ = (b);                                              \
        int16x4_t a_ = (a);                                              \
-       int16x8_t result;                                                \
-       __asm__ ("dup %0.8h,%1.h[%2]"                                    \
+       int16x4_t result;                                                \
+       __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*2"                        \
                 : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
+                : "w"(a_), "w"(b_), "i"(c)                              \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vdupq_lane_s32(a, b)                                            \
+#define vext_s32(a, b, c)                                               \
   __extension__                                                         \
     ({                                                                  \
+       int32x2_t b_ = (b);                                              \
        int32x2_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("dup %0.4s,%1.s[%2]"                                    \
+       int32x2_t result;                                                \
+       __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*4"                        \
                 : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
+                : "w"(a_), "w"(b_), "i"(c)                              \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vdupq_lane_s64(a, b)                                            \
+#define vext_s64(a, b, c)                                               \
   __extension__                                                         \
     ({                                                                  \
+       int64x1_t b_ = (b);                                              \
        int64x1_t a_ = (a);                                              \
-       int64x2_t result;                                                \
-       __asm__ ("dup %0.2d,%1.d[%2]"                                    \
+       int64x1_t result;                                                \
+       __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*8"                        \
                 : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
+                : "w"(a_), "w"(b_), "i"(c)                              \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vdupq_lane_u8(a, b)                                             \
+#define vext_u8(a, b, c)                                                \
   __extension__                                                         \
     ({                                                                  \
+       uint8x8_t b_ = (b);                                              \
        uint8x8_t a_ = (a);                                              \
-       uint8x16_t result;                                               \
-       __asm__ ("dup %0.16b,%1.b[%2]"                                   \
+       uint8x8_t result;                                                \
+       __asm__ ("ext %0.8b,%1.8b,%2.8b,%3"                              \
                 : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
+                : "w"(a_), "w"(b_), "i"(c)                              \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vdupq_lane_u16(a, b)                                            \
+#define vext_u16(a, b, c)                                               \
   __extension__                                                         \
     ({                                                                  \
+       uint16x4_t b_ = (b);                                             \
        uint16x4_t a_ = (a);                                             \
-       uint16x8_t result;                                               \
-       __asm__ ("dup %0.8h,%1.h[%2]"                                    \
+       uint16x4_t result;                                               \
+       __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*2"                        \
                 : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
+                : "w"(a_), "w"(b_), "i"(c)                              \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vdupq_lane_u32(a, b)                                            \
+#define vext_u32(a, b, c)                                               \
   __extension__                                                         \
     ({                                                                  \
+       uint32x2_t b_ = (b);                                             \
        uint32x2_t a_ = (a);                                             \
-       uint32x4_t result;                                               \
-       __asm__ ("dup %0.4s,%1.s[%2]"                                    \
+       uint32x2_t result;                                               \
+       __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*4"                        \
                 : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
+                : "w"(a_), "w"(b_), "i"(c)                              \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vdupq_lane_u64(a, b)                                            \
+#define vext_u64(a, b, c)                                               \
   __extension__                                                         \
     ({                                                                  \
+       uint64x1_t b_ = (b);                                             \
        uint64x1_t a_ = (a);                                             \
-       uint64x2_t result;                                               \
-       __asm__ ("dup %0.2d,%1.d[%2]"                                    \
+       uint64x1_t result;                                               \
+       __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*8"                        \
                 : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
+                : "w"(a_), "w"(b_), "i"(c)                              \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vdupq_n_f32 (float32_t a)
-{
-  float32x4_t result;
-  __asm__ ("dup %0.4s, %w1"
-           : "=w"(result)
-           : "r"(a)
-           : /* No clobbers */);
-  return result;
-}
+#define vextq_f32(a, b, c)                                              \
+  __extension__                                                         \
+    ({                                                                  \
+       float32x4_t b_ = (b);                                            \
+       float32x4_t a_ = (a);                                            \
+       float32x4_t result;                                              \
+       __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*4"                     \
+                : "=w"(result)                                          \
+                : "w"(a_), "w"(b_), "i"(c)                              \
+                : /* No clobbers */);                                   \
+       result;                                                          \
+     })
 
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vdupq_n_f64 (float64_t a)
-{
-  float64x2_t result;
-  __asm__ ("dup %0.2d, %x1"
-           : "=w"(result)
-           : "r"(a)
-           : /* No clobbers */);
-  return result;
-}
+#define vextq_f64(a, b, c)                                              \
+  __extension__                                                         \
+    ({                                                                  \
+       float64x2_t b_ = (b);                                            \
+       float64x2_t a_ = (a);                                            \
+       float64x2_t result;                                              \
+       __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*8"                     \
+                : "=w"(result)                                          \
+                : "w"(a_), "w"(b_), "i"(c)                              \
+                : /* No clobbers */);                                   \
+       result;                                                          \
+     })
 
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vdupq_n_p8 (uint32_t a)
-{
-  poly8x16_t result;
-  __asm__ ("dup %0.16b,%w1"
-           : "=w"(result)
-           : "r"(a)
-           : /* No clobbers */);
-  return result;
-}
+#define vextq_p8(a, b, c)                                               \
+  __extension__                                                         \
+    ({                                                                  \
+       poly8x16_t b_ = (b);                                             \
+       poly8x16_t a_ = (a);                                             \
+       poly8x16_t result;                                               \
+       __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3"                       \
+                : "=w"(result)                                          \
+                : "w"(a_), "w"(b_), "i"(c)                              \
+                : /* No clobbers */);                                   \
+       result;                                                          \
+     })
 
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vdupq_n_p16 (uint32_t a)
-{
-  poly16x8_t result;
-  __asm__ ("dup %0.8h,%w1"
-           : "=w"(result)
-           : "r"(a)
-           : /* No clobbers */);
-  return result;
-}
+#define vextq_p16(a, b, c)                                              \
+  __extension__                                                         \
+    ({                                                                  \
+       poly16x8_t b_ = (b);                                             \
+       poly16x8_t a_ = (a);                                             \
+       poly16x8_t result;                                               \
+       __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*2"                     \
+                : "=w"(result)                                          \
+                : "w"(a_), "w"(b_), "i"(c)                              \
+                : /* No clobbers */);                                   \
+       result;                                                          \
+     })
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vdupq_n_s8 (int32_t a)
-{
-  int8x16_t result;
-  __asm__ ("dup %0.16b,%w1"
-           : "=w"(result)
-           : "r"(a)
-           : /* No clobbers */);
-  return result;
-}
+#define vextq_s8(a, b, c)                                               \
+  __extension__                                                         \
+    ({                                                                  \
+       int8x16_t b_ = (b);                                              \
+       int8x16_t a_ = (a);                                              \
+       int8x16_t result;                                                \
+       __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3"                       \
+                : "=w"(result)                                          \
+                : "w"(a_), "w"(b_), "i"(c)                              \
+                : /* No clobbers */);                                   \
+       result;                                                          \
+     })
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vdupq_n_s16 (int32_t a)
-{
-  int16x8_t result;
-  __asm__ ("dup %0.8h,%w1"
-           : "=w"(result)
-           : "r"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vdupq_n_s32 (int32_t a)
-{
-  int32x4_t result;
-  __asm__ ("dup %0.4s,%w1"
-           : "=w"(result)
-           : "r"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vdupq_n_s64 (int64_t a)
-{
-  int64x2_t result;
-  __asm__ ("dup %0.2d,%x1"
-           : "=w"(result)
-           : "r"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vdupq_n_u8 (uint32_t a)
-{
-  uint8x16_t result;
-  __asm__ ("dup %0.16b,%w1"
-           : "=w"(result)
-           : "r"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vdupq_n_u16 (uint32_t a)
-{
-  uint16x8_t result;
-  __asm__ ("dup %0.8h,%w1"
-           : "=w"(result)
-           : "r"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vdupq_n_u32 (uint32_t a)
-{
-  uint32x4_t result;
-  __asm__ ("dup %0.4s,%w1"
-           : "=w"(result)
-           : "r"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vdupq_n_u64 (uint64_t a)
-{
-  uint64x2_t result;
-  __asm__ ("dup %0.2d,%x1"
-           : "=w"(result)
-           : "r"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-#define vdups_lane_f32(a, b)                                            \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x4_t a_ = (a);                                            \
-       float32_t result;                                                \
-       __asm__ ("dup %s0, %1.s[%2]"                                     \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vext_f32(a, b, c)                                               \
+#define vextq_s16(a, b, c)                                              \
   __extension__                                                         \
     ({                                                                  \
-       float32x2_t b_ = (b);                                            \
-       float32x2_t a_ = (a);                                            \
-       float32x2_t result;                                              \
-       __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*4"                        \
+       int16x8_t b_ = (b);                                              \
+       int16x8_t a_ = (a);                                              \
+       int16x8_t result;                                                \
+       __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*2"                     \
                 : "=w"(result)                                          \
                 : "w"(a_), "w"(b_), "i"(c)                              \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vext_f64(a, b, c)                                               \
+#define vextq_s32(a, b, c)                                              \
   __extension__                                                         \
     ({                                                                  \
-       float64x1_t b_ = (b);                                            \
-       float64x1_t a_ = (a);                                            \
-       float64x1_t result;                                              \
-       __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*8"                        \
+       int32x4_t b_ = (b);                                              \
+       int32x4_t a_ = (a);                                              \
+       int32x4_t result;                                                \
+       __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*4"                     \
                 : "=w"(result)                                          \
                 : "w"(a_), "w"(b_), "i"(c)                              \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vext_p8(a, b, c)                                                \
+#define vextq_s64(a, b, c)                                              \
   __extension__                                                         \
     ({                                                                  \
-       poly8x8_t b_ = (b);                                              \
-       poly8x8_t a_ = (a);                                              \
-       poly8x8_t result;                                                \
-       __asm__ ("ext %0.8b,%1.8b,%2.8b,%3"                              \
+       int64x2_t b_ = (b);                                              \
+       int64x2_t a_ = (a);                                              \
+       int64x2_t result;                                                \
+       __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*8"                     \
                 : "=w"(result)                                          \
                 : "w"(a_), "w"(b_), "i"(c)                              \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vext_p16(a, b, c)                                               \
+#define vextq_u8(a, b, c)                                               \
   __extension__                                                         \
     ({                                                                  \
-       poly16x4_t b_ = (b);                                             \
-       poly16x4_t a_ = (a);                                             \
-       poly16x4_t result;                                               \
-       __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*2"                        \
+       uint8x16_t b_ = (b);                                             \
+       uint8x16_t a_ = (a);                                             \
+       uint8x16_t result;                                               \
+       __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3"                       \
                 : "=w"(result)                                          \
                 : "w"(a_), "w"(b_), "i"(c)                              \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vext_s8(a, b, c)                                                \
+#define vextq_u16(a, b, c)                                              \
   __extension__                                                         \
     ({                                                                  \
-       int8x8_t b_ = (b);                                               \
-       int8x8_t a_ = (a);                                               \
-       int8x8_t result;                                                 \
-       __asm__ ("ext %0.8b,%1.8b,%2.8b,%3"                              \
+       uint16x8_t b_ = (b);                                             \
+       uint16x8_t a_ = (a);                                             \
+       uint16x8_t result;                                               \
+       __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*2"                     \
                 : "=w"(result)                                          \
                 : "w"(a_), "w"(b_), "i"(c)                              \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vext_s16(a, b, c)                                               \
+#define vextq_u32(a, b, c)                                              \
   __extension__                                                         \
     ({                                                                  \
-       int16x4_t b_ = (b);                                              \
-       int16x4_t a_ = (a);                                              \
-       int16x4_t result;                                                \
-       __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*2"                        \
+       uint32x4_t b_ = (b);                                             \
+       uint32x4_t a_ = (a);                                             \
+       uint32x4_t result;                                               \
+       __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*4"                     \
                 : "=w"(result)                                          \
                 : "w"(a_), "w"(b_), "i"(c)                              \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vext_s32(a, b, c)                                               \
+#define vextq_u64(a, b, c)                                              \
   __extension__                                                         \
     ({                                                                  \
-       int32x2_t b_ = (b);                                              \
-       int32x2_t a_ = (a);                                              \
-       int32x2_t result;                                                \
-       __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*4"                        \
+       uint64x2_t b_ = (b);                                             \
+       uint64x2_t a_ = (a);                                             \
+       uint64x2_t result;                                               \
+       __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*8"                     \
                 : "=w"(result)                                          \
                 : "w"(a_), "w"(b_), "i"(c)                              \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vext_s64(a, b, c)                                               \
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vfma_f32 (float32x2_t a, float32x2_t b, float32x2_t c)
+{
+  float32x2_t result;
+  __asm__ ("fmla %0.2s,%2.2s,%3.2s"
+           : "=w"(result)
+           : "0"(a), "w"(b), "w"(c)
+           : /* No clobbers */);
+  return result;
+}
+
+#define vfma_lane_f32(a, b, c, d)                                       \
   __extension__                                                         \
     ({                                                                  \
-       int64x1_t b_ = (b);                                              \
-       int64x1_t a_ = (a);                                              \
-       int64x1_t result;                                                \
-       __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*8"                        \
+       float32x2_t c_ = (c);                                            \
+       float32x2_t b_ = (b);                                            \
+       float32x2_t a_ = (a);                                            \
+       float32x2_t result;                                              \
+       __asm__ ("fmla %0.2s,%2.2s,%3.s[%4]"                             \
                 : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
+                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vext_u8(a, b, c)                                                \
+#define vfmad_lane_f64(a, b, c)                                         \
   __extension__                                                         \
     ({                                                                  \
-       uint8x8_t b_ = (b);                                              \
-       uint8x8_t a_ = (a);                                              \
-       uint8x8_t result;                                                \
-       __asm__ ("ext %0.8b,%1.8b,%2.8b,%3"                              \
+       float64x2_t b_ = (b);                                            \
+       float64_t a_ = (a);                                              \
+       float64_t result;                                                \
+       __asm__ ("fmla %d0,%d1,%2.d[%3]"                                 \
                 : "=w"(result)                                          \
                 : "w"(a_), "w"(b_), "i"(c)                              \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vext_u16(a, b, c)                                               \
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vfmaq_f32 (float32x4_t a, float32x4_t b, float32x4_t c)
+{
+  float32x4_t result;
+  __asm__ ("fmla %0.4s,%2.4s,%3.4s"
+           : "=w"(result)
+           : "0"(a), "w"(b), "w"(c)
+           : /* No clobbers */);
+  return result;
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vfmaq_f64 (float64x2_t a, float64x2_t b, float64x2_t c)
+{
+  float64x2_t result;
+  __asm__ ("fmla %0.2d,%2.2d,%3.2d"
+           : "=w"(result)
+           : "0"(a), "w"(b), "w"(c)
+           : /* No clobbers */);
+  return result;
+}
+
+#define vfmaq_lane_f32(a, b, c, d)                                      \
   __extension__                                                         \
     ({                                                                  \
-       uint16x4_t b_ = (b);                                             \
-       uint16x4_t a_ = (a);                                             \
-       uint16x4_t result;                                               \
-       __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*2"                        \
+       float32x4_t c_ = (c);                                            \
+       float32x4_t b_ = (b);                                            \
+       float32x4_t a_ = (a);                                            \
+       float32x4_t result;                                              \
+       __asm__ ("fmla %0.4s,%2.4s,%3.s[%4]"                             \
                 : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
+                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vext_u32(a, b, c)                                               \
+#define vfmaq_lane_f64(a, b, c, d)                                      \
   __extension__                                                         \
     ({                                                                  \
-       uint32x2_t b_ = (b);                                             \
-       uint32x2_t a_ = (a);                                             \
-       uint32x2_t result;                                               \
-       __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*4"                        \
+       float64x2_t c_ = (c);                                            \
+       float64x2_t b_ = (b);                                            \
+       float64x2_t a_ = (a);                                            \
+       float64x2_t result;                                              \
+       __asm__ ("fmla %0.2d,%2.2d,%3.d[%4]"                             \
                 : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
+                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
                 : /* No clobbers */);                                   \
        result;                                                          \
      })
 
-#define vext_u64(a, b, c)                                               \
+#define vfmas_lane_f32(a, b, c)                                         \
   __extension__                                                         \
     ({                                                                  \
-       uint64x1_t b_ = (b);                                             \
-       uint64x1_t a_ = (a);                                             \
-       uint64x1_t result;                                               \
-       __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*8"                        \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vextq_f32(a, b, c)                                              \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x4_t b_ = (b);                                            \
-       float32x4_t a_ = (a);                                            \
-       float32x4_t result;                                              \
-       __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*4"                     \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vextq_f64(a, b, c)                                              \
-  __extension__                                                         \
-    ({                                                                  \
-       float64x2_t b_ = (b);                                            \
-       float64x2_t a_ = (a);                                            \
-       float64x2_t result;                                              \
-       __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*8"                     \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vextq_p8(a, b, c)                                               \
-  __extension__                                                         \
-    ({                                                                  \
-       poly8x16_t b_ = (b);                                             \
-       poly8x16_t a_ = (a);                                             \
-       poly8x16_t result;                                               \
-       __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3"                       \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vextq_p16(a, b, c)                                              \
-  __extension__                                                         \
-    ({                                                                  \
-       poly16x8_t b_ = (b);                                             \
-       poly16x8_t a_ = (a);                                             \
-       poly16x8_t result;                                               \
-       __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*2"                     \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vextq_s8(a, b, c)                                               \
-  __extension__                                                         \
-    ({                                                                  \
-       int8x16_t b_ = (b);                                              \
-       int8x16_t a_ = (a);                                              \
-       int8x16_t result;                                                \
-       __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3"                       \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vextq_s16(a, b, c)                                              \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x8_t b_ = (b);                                              \
-       int16x8_t a_ = (a);                                              \
-       int16x8_t result;                                                \
-       __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*2"                     \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vextq_s32(a, b, c)                                              \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x4_t b_ = (b);                                              \
-       int32x4_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*4"                     \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vextq_s64(a, b, c)                                              \
-  __extension__                                                         \
-    ({                                                                  \
-       int64x2_t b_ = (b);                                              \
-       int64x2_t a_ = (a);                                              \
-       int64x2_t result;                                                \
-       __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*8"                     \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vextq_u8(a, b, c)                                               \
-  __extension__                                                         \
-    ({                                                                  \
-       uint8x16_t b_ = (b);                                             \
-       uint8x16_t a_ = (a);                                             \
-       uint8x16_t result;                                               \
-       __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3"                       \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vextq_u16(a, b, c)                                              \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x8_t b_ = (b);                                             \
-       uint16x8_t a_ = (a);                                             \
-       uint16x8_t result;                                               \
-       __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*2"                     \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vextq_u32(a, b, c)                                              \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x4_t b_ = (b);                                             \
-       uint32x4_t a_ = (a);                                             \
-       uint32x4_t result;                                               \
-       __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*4"                     \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vextq_u64(a, b, c)                                              \
-  __extension__                                                         \
-    ({                                                                  \
-       uint64x2_t b_ = (b);                                             \
-       uint64x2_t a_ = (a);                                             \
-       uint64x2_t result;                                               \
-       __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*8"                     \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vfma_f32 (float32x2_t a, float32x2_t b, float32x2_t c)
-{
-  float32x2_t result;
-  __asm__ ("fmla %0.2s,%2.2s,%3.2s"
-           : "=w"(result)
-           : "0"(a), "w"(b), "w"(c)
-           : /* No clobbers */);
-  return result;
-}
-
-#define vfma_lane_f32(a, b, c, d)                                       \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x2_t c_ = (c);                                            \
-       float32x2_t b_ = (b);                                            \
-       float32x2_t a_ = (a);                                            \
-       float32x2_t result;                                              \
-       __asm__ ("fmla %0.2s,%2.2s,%3.s[%4]"                             \
-                : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vfmad_lane_f64(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       float64x2_t b_ = (b);                                            \
-       float64_t a_ = (a);                                              \
-       float64_t result;                                                \
-       __asm__ ("fmla %d0,%d1,%2.d[%3]"                                 \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vfmaq_f32 (float32x4_t a, float32x4_t b, float32x4_t c)
-{
-  float32x4_t result;
-  __asm__ ("fmla %0.4s,%2.4s,%3.4s"
-           : "=w"(result)
-           : "0"(a), "w"(b), "w"(c)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vfmaq_f64 (float64x2_t a, float64x2_t b, float64x2_t c)
-{
-  float64x2_t result;
-  __asm__ ("fmla %0.2d,%2.2d,%3.2d"
-           : "=w"(result)
-           : "0"(a), "w"(b), "w"(c)
-           : /* No clobbers */);
-  return result;
-}
-
-#define vfmaq_lane_f32(a, b, c, d)                                      \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x4_t c_ = (c);                                            \
-       float32x4_t b_ = (b);                                            \
-       float32x4_t a_ = (a);                                            \
-       float32x4_t result;                                              \
-       __asm__ ("fmla %0.4s,%2.4s,%3.s[%4]"                             \
-                : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vfmaq_lane_f64(a, b, c, d)                                      \
-  __extension__                                                         \
-    ({                                                                  \
-       float64x2_t c_ = (c);                                            \
-       float64x2_t b_ = (b);                                            \
-       float64x2_t a_ = (a);                                            \
-       float64x2_t result;                                              \
-       __asm__ ("fmla %0.2d,%2.2d,%3.d[%4]"                             \
-                : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vfmas_lane_f32(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x4_t b_ = (b);                                            \
-       float32_t a_ = (a);                                              \
-       float32_t result;                                                \
-       __asm__ ("fmla %s0,%s1,%2.s[%3]"                                 \
+       float32x4_t b_ = (b);                                            \
+       float32_t a_ = (a);                                              \
+       float32_t result;                                                \
+       __asm__ ("fmla %s0,%s1,%2.s[%3]"                                 \
                 : "=w"(result)                                          \
                 : "w"(a_), "w"(b_), "i"(c)                              \
                 : /* No clobbers */);                                   \
@@ -18364,1838 +17638,2269 @@ __ST2_LANE_FUNC (uint16x8x2_t, uint16_t, 8h, h, u16, q)
 __ST2_LANE_FUNC (uint32x4x2_t, uint32_t, 4s, s, u32, q)
 __ST2_LANE_FUNC (uint64x2x2_t, uint64_t, 2d, d, u64, q)
 
-#define __ST3_LANE_FUNC(intype, ptrtype, regsuffix,                    \
-                       lnsuffix, funcsuffix, Q)                        \
-  __extension__ static __inline void                                   \
-  __attribute__ ((__always_inline__))                                  \
-  vst3 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr,               \
-                                    intype b, const int c)             \
-  {                                                                    \
-    __asm__ ("ld1 {v16." #regsuffix " - v18." #regsuffix "}, %1\n\t"   \
-            "st3 {v16." #lnsuffix " - v18." #lnsuffix "}[%2], %0\n\t"  \
-            : "=Q"(*(intype *) ptr)                                    \
-            : "Q"(b), "i"(c)                                           \
-            : "memory", "v16", "v17", "v18");                          \
-  }
+#define __ST3_LANE_FUNC(intype, ptrtype, regsuffix,                    \
+                       lnsuffix, funcsuffix, Q)                        \
+  __extension__ static __inline void                                   \
+  __attribute__ ((__always_inline__))                                  \
+  vst3 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr,               \
+                                    intype b, const int c)             \
+  {                                                                    \
+    __asm__ ("ld1 {v16." #regsuffix " - v18." #regsuffix "}, %1\n\t"   \
+            "st3 {v16." #lnsuffix " - v18." #lnsuffix "}[%2], %0\n\t"  \
+            : "=Q"(*(intype *) ptr)                                    \
+            : "Q"(b), "i"(c)                                           \
+            : "memory", "v16", "v17", "v18");                          \
+  }
+
+__ST3_LANE_FUNC (int8x8x3_t, int8_t, 8b, b, s8,)
+__ST3_LANE_FUNC (float32x2x3_t, float32_t, 2s, s, f32,)
+__ST3_LANE_FUNC (float64x1x3_t, float64_t, 1d, d, f64,)
+__ST3_LANE_FUNC (poly8x8x3_t, poly8_t, 8b, b, p8,)
+__ST3_LANE_FUNC (poly16x4x3_t, poly16_t, 4h, h, p16,)
+__ST3_LANE_FUNC (int16x4x3_t, int16_t, 4h, h, s16,)
+__ST3_LANE_FUNC (int32x2x3_t, int32_t, 2s, s, s32,)
+__ST3_LANE_FUNC (int64x1x3_t, int64_t, 1d, d, s64,)
+__ST3_LANE_FUNC (uint8x8x3_t, uint8_t, 8b, b, u8,)
+__ST3_LANE_FUNC (uint16x4x3_t, uint16_t, 4h, h, u16,)
+__ST3_LANE_FUNC (uint32x2x3_t, uint32_t, 2s, s, u32,)
+__ST3_LANE_FUNC (uint64x1x3_t, uint64_t, 1d, d, u64,)
+__ST3_LANE_FUNC (float32x4x3_t, float32_t, 4s, s, f32, q)
+__ST3_LANE_FUNC (float64x2x3_t, float64_t, 2d, d, f64, q)
+__ST3_LANE_FUNC (poly8x16x3_t, poly8_t, 16b, b, p8, q)
+__ST3_LANE_FUNC (poly16x8x3_t, poly16_t, 8h, h, p16, q)
+__ST3_LANE_FUNC (int8x16x3_t, int8_t, 16b, b, s8, q)
+__ST3_LANE_FUNC (int16x8x3_t, int16_t, 8h, h, s16, q)
+__ST3_LANE_FUNC (int32x4x3_t, int32_t, 4s, s, s32, q)
+__ST3_LANE_FUNC (int64x2x3_t, int64_t, 2d, d, s64, q)
+__ST3_LANE_FUNC (uint8x16x3_t, uint8_t, 16b, b, u8, q)
+__ST3_LANE_FUNC (uint16x8x3_t, uint16_t, 8h, h, u16, q)
+__ST3_LANE_FUNC (uint32x4x3_t, uint32_t, 4s, s, u32, q)
+__ST3_LANE_FUNC (uint64x2x3_t, uint64_t, 2d, d, u64, q)
+
+#define __ST4_LANE_FUNC(intype, ptrtype, regsuffix,                    \
+                       lnsuffix, funcsuffix, Q)                        \
+  __extension__ static __inline void                                   \
+  __attribute__ ((__always_inline__))                                  \
+  vst4 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr,               \
+                                    intype b, const int c)             \
+  {                                                                    \
+    __asm__ ("ld1 {v16." #regsuffix " - v19." #regsuffix "}, %1\n\t"   \
+            "st4 {v16." #lnsuffix " - v19." #lnsuffix "}[%2], %0\n\t"  \
+            : "=Q"(*(intype *) ptr)                                    \
+            : "Q"(b), "i"(c)                                           \
+            : "memory", "v16", "v17", "v18", "v19");                   \
+  }
+
+__ST4_LANE_FUNC (int8x8x4_t, int8_t, 8b, b, s8,)
+__ST4_LANE_FUNC (float32x2x4_t, float32_t, 2s, s, f32,)
+__ST4_LANE_FUNC (float64x1x4_t, float64_t, 1d, d, f64,)
+__ST4_LANE_FUNC (poly8x8x4_t, poly8_t, 8b, b, p8,)
+__ST4_LANE_FUNC (poly16x4x4_t, poly16_t, 4h, h, p16,)
+__ST4_LANE_FUNC (int16x4x4_t, int16_t, 4h, h, s16,)
+__ST4_LANE_FUNC (int32x2x4_t, int32_t, 2s, s, s32,)
+__ST4_LANE_FUNC (int64x1x4_t, int64_t, 1d, d, s64,)
+__ST4_LANE_FUNC (uint8x8x4_t, uint8_t, 8b, b, u8,)
+__ST4_LANE_FUNC (uint16x4x4_t, uint16_t, 4h, h, u16,)
+__ST4_LANE_FUNC (uint32x2x4_t, uint32_t, 2s, s, u32,)
+__ST4_LANE_FUNC (uint64x1x4_t, uint64_t, 1d, d, u64,)
+__ST4_LANE_FUNC (float32x4x4_t, float32_t, 4s, s, f32, q)
+__ST4_LANE_FUNC (float64x2x4_t, float64_t, 2d, d, f64, q)
+__ST4_LANE_FUNC (poly8x16x4_t, poly8_t, 16b, b, p8, q)
+__ST4_LANE_FUNC (poly16x8x4_t, poly16_t, 8h, h, p16, q)
+__ST4_LANE_FUNC (int8x16x4_t, int8_t, 16b, b, s8, q)
+__ST4_LANE_FUNC (int16x8x4_t, int16_t, 8h, h, s16, q)
+__ST4_LANE_FUNC (int32x4x4_t, int32_t, 4s, s, s32, q)
+__ST4_LANE_FUNC (int64x2x4_t, int64_t, 2d, d, s64, q)
+__ST4_LANE_FUNC (uint8x16x4_t, uint8_t, 16b, b, u8, q)
+__ST4_LANE_FUNC (uint16x8x4_t, uint16_t, 8h, h, u16, q)
+__ST4_LANE_FUNC (uint32x4x4_t, uint32_t, 4s, s, u32, q)
+__ST4_LANE_FUNC (uint64x2x4_t, uint64_t, 2d, d, u64, q)
+
+__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+vaddlv_s32 (int32x2_t a)
+{
+  int64_t result;
+  __asm__ ("saddlp %0.1d, %1.2s" : "=w"(result) : "w"(a) : );
+  return result;
+}
+
+__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+vaddlv_u32 (uint32x2_t a)
+{
+  uint64_t result;
+  __asm__ ("uaddlp %0.1d, %1.2s" : "=w"(result) : "w"(a) : );
+  return result;
+}
+
+__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+vaddv_s32 (int32x2_t a)
+{
+  int32_t result;
+  __asm__ ("addp %0.2s, %1.2s, %1.2s" : "=w"(result) : "w"(a) : );
+  return result;
+}
+
+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+vaddv_u32 (uint32x2_t a)
+{
+  uint32_t result;
+  __asm__ ("addp %0.2s, %1.2s, %1.2s" : "=w"(result) : "w"(a) : );
+  return result;
+}
+
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vmaxnmv_f32 (float32x2_t a)
+{
+  float32_t result;
+  __asm__ ("fmaxnmp %0.2s, %1.2s, %1.2s" : "=w"(result) : "w"(a) : );
+  return result;
+}
+
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vminnmv_f32 (float32x2_t a)
+{
+  float32_t result;
+  __asm__ ("fminnmp %0.2s, %1.2s, %1.2s" : "=w"(result) : "w"(a) : );
+  return result;
+}
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vmaxnmvq_f64 (float64x2_t a)
+{
+  float64_t result;
+  __asm__ ("fmaxnmp %0.2d, %1.2d, %1.2d" : "=w"(result) : "w"(a) : );
+  return result;
+}
+
+__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+vmaxv_s32 (int32x2_t a)
+{
+  int32_t result;
+  __asm__ ("smaxp %0.2s, %1.2s, %1.2s" : "=w"(result) : "w"(a) : );
+  return result;
+}
+
+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+vmaxv_u32 (uint32x2_t a)
+{
+  uint32_t result;
+  __asm__ ("umaxp %0.2s, %1.2s, %1.2s" : "=w"(result) : "w"(a) : );
+  return result;
+}
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vminnmvq_f64 (float64x2_t a)
+{
+  float64_t result;
+  __asm__ ("fminnmp %0.2d, %1.2d, %1.2d" : "=w"(result) : "w"(a) : );
+  return result;
+}
+
+__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+vminv_s32 (int32x2_t a)
+{
+  int32_t result;
+  __asm__ ("sminp %0.2s, %1.2s, %1.2s" : "=w"(result) : "w"(a) : );
+  return result;
+}
+
+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+vminv_u32 (uint32x2_t a)
+{
+  uint32_t result;
+  __asm__ ("uminp %0.2s, %1.2s, %1.2s" : "=w"(result) : "w"(a) : );
+  return result;
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vpaddd_s64 (int64x2_t __a)
+{
+  return __builtin_aarch64_addpdi (__a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vqdmulh_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __c)
+{
+  return __builtin_aarch64_sqdmulh_laneqv4hi (__a, __b, __c);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vqdmulh_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __c)
+{
+  return __builtin_aarch64_sqdmulh_laneqv2si (__a, __b, __c);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vqdmulhq_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __c)
+{
+  return __builtin_aarch64_sqdmulh_laneqv8hi (__a, __b, __c);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqdmulhq_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c)
+{
+  return __builtin_aarch64_sqdmulh_laneqv4si (__a, __b, __c);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vqrdmulh_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __c)
+{
+  return  __builtin_aarch64_sqrdmulh_laneqv4hi (__a, __b, __c);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vqrdmulh_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __c)
+{
+  return __builtin_aarch64_sqrdmulh_laneqv2si (__a, __b, __c);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vqrdmulhq_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __c)
+{
+  return __builtin_aarch64_sqrdmulh_laneqv8hi (__a, __b, __c);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqrdmulhq_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c)
+{
+  return __builtin_aarch64_sqrdmulh_laneqv4si (__a, __b, __c);
+}
+
+/* Table intrinsics.  */
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vqtbl1_p8 (poly8x16_t a, uint8x8_t b)
+{
+  poly8x8_t result;
+  __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
+           : "=w"(result)
+           : "w"(a), "w"(b)
+           : /* No clobbers */);
+  return result;
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vqtbl1_s8 (int8x16_t a, int8x8_t b)
+{
+  int8x8_t result;
+  __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
+           : "=w"(result)
+           : "w"(a), "w"(b)
+           : /* No clobbers */);
+  return result;
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vqtbl1_u8 (uint8x16_t a, uint8x8_t b)
+{
+  uint8x8_t result;
+  __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
+           : "=w"(result)
+           : "w"(a), "w"(b)
+           : /* No clobbers */);
+  return result;
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vqtbl1q_p8 (poly8x16_t a, uint8x16_t b)
+{
+  poly8x16_t result;
+  __asm__ ("tbl %0.16b, {%1.16b}, %2.16b"
+           : "=w"(result)
+           : "w"(a), "w"(b)
+           : /* No clobbers */);
+  return result;
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vqtbl1q_s8 (int8x16_t a, int8x16_t b)
+{
+  int8x16_t result;
+  __asm__ ("tbl %0.16b, {%1.16b}, %2.16b"
+           : "=w"(result)
+           : "w"(a), "w"(b)
+           : /* No clobbers */);
+  return result;
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vqtbl1q_u8 (uint8x16_t a, uint8x16_t b)
+{
+  uint8x16_t result;
+  __asm__ ("tbl %0.16b, {%1.16b}, %2.16b"
+           : "=w"(result)
+           : "w"(a), "w"(b)
+           : /* No clobbers */);
+  return result;
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vqtbl2_s8 (int8x16x2_t tab, int8x8_t idx)
+{
+  int8x8_t result;
+  __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t"
+          "tbl %0.8b, {v16.16b, v17.16b}, %2.8b\n\t"
+          :"=w"(result)
+          :"Q"(tab),"w"(idx)
+          :"memory", "v16", "v17");
+  return result;
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vqtbl2_u8 (uint8x16x2_t tab, uint8x8_t idx)
+{
+  uint8x8_t result;
+  __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t"
+          "tbl %0.8b, {v16.16b, v17.16b}, %2.8b\n\t"
+          :"=w"(result)
+          :"Q"(tab),"w"(idx)
+          :"memory", "v16", "v17");
+  return result;
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vqtbl2_p8 (poly8x16x2_t tab, uint8x8_t idx)
+{
+  poly8x8_t result;
+  __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t"
+          "tbl %0.8b, {v16.16b, v17.16b}, %2.8b\n\t"
+          :"=w"(result)
+          :"Q"(tab),"w"(idx)
+          :"memory", "v16", "v17");
+  return result;
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vqtbl2q_s8 (int8x16x2_t tab, int8x16_t idx)
+{
+  int8x16_t result;
+  __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t"
+          "tbl %0.16b, {v16.16b, v17.16b}, %2.16b\n\t"
+          :"=w"(result)
+          :"Q"(tab),"w"(idx)
+          :"memory", "v16", "v17");
+  return result;
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vqtbl2q_u8 (uint8x16x2_t tab, uint8x16_t idx)
+{
+  uint8x16_t result;
+  __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t"
+          "tbl %0.16b, {v16.16b, v17.16b}, %2.16b\n\t"
+          :"=w"(result)
+          :"Q"(tab),"w"(idx)
+          :"memory", "v16", "v17");
+  return result;
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vqtbl2q_p8 (poly8x16x2_t tab, uint8x16_t idx)
+{
+  poly8x16_t result;
+  __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t"
+          "tbl %0.16b, {v16.16b, v17.16b}, %2.16b\n\t"
+          :"=w"(result)
+          :"Q"(tab),"w"(idx)
+          :"memory", "v16", "v17");
+  return result;
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vqtbl3_s8 (int8x16x3_t tab, int8x8_t idx)
+{
+  int8x8_t result;
+  __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t"
+          "tbl %0.8b, {v16.16b - v18.16b}, %2.8b\n\t"
+          :"=w"(result)
+          :"Q"(tab),"w"(idx)
+          :"memory", "v16", "v17", "v18");
+  return result;
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vqtbl3_u8 (uint8x16x3_t tab, uint8x8_t idx)
+{
+  uint8x8_t result;
+  __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t"
+          "tbl %0.8b, {v16.16b - v18.16b}, %2.8b\n\t"
+          :"=w"(result)
+          :"Q"(tab),"w"(idx)
+          :"memory", "v16", "v17", "v18");
+  return result;
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vqtbl3_p8 (poly8x16x3_t tab, uint8x8_t idx)
+{
+  poly8x8_t result;
+  __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t"
+          "tbl %0.8b, {v16.16b - v18.16b}, %2.8b\n\t"
+          :"=w"(result)
+          :"Q"(tab),"w"(idx)
+          :"memory", "v16", "v17", "v18");
+  return result;
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vqtbl3q_s8 (int8x16x3_t tab, int8x16_t idx)
+{
+  int8x16_t result;
+  __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t"
+          "tbl %0.16b, {v16.16b - v18.16b}, %2.16b\n\t"
+          :"=w"(result)
+          :"Q"(tab),"w"(idx)
+          :"memory", "v16", "v17", "v18");
+  return result;
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vqtbl3q_u8 (uint8x16x3_t tab, uint8x16_t idx)
+{
+  uint8x16_t result;
+  __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t"
+          "tbl %0.16b, {v16.16b - v18.16b}, %2.16b\n\t"
+          :"=w"(result)
+          :"Q"(tab),"w"(idx)
+          :"memory", "v16", "v17", "v18");
+  return result;
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vqtbl3q_p8 (poly8x16x3_t tab, uint8x16_t idx)
+{
+  poly8x16_t result;
+  __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t"
+          "tbl %0.16b, {v16.16b - v18.16b}, %2.16b\n\t"
+          :"=w"(result)
+          :"Q"(tab),"w"(idx)
+          :"memory", "v16", "v17", "v18");
+  return result;
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vqtbl4_s8 (int8x16x4_t tab, int8x8_t idx)
+{
+  int8x8_t result;
+  __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t"
+          "tbl %0.8b, {v16.16b - v19.16b}, %2.8b\n\t"
+          :"=w"(result)
+          :"Q"(tab),"w"(idx)
+          :"memory", "v16", "v17", "v18", "v19");
+  return result;
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vqtbl4_u8 (uint8x16x4_t tab, uint8x8_t idx)
+{
+  uint8x8_t result;
+  __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t"
+          "tbl %0.8b, {v16.16b - v19.16b}, %2.8b\n\t"
+          :"=w"(result)
+          :"Q"(tab),"w"(idx)
+          :"memory", "v16", "v17", "v18", "v19");
+  return result;
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vqtbl4_p8 (poly8x16x4_t tab, uint8x8_t idx)
+{
+  poly8x8_t result;
+  __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t"
+          "tbl %0.8b, {v16.16b - v19.16b}, %2.8b\n\t"
+          :"=w"(result)
+          :"Q"(tab),"w"(idx)
+          :"memory", "v16", "v17", "v18", "v19");
+  return result;
+}
+
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vqtbl4q_s8 (int8x16x4_t tab, int8x16_t idx)
+{
+  int8x16_t result;
+  __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t"
+          "tbl %0.16b, {v16.16b - v19.16b}, %2.16b\n\t"
+          :"=w"(result)
+          :"Q"(tab),"w"(idx)
+          :"memory", "v16", "v17", "v18", "v19");
+  return result;
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vqtbl4q_u8 (uint8x16x4_t tab, uint8x16_t idx)
+{
+  uint8x16_t result;
+  __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t"
+          "tbl %0.16b, {v16.16b - v19.16b}, %2.16b\n\t"
+          :"=w"(result)
+          :"Q"(tab),"w"(idx)
+          :"memory", "v16", "v17", "v18", "v19");
+  return result;
+}
 
-__ST3_LANE_FUNC (int8x8x3_t, int8_t, 8b, b, s8,)
-__ST3_LANE_FUNC (float32x2x3_t, float32_t, 2s, s, f32,)
-__ST3_LANE_FUNC (float64x1x3_t, float64_t, 1d, d, f64,)
-__ST3_LANE_FUNC (poly8x8x3_t, poly8_t, 8b, b, p8,)
-__ST3_LANE_FUNC (poly16x4x3_t, poly16_t, 4h, h, p16,)
-__ST3_LANE_FUNC (int16x4x3_t, int16_t, 4h, h, s16,)
-__ST3_LANE_FUNC (int32x2x3_t, int32_t, 2s, s, s32,)
-__ST3_LANE_FUNC (int64x1x3_t, int64_t, 1d, d, s64,)
-__ST3_LANE_FUNC (uint8x8x3_t, uint8_t, 8b, b, u8,)
-__ST3_LANE_FUNC (uint16x4x3_t, uint16_t, 4h, h, u16,)
-__ST3_LANE_FUNC (uint32x2x3_t, uint32_t, 2s, s, u32,)
-__ST3_LANE_FUNC (uint64x1x3_t, uint64_t, 1d, d, u64,)
-__ST3_LANE_FUNC (float32x4x3_t, float32_t, 4s, s, f32, q)
-__ST3_LANE_FUNC (float64x2x3_t, float64_t, 2d, d, f64, q)
-__ST3_LANE_FUNC (poly8x16x3_t, poly8_t, 16b, b, p8, q)
-__ST3_LANE_FUNC (poly16x8x3_t, poly16_t, 8h, h, p16, q)
-__ST3_LANE_FUNC (int8x16x3_t, int8_t, 16b, b, s8, q)
-__ST3_LANE_FUNC (int16x8x3_t, int16_t, 8h, h, s16, q)
-__ST3_LANE_FUNC (int32x4x3_t, int32_t, 4s, s, s32, q)
-__ST3_LANE_FUNC (int64x2x3_t, int64_t, 2d, d, s64, q)
-__ST3_LANE_FUNC (uint8x16x3_t, uint8_t, 16b, b, u8, q)
-__ST3_LANE_FUNC (uint16x8x3_t, uint16_t, 8h, h, u16, q)
-__ST3_LANE_FUNC (uint32x4x3_t, uint32_t, 4s, s, u32, q)
-__ST3_LANE_FUNC (uint64x2x3_t, uint64_t, 2d, d, u64, q)
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vqtbl4q_p8 (poly8x16x4_t tab, uint8x16_t idx)
+{
+  poly8x16_t result;
+  __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t"
+          "tbl %0.16b, {v16.16b - v19.16b}, %2.16b\n\t"
+          :"=w"(result)
+          :"Q"(tab),"w"(idx)
+          :"memory", "v16", "v17", "v18", "v19");
+  return result;
+}
 
-#define __ST4_LANE_FUNC(intype, ptrtype, regsuffix,                    \
-                       lnsuffix, funcsuffix, Q)                        \
-  __extension__ static __inline void                                   \
-  __attribute__ ((__always_inline__))                                  \
-  vst4 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr,               \
-                                    intype b, const int c)             \
-  {                                                                    \
-    __asm__ ("ld1 {v16." #regsuffix " - v19." #regsuffix "}, %1\n\t"   \
-            "st4 {v16." #lnsuffix " - v19." #lnsuffix "}[%2], %0\n\t"  \
-            : "=Q"(*(intype *) ptr)                                    \
-            : "Q"(b), "i"(c)                                           \
-            : "memory", "v16", "v17", "v18", "v19");                   \
-  }
 
-__ST4_LANE_FUNC (int8x8x4_t, int8_t, 8b, b, s8,)
-__ST4_LANE_FUNC (float32x2x4_t, float32_t, 2s, s, f32,)
-__ST4_LANE_FUNC (float64x1x4_t, float64_t, 1d, d, f64,)
-__ST4_LANE_FUNC (poly8x8x4_t, poly8_t, 8b, b, p8,)
-__ST4_LANE_FUNC (poly16x4x4_t, poly16_t, 4h, h, p16,)
-__ST4_LANE_FUNC (int16x4x4_t, int16_t, 4h, h, s16,)
-__ST4_LANE_FUNC (int32x2x4_t, int32_t, 2s, s, s32,)
-__ST4_LANE_FUNC (int64x1x4_t, int64_t, 1d, d, s64,)
-__ST4_LANE_FUNC (uint8x8x4_t, uint8_t, 8b, b, u8,)
-__ST4_LANE_FUNC (uint16x4x4_t, uint16_t, 4h, h, u16,)
-__ST4_LANE_FUNC (uint32x2x4_t, uint32_t, 2s, s, u32,)
-__ST4_LANE_FUNC (uint64x1x4_t, uint64_t, 1d, d, u64,)
-__ST4_LANE_FUNC (float32x4x4_t, float32_t, 4s, s, f32, q)
-__ST4_LANE_FUNC (float64x2x4_t, float64_t, 2d, d, f64, q)
-__ST4_LANE_FUNC (poly8x16x4_t, poly8_t, 16b, b, p8, q)
-__ST4_LANE_FUNC (poly16x8x4_t, poly16_t, 8h, h, p16, q)
-__ST4_LANE_FUNC (int8x16x4_t, int8_t, 16b, b, s8, q)
-__ST4_LANE_FUNC (int16x8x4_t, int16_t, 8h, h, s16, q)
-__ST4_LANE_FUNC (int32x4x4_t, int32_t, 4s, s, s32, q)
-__ST4_LANE_FUNC (int64x2x4_t, int64_t, 2d, d, s64, q)
-__ST4_LANE_FUNC (uint8x16x4_t, uint8_t, 16b, b, u8, q)
-__ST4_LANE_FUNC (uint16x8x4_t, uint16_t, 8h, h, u16, q)
-__ST4_LANE_FUNC (uint32x4x4_t, uint32_t, 4s, s, u32, q)
-__ST4_LANE_FUNC (uint64x2x4_t, uint64_t, 2d, d, u64, q)
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vqtbx1_s8 (int8x8_t r, int8x16_t tab, int8x8_t idx)
+{
+  int8x8_t result = r;
+  __asm__ ("tbx %0.8b,{%1.16b},%2.8b"
+           : "+w"(result)
+           : "w"(tab), "w"(idx)
+           : /* No clobbers */);
+  return result;
+}
 
-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-vaddlv_s32 (int32x2_t a)
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vqtbx1_u8 (uint8x8_t r, uint8x16_t tab, uint8x8_t idx)
 {
-  int64_t result;
-  __asm__ ("saddlp %0.1d, %1.2s" : "=w"(result) : "w"(a) : );
+  uint8x8_t result = r;
+  __asm__ ("tbx %0.8b,{%1.16b},%2.8b"
+           : "+w"(result)
+           : "w"(tab), "w"(idx)
+           : /* No clobbers */);
   return result;
 }
 
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vaddlv_u32 (uint32x2_t a)
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vqtbx1_p8 (poly8x8_t r, poly8x16_t tab, uint8x8_t idx)
 {
-  uint64_t result;
-  __asm__ ("uaddlp %0.1d, %1.2s" : "=w"(result) : "w"(a) : );
+  poly8x8_t result = r;
+  __asm__ ("tbx %0.8b,{%1.16b},%2.8b"
+           : "+w"(result)
+           : "w"(tab), "w"(idx)
+           : /* No clobbers */);
   return result;
 }
 
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vaddv_s32 (int32x2_t a)
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vqtbx1q_s8 (int8x16_t r, int8x16_t tab, int8x16_t idx)
 {
-  int32_t result;
-  __asm__ ("addp %0.2s, %1.2s, %1.2s" : "=w"(result) : "w"(a) : );
+  int8x16_t result = r;
+  __asm__ ("tbx %0.16b,{%1.16b},%2.16b"
+           : "+w"(result)
+           : "w"(tab), "w"(idx)
+           : /* No clobbers */);
   return result;
 }
 
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vaddv_u32 (uint32x2_t a)
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vqtbx1q_u8 (uint8x16_t r, uint8x16_t tab, uint8x16_t idx)
 {
-  uint32_t result;
-  __asm__ ("addp %0.2s, %1.2s, %1.2s" : "=w"(result) : "w"(a) : );
+  uint8x16_t result = r;
+  __asm__ ("tbx %0.16b,{%1.16b},%2.16b"
+           : "+w"(result)
+           : "w"(tab), "w"(idx)
+           : /* No clobbers */);
   return result;
 }
 
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vmaxnmv_f32 (float32x2_t a)
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vqtbx1q_p8 (poly8x16_t r, poly8x16_t tab, uint8x16_t idx)
 {
-  float32_t result;
-  __asm__ ("fmaxnmp %0.2s, %1.2s, %1.2s" : "=w"(result) : "w"(a) : );
+  poly8x16_t result = r;
+  __asm__ ("tbx %0.16b,{%1.16b},%2.16b"
+           : "+w"(result)
+           : "w"(tab), "w"(idx)
+           : /* No clobbers */);
   return result;
 }
 
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vminnmv_f32 (float32x2_t a)
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vqtbx2_s8 (int8x8_t r, int8x16x2_t tab, int8x8_t idx)
 {
-  float32_t result;
-  __asm__ ("fminnmp %0.2s, %1.2s, %1.2s" : "=w"(result) : "w"(a) : );
+  int8x8_t result = r;
+  __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t"
+          "tbx %0.8b, {v16.16b, v17.16b}, %2.8b\n\t"
+          :"+w"(result)
+          :"Q"(tab),"w"(idx)
+          :"memory", "v16", "v17");
   return result;
 }
 
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vmaxnmvq_f64 (float64x2_t a)
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vqtbx2_u8 (uint8x8_t r, uint8x16x2_t tab, uint8x8_t idx)
 {
-  float64_t result;
-  __asm__ ("fmaxnmp %0.2d, %1.2d, %1.2d" : "=w"(result) : "w"(a) : );
+  uint8x8_t result = r;
+  __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t"
+          "tbx %0.8b, {v16.16b, v17.16b}, %2.8b\n\t"
+          :"+w"(result)
+          :"Q"(tab),"w"(idx)
+          :"memory", "v16", "v17");
   return result;
 }
 
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vmaxv_s32 (int32x2_t a)
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vqtbx2_p8 (poly8x8_t r, poly8x16x2_t tab, uint8x8_t idx)
 {
-  int32_t result;
-  __asm__ ("smaxp %0.2s, %1.2s, %1.2s" : "=w"(result) : "w"(a) : );
+  poly8x8_t result = r;
+  __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t"
+          "tbx %0.8b, {v16.16b, v17.16b}, %2.8b\n\t"
+          :"+w"(result)
+          :"Q"(tab),"w"(idx)
+          :"memory", "v16", "v17");
   return result;
 }
 
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vmaxv_u32 (uint32x2_t a)
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vqtbx2q_s8 (int8x16_t r, int8x16x2_t tab, int8x16_t idx)
 {
-  uint32_t result;
-  __asm__ ("umaxp %0.2s, %1.2s, %1.2s" : "=w"(result) : "w"(a) : );
+  int8x16_t result = r;
+  __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t"
+          "tbx %0.16b, {v16.16b, v17.16b}, %2.16b\n\t"
+          :"+w"(result)
+          :"Q"(tab),"w"(idx)
+          :"memory", "v16", "v17");
   return result;
 }
 
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vminnmvq_f64 (float64x2_t a)
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vqtbx2q_u8 (uint8x16_t r, uint8x16x2_t tab, uint8x16_t idx)
 {
-  float64_t result;
-  __asm__ ("fminnmp %0.2d, %1.2d, %1.2d" : "=w"(result) : "w"(a) : );
+  uint8x16_t result = r;
+  __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t"
+          "tbx %0.16b, {v16.16b, v17.16b}, %2.16b\n\t"
+          :"+w"(result)
+          :"Q"(tab),"w"(idx)
+          :"memory", "v16", "v17");
+  return result;
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vqtbx2q_p8 (poly8x16_t r, poly8x16x2_t tab, uint8x16_t idx)
+{
+  poly8x16_t result = r;
+  __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t"
+          "tbx %0.16b, {v16.16b, v17.16b}, %2.16b\n\t"
+          :"+w"(result)
+          :"Q"(tab),"w"(idx)
+          :"memory", "v16", "v17");
+  return result;
+}
+
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vqtbx3_s8 (int8x8_t r, int8x16x3_t tab, int8x8_t idx)
+{
+  int8x8_t result = r;
+  __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t"
+          "tbx %0.8b, {v16.16b - v18.16b}, %2.8b\n\t"
+          :"+w"(result)
+          :"Q"(tab),"w"(idx)
+          :"memory", "v16", "v17", "v18");
   return result;
 }
 
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vminv_s32 (int32x2_t a)
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vqtbx3_u8 (uint8x8_t r, uint8x16x3_t tab, uint8x8_t idx)
 {
-  int32_t result;
-  __asm__ ("sminp %0.2s, %1.2s, %1.2s" : "=w"(result) : "w"(a) : );
+  uint8x8_t result = r;
+  __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t"
+          "tbx %0.8b, {v16.16b - v18.16b}, %2.8b\n\t"
+          :"+w"(result)
+          :"Q"(tab),"w"(idx)
+          :"memory", "v16", "v17", "v18");
   return result;
 }
 
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vminv_u32 (uint32x2_t a)
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vqtbx3_p8 (poly8x8_t r, poly8x16x3_t tab, uint8x8_t idx)
 {
-  uint32_t result;
-  __asm__ ("uminp %0.2s, %1.2s, %1.2s" : "=w"(result) : "w"(a) : );
+  poly8x8_t result = r;
+  __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t"
+          "tbx %0.8b, {v16.16b - v18.16b}, %2.8b\n\t"
+          :"+w"(result)
+          :"Q"(tab),"w"(idx)
+          :"memory", "v16", "v17", "v18");
   return result;
 }
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vpaddd_s64 (int64x2_t __a)
-{
-  return __builtin_aarch64_addpdi (__a);
-}
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vqdmulh_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __c)
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vqtbx3q_s8 (int8x16_t r, int8x16x3_t tab, int8x16_t idx)
 {
-  return __builtin_aarch64_sqdmulh_laneqv4hi (__a, __b, __c);
+  int8x16_t result = r;
+  __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t"
+          "tbx %0.16b, {v16.16b - v18.16b}, %2.16b\n\t"
+          :"+w"(result)
+          :"Q"(tab),"w"(idx)
+          :"memory", "v16", "v17", "v18");
+  return result;
 }
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vqdmulh_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __c)
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vqtbx3q_u8 (uint8x16_t r, uint8x16x3_t tab, uint8x16_t idx)
 {
-  return __builtin_aarch64_sqdmulh_laneqv2si (__a, __b, __c);
+  uint8x16_t result = r;
+  __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t"
+          "tbx %0.16b, {v16.16b - v18.16b}, %2.16b\n\t"
+          :"+w"(result)
+          :"Q"(tab),"w"(idx)
+          :"memory", "v16", "v17", "v18");
+  return result;
 }
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vqdmulhq_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __c)
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vqtbx3q_p8 (poly8x16_t r, poly8x16x3_t tab, uint8x16_t idx)
 {
-  return __builtin_aarch64_sqdmulh_laneqv8hi (__a, __b, __c);
+  poly8x16_t result = r;
+  __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t"
+          "tbx %0.16b, {v16.16b - v18.16b}, %2.16b\n\t"
+          :"+w"(result)
+          :"Q"(tab),"w"(idx)
+          :"memory", "v16", "v17", "v18");
+  return result;
 }
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqdmulhq_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c)
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vqtbx4_s8 (int8x8_t r, int8x16x4_t tab, int8x8_t idx)
 {
-  return __builtin_aarch64_sqdmulh_laneqv4si (__a, __b, __c);
+  int8x8_t result = r;
+  __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t"
+          "tbx %0.8b, {v16.16b - v19.16b}, %2.8b\n\t"
+          :"+w"(result)
+          :"Q"(tab),"w"(idx)
+          :"memory", "v16", "v17", "v18", "v19");
+  return result;
 }
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vqrdmulh_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __c)
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vqtbx4_u8 (uint8x8_t r, uint8x16x4_t tab, uint8x8_t idx)
 {
-  return  __builtin_aarch64_sqrdmulh_laneqv4hi (__a, __b, __c);
+  uint8x8_t result = r;
+  __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t"
+          "tbx %0.8b, {v16.16b - v19.16b}, %2.8b\n\t"
+          :"+w"(result)
+          :"Q"(tab),"w"(idx)
+          :"memory", "v16", "v17", "v18", "v19");
+  return result;
 }
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vqrdmulh_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __c)
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vqtbx4_p8 (poly8x8_t r, poly8x16x4_t tab, uint8x8_t idx)
 {
-  return __builtin_aarch64_sqrdmulh_laneqv2si (__a, __b, __c);
+  poly8x8_t result = r;
+  __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t"
+          "tbx %0.8b, {v16.16b - v19.16b}, %2.8b\n\t"
+          :"+w"(result)
+          :"Q"(tab),"w"(idx)
+          :"memory", "v16", "v17", "v18", "v19");
+  return result;
 }
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vqrdmulhq_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __c)
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vqtbx4q_s8 (int8x16_t r, int8x16x4_t tab, int8x16_t idx)
 {
-  return __builtin_aarch64_sqrdmulh_laneqv8hi (__a, __b, __c);
+  int8x16_t result = r;
+  __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t"
+          "tbx %0.16b, {v16.16b - v19.16b}, %2.16b\n\t"
+          :"+w"(result)
+          :"Q"(tab),"w"(idx)
+          :"memory", "v16", "v17", "v18", "v19");
+  return result;
 }
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqrdmulhq_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c)
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vqtbx4q_u8 (uint8x16_t r, uint8x16x4_t tab, uint8x16_t idx)
 {
-  return __builtin_aarch64_sqrdmulh_laneqv4si (__a, __b, __c);
+  uint8x16_t result = r;
+  __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t"
+          "tbx %0.16b, {v16.16b - v19.16b}, %2.16b\n\t"
+          :"+w"(result)
+          :"Q"(tab),"w"(idx)
+          :"memory", "v16", "v17", "v18", "v19");
+  return result;
 }
 
-/* Table intrinsics.  */
-
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vqtbl1_p8 (poly8x16_t a, uint8x8_t b)
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vqtbx4q_p8 (poly8x16_t r, poly8x16x4_t tab, uint8x16_t idx)
 {
-  poly8x8_t result;
-  __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
+  poly8x16_t result = r;
+  __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t"
+          "tbx %0.16b, {v16.16b - v19.16b}, %2.16b\n\t"
+          :"+w"(result)
+          :"Q"(tab),"w"(idx)
+          :"memory", "v16", "v17", "v18", "v19");
   return result;
 }
 
+/* V7 legacy table intrinsics.  */
+
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vqtbl1_s8 (int8x16_t a, int8x8_t b)
+vtbl1_s8 (int8x8_t tab, int8x8_t idx)
 {
   int8x8_t result;
+  int8x16_t temp = vcombine_s8 (tab, vcreate_s8 (UINT64_C (0x0)));
   __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
            : "=w"(result)
-           : "w"(a), "w"(b)
+           : "w"(temp), "w"(idx)
            : /* No clobbers */);
   return result;
 }
 
 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vqtbl1_u8 (uint8x16_t a, uint8x8_t b)
+vtbl1_u8 (uint8x8_t tab, uint8x8_t idx)
 {
   uint8x8_t result;
+  uint8x16_t temp = vcombine_u8 (tab, vcreate_u8 (UINT64_C (0x0)));
   __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
            : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vqtbl1q_p8 (poly8x16_t a, uint8x16_t b)
-{
-  poly8x16_t result;
-  __asm__ ("tbl %0.16b, {%1.16b}, %2.16b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vqtbl1q_s8 (int8x16_t a, int8x16_t b)
-{
-  int8x16_t result;
-  __asm__ ("tbl %0.16b, {%1.16b}, %2.16b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
+           : "w"(temp), "w"(idx)
            : /* No clobbers */);
   return result;
 }
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vqtbl1q_u8 (uint8x16_t a, uint8x16_t b)
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vtbl1_p8 (poly8x8_t tab, uint8x8_t idx)
 {
-  uint8x16_t result;
-  __asm__ ("tbl %0.16b, {%1.16b}, %2.16b"
+  poly8x8_t result;
+  poly8x16_t temp = vcombine_p8 (tab, vcreate_p8 (UINT64_C (0x0)));
+  __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
            : "=w"(result)
-           : "w"(a), "w"(b)
+           : "w"(temp), "w"(idx)
            : /* No clobbers */);
   return result;
 }
 
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vqtbl2_s8 (int8x16x2_t tab, int8x8_t idx)
+vtbl2_s8 (int8x8x2_t tab, int8x8_t idx)
 {
   int8x8_t result;
-  __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t"
-          "tbl %0.8b, {v16.16b, v17.16b}, %2.8b\n\t"
-          :"=w"(result)
-          :"Q"(tab),"w"(idx)
-          :"memory", "v16", "v17");
+  int8x16_t temp = vcombine_s8 (tab.val[0], tab.val[1]);
+  __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
+           : "=w"(result)
+           : "w"(temp), "w"(idx)
+           : /* No clobbers */);
   return result;
 }
 
 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vqtbl2_u8 (uint8x16x2_t tab, uint8x8_t idx)
+vtbl2_u8 (uint8x8x2_t tab, uint8x8_t idx)
 {
   uint8x8_t result;
-  __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t"
-          "tbl %0.8b, {v16.16b, v17.16b}, %2.8b\n\t"
-          :"=w"(result)
-          :"Q"(tab),"w"(idx)
-          :"memory", "v16", "v17");
+  uint8x16_t temp = vcombine_u8 (tab.val[0], tab.val[1]);
+  __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
+           : "=w"(result)
+           : "w"(temp), "w"(idx)
+           : /* No clobbers */);
   return result;
 }
 
 __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vqtbl2_p8 (poly8x16x2_t tab, uint8x8_t idx)
+vtbl2_p8 (poly8x8x2_t tab, uint8x8_t idx)
 {
   poly8x8_t result;
-  __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t"
-          "tbl %0.8b, {v16.16b, v17.16b}, %2.8b\n\t"
-          :"=w"(result)
-          :"Q"(tab),"w"(idx)
-          :"memory", "v16", "v17");
+  poly8x16_t temp = vcombine_p8 (tab.val[0], tab.val[1]);
+  __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
+           : "=w"(result)
+           : "w"(temp), "w"(idx)
+           : /* No clobbers */);
   return result;
 }
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vqtbl2q_s8 (int8x16x2_t tab, int8x16_t idx)
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vtbl3_s8 (int8x8x3_t tab, int8x8_t idx)
 {
-  int8x16_t result;
-  __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t"
-          "tbl %0.16b, {v16.16b, v17.16b}, %2.16b\n\t"
-          :"=w"(result)
-          :"Q"(tab),"w"(idx)
-          :"memory", "v16", "v17");
+  int8x8_t result;
+  int8x16x2_t temp;
+  temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]);
+  temp.val[1] = vcombine_s8 (tab.val[2], vcreate_s8 (UINT64_C (0x0)));
+  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
+          "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
+           : "=w"(result)
+           : "Q"(temp), "w"(idx)
+           : "v16", "v17", "memory");
   return result;
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vqtbl2q_u8 (uint8x16x2_t tab, uint8x16_t idx)
-{
-  uint8x16_t result;
-  __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t"
-          "tbl %0.16b, {v16.16b, v17.16b}, %2.16b\n\t"
-          :"=w"(result)
-          :"Q"(tab),"w"(idx)
-          :"memory", "v16", "v17");
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vtbl3_u8 (uint8x8x3_t tab, uint8x8_t idx)
+{
+  uint8x8_t result;
+  uint8x16x2_t temp;
+  temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]);
+  temp.val[1] = vcombine_u8 (tab.val[2], vcreate_u8 (UINT64_C (0x0)));
+  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
+          "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
+           : "=w"(result)
+           : "Q"(temp), "w"(idx)
+           : "v16", "v17", "memory");
   return result;
 }
 
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vqtbl2q_p8 (poly8x16x2_t tab, uint8x16_t idx)
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vtbl3_p8 (poly8x8x3_t tab, uint8x8_t idx)
 {
-  poly8x16_t result;
-  __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t"
-          "tbl %0.16b, {v16.16b, v17.16b}, %2.16b\n\t"
-          :"=w"(result)
-          :"Q"(tab),"w"(idx)
-          :"memory", "v16", "v17");
+  poly8x8_t result;
+  poly8x16x2_t temp;
+  temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]);
+  temp.val[1] = vcombine_p8 (tab.val[2], vcreate_p8 (UINT64_C (0x0)));
+  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
+          "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
+           : "=w"(result)
+           : "Q"(temp), "w"(idx)
+           : "v16", "v17", "memory");
   return result;
 }
 
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vqtbl3_s8 (int8x16x3_t tab, int8x8_t idx)
+vtbl4_s8 (int8x8x4_t tab, int8x8_t idx)
 {
   int8x8_t result;
-  __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t"
-          "tbl %0.8b, {v16.16b - v18.16b}, %2.8b\n\t"
-          :"=w"(result)
-          :"Q"(tab),"w"(idx)
-          :"memory", "v16", "v17", "v18");
+  int8x16x2_t temp;
+  temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]);
+  temp.val[1] = vcombine_s8 (tab.val[2], tab.val[3]);
+  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
+          "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
+           : "=w"(result)
+           : "Q"(temp), "w"(idx)
+           : "v16", "v17", "memory");
   return result;
 }
 
 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vqtbl3_u8 (uint8x16x3_t tab, uint8x8_t idx)
+vtbl4_u8 (uint8x8x4_t tab, uint8x8_t idx)
 {
   uint8x8_t result;
-  __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t"
-          "tbl %0.8b, {v16.16b - v18.16b}, %2.8b\n\t"
-          :"=w"(result)
-          :"Q"(tab),"w"(idx)
-          :"memory", "v16", "v17", "v18");
+  uint8x16x2_t temp;
+  temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]);
+  temp.val[1] = vcombine_u8 (tab.val[2], tab.val[3]);
+  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
+          "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
+           : "=w"(result)
+           : "Q"(temp), "w"(idx)
+           : "v16", "v17", "memory");
   return result;
 }
 
 __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vqtbl3_p8 (poly8x16x3_t tab, uint8x8_t idx)
+vtbl4_p8 (poly8x8x4_t tab, uint8x8_t idx)
 {
   poly8x8_t result;
-  __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t"
-          "tbl %0.8b, {v16.16b - v18.16b}, %2.8b\n\t"
-          :"=w"(result)
-          :"Q"(tab),"w"(idx)
-          :"memory", "v16", "v17", "v18");
+  poly8x16x2_t temp;
+  temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]);
+  temp.val[1] = vcombine_p8 (tab.val[2], tab.val[3]);
+  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
+          "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
+           : "=w"(result)
+           : "Q"(temp), "w"(idx)
+           : "v16", "v17", "memory");
   return result;
 }
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vqtbl3q_s8 (int8x16x3_t tab, int8x16_t idx)
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vtbx1_s8 (int8x8_t r, int8x8_t tab, int8x8_t idx)
 {
-  int8x16_t result;
-  __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t"
-          "tbl %0.16b, {v16.16b - v18.16b}, %2.16b\n\t"
-          :"=w"(result)
-          :"Q"(tab),"w"(idx)
-          :"memory", "v16", "v17", "v18");
+  int8x8_t result;
+  int8x8_t tmp1;
+  int8x16_t temp = vcombine_s8 (tab, vcreate_s8 (UINT64_C (0x0)));
+  __asm__ ("movi %0.8b, 8\n\t"
+          "cmhs %0.8b, %3.8b, %0.8b\n\t"
+          "tbl %1.8b, {%2.16b}, %3.8b\n\t"
+          "bsl %0.8b, %4.8b, %1.8b\n\t"
+           : "+w"(result), "=w"(tmp1)
+           : "w"(temp), "w"(idx), "w"(r)
+           : /* No clobbers */);
   return result;
 }
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vqtbl3q_u8 (uint8x16x3_t tab, uint8x16_t idx)
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vtbx1_u8 (uint8x8_t r, uint8x8_t tab, uint8x8_t idx)
 {
-  uint8x16_t result;
-  __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t"
-          "tbl %0.16b, {v16.16b - v18.16b}, %2.16b\n\t"
-          :"=w"(result)
-          :"Q"(tab),"w"(idx)
-          :"memory", "v16", "v17", "v18");
+  uint8x8_t result;
+  uint8x8_t tmp1;
+  uint8x16_t temp = vcombine_u8 (tab, vcreate_u8 (UINT64_C (0x0)));
+  __asm__ ("movi %0.8b, 8\n\t"
+          "cmhs %0.8b, %3.8b, %0.8b\n\t"
+          "tbl %1.8b, {%2.16b}, %3.8b\n\t"
+          "bsl %0.8b, %4.8b, %1.8b\n\t"
+           : "+w"(result), "=w"(tmp1)
+           : "w"(temp), "w"(idx), "w"(r)
+           : /* No clobbers */);
   return result;
 }
 
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vqtbl3q_p8 (poly8x16x3_t tab, uint8x16_t idx)
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vtbx1_p8 (poly8x8_t r, poly8x8_t tab, uint8x8_t idx)
 {
-  poly8x16_t result;
-  __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t"
-          "tbl %0.16b, {v16.16b - v18.16b}, %2.16b\n\t"
-          :"=w"(result)
-          :"Q"(tab),"w"(idx)
-          :"memory", "v16", "v17", "v18");
+  poly8x8_t result;
+  poly8x8_t tmp1;
+  poly8x16_t temp = vcombine_p8 (tab, vcreate_p8 (UINT64_C (0x0)));
+  __asm__ ("movi %0.8b, 8\n\t"
+          "cmhs %0.8b, %3.8b, %0.8b\n\t"
+          "tbl %1.8b, {%2.16b}, %3.8b\n\t"
+          "bsl %0.8b, %4.8b, %1.8b\n\t"
+           : "+w"(result), "=w"(tmp1)
+           : "w"(temp), "w"(idx), "w"(r)
+           : /* No clobbers */);
   return result;
 }
 
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vqtbl4_s8 (int8x16x4_t tab, int8x8_t idx)
+vtbx2_s8 (int8x8_t r, int8x8x2_t tab, int8x8_t idx)
 {
-  int8x8_t result;
-  __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t"
-          "tbl %0.8b, {v16.16b - v19.16b}, %2.8b\n\t"
-          :"=w"(result)
-          :"Q"(tab),"w"(idx)
-          :"memory", "v16", "v17", "v18", "v19");
+  int8x8_t result = r;
+  int8x16_t temp = vcombine_s8 (tab.val[0], tab.val[1]);
+  __asm__ ("tbx %0.8b, {%1.16b}, %2.8b"
+           : "+w"(result)
+           : "w"(temp), "w"(idx)
+           : /* No clobbers */);
   return result;
 }
 
 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vqtbl4_u8 (uint8x16x4_t tab, uint8x8_t idx)
+vtbx2_u8 (uint8x8_t r, uint8x8x2_t tab, uint8x8_t idx)
 {
-  uint8x8_t result;
-  __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t"
-          "tbl %0.8b, {v16.16b - v19.16b}, %2.8b\n\t"
-          :"=w"(result)
-          :"Q"(tab),"w"(idx)
-          :"memory", "v16", "v17", "v18", "v19");
+  uint8x8_t result = r;
+  uint8x16_t temp = vcombine_u8 (tab.val[0], tab.val[1]);
+  __asm__ ("tbx %0.8b, {%1.16b}, %2.8b"
+           : "+w"(result)
+           : "w"(temp), "w"(idx)
+           : /* No clobbers */);
   return result;
 }
 
 __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vqtbl4_p8 (poly8x16x4_t tab, uint8x8_t idx)
+vtbx2_p8 (poly8x8_t r, poly8x8x2_t tab, uint8x8_t idx)
 {
-  poly8x8_t result;
-  __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t"
-          "tbl %0.8b, {v16.16b - v19.16b}, %2.8b\n\t"
-          :"=w"(result)
-          :"Q"(tab),"w"(idx)
-          :"memory", "v16", "v17", "v18", "v19");
+  poly8x8_t result = r;
+  poly8x16_t temp = vcombine_p8 (tab.val[0], tab.val[1]);
+  __asm__ ("tbx %0.8b, {%1.16b}, %2.8b"
+           : "+w"(result)
+           : "w"(temp), "w"(idx)
+           : /* No clobbers */);
   return result;
 }
 
-
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vqtbl4q_s8 (int8x16x4_t tab, int8x16_t idx)
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vtbx3_s8 (int8x8_t r, int8x8x3_t tab, int8x8_t idx)
 {
-  int8x16_t result;
-  __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t"
-          "tbl %0.16b, {v16.16b - v19.16b}, %2.16b\n\t"
-          :"=w"(result)
-          :"Q"(tab),"w"(idx)
-          :"memory", "v16", "v17", "v18", "v19");
+  int8x8_t result;
+  int8x8_t tmp1;
+  int8x16x2_t temp;
+  temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]);
+  temp.val[1] = vcombine_s8 (tab.val[2], vcreate_s8 (UINT64_C (0x0)));
+  __asm__ ("ld1 {v16.16b - v17.16b}, %2\n\t"
+          "movi %0.8b, 24\n\t"
+          "cmhs %0.8b, %3.8b, %0.8b\n\t"
+          "tbl %1.8b, {v16.16b - v17.16b}, %3.8b\n\t"
+          "bsl %0.8b, %4.8b, %1.8b\n\t"
+           : "+w"(result), "=w"(tmp1)
+           : "Q"(temp), "w"(idx), "w"(r)
+           : "v16", "v17", "memory");
   return result;
 }
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vqtbl4q_u8 (uint8x16x4_t tab, uint8x16_t idx)
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vtbx3_u8 (uint8x8_t r, uint8x8x3_t tab, uint8x8_t idx)
 {
-  uint8x16_t result;
-  __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t"
-          "tbl %0.16b, {v16.16b - v19.16b}, %2.16b\n\t"
-          :"=w"(result)
-          :"Q"(tab),"w"(idx)
-          :"memory", "v16", "v17", "v18", "v19");
+  uint8x8_t result;
+  uint8x8_t tmp1;
+  uint8x16x2_t temp;
+  temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]);
+  temp.val[1] = vcombine_u8 (tab.val[2], vcreate_u8 (UINT64_C (0x0)));
+  __asm__ ("ld1 {v16.16b - v17.16b}, %2\n\t"
+          "movi %0.8b, 24\n\t"
+          "cmhs %0.8b, %3.8b, %0.8b\n\t"
+          "tbl %1.8b, {v16.16b - v17.16b}, %3.8b\n\t"
+          "bsl %0.8b, %4.8b, %1.8b\n\t"
+           : "+w"(result), "=w"(tmp1)
+           : "Q"(temp), "w"(idx), "w"(r)
+           : "v16", "v17", "memory");
   return result;
 }
 
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vqtbl4q_p8 (poly8x16x4_t tab, uint8x16_t idx)
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vtbx3_p8 (poly8x8_t r, poly8x8x3_t tab, uint8x8_t idx)
 {
-  poly8x16_t result;
-  __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t"
-          "tbl %0.16b, {v16.16b - v19.16b}, %2.16b\n\t"
-          :"=w"(result)
-          :"Q"(tab),"w"(idx)
-          :"memory", "v16", "v17", "v18", "v19");
+  poly8x8_t result;
+  poly8x8_t tmp1;
+  poly8x16x2_t temp;
+  temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]);
+  temp.val[1] = vcombine_p8 (tab.val[2], vcreate_p8 (UINT64_C (0x0)));
+  __asm__ ("ld1 {v16.16b - v17.16b}, %2\n\t"
+          "movi %0.8b, 24\n\t"
+          "cmhs %0.8b, %3.8b, %0.8b\n\t"
+          "tbl %1.8b, {v16.16b - v17.16b}, %3.8b\n\t"
+          "bsl %0.8b, %4.8b, %1.8b\n\t"
+           : "+w"(result), "=w"(tmp1)
+           : "Q"(temp), "w"(idx), "w"(r)
+           : "v16", "v17", "memory");
   return result;
 }
 
-
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vqtbx1_s8 (int8x8_t r, int8x16_t tab, int8x8_t idx)
+vtbx4_s8 (int8x8_t r, int8x8x4_t tab, int8x8_t idx)
 {
   int8x8_t result = r;
-  __asm__ ("tbx %0.8b,{%1.16b},%2.8b"
+  int8x16x2_t temp;
+  temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]);
+  temp.val[1] = vcombine_s8 (tab.val[2], tab.val[3]);
+  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
+          "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
            : "+w"(result)
-           : "w"(tab), "w"(idx)
-           : /* No clobbers */);
+           : "Q"(temp), "w"(idx)
+           : "v16", "v17", "memory");
   return result;
 }
 
 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vqtbx1_u8 (uint8x8_t r, uint8x16_t tab, uint8x8_t idx)
+vtbx4_u8 (uint8x8_t r, uint8x8x4_t tab, uint8x8_t idx)
 {
   uint8x8_t result = r;
-  __asm__ ("tbx %0.8b,{%1.16b},%2.8b"
+  uint8x16x2_t temp;
+  temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]);
+  temp.val[1] = vcombine_u8 (tab.val[2], tab.val[3]);
+  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
+          "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
            : "+w"(result)
-           : "w"(tab), "w"(idx)
-           : /* No clobbers */);
+           : "Q"(temp), "w"(idx)
+           : "v16", "v17", "memory");
   return result;
 }
 
 __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vqtbx1_p8 (poly8x8_t r, poly8x16_t tab, uint8x8_t idx)
+vtbx4_p8 (poly8x8_t r, poly8x8x4_t tab, uint8x8_t idx)
 {
   poly8x8_t result = r;
-  __asm__ ("tbx %0.8b,{%1.16b},%2.8b"
+  poly8x16x2_t temp;
+  temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]);
+  temp.val[1] = vcombine_p8 (tab.val[2], tab.val[3]);
+  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
+          "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
            : "+w"(result)
-           : "w"(tab), "w"(idx)
-           : /* No clobbers */);
+           : "Q"(temp), "w"(idx)
+           : "v16", "v17", "memory");
   return result;
 }
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vqtbx1q_s8 (int8x16_t r, int8x16_t tab, int8x16_t idx)
+/* End of temporary inline asm.  */
+
+/* Start of optimal implementations in approved order.  */
+
+/* vabs  */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vabs_f32 (float32x2_t __a)
 {
-  int8x16_t result = r;
-  __asm__ ("tbx %0.16b,{%1.16b},%2.16b"
-           : "+w"(result)
-           : "w"(tab), "w"(idx)
-           : /* No clobbers */);
-  return result;
+  return __builtin_aarch64_absv2sf (__a);
 }
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vqtbx1q_u8 (uint8x16_t r, uint8x16_t tab, uint8x16_t idx)
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vabsq_f32 (float32x4_t __a)
 {
-  uint8x16_t result = r;
-  __asm__ ("tbx %0.16b,{%1.16b},%2.16b"
-           : "+w"(result)
-           : "w"(tab), "w"(idx)
-           : /* No clobbers */);
-  return result;
+  return __builtin_aarch64_absv4sf (__a);
 }
 
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vqtbx1q_p8 (poly8x16_t r, poly8x16_t tab, uint8x16_t idx)
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vabsq_f64 (float64x2_t __a)
 {
-  poly8x16_t result = r;
-  __asm__ ("tbx %0.16b,{%1.16b},%2.16b"
-           : "+w"(result)
-           : "w"(tab), "w"(idx)
-           : /* No clobbers */);
-  return result;
+  return __builtin_aarch64_absv2df (__a);
 }
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vqtbx2_s8 (int8x8_t r, int8x16x2_t tab, int8x8_t idx)
+/* vadd */
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vaddd_s64 (int64x1_t __a, int64x1_t __b)
 {
-  int8x8_t result = r;
-  __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t"
-          "tbx %0.8b, {v16.16b, v17.16b}, %2.8b\n\t"
-          :"+w"(result)
-          :"Q"(tab),"w"(idx)
-          :"memory", "v16", "v17");
-  return result;
+  return __a + __b;
 }
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vqtbx2_u8 (uint8x8_t r, uint8x16x2_t tab, uint8x8_t idx)
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vaddd_u64 (uint64x1_t __a, uint64x1_t __b)
 {
-  uint8x8_t result = r;
-  __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t"
-          "tbx %0.8b, {v16.16b, v17.16b}, %2.8b\n\t"
-          :"+w"(result)
-          :"Q"(tab),"w"(idx)
-          :"memory", "v16", "v17");
-  return result;
+  return __a + __b;
 }
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vqtbx2_p8 (poly8x8_t r, poly8x16x2_t tab, uint8x8_t idx)
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vaddv_f32 (float32x2_t __a)
 {
-  poly8x8_t result = r;
-  __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t"
-          "tbx %0.8b, {v16.16b, v17.16b}, %2.8b\n\t"
-          :"+w"(result)
-          :"Q"(tab),"w"(idx)
-          :"memory", "v16", "v17");
-  return result;
+  float32x2_t t = __builtin_aarch64_addvv2sf (__a);
+  return vget_lane_f32 (t, 0);
 }
 
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vaddvq_f32 (float32x4_t __a)
+{
+  float32x4_t t = __builtin_aarch64_addvv4sf (__a);
+  return vgetq_lane_f32 (t, 0);
+}
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vqtbx2q_s8 (int8x16_t r, int8x16x2_t tab, int8x16_t idx)
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vaddvq_f64 (float64x2_t __a)
 {
-  int8x16_t result = r;
-  __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t"
-          "tbx %0.16b, {v16.16b, v17.16b}, %2.16b\n\t"
-          :"+w"(result)
-          :"Q"(tab),"w"(idx)
-          :"memory", "v16", "v17");
-  return result;
+  float64x2_t t = __builtin_aarch64_addvv2df (__a);
+  return vgetq_lane_f64 (t, 0);
 }
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vqtbx2q_u8 (uint8x16_t r, uint8x16x2_t tab, uint8x16_t idx)
+/* vceq */
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vceq_p8 (poly8x8_t __a, poly8x8_t __b)
 {
-  uint8x16_t result = r;
-  __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t"
-          "tbx %0.16b, {v16.16b, v17.16b}, %2.16b\n\t"
-          :"+w"(result)
-          :"Q"(tab),"w"(idx)
-          :"memory", "v16", "v17");
-  return result;
+  return (uint8x8_t) __builtin_aarch64_cmeqv8qi ((int8x8_t) __a,
+                                                (int8x8_t) __b);
 }
 
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vqtbx2q_p8 (poly8x16_t r, poly8x16x2_t tab, uint8x16_t idx)
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vceq_s8 (int8x8_t __a, int8x8_t __b)
 {
-  poly8x16_t result = r;
-  __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t"
-          "tbx %0.16b, {v16.16b, v17.16b}, %2.16b\n\t"
-          :"+w"(result)
-          :"Q"(tab),"w"(idx)
-          :"memory", "v16", "v17");
-  return result;
+  return (uint8x8_t) __builtin_aarch64_cmeqv8qi (__a, __b);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vceq_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (uint16x4_t) __builtin_aarch64_cmeqv4hi (__a, __b);
 }
 
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vceq_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (uint32x2_t) __builtin_aarch64_cmeqv2si (__a, __b);
+}
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vqtbx3_s8 (int8x8_t r, int8x16x3_t tab, int8x8_t idx)
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vceq_s64 (int64x1_t __a, int64x1_t __b)
 {
-  int8x8_t result = r;
-  __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t"
-          "tbx %0.8b, {v16.16b - v18.16b}, %2.8b\n\t"
-          :"+w"(result)
-          :"Q"(tab),"w"(idx)
-          :"memory", "v16", "v17", "v18");
-  return result;
+  return (uint64x1_t) __builtin_aarch64_cmeqdi (__a, __b);
 }
 
 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vqtbx3_u8 (uint8x8_t r, uint8x16x3_t tab, uint8x8_t idx)
+vceq_u8 (uint8x8_t __a, uint8x8_t __b)
 {
-  uint8x8_t result = r;
-  __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t"
-          "tbx %0.8b, {v16.16b - v18.16b}, %2.8b\n\t"
-          :"+w"(result)
-          :"Q"(tab),"w"(idx)
-          :"memory", "v16", "v17", "v18");
-  return result;
+  return (uint8x8_t) __builtin_aarch64_cmeqv8qi ((int8x8_t) __a,
+                                                (int8x8_t) __b);
 }
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vqtbx3_p8 (poly8x8_t r, poly8x16x3_t tab, uint8x8_t idx)
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vceq_u16 (uint16x4_t __a, uint16x4_t __b)
 {
-  poly8x8_t result = r;
-  __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t"
-          "tbx %0.8b, {v16.16b - v18.16b}, %2.8b\n\t"
-          :"+w"(result)
-          :"Q"(tab),"w"(idx)
-          :"memory", "v16", "v17", "v18");
-  return result;
+  return (uint16x4_t) __builtin_aarch64_cmeqv4hi ((int16x4_t) __a,
+                                                 (int16x4_t) __b);
 }
 
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vceq_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return (uint32x2_t) __builtin_aarch64_cmeqv2si ((int32x2_t) __a,
+                                                 (int32x2_t) __b);
+}
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vqtbx3q_s8 (int8x16_t r, int8x16x3_t tab, int8x16_t idx)
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vceq_u64 (uint64x1_t __a, uint64x1_t __b)
 {
-  int8x16_t result = r;
-  __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t"
-          "tbx %0.16b, {v16.16b - v18.16b}, %2.16b\n\t"
-          :"+w"(result)
-          :"Q"(tab),"w"(idx)
-          :"memory", "v16", "v17", "v18");
-  return result;
+  return (uint64x1_t) __builtin_aarch64_cmeqdi ((int64x1_t) __a,
+                                               (int64x1_t) __b);
 }
 
 __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vqtbx3q_u8 (uint8x16_t r, uint8x16x3_t tab, uint8x16_t idx)
+vceqq_p8 (poly8x16_t __a, poly8x16_t __b)
 {
-  uint8x16_t result = r;
-  __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t"
-          "tbx %0.16b, {v16.16b - v18.16b}, %2.16b\n\t"
-          :"+w"(result)
-          :"Q"(tab),"w"(idx)
-          :"memory", "v16", "v17", "v18");
-  return result;
+  return (uint8x16_t) __builtin_aarch64_cmeqv16qi ((int8x16_t) __a,
+                                                  (int8x16_t) __b);
 }
 
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vqtbx3q_p8 (poly8x16_t r, poly8x16x3_t tab, uint8x16_t idx)
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vceqq_s8 (int8x16_t __a, int8x16_t __b)
 {
-  poly8x16_t result = r;
-  __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t"
-          "tbx %0.16b, {v16.16b - v18.16b}, %2.16b\n\t"
-          :"+w"(result)
-          :"Q"(tab),"w"(idx)
-          :"memory", "v16", "v17", "v18");
-  return result;
+  return (uint8x16_t) __builtin_aarch64_cmeqv16qi (__a, __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vceqq_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return (uint16x8_t) __builtin_aarch64_cmeqv8hi (__a, __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vceqq_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return (uint32x4_t) __builtin_aarch64_cmeqv4si (__a, __b);
 }
 
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vceqq_s64 (int64x2_t __a, int64x2_t __b)
+{
+  return (uint64x2_t) __builtin_aarch64_cmeqv2di (__a, __b);
+}
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vqtbx4_s8 (int8x8_t r, int8x16x4_t tab, int8x8_t idx)
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vceqq_u8 (uint8x16_t __a, uint8x16_t __b)
 {
-  int8x8_t result = r;
-  __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t"
-          "tbx %0.8b, {v16.16b - v19.16b}, %2.8b\n\t"
-          :"+w"(result)
-          :"Q"(tab),"w"(idx)
-          :"memory", "v16", "v17", "v18", "v19");
-  return result;
+  return (uint8x16_t) __builtin_aarch64_cmeqv16qi ((int8x16_t) __a,
+                                                  (int8x16_t) __b);
 }
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vqtbx4_u8 (uint8x8_t r, uint8x16x4_t tab, uint8x8_t idx)
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vceqq_u16 (uint16x8_t __a, uint16x8_t __b)
 {
-  uint8x8_t result = r;
-  __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t"
-          "tbx %0.8b, {v16.16b - v19.16b}, %2.8b\n\t"
-          :"+w"(result)
-          :"Q"(tab),"w"(idx)
-          :"memory", "v16", "v17", "v18", "v19");
-  return result;
+  return (uint16x8_t) __builtin_aarch64_cmeqv8hi ((int16x8_t) __a,
+                                                 (int16x8_t) __b);
 }
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vqtbx4_p8 (poly8x8_t r, poly8x16x4_t tab, uint8x8_t idx)
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vceqq_u32 (uint32x4_t __a, uint32x4_t __b)
 {
-  poly8x8_t result = r;
-  __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t"
-          "tbx %0.8b, {v16.16b - v19.16b}, %2.8b\n\t"
-          :"+w"(result)
-          :"Q"(tab),"w"(idx)
-          :"memory", "v16", "v17", "v18", "v19");
-  return result;
+  return (uint32x4_t) __builtin_aarch64_cmeqv4si ((int32x4_t) __a,
+                                                 (int32x4_t) __b);
 }
 
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vceqq_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+  return (uint64x2_t) __builtin_aarch64_cmeqv2di ((int64x2_t) __a,
+                                                 (int64x2_t) __b);
+}
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vqtbx4q_s8 (int8x16_t r, int8x16x4_t tab, int8x16_t idx)
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vceqd_s64 (int64x1_t __a, int64x1_t __b)
 {
-  int8x16_t result = r;
-  __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t"
-          "tbx %0.16b, {v16.16b - v19.16b}, %2.16b\n\t"
-          :"+w"(result)
-          :"Q"(tab),"w"(idx)
-          :"memory", "v16", "v17", "v18", "v19");
-  return result;
+  return (uint64x1_t) __builtin_aarch64_cmeqdi (__a, __b);
 }
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vqtbx4q_u8 (uint8x16_t r, uint8x16x4_t tab, uint8x16_t idx)
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vceqd_u64 (uint64x1_t __a, uint64x1_t __b)
 {
-  uint8x16_t result = r;
-  __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t"
-          "tbx %0.16b, {v16.16b - v19.16b}, %2.16b\n\t"
-          :"+w"(result)
-          :"Q"(tab),"w"(idx)
-          :"memory", "v16", "v17", "v18", "v19");
-  return result;
+  return (uint64x1_t) __builtin_aarch64_cmeqdi (__a, __b);
 }
 
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vqtbx4q_p8 (poly8x16_t r, poly8x16x4_t tab, uint8x16_t idx)
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vceqzd_s64 (int64x1_t __a)
 {
-  poly8x16_t result = r;
-  __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t"
-          "tbx %0.16b, {v16.16b - v19.16b}, %2.16b\n\t"
-          :"+w"(result)
-          :"Q"(tab),"w"(idx)
-          :"memory", "v16", "v17", "v18", "v19");
-  return result;
+  return (uint64x1_t) __builtin_aarch64_cmeqdi (__a, 0);
 }
 
-/* V7 legacy table intrinsics.  */
+/* vcge */
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vtbl1_s8 (int8x8_t tab, int8x8_t idx)
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vcge_s8 (int8x8_t __a, int8x8_t __b)
 {
-  int8x8_t result;
-  int8x16_t temp = vcombine_s8 (tab, vcreate_s8 (UINT64_C (0x0)));
-  __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
-           : "=w"(result)
-           : "w"(temp), "w"(idx)
-           : /* No clobbers */);
-  return result;
+  return (uint8x8_t) __builtin_aarch64_cmgev8qi (__a, __b);
 }
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vtbl1_u8 (uint8x8_t tab, uint8x8_t idx)
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcge_s16 (int16x4_t __a, int16x4_t __b)
 {
-  uint8x8_t result;
-  uint8x16_t temp = vcombine_u8 (tab, vcreate_u8 (UINT64_C (0x0)));
-  __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
-           : "=w"(result)
-           : "w"(temp), "w"(idx)
-           : /* No clobbers */);
-  return result;
+  return (uint16x4_t) __builtin_aarch64_cmgev4hi (__a, __b);
 }
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vtbl1_p8 (poly8x8_t tab, uint8x8_t idx)
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcge_s32 (int32x2_t __a, int32x2_t __b)
 {
-  poly8x8_t result;
-  poly8x16_t temp = vcombine_p8 (tab, vcreate_p8 (UINT64_C (0x0)));
-  __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
-           : "=w"(result)
-           : "w"(temp), "w"(idx)
-           : /* No clobbers */);
-  return result;
+  return (uint32x2_t) __builtin_aarch64_cmgev2si (__a, __b);
 }
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vtbl2_s8 (int8x8x2_t tab, int8x8_t idx)
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vcge_s64 (int64x1_t __a, int64x1_t __b)
 {
-  int8x8_t result;
-  int8x16_t temp = vcombine_s8 (tab.val[0], tab.val[1]);
-  __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
-           : "=w"(result)
-           : "w"(temp), "w"(idx)
-           : /* No clobbers */);
-  return result;
+  return (uint64x1_t) __builtin_aarch64_cmgedi (__a, __b);
 }
 
 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vtbl2_u8 (uint8x8x2_t tab, uint8x8_t idx)
+vcge_u8 (uint8x8_t __a, uint8x8_t __b)
 {
-  uint8x8_t result;
-  uint8x16_t temp = vcombine_u8 (tab.val[0], tab.val[1]);
-  __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
-           : "=w"(result)
-           : "w"(temp), "w"(idx)
-           : /* No clobbers */);
-  return result;
+  return (uint8x8_t) __builtin_aarch64_cmhsv8qi ((int8x8_t) __a,
+                                                (int8x8_t) __b);
 }
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vtbl2_p8 (poly8x8x2_t tab, uint8x8_t idx)
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcge_u16 (uint16x4_t __a, uint16x4_t __b)
 {
-  poly8x8_t result;
-  poly8x16_t temp = vcombine_p8 (tab.val[0], tab.val[1]);
-  __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
-           : "=w"(result)
-           : "w"(temp), "w"(idx)
-           : /* No clobbers */);
-  return result;
+  return (uint16x4_t) __builtin_aarch64_cmhsv4hi ((int16x4_t) __a,
+                                                 (int16x4_t) __b);
 }
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vtbl3_s8 (int8x8x3_t tab, int8x8_t idx)
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcge_u32 (uint32x2_t __a, uint32x2_t __b)
 {
-  int8x8_t result;
-  int8x16x2_t temp;
-  temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]);
-  temp.val[1] = vcombine_s8 (tab.val[2], vcreate_s8 (UINT64_C (0x0)));
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-          "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "=w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
-  return result;
+  return (uint32x2_t) __builtin_aarch64_cmhsv2si ((int32x2_t) __a,
+                                                 (int32x2_t) __b);
 }
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vtbl3_u8 (uint8x8x3_t tab, uint8x8_t idx)
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vcge_u64 (uint64x1_t __a, uint64x1_t __b)
 {
-  uint8x8_t result;
-  uint8x16x2_t temp;
-  temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]);
-  temp.val[1] = vcombine_u8 (tab.val[2], vcreate_u8 (UINT64_C (0x0)));
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-          "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "=w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
-  return result;
+  return (uint64x1_t) __builtin_aarch64_cmhsdi ((int64x1_t) __a,
+                                               (int64x1_t) __b);
 }
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vtbl3_p8 (poly8x8x3_t tab, uint8x8_t idx)
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vcgeq_s8 (int8x16_t __a, int8x16_t __b)
 {
-  poly8x8_t result;
-  poly8x16x2_t temp;
-  temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]);
-  temp.val[1] = vcombine_p8 (tab.val[2], vcreate_p8 (UINT64_C (0x0)));
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-          "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "=w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
-  return result;
+  return (uint8x16_t) __builtin_aarch64_cmgev16qi (__a, __b);
 }
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vtbl4_s8 (int8x8x4_t tab, int8x8_t idx)
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcgeq_s16 (int16x8_t __a, int16x8_t __b)
 {
-  int8x8_t result;
-  int8x16x2_t temp;
-  temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]);
-  temp.val[1] = vcombine_s8 (tab.val[2], tab.val[3]);
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-          "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "=w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
-  return result;
+  return (uint16x8_t) __builtin_aarch64_cmgev8hi (__a, __b);
 }
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vtbl4_u8 (uint8x8x4_t tab, uint8x8_t idx)
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcgeq_s32 (int32x4_t __a, int32x4_t __b)
 {
-  uint8x8_t result;
-  uint8x16x2_t temp;
-  temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]);
-  temp.val[1] = vcombine_u8 (tab.val[2], tab.val[3]);
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-          "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "=w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
-  return result;
+  return (uint32x4_t) __builtin_aarch64_cmgev4si (__a, __b);
 }
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vtbl4_p8 (poly8x8x4_t tab, uint8x8_t idx)
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vcgeq_s64 (int64x2_t __a, int64x2_t __b)
 {
-  poly8x8_t result;
-  poly8x16x2_t temp;
-  temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]);
-  temp.val[1] = vcombine_p8 (tab.val[2], tab.val[3]);
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-          "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "=w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
-  return result;
+  return (uint64x2_t) __builtin_aarch64_cmgev2di (__a, __b);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vcgeq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+  return (uint8x16_t) __builtin_aarch64_cmhsv16qi ((int8x16_t) __a,
+                                                  (int8x16_t) __b);
 }
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vtbx1_s8 (int8x8_t r, int8x8_t tab, int8x8_t idx)
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcgeq_u16 (uint16x8_t __a, uint16x8_t __b)
 {
-  int8x8_t result;
-  int8x8_t tmp1;
-  int8x16_t temp = vcombine_s8 (tab, vcreate_s8 (UINT64_C (0x0)));
-  __asm__ ("movi %0.8b, 8\n\t"
-          "cmhs %0.8b, %3.8b, %0.8b\n\t"
-          "tbl %1.8b, {%2.16b}, %3.8b\n\t"
-          "bsl %0.8b, %4.8b, %1.8b\n\t"
-           : "+w"(result), "=w"(tmp1)
-           : "w"(temp), "w"(idx), "w"(r)
-           : /* No clobbers */);
-  return result;
+  return (uint16x8_t) __builtin_aarch64_cmhsv8hi ((int16x8_t) __a,
+                                                 (int16x8_t) __b);
 }
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vtbx1_u8 (uint8x8_t r, uint8x8_t tab, uint8x8_t idx)
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcgeq_u32 (uint32x4_t __a, uint32x4_t __b)
 {
-  uint8x8_t result;
-  uint8x8_t tmp1;
-  uint8x16_t temp = vcombine_u8 (tab, vcreate_u8 (UINT64_C (0x0)));
-  __asm__ ("movi %0.8b, 8\n\t"
-          "cmhs %0.8b, %3.8b, %0.8b\n\t"
-          "tbl %1.8b, {%2.16b}, %3.8b\n\t"
-          "bsl %0.8b, %4.8b, %1.8b\n\t"
-           : "+w"(result), "=w"(tmp1)
-           : "w"(temp), "w"(idx), "w"(r)
-           : /* No clobbers */);
-  return result;
+  return (uint32x4_t) __builtin_aarch64_cmhsv4si ((int32x4_t) __a,
+                                                 (int32x4_t) __b);
 }
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vtbx1_p8 (poly8x8_t r, poly8x8_t tab, uint8x8_t idx)
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vcgeq_u64 (uint64x2_t __a, uint64x2_t __b)
 {
-  poly8x8_t result;
-  poly8x8_t tmp1;
-  poly8x16_t temp = vcombine_p8 (tab, vcreate_p8 (UINT64_C (0x0)));
-  __asm__ ("movi %0.8b, 8\n\t"
-          "cmhs %0.8b, %3.8b, %0.8b\n\t"
-          "tbl %1.8b, {%2.16b}, %3.8b\n\t"
-          "bsl %0.8b, %4.8b, %1.8b\n\t"
-           : "+w"(result), "=w"(tmp1)
-           : "w"(temp), "w"(idx), "w"(r)
-           : /* No clobbers */);
-  return result;
+  return (uint64x2_t) __builtin_aarch64_cmhsv2di ((int64x2_t) __a,
+                                                 (int64x2_t) __b);
 }
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vtbx2_s8 (int8x8_t r, int8x8x2_t tab, int8x8_t idx)
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vcged_s64 (int64x1_t __a, int64x1_t __b)
 {
-  int8x8_t result = r;
-  int8x16_t temp = vcombine_s8 (tab.val[0], tab.val[1]);
-  __asm__ ("tbx %0.8b, {%1.16b}, %2.8b"
-           : "+w"(result)
-           : "w"(temp), "w"(idx)
-           : /* No clobbers */);
-  return result;
+  return (uint64x1_t) __builtin_aarch64_cmgedi (__a, __b);
 }
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vtbx2_u8 (uint8x8_t r, uint8x8x2_t tab, uint8x8_t idx)
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vcged_u64 (uint64x1_t __a, uint64x1_t __b)
 {
-  uint8x8_t result = r;
-  uint8x16_t temp = vcombine_u8 (tab.val[0], tab.val[1]);
-  __asm__ ("tbx %0.8b, {%1.16b}, %2.8b"
-           : "+w"(result)
-           : "w"(temp), "w"(idx)
-           : /* No clobbers */);
-  return result;
+  return (uint64x1_t) __builtin_aarch64_cmhsdi ((int64x1_t) __a,
+                                               (int64x1_t) __b);
 }
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vtbx2_p8 (poly8x8_t r, poly8x8x2_t tab, uint8x8_t idx)
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vcgezd_s64 (int64x1_t __a)
 {
-  poly8x8_t result = r;
-  poly8x16_t temp = vcombine_p8 (tab.val[0], tab.val[1]);
-  __asm__ ("tbx %0.8b, {%1.16b}, %2.8b"
-           : "+w"(result)
-           : "w"(temp), "w"(idx)
-           : /* No clobbers */);
-  return result;
+  return (uint64x1_t) __builtin_aarch64_cmgedi (__a, 0);
 }
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vtbx3_s8 (int8x8_t r, int8x8x3_t tab, int8x8_t idx)
+/* vcgt */
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vcgt_s8 (int8x8_t __a, int8x8_t __b)
 {
-  int8x8_t result;
-  int8x8_t tmp1;
-  int8x16x2_t temp;
-  temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]);
-  temp.val[1] = vcombine_s8 (tab.val[2], vcreate_s8 (UINT64_C (0x0)));
-  __asm__ ("ld1 {v16.16b - v17.16b}, %2\n\t"
-          "movi %0.8b, 24\n\t"
-          "cmhs %0.8b, %3.8b, %0.8b\n\t"
-          "tbl %1.8b, {v16.16b - v17.16b}, %3.8b\n\t"
-          "bsl %0.8b, %4.8b, %1.8b\n\t"
-           : "+w"(result), "=w"(tmp1)
-           : "Q"(temp), "w"(idx), "w"(r)
-           : "v16", "v17", "memory");
-  return result;
+  return (uint8x8_t) __builtin_aarch64_cmgtv8qi (__a, __b);
 }
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vtbx3_u8 (uint8x8_t r, uint8x8x3_t tab, uint8x8_t idx)
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcgt_s16 (int16x4_t __a, int16x4_t __b)
 {
-  uint8x8_t result;
-  uint8x8_t tmp1;
-  uint8x16x2_t temp;
-  temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]);
-  temp.val[1] = vcombine_u8 (tab.val[2], vcreate_u8 (UINT64_C (0x0)));
-  __asm__ ("ld1 {v16.16b - v17.16b}, %2\n\t"
-          "movi %0.8b, 24\n\t"
-          "cmhs %0.8b, %3.8b, %0.8b\n\t"
-          "tbl %1.8b, {v16.16b - v17.16b}, %3.8b\n\t"
-          "bsl %0.8b, %4.8b, %1.8b\n\t"
-           : "+w"(result), "=w"(tmp1)
-           : "Q"(temp), "w"(idx), "w"(r)
-           : "v16", "v17", "memory");
-  return result;
+  return (uint16x4_t) __builtin_aarch64_cmgtv4hi (__a, __b);
 }
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vtbx3_p8 (poly8x8_t r, poly8x8x3_t tab, uint8x8_t idx)
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcgt_s32 (int32x2_t __a, int32x2_t __b)
 {
-  poly8x8_t result;
-  poly8x8_t tmp1;
-  poly8x16x2_t temp;
-  temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]);
-  temp.val[1] = vcombine_p8 (tab.val[2], vcreate_p8 (UINT64_C (0x0)));
-  __asm__ ("ld1 {v16.16b - v17.16b}, %2\n\t"
-          "movi %0.8b, 24\n\t"
-          "cmhs %0.8b, %3.8b, %0.8b\n\t"
-          "tbl %1.8b, {v16.16b - v17.16b}, %3.8b\n\t"
-          "bsl %0.8b, %4.8b, %1.8b\n\t"
-           : "+w"(result), "=w"(tmp1)
-           : "Q"(temp), "w"(idx), "w"(r)
-           : "v16", "v17", "memory");
-  return result;
+  return (uint32x2_t) __builtin_aarch64_cmgtv2si (__a, __b);
 }
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vtbx4_s8 (int8x8_t r, int8x8x4_t tab, int8x8_t idx)
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vcgt_s64 (int64x1_t __a, int64x1_t __b)
 {
-  int8x8_t result = r;
-  int8x16x2_t temp;
-  temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]);
-  temp.val[1] = vcombine_s8 (tab.val[2], tab.val[3]);
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-          "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "+w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
-  return result;
+  return (uint64x1_t) __builtin_aarch64_cmgtdi (__a, __b);
 }
 
 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vtbx4_u8 (uint8x8_t r, uint8x8x4_t tab, uint8x8_t idx)
+vcgt_u8 (uint8x8_t __a, uint8x8_t __b)
 {
-  uint8x8_t result = r;
-  uint8x16x2_t temp;
-  temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]);
-  temp.val[1] = vcombine_u8 (tab.val[2], tab.val[3]);
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-          "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "+w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
-  return result;
+  return (uint8x8_t) __builtin_aarch64_cmhiv8qi ((int8x8_t) __a,
+                                                (int8x8_t) __b);
 }
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vtbx4_p8 (poly8x8_t r, poly8x8x4_t tab, uint8x8_t idx)
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcgt_u16 (uint16x4_t __a, uint16x4_t __b)
 {
-  poly8x8_t result = r;
-  poly8x16x2_t temp;
-  temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]);
-  temp.val[1] = vcombine_p8 (tab.val[2], tab.val[3]);
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-          "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "+w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
-  return result;
+  return (uint16x4_t) __builtin_aarch64_cmhiv4hi ((int16x4_t) __a,
+                                                 (int16x4_t) __b);
 }
 
-/* End of temporary inline asm.  */
-
-/* Start of optimal implementations in approved order.  */
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcgt_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return (uint32x2_t) __builtin_aarch64_cmhiv2si ((int32x2_t) __a,
+                                                 (int32x2_t) __b);
+}
 
-/* vabs  */
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vcgt_u64 (uint64x1_t __a, uint64x1_t __b)
+{
+  return (uint64x1_t) __builtin_aarch64_cmhidi ((int64x1_t) __a,
+                                               (int64x1_t) __b);
+}
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vabs_f32 (float32x2_t __a)
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vcgtq_s8 (int8x16_t __a, int8x16_t __b)
 {
-  return __builtin_aarch64_absv2sf (__a);
+  return (uint8x16_t) __builtin_aarch64_cmgtv16qi (__a, __b);
 }
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vabsq_f32 (float32x4_t __a)
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcgtq_s16 (int16x8_t __a, int16x8_t __b)
 {
-  return __builtin_aarch64_absv4sf (__a);
+  return (uint16x8_t) __builtin_aarch64_cmgtv8hi (__a, __b);
 }
 
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vabsq_f64 (float64x2_t __a)
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcgtq_s32 (int32x4_t __a, int32x4_t __b)
 {
-  return __builtin_aarch64_absv2df (__a);
+  return (uint32x4_t) __builtin_aarch64_cmgtv4si (__a, __b);
 }
 
-/* vadd */
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vcgtq_s64 (int64x2_t __a, int64x2_t __b)
+{
+  return (uint64x2_t) __builtin_aarch64_cmgtv2di (__a, __b);
+}
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vaddd_s64 (int64x1_t __a, int64x1_t __b)
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vcgtq_u8 (uint8x16_t __a, uint8x16_t __b)
 {
-  return __a + __b;
+  return (uint8x16_t) __builtin_aarch64_cmhiv16qi ((int8x16_t) __a,
+                                                  (int8x16_t) __b);
 }
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vaddd_u64 (uint64x1_t __a, uint64x1_t __b)
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcgtq_u16 (uint16x8_t __a, uint16x8_t __b)
 {
-  return __a + __b;
+  return (uint16x8_t) __builtin_aarch64_cmhiv8hi ((int16x8_t) __a,
+                                                 (int16x8_t) __b);
 }
 
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vaddv_f32 (float32x2_t __a)
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcgtq_u32 (uint32x4_t __a, uint32x4_t __b)
 {
-  float32x2_t t = __builtin_aarch64_addvv2sf (__a);
-  return vget_lane_f32 (t, 0);
+  return (uint32x4_t) __builtin_aarch64_cmhiv4si ((int32x4_t) __a,
+                                                 (int32x4_t) __b);
 }
 
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vaddvq_f32 (float32x4_t __a)
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vcgtq_u64 (uint64x2_t __a, uint64x2_t __b)
 {
-  float32x4_t t = __builtin_aarch64_addvv4sf (__a);
-  return vgetq_lane_f32 (t, 0);
+  return (uint64x2_t) __builtin_aarch64_cmhiv2di ((int64x2_t) __a,
+                                                 (int64x2_t) __b);
 }
 
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vaddvq_f64 (float64x2_t __a)
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vcgtd_s64 (int64x1_t __a, int64x1_t __b)
 {
-  float64x2_t t = __builtin_aarch64_addvv2df (__a);
-  return vgetq_lane_f64 (t, 0);
+  return (uint64x1_t) __builtin_aarch64_cmgtdi (__a, __b);
 }
 
-/* vceq */
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vcgtd_u64 (uint64x1_t __a, uint64x1_t __b)
+{
+  return (uint64x1_t) __builtin_aarch64_cmhidi ((int64x1_t) __a,
+                                               (int64x1_t) __b);
+}
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vceq_p8 (poly8x8_t __a, poly8x8_t __b)
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vcgtzd_s64 (int64x1_t __a)
 {
-  return (uint8x8_t) __builtin_aarch64_cmeqv8qi ((int8x8_t) __a,
-                                                (int8x8_t) __b);
+  return (uint64x1_t) __builtin_aarch64_cmgtdi (__a, 0);
 }
 
+/* vcle */
+
 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vceq_s8 (int8x8_t __a, int8x8_t __b)
+vcle_s8 (int8x8_t __a, int8x8_t __b)
 {
-  return (uint8x8_t) __builtin_aarch64_cmeqv8qi (__a, __b);
+  return (uint8x8_t) __builtin_aarch64_cmgev8qi (__b, __a);
 }
 
 __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vceq_s16 (int16x4_t __a, int16x4_t __b)
+vcle_s16 (int16x4_t __a, int16x4_t __b)
 {
-  return (uint16x4_t) __builtin_aarch64_cmeqv4hi (__a, __b);
+  return (uint16x4_t) __builtin_aarch64_cmgev4hi (__b, __a);
 }
 
 __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vceq_s32 (int32x2_t __a, int32x2_t __b)
+vcle_s32 (int32x2_t __a, int32x2_t __b)
 {
-  return (uint32x2_t) __builtin_aarch64_cmeqv2si (__a, __b);
+  return (uint32x2_t) __builtin_aarch64_cmgev2si (__b, __a);
 }
 
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vceq_s64 (int64x1_t __a, int64x1_t __b)
+vcle_s64 (int64x1_t __a, int64x1_t __b)
 {
-  return (uint64x1_t) __builtin_aarch64_cmeqdi (__a, __b);
+  return (uint64x1_t) __builtin_aarch64_cmgedi (__b, __a);
 }
 
 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vceq_u8 (uint8x8_t __a, uint8x8_t __b)
+vcle_u8 (uint8x8_t __a, uint8x8_t __b)
 {
-  return (uint8x8_t) __builtin_aarch64_cmeqv8qi ((int8x8_t) __a,
-                                                (int8x8_t) __b);
+  return (uint8x8_t) __builtin_aarch64_cmhsv8qi ((int8x8_t) __b,
+                                                (int8x8_t) __a);
 }
 
 __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vceq_u16 (uint16x4_t __a, uint16x4_t __b)
+vcle_u16 (uint16x4_t __a, uint16x4_t __b)
 {
-  return (uint16x4_t) __builtin_aarch64_cmeqv4hi ((int16x4_t) __a,
-                                                 (int16x4_t) __b);
+  return (uint16x4_t) __builtin_aarch64_cmhsv4hi ((int16x4_t) __b,
+                                                 (int16x4_t) __a);
 }
 
 __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vceq_u32 (uint32x2_t __a, uint32x2_t __b)
+vcle_u32 (uint32x2_t __a, uint32x2_t __b)
 {
-  return (uint32x2_t) __builtin_aarch64_cmeqv2si ((int32x2_t) __a,
-                                                 (int32x2_t) __b);
+  return (uint32x2_t) __builtin_aarch64_cmhsv2si ((int32x2_t) __b,
+                                                 (int32x2_t) __a);
 }
 
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vceq_u64 (uint64x1_t __a, uint64x1_t __b)
-{
-  return (uint64x1_t) __builtin_aarch64_cmeqdi ((int64x1_t) __a,
-                                               (int64x1_t) __b);
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vceqq_p8 (poly8x16_t __a, poly8x16_t __b)
+vcle_u64 (uint64x1_t __a, uint64x1_t __b)
 {
-  return (uint8x16_t) __builtin_aarch64_cmeqv16qi ((int8x16_t) __a,
-                                                  (int8x16_t) __b);
+  return (uint64x1_t) __builtin_aarch64_cmhsdi ((int64x1_t) __b,
+                                               (int64x1_t) __a);
 }
 
 __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vceqq_s8 (int8x16_t __a, int8x16_t __b)
+vcleq_s8 (int8x16_t __a, int8x16_t __b)
 {
-  return (uint8x16_t) __builtin_aarch64_cmeqv16qi (__a, __b);
+  return (uint8x16_t) __builtin_aarch64_cmgev16qi (__b, __a);
 }
 
 __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vceqq_s16 (int16x8_t __a, int16x8_t __b)
+vcleq_s16 (int16x8_t __a, int16x8_t __b)
 {
-  return (uint16x8_t) __builtin_aarch64_cmeqv8hi (__a, __b);
+  return (uint16x8_t) __builtin_aarch64_cmgev8hi (__b, __a);
 }
 
 __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vceqq_s32 (int32x4_t __a, int32x4_t __b)
+vcleq_s32 (int32x4_t __a, int32x4_t __b)
 {
-  return (uint32x4_t) __builtin_aarch64_cmeqv4si (__a, __b);
+  return (uint32x4_t) __builtin_aarch64_cmgev4si (__b, __a);
 }
 
 __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vceqq_s64 (int64x2_t __a, int64x2_t __b)
+vcleq_s64 (int64x2_t __a, int64x2_t __b)
 {
-  return (uint64x2_t) __builtin_aarch64_cmeqv2di (__a, __b);
+  return (uint64x2_t) __builtin_aarch64_cmgev2di (__b, __a);
 }
 
 __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vceqq_u8 (uint8x16_t __a, uint8x16_t __b)
+vcleq_u8 (uint8x16_t __a, uint8x16_t __b)
 {
-  return (uint8x16_t) __builtin_aarch64_cmeqv16qi ((int8x16_t) __a,
-                                                  (int8x16_t) __b);
+  return (uint8x16_t) __builtin_aarch64_cmhsv16qi ((int8x16_t) __b,
+                                                  (int8x16_t) __a);
 }
 
 __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vceqq_u16 (uint16x8_t __a, uint16x8_t __b)
+vcleq_u16 (uint16x8_t __a, uint16x8_t __b)
 {
-  return (uint16x8_t) __builtin_aarch64_cmeqv8hi ((int16x8_t) __a,
-                                                 (int16x8_t) __b);
+  return (uint16x8_t) __builtin_aarch64_cmhsv8hi ((int16x8_t) __b,
+                                                 (int16x8_t) __a);
 }
 
 __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vceqq_u32 (uint32x4_t __a, uint32x4_t __b)
+vcleq_u32 (uint32x4_t __a, uint32x4_t __b)
 {
-  return (uint32x4_t) __builtin_aarch64_cmeqv4si ((int32x4_t) __a,
-                                                 (int32x4_t) __b);
+  return (uint32x4_t) __builtin_aarch64_cmhsv4si ((int32x4_t) __b,
+                                                 (int32x4_t) __a);
 }
 
 __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vceqq_u64 (uint64x2_t __a, uint64x2_t __b)
-{
-  return (uint64x2_t) __builtin_aarch64_cmeqv2di ((int64x2_t) __a,
-                                                 (int64x2_t) __b);
-}
-
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vceqd_s64 (int64x1_t __a, int64x1_t __b)
+vcleq_u64 (uint64x2_t __a, uint64x2_t __b)
 {
-  return (uint64x1_t) __builtin_aarch64_cmeqdi (__a, __b);
+  return (uint64x2_t) __builtin_aarch64_cmhsv2di ((int64x2_t) __b,
+                                                 (int64x2_t) __a);
 }
 
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vceqd_u64 (uint64x1_t __a, uint64x1_t __b)
+vcled_s64 (int64x1_t __a, int64x1_t __b)
 {
-  return (uint64x1_t) __builtin_aarch64_cmeqdi (__a, __b);
+  return (uint64x1_t) __builtin_aarch64_cmgedi (__b, __a);
 }
 
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vceqzd_s64 (int64x1_t __a)
+vclezd_s64 (int64x1_t __a)
 {
-  return (uint64x1_t) __builtin_aarch64_cmeqdi (__a, 0);
+  return (uint64x1_t) __builtin_aarch64_cmledi (__a, 0);
 }
 
-/* vcge */
+/* vclt */
 
 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vcge_s8 (int8x8_t __a, int8x8_t __b)
+vclt_s8 (int8x8_t __a, int8x8_t __b)
 {
-  return (uint8x8_t) __builtin_aarch64_cmgev8qi (__a, __b);
+  return (uint8x8_t) __builtin_aarch64_cmgtv8qi (__b, __a);
 }
 
 __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vcge_s16 (int16x4_t __a, int16x4_t __b)
+vclt_s16 (int16x4_t __a, int16x4_t __b)
 {
-  return (uint16x4_t) __builtin_aarch64_cmgev4hi (__a, __b);
+  return (uint16x4_t) __builtin_aarch64_cmgtv4hi (__b, __a);
 }
 
 __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcge_s32 (int32x2_t __a, int32x2_t __b)
+vclt_s32 (int32x2_t __a, int32x2_t __b)
 {
-  return (uint32x2_t) __builtin_aarch64_cmgev2si (__a, __b);
+  return (uint32x2_t) __builtin_aarch64_cmgtv2si (__b, __a);
 }
 
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcge_s64 (int64x1_t __a, int64x1_t __b)
+vclt_s64 (int64x1_t __a, int64x1_t __b)
 {
-  return (uint64x1_t) __builtin_aarch64_cmgedi (__a, __b);
+  return (uint64x1_t) __builtin_aarch64_cmgtdi (__b, __a);
 }
 
 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vcge_u8 (uint8x8_t __a, uint8x8_t __b)
+vclt_u8 (uint8x8_t __a, uint8x8_t __b)
 {
-  return (uint8x8_t) __builtin_aarch64_cmhsv8qi ((int8x8_t) __a,
-                                                (int8x8_t) __b);
+  return (uint8x8_t) __builtin_aarch64_cmhiv8qi ((int8x8_t) __b,
+                                                (int8x8_t) __a);
 }
 
 __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vcge_u16 (uint16x4_t __a, uint16x4_t __b)
+vclt_u16 (uint16x4_t __a, uint16x4_t __b)
 {
-  return (uint16x4_t) __builtin_aarch64_cmhsv4hi ((int16x4_t) __a,
-                                                 (int16x4_t) __b);
+  return (uint16x4_t) __builtin_aarch64_cmhiv4hi ((int16x4_t) __b,
+                                                 (int16x4_t) __a);
 }
 
 __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcge_u32 (uint32x2_t __a, uint32x2_t __b)
+vclt_u32 (uint32x2_t __a, uint32x2_t __b)
 {
-  return (uint32x2_t) __builtin_aarch64_cmhsv2si ((int32x2_t) __a,
-                                                 (int32x2_t) __b);
+  return (uint32x2_t) __builtin_aarch64_cmhiv2si ((int32x2_t) __b,
+                                                 (int32x2_t) __a);
 }
 
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcge_u64 (uint64x1_t __a, uint64x1_t __b)
+vclt_u64 (uint64x1_t __a, uint64x1_t __b)
 {
-  return (uint64x1_t) __builtin_aarch64_cmhsdi ((int64x1_t) __a,
-                                               (int64x1_t) __b);
+  return (uint64x1_t) __builtin_aarch64_cmhidi ((int64x1_t) __b,
+                                               (int64x1_t) __a);
 }
 
 __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcgeq_s8 (int8x16_t __a, int8x16_t __b)
+vcltq_s8 (int8x16_t __a, int8x16_t __b)
 {
-  return (uint8x16_t) __builtin_aarch64_cmgev16qi (__a, __b);
+  return (uint8x16_t) __builtin_aarch64_cmgtv16qi (__b, __a);
 }
 
 __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vcgeq_s16 (int16x8_t __a, int16x8_t __b)
+vcltq_s16 (int16x8_t __a, int16x8_t __b)
 {
-  return (uint16x8_t) __builtin_aarch64_cmgev8hi (__a, __b);
+  return (uint16x8_t) __builtin_aarch64_cmgtv8hi (__b, __a);
 }
 
 __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcgeq_s32 (int32x4_t __a, int32x4_t __b)
+vcltq_s32 (int32x4_t __a, int32x4_t __b)
 {
-  return (uint32x4_t) __builtin_aarch64_cmgev4si (__a, __b);
+  return (uint32x4_t) __builtin_aarch64_cmgtv4si (__b, __a);
 }
 
 __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcgeq_s64 (int64x2_t __a, int64x2_t __b)
+vcltq_s64 (int64x2_t __a, int64x2_t __b)
 {
-  return (uint64x2_t) __builtin_aarch64_cmgev2di (__a, __b);
+  return (uint64x2_t) __builtin_aarch64_cmgtv2di (__b, __a);
 }
 
 __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcgeq_u8 (uint8x16_t __a, uint8x16_t __b)
+vcltq_u8 (uint8x16_t __a, uint8x16_t __b)
 {
-  return (uint8x16_t) __builtin_aarch64_cmhsv16qi ((int8x16_t) __a,
-                                                  (int8x16_t) __b);
+  return (uint8x16_t) __builtin_aarch64_cmhiv16qi ((int8x16_t) __b,
+                                                  (int8x16_t) __a);
 }
 
 __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vcgeq_u16 (uint16x8_t __a, uint16x8_t __b)
+vcltq_u16 (uint16x8_t __a, uint16x8_t __b)
 {
-  return (uint16x8_t) __builtin_aarch64_cmhsv8hi ((int16x8_t) __a,
-                                                 (int16x8_t) __b);
+  return (uint16x8_t) __builtin_aarch64_cmhiv8hi ((int16x8_t) __b,
+                                                 (int16x8_t) __a);
 }
 
 __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcgeq_u32 (uint32x4_t __a, uint32x4_t __b)
+vcltq_u32 (uint32x4_t __a, uint32x4_t __b)
 {
-  return (uint32x4_t) __builtin_aarch64_cmhsv4si ((int32x4_t) __a,
-                                                 (int32x4_t) __b);
+  return (uint32x4_t) __builtin_aarch64_cmhiv4si ((int32x4_t) __b,
+                                                 (int32x4_t) __a);
 }
 
 __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcgeq_u64 (uint64x2_t __a, uint64x2_t __b)
+vcltq_u64 (uint64x2_t __a, uint64x2_t __b)
 {
-  return (uint64x2_t) __builtin_aarch64_cmhsv2di ((int64x2_t) __a,
-                                                 (int64x2_t) __b);
+  return (uint64x2_t) __builtin_aarch64_cmhiv2di ((int64x2_t) __b,
+                                                 (int64x2_t) __a);
 }
 
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcged_s64 (int64x1_t __a, int64x1_t __b)
+vcltd_s64 (int64x1_t __a, int64x1_t __b)
 {
-  return (uint64x1_t) __builtin_aarch64_cmgedi (__a, __b);
+  return (uint64x1_t) __builtin_aarch64_cmgtdi (__b, __a);
 }
 
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcged_u64 (uint64x1_t __a, uint64x1_t __b)
+vcltzd_s64 (int64x1_t __a)
 {
-  return (uint64x1_t) __builtin_aarch64_cmhsdi ((int64x1_t) __a,
-                                               (int64x1_t) __b);
+  return (uint64x1_t) __builtin_aarch64_cmltdi (__a, 0);
+}
+
+/* vcvt (double -> float).  */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vcvt_f32_f64 (float64x2_t __a)
+{
+  return __builtin_aarch64_float_truncate_lo_v2sf (__a);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vcvt_high_f32_f64 (float32x2_t __a, float64x2_t __b)
+{
+  return __builtin_aarch64_float_truncate_hi_v4sf (__a, __b);
+}
+
+/* vcvt (float -> double).  */
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vcvt_f64_f32 (float32x2_t __a)
+{
+
+  return __builtin_aarch64_float_extend_lo_v2df (__a);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vcvt_high_f64_f32 (float32x4_t __a)
+{
+  return __builtin_aarch64_vec_unpacks_hi_v4sf (__a);
+}
+
+/* vcvt  (<u>int -> float)  */
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vcvtd_f64_s64 (int64_t __a)
+{
+  return (float64_t) __a;
+}
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vcvtd_f64_u64 (uint64_t __a)
+{
+  return (float64_t) __a;
+}
+
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vcvts_f32_s32 (int32_t __a)
+{
+  return (float32_t) __a;
 }
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcgezd_s64 (int64x1_t __a)
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vcvts_f32_u32 (uint32_t __a)
 {
-  return (uint64x1_t) __builtin_aarch64_cmgedi (__a, 0);
+  return (float32_t) __a;
 }
 
-/* vcgt */
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vcgt_s8 (int8x8_t __a, int8x8_t __b)
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vcvt_f32_s32 (int32x2_t __a)
 {
-  return (uint8x8_t) __builtin_aarch64_cmgtv8qi (__a, __b);
+  return __builtin_aarch64_floatv2siv2sf (__a);
 }
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vcgt_s16 (int16x4_t __a, int16x4_t __b)
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vcvt_f32_u32 (uint32x2_t __a)
 {
-  return (uint16x4_t) __builtin_aarch64_cmgtv4hi (__a, __b);
+  return __builtin_aarch64_floatunsv2siv2sf ((int32x2_t) __a);
 }
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcgt_s32 (int32x2_t __a, int32x2_t __b)
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vcvtq_f32_s32 (int32x4_t __a)
 {
-  return (uint32x2_t) __builtin_aarch64_cmgtv2si (__a, __b);
+  return __builtin_aarch64_floatv4siv4sf (__a);
 }
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcgt_s64 (int64x1_t __a, int64x1_t __b)
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vcvtq_f32_u32 (uint32x4_t __a)
 {
-  return (uint64x1_t) __builtin_aarch64_cmgtdi (__a, __b);
+  return __builtin_aarch64_floatunsv4siv4sf ((int32x4_t) __a);
 }
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vcgt_u8 (uint8x8_t __a, uint8x8_t __b)
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vcvtq_f64_s64 (int64x2_t __a)
 {
-  return (uint8x8_t) __builtin_aarch64_cmhiv8qi ((int8x8_t) __a,
-                                                (int8x8_t) __b);
+  return __builtin_aarch64_floatv2div2df (__a);
 }
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vcgt_u16 (uint16x4_t __a, uint16x4_t __b)
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vcvtq_f64_u64 (uint64x2_t __a)
 {
-  return (uint16x4_t) __builtin_aarch64_cmhiv4hi ((int16x4_t) __a,
-                                                 (int16x4_t) __b);
+  return __builtin_aarch64_floatunsv2div2df ((int64x2_t) __a);
 }
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcgt_u32 (uint32x2_t __a, uint32x2_t __b)
+/* vcvt (float -> <u>int)  */
+
+__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+vcvtd_s64_f64 (float64_t __a)
 {
-  return (uint32x2_t) __builtin_aarch64_cmhiv2si ((int32x2_t) __a,
-                                                 (int32x2_t) __b);
+  return (int64_t) __a;
 }
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcgt_u64 (uint64x1_t __a, uint64x1_t __b)
+__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+vcvtd_u64_f64 (float64_t __a)
 {
-  return (uint64x1_t) __builtin_aarch64_cmhidi ((int64x1_t) __a,
-                                               (int64x1_t) __b);
+  return (uint64_t) __a;
 }
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcgtq_s8 (int8x16_t __a, int8x16_t __b)
+__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+vcvts_s32_f32 (float32_t __a)
 {
-  return (uint8x16_t) __builtin_aarch64_cmgtv16qi (__a, __b);
+  return (int32_t) __a;
 }
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vcgtq_s16 (int16x8_t __a, int16x8_t __b)
+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+vcvts_u32_f32 (float32_t __a)
 {
-  return (uint16x8_t) __builtin_aarch64_cmgtv8hi (__a, __b);
+  return (uint32_t) __a;
 }
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcgtq_s32 (int32x4_t __a, int32x4_t __b)
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vcvt_s32_f32 (float32x2_t __a)
 {
-  return (uint32x4_t) __builtin_aarch64_cmgtv4si (__a, __b);
+  return __builtin_aarch64_lbtruncv2sfv2si (__a);
 }
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcgtq_s64 (int64x2_t __a, int64x2_t __b)
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcvt_u32_f32 (float32x2_t __a)
 {
-  return (uint64x2_t) __builtin_aarch64_cmgtv2di (__a, __b);
+  /* TODO: This cast should go away when builtins have
+     their correct types.  */
+  return (uint32x2_t) __builtin_aarch64_lbtruncuv2sfv2si (__a);
 }
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcgtq_u8 (uint8x16_t __a, uint8x16_t __b)
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vcvtq_s32_f32 (float32x4_t __a)
 {
-  return (uint8x16_t) __builtin_aarch64_cmhiv16qi ((int8x16_t) __a,
-                                                  (int8x16_t) __b);
+  return __builtin_aarch64_lbtruncv4sfv4si (__a);
 }
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vcgtq_u16 (uint16x8_t __a, uint16x8_t __b)
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcvtq_u32_f32 (float32x4_t __a)
 {
-  return (uint16x8_t) __builtin_aarch64_cmhiv8hi ((int16x8_t) __a,
-                                                 (int16x8_t) __b);
+  /* TODO: This cast should go away when builtins have
+     their correct types.  */
+  return (uint32x4_t) __builtin_aarch64_lbtruncuv4sfv4si (__a);
 }
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcgtq_u32 (uint32x4_t __a, uint32x4_t __b)
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vcvtq_s64_f64 (float64x2_t __a)
 {
-  return (uint32x4_t) __builtin_aarch64_cmhiv4si ((int32x4_t) __a,
-                                                 (int32x4_t) __b);
+  return __builtin_aarch64_lbtruncv2dfv2di (__a);
 }
 
 __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcgtq_u64 (uint64x2_t __a, uint64x2_t __b)
+vcvtq_u64_f64 (float64x2_t __a)
 {
-  return (uint64x2_t) __builtin_aarch64_cmhiv2di ((int64x2_t) __a,
-                                                 (int64x2_t) __b);
+  /* TODO: This cast should go away when builtins have
+     their correct types.  */
+  return (uint64x2_t) __builtin_aarch64_lbtruncuv2dfv2di (__a);
 }
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcgtd_s64 (int64x1_t __a, int64x1_t __b)
+/* vcvta  */
+
+__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+vcvtad_s64_f64 (float64_t __a)
 {
-  return (uint64x1_t) __builtin_aarch64_cmgtdi (__a, __b);
+  return __builtin_aarch64_lrounddfdi (__a);
 }
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcgtd_u64 (uint64x1_t __a, uint64x1_t __b)
+__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+vcvtad_u64_f64 (float64_t __a)
 {
-  return (uint64x1_t) __builtin_aarch64_cmhidi ((int64x1_t) __a,
-                                               (int64x1_t) __b);
+  return __builtin_aarch64_lroundudfdi (__a);
 }
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcgtzd_s64 (int64x1_t __a)
+__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+vcvtas_s32_f32 (float32_t __a)
 {
-  return (uint64x1_t) __builtin_aarch64_cmgtdi (__a, 0);
+  return __builtin_aarch64_lroundsfsi (__a);
 }
 
-/* vcle */
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vcle_s8 (int8x8_t __a, int8x8_t __b)
+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+vcvtas_u32_f32 (float32_t __a)
 {
-  return (uint8x8_t) __builtin_aarch64_cmgev8qi (__b, __a);
+  return __builtin_aarch64_lroundusfsi (__a);
 }
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vcle_s16 (int16x4_t __a, int16x4_t __b)
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vcvta_s32_f32 (float32x2_t __a)
 {
-  return (uint16x4_t) __builtin_aarch64_cmgev4hi (__b, __a);
+  return __builtin_aarch64_lroundv2sfv2si (__a);
 }
 
 __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcle_s32 (int32x2_t __a, int32x2_t __b)
+vcvta_u32_f32 (float32x2_t __a)
 {
-  return (uint32x2_t) __builtin_aarch64_cmgev2si (__b, __a);
+  /* TODO: This cast should go away when builtins have
+     their correct types.  */
+  return (uint32x2_t) __builtin_aarch64_lrounduv2sfv2si (__a);
 }
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcle_s64 (int64x1_t __a, int64x1_t __b)
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vcvtaq_s32_f32 (float32x4_t __a)
 {
-  return (uint64x1_t) __builtin_aarch64_cmgedi (__b, __a);
+  return __builtin_aarch64_lroundv4sfv4si (__a);
 }
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vcle_u8 (uint8x8_t __a, uint8x8_t __b)
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcvtaq_u32_f32 (float32x4_t __a)
 {
-  return (uint8x8_t) __builtin_aarch64_cmhsv8qi ((int8x8_t) __b,
-                                                (int8x8_t) __a);
+  /* TODO: This cast should go away when builtins have
+     their correct types.  */
+  return (uint32x4_t) __builtin_aarch64_lrounduv4sfv4si (__a);
 }
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vcle_u16 (uint16x4_t __a, uint16x4_t __b)
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vcvtaq_s64_f64 (float64x2_t __a)
 {
-  return (uint16x4_t) __builtin_aarch64_cmhsv4hi ((int16x4_t) __b,
-                                                 (int16x4_t) __a);
+  return __builtin_aarch64_lroundv2dfv2di (__a);
 }
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcle_u32 (uint32x2_t __a, uint32x2_t __b)
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vcvtaq_u64_f64 (float64x2_t __a)
 {
-  return (uint32x2_t) __builtin_aarch64_cmhsv2si ((int32x2_t) __b,
-                                                 (int32x2_t) __a);
+  /* TODO: This cast should go away when builtins have
+     their correct types.  */
+  return (uint64x2_t) __builtin_aarch64_lrounduv2dfv2di (__a);
 }
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcle_u64 (uint64x1_t __a, uint64x1_t __b)
+/* vcvtm  */
+
+__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+vcvtmd_s64_f64 (float64_t __a)
 {
-  return (uint64x1_t) __builtin_aarch64_cmhsdi ((int64x1_t) __b,
-                                               (int64x1_t) __a);
+  return __builtin_lfloor (__a);
 }
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcleq_s8 (int8x16_t __a, int8x16_t __b)
+__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+vcvtmd_u64_f64 (float64_t __a)
 {
-  return (uint8x16_t) __builtin_aarch64_cmgev16qi (__b, __a);
+  return __builtin_aarch64_lfloorudfdi (__a);
 }
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vcleq_s16 (int16x8_t __a, int16x8_t __b)
+__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+vcvtms_s32_f32 (float32_t __a)
 {
-  return (uint16x8_t) __builtin_aarch64_cmgev8hi (__b, __a);
+  return __builtin_ifloorf (__a);
 }
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcleq_s32 (int32x4_t __a, int32x4_t __b)
+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+vcvtms_u32_f32 (float32_t __a)
 {
-  return (uint32x4_t) __builtin_aarch64_cmgev4si (__b, __a);
+  return __builtin_aarch64_lfloorusfsi (__a);
 }
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcleq_s64 (int64x2_t __a, int64x2_t __b)
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vcvtm_s32_f32 (float32x2_t __a)
 {
-  return (uint64x2_t) __builtin_aarch64_cmgev2di (__b, __a);
+  return __builtin_aarch64_lfloorv2sfv2si (__a);
 }
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcleq_u8 (uint8x16_t __a, uint8x16_t __b)
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcvtm_u32_f32 (float32x2_t __a)
 {
-  return (uint8x16_t) __builtin_aarch64_cmhsv16qi ((int8x16_t) __b,
-                                                  (int8x16_t) __a);
+  /* TODO: This cast should go away when builtins have
+     their correct types.  */
+  return (uint32x2_t) __builtin_aarch64_lflooruv2sfv2si (__a);
 }
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vcleq_u16 (uint16x8_t __a, uint16x8_t __b)
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vcvtmq_s32_f32 (float32x4_t __a)
 {
-  return (uint16x8_t) __builtin_aarch64_cmhsv8hi ((int16x8_t) __b,
-                                                 (int16x8_t) __a);
+  return __builtin_aarch64_lfloorv4sfv4si (__a);
 }
 
 __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcleq_u32 (uint32x4_t __a, uint32x4_t __b)
+vcvtmq_u32_f32 (float32x4_t __a)
 {
-  return (uint32x4_t) __builtin_aarch64_cmhsv4si ((int32x4_t) __b,
-                                                 (int32x4_t) __a);
+  /* TODO: This cast should go away when builtins have
+     their correct types.  */
+  return (uint32x4_t) __builtin_aarch64_lflooruv4sfv4si (__a);
 }
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcleq_u64 (uint64x2_t __a, uint64x2_t __b)
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vcvtmq_s64_f64 (float64x2_t __a)
 {
-  return (uint64x2_t) __builtin_aarch64_cmhsv2di ((int64x2_t) __b,
-                                                 (int64x2_t) __a);
+  return __builtin_aarch64_lfloorv2dfv2di (__a);
 }
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcled_s64 (int64x1_t __a, int64x1_t __b)
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vcvtmq_u64_f64 (float64x2_t __a)
 {
-  return (uint64x1_t) __builtin_aarch64_cmgedi (__b, __a);
+  /* TODO: This cast should go away when builtins have
+     their correct types.  */
+  return (uint64x2_t) __builtin_aarch64_lflooruv2dfv2di (__a);
 }
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vclezd_s64 (int64x1_t __a)
+/* vcvtn  */
+
+__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+vcvtnd_s64_f64 (float64_t __a)
 {
-  return (uint64x1_t) __builtin_aarch64_cmledi (__a, 0);
+  return __builtin_aarch64_lfrintndfdi (__a);
 }
 
-/* vclt */
+__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+vcvtnd_u64_f64 (float64_t __a)
+{
+  return __builtin_aarch64_lfrintnudfdi (__a);
+}
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vclt_s8 (int8x8_t __a, int8x8_t __b)
+__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+vcvtns_s32_f32 (float32_t __a)
 {
-  return (uint8x8_t) __builtin_aarch64_cmgtv8qi (__b, __a);
+  return __builtin_aarch64_lfrintnsfsi (__a);
 }
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vclt_s16 (int16x4_t __a, int16x4_t __b)
+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+vcvtns_u32_f32 (float32_t __a)
 {
-  return (uint16x4_t) __builtin_aarch64_cmgtv4hi (__b, __a);
+  return __builtin_aarch64_lfrintnusfsi (__a);
 }
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vclt_s32 (int32x2_t __a, int32x2_t __b)
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vcvtn_s32_f32 (float32x2_t __a)
 {
-  return (uint32x2_t) __builtin_aarch64_cmgtv2si (__b, __a);
+  return __builtin_aarch64_lfrintnv2sfv2si (__a);
 }
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vclt_s64 (int64x1_t __a, int64x1_t __b)
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcvtn_u32_f32 (float32x2_t __a)
 {
-  return (uint64x1_t) __builtin_aarch64_cmgtdi (__b, __a);
+  /* TODO: This cast should go away when builtins have
+     their correct types.  */
+  return (uint32x2_t) __builtin_aarch64_lfrintnuv2sfv2si (__a);
 }
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vclt_u8 (uint8x8_t __a, uint8x8_t __b)
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vcvtnq_s32_f32 (float32x4_t __a)
 {
-  return (uint8x8_t) __builtin_aarch64_cmhiv8qi ((int8x8_t) __b,
-                                                (int8x8_t) __a);
+  return __builtin_aarch64_lfrintnv4sfv4si (__a);
 }
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vclt_u16 (uint16x4_t __a, uint16x4_t __b)
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcvtnq_u32_f32 (float32x4_t __a)
 {
-  return (uint16x4_t) __builtin_aarch64_cmhiv4hi ((int16x4_t) __b,
-                                                 (int16x4_t) __a);
+  /* TODO: This cast should go away when builtins have
+     their correct types.  */
+  return (uint32x4_t) __builtin_aarch64_lfrintnuv4sfv4si (__a);
 }
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vclt_u32 (uint32x2_t __a, uint32x2_t __b)
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vcvtnq_s64_f64 (float64x2_t __a)
 {
-  return (uint32x2_t) __builtin_aarch64_cmhiv2si ((int32x2_t) __b,
-                                                 (int32x2_t) __a);
+  return __builtin_aarch64_lfrintnv2dfv2di (__a);
 }
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vclt_u64 (uint64x1_t __a, uint64x1_t __b)
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vcvtnq_u64_f64 (float64x2_t __a)
 {
-  return (uint64x1_t) __builtin_aarch64_cmhidi ((int64x1_t) __b,
-                                               (int64x1_t) __a);
+  /* TODO: This cast should go away when builtins have
+     their correct types.  */
+  return (uint64x2_t) __builtin_aarch64_lfrintnuv2dfv2di (__a);
 }
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcltq_s8 (int8x16_t __a, int8x16_t __b)
+/* vcvtp  */
+
+__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+vcvtpd_s64_f64 (float64_t __a)
 {
-  return (uint8x16_t) __builtin_aarch64_cmgtv16qi (__b, __a);
+  return __builtin_lceil (__a);
 }
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vcltq_s16 (int16x8_t __a, int16x8_t __b)
+__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+vcvtpd_u64_f64 (float64_t __a)
 {
-  return (uint16x8_t) __builtin_aarch64_cmgtv8hi (__b, __a);
+  return __builtin_aarch64_lceiludfdi (__a);
 }
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcltq_s32 (int32x4_t __a, int32x4_t __b)
+__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+vcvtps_s32_f32 (float32_t __a)
 {
-  return (uint32x4_t) __builtin_aarch64_cmgtv4si (__b, __a);
+  return __builtin_iceilf (__a);
 }
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcltq_s64 (int64x2_t __a, int64x2_t __b)
+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+vcvtps_u32_f32 (float32_t __a)
 {
-  return (uint64x2_t) __builtin_aarch64_cmgtv2di (__b, __a);
+  return __builtin_aarch64_lceilusfsi (__a);
 }
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcltq_u8 (uint8x16_t __a, uint8x16_t __b)
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vcvtp_s32_f32 (float32x2_t __a)
 {
-  return (uint8x16_t) __builtin_aarch64_cmhiv16qi ((int8x16_t) __b,
-                                                  (int8x16_t) __a);
+  return __builtin_aarch64_lceilv2sfv2si (__a);
 }
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vcltq_u16 (uint16x8_t __a, uint16x8_t __b)
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcvtp_u32_f32 (float32x2_t __a)
 {
-  return (uint16x8_t) __builtin_aarch64_cmhiv8hi ((int16x8_t) __b,
-                                                 (int16x8_t) __a);
+  /* TODO: This cast should go away when builtins have
+     their correct types.  */
+  return (uint32x2_t) __builtin_aarch64_lceiluv2sfv2si (__a);
 }
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcltq_u32 (uint32x4_t __a, uint32x4_t __b)
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vcvtpq_s32_f32 (float32x4_t __a)
 {
-  return (uint32x4_t) __builtin_aarch64_cmhiv4si ((int32x4_t) __b,
-                                                 (int32x4_t) __a);
+  return __builtin_aarch64_lceilv4sfv4si (__a);
 }
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcltq_u64 (uint64x2_t __a, uint64x2_t __b)
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcvtpq_u32_f32 (float32x4_t __a)
 {
-  return (uint64x2_t) __builtin_aarch64_cmhiv2di ((int64x2_t) __b,
-                                                 (int64x2_t) __a);
+  /* TODO: This cast should go away when builtins have
+     their correct types.  */
+  return (uint32x4_t) __builtin_aarch64_lceiluv4sfv4si (__a);
 }
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcltd_s64 (int64x1_t __a, int64x1_t __b)
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vcvtpq_s64_f64 (float64x2_t __a)
 {
-  return (uint64x1_t) __builtin_aarch64_cmgtdi (__b, __a);
+  return __builtin_aarch64_lceilv2dfv2di (__a);
 }
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcltzd_s64 (int64x1_t __a)
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vcvtpq_u64_f64 (float64x2_t __a)
 {
-  return (uint64x1_t) __builtin_aarch64_cmltdi (__a, 0);
+  /* TODO: This cast should go away when builtins have
+     their correct types.  */
+  return (uint64x2_t) __builtin_aarch64_lceiluv2dfv2di (__a);
 }
 
 /* vdup */
index 898bfdf..d035aa5 100644 (file)
@@ -1,5 +1,9 @@
 2013-04-29  James Greenhalgh  <james.greenhalgh@arm.com>
 
+       * gcc.target/aarch64/vect-vcvt.c: New.
+
+2013-04-29  James Greenhalgh  <james.greenhalgh@arm.com>
+
        * gcc.target/aarch64/vect-vrnd.c: New.
 
 2013-04-29  Richard Biener  <rguenther@suse.de>
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-vcvt.c b/gcc/testsuite/gcc.target/aarch64/vect-vcvt.c
new file mode 100644 (file)
index 0000000..6066d7d
--- /dev/null
@@ -0,0 +1,132 @@
+/* { dg-do run } */
+/* { dg-options "-O3 --save-temps -ffast-math" } */
+
+#include <arm_neon.h>
+
+extern void abort (void);
+extern double fabs (double);
+
+#define NUM_TESTS 8
+#define DELTA 0.000001
+
+float input_f32[] = {0.1f, -0.1f, 0.4f, 10.3f,
+                    200.0f, -800.0f, -13.0f, -0.5f};
+double input_f64[] = {0.1, -0.1, 0.4, 10.3,
+                     200.0, -800.0, -13.0, -0.5};
+
+#define TEST(SUFFIX, Q, WIDTH, LANES, S, U, D)                         \
+int                                                                    \
+test_vcvt##SUFFIX##_##S##WIDTH##_f##WIDTH##x##LANES##_t (void)         \
+{                                                                      \
+  int ret = 1;                                                         \
+  int i = 0;                                                           \
+  int nlanes = LANES;                                                  \
+  U##int##WIDTH##_t expected_out[NUM_TESTS];                           \
+  U##int##WIDTH##_t actual_out[NUM_TESTS];                             \
+                                                                       \
+  for (i = 0; i < NUM_TESTS; i++)                                      \
+    {                                                                  \
+      expected_out[i]                                                  \
+       = vcvt##SUFFIX##D##_##S##WIDTH##_f##WIDTH (input_f##WIDTH[i]);  \
+      /* Don't vectorize this.  */                                     \
+      asm volatile ("" : : : "memory");                                        \
+    }                                                                  \
+                                                                       \
+  for (i = 0; i < NUM_TESTS; i+=nlanes)                                        \
+    {                                                                  \
+      U##int##WIDTH##x##LANES##_t out =                                        \
+       vcvt##SUFFIX##Q##_##S##WIDTH##_f##WIDTH                         \
+               (vld1##Q##_f##WIDTH (input_f##WIDTH + i));              \
+      vst1##Q##_##S##WIDTH (actual_out + i, out);                      \
+    }                                                                  \
+                                                                       \
+  for (i = 0; i < NUM_TESTS; i++)                                      \
+    ret &= fabs (expected_out[i] - actual_out[i]) < DELTA;             \
+                                                                       \
+  return ret;                                                          \
+}                                                                      \
+
+
+#define BUILD_VARIANTS(SUFFIX)                 \
+TEST (SUFFIX,  , 32, 2, s, ,s)                 \
+TEST (SUFFIX, q, 32, 4, s, ,s)                 \
+TEST (SUFFIX, q, 64, 2, s, ,d)                 \
+TEST (SUFFIX,  , 32, 2, u,u,s)                 \
+TEST (SUFFIX, q, 32, 4, u,u,s)                 \
+TEST (SUFFIX, q, 64, 2, u,u,d)                 \
+
+BUILD_VARIANTS ( )
+/* { dg-final { scan-assembler "fcvtzs\\tw\[0-9\]+, s\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtzs\\tx\[0-9\]+, d\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtzs\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s" } } */
+/* { dg-final { scan-assembler "fcvtzs\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s" } } */
+/* { dg-final { scan-assembler "fcvtzs\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d" } } */
+/* { dg-final { scan-assembler "fcvtzu\\tw\[0-9\]+, s\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtzu\\tx\[0-9\]+, d\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtzu\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s" } } */
+/* { dg-final { scan-assembler "fcvtzu\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s" } } */
+/* { dg-final { scan-assembler "fcvtzu\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d" } } */
+BUILD_VARIANTS (a)
+/* { dg-final { scan-assembler "fcvtas\\tw\[0-9\]+, s\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtas\\tx\[0-9\]+, d\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtas\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s" } } */
+/* { dg-final { scan-assembler "fcvtas\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s" } } */
+/* { dg-final { scan-assembler "fcvtas\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d" } } */
+/* { dg-final { scan-assembler "fcvtau\\tw\[0-9\]+, s\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtau\\tx\[0-9\]+, d\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtau\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s" } } */
+/* { dg-final { scan-assembler "fcvtau\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s" } } */
+/* { dg-final { scan-assembler "fcvtau\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d" } } */
+BUILD_VARIANTS (m)
+/* { dg-final { scan-assembler "fcvtms\\tw\[0-9\]+, s\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtms\\tx\[0-9\]+, d\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtms\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s" } } */
+/* { dg-final { scan-assembler "fcvtms\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s" } } */
+/* { dg-final { scan-assembler "fcvtms\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d" } } */
+/* { dg-final { scan-assembler "fcvtmu\\tw\[0-9\]+, s\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtmu\\tx\[0-9\]+, d\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtmu\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s" } } */
+/* { dg-final { scan-assembler "fcvtmu\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s" } } */
+/* { dg-final { scan-assembler "fcvtmu\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d" } } */
+BUILD_VARIANTS (n)
+/* { dg-final { scan-assembler "fcvtns\\tw\[0-9\]+, s\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtns\\tx\[0-9\]+, d\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtns\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s" } } */
+/* { dg-final { scan-assembler "fcvtns\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s" } } */
+/* { dg-final { scan-assembler "fcvtns\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d" } } */
+/* { dg-final { scan-assembler "fcvtnu\\tw\[0-9\]+, s\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtnu\\tx\[0-9\]+, d\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtnu\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s" } } */
+/* { dg-final { scan-assembler "fcvtnu\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s" } } */
+/* { dg-final { scan-assembler "fcvtnu\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d" } } */
+BUILD_VARIANTS (p)
+/* { dg-final { scan-assembler "fcvtps\\tw\[0-9\]+, s\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtps\\tx\[0-9\]+, d\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtps\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s" } } */
+/* { dg-final { scan-assembler "fcvtps\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s" } } */
+/* { dg-final { scan-assembler "fcvtps\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d" } } */
+/* { dg-final { scan-assembler "fcvtpu\\tw\[0-9\]+, s\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtpu\\tx\[0-9\]+, d\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtpu\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s" } } */
+/* { dg-final { scan-assembler "fcvtpu\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s" } } */
+/* { dg-final { scan-assembler "fcvtpu\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d" } } */
+
+#undef TEST
+#define TEST(SUFFIX, Q, WIDTH, LANES, S, U, D)                         \
+{                                                                      \
+  if (!test_vcvt##SUFFIX##_##S##WIDTH##_f##WIDTH##x##LANES##_t ())     \
+    abort ();                                                          \
+}
+
+int
+main (int argc, char **argv)
+{
+  BUILD_VARIANTS ( )
+  BUILD_VARIANTS (a)
+  BUILD_VARIANTS (m)
+  BUILD_VARIANTS (n)
+  BUILD_VARIANTS (p)
+  return 0;
+}
+
+/* { dg-final { cleanup-saved-temps } } */