2015-04-08  Charles Baylis  <charles.baylis@linaro.org>
author     cbaylis <cbaylis@138bc75d-0d04-0410-961f-82ee72b054a4>
           Wed, 8 Apr 2015 06:44:59 +0000 (06:44 +0000)
committer  cbaylis <cbaylis@138bc75d-0d04-0410-961f-82ee72b054a4>
           Wed, 8 Apr 2015 06:44:59 +0000 (06:44 +0000)
        Backport from trunk r216672.

        * config/aarch64/arm_neon.h (__LD2_LANE_FUNC): Rewrite using builtins,
        update uses to use new macro arguments.
        (__LD3_LANE_FUNC): Likewise.
        (__LD4_LANE_FUNC): Likewise.

2015-04-08  Charles Baylis  <charles.baylis@linaro.org>

        Backport from trunk r216671.
        2014-10-24  Charles Baylis  <charles.baylis@linaro.org>
        * config/aarch64/aarch64-builtins.c
        (aarch64_types_loadstruct_lane_qualifiers): Define.
        * config/aarch64/aarch64-simd-builtins.def (ld2_lane, ld3_lane,
        ld4_lane): New builtins.
        * config/aarch64/aarch64-simd.md (aarch64_vec_load_lanesoi_lane<mode>):
        New pattern.
        (aarch64_vec_load_lanesci_lane<mode>): Likewise.
        (aarch64_vec_load_lanesxi_lane<mode>): Likewise.
        (aarch64_ld2_lane<mode>): New expand.
        (aarch64_ld3_lane<mode>): Likewise.
        (aarch64_ld4_lane<mode>): Likewise.
        * config/aarch64/aarch64.md (define_c_enum "unspec"): Add
        UNSPEC_LD2_LANE, UNSPEC_LD3_LANE, UNSPEC_LD4_LANE.

git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/branches/linaro/gcc-4_9-branch@221915 138bc75d-0d04-0410-961f-82ee72b054a4

gcc/ChangeLog.linaro
gcc/config/aarch64/aarch64-builtins.c
gcc/config/aarch64/aarch64-simd-builtins.def
gcc/config/aarch64/aarch64-simd.md
gcc/config/aarch64/aarch64.md
gcc/config/aarch64/arm_neon.h
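
For reference, the vld2/vld3/vld4 lane intrinsics this backport reimplements are used like so; the function and variable names below are invented for illustration and are not part of the commit:

    #include <arm_neon.h>

    /* Overwrite lane 1 of a pair of interleaved int32x4_t vectors with
       two int32_t values loaded from p.  The lane index must be a
       compile-time constant.  */
    int32x4x2_t
    reload_lane1 (const int32_t *p, int32x4x2_t acc)
    {
      return vld2q_lane_s32 (p, acc, 1);
    }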

diff --git a/gcc/ChangeLog.linaro b/gcc/ChangeLog.linaro
index a05a965..6cd8681 100644
@@ -1,3 +1,30 @@
+2015-04-08  Charles Baylis  <charles.baylis@linaro.org>
+
+       Backport from trunk r216672.
+
+       * config/aarch64/arm_neon.h (__LD2_LANE_FUNC): Rewrite using builtins,
+       update uses to use new macro arguments.
+       (__LD3_LANE_FUNC): Likewise.
+       (__LD4_LANE_FUNC): Likewise.
+
+2015-04-08  Charles Baylis  <charles.baylis@linaro.org>
+
+       Backport from trunk r216671.
+       2014-10-24  Charles Baylis  <charles.baylis@linaro.org>
+       * config/aarch64/aarch64-builtins.c
+       (aarch64_types_loadstruct_lane_qualifiers): Define.
+       * config/aarch64/aarch64-simd-builtins.def (ld2_lane, ld3_lane,
+       ld4_lane): New builtins.
+       * config/aarch64/aarch64-simd.md (aarch64_vec_load_lanesoi_lane<mode>):
+       New pattern.
+       (aarch64_vec_load_lanesci_lane<mode>): Likewise.
+       (aarch64_vec_load_lanesxi_lane<mode>): Likewise.
+       (aarch64_ld2_lane<mode>): New expand.
+       (aarch64_ld3_lane<mode>): Likewise.
+       (aarch64_ld4_lane<mode>): Likewise.
+       * config/aarch64/aarch64.md (define_c_enum "unspec"): Add
+       UNSPEC_LD2_LANE, UNSPEC_LD3_LANE, UNSPEC_LD4_LANE.
+
 2015-04-07  Michael Collison  <michael.collison@linaro.org>
 
        Backport from trunk r219679.
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 8b7d880..8f42572 100644
@@ -221,6 +221,11 @@ aarch64_types_load1_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_none, qualifier_const_pointer_map_mode };
 #define TYPES_LOAD1 (aarch64_types_load1_qualifiers)
 #define TYPES_LOADSTRUCT (aarch64_types_load1_qualifiers)
+static enum aarch64_type_qualifiers
+aarch64_types_loadstruct_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+  = { qualifier_none, qualifier_const_pointer_map_mode,
+      qualifier_none, qualifier_none };
+#define TYPES_LOADSTRUCT_LANE (aarch64_types_loadstruct_lane_qualifiers)
 
 static enum aarch64_type_qualifiers
 aarch64_types_bsl_p_qualifiers[SIMD_MAX_BUILTIN_ARGS]
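
The four qualifiers above describe, in order, the return value and the three arguments of the lane-load builtins. For the V4SI instance that yields a builtin of roughly the following shape; this prototype is inferred from the uses in arm_neon.h further down, not spelled out in the patch itself:

    /* return: the updated OI register tuple (qualifier_none)
       arg 0:  const pointer, mapped to the element mode
       arg 1:  the input OI register tuple (qualifier_none)
       arg 2:  the lane index (qualifier_none)  */
    __builtin_aarch64_simd_oi
    __builtin_aarch64_ld2_lanev4si (const __builtin_aarch64_simd_si *__ptr,
                                    __builtin_aarch64_simd_oi __o, int __c);
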
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 3de023b..3526cb1 100644
   BUILTIN_VALLDIF (LOADSTRUCT, ld2r, 0)
   BUILTIN_VALLDIF (LOADSTRUCT, ld3r, 0)
   BUILTIN_VALLDIF (LOADSTRUCT, ld4r, 0)
+  /* Implemented by aarch64_ld<VSTRUCT:nregs>_lane<VQ:mode>.  */
+  BUILTIN_VQ (LOADSTRUCT_LANE, ld2_lane, 0)
+  BUILTIN_VQ (LOADSTRUCT_LANE, ld3_lane, 0)
+  BUILTIN_VQ (LOADSTRUCT_LANE, ld4_lane, 0)
   /* Implemented by aarch64_st<VSTRUCT:nregs><VDC:mode>.  */
   BUILTIN_VDC (STORESTRUCT, st2, 0)
   BUILTIN_VDC (STORESTRUCT, st3, 0)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 6170453..ac4e483 100644
   [(set_attr "type" "neon_load2_all_lanes<q>")]
 )
 
+(define_insn "aarch64_vec_load_lanesoi_lane<mode>"
+  [(set (match_operand:OI 0 "register_operand" "=w")
+       (unspec:OI [(match_operand:<V_TWO_ELEM> 1 "aarch64_simd_struct_operand" "Utv")
+                   (match_operand:OI 2 "register_operand" "0")
+                   (match_operand:SI 3 "immediate_operand" "i")
+                   (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY) ]
+                  UNSPEC_LD2_LANE))]
+  "TARGET_SIMD"
+  "ld2\\t{%S0.<Vetype> - %T0.<Vetype>}[%3], %1"
+  [(set_attr "type" "neon_load2_one_lane")]
+)
+
 (define_insn "vec_store_lanesoi<mode>"
   [(set (match_operand:OI 0 "aarch64_simd_struct_operand" "=Utv")
        (unspec:OI [(match_operand:OI 1 "register_operand" "w")
   [(set_attr "type" "neon_load3_all_lanes<q>")]
 )
 
+(define_insn "aarch64_vec_load_lanesci_lane<mode>"
+  [(set (match_operand:CI 0 "register_operand" "=w")
+       (unspec:CI [(match_operand:<V_THREE_ELEM> 1 "aarch64_simd_struct_operand" "Utv")
+                   (match_operand:CI 2 "register_operand" "0")
+                   (match_operand:SI 3 "immediate_operand" "i")
+                   (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                  UNSPEC_LD3_LANE))]
+  "TARGET_SIMD"
+  "ld3\\t{%S0.<Vetype> - %U0.<Vetype>}[%3], %1"
+  [(set_attr "type" "neon_load3_one_lane")]
+)
+
 (define_insn "vec_store_lanesci<mode>"
   [(set (match_operand:CI 0 "aarch64_simd_struct_operand" "=Utv")
        (unspec:CI [(match_operand:CI 1 "register_operand" "w")
   [(set_attr "type" "neon_load4_all_lanes<q>")]
 )
 
+(define_insn "aarch64_vec_load_lanesxi_lane<mode>"
+  [(set (match_operand:XI 0 "register_operand" "=w")
+       (unspec:XI [(match_operand:<V_FOUR_ELEM> 1 "aarch64_simd_struct_operand" "Utv")
+                   (match_operand:XI 2 "register_operand" "0")
+                   (match_operand:SI 3 "immediate_operand" "i")
+                   (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                  UNSPEC_LD4_LANE))]
+  "TARGET_SIMD"
+  "ld4\\t{%S0.<Vetype> - %V0.<Vetype>}[%3], %1"
+  [(set_attr "type" "neon_load4_one_lane")]
+)
+
 (define_insn "vec_store_lanesxi<mode>"
   [(set (match_operand:XI 0 "aarch64_simd_struct_operand" "=Utv")
        (unspec:XI [(match_operand:XI 1 "register_operand" "w")
   DONE;
 })
 
+(define_expand "aarch64_ld2_lane<mode>"
+  [(match_operand:OI 0 "register_operand" "=w")
+       (match_operand:DI 1 "register_operand" "w")
+       (match_operand:OI 2 "register_operand" "0")
+       (match_operand:SI 3 "immediate_operand" "i")
+       (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_SIMD"
+{
+  enum machine_mode mode = <V_TWO_ELEM>mode;
+  rtx mem = gen_rtx_MEM (mode, operands[1]);
+
+  aarch64_simd_lane_bounds (operands[3], 0, GET_MODE_NUNITS (<VCONQ>mode));
+  emit_insn (gen_aarch64_vec_load_lanesoi_lane<mode> (operands[0],
+                                                     mem,
+                                                     operands[2],
+                                                     operands[3]));
+  DONE;
+})
+
+(define_expand "aarch64_ld3_lane<mode>"
+  [(match_operand:CI 0 "register_operand" "=w")
+       (match_operand:DI 1 "register_operand" "w")
+       (match_operand:CI 2 "register_operand" "0")
+       (match_operand:SI 3 "immediate_operand" "i")
+       (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_SIMD"
+{
+  enum machine_mode mode = <V_THREE_ELEM>mode;
+  rtx mem = gen_rtx_MEM (mode, operands[1]);
+
+  aarch64_simd_lane_bounds (operands[3], 0, GET_MODE_NUNITS (<VCONQ>mode));
+  emit_insn (gen_aarch64_vec_load_lanesci_lane<mode> (operands[0],
+                                                     mem,
+                                                     operands[2],
+                                                     operands[3]));
+  DONE;
+})
+
+(define_expand "aarch64_ld4_lane<mode>"
+  [(match_operand:XI 0 "register_operand" "=w")
+       (match_operand:DI 1 "register_operand" "w")
+       (match_operand:XI 2 "register_operand" "0")
+       (match_operand:SI 3 "immediate_operand" "i")
+       (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_SIMD"
+{
+  enum machine_mode mode = <V_FOUR_ELEM>mode;
+  rtx mem = gen_rtx_MEM (mode, operands[1]);
+
+  aarch64_simd_lane_bounds (operands[3], 0, GET_MODE_NUNITS (<VCONQ>mode));
+  emit_insn (gen_aarch64_vec_load_lanesxi_lane<mode> (operands[0],
+                                                     mem,
+                                                     operands[2],
+                                                     operands[3]));
+  DONE;
+})
+
+
+
 ;; Expanders for builtins to extract vector registers from large
 ;; opaque integer modes.
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 0ab9c38..e96fcf9 100644
@@ -95,6 +95,9 @@
     UNSPEC_LD3_DUP
     UNSPEC_LD4
     UNSPEC_LD4_DUP
+    UNSPEC_LD2_LANE
+    UNSPEC_LD3_LANE
+    UNSPEC_LD4_LANE
     UNSPEC_MB
     UNSPEC_NOP
     UNSPEC_PRLG_STK
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 6aa893f..83d144f 100644
@@ -11911,131 +11911,6 @@ __STRUCTN (poly, 8, 4)
 __STRUCTN (float, 64, 4)
 #undef __STRUCTN
 
-#define __LD2_LANE_FUNC(rettype, ptrtype, regsuffix,                   \
-                       lnsuffix, funcsuffix, Q)                        \
-  __extension__ static __inline rettype                                        \
-  __attribute__ ((__always_inline__))                                  \
-  vld2 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr,               \
-                                    rettype b, const int c)            \
-  {                                                                    \
-    rettype result;                                                    \
-    __asm__ ("ld1 {v16." #regsuffix ", v17." #regsuffix "}, %1\n\t"    \
-            "ld2 {v16." #lnsuffix ", v17." #lnsuffix "}[%3], %2\n\t"   \
-            "st1 {v16." #regsuffix ", v17." #regsuffix "}, %0\n\t"     \
-            : "=Q"(result)                                             \
-            : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c)               \
-            : "memory", "v16", "v17");                                 \
-    return result;                                                     \
-  }
-
-__LD2_LANE_FUNC (int8x8x2_t, uint8_t, 8b, b, s8,)
-__LD2_LANE_FUNC (float32x2x2_t, float32_t, 2s, s, f32,)
-__LD2_LANE_FUNC (float64x1x2_t, float64_t, 1d, d, f64,)
-__LD2_LANE_FUNC (poly8x8x2_t, poly8_t, 8b, b, p8,)
-__LD2_LANE_FUNC (poly16x4x2_t, poly16_t, 4h, h, p16,)
-__LD2_LANE_FUNC (int16x4x2_t, int16_t, 4h, h, s16,)
-__LD2_LANE_FUNC (int32x2x2_t, int32_t, 2s, s, s32,)
-__LD2_LANE_FUNC (int64x1x2_t, int64_t, 1d, d, s64,)
-__LD2_LANE_FUNC (uint8x8x2_t, uint8_t, 8b, b, u8,)
-__LD2_LANE_FUNC (uint16x4x2_t, uint16_t, 4h, h, u16,)
-__LD2_LANE_FUNC (uint32x2x2_t, uint32_t, 2s, s, u32,)
-__LD2_LANE_FUNC (uint64x1x2_t, uint64_t, 1d, d, u64,)
-__LD2_LANE_FUNC (float32x4x2_t, float32_t, 4s, s, f32, q)
-__LD2_LANE_FUNC (float64x2x2_t, float64_t, 2d, d, f64, q)
-__LD2_LANE_FUNC (poly8x16x2_t, poly8_t, 16b, b, p8, q)
-__LD2_LANE_FUNC (poly16x8x2_t, poly16_t, 8h, h, p16, q)
-__LD2_LANE_FUNC (int8x16x2_t, int8_t, 16b, b, s8, q)
-__LD2_LANE_FUNC (int16x8x2_t, int16_t, 8h, h, s16, q)
-__LD2_LANE_FUNC (int32x4x2_t, int32_t, 4s, s, s32, q)
-__LD2_LANE_FUNC (int64x2x2_t, int64_t, 2d, d, s64, q)
-__LD2_LANE_FUNC (uint8x16x2_t, uint8_t, 16b, b, u8, q)
-__LD2_LANE_FUNC (uint16x8x2_t, uint16_t, 8h, h, u16, q)
-__LD2_LANE_FUNC (uint32x4x2_t, uint32_t, 4s, s, u32, q)
-__LD2_LANE_FUNC (uint64x2x2_t, uint64_t, 2d, d, u64, q)
-
-#define __LD3_LANE_FUNC(rettype, ptrtype, regsuffix,                   \
-                       lnsuffix, funcsuffix, Q)                        \
-  __extension__ static __inline rettype                                        \
-  __attribute__ ((__always_inline__))                                  \
-  vld3 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr,               \
-                                    rettype b, const int c)            \
-  {                                                                    \
-    rettype result;                                                    \
-    __asm__ ("ld1 {v16." #regsuffix " - v18." #regsuffix "}, %1\n\t"   \
-            "ld3 {v16." #lnsuffix " - v18." #lnsuffix "}[%3], %2\n\t"  \
-            "st1 {v16." #regsuffix " - v18." #regsuffix "}, %0\n\t"    \
-            : "=Q"(result)                                             \
-            : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c)               \
-            : "memory", "v16", "v17", "v18");                          \
-    return result;                                                     \
-  }
-
-__LD3_LANE_FUNC (int8x8x3_t, uint8_t, 8b, b, s8,)
-__LD3_LANE_FUNC (float32x2x3_t, float32_t, 2s, s, f32,)
-__LD3_LANE_FUNC (float64x1x3_t, float64_t, 1d, d, f64,)
-__LD3_LANE_FUNC (poly8x8x3_t, poly8_t, 8b, b, p8,)
-__LD3_LANE_FUNC (poly16x4x3_t, poly16_t, 4h, h, p16,)
-__LD3_LANE_FUNC (int16x4x3_t, int16_t, 4h, h, s16,)
-__LD3_LANE_FUNC (int32x2x3_t, int32_t, 2s, s, s32,)
-__LD3_LANE_FUNC (int64x1x3_t, int64_t, 1d, d, s64,)
-__LD3_LANE_FUNC (uint8x8x3_t, uint8_t, 8b, b, u8,)
-__LD3_LANE_FUNC (uint16x4x3_t, uint16_t, 4h, h, u16,)
-__LD3_LANE_FUNC (uint32x2x3_t, uint32_t, 2s, s, u32,)
-__LD3_LANE_FUNC (uint64x1x3_t, uint64_t, 1d, d, u64,)
-__LD3_LANE_FUNC (float32x4x3_t, float32_t, 4s, s, f32, q)
-__LD3_LANE_FUNC (float64x2x3_t, float64_t, 2d, d, f64, q)
-__LD3_LANE_FUNC (poly8x16x3_t, poly8_t, 16b, b, p8, q)
-__LD3_LANE_FUNC (poly16x8x3_t, poly16_t, 8h, h, p16, q)
-__LD3_LANE_FUNC (int8x16x3_t, int8_t, 16b, b, s8, q)
-__LD3_LANE_FUNC (int16x8x3_t, int16_t, 8h, h, s16, q)
-__LD3_LANE_FUNC (int32x4x3_t, int32_t, 4s, s, s32, q)
-__LD3_LANE_FUNC (int64x2x3_t, int64_t, 2d, d, s64, q)
-__LD3_LANE_FUNC (uint8x16x3_t, uint8_t, 16b, b, u8, q)
-__LD3_LANE_FUNC (uint16x8x3_t, uint16_t, 8h, h, u16, q)
-__LD3_LANE_FUNC (uint32x4x3_t, uint32_t, 4s, s, u32, q)
-__LD3_LANE_FUNC (uint64x2x3_t, uint64_t, 2d, d, u64, q)
-
-#define __LD4_LANE_FUNC(rettype, ptrtype, regsuffix,                   \
-                       lnsuffix, funcsuffix, Q)                        \
-  __extension__ static __inline rettype                                        \
-  __attribute__ ((__always_inline__))                                  \
-  vld4 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr,               \
-                                    rettype b, const int c)            \
-  {                                                                    \
-    rettype result;                                                    \
-    __asm__ ("ld1 {v16." #regsuffix " - v19." #regsuffix "}, %1\n\t"   \
-            "ld4 {v16." #lnsuffix " - v19." #lnsuffix "}[%3], %2\n\t"  \
-            "st1 {v16." #regsuffix " - v19." #regsuffix "}, %0\n\t"    \
-            : "=Q"(result)                                             \
-            : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c)               \
-            : "memory", "v16", "v17", "v18", "v19");                   \
-    return result;                                                     \
-  }
-
-__LD4_LANE_FUNC (int8x8x4_t, uint8_t, 8b, b, s8,)
-__LD4_LANE_FUNC (float32x2x4_t, float32_t, 2s, s, f32,)
-__LD4_LANE_FUNC (float64x1x4_t, float64_t, 1d, d, f64,)
-__LD4_LANE_FUNC (poly8x8x4_t, poly8_t, 8b, b, p8,)
-__LD4_LANE_FUNC (poly16x4x4_t, poly16_t, 4h, h, p16,)
-__LD4_LANE_FUNC (int16x4x4_t, int16_t, 4h, h, s16,)
-__LD4_LANE_FUNC (int32x2x4_t, int32_t, 2s, s, s32,)
-__LD4_LANE_FUNC (int64x1x4_t, int64_t, 1d, d, s64,)
-__LD4_LANE_FUNC (uint8x8x4_t, uint8_t, 8b, b, u8,)
-__LD4_LANE_FUNC (uint16x4x4_t, uint16_t, 4h, h, u16,)
-__LD4_LANE_FUNC (uint32x2x4_t, uint32_t, 2s, s, u32,)
-__LD4_LANE_FUNC (uint64x1x4_t, uint64_t, 1d, d, u64,)
-__LD4_LANE_FUNC (float32x4x4_t, float32_t, 4s, s, f32, q)
-__LD4_LANE_FUNC (float64x2x4_t, float64_t, 2d, d, f64, q)
-__LD4_LANE_FUNC (poly8x16x4_t, poly8_t, 16b, b, p8, q)
-__LD4_LANE_FUNC (poly16x8x4_t, poly16_t, 8h, h, p16, q)
-__LD4_LANE_FUNC (int8x16x4_t, int8_t, 16b, b, s8, q)
-__LD4_LANE_FUNC (int16x8x4_t, int16_t, 8h, h, s16, q)
-__LD4_LANE_FUNC (int32x4x4_t, int32_t, 4s, s, s32, q)
-__LD4_LANE_FUNC (int64x2x4_t, int64_t, 2d, d, s64, q)
-__LD4_LANE_FUNC (uint8x16x4_t, uint8_t, 16b, b, u8, q)
-__LD4_LANE_FUNC (uint16x8x4_t, uint16_t, 8h, h, u16, q)
-__LD4_LANE_FUNC (uint32x4x4_t, uint32_t, 4s, s, u32, q)
-__LD4_LANE_FUNC (uint64x2x4_t, uint64_t, 2d, d, u64, q)
 
 #define __ST2_LANE_FUNC(intype, largetype, ptrtype,                         \
                        mode, ptr_mode, funcsuffix, signedtype)              \
@@ -18379,6 +18254,368 @@ vld4q_dup_f64 (const float64_t * __a)
   return ret;
 }
 
+/* vld2_lane */
+
+#define __LD2_LANE_FUNC(intype, vectype, largetype, ptrtype,              \
+                        mode, ptrmode, funcsuffix, signedtype)            \
+__extension__ static __inline intype __attribute__ ((__always_inline__))   \
+vld2_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c)  \
+{                                                                         \
+  __builtin_aarch64_simd_oi __o;                                          \
+  largetype __temp;                                                       \
+  __temp.val[0] =                                                         \
+    vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0));         \
+  __temp.val[1] =                                                         \
+    vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0));         \
+  __o = __builtin_aarch64_set_qregoi##mode (__o,                          \
+                                          (signedtype) __temp.val[0],     \
+                                          0);                             \
+  __o = __builtin_aarch64_set_qregoi##mode (__o,                          \
+                                          (signedtype) __temp.val[1],     \
+                                          1);                             \
+  __o =        __builtin_aarch64_ld2_lane##mode (                                 \
+         (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c);           \
+  __b.val[0] = (vectype) __builtin_aarch64_get_dregoidi (__o, 0);         \
+  __b.val[1] = (vectype) __builtin_aarch64_get_dregoidi (__o, 1);         \
+  return __b;                                                             \
+}
+
+__LD2_LANE_FUNC (float32x2x2_t, float32x2_t, float32x4x2_t, float32_t, v4sf,
+                sf, f32, float32x4_t)
+__LD2_LANE_FUNC (poly8x8x2_t, poly8x8_t, poly8x16x2_t, poly8_t, v16qi, qi, p8,
+                int8x16_t)
+__LD2_LANE_FUNC (poly16x4x2_t, poly16x4_t, poly16x8x2_t, poly16_t, v8hi, hi,
+                p16, int16x8_t)
+__LD2_LANE_FUNC (int8x8x2_t, int8x8_t, int8x16x2_t, int8_t, v16qi, qi, s8,
+                int8x16_t)
+__LD2_LANE_FUNC (int16x4x2_t, int16x4_t, int16x8x2_t, int16_t, v8hi, hi, s16,
+                int16x8_t)
+__LD2_LANE_FUNC (int32x2x2_t, int32x2_t, int32x4x2_t, int32_t, v4si, si, s32,
+                int32x4_t)
+__LD2_LANE_FUNC (int64x1x2_t, int64x1_t, int64x2x2_t, int64_t, v2di, di, s64,
+                int64x2_t)
+__LD2_LANE_FUNC (uint8x8x2_t, uint8x8_t, uint8x16x2_t, uint8_t, v16qi, qi, u8,
+                int8x16_t)
+__LD2_LANE_FUNC (uint16x4x2_t, uint16x4_t, uint16x8x2_t, uint16_t, v8hi, hi,
+                u16, int16x8_t)
+__LD2_LANE_FUNC (uint32x2x2_t, uint32x2_t, uint32x4x2_t, uint32_t, v4si, si,
+                u32, int32x4_t)
+__LD2_LANE_FUNC (uint64x1x2_t, uint64x1_t, uint64x2x2_t, uint64_t, v2di, di,
+                u64, int64x2_t)
+
+#undef __LD2_LANE_FUNC
+
+__extension__ static __inline float64x1x2_t
+__attribute__ ((__always_inline__))
+vld2_lane_f64 (const float64_t *ptr, float64x1x2_t b, const int c)
+{
+  float64x1x2_t result;
+  __asm__ ("ld1 {v16.1d, v17.1d}, %1\n\t"
+          "ld2 {v16.d, v17.d}[%3], %2\n\t"
+          "st1 {v16.1d, v17.1d}, %0\n\t"
+          : "=Q"(result)
+          : "Q"(b), "Q"(*(const float64x1x2_t *)ptr), "i"(c)
+          : "memory", "v16", "v17");
+  return result;
+}
+
+
+
+/* vld2q_lane */
+
+#define __LD2_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
+__extension__ static __inline intype __attribute__ ((__always_inline__))   \
+vld2q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{                                                                         \
+  __builtin_aarch64_simd_oi __o;                                          \
+  intype ret;                                                             \
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[0], 0); \
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[1], 1); \
+  __o = __builtin_aarch64_ld2_lane##mode (                                \
+       (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c);             \
+  ret.val[0] = (vtype) __builtin_aarch64_get_qregoiv4si (__o, 0);         \
+  ret.val[1] = (vtype) __builtin_aarch64_get_qregoiv4si (__o, 1);         \
+  return ret;                                                             \
+}
+
+__LD2_LANE_FUNC (float32x4x2_t, float32x4_t, float32_t, v4sf, sf, f32)
+__LD2_LANE_FUNC (poly8x16x2_t, poly8x16_t, poly8_t, v16qi, qi, p8)
+__LD2_LANE_FUNC (poly16x8x2_t, poly16x8_t, poly16_t, v8hi, hi, p16)
+__LD2_LANE_FUNC (int8x16x2_t, int8x16_t, int8_t, v16qi, qi, s8)
+__LD2_LANE_FUNC (int16x8x2_t, int16x8_t, int16_t, v8hi, hi, s16)
+__LD2_LANE_FUNC (int32x4x2_t, int32x4_t, int32_t, v4si, si, s32)
+__LD2_LANE_FUNC (int64x2x2_t, int64x2_t, int64_t, v2di, di, s64)
+__LD2_LANE_FUNC (uint8x16x2_t, uint8x16_t, uint8_t, v16qi, qi, u8)
+__LD2_LANE_FUNC (uint16x8x2_t, uint16x8_t, uint16_t, v8hi, hi, u16)
+__LD2_LANE_FUNC (uint32x4x2_t, uint32x4_t, uint32_t, v4si, si, u32)
+__LD2_LANE_FUNC (uint64x2x2_t, uint64x2_t, uint64_t, v2di, di, u64)
+
+__extension__ static __inline float64x2x2_t
+__attribute__ ((__always_inline__))
+vld2q_lane_f64 (const float64_t *ptr, float64x2x2_t b, const int c)
+{
+  float64x2x2_t result;
+  __asm__ ("ld1 {v16.2d, v17.2d}, %1\n\t"
+          "ld2 {v16.d, v17.d}[%3], %2\n\t"
+          "st1 {v16.2d, v17.2d}, %0\n\t"
+          : "=Q"(result)
+          : "Q"(b), "Q"(*(const float64x2x2_t *)ptr), "i"(c)
+          : "memory", "v16", "v17");
+  return result;
+}
+
+
+#undef __LD2_LANE_FUNC
+
+/* vld3_lane */
+
+#define __LD3_LANE_FUNC(intype, vectype, largetype, ptrtype,              \
+                        mode, ptrmode, funcsuffix, signedtype)            \
+__extension__ static __inline intype __attribute__ ((__always_inline__))   \
+vld3_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c)  \
+{                                                                         \
+  __builtin_aarch64_simd_ci __o;                                          \
+  largetype __temp;                                                       \
+  __temp.val[0] =                                                         \
+    vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0));         \
+  __temp.val[1] =                                                         \
+    vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0));         \
+  __temp.val[2] =                                                         \
+    vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0));         \
+  __o = __builtin_aarch64_set_qregci##mode (__o,                          \
+                                          (signedtype) __temp.val[0],     \
+                                          0);                             \
+  __o = __builtin_aarch64_set_qregci##mode (__o,                          \
+                                          (signedtype) __temp.val[1],     \
+                                          1);                             \
+  __o = __builtin_aarch64_set_qregci##mode (__o,                          \
+                                          (signedtype) __temp.val[2],     \
+                                          2);                             \
+  __o =        __builtin_aarch64_ld3_lane##mode (                                 \
+         (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c);           \
+  __b.val[0] = (vectype) __builtin_aarch64_get_dregcidi (__o, 0);         \
+  __b.val[1] = (vectype) __builtin_aarch64_get_dregcidi (__o, 1);         \
+  __b.val[2] = (vectype) __builtin_aarch64_get_dregcidi (__o, 2);         \
+  return __b;                                                             \
+}
+
+__LD3_LANE_FUNC (float32x2x3_t, float32x2_t, float32x4x3_t, float32_t, v4sf,
+                sf, f32, float32x4_t)
+__LD3_LANE_FUNC (poly8x8x3_t, poly8x8_t, poly8x16x3_t, poly8_t, v16qi, qi, p8,
+                int8x16_t)
+__LD3_LANE_FUNC (poly16x4x3_t, poly16x4_t, poly16x8x3_t, poly16_t, v8hi, hi,
+                p16, int16x8_t)
+__LD3_LANE_FUNC (int8x8x3_t, int8x8_t, int8x16x3_t, int8_t, v16qi, qi, s8,
+                int8x16_t)
+__LD3_LANE_FUNC (int16x4x3_t, int16x4_t, int16x8x3_t, int16_t, v8hi, hi, s16,
+                int16x8_t)
+__LD3_LANE_FUNC (int32x2x3_t, int32x2_t, int32x4x3_t, int32_t, v4si, si, s32,
+                int32x4_t)
+__LD3_LANE_FUNC (int64x1x3_t, int64x1_t, int64x2x3_t, int64_t, v2di, di, s64,
+                int64x2_t)
+__LD3_LANE_FUNC (uint8x8x3_t, uint8x8_t, uint8x16x3_t, uint8_t, v16qi, qi, u8,
+                int8x16_t)
+__LD3_LANE_FUNC (uint16x4x3_t, uint16x4_t, uint16x8x3_t, uint16_t, v8hi, hi,
+                u16, int16x8_t)
+__LD3_LANE_FUNC (uint32x2x3_t, uint32x2_t, uint32x4x3_t, uint32_t, v4si, si,
+                u32, int32x4_t)
+__LD3_LANE_FUNC (uint64x1x3_t, uint64x1_t, uint64x2x3_t, uint64_t, v2di, di,
+                u64, int64x2_t)
+
+#undef __LD3_LANE_FUNC
+
+__extension__ static __inline float64x1x3_t
+__attribute__ ((__always_inline__))
+vld3_lane_f64 (const float64_t *ptr, float64x1x3_t b, const int c)
+{
+  float64x1x3_t result;
+  __asm__ ("ld1 {v16.1d - v18.1d}, %1\n\t"
+          "ld3 {v16.d - v18.d}[%3], %2\n\t"
+          "st1 {v16.1d - v18.1d}, %0\n\t"
+          : "=Q"(result)
+          : "Q"(b), "Q"(*(const float64x1x3_t *)ptr), "i"(c)
+          : "memory", "v16", "v17", "v18");
+  return result;
+}
+
+
+/* vld3q_lane */
+
+#define __LD3_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
+__extension__ static __inline intype __attribute__ ((__always_inline__))   \
+vld3q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{                                                                         \
+  __builtin_aarch64_simd_ci __o;                                          \
+  intype ret;                                                             \
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0); \
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1); \
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2); \
+  __o = __builtin_aarch64_ld3_lane##mode (                                \
+       (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c);             \
+  ret.val[0] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 0);         \
+  ret.val[1] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 1);         \
+  ret.val[2] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 2);         \
+  return ret;                                                             \
+}
+
+__LD3_LANE_FUNC (float32x4x3_t, float32x4_t, float32_t, v4sf, sf, f32)
+__LD3_LANE_FUNC (poly8x16x3_t, poly8x16_t, poly8_t, v16qi, qi, p8)
+__LD3_LANE_FUNC (poly16x8x3_t, poly16x8_t, poly16_t, v8hi, hi, p16)
+__LD3_LANE_FUNC (int8x16x3_t, int8x16_t, int8_t, v16qi, qi, s8)
+__LD3_LANE_FUNC (int16x8x3_t, int16x8_t, int16_t, v8hi, hi, s16)
+__LD3_LANE_FUNC (int32x4x3_t, int32x4_t, int32_t, v4si, si, s32)
+__LD3_LANE_FUNC (int64x2x3_t, int64x2_t, int64_t, v2di, di, s64)
+__LD3_LANE_FUNC (uint8x16x3_t, uint8x16_t, uint8_t, v16qi, qi, u8)
+__LD3_LANE_FUNC (uint16x8x3_t, uint16x8_t, uint16_t, v8hi, hi, u16)
+__LD3_LANE_FUNC (uint32x4x3_t, uint32x4_t, uint32_t, v4si, si, u32)
+__LD3_LANE_FUNC (uint64x2x3_t, uint64x2_t, uint64_t, v2di, di, u64)
+
+__extension__ static __inline float64x2x3_t
+__attribute__ ((__always_inline__))
+vld3q_lane_f64 (const float64_t *ptr, float64x2x3_t b, const int c)
+{
+  float64x2x3_t result;
+  __asm__ ("ld1 {v16.2d - v18.2d}, %1\n\t"
+          "ld3 {v16.d - v18.d}[%3], %2\n\t"
+          "st1 {v16.2d - v18.2d}, %0\n\t"
+          : "=Q"(result)
+          : "Q"(b), "Q"(*(const float64x2x3_t *)ptr), "i"(c)
+          : "memory", "v16", "v17", "v18");
+  return result;
+}
+
+
+#undef __LD3_LANE_FUNC
+
+/* vld4_lane */
+
+#define __LD4_LANE_FUNC(intype, vectype, largetype, ptrtype,              \
+                        mode, ptrmode, funcsuffix, signedtype)            \
+__extension__ static __inline intype __attribute__ ((__always_inline__))   \
+vld4_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c)  \
+{                                                                         \
+  __builtin_aarch64_simd_xi __o;                                          \
+  largetype __temp;                                                       \
+  __temp.val[0] =                                                         \
+    vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0));         \
+  __temp.val[1] =                                                         \
+    vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0));         \
+  __temp.val[2] =                                                         \
+    vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0));         \
+  __temp.val[3] =                                                         \
+    vcombine_##funcsuffix (__b.val[3], vcreate_##funcsuffix (0));         \
+  __o = __builtin_aarch64_set_qregxi##mode (__o,                          \
+                                          (signedtype) __temp.val[0],     \
+                                          0);                             \
+  __o = __builtin_aarch64_set_qregxi##mode (__o,                          \
+                                          (signedtype) __temp.val[1],     \
+                                          1);                             \
+  __o = __builtin_aarch64_set_qregxi##mode (__o,                          \
+                                          (signedtype) __temp.val[2],     \
+                                          2);                             \
+  __o = __builtin_aarch64_set_qregxi##mode (__o,                          \
+                                          (signedtype) __temp.val[3],     \
+                                          3);                             \
+  __o =        __builtin_aarch64_ld4_lane##mode (                                 \
+         (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c);           \
+  __b.val[0] = (vectype) __builtin_aarch64_get_dregxidi (__o, 0);         \
+  __b.val[1] = (vectype) __builtin_aarch64_get_dregxidi (__o, 1);         \
+  __b.val[2] = (vectype) __builtin_aarch64_get_dregxidi (__o, 2);         \
+  __b.val[3] = (vectype) __builtin_aarch64_get_dregxidi (__o, 3);         \
+  return __b;                                                             \
+}
+
+/* vld4q_lane */
+
+__LD4_LANE_FUNC (float32x2x4_t, float32x2_t, float32x4x4_t, float32_t, v4sf,
+                sf, f32, float32x4_t)
+__LD4_LANE_FUNC (poly8x8x4_t, poly8x8_t, poly8x16x4_t, poly8_t, v16qi, qi, p8,
+                int8x16_t)
+__LD4_LANE_FUNC (poly16x4x4_t, poly16x4_t, poly16x8x4_t, poly16_t, v8hi, hi,
+                p16, int16x8_t)
+__LD4_LANE_FUNC (int8x8x4_t, int8x8_t, int8x16x4_t, int8_t, v16qi, qi, s8,
+                int8x16_t)
+__LD4_LANE_FUNC (int16x4x4_t, int16x4_t, int16x8x4_t, int16_t, v8hi, hi, s16,
+                int16x8_t)
+__LD4_LANE_FUNC (int32x2x4_t, int32x2_t, int32x4x4_t, int32_t, v4si, si, s32,
+                int32x4_t)
+__LD4_LANE_FUNC (int64x1x4_t, int64x1_t, int64x2x4_t, int64_t, v2di, di, s64,
+                int64x2_t)
+__LD4_LANE_FUNC (uint8x8x4_t, uint8x8_t, uint8x16x4_t, uint8_t, v16qi, qi, u8,
+                int8x16_t)
+__LD4_LANE_FUNC (uint16x4x4_t, uint16x4_t, uint16x8x4_t, uint16_t, v8hi, hi,
+                u16, int16x8_t)
+__LD4_LANE_FUNC (uint32x2x4_t, uint32x2_t, uint32x4x4_t, uint32_t, v4si, si,
+                u32, int32x4_t)
+__LD4_LANE_FUNC (uint64x1x4_t, uint64x1_t, uint64x2x4_t, uint64_t, v2di, di,
+                u64, int64x2_t)
+
+#undef __LD4_LANE_FUNC
+
+__extension__ static __inline float64x1x4_t
+__attribute__ ((__always_inline__))
+vld4_lane_f64 (const float64_t *ptr, float64x1x4_t b, const int c)
+{
+  float64x1x4_t result;
+  __asm__ ("ld1 {v16.1d - v19.1d}, %1\n\t"
+          "ld4 {v16.d - v19.d}[%3], %2\n\t"
+          "st1 {v16.1d - v19.1d}, %0\n\t"
+          : "=Q"(result)
+          : "Q"(b), "Q"(*(const float64x1x4_t *)ptr), "i"(c)
+          : "memory", "v16", "v17", "v18", "v19");
+  return result;
+}
+
+
+/* vld4q_lane */
+
+#define __LD4_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
+__extension__ static __inline intype __attribute__ ((__always_inline__))   \
+vld4q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{                                                                         \
+  __builtin_aarch64_simd_xi __o;                                          \
+  intype ret;                                                             \
+  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); \
+  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); \
+  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); \
+  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); \
+  __o = __builtin_aarch64_ld4_lane##mode (                                \
+       (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c);             \
+  ret.val[0] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 0);         \
+  ret.val[1] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 1);         \
+  ret.val[2] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 2);         \
+  ret.val[3] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 3);         \
+  return ret;                                                             \
+}
+
+__LD4_LANE_FUNC (float32x4x4_t, float32x4_t, float32_t, v4sf, sf, f32)
+__LD4_LANE_FUNC (poly8x16x4_t, poly8x16_t, poly8_t, v16qi, qi, p8)
+__LD4_LANE_FUNC (poly16x8x4_t, poly16x8_t, poly16_t, v8hi, hi, p16)
+__LD4_LANE_FUNC (int8x16x4_t, int8x16_t, int8_t, v16qi, qi, s8)
+__LD4_LANE_FUNC (int16x8x4_t, int16x8_t, int16_t, v8hi, hi, s16)
+__LD4_LANE_FUNC (int32x4x4_t, int32x4_t, int32_t, v4si, si, s32)
+__LD4_LANE_FUNC (int64x2x4_t, int64x2_t, int64_t, v2di, di, s64)
+__LD4_LANE_FUNC (uint8x16x4_t, uint8x16_t, uint8_t, v16qi, qi, u8)
+__LD4_LANE_FUNC (uint16x8x4_t, uint16x8_t, uint16_t, v8hi, hi, u16)
+__LD4_LANE_FUNC (uint32x4x4_t, uint32x4_t, uint32_t, v4si, si, u32)
+__LD4_LANE_FUNC (uint64x2x4_t, uint64x2_t, uint64_t, v2di, di, u64)
+
+__extension__ static __inline float64x2x4_t
+__attribute__ ((__always_inline__))
+vld4q_lane_f64 (const float64_t *ptr, float64x2x4_t b, const int c)
+{
+  float64x2x4_t result;
+  __asm__ ("ld1 {v16.2d - v19.2d}, %1\n\t"
+          "ld4 {v16.d - v19.d}[%3], %2\n\t"
+          "st1 {v16.2d - v19.2d}, %0\n\t"
+          : "=Q"(result)
+          : "Q"(b), "Q"(*(const float64x2x4_t *)ptr), "i"(c)
+          : "memory", "v16", "v17", "v18", "v19");
+  return result;
+}
+
+#undef __LD4_LANE_FUNC
+
 /* vmax */
 
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
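
To make the rewritten __LD2_LANE_FUNC scheme concrete, the vld2_lane_s32 instantiation above expands to roughly the following: the two D-register inputs are widened to Q registers with vcombine, packed into an OI tuple, passed to the new builtin, and the low halves extracted again. This hand-expansion is for illustration only:

    __extension__ static __inline int32x2x2_t
    __attribute__ ((__always_inline__))
    vld2_lane_s32 (const int32_t * __ptr, int32x2x2_t __b, const int __c)
    {
      __builtin_aarch64_simd_oi __o;
      int32x4x2_t __temp;
      /* Widen each 64-bit input vector to 128 bits, zero in the high half.  */
      __temp.val[0] = vcombine_s32 (__b.val[0], vcreate_s32 (0));
      __temp.val[1] = vcombine_s32 (__b.val[1], vcreate_s32 (0));
      /* Pack the two Q registers into an OI register tuple.  */
      __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[0], 0);
      __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[1], 1);
      /* Perform the lane load through the new builtin.  */
      __o = __builtin_aarch64_ld2_lanev4si
              ((__builtin_aarch64_simd_si *) __ptr, __o, __c);
      /* Extract the low 64 bits of each result vector.  */
      __b.val[0] = (int32x2_t) __builtin_aarch64_get_dregoidi (__o, 0);
      __b.val[1] = (int32x2_t) __builtin_aarch64_get_dregoidi (__o, 1);
      return __b;
    }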