[AArch64] Implement vmul<q>_lane<q>_<fsu><16,32,64> intrinsics in C

author jgreenhalgh <jgreenhalgh@138bc75d-0d04-0410-961f-82ee72b054a4>

Mon, 16 Sep 2013 09:50:21 +0000 (09:50 +0000)

committer jgreenhalgh <jgreenhalgh@138bc75d-0d04-0410-961f-82ee72b054a4>

Mon, 16 Sep 2013 09:50:21 +0000 (09:50 +0000)
author jgreenhalgh <jgreenhalgh@138bc75d-0d04-0410-961f-82ee72b054a4>
Mon, 16 Sep 2013 09:50:21 +0000 (09:50 +0000)
committer jgreenhalgh <jgreenhalgh@138bc75d-0d04-0410-961f-82ee72b054a4>
Mon, 16 Sep 2013 09:50:21 +0000 (09:50 +0000)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog

index 9f25d92..aaab5ec 100644 (file)
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,5 +1,19 @@
  2013-09-16  James Greenhalgh  <james.greenhalgh@arm.com>
  
+       * config/aarch64/aarch64-simd.md (aarch64_mul3_elt<mode>): New.
+       (aarch64_mul3_elt_<vswap_width_name><mode>): Likewise.
+       (aarch64_mul3_elt_to_128df): Likewise.
+       (aarch64_mul3_elt_to_64v2df): Likewise.
+       * config/aarch64/iterators.md (VEL): Also handle DFmode.
+       (VMUL): New.
+       (VMUL_CHANGE_NLANES) Likewise.
+       (h_con): Likewise.
+       (f): Likewise.
+       * config/aarch64/arm_neon.h
+       (vmul<q>_lane<q>_<suf><16,32,64>): Convert to C implementation.
+
+2013-09-16  James Greenhalgh  <james.greenhalgh@arm.com>
+
         * config/aarch64/arm_neon.h
         (vcvtx_high_f32_f64): Fix parameters.
  
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md

index 9805197..04d5794 100644 (file)
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -582,6 +582,59 @@
     (set_attr "simd_mode" "<MODE>")]
  )
  
+(define_insn "*aarch64_mul3_elt<mode>"
+ [(set (match_operand:VMUL 0 "register_operand" "=w")
+    (mult:VMUL
+      (vec_duplicate:VMUL
+         (vec_select:<VEL>
+           (match_operand:VMUL 1 "register_operand" "<h_con>")
+           (parallel [(match_operand:SI 2 "immediate_operand")])))
+      (match_operand:VMUL 3 "register_operand" "w")))]
+  "TARGET_SIMD"
+  "<f>mul\\t%0.<Vtype>, %3.<Vtype>, %1.<Vetype>[%2]"
+  [(set_attr "simd_type" "simd_<f>mul_elt")
+   (set_attr "simd_mode" "<MODE>")]
+)
+
+(define_insn "*aarch64_mul3_elt_<vswap_width_name><mode>"
+  [(set (match_operand:VMUL_CHANGE_NLANES 0 "register_operand" "=w")
+     (mult:VMUL_CHANGE_NLANES
+       (vec_duplicate:VMUL_CHANGE_NLANES
+         (vec_select:<VEL>
+           (match_operand:<VSWAP_WIDTH> 1 "register_operand" "<h_con>")
+           (parallel [(match_operand:SI 2 "immediate_operand")])))
+      (match_operand:VMUL_CHANGE_NLANES 3 "register_operand" "w")))]
+  "TARGET_SIMD"
+  "<f>mul\\t%0.<Vtype>, %3.<Vtype>, %1.<Vetype>[%2]"
+  [(set_attr "simd_type" "simd_<f>mul_elt")
+   (set_attr "simd_mode" "<MODE>")]
+)
+
+(define_insn "*aarch64_mul3_elt_to_128df"
+  [(set (match_operand:V2DF 0 "register_operand" "=w")
+     (mult:V2DF
+       (vec_duplicate:V2DF
+        (match_operand:DF 2 "register_operand" "w"))
+      (match_operand:V2DF 1 "register_operand" "w")))]
+  "TARGET_SIMD"
+  "fmul\\t%0.2d, %1.2d, %2.d[0]"
+  [(set_attr "simd_type" "simd_fmul_elt")
+   (set_attr "simd_mode" "V2DF")]
+)
+
+(define_insn "*aarch64_mul3_elt_to_64v2df"
+  [(set (match_operand:DF 0 "register_operand" "=w")
+     (mult:DF
+       (vec_select:DF
+        (match_operand:V2DF 1 "register_operand" "w")
+        (parallel [(match_operand:SI 2 "immediate_operand")]))
+       (match_operand:DF 3 "register_operand" "w")))]
+  "TARGET_SIMD"
+  "fmul\\t%0.2d, %3.2d, %1.d[%2]"
+  [(set_attr "simd_type" "simd_fmul_elt")
+   (set_attr "simd_mode" "V2DF")]
+)
+
  (define_insn "neg<mode>2"
    [(set (match_operand:VDQ 0 "register_operand" "=w")
         (neg:VDQ (match_operand:VDQ 1 "register_operand" "w")))]
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h

index 23b1116..6c9dd79 100644 (file)
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -9501,136 +9501,6 @@ vmovq_n_u64 (uint64_t a)
    return result;
  }
  
-#define vmul_lane_f32(a, b, c)                                          \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x2_t b_ = (b);                                            \
-       float32x2_t a_ = (a);                                            \
-       float32x2_t result;                                              \
-       __asm__ ("fmul %0.2s,%1.2s,%2.s[%3]"                             \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmul_lane_s16(a, b, c)                                          \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x4_t b_ = (b);                                              \
-       int16x4_t a_ = (a);                                              \
-       int16x4_t result;                                                \
-       __asm__ ("mul %0.4h,%1.4h,%2.h[%3]"                              \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmul_lane_s32(a, b, c)                                          \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x2_t b_ = (b);                                              \
-       int32x2_t a_ = (a);                                              \
-       int32x2_t result;                                                \
-       __asm__ ("mul %0.2s,%1.2s,%2.s[%3]"                              \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmul_lane_u16(a, b, c)                                          \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x4_t b_ = (b);                                             \
-       uint16x4_t a_ = (a);                                             \
-       uint16x4_t result;                                               \
-       __asm__ ("mul %0.4h,%1.4h,%2.h[%3]"                              \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmul_lane_u32(a, b, c)                                          \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x2_t b_ = (b);                                             \
-       uint32x2_t a_ = (a);                                             \
-       uint32x2_t result;                                               \
-       __asm__ ("mul %0.2s, %1.2s, %2.s[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmul_laneq_f32(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x4_t b_ = (b);                                            \
-       float32x2_t a_ = (a);                                            \
-       float32x2_t result;                                              \
-       __asm__ ("fmul %0.2s, %1.2s, %2.s[%3]"                           \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmul_laneq_s16(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x8_t b_ = (b);                                              \
-       int16x4_t a_ = (a);                                              \
-       int16x4_t result;                                                \
-       __asm__ ("mul %0.4h, %1.4h, %2.h[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmul_laneq_s32(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x4_t b_ = (b);                                              \
-       int32x2_t a_ = (a);                                              \
-       int32x2_t result;                                                \
-       __asm__ ("mul %0.2s, %1.2s, %2.s[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmul_laneq_u16(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x8_t b_ = (b);                                             \
-       uint16x4_t a_ = (a);                                             \
-       uint16x4_t result;                                               \
-       __asm__ ("mul %0.4h, %1.4h, %2.h[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmul_laneq_u32(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x4_t b_ = (b);                                             \
-       uint32x2_t a_ = (a);                                             \
-       uint32x2_t result;                                               \
-       __asm__ ("mul %0.2s, %1.2s, %2.s[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
  __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
  vmul_n_f32 (float32x2_t a, float32_t b)
  {
@@ -10149,162 +10019,6 @@ vmull_u32 (uint32x2_t a, uint32x2_t b)
    return result;
  }
  
-#define vmulq_lane_f32(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x2_t b_ = (b);                                            \
-       float32x4_t a_ = (a);                                            \
-       float32x4_t result;                                              \
-       __asm__ ("fmul %0.4s, %1.4s, %2.s[%3]"                           \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmulq_lane_f64(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       float64x1_t b_ = (b);                                            \
-       float64x2_t a_ = (a);                                            \
-       float64x2_t result;                                              \
-       __asm__ ("fmul %0.2d,%1.2d,%2.d[%3]"                             \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmulq_lane_s16(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x4_t b_ = (b);                                              \
-       int16x8_t a_ = (a);                                              \
-       int16x8_t result;                                                \
-       __asm__ ("mul %0.8h,%1.8h,%2.h[%3]"                              \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmulq_lane_s32(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x2_t b_ = (b);                                              \
-       int32x4_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("mul %0.4s,%1.4s,%2.s[%3]"                              \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmulq_lane_u16(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x4_t b_ = (b);                                             \
-       uint16x8_t a_ = (a);                                             \
-       uint16x8_t result;                                               \
-       __asm__ ("mul %0.8h,%1.8h,%2.h[%3]"                              \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmulq_lane_u32(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x2_t b_ = (b);                                             \
-       uint32x4_t a_ = (a);                                             \
-       uint32x4_t result;                                               \
-       __asm__ ("mul %0.4s, %1.4s, %2.s[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmulq_laneq_f32(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x4_t b_ = (b);                                            \
-       float32x4_t a_ = (a);                                            \
-       float32x4_t result;                                              \
-       __asm__ ("fmul %0.4s, %1.4s, %2.s[%3]"                           \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmulq_laneq_f64(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       float64x2_t b_ = (b);                                            \
-       float64x2_t a_ = (a);                                            \
-       float64x2_t result;                                              \
-       __asm__ ("fmul %0.2d,%1.2d,%2.d[%3]"                             \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmulq_laneq_s16(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x8_t b_ = (b);                                              \
-       int16x8_t a_ = (a);                                              \
-       int16x8_t result;                                                \
-       __asm__ ("mul %0.8h, %1.8h, %2.h[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmulq_laneq_s32(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x4_t b_ = (b);                                              \
-       int32x4_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("mul %0.4s, %1.4s, %2.s[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmulq_laneq_u16(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x8_t b_ = (b);                                             \
-       uint16x8_t a_ = (a);                                             \
-       uint16x8_t result;                                               \
-       __asm__ ("mul %0.8h, %1.8h, %2.h[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmulq_laneq_u32(a, b, c)                                        \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x4_t b_ = (b);                                             \
-       uint32x4_t a_ = (a);                                             \
-       uint32x4_t result;                                               \
-       __asm__ ("mul %0.4s, %1.4s, %2.s[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
  __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
  vmulq_n_f32 (float32x4_t a, float32_t b)
  {
@@ -21435,6 +21149,158 @@ vmlsq_f64 (float64x2_t a, float64x2_t b, float64x2_t c)
    return a - b * c;
  }
  
+/* vmul_lane  */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmul_lane_f32 (float32x2_t __a, float32x2_t __b, const int __lane)
+{
+  return __a * __aarch64_vget_lane_f32 (__b, __lane);
+}
+
+__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+vmul_lane_f64 (float64x1_t __a, float64x1_t __b, const int __lane)
+{
+  return __a * __b;
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmul_lane_s16 (int16x4_t __a, int16x4_t __b, const int __lane)
+{
+  return __a * __aarch64_vget_lane_s16 (__b, __lane);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmul_lane_s32 (int32x2_t __a, int32x2_t __b, const int __lane)
+{
+  return __a * __aarch64_vget_lane_s32 (__b, __lane);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmul_lane_u16 (uint16x4_t __a, uint16x4_t __b, const int __lane)
+{
+  return __a * __aarch64_vget_lane_u16 (__b, __lane);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmul_lane_u32 (uint32x2_t __a, uint32x2_t __b, const int __lane)
+{
+  return __a * __aarch64_vget_lane_u32 (__b, __lane);
+}
+
+/* vmul_laneq  */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmul_laneq_f32 (float32x2_t __a, float32x4_t __b, const int __lane)
+{
+  return __a * __aarch64_vgetq_lane_f32 (__b, __lane);
+}
+
+__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+vmul_laneq_f64 (float64x1_t __a, float64x2_t __b, const int __lane)
+{
+  return __a * __aarch64_vgetq_lane_f64 (__b, __lane);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmul_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __lane)
+{
+  return __a * __aarch64_vgetq_lane_s16 (__b, __lane);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmul_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __lane)
+{
+  return __a * __aarch64_vgetq_lane_s32 (__b, __lane);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmul_laneq_u16 (uint16x4_t __a, uint16x8_t __b, const int __lane)
+{
+  return __a * __aarch64_vgetq_lane_u16 (__b, __lane);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmul_laneq_u32 (uint32x2_t __a, uint32x4_t __b, const int __lane)
+{
+  return __a * __aarch64_vgetq_lane_u32 (__b, __lane);
+}
+
+/* vmulq_lane  */
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmulq_lane_f32 (float32x4_t __a, float32x2_t __b, const int __lane)
+{
+  return __a * __aarch64_vget_lane_f32 (__b, __lane);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vmulq_lane_f64 (float64x2_t __a, float64x1_t __b, const int __lane)
+{
+  return __a * __b;
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmulq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __lane)
+{
+  return __a * __aarch64_vget_lane_s16 (__b, __lane);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmulq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __lane)
+{
+  return __a * __aarch64_vget_lane_s32 (__b, __lane);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmulq_lane_u16 (uint16x8_t __a, uint16x4_t __b, const int __lane)
+{
+  return __a * __aarch64_vget_lane_u16 (__b, __lane);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmulq_lane_u32 (uint32x4_t __a, uint32x2_t __b, const int __lane)
+{
+  return __a * __aarch64_vget_lane_u32 (__b, __lane);
+}
+
+/* vmulq_laneq  */
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmulq_laneq_f32 (float32x4_t __a, float32x4_t __b, const int __lane)
+{
+  return __a * __aarch64_vgetq_lane_f32 (__b, __lane);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vmulq_laneq_f64 (float64x2_t __a, float64x2_t __b, const int __lane)
+{
+  return __a * __aarch64_vgetq_lane_f64 (__b, __lane);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmulq_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __lane)
+{
+  return __a * __aarch64_vgetq_lane_s16 (__b, __lane);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmulq_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __lane)
+{
+  return __a * __aarch64_vgetq_lane_s32 (__b, __lane);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmulq_laneq_u16 (uint16x8_t __a, uint16x8_t __b, const int __lane)
+{
+  return __a * __aarch64_vgetq_lane_u16 (__b, __lane);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmulq_laneq_u32 (uint32x4_t __a, uint32x4_t __b, const int __lane)
+{
+  return __a * __aarch64_vgetq_lane_u32 (__b, __lane);
+}
+
  /* vqabs */
  
  __extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md

index ffe125b..a6b3117 100644 (file)
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -169,6 +169,12 @@
  ;; Double scalar modes
  (define_mode_iterator DX [DI DF])
  
+;; Modes available for <f>mul lane operations.
+(define_mode_iterator VMUL [V4HI V8HI V2SI V4SI V2SF V4SF V2DF])
+
+;; Modes available for <f>mul lane operations changing lane count.
+(define_mode_iterator VMUL_CHANGE_NLANES [V4HI V8HI V2SI V4SI V2SF V4SF])
+
  ;; ------------------------------------------------------------------
  ;; Unspec enumerations for Advance SIMD. These could well go into
  ;; aarch64.md but for their use in int_iterators here.
@@ -358,7 +364,7 @@
                          (V2SI "SI") (V4SI "SI")
                          (DI "DI")   (V2DI "DI")
                          (V2SF "SF") (V4SF "SF")
-                        (V2DF "DF")
+                        (V2DF "DF") (DF "DF")
                         (SI   "SI") (HI   "HI")
                         (QI   "QI")])
  
@@ -541,6 +547,22 @@
                                     (V2SF "to_128") (V4SF  "to_64")
                                     (DF   "to_128") (V2DF  "to_64")])
  
+;; For certain vector-by-element multiplication instructions we must
+;; constrain the HI cases to use only V0-V15.  This is covered by
+;; the 'x' constraint.  All other modes may use the 'w' constraint.
+(define_mode_attr h_con [(V2SI "w") (V4SI "w")
+                        (V4HI "x") (V8HI "x")
+                        (V2SF "w") (V4SF "w")
+                        (V2DF "w") (DF "w")])
+
+;; Defined to 'f' for types whose element type is a float type.
+(define_mode_attr f [(V8QI "")  (V16QI "")
+                    (V4HI "")  (V8HI  "")
+                    (V2SI "")  (V4SI  "")
+                    (DI   "")  (V2DI  "")
+                    (V2SF "f") (V4SF  "f")
+                    (V2DF "f") (DF    "f")])
+
  ;; -------------------------------------------------------------------
  ;; Code Iterators
  ;; -------------------------------------------------------------------
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog

index 6b9b3bc..17ae8ee 100644 (file)
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,8 @@
+2013-09-16  James Greenhalgh  <james.greenhalgh@arm.com>
+
+       * gcc.target/aarch64/mul_intrinsic_1.c: New.
+       * gcc.target/aarch64/fmul_intrinsic_1.c: Likewise.
+
  2013-09-16  Richard Biener  <rguenther@suse.de>
  
         * gcc.dg/tree-ssa/ldist-22.c: New testcase.
diff --git a/gcc/testsuite/gcc.target/aarch64/fmul_intrinsic_1.c b/gcc/testsuite/gcc.target/aarch64/fmul_intrinsic_1.c

new file mode 100644 (file)

index 0000000..f6e32f4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fmul_intrinsic_1.c
@@ -0,0 +1,116 @@
+/* { dg-do run } */
+/* { dg-options "-O3 --save-temps" } */
+
+#include <arm_neon.h>
+
+#define DELTA 0.0001
+extern void abort (void);
+extern double fabs (double);
+
+#define TEST_VMUL(q1, q2, size, in1_lanes, in2_lanes)                  \
+static void                                                            \
+test_vmul##q1##_lane##q2##_f##size (float##size##_t * res,             \
+                                  const float##size##_t *in1,          \
+                                  const float##size##_t *in2)          \
+{                                                                      \
+  float##size##x##in1_lanes##_t a = vld1##q1##_f##size (res);          \
+  float##size##x##in1_lanes##_t b = vld1##q1##_f##size (in1);          \
+  float##size##x##in2_lanes##_t c;                                     \
+  if (in2_lanes > 1)                                                   \
+    {                                                                  \
+      c = vld1##q2##_f##size (in2);                                    \
+      a = vmul##q1##_lane##q2##_f##size (b, c, 1);                     \
+    }                                                                  \
+  else                                                                 \
+    {                                                                  \
+      c = vld1##q2##_f##size (in2 + 1);                                        \
+      a = vmul##q1##_lane##q2##_f##size (b, c, 0);                     \
+    }                                                                  \
+  vst1##q1##_f##size (res, a);                                         \
+}
+
+#define BUILD_VARS(width, n_lanes, n_half_lanes)               \
+TEST_VMUL ( ,  , width, n_half_lanes, n_half_lanes)            \
+TEST_VMUL (q,  , width, n_lanes, n_half_lanes)                 \
+TEST_VMUL ( , q, width, n_half_lanes, n_lanes)                 \
+TEST_VMUL (q, q, width, n_lanes, n_lanes)
+
+BUILD_VARS (32, 4, 2)
+BUILD_VARS (64, 2, 1)
+
+#define POOL2 {0.0, 1.0}
+#define POOL4 {0.0, 1.0, 2.0, 3.0}
+#define EMPTY2 {0.0, 0.0}
+#define EMPTY4 {0.0, 0.0, 0.0, 0.0}
+
+#define BUILD_TEST(size, lanes)                                        \
+static void                                                    \
+test_f##size (void)                                            \
+{                                                              \
+  int i;                                                       \
+  float##size##_t pool[lanes] = POOL##lanes;                   \
+  float##size##_t res[lanes] = EMPTY##lanes;                   \
+  float##size##_t res2[lanes] = EMPTY##lanes;                  \
+  float##size##_t res3[lanes] = EMPTY##lanes;                  \
+  float##size##_t res4[lanes] = EMPTY##lanes;                  \
+                                                               \
+  /* Avoid constant folding the multiplication.  */            \
+  asm volatile ("" : : : "memory");                            \
+  test_vmul_lane_f##size (res, pool, pool);                    \
+  /* Avoid fusing multiplication and subtraction.  */          \
+  asm volatile ("" : :"Q" (res) : "memory");                   \
+  for (i = 0; i < lanes / 2; i++)                              \
+    if (fabs (res[i] - pool[i]) > DELTA)                       \
+      abort ();                                                        \
+                                                               \
+  test_vmulq_lane_f##size (res2, pool, pool);                  \
+  /* Avoid fusing multiplication and subtraction.  */          \
+  asm volatile ("" : :"Q" (res2) : "memory");                  \
+  for (i = 0; i < lanes; i++)                                  \
+    if (fabs (res2[i] - pool[i]) > DELTA)                      \
+      abort ();                                                        \
+                                                               \
+  test_vmul_laneq_f##size (res3, pool, pool);                  \
+  /* Avoid fusing multiplication and subtraction.  */          \
+  asm volatile ("" : :"Q" (res3) : "memory");                  \
+  for (i = 0; i < lanes / 2; i++)                              \
+    if (fabs (res3[i] - pool[i]) > DELTA)                      \
+      abort ();                                                        \
+                                                               \
+  test_vmulq_laneq_f##size (res4, pool, pool);                 \
+  /* Avoid fusing multiplication and subtraction.  */          \
+  asm volatile ("" : :"Q" (res4) : "memory");                  \
+  for (i = 0; i < lanes; i++)                                  \
+    if (fabs (res4[i] - pool[i]) > DELTA)                      \
+      abort ();                                                        \
+}
+
+BUILD_TEST (32, 4)
+BUILD_TEST (64, 2)
+
+int
+main (int argc, char **argv)
+{
+  test_f32 ();
+  test_f64 ();
+  return 0;
+}
+
+/* vmul_laneq_f32.
+   vmul_lane_f32.  */
+/* { dg-final { scan-assembler-times "fmul\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s, v\[0-9\]+\.s\\\[\[0-9\]+\\\]" 2 } } */
+
+/* vmulq_lane_f32.
+   vmulq_laneq_f32.  */
+/* { dg-final { scan-assembler-times "fmul\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.s\\\[\[0-9\]+\\\]" 2 } } */
+
+/* vmul_lane_f64.  */
+/* { dg-final { scan-assembler-times "fmul\\td\[0-9\]+, d\[0-9\]+, d\[0-9\]+" 1 } } */
+
+/* vmul_laneq_f64.
+   vmulq_lane_f64.
+   vmulq_laneq_f64.  */
+/* { dg-final { scan-assembler-times "fmul\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.d\\\[\[0-9\]+\\\]" 3 } } */
+
+/* { dg-final { cleanup-saved-temps } } */
+
diff --git a/gcc/testsuite/gcc.target/aarch64/mul_intrinsic_1.c b/gcc/testsuite/gcc.target/aarch64/mul_intrinsic_1.c

new file mode 100644 (file)

index 0000000..dabe10e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/mul_intrinsic_1.c
@@ -0,0 +1,83 @@
+/* { dg-do run } */
+/* { dg-options "-O3 --save-temps" } */
+
+#include <arm_neon.h>
+
+extern void abort (void);
+
+#define MAPs(size, xx) int##size##xx##_t
+#define MAPu(size, xx) uint##size##xx##_t
+
+
+#define TEST_VMUL(q, su, size, in1_lanes, in2_lanes)           \
+static void                                                    \
+test_vmulq_lane##q##_##su##size (MAP##su (size, ) * res,       \
+                                const MAP##su(size, ) *in1,    \
+                                const MAP##su(size, ) *in2)    \
+{                                                              \
+  MAP##su (size, x##in1_lanes) a = vld1q_##su##size (in1);     \
+  MAP##su (size, x##in2_lanes) b = vld1##q##_##su##size (in2); \
+  a = vmulq_lane##q##_##su##size (a, b, 1);                    \
+  vst1q_##su##size (res, a);                                   \
+}
+
+#define BUILD_VARS(width, n_lanes, n_half_lanes)               \
+TEST_VMUL (, s, width, n_lanes, n_half_lanes)                  \
+TEST_VMUL (q, s, width, n_lanes, n_lanes)                      \
+TEST_VMUL (, u, width, n_lanes, n_half_lanes)                  \
+TEST_VMUL (q, u, width, n_lanes, n_lanes)                      \
+
+BUILD_VARS (32, 4, 2)
+BUILD_VARS (16, 8, 4)
+
+#define POOL4 {0, 1, 2, 3}
+#define POOL8 {0, 1, 2, 3, 4, 5, 6, 7}
+#define EMPTY4 {0, 0, 0, 0}
+#define EMPTY8 {0, 0, 0, 0, 0, 0, 0, 0}
+
+#define BUILD_TEST(su, size, lanes)                            \
+static void                                                    \
+test_##su##size (void)                                         \
+{                                                              \
+  int i;                                                       \
+  MAP##su (size,) pool[lanes] = POOL##lanes;                   \
+  MAP##su (size,) res[lanes] = EMPTY##lanes;                   \
+  MAP##su (size,) res2[lanes] = EMPTY##lanes;                  \
+                                                               \
+  /* Forecfully avoid optimization.  */                                \
+  asm volatile ("" : : : "memory");                            \
+  test_vmulq_lane_##su##size (res, pool, pool);                        \
+  for (i = 0; i < lanes; i++)                                  \
+    if (res[i] != pool[i])                                     \
+      abort ();                                                        \
+                                                               \
+  /* Forecfully avoid optimization.  */                                \
+  asm volatile ("" : : : "memory");                            \
+  test_vmulq_laneq_##su##size (res2, pool, pool);              \
+  for (i = 0; i < lanes; i++)                                  \
+    if (res2[i] != pool[i])                                    \
+      abort ();                                                        \
+}
+
+#undef BUILD_VARS
+#define BUILD_VARS(size, lanes)                                        \
+BUILD_TEST (s, size, lanes)                                    \
+BUILD_TEST (u, size, lanes)
+
+BUILD_VARS (32, 4)
+BUILD_VARS (16, 8)
+
+int
+main (int argc, char **argv)
+{
+  test_s32 ();
+  test_u32 ();
+  test_s16 ();
+  test_u16 ();
+  return 0;
+}
+
+/* { dg-final { scan-assembler-times "mul\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.s\\\[\[0-9\]+\\\]" 4 } } */
+/* { dg-final { scan-assembler-times "mul\\tv\[0-9\]+\.8h, v\[0-9\]+\.8h, v\[0-9\]+\.h\\\[\[0-9\]+\\\]" 4 } } */
+/* { dg-final { cleanup-saved-temps } } */
+
author	jgreenhalgh <jgreenhalgh@138bc75d-0d04-0410-961f-82ee72b054a4>
	Mon, 16 Sep 2013 09:50:21 +0000 (09:50 +0000)
committer	jgreenhalgh <jgreenhalgh@138bc75d-0d04-0410-961f-82ee72b054a4>
	Mon, 16 Sep 2013 09:50:21 +0000 (09:50 +0000)
gcc/ChangeLog		patch \| blob \| history
gcc/config/aarch64/aarch64-simd.md		patch \| blob \| history
gcc/config/aarch64/arm_neon.h		patch \| blob \| history
gcc/config/aarch64/iterators.md		patch \| blob \| history
gcc/testsuite/ChangeLog		patch \| blob \| history
gcc/testsuite/gcc.target/aarch64/fmul_intrinsic_1.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/mul_intrinsic_1.c	[new file with mode: 0644]	patch \| blob