Add support for fma intrinsics for ARM.
authorMatthew Gretton-Dann <matthew.gretton-dann@arm.com>
Thu, 18 Oct 2012 12:02:01 +0000 (12:02 +0000)
committerRamana Radhakrishnan <ramana@gcc.gnu.org>
Thu, 18 Oct 2012 12:02:01 +0000 (12:02 +0000)
Correct dates in changelog from earlier commit.

2012-10-18  Matthew Gretton-Dann  <matthew.gretton-dann@arm.com>
    Ramana Radhakrishnan  <ramana.radhakrishnan@arm.com>

        * config/arm/arm.c (neon_builtin_data): Add vfma and vfms
       builtins.
        * config/arm/neon-docgen.ml (intrinsic_groups): Add
        fused-multiply-* groups.
        * config/neon-gen.ml (print_feature_test_start): New function.
        (print_feature_test_end): Likewise.
        (print_variant): Print feature test macros.
        * config/arm/neon-testgen.ml (emit_prologue): Allow different
        tests to require different effective targets.
        (effective_target): New function.
        (test_intrinsic): Specify correct effective targets.
        * gcc/config/arm/neon.md (fma<VCVTF:mode>4_intrinsic): New pattern.
        (fmsub<VCVTF:mode>4_intrinsic): Likewise.
        (neon_vfma<VCVFT:mode>): New expand.
        (neon_vfms<VCVFT:mode>): Likewise.
        * config/neon.ml (opcode): Add Vfma and Vfms.
        (features): Add Requires_feature.
        (ops): Add VFMA and VFMS intrinsics.
        * config/arm/arm_neon.h: Regenerate.
        * doc/arm-neon-intrinsics.texi: Likewise.

2012-10-18  Matthew Gretton-Dann  <matthew.gretton-dann@arm.com>

        * gcc.target/arm/neon/vfmaQf32.c: New testcase.
        * gcc.target/arm/neon/vfmaf32.c: Likewise.
        * gcc.target/arm/neon/vfmsQf32.c: Likewise.
        * gcc.target/arm/neon/vfmsf32.c: Likewise.

Co-Authored-By: Ramana Radhakrishnan <ramana.radhakrishnan@arm.com>
From-SVN: r192560

13 files changed:
gcc/ChangeLog
gcc/config/arm/arm.c
gcc/config/arm/arm_neon.h
gcc/config/arm/neon-docgen.ml
gcc/config/arm/neon-gen.ml
gcc/config/arm/neon-testgen.ml
gcc/config/arm/neon.md
gcc/config/arm/neon.ml
gcc/testsuite/ChangeLog
gcc/testsuite/gcc.target/arm/neon/vfmaQf32.c [new file with mode: 0644]
gcc/testsuite/gcc.target/arm/neon/vfmaf32.c [new file with mode: 0644]
gcc/testsuite/gcc.target/arm/neon/vfmsQf32.c [new file with mode: 0644]
gcc/testsuite/gcc.target/arm/neon/vfmsf32.c [new file with mode: 0644]

index 607205a..a03cf11 100644 (file)
@@ -1,3 +1,27 @@
+2012-10-18  Matthew Gretton-Dann  <matthew.gretton-dann@arm.com>
+           Ramana Radhakrishnan  <ramana.radhakrishnan@arm.com>
+
+        * config/arm/arm.c (neon_builtin_data): Add vfma and vfms
+       builtins.
+        * config/arm/neon-docgen.ml (intrinsic_groups): Add
+        fused-multiply-* groups.
+        * config/neon-gen.ml (print_feature_test_start): New function.
+        (print_feature_test_end): Likewise.
+        (print_variant): Print feature test macros.
+        * config/arm/neon-testgen.ml (emit_prologue): Allow different
+        tests to require different effective targets.
+        (effective_target): New function.
+        (test_intrinsic): Specify correct effective targets.
+        * gcc/config/arm/neon.md (fma<VCVTF:mode>4_intrinsic): New pattern.
+        (fmsub<VCVTF:mode>4_intrinsic): Likewise.
+        (neon_vfma<VCVFT:mode>): New expand.
+        (neon_vfms<VCVFT:mode>): Likewise.
+        * config/neon.ml (opcode): Add Vfma and Vfms.
+        (features): Add Requires_feature.
+        (ops): Add VFMA and VFMS intrinsics.
+        * config/arm/arm_neon.h: Regenerate.
+        * doc/arm-neon-intrinsics.texi: Likewise.
+
 2012-10-18  Richard Guenther  <rguenther@suse.de>
 
        * lto-streamer.h (enum LTO_tags): Add LTO_integer_cst.
@@ -11,7 +35,7 @@
        (streamer_pack_tree_bitfields): Call it.
        (streamer_write_integer_cst): Adjust.
 
-2012-10-17  Matthew Gretton-Dann  <matthew.gretton-dann@arm.com>
+2012-10-18  Matthew Gretton-Dann  <matthew.gretton-dann@arm.com>
            Ramana Radhakrishnan  <ramana.radhakrishnan@arm.com>
 
         * config.gcc: Add support for ARMv8 for arm*-*-* targets.
index 165dfe2..327ef22 100644 (file)
@@ -18726,6 +18726,8 @@ static neon_builtin_datum neon_builtin_data[] =
   VAR8 (BINOP, vmul, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf),
   VAR8 (TERNOP, vmla, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf),
   VAR3 (TERNOP, vmlal, v8qi, v4hi, v2si),
+  VAR2 (TERNOP, vfma, v2sf, v4sf),
+  VAR2 (TERNOP, vfms, v2sf, v4sf),
   VAR8 (TERNOP, vmls, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf),
   VAR3 (TERNOP, vmlsl, v8qi, v4hi, v2si),
   VAR4 (BINOP, vqdmulh, v4hi, v2si, v8hi, v4si),
index b486d57..8fec83f 100644 (file)
@@ -1350,6 +1350,38 @@ vqdmlsl_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
   return (int64x2_t)__builtin_neon_vqdmlslv2si (__a, __b, __c, 1);
 }
 
+#ifdef __ARM_FEATURE_FMA
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vfma_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)
+{
+  return (float32x2_t)__builtin_neon_vfmav2sf (__a, __b, __c, 3);
+}
+
+#endif
+#ifdef __ARM_FEATURE_FMA
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vfmaq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
+{
+  return (float32x4_t)__builtin_neon_vfmav4sf (__a, __b, __c, 3);
+}
+
+#endif
+#ifdef __ARM_FEATURE_FMA
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vfms_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)
+{
+  return (float32x2_t)__builtin_neon_vfmsv2sf (__a, __b, __c, 3);
+}
+
+#endif
+#ifdef __ARM_FEATURE_FMA
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vfmsq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
+{
+  return (float32x4_t)__builtin_neon_vfmsv4sf (__a, __b, __c, 3);
+}
+
+#endif
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 vsub_s8 (int8x8_t __a, int8x8_t __b)
 {
index 23e37b4..043b1e0 100644 (file)
@@ -103,6 +103,8 @@ let intrinsic_groups =
     "Multiplication", single_opcode Vmul;
     "Multiply-accumulate", single_opcode Vmla;
     "Multiply-subtract", single_opcode Vmls;
+    "Fused-multiply-accumulate", single_opcode Vfma;
+    "Fused-multiply-subtract", single_opcode Vfms;
     "Subtraction", single_opcode Vsub;
     "Comparison (equal-to)", single_opcode Vceq;
     "Comparison (greater-than-or-equal-to)", single_opcode Vcge;
index 29679aa..6c4e272 100644 (file)
@@ -286,6 +286,24 @@ let get_shuffle features =
     | _ -> None
   with Not_found -> None
 
+let print_feature_test_start features =
+  try
+    match List.find (fun feature ->
+                       match feature with Requires_feature _ -> true
+                                        | _ -> false)
+                     features with
+      Requires_feature feature -> 
+        Format.printf "#ifdef __ARM_FEATURE_%s@\n" feature
+    | _ -> assert false
+  with Not_found -> assert true
+
+let print_feature_test_end features =
+  let feature =
+    List.exists (function Requires_feature x -> true
+                                        |  _ -> false) features in
+  if feature then Format.printf "#endif@\n"
+
+
 let print_variant opcode features shape name (ctype, asmtype, elttype) =
   let bits = infoword_value elttype features in
   let modesuf = mode_suffix elttype shape in
@@ -302,7 +320,11 @@ let print_variant opcode features shape name (ctype, asmtype, elttype) =
        return ctype builtin in
   let body = pdecls @ rdecls @ stmts
   and fnname = (intrinsic_name name) ^ "_" ^ (string_of_elt elttype) in
-  print_function ctype fnname body
+  begin
+    print_feature_test_start features;
+    print_function ctype fnname body;
+    print_feature_test_end features;
+  end
 
 (* When this function processes the element types in the ops table, it rewrites
    them in a list of tuples (a,b,c):
index a69a539..4645f39 100644 (file)
@@ -46,13 +46,14 @@ let open_test_file dir name =
     failwith ("Could not create test source file " ^ name ^ ": " ^ str)
 
 (* Emit prologue code to a test source file.  *)
-let emit_prologue chan test_name =
+let emit_prologue chan test_name effective_target =
   Printf.fprintf chan "/* Test the `%s' ARM Neon intrinsic.  */\n" test_name;
   Printf.fprintf chan "/* This file was autogenerated by neon-testgen.  */\n\n";
   Printf.fprintf chan "/* { dg-do assemble } */\n";
-  Printf.fprintf chan "/* { dg-require-effective-target arm_neon_ok } */\n";
+  Printf.fprintf chan "/* { dg-require-effective-target %s_ok } */\n"
+                 effective_target;
   Printf.fprintf chan "/* { dg-options \"-save-temps -O0\" } */\n";
-  Printf.fprintf chan "/* { dg-add-options arm_neon } */\n";
+  Printf.fprintf chan "/* { dg-add-options %s } */\n" effective_target;
   Printf.fprintf chan "\n#include \"arm_neon.h\"\n\n";
   Printf.fprintf chan "void test_%s (void)\n{\n" test_name
 
@@ -156,6 +157,17 @@ let check_types tys =
                 then (Const :: flags, String.sub ty 6 ((String.length ty) - 6))
                 else (flags, ty)) tys'
 
+(* Work out what the effective target should be.  *)
+let effective_target features =
+  try
+    match List.find (fun feature ->
+                       match feature with Requires_feature _ -> true
+                                        | _ -> false)
+                     features with
+      Requires_feature "FMA" -> "arm_neonv2"
+    | _ -> assert false
+  with Not_found -> "arm_neon"
+
 (* Given an intrinsic shape, produce a regexp that will match
    the right-hand sides of instructions generated by an intrinsic of
    that shape.  *)
@@ -263,8 +275,10 @@ let test_intrinsic dir opcode features shape name munge elt_ty =
                          "!?\\(\\[ \t\\]+@\\[a-zA-Z0-9 \\]+\\)?\\n")
                          (analyze_all_shapes features shape analyze_shape)
   in
+  let effective_target = effective_target features
+  in
     (* Emit file and function prologues.  *)
-    emit_prologue chan test_name;
+    emit_prologue chan test_name effective_target;
     (* Emit local variable declarations.  *)
     emit_automatics chan c_types features;
     Printf.fprintf chan "\n";
index b89d538..92e03b0 100644 (file)
 )
 
 ;; Fused multiply-accumulate
+;; We define each insn twice here:
+;;    1: with flag_unsafe_math_optimizations for the widening multiply phase
+;;       to be able to use when converting to FMA.
+;;    2: without flag_unsafe_math_optimizations for the intrinsics to use.
 (define_insn "fma<VCVTF:mode>4"
   [(set (match_operand:VCVTF 0 "register_operand" "=w")
         (fma:VCVTF (match_operand:VCVTF 1 "register_operand" "w")
                      (const_string "neon_fp_vmla_qqq")))]
 )
 
+(define_insn "fma<VCVTF:mode>4_intrinsic"
+  [(set (match_operand:VCVTF 0 "register_operand" "=w")
+        (fma:VCVTF (match_operand:VCVTF 1 "register_operand" "w")
+                (match_operand:VCVTF 2 "register_operand" "w")
+                (match_operand:VCVTF 3 "register_operand" "0")))]
+  "TARGET_NEON && TARGET_FMA"
+  "vfma%?.<V_if_elem>\\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  [(set (attr "neon_type")
+       (if_then_else (match_test "<Is_d_reg>")
+                     (const_string "neon_fp_vmla_ddd")
+                     (const_string "neon_fp_vmla_qqq")))]
+)
+
 (define_insn "*fmsub<VCVTF:mode>4"
   [(set (match_operand:VCVTF 0 "register_operand" "=w")
         (fma:VCVTF (neg:VCVTF (match_operand:VCVTF 1 "register_operand" "w"))
                      (const_string "neon_fp_vmla_qqq")))]
 )
 
+(define_insn "fmsub<VCVTF:mode>4_intrinsic"
+  [(set (match_operand:VCVTF 0 "register_operand" "=w")
+        (fma:VCVTF (neg:VCVTF (match_operand:VCVTF 1 "register_operand" "w"))
+                  (match_operand:VCVTF 2 "register_operand" "w")
+                  (match_operand:VCVTF 3 "register_operand" "0")))]
+  "TARGET_NEON && TARGET_FMA"
+  "vfms%?.<V_if_elem>\\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  [(set (attr "neon_type")
+       (if_then_else (match_test "<Is_d_reg>")
+                     (const_string "neon_fp_vmla_ddd")
+                     (const_string "neon_fp_vmla_qqq")))]
+)
+
 (define_insn "ior<mode>3"
   [(set (match_operand:VDQ 0 "s_register_operand" "=w,w")
        (ior:VDQ (match_operand:VDQ 1 "s_register_operand" "w,0")
   DONE;
 })
 
+(define_expand "neon_vfma<VCVTF:mode>"
+  [(match_operand:VCVTF 0 "s_register_operand")
+   (match_operand:VCVTF 1 "s_register_operand")
+   (match_operand:VCVTF 2 "s_register_operand")
+   (match_operand:VCVTF 3 "s_register_operand")
+   (match_operand:SI 4 "immediate_operand")]
+  "TARGET_NEON && TARGET_FMA"
+{
+  emit_insn (gen_fma<mode>4_intrinsic (operands[0], operands[2], operands[3],
+                                      operands[1]));
+  DONE;
+})
+
+(define_expand "neon_vfms<VCVTF:mode>"
+  [(match_operand:VCVTF 0 "s_register_operand")
+   (match_operand:VCVTF 1 "s_register_operand")
+   (match_operand:VCVTF 2 "s_register_operand")
+   (match_operand:VCVTF 3 "s_register_operand")
+   (match_operand:SI 4 "immediate_operand")]
+  "TARGET_NEON && TARGET_FMA"
+{
+  emit_insn (gen_fmsub<mode>4_intrinsic (operands[0], operands[2], operands[3],
+                                        operands[1]));
+  DONE;
+})
+
 ; Used for intrinsics when flag_unsafe_math_optimizations is false.
 
 (define_insn "neon_vmla<mode>_unspec"
index 56869c0..101f8f6 100644 (file)
@@ -102,6 +102,8 @@ type opcode =
   | Vmul
   | Vmla
   | Vmls
+  | Vfma
+  | Vfms
   | Vsub
   | Vceq
   | Vcge
@@ -275,6 +277,8 @@ type features =
   | Const_valuator of (int -> int)
   | Fixed_vector_reg
   | Fixed_core_reg
+    (* Mark that the intrinsic requires __ARM_FEATURE_string to be defined.  *)
+  | Requires_feature of string
 
 exception MixedMode of elts * elts
 
@@ -802,6 +806,12 @@ let ops =
     Vmls, [], Long, "vmlsl", elts_same_io, su_8_32;
     Vmls, [Saturating; Doubling], Long, "vqdmlsl", elts_same_io, [S16; S32];
 
+    (* Fused-multiply-accumulate. *)
+    Vfma, [Requires_feature "FMA"], All (3, Dreg), "vfma", elts_same_io, [F32];
+    Vfma, [Requires_feature "FMA"], All (3, Qreg), "vfmaQ", elts_same_io, [F32];
+    Vfms, [Requires_feature "FMA"], All (3, Dreg), "vfms", elts_same_io, [F32];
+    Vfms, [Requires_feature "FMA"], All (3, Qreg), "vfmsQ", elts_same_io, [F32];
+
     (* Subtraction.  *)
     Vsub, [], All (3, Dreg), "vsub", sign_invar_2, F32 :: su_8_32;
     Vsub, [No_op], All (3, Dreg), "vsub", sign_invar_2,  [S64; U64];
index 45b6624..9fb6e29 100644 (file)
@@ -1,4 +1,11 @@
-2012-10-17  Matthew Gretton-Dann  <matthew.gretton-dann@arm.com>
+2012-10-18  Matthew Gretton-Dann  <matthew.gretton-dann@arm.com>
+
+        * gcc.target/arm/neon/vfmaQf32.c: New testcase.
+        * gcc.target/arm/neon/vfmaf32.c: Likewise.
+        * gcc.target/arm/neon/vfmsQf32.c: Likewise.
+        * gcc.target/arm/neon/vfmsf32.c: Likewise.
+
+2012-10-18  Matthew Gretton-Dann  <matthew.gretton-dann@arm.com>
 
         * gcc.target/arm/ftest-armv8a-arm.c: New testcase.
         * gcc.target/arm/ftest-armv8a-thumb.c: Likewise.
diff --git a/gcc/testsuite/gcc.target/arm/neon/vfmaQf32.c b/gcc/testsuite/gcc.target/arm/neon/vfmaQf32.c
new file mode 100644 (file)
index 0000000..d400163
--- /dev/null
@@ -0,0 +1,22 @@
+/* Test the `vfmaQf32' ARM Neon intrinsic.  */
+/* This file was autogenerated by neon-testgen.  */
+
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_neonv2_ok } */
+/* { dg-options "-save-temps -O0" } */
+/* { dg-add-options arm_neonv2 } */
+
+#include "arm_neon.h"
+
+void test_vfmaQf32 (void)
+{
+  float32x4_t out_float32x4_t;
+  float32x4_t arg0_float32x4_t;
+  float32x4_t arg1_float32x4_t;
+  float32x4_t arg2_float32x4_t;
+
+  out_float32x4_t = vfmaq_f32 (arg0_float32x4_t, arg1_float32x4_t, arg2_float32x4_t);
+}
+
+/* { dg-final { scan-assembler "vfma\.f32\[    \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[         \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon/vfmaf32.c b/gcc/testsuite/gcc.target/arm/neon/vfmaf32.c
new file mode 100644 (file)
index 0000000..988328d
--- /dev/null
@@ -0,0 +1,22 @@
+/* Test the `vfmaf32' ARM Neon intrinsic.  */
+/* This file was autogenerated by neon-testgen.  */
+
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_neonv2_ok } */
+/* { dg-options "-save-temps -O0" } */
+/* { dg-add-options arm_neonv2 } */
+
+#include "arm_neon.h"
+
+void test_vfmaf32 (void)
+{
+  float32x2_t out_float32x2_t;
+  float32x2_t arg0_float32x2_t;
+  float32x2_t arg1_float32x2_t;
+  float32x2_t arg2_float32x2_t;
+
+  out_float32x2_t = vfma_f32 (arg0_float32x2_t, arg1_float32x2_t, arg2_float32x2_t);
+}
+
+/* { dg-final { scan-assembler "vfma\.f32\[    \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[         \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon/vfmsQf32.c b/gcc/testsuite/gcc.target/arm/neon/vfmsQf32.c
new file mode 100644 (file)
index 0000000..247a8ed
--- /dev/null
@@ -0,0 +1,22 @@
+/* Test the `vfmsQf32' ARM Neon intrinsic.  */
+/* This file was autogenerated by neon-testgen.  */
+
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_neonv2_ok } */
+/* { dg-options "-save-temps -O0" } */
+/* { dg-add-options arm_neonv2 } */
+
+#include "arm_neon.h"
+
+void test_vfmsQf32 (void)
+{
+  float32x4_t out_float32x4_t;
+  float32x4_t arg0_float32x4_t;
+  float32x4_t arg1_float32x4_t;
+  float32x4_t arg2_float32x4_t;
+
+  out_float32x4_t = vfmsq_f32 (arg0_float32x4_t, arg1_float32x4_t, arg2_float32x4_t);
+}
+
+/* { dg-final { scan-assembler "vfms\.f32\[    \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[         \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon/vfmsf32.c b/gcc/testsuite/gcc.target/arm/neon/vfmsf32.c
new file mode 100644 (file)
index 0000000..7f9e857
--- /dev/null
@@ -0,0 +1,22 @@
+/* Test the `vfmsf32' ARM Neon intrinsic.  */
+/* This file was autogenerated by neon-testgen.  */
+
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_neonv2_ok } */
+/* { dg-options "-save-temps -O0" } */
+/* { dg-add-options arm_neonv2 } */
+
+#include "arm_neon.h"
+
+void test_vfmsf32 (void)
+{
+  float32x2_t out_float32x2_t;
+  float32x2_t arg0_float32x2_t;
+  float32x2_t arg1_float32x2_t;
+  float32x2_t arg2_float32x2_t;
+
+  out_float32x2_t = vfms_f32 (arg0_float32x2_t, arg1_float32x2_t, arg2_float32x2_t);
+}
+
+/* { dg-final { scan-assembler "vfms\.f32\[    \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[         \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */