+2010-10-14 Michael Meissner <meissner@linux.vnet.ibm.com>
+
+ * doc/md.texi (Standard Names): Add fma@var{m}4 documentation.
+
+ * doc/rtl.texi (RTX_TERNARY): Document FMA is ternary. Add
+ SIGN_EXTRACT and ZERO_EXTRACT which were missing.
+ (Standard names): Document fma.
+
+ * doc/cpp.texi (Common Predefined Macros): Document __FP_FAST_FMA,
+ __FP_FAST_FMAF, __FP_FAST_FMAL.
+
+ * builtins.c (expand_builtin_mathfn_ternary): New function for
+ expanding ternary math functions, like fma.
+ (expand_builtin): Call it for the fma builtins.
+
+ * simplify-rtx.c (simplify_ternary_operation): Don't simplify FMA
+ ops at present.
+
+ * tree-vect-stmts.c (vectorizable_call): Allow 3 argument
+ vectorizable functions to support vectorizing fma.
+
+ * config/rs6000/rs6000.c (rs6000_builtin_vectorized_function):
+ Handle fma builtins.
+
+ * config/rs6000/vsx.md (UNSPEC_VSX_MADD): Delete.
+ (UNSPEC_VSX_MSUB): Ditto.
+ (UNSPEC_VSX_NMADD): Ditto.
+ (UNSPEC_VSX_NMSUB): Ditto.
+ (vsx_fmadd<mode>4*): Rewrite to use FMA rtl in some cases instead
+ of UNSPEC. Renumber combiner patterns.
+ (vsx_fmsub<mode>4*): Ditto.
+ (vsx_fnmadd<mode>4*): Ditto.
+ (vsx_fnmsub<mode>4*): Ditto.
+
+ * config/rs6000/altivec.md (UNSPEC_VNMSUBFP): Delete.
+ (altivec_vmaddfp): Rewrite to use FMA rtl if no fused
+ multiply/add. Rename combiner pattern, and add TARGET_FUSED_MADD
+ test.
+ (altivec_vmaddfp_1): Ditto.
+ (altivec_vmaddfp_2): Ditto.
+ (altivec_mulv4sf3): Ditto.
+ (altivec_vnmsubfp): Ditto.
+ (altivec_vnmsubfp_1): Ditto.
+ (altivec_vnmsubfp_2): Ditto.
+ (altivec_vnmsubfp_3): Delete.
+
+ * config/rs6000/rs6000.md (UNSPEC_FMA): Delete.
+ (fmasf4): Rewrite to always use FMA rtl. Add combiners to
+ generate the four fused multiply/add ops. Combine power, powerpc
+ ops.
+ (fmasf4_fpr): Ditto.
+ (fmssf4_fpr): Ditto.
+ (fnmasf4_fpr): Ditto.
+ (fnmssf4_fpr): Ditto.
+ (fmadf4): Ditto.
+ (fmadf4_fpr): Ditto.
+ (fmsdf4_fpr): Ditto.
+ (fnmadf4_fpr): Ditto.
+ (fnmsdf4_fpr): Ditto.
+
+ * optabs.h (OTI_fma): Add fma optab.
+ (fma_optab): Ditto.
+
+ * genopinit.c (optabs): Set fma optab.
+
+ * rtl.def (FMA): Add FMA rtl.
+
+ * tree.h (mode_has_fma): New function to return if MODE supports a
+ fast multiply and add instruction.
+ * builtins.c (mode_has_fma): Ditto.
+
2010-10-15 Jan Hubicka <jh@suse.cz>
* lto-streamer-out.c (write_symbol): Use pointer set of seen
static rtx expand_builtin_mathfn (tree, rtx, rtx);
static rtx expand_builtin_mathfn_2 (tree, rtx, rtx);
static rtx expand_builtin_mathfn_3 (tree, rtx, rtx);
+static rtx expand_builtin_mathfn_ternary (tree, rtx, rtx);
static rtx expand_builtin_interclass_mathfn (tree, rtx);
static rtx expand_builtin_sincos (tree);
static rtx expand_builtin_cexpi (tree, rtx);
return target;
}
+/* Expand a call to the builtin ternary math functions (fma).
+ Return NULL_RTX if a normal call should be emitted rather than expanding the
+ function in-line. EXP is the expression that is a call to the builtin
+ function and must take exactly three REAL_TYPE arguments.
+ The incoming TARGET is not used as the destination: once the fma insn
+ is known to exist for the result mode, a fresh pseudo register of that
+ mode is allocated and the result is computed into it.
+ SUBTARGET is passed as the suggested target when expanding the first
+ of EXP's operands. */
+
+static rtx
+expand_builtin_mathfn_ternary (tree exp, rtx target, rtx subtarget)
+{
+ optab builtin_optab;
+ rtx op0, op1, op2, insns;
+ tree fndecl = get_callee_fndecl (exp);
+ tree arg0, arg1, arg2;
+ enum machine_mode mode;
+
+ if (!validate_arglist (exp, REAL_TYPE, REAL_TYPE, REAL_TYPE, VOID_TYPE))
+ return NULL_RTX;
+
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ arg1 = CALL_EXPR_ARG (exp, 1);
+ arg2 = CALL_EXPR_ARG (exp, 2);
+
+ switch (DECL_FUNCTION_CODE (fndecl))
+ {
+ CASE_FLT_FN (BUILT_IN_FMA):
+ builtin_optab = fma_optab; break;
+ default:
+ gcc_unreachable ();
+ }
+
+ /* Determine the mode of the result; the result register itself is only
+ created below, after the availability check. */
+ mode = TYPE_MODE (TREE_TYPE (exp));
+
+ /* Before working hard, check whether the instruction is available. */
+ if (optab_handler (builtin_optab, mode) == CODE_FOR_nothing)
+ return NULL_RTX;
+
+ target = gen_reg_rtx (mode);
+
+ /* Always stabilize the argument list, storing the saved expressions back
+ into EXP so the library-call fallback below sees them. */
+ CALL_EXPR_ARG (exp, 0) = arg0 = builtin_save_expr (arg0);
+ CALL_EXPR_ARG (exp, 1) = arg1 = builtin_save_expr (arg1);
+ CALL_EXPR_ARG (exp, 2) = arg2 = builtin_save_expr (arg2);
+
+ op0 = expand_expr (arg0, subtarget, VOIDmode, EXPAND_NORMAL);
+ op1 = expand_normal (arg1);
+ op2 = expand_normal (arg2);
+
+ start_sequence ();
+
+ /* Compute into TARGET.
+ Set TARGET to wherever the result comes back. */
+ target = expand_ternary_op (mode, builtin_optab, op0, op1, op2,
+ target, 0);
+
+ /* If we were unable to expand via the builtin, stop the sequence
+ (without outputting the insns) and call to the library function
+ with the stabilized argument list. TARGET is null on this path, so
+ expand_call is handed a null target; NOTE(review): the
+ `target == const0_rtx' ignore flag is presumably always false here. */
+ if (target == 0)
+ {
+ end_sequence ();
+ return expand_call (exp, target, target == const0_rtx);
+ }
+
+ /* Output the entire sequence. */
+ insns = get_insns ();
+ end_sequence ();
+ emit_insn (insns);
+
+ return target;
+}
+
/* Expand a call to the builtin sin and cos math functions.
Return NULL_RTX if a normal call should be emitted rather than expanding the
function in-line. EXP is the expression that is a call to the builtin
return target;
break;
+ CASE_FLT_FN (BUILT_IN_FMA):
+ target = expand_builtin_mathfn_ternary (exp, target, subtarget);
+ if (target)
+ return target;
+ break;
+
CASE_FLT_FN (BUILT_IN_ILOGB):
if (! flag_unsafe_math_optimizations)
break;
return false;
}
+/* Return true if a fused multiply/add (FMA) instruction is available for
+ MODE, i.e. if fma_optab has a handler other than CODE_FOR_nothing for
+ that mode. */
+
+bool
+mode_has_fma (enum machine_mode mode)
+{
+ return optab_handler (fma_optab, mode) != CODE_FOR_nothing;
+}
+2010-10-14 Michael Meissner <meissner@linux.vnet.ibm.com>
+
+ * c-cppbuiltin.c (builtin_define_float_constants): Emit
+ __FP_FAST_FMA, __FP_FAST_FMAF, and __FP_FAST_FMAL if the machine
+ has the appropriate fma builtins.
+ (c_cpp_builtins): Adjust call to builtin_define_float_constants.
+
2010-10-14 Iain Sandoe <iains@gcc.gnu.org>
- merge from FSF apple 'trunk' branch.
+ merge from FSF apple 'trunk' branch.
2006 Fariborz Jahanian <fjahanian@apple.com>
-
+
Radars 4436866, 4505126, 4506903, 4517826
* c-common.c (c_common_resword): Define @property and its attributes.
* c-common.h: Define property attribute enum entries.
(objc_add_property_variable): Likewise.
(objc_build_getter_call): Likewise.
(objc_build_setter_call) Likewise.
-
+
2010-10-13 Iain Sandoe <iains@gcc.gnu.org>
- merge from FSF apple 'trunk' branch.
+ merge from FSF apple 'trunk' branch.
2006-04-26 Fariborz Jahanian <fjahanian@apple.com>
Radar 3803157 (method attributes)
* c-common.c (handle_deprecated_attribute): Recognize
objc methods as valid declarations.
* c-common.h: Declare objc_method_decl ().
- * stub-objc.c (objc_method_decl): New stub.
+ * stub-objc.c (objc_method_decl): New stub.
2010-10-08 Joseph Myers <joseph@codesourcery.com>
static void builtin_define_float_constants (const char *,
const char *,
const char *,
+ const char *,
tree);
/* Define NAME with value TYPE size_unit. */
builtin_define_float_constants (const char *name_prefix,
const char *fp_suffix,
const char *fp_cast,
+ const char *fma_suffix,
tree type)
{
/* Used to convert radix-based values to base 10 values in several cases.
NaN has quiet NaNs. */
sprintf (name, "__%s_HAS_QUIET_NAN__", name_prefix);
builtin_define_with_int_value (name, MODE_HAS_NANS (TYPE_MODE (type)));
+
+ /* Note whether we have fast FMA. */
+ if (mode_has_fma (TYPE_MODE (type)))
+ {
+ sprintf (name, "__FP_FAST_FMA%s", fma_suffix);
+ builtin_define_with_int_value (name, 1);
+ }
}
/* Define __DECx__ constants for TYPE using NAME_PREFIX and SUFFIX. */
builtin_define_with_int_value ("__DEC_EVAL_METHOD__",
TARGET_DEC_EVAL_METHOD);
- builtin_define_float_constants ("FLT", "F", "%s", float_type_node);
+ builtin_define_float_constants ("FLT", "F", "%s", "F", float_type_node);
/* Cast the double precision constants. This is needed when single
precision constants are specified or when pragma FLOAT_CONST_DECIMAL64
is used. The correct result is computed by the compiler when using
macros that include a cast. */
- builtin_define_float_constants ("DBL", "L", "((double)%s)", double_type_node);
- builtin_define_float_constants ("LDBL", "L", "%s", long_double_type_node);
+ builtin_define_float_constants ("DBL", "L", "((double)%s)", "",
+ double_type_node);
+ builtin_define_float_constants ("LDBL", "L", "%s", "L",
+ long_double_type_node);
/* For decfloat.h. */
builtin_define_decimal_float_constants ("DEC32", "DF", dfloat32_type_node);
(UNSPEC_VUPKLS_V4SF 325)
(UNSPEC_VUPKHU_V4SF 326)
(UNSPEC_VUPKLU_V4SF 327)
- (UNSPEC_VNMSUBFP 328)
])
(define_constants
"vsel %0,%3,%2,%1"
[(set_attr "type" "vecperm")])
-;; Fused multiply add
-(define_insn "altivec_vmaddfp"
+;; Fused multiply add. By default expand the FMA into (plus (mult)) to help
+;; loop unrolling. Don't do negate multiply ops, because of complications with
+;; honoring signed zero and fused-madd.
+
+(define_expand "altivec_vmaddfp"
+ [(set (match_operand:V4SF 0 "register_operand" "")
+ (plus:V4SF (mult:V4SF (match_operand:V4SF 1 "register_operand" "")
+ (match_operand:V4SF 2 "register_operand" ""))
+ (match_operand:V4SF 3 "register_operand" "")))]
+ "VECTOR_UNIT_ALTIVEC_P (V4SFmode)"
+{
+ if (!TARGET_FUSED_MADD)
+ {
+ emit_insn (gen_altivec_vmaddfp_2 (operands[0], operands[1], operands[2],
+ operands[3]));
+ DONE;
+ }
+})
+
+(define_insn "*altivec_vmaddfp_1"
[(set (match_operand:V4SF 0 "register_operand" "=v")
(plus:V4SF (mult:V4SF (match_operand:V4SF 1 "register_operand" "v")
(match_operand:V4SF 2 "register_operand" "v"))
(match_operand:V4SF 3 "register_operand" "v")))]
+ "VECTOR_UNIT_ALTIVEC_P (V4SFmode) && TARGET_FUSED_MADD"
+ "vmaddfp %0,%1,%2,%3"
+ [(set_attr "type" "vecfloat")])
+
+(define_insn "altivec_vmaddfp_2"
+ [(set (match_operand:V4SF 0 "register_operand" "=v")
+ (fma:V4SF (match_operand:V4SF 1 "register_operand" "v")
+ (match_operand:V4SF 2 "register_operand" "v")
+ (match_operand:V4SF 3 "register_operand" "v")))]
"VECTOR_UNIT_ALTIVEC_P (V4SFmode)"
"vmaddfp %0,%1,%2,%3"
[(set_attr "type" "vecfloat")])
[(use (match_operand:V4SF 0 "register_operand" ""))
(use (match_operand:V4SF 1 "register_operand" ""))
(use (match_operand:V4SF 2 "register_operand" ""))]
- "VECTOR_UNIT_ALTIVEC_P (V4SFmode) && TARGET_FUSED_MADD"
+ "VECTOR_UNIT_ALTIVEC_P (V4SFmode)"
"
{
rtx neg0;
}")
;; Fused multiply subtract
-(define_expand "altivec_vnmsubfp"
- [(match_operand:V4SF 0 "register_operand" "")
- (match_operand:V4SF 1 "register_operand" "")
- (match_operand:V4SF 2 "register_operand" "")
- (match_operand:V4SF 3 "register_operand" "")]
+(define_insn "altivec_vnmsubfp"
+ [(set (match_operand:V4SF 0 "register_operand" "=v")
+ (neg:V4SF
+ (fma:V4SF (match_operand:V4SF 1 "register_operand" "v")
+ (match_operand:V4SF 2 "register_operand" "v")
+ (neg:V4SF
+ (match_operand:V4SF 3 "register_operand" "v")))))]
"VECTOR_UNIT_ALTIVEC_P (V4SFmode)"
-{
- if (TARGET_FUSED_MADD && HONOR_SIGNED_ZEROS (SFmode))
- {
- emit_insn (gen_altivec_vnmsubfp_1 (operands[0], operands[1],
- operands[2], operands[3]));
- DONE;
- }
- else if (TARGET_FUSED_MADD && !HONOR_SIGNED_ZEROS (DFmode))
- {
- emit_insn (gen_altivec_vnmsubfp_2 (operands[0], operands[1],
- operands[2], operands[3]));
- DONE;
- }
- else
- {
- emit_insn (gen_altivec_vnmsubfp_3 (operands[0], operands[1],
- operands[2], operands[3]));
- DONE;
- }
-})
+ "vnmsubfp %0,%1,%2,%3"
+ [(set_attr "type" "vecfloat")])
-(define_insn "altivec_vnmsubfp_1"
+(define_insn "*altivec_vnmsubfp_1"
[(set (match_operand:V4SF 0 "register_operand" "=v")
(neg:V4SF
(minus:V4SF
"vnmsubfp %0,%1,%2,%3"
[(set_attr "type" "vecfloat")])
-(define_insn "altivec_vnmsubfp_2"
+(define_insn "*altivec_vnmsubfp_2"
[(set (match_operand:V4SF 0 "register_operand" "=v")
(minus:V4SF
(match_operand:V4SF 3 "register_operand" "v")
"vnmsubfp %0,%1,%2,%3"
[(set_attr "type" "vecfloat")])
-(define_insn "altivec_vnmsubfp_3"
- [(set (match_operand:V4SF 0 "register_operand" "=v")
- (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "v")
- (match_operand:V4SF 2 "register_operand" "v")
- (match_operand:V4SF 3 "register_operand" "v")]
- UNSPEC_VNMSUBFP))]
- "VECTOR_UNIT_ALTIVEC_P (V4SFmode)"
- "vnmsubfp %0,%1,%2,%3"
- [(set_attr "type" "vecfloat")])
-
(define_insn "altivec_vmsumu<VI_char>m"
[(set (match_operand:V4SI 0 "register_operand" "=v")
(unspec:V4SI [(match_operand:VIshort 1 "register_operand" "v")
if (VECTOR_UNIT_ALTIVEC_P (V4SFmode))
return rs6000_builtin_decls[ALTIVEC_BUILTIN_VRFIM];
break;
+ case BUILT_IN_FMA:
+ if (VECTOR_UNIT_VSX_P (V2DFmode)
+ && out_mode == DFmode && out_n == 2
+ && in_mode == DFmode && in_n == 2)
+ return rs6000_builtin_decls[VSX_BUILTIN_XVMADDDP];
+ break;
+ case BUILT_IN_FMAF:
+ if (VECTOR_UNIT_VSX_P (V4SFmode)
+ && out_mode == SFmode && out_n == 4
+ && in_mode == SFmode && in_n == 4)
+ return rs6000_builtin_decls[VSX_BUILTIN_XVMADDSP];
+ else if (VECTOR_UNIT_ALTIVEC_P (V4SFmode)
+ && out_mode == SFmode && out_n == 4
+ && in_mode == SFmode && in_n == 4)
+ return rs6000_builtin_decls[ALTIVEC_BUILTIN_VMADDFP];
+ break;
case BUILT_IN_TRUNC:
if (VECTOR_UNIT_VSX_P (V2DFmode)
&& out_mode == DFmode && out_n == 2
"fres %0,%1"
[(set_attr "type" "fp")])
+; builtin fmaf support
+; If the user explicitly uses the fma builtin, don't convert this to
+; (plus (mult op1 op2) op3)
+(define_expand "fmasf4"
+ [(set (match_operand:SF 0 "gpc_reg_operand" "")
+ (fma:SF (match_operand:SF 1 "gpc_reg_operand" "")
+ (match_operand:SF 2 "gpc_reg_operand" "")
+ (match_operand:SF 3 "gpc_reg_operand" "")))]
+ "TARGET_HARD_FLOAT && TARGET_FPRS && TARGET_SINGLE_FLOAT"
+ "")
+
+(define_insn "fmasf4_fpr"
+ [(set (match_operand:SF 0 "gpc_reg_operand" "=f")
+ (fma:SF (match_operand:SF 1 "gpc_reg_operand" "f")
+ (match_operand:SF 2 "gpc_reg_operand" "f")
+ (match_operand:SF 3 "gpc_reg_operand" "f")))]
+ "TARGET_HARD_FLOAT && TARGET_FPRS && TARGET_SINGLE_FLOAT"
+ "*
+{
+ return ((TARGET_POWERPC)
+ ? \"fmadds %0,%1,%2,%3\"
+ : \"{fma|fmadd} %0,%1,%2,%3\");
+}"
+ [(set_attr "type" "fp")
+ (set_attr "fp_type" "fp_maddsub_s")])
+
+(define_insn "*fmssf4_fpr"
+ [(set (match_operand:SF 0 "gpc_reg_operand" "=f")
+ (fma:SF (match_operand:SF 1 "gpc_reg_operand" "f")
+ (match_operand:SF 2 "gpc_reg_operand" "f")
+ (neg:SF (match_operand:SF 3 "gpc_reg_operand" "f"))))]
+ "TARGET_HARD_FLOAT && TARGET_FPRS && TARGET_SINGLE_FLOAT"
+ "*
+{
+ return ((TARGET_POWERPC)
+ ? \"fmsubs %0,%1,%2,%3\"
+ : \"{fms|fmsub} %0,%1,%2,%3\");
+}"
+ [(set_attr "type" "fp")
+ (set_attr "fp_type" "fp_maddsub_s")])
+
+(define_insn "*fnmasf4_fpr"
+ [(set (match_operand:SF 0 "gpc_reg_operand" "=f")
+ (neg:SF (fma:SF (match_operand:SF 1 "gpc_reg_operand" "f")
+ (match_operand:SF 2 "gpc_reg_operand" "f")
+ (match_operand:SF 3 "gpc_reg_operand" "f"))))]
+ "TARGET_HARD_FLOAT && TARGET_FPRS && TARGET_SINGLE_FLOAT"
+ "*
+{
+ return ((TARGET_POWERPC)
+ ? \"fnmadds %0,%1,%2,%3\"
+ : \"{fnma|fnmadd} %0,%1,%2,%3\");
+}"
+ [(set_attr "type" "fp")
+ (set_attr "fp_type" "fp_maddsub_s")])
+
+(define_insn "*fnmssf4_fpr"
+ [(set (match_operand:SF 0 "gpc_reg_operand" "=f")
+ (neg:SF (fma:SF (match_operand:SF 1 "gpc_reg_operand" "f")
+ (match_operand:SF 2 "gpc_reg_operand" "f")
+ (neg:SF (match_operand:SF 3 "gpc_reg_operand" "f")))))]
+ "TARGET_HARD_FLOAT && TARGET_FPRS && TARGET_SINGLE_FLOAT"
+ "*
+{
+ return ((TARGET_POWERPC)
+ ? \"fnmsubs %0,%1,%2,%3\"
+ : \"{fnms|fnmsub} %0,%1,%2,%3\");
+}"
+ [(set_attr "type" "fp")
+ (set_attr "fp_type" "fp_maddsub_s")])
+
+; Fused multiply/add ops created by the combiner
(define_insn "*fmaddsf4_powerpc"
[(set (match_operand:SF 0 "gpc_reg_operand" "=f")
(plus:SF (mult:SF (match_operand:SF 1 "gpc_reg_operand" "%f")
"fmadds %0,%1,%2,%3"
[(set_attr "type" "fp")
(set_attr "fp_type" "fp_maddsub_s")])
-
+
(define_insn "*fmaddsf4_power"
[(set (match_operand:SF 0 "gpc_reg_operand" "=f")
(plus:SF (mult:SF (match_operand:SF 1 "gpc_reg_operand" "%f")
"frsqrte %0,%1"
[(set_attr "type" "fp")])
+; builtin fma support
+; If the user explicitly uses the fma builtin, don't convert this to
+; (plus (mult op1 op2) op3)
+(define_expand "fmadf4"
+ [(set (match_operand:DF 0 "gpc_reg_operand" "")
+ (fma:DF (match_operand:DF 1 "gpc_reg_operand" "")
+ (match_operand:DF 2 "gpc_reg_operand" "")
+ (match_operand:DF 3 "gpc_reg_operand" "")))]
+ "TARGET_HARD_FLOAT && TARGET_FPRS && TARGET_DOUBLE_FLOAT"
+ "")
+
+; Double-precision fma (op1 * op2 + op3) on the scalar floating point
+; registers, used when no vector unit handles DFmode. The fp_type is the
+; double-precision multiply/add class, matching the DFmode operands.
+(define_insn "fmadf4_fpr"
+ [(set (match_operand:DF 0 "gpc_reg_operand" "=f")
+ (fma:DF (match_operand:DF 1 "gpc_reg_operand" "f")
+ (match_operand:DF 2 "gpc_reg_operand" "f")
+ (match_operand:DF 3 "gpc_reg_operand" "f")))]
+ "TARGET_HARD_FLOAT && TARGET_FPRS && TARGET_DOUBLE_FLOAT
+ && VECTOR_UNIT_NONE_P (DFmode)"
+ "{fma|fmadd} %0,%1,%2,%3"
+ [(set_attr "type" "fp")
+ (set_attr "fp_type" "fp_maddsub_d")])
+
+; Double-precision fused multiply/subtract: fma with a negated addend
+; (op1 * op2 - op3). The fp_type is the double-precision multiply/add
+; class, matching the DFmode operands.
+(define_insn "*fmsdf4_fpr"
+ [(set (match_operand:DF 0 "gpc_reg_operand" "=f")
+ (fma:DF (match_operand:DF 1 "gpc_reg_operand" "f")
+ (match_operand:DF 2 "gpc_reg_operand" "f")
+ (neg:DF (match_operand:DF 3 "gpc_reg_operand" "f"))))]
+ "TARGET_HARD_FLOAT && TARGET_FPRS && TARGET_DOUBLE_FLOAT
+ && VECTOR_UNIT_NONE_P (DFmode)"
+ "{fms|fmsub} %0,%1,%2,%3"
+ [(set_attr "type" "fp")
+ (set_attr "fp_type" "fp_maddsub_d")])
+
+; Double-precision negated fused multiply/add: -(op1 * op2 + op3).
+; The fp_type is the double-precision multiply/add class, matching the
+; DFmode operands.
+(define_insn "*fnmadf4_fpr"
+ [(set (match_operand:DF 0 "gpc_reg_operand" "=f")
+ (neg:DF (fma:DF (match_operand:DF 1 "gpc_reg_operand" "f")
+ (match_operand:DF 2 "gpc_reg_operand" "f")
+ (match_operand:DF 3 "gpc_reg_operand" "f"))))]
+ "TARGET_HARD_FLOAT && TARGET_FPRS && TARGET_DOUBLE_FLOAT
+ && VECTOR_UNIT_NONE_P (DFmode)"
+ "{fnma|fnmadd} %0,%1,%2,%3"
+ [(set_attr "type" "fp")
+ (set_attr "fp_type" "fp_maddsub_d")])
+
+; Double-precision negated fused multiply/subtract: -(op1 * op2 - op3).
+; The fp_type is the double-precision multiply/add class, matching the
+; DFmode operands.
+(define_insn "*fnmsdf4_fpr"
+ [(set (match_operand:DF 0 "gpc_reg_operand" "=f")
+ (neg:DF (fma:DF (match_operand:DF 1 "gpc_reg_operand" "f")
+ (match_operand:DF 2 "gpc_reg_operand" "f")
+ (neg:DF (match_operand:DF 3 "gpc_reg_operand" "f")))))]
+ "TARGET_HARD_FLOAT && TARGET_FPRS && TARGET_DOUBLE_FLOAT
+ && VECTOR_UNIT_NONE_P (DFmode)"
+ "{fnms|fnmsub} %0,%1,%2,%3"
+ [(set_attr "type" "fp")
+ (set_attr "fp_type" "fp_maddsub_d")])
+
+; Fused multiply/add ops created by the combiner
(define_insn "*fmadddf4_fpr"
[(set (match_operand:DF 0 "gpc_reg_operand" "=d")
(plus:DF (mult:DF (match_operand:DF 1 "gpc_reg_operand" "%d")
(UNSPEC_VSX_CVUXDSP 507)
(UNSPEC_VSX_CVSPSXDS 508)
(UNSPEC_VSX_CVSPUXDS 509)
- (UNSPEC_VSX_MADD 510)
- (UNSPEC_VSX_MSUB 511)
- (UNSPEC_VSX_NMADD 512)
- (UNSPEC_VSX_NMSUB 513)
- ;; 514 deleted
+ ;; 510-514 deleted
(UNSPEC_VSX_TDIV 515)
(UNSPEC_VSX_TSQRT 516)
(UNSPEC_VSX_XXPERMDI 517)
;; does not check -mfused-madd to allow users to use these ops when they know
;; they want the fused multiply/add.
+;; Fused multiply add. By default expand the FMA into (plus (mult)) to help
+;; loop unrolling. Don't do negate multiply ops, because of complications with
+;; honoring signed zero and fused-madd.
+
(define_expand "vsx_fmadd<mode>4"
[(set (match_operand:VSX_B 0 "vsx_register_operand" "")
(plus:VSX_B
- (mult:VSX_B
- (match_operand:VSX_B 1 "vsx_register_operand" "")
- (match_operand:VSX_B 2 "vsx_register_operand" ""))
+ (mult:VSX_B (match_operand:VSX_B 1 "vsx_register_operand" "")
+ (match_operand:VSX_B 2 "vsx_register_operand" ""))
(match_operand:VSX_B 3 "vsx_register_operand" "")))]
"VECTOR_UNIT_VSX_P (<MODE>mode)"
{
if (!TARGET_FUSED_MADD)
{
- emit_insn (gen_vsx_fmadd<mode>4_2 (operands[0], operands[1], operands[2],
- operands[3]));
+ emit_insn (gen_vsx_fmadd<mode>4_2 (operands[0], operands[1],
+ operands[2], operands[3]));
DONE;
}
})
(define_insn "vsx_fmadd<mode>4_2"
[(set (match_operand:VSX_B 0 "vsx_register_operand" "=<VSr>,<VSr>,?wa,?wa")
- (unspec:VSX_B [(match_operand:VSX_B 1 "vsx_register_operand" "%<VSr>,<VSr>,wa,wa")
- (match_operand:VSX_B 2 "vsx_register_operand" "<VSr>,0,wa,0")
- (match_operand:VSX_B 3 "vsx_register_operand" "0,<VSr>,0,wa")]
- UNSPEC_VSX_MADD))]
+ (fma:VSX_B (match_operand:VSX_B 1 "vsx_register_operand" "%<VSr>,<VSr>,wa,wa")
+ (match_operand:VSX_B 2 "vsx_register_operand" "<VSr>,0,wa,0")
+ (match_operand:VSX_B 3 "vsx_register_operand" "0,<VSr>,0,wa")))]
"VECTOR_UNIT_VSX_P (<MODE>mode)"
"@
x<VSv>madda<VSs> %x0,%x1,%x2
(define_expand "vsx_fmsub<mode>4"
[(set (match_operand:VSX_B 0 "vsx_register_operand" "")
(minus:VSX_B
- (mult:VSX_B
- (match_operand:VSX_B 1 "vsx_register_operand" "")
- (match_operand:VSX_B 2 "vsx_register_operand" ""))
+ (mult:VSX_B (match_operand:VSX_B 1 "vsx_register_operand" "")
+ (match_operand:VSX_B 2 "vsx_register_operand" ""))
(match_operand:VSX_B 3 "vsx_register_operand" "")))]
"VECTOR_UNIT_VSX_P (<MODE>mode)"
{
if (!TARGET_FUSED_MADD)
{
- emit_insn (gen_vsx_fmsub<mode>4_2 (operands[0], operands[1], operands[2],
- operands[3]));
+ emit_insn (gen_vsx_fmsub<mode>4_2 (operands[0], operands[1],
+ operands[2], operands[3]));
DONE;
}
})
(define_insn "vsx_fmsub<mode>4_2"
[(set (match_operand:VSX_B 0 "vsx_register_operand" "=<VSr>,<VSr>,?wa,?wa")
- (unspec:VSX_B [(match_operand:VSX_B 1 "vsx_register_operand" "%<VSr>,<VSr>,wa,wa")
- (match_operand:VSX_B 2 "vsx_register_operand" "<VSr>,0,wa,0")
- (match_operand:VSX_B 3 "vsx_register_operand" "0,<VSr>,0,wa")]
- UNSPEC_VSX_MSUB))]
+ (fma:VSX_B (match_operand:VSX_B 1 "vsx_register_operand" "%<VSr>,<VSr>,wa,wa")
+ (match_operand:VSX_B 2 "vsx_register_operand" "<VSr>,0,wa,0")
+ (neg:VSX_B
+ (match_operand:VSX_B 3 "vsx_register_operand" "0,<VSr>,0,wa"))))]
"VECTOR_UNIT_VSX_P (<MODE>mode)"
"@
x<VSv>msuba<VSs> %x0,%x1,%x2
[(set_attr "type" "<VStype_mul>")
(set_attr "fp_type" "<VSfptype_mul>")])
-(define_expand "vsx_fnmadd<mode>4"
- [(match_operand:VSX_B 0 "vsx_register_operand" "")
- (match_operand:VSX_B 1 "vsx_register_operand" "")
- (match_operand:VSX_B 2 "vsx_register_operand" "")
- (match_operand:VSX_B 3 "vsx_register_operand" "")]
+(define_insn "vsx_fnmadd<mode>4"
+ [(set (match_operand:VSX_B 0 "vsx_register_operand" "=<VSr>,<VSr>,?wa,?wa")
+ (neg:VSX_B
+ (fma:VSX_B
+ (match_operand:VSX_B 1 "vsx_register_operand" "<VSr>,<VSr>,wa,wa")
+ (match_operand:VSX_B 2 "vsx_register_operand" "<VSr>,0,wa,0")
+ (match_operand:VSX_B 3 "vsx_register_operand" "0,<VSr>,0,wa"))))]
"VECTOR_UNIT_VSX_P (<MODE>mode)"
-{
- if (TARGET_FUSED_MADD && HONOR_SIGNED_ZEROS (DFmode))
- {
- emit_insn (gen_vsx_fnmadd<mode>4_1 (operands[0], operands[1],
- operands[2], operands[3]));
- DONE;
- }
- else if (TARGET_FUSED_MADD && !HONOR_SIGNED_ZEROS (DFmode))
- {
- emit_insn (gen_vsx_fnmadd<mode>4_2 (operands[0], operands[1],
- operands[2], operands[3]));
- DONE;
- }
- else
- {
- emit_insn (gen_vsx_fnmadd<mode>4_3 (operands[0], operands[1],
- operands[2], operands[3]));
- DONE;
- }
-})
+ "@
+ x<VSv>nmadda<VSs> %x0,%x1,%x2
+ x<VSv>nmaddm<VSs> %x0,%x1,%x3
+ x<VSv>nmadda<VSs> %x0,%x1,%x2
+ x<VSv>nmaddm<VSs> %x0,%x1,%x3"
+ [(set_attr "type" "<VStype_mul>")
+ (set_attr "fp_type" "<VSfptype_mul>")])
(define_insn "vsx_fnmadd<mode>4_1"
[(set (match_operand:VSX_B 0 "vsx_register_operand" "=<VSr>,<VSr>,?wa,?wa")
[(set_attr "type" "<VStype_mul>")
(set_attr "fp_type" "<VSfptype_mul>")])
-(define_insn "vsx_fnmadd<mode>4_3"
+(define_insn "vsx_fnmsub<mode>4"
[(set (match_operand:VSX_B 0 "vsx_register_operand" "=<VSr>,<VSr>,?wa,?wa")
- (unspec:VSX_B [(match_operand:VSX_B 1 "vsx_register_operand" "<VSr>,<VSr>,wa,wa")
- (match_operand:VSX_B 2 "vsx_register_operand" "<VSr>,0,wa,0")
- (match_operand:VSX_B 3 "vsx_register_operand" "0,<VSr>,0,wa")]
- UNSPEC_VSX_NMADD))]
+ (neg:VSX_B
+ (fma:VSX_B (match_operand:VSX_B 1 "vsx_register_operand" "%<VSr>,<VSr>,wa,wa")
+ (match_operand:VSX_B 2 "vsx_register_operand" "<VSr>,0,wa,0")
+ (neg:VSX_B
+ (match_operand:VSX_B 3 "vsx_register_operand" "0,<VSr>,0,wa")))))]
"VECTOR_UNIT_VSX_P (<MODE>mode)"
"@
- x<VSv>nmadda<VSs> %x0,%x1,%x2
- x<VSv>nmaddm<VSs> %x0,%x1,%x3
- x<VSv>nmadda<VSs> %x0,%x1,%x2
- x<VSv>nmaddm<VSs> %x0,%x1,%x3"
+ x<VSv>nmsuba<VSs> %x0,%x1,%x2
+ x<VSv>nmsubm<VSs> %x0,%x1,%x3
+ x<VSv>nmsuba<VSs> %x0,%x1,%x2
+ x<VSv>nmsubm<VSs> %x0,%x1,%x3"
[(set_attr "type" "<VStype_mul>")
(set_attr "fp_type" "<VSfptype_mul>")])
-(define_expand "vsx_fnmsub<mode>4"
- [(match_operand:VSX_B 0 "vsx_register_operand" "")
- (match_operand:VSX_B 1 "vsx_register_operand" "")
- (match_operand:VSX_B 2 "vsx_register_operand" "")
- (match_operand:VSX_B 3 "vsx_register_operand" "")]
- "VECTOR_UNIT_VSX_P (<MODE>mode)"
-{
- if (TARGET_FUSED_MADD && HONOR_SIGNED_ZEROS (DFmode))
- {
- emit_insn (gen_vsx_fnmsub<mode>4_1 (operands[0], operands[1],
- operands[2], operands[3]));
- DONE;
- }
- else if (TARGET_FUSED_MADD && !HONOR_SIGNED_ZEROS (DFmode))
- {
- emit_insn (gen_vsx_fnmsub<mode>4_2 (operands[0], operands[1],
- operands[2], operands[3]));
- DONE;
- }
- else
- {
- emit_insn (gen_vsx_fnmsub<mode>4_3 (operands[0], operands[1],
- operands[2], operands[3]));
- DONE;
- }
-})
-
(define_insn "vsx_fnmsub<mode>4_1"
[(set (match_operand:VSX_B 0 "vsx_register_operand" "=<VSr>,<VSr>,?wa,?wa")
(neg:VSX_B
[(set_attr "type" "<VStype_mul>")
(set_attr "fp_type" "<VSfptype_mul>")])
-(define_insn "vsx_fnmsub<mode>4_3"
- [(set (match_operand:VSX_B 0 "vsx_register_operand" "=<VSr>,<VSr>,?wa,?wa")
- (unspec:VSX_B [(match_operand:VSX_B 1 "vsx_register_operand" "%<VSr>,<VSr>,wa,wa")
- (match_operand:VSX_B 2 "vsx_register_operand" "<VSr>,0,wa,0")
- (match_operand:VSX_B 3 "vsx_register_operand" "0,<VSr>,0,wa")]
- UNSPEC_VSX_NMSUB))]
- "VECTOR_UNIT_VSX_P (<MODE>mode)"
- "@
- x<VSv>nmsuba<VSs> %x0,%x1,%x2
- x<VSv>nmsubm<VSs> %x0,%x1,%x3
- x<VSv>nmsuba<VSs> %x0,%x1,%x2
- x<VSv>nmsubm<VSs> %x0,%x1,%x3"
- [(set_attr "type" "<VStype_mul>")
- (set_attr "fp_type" "<VSfptype_mul>")])
-
;; Vector conditional expressions (no scalar version for these instructions)
(define_insn "vsx_eq<mode>"
[(set (match_operand:VSX_F 0 "vsx_register_operand" "=<VSr>,?wa")
This macro is defined when the compiler is emitting Dwarf2 CFI directives
to the assembler. When this is defined, it is possible to emit those same
directives in inline assembly.
+
+@item __FP_FAST_FMA
+@itemx __FP_FAST_FMAF
+@itemx __FP_FAST_FMAL
+These macros are defined with value 1 if the backend supports the
+@code{fma}, @code{fmaf}, and @code{fmal} builtin functions, so that
+the include file @file{math.h} can define the macros
+@code{FP_FAST_FMA}, @code{FP_FAST_FMAF}, and @code{FP_FAST_FMAL}
+for compatibility with the 1999 C standard.
@end table
@node System-specific Predefined Macros
@itemx @samp{and@var{m}3}, @samp{ior@var{m}3}, @samp{xor@var{m}3}
Similar, for other arithmetic operations.
+@cindex @code{fma@var{m}4} instruction pattern
+@item @samp{fma@var{m}4}
+Multiply operand 2 and operand 1, then add operand 3, storing the
+result in operand 0. All operands must have mode @var{m}. This
+pattern is used to implement the @code{fma}, @code{fmaf}, and
+@code{fmal} builtin functions from the ISO C99 standard. The
+@code{fma} operation may produce different results than doing the
+multiply followed by the add if the machine does not perform a
+rounding step between the operations.
+
@cindex @code{min@var{m}3} instruction pattern
@cindex @code{max@var{m}3} instruction pattern
@item @samp{smin@var{m}3}, @samp{smax@var{m}3}
@item RTX_TERNARY
An RTX code for other three input operations. Currently only
-@code{IF_THEN_ELSE} and @code{VEC_MERGE}.
+@code{IF_THEN_ELSE}, @code{VEC_MERGE}, @code{SIGN_EXTRACT},
+@code{ZERO_EXTRACT}, and @code{FMA}.
@item RTX_INSN
An RTX code for an entire instruction: @code{INSN}, @code{JUMP_INSN}, and
For unsigned widening multiplication, use the same idiom, but with
@code{zero_extend} instead of @code{sign_extend}.
+@findex fma
+@item (fma:@var{m} @var{x} @var{y} @var{z})
+Represents the @code{fma}, @code{fmaf}, and @code{fmal} builtin
+functions that multiply @var{x} and @var{y} and then add @var{z}
+without doing an intermediate rounding step.
+
@findex div
@findex ss_div
@cindex division
"set_optab_handler (sqrt_optab, $A, CODE_FOR_$(sqrt$a2$))",
"set_optab_handler (floor_optab, $A, CODE_FOR_$(floor$a2$))",
"set_convert_optab_handler (lfloor_optab, $B, $A, CODE_FOR_$(lfloor$F$a$I$b2$))",
+ "set_optab_handler (fma_optab, $A, CODE_FOR_$(fma$a4$))",
"set_optab_handler (ceil_optab, $A, CODE_FOR_$(ceil$a2$))",
"set_convert_optab_handler (lceil_optab, $B, $A, CODE_FOR_$(lceil$F$a$I$b2$))",
"set_optab_handler (round_optab, $A, CODE_FOR_$(round$a2$))",
OTI_pow,
/* Arc tangent of y/x */
OTI_atan2,
+ /* Floating multiply/add */
+ OTI_fma,
/* Move instruction. */
OTI_mov,
#define umax_optab (&optab_table[OTI_umax])
#define pow_optab (&optab_table[OTI_pow])
#define atan2_optab (&optab_table[OTI_atan2])
+#define fma_optab (&optab_table[OTI_fma])
#define mov_optab (&optab_table[OTI_mov])
#define movstrict_optab (&optab_table[OTI_movstrict])
/* Unsigned saturating truncate. */
DEF_RTL_EXPR(US_TRUNCATE, "us_truncate", "e", RTX_UNARY)
+/* Floating point multiply/add combined instruction. */
+DEF_RTL_EXPR(FMA, "fma", "eee", RTX_TERNARY)
+
/* Information about the variable and its location. */
/* Changed 'te' to 'tei'; the 'i' field is for recording
initialization status of variables. */
switch (code)
{
+ /* At present, don't simplify fused multiply and add ops, because we need
+ to make sure there are no intermediate rounding steps used, and that
+ we get the right sign if negative 0 would be returned. */
+ case FMA:
+ return NULL_RTX;
+
case SIGN_EXTRACT:
case ZERO_EXTRACT:
if (CONST_INT_P (op0)
+2010-10-14 Michael Meissner <meissner@linux.vnet.ibm.com>
+
+ * gcc.target/powerpc/ppc-fma-1.c: New tests for powerpc FMA
+ builtin combiner patterns.
+ * gcc.target/powerpc/ppc-fma-2.c: Ditto.
+ * gcc.target/powerpc/ppc-fma-3.c: Ditto.
+ * gcc.target/powerpc/ppc-fma-4.c: Ditto.
+ * gcc.target/powerpc/ppc-fma-5.c: Ditto.
+ * gcc.target/powerpc/ppc-fma-6.c: Ditto.
+
2010-10-15 Richard Guenther <rguenther@suse.de>
* g++.dg/lto/20101015-1_0.C: New testcase.
--- /dev/null
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+/* { dg-options "-O3 -ftree-vectorize -mcpu=power7 -ffast-math" } */
+/* { dg-final { scan-assembler-times "xvmadd" 4 } } */
+/* { dg-final { scan-assembler-times "xsmadd" 2 } } */
+/* { dg-final { scan-assembler-times "fmadds" 2 } } */
+/* { dg-final { scan-assembler-times "xvmsub" 2 } } */
+/* { dg-final { scan-assembler-times "xsmsub" 1 } } */
+/* { dg-final { scan-assembler-times "fmsubs" 1 } } */
+/* { dg-final { scan-assembler-times "xvnmadd" 2 } } */
+/* { dg-final { scan-assembler-times "xsnmadd" 1 } } */
+/* { dg-final { scan-assembler-times "fnmadds" 1 } } */
+/* { dg-final { scan-assembler-times "xvnmsub" 2 } } */
+/* { dg-final { scan-assembler-times "xsnmsub" 1 } } */
+/* { dg-final { scan-assembler-times "fnmsubs" 1 } } */
+
+/* All functions should generate an appropriate (a * b) + c instruction
+ since -mfused-madd is on by default. */
+
+double
+builtin_fma (double b, double c, double d)
+{
+ return __builtin_fma (b, c, d); /* xsmadd{a,m}dp */
+}
+
+double
+builtin_fms (double b, double c, double d)
+{
+ return __builtin_fma (b, c, -d); /* xsmsub{a,m}dp */
+}
+
+double
+builtin_fnma (double b, double c, double d)
+{
+ return - __builtin_fma (b, c, d); /* xsnmadd{a,m}dp */
+}
+
+double
+builtin_fnms (double b, double c, double d)
+{
+ return - __builtin_fma (b, c, -d); /* xsnmsub{a,m}dp */
+}
+
+float
+builtin_fmaf (float b, float c, float d)
+{
+ return __builtin_fmaf (b, c, d); /* fmadds */
+}
+
+float
+builtin_fmsf (float b, float c, float d)
+{
+ return __builtin_fmaf (b, c, -d); /* fmsubs */
+}
+
+float
+builtin_fnmaf (float b, float c, float d)
+{
+ return - __builtin_fmaf (b, c, d); /* fnmadds */
+}
+
+float
+builtin_fnmsf (float b, float c, float d)
+{
+ return - __builtin_fmaf (b, c, -d); /* fnmsubs */
+}
+
+double
+normal_fma (double b, double c, double d)
+{
+ return (b * c) + d; /* xsmadd{a,m}dp */
+}
+
+float
+normal_fmaf (float b, float c, float d)
+{
+ return (b * c) + d; /* fmadds */
+}
+
+#ifndef SIZE
+#define SIZE 1024
+#endif
+
+double vda[SIZE] __attribute__((__aligned__(32)));
+double vdb[SIZE] __attribute__((__aligned__(32)));
+double vdc[SIZE] __attribute__((__aligned__(32)));
+double vdd[SIZE] __attribute__((__aligned__(32)));
+
+float vfa[SIZE] __attribute__((__aligned__(32)));
+float vfb[SIZE] __attribute__((__aligned__(32)));
+float vfc[SIZE] __attribute__((__aligned__(32)));
+float vfd[SIZE] __attribute__((__aligned__(32)));
+
+void
+vector_fma (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ vda[i] = __builtin_fma (vdb[i], vdc[i], vdd[i]); /* xvmadd{a,m}dp */
+}
+
+void
+vector_fms (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ vda[i] = __builtin_fma (vdb[i], vdc[i], -vdd[i]); /* xvmsub{a,m}dp */
+}
+
+void
+vector_fnma (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ vda[i] = - __builtin_fma (vdb[i], vdc[i], vdd[i]); /* xvnmadd{a,m}dp */
+}
+
+void
+vector_fnms (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ vda[i] = - __builtin_fma (vdb[i], vdc[i], -vdd[i]); /* xvnmsub{a,m}dp */
+}
+
+void
+vector_fmaf (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ vfa[i] = __builtin_fmaf (vfb[i], vfc[i], vfd[i]); /* xvmadd{a,m}sp */
+}
+
+void
+vector_fmsf (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ vfa[i] = __builtin_fmaf (vfb[i], vfc[i], -vfd[i]); /* xvmsub{a,m}sp */
+}
+
+void
+vector_fnmaf (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ vfa[i] = - __builtin_fmaf (vfb[i], vfc[i], vfd[i]); /* xvnmadd{a,m}sp */
+}
+
+void
+vector_fnmsf (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ vfa[i] = - __builtin_fmaf (vfb[i], vfc[i], -vfd[i]); /* xvnmsub{a,m}sp */
+}
+
+void
+vnormal_fma (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ vda[i] = (vdb[i] * vdc[i]) + vdd[i]; /* xvmadd{a,m}dp */
+}
+
+void
+vnormal_fmaf (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ vfa[i] = (vfb[i] * vfc[i]) + vfd[i]; /* xvmadd{a,m}sp */
+}
--- /dev/null
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+/* { dg-options "-O3 -ftree-vectorize -mcpu=power7 -ffast-math -mno-fused-madd" } */
+/* { dg-final { scan-assembler-times "xvmadd" 2 } } */
+/* { dg-final { scan-assembler-times "xsmadd" 1 } } */
+/* { dg-final { scan-assembler-times "fmadds" 1 } } */
+/* { dg-final { scan-assembler-times "xvmsub" 2 } } */
+/* { dg-final { scan-assembler-times "xsmsub" 1 } } */
+/* { dg-final { scan-assembler-times "fmsubs" 1 } } */
+/* { dg-final { scan-assembler-times "xvnmadd" 2 } } */
+/* { dg-final { scan-assembler-times "xsnmadd" 1 } } */
+/* { dg-final { scan-assembler-times "fnmadds" 1 } } */
+/* { dg-final { scan-assembler-times "xvnmsub" 2 } } */
+/* { dg-final { scan-assembler-times "xsnmsub" 1 } } */
+/* { dg-final { scan-assembler-times "fnmsubs" 1 } } */
+
+/* Only the functions calling the builtin should generate an appropriate
+ (a * b) + c instruction. */
+
+double
+builtin_fma (double b, double c, double d)
+{
+ return __builtin_fma (b, c, d); /* xsmadd{a,m}dp */
+}
+
+double
+builtin_fms (double b, double c, double d)
+{
+ return __builtin_fma (b, c, -d); /* xsmsub{a,m}dp */
+}
+
+double
+builtin_fnma (double b, double c, double d)
+{
+ return - __builtin_fma (b, c, d); /* xsnmadd{a,m}dp */
+}
+
+double
+builtin_fnms (double b, double c, double d)
+{
+ return - __builtin_fma (b, c, -d); /* xsnmsub{a,m}dp */
+}
+
+float
+builtin_fmaf (float b, float c, float d)
+{
+ return __builtin_fmaf (b, c, d); /* fmadds */
+}
+
+float
+builtin_fmsf (float b, float c, float d)
+{
+ return __builtin_fmaf (b, c, -d); /* fmsubs */
+}
+
+float
+builtin_fnmaf (float b, float c, float d)
+{
+ return - __builtin_fmaf (b, c, d); /* fnmadds */
+}
+
+float
+builtin_fnmsf (float b, float c, float d)
+{
+ return - __builtin_fmaf (b, c, -d); /* fnmsubs */
+}
+
+double
+normal_fma (double b, double c, double d)
+{
+ return (b * c) + d; /* fmul/fadd */
+}
+
+float
+normal_fmaf (float b, float c, float d)
+{
+ return (b * c) + d; /* fmuls/fadds */
+}
+
+#ifndef SIZE
+#define SIZE 1024
+#endif
+
+double vda[SIZE] __attribute__((__aligned__(32)));
+double vdb[SIZE] __attribute__((__aligned__(32)));
+double vdc[SIZE] __attribute__((__aligned__(32)));
+double vdd[SIZE] __attribute__((__aligned__(32)));
+
+float vfa[SIZE] __attribute__((__aligned__(32)));
+float vfb[SIZE] __attribute__((__aligned__(32)));
+float vfc[SIZE] __attribute__((__aligned__(32)));
+float vfd[SIZE] __attribute__((__aligned__(32)));
+
+void
+vector_fma (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ vda[i] = __builtin_fma (vdb[i], vdc[i], vdd[i]); /* xvmadd{a,m}dp */
+}
+
+void
+vector_fms (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ vda[i] = __builtin_fma (vdb[i], vdc[i], -vdd[i]); /* xvmsub{a,m}dp */
+}
+
+void
+vector_fnma (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ vda[i] = - __builtin_fma (vdb[i], vdc[i], vdd[i]); /* xvnmadd{a,m}dp */
+}
+
+void
+vector_fnms (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ vda[i] = - __builtin_fma (vdb[i], vdc[i], -vdd[i]); /* xvnmsub{a,m}dp */
+}
+
+void
+vector_fmaf (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ vfa[i] = __builtin_fmaf (vfb[i], vfc[i], vfd[i]); /* xvmadd{a,m}sp */
+}
+
+void
+vector_fmsf (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ vfa[i] = __builtin_fmaf (vfb[i], vfc[i], -vfd[i]); /* xvmsub{a,m}sp */
+}
+
+void
+vector_fnmaf (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ vfa[i] = - __builtin_fmaf (vfb[i], vfc[i], vfd[i]); /* xvnmadd{a,m}sp */
+}
+
+void
+vector_fnmsf (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ vfa[i] = - __builtin_fmaf (vfb[i], vfc[i], -vfd[i]); /* xvnmsub{a,m}sp */
+}
+
+void
+vnormal_fma (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ vda[i] = (vdb[i] * vdc[i]) + vdd[i]; /* xvmuldp/xvadddp */
+}
+
+void
+vnormal_fmaf (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ vfa[i] = (vfb[i] * vfc[i]) + vfd[i]; /* xvmulsp/xvaddsp */
+}
--- /dev/null
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
+/* { dg-require-effective-target powerpc_altivec_ok } */
+/* { dg-options "-O3 -ftree-vectorize -mcpu=power6 -maltivec -ffast-math" } */
+/* { dg-final { scan-assembler-times "vmaddfp" 2 } } */
+/* { dg-final { scan-assembler-times "fmadd " 2 } } */
+/* { dg-final { scan-assembler-times "fmadds" 2 } } */
+/* { dg-final { scan-assembler-times "fmsub " 1 } } */
+/* { dg-final { scan-assembler-times "fmsubs" 1 } } */
+/* { dg-final { scan-assembler-times "fnmadd " 1 } } */
+/* { dg-final { scan-assembler-times "fnmadds" 1 } } */
+/* { dg-final { scan-assembler-times "fnmsub " 1 } } */
+/* { dg-final { scan-assembler-times "fnmsubs" 1 } } */
+
+/* All functions should generate an appropriate (a * b) + c instruction
+ since -mfused-madd is on by default. */
+
+double
+builtin_fma (double b, double c, double d)
+{
+ return __builtin_fma (b, c, d); /* fmadd */
+}
+
+double
+builtin_fms (double b, double c, double d)
+{
+ return __builtin_fma (b, c, -d); /* fmsub */
+}
+
+double
+builtin_fnma (double b, double c, double d)
+{
+ return - __builtin_fma (b, c, d); /* fnmadd */
+}
+
+double
+builtin_fnms (double b, double c, double d)
+{
+ return - __builtin_fma (b, c, -d); /* fnmsub */
+}
+
+float
+builtin_fmaf (float b, float c, float d)
+{
+ return __builtin_fmaf (b, c, d); /* fmadds */
+}
+
+float
+builtin_fmsf (float b, float c, float d)
+{
+ return __builtin_fmaf (b, c, -d); /* fmsubs */
+}
+
+float
+builtin_fnmaf (float b, float c, float d)
+{
+ return - __builtin_fmaf (b, c, d); /* fnmadds */
+}
+
+float
+builtin_fnmsf (float b, float c, float d)
+{
+ return - __builtin_fmaf (b, c, -d); /* fnmsubs */
+}
+
+double
+normal_fma (double b, double c, double d)
+{
+ return (b * c) + d; /* fmadd */
+}
+
+float
+normal_fmaf (float b, float c, float d)
+{
+ return (b * c) + d; /* fmadds */
+}
+
+#ifndef SIZE
+#define SIZE 1024
+#endif
+
+float vfa[SIZE] __attribute__((__aligned__(32)));
+float vfb[SIZE] __attribute__((__aligned__(32)));
+float vfc[SIZE] __attribute__((__aligned__(32)));
+float vfd[SIZE] __attribute__((__aligned__(32)));
+
+void
+vector_fmaf (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ vfa[i] = __builtin_fmaf (vfb[i], vfc[i], vfd[i]); /* vmaddfp */
+}
+
+void
+vnormal_fmaf (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ vfa[i] = (vfb[i] * vfc[i]) + vfd[i]; /* vmaddfp */
+}
--- /dev/null
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
+/* { dg-require-effective-target powerpc_altivec_ok } */
+/* { dg-options "-O3 -ftree-vectorize -mcpu=power6 -maltivec -ffast-math -mno-fused-madd" } */
+/* { dg-final { scan-assembler-times "vmaddfp" 1 } } */
+/* { dg-final { scan-assembler-times "fmadd " 1 } } */
+/* { dg-final { scan-assembler-times "fmadds" 1 } } */
+/* { dg-final { scan-assembler-times "fmsub " 1 } } */
+/* { dg-final { scan-assembler-times "fmsubs" 1 } } */
+/* { dg-final { scan-assembler-times "fnmadd " 1 } } */
+/* { dg-final { scan-assembler-times "fnmadds" 1 } } */
+/* { dg-final { scan-assembler-times "fnmsub " 1 } } */
+/* { dg-final { scan-assembler-times "fnmsubs" 1 } } */
+
+/* Only the functions calling the builtin should generate an appropriate
+ (a * b) + c instruction. */
+
+double
+builtin_fma (double b, double c, double d)
+{
+ return __builtin_fma (b, c, d); /* fmadd */
+}
+
+double
+builtin_fms (double b, double c, double d)
+{
+ return __builtin_fma (b, c, -d); /* fmsub */
+}
+
+double
+builtin_fnma (double b, double c, double d)
+{
+ return - __builtin_fma (b, c, d); /* fnmadd */
+}
+
+double
+builtin_fnms (double b, double c, double d)
+{
+ return - __builtin_fma (b, c, -d); /* fnmsub */
+}
+
+float
+builtin_fmaf (float b, float c, float d)
+{
+ return __builtin_fmaf (b, c, d); /* fmadds */
+}
+
+float
+builtin_fmsf (float b, float c, float d)
+{
+ return __builtin_fmaf (b, c, -d); /* fmsubs */
+}
+
+float
+builtin_fnmaf (float b, float c, float d)
+{
+ return - __builtin_fmaf (b, c, d); /* fnmadds */
+}
+
+float
+builtin_fnmsf (float b, float c, float d)
+{
+ return - __builtin_fmaf (b, c, -d); /* fnmsubs */
+}
+
+double
+normal_fma (double b, double c, double d)
+{
+ return (b * c) + d; /* fmul/fadd */
+}
+
+float
+normal_fmaf (float b, float c, float d)
+{
+ return (b * c) + d; /* fmuls/fadds */
+}
+
+#ifndef SIZE
+#define SIZE 1024
+#endif
+
+float vfa[SIZE] __attribute__((__aligned__(32)));
+float vfb[SIZE] __attribute__((__aligned__(32)));
+float vfc[SIZE] __attribute__((__aligned__(32)));
+float vfd[SIZE] __attribute__((__aligned__(32)));
+
+void
+vector_fmaf (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ vfa[i] = __builtin_fmaf (vfb[i], vfc[i], vfd[i]); /* vmaddfp */
+}
--- /dev/null
+/* { dg-do run { target { powerpc*-*-* } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
+/* { dg-options "-O2 -mcpu=power5 -std=c99" } */
+
+#ifndef __FP_FAST_FMA
+#error "__FP_FAST_FMA should be defined"
+#endif
+
+#ifndef __FP_FAST_FMAF
+#error "__FP_FAST_FMAF should be defined"
+#endif
+
+double d_a = 2.0, d_b = 3.0, d_c = 4.0;
+float f_a = 2.0f, f_b = 3.0f, f_c = 4.0f;
+
+int
+main (void)
+{
+ if (__builtin_fma (d_a, d_b, d_c) != (2.0 * 3.0) + 4.0)
+ __builtin_abort ();
+
+ if (__builtin_fmaf (f_a, f_b, f_c) != (2.0f * 3.0f) + 4.0f)
+ __builtin_abort ();
+
+ return 0;
+}
--- /dev/null
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
+/* { dg-require-effective-target ilp32 } */
+/* { dg-options "-O2 -mcpu=power5 -std=c99 -msoft-float" } */
+/* { dg-final { scan-assembler-not "fmadd" } } */
+/* { dg-final { scan-assembler-not "xsmadd" } } */
+
+/* Test whether -msoft-float turns off the macros math.h uses for
+ FP_FAST_FMA{,F,L}. */
+#ifdef __FP_FAST_FMA
+#error "__FP_FAST_FMA should not be defined"
+#endif
+
+#ifdef __FP_FAST_FMAF
+#error "__FP_FAST_FMAF should not be defined"
+#endif
+
+double
+builtin_fma (double b, double c, double d)
+{
+ return __builtin_fma (b, c, d); /* bl fma */
+}
+
+float
+builtin_fmaf (float b, float c, float d)
+{
+ return __builtin_fmaf (b, c, -d); /* bl fmaf */
+}
vectype_in = NULL_TREE;
nargs = gimple_call_num_args (stmt);
- /* Bail out if the function has more than two arguments, we
- do not have interesting builtin functions to vectorize with
- more than two arguments. No arguments is also not good. */
- if (nargs == 0 || nargs > 2)
+ /* Bail out if the function has more than three arguments; apart from
+ fma, we do not have interesting builtin functions to vectorize with
+ more than two arguments. No arguments is also not good. */
+ if (nargs == 0 || nargs > 3)
return false;
for (i = 0; i < nargs; i++)
extern void set_builtin_user_assembler_name (tree decl, const char *asmspec);
extern bool is_simple_builtin (tree);
extern bool is_inexpensive_builtin (tree);
+extern bool mode_has_fma (enum machine_mode mode);
/* In convert.c */
extern tree strip_float_extensions (tree);