1.0/sqrt. */
static bool
-use_rsqrt_p ()
+use_rsqrt_p (machine_mode mode)
{
- return (TARGET_SSE && TARGET_SSE_MATH
+ return ((mode == HFmode
+ || (TARGET_SSE && TARGET_SSE_MATH))
&& flag_finite_math_only
&& !flag_trapping_math
&& flag_unsafe_math_optimizations);
return opt_type == OPTIMIZE_FOR_SPEED;
case rint_optab:
- if (mode1 == HFmode)
- return true;
- else if (SSE_FLOAT_MODE_P (mode1)
- && TARGET_SSE_MATH
- && !flag_trapping_math
- && !TARGET_SSE4_1)
+ if (SSE_FLOAT_MODE_P (mode1)
+ && TARGET_SSE_MATH
+ && !flag_trapping_math
+ && !TARGET_SSE4_1
+ && mode1 != HFmode)
return opt_type == OPTIMIZE_FOR_SPEED;
return true;
case floor_optab:
case ceil_optab:
case btrunc_optab:
- if (mode1 == HFmode)
- return true;
- else if (SSE_FLOAT_MODE_P (mode1)
- && TARGET_SSE_MATH
- && !flag_trapping_math
- && TARGET_SSE4_1)
+ if (((SSE_FLOAT_MODE_P (mode1)
+ && TARGET_SSE_MATH
+ && TARGET_SSE4_1)
+ || mode1 == HFmode)
+ && !flag_trapping_math)
return true;
return opt_type == OPTIMIZE_FOR_SPEED;
case rsqrt_optab:
- return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
+ return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
default:
return true;
(match_operand:XF 2 "register_operand")))]
"TARGET_80387")
+/* There is no more precision loss than Newton-Raphson approximation
+ when using HFmode rcp/rsqrt, so do the transformation directly under
+ TARGET_RECIP_DIV and fast-math. */
(define_expand "divhf3"
[(set (match_operand:HF 0 "register_operand")
(div:HF (match_operand:HF 1 "register_operand")
(match_operand:HF 2 "nonimmediate_operand")))]
- "TARGET_AVX512FP16")
+ "TARGET_AVX512FP16"
+{
+ /* When TARGET_RECIP_DIV is set, the insn is optimized for speed and
+ the fast-math flags tested below hold, replace the division by
+ rcp (operands[2]) followed by a multiply with operands[1] and
+ finish with DONE. Otherwise this block ends without DONE. */
+ if (TARGET_RECIP_DIV
+ && optimize_insn_for_speed_p ()
+ && flag_finite_math_only && !flag_trapping_math
+ && flag_unsafe_math_optimizations)
+ {
+ rtx op = gen_reg_rtx (HFmode);
+ /* rcphf2 is emitted with a register source here. */
+ operands[2] = force_reg (HFmode, operands[2]);
+ emit_insn (gen_rcphf2 (op, operands[2]));
+ emit_insn (gen_mulhf3 (operands[0], operands[1], op));
+ DONE;
+ }
+})
(define_expand "div<mode>3"
[(set (match_operand:MODEF 0 "register_operand")
]
(symbol_ref "true")))])
+;; Scalar HFmode reciprocal (UNSPEC_RCP), emitted as vrcpsh.
+;; Alternative 0 takes the source from a register, alternative 1 from
+;; memory; only alternative 1 is flagged avx_partial_xmm_update.
+(define_insn "rcphf2"
+ [(set (match_operand:HF 0 "register_operand" "=v,v")
+ (unspec:HF [(match_operand:HF 1 "nonimmediate_operand" "v,m")]
+ UNSPEC_RCP))]
+ "TARGET_AVX512FP16"
+ "@
+ vrcpsh\t{%d1, %0|%0, %d1}
+ vrcpsh\t{%1, %d0|%d0, %1}"
+ [(set_attr "type" "sse")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "HF")
+ (set_attr "avx_partial_xmm_update" "false,true")])
+
(define_insn "*fop_xf_1_i387"
[(set (match_operand:XF 0 "register_operand" "=f,f")
(match_operator:XF 3 "binary_fp_operator"
DONE;
})
+;; Scalar HFmode reciprocal square root (UNSPEC_RSQRT), emitted as
+;; vrsqrtsh. Alternative 0 takes the source from a register,
+;; alternative 1 from memory; only alternative 1 is flagged
+;; avx_partial_xmm_update.
+(define_insn "rsqrthf2"
+ [(set (match_operand:HF 0 "register_operand" "=v,v")
+ (unspec:HF [(match_operand:HF 1 "nonimmediate_operand" "v,m")]
+ UNSPEC_RSQRT))]
+ "TARGET_AVX512FP16"
+ "@
+ vrsqrtsh\t{%d1, %0|%0, %d1}
+ vrsqrtsh\t{%1, %d0|%d0, %1}"
+ [(set_attr "type" "sse")
+ (set_attr "prefix" "evex")
+ (set_attr "avx_partial_xmm_update" "false,true")
+ (set_attr "mode" "HF")])
+
(define_insn "sqrthf2"
[(set (match_operand:HF 0 "register_operand" "=v,v")
(sqrt:HF
(set_attr "mode" "<ssescalarmode>")])
(define_expand "div<mode>3"
- [(set (match_operand:VF2H 0 "register_operand")
- (div:VF2H (match_operand:VF2H 1 "register_operand")
- (match_operand:VF2H 2 "vector_operand")))]
+ [(set (match_operand:VF2 0 "register_operand")
+ (div:VF2 (match_operand:VF2 1 "register_operand")
+ (match_operand:VF2 2 "vector_operand")))]
"TARGET_SSE2")
+;; Vector division for the VF_AVX512FP16VL modes under TARGET_AVX512FP16.
+;; When the element mode is HFmode and the reciprocal/fast-math
+;; conditions tested in the body hold, the division is emitted as
+;; avx512fp16_rcp<mode>2 followed by mul<mode>3.
(define_expand "div<mode>3"
+ [(set (match_operand:VF_AVX512FP16VL 0 "register_operand")
+ (div:VF_AVX512FP16VL
+ (match_operand:VF_AVX512FP16VL 1 "register_operand")
+ (match_operand:VF_AVX512FP16VL 2 "vector_operand")))]
+ "TARGET_AVX512FP16"
+{
+ /* Transform HF vector div to vector mul/rcp. */
+ if (GET_MODE_INNER (<MODE>mode) == HFmode
+ && TARGET_RECIP_VEC_DIV
+ && optimize_insn_for_speed_p ()
+ && flag_finite_math_only && !flag_trapping_math
+ && flag_unsafe_math_optimizations)
+ {
+ rtx op = gen_reg_rtx (<MODE>mode);
+ /* The divisor is forced into a register before the rcp is emitted. */
+ operands[2] = force_reg (<MODE>mode, operands[2]);
+ emit_insn (gen_avx512fp16_rcp<mode>2 (op, operands[2]));
+ emit_insn (gen_mul<mode>3 (operands[0], operands[1], op));
+ DONE;
+ }
+})
+
+(define_expand "div<mode>3"
[(set (match_operand:VF1 0 "register_operand")
(div:VF1 (match_operand:VF1 1 "register_operand")
(match_operand:VF1 2 "vector_operand")))]
(set_attr "prefix" "evex")
(set_attr "mode" "HF")])
+;; Matches a V8HF vec_merge whose element selected by mask 1 is the
+;; UNSPEC_RCP of scalar operand 1 (duplicated across the vector) and whose
+;; remaining elements come from operand 2; emitted as a three-operand
+;; vrcpsh.
+(define_insn "*avx512fp16_vmrcpv8hf2"
+ [(set (match_operand:V8HF 0 "register_operand" "=v")
+ (vec_merge:V8HF
+ (vec_duplicate:V8HF
+ (unspec:HF [(match_operand:HF 1 "nonimmediate_operand" "vm")]
+ UNSPEC_RCP))
+ (match_operand:V8HF 2 "register_operand" "v")
+ (const_int 1)))]
+ "TARGET_AVX512FP16"
+ "vrcpsh\t{%1, %2, %0|%0, %2, %w1}"
+ [(set_attr "type" "sse")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "HF")])
+
(define_insn "<mask_codefor>rcp14<mode><mask_name>"
[(set (match_operand:VF_AVX512VL 0 "register_operand" "=v")
(unspec:VF_AVX512VL
DONE;
})
+;; Named rsqrt expander for the VF_AVX512FP16VL vector modes. It has no
+;; preparation code: the UNSPEC_RSQRT set below is emitted as written.
+(define_expand "rsqrt<mode>2"
+ [(set (match_operand:VF_AVX512FP16VL 0 "register_operand")
+ (unspec:VF_AVX512FP16VL
+ [(match_operand:VF_AVX512FP16VL 1 "vector_operand")]
+ UNSPEC_RSQRT))]
+ "TARGET_AVX512FP16")
+
(define_insn "<sse>_rsqrt<mode>2"
[(set (match_operand:VF1_128_256 0 "register_operand" "=x")
(unspec:VF1_128_256
(set_attr "prefix" "evex")
(set_attr "mode" "HF")])
+;; Matches a V8HF vec_merge whose element selected by mask 1 is the
+;; UNSPEC_RSQRT of scalar operand 1 (duplicated across the vector) and
+;; whose remaining elements come from operand 2; emitted as a
+;; three-operand vrsqrtsh.
+;; NOTE(review): the output template references <mask_scalar_operand3>
+;; although the pattern name carries no mask subst attribute, and the
+;; sibling *avx512fp16_vmrcpv8hf2 template has no such reference --
+;; confirm this is intended.
+(define_insn "*avx512fp16_vmrsqrtv8hf2"
+ [(set (match_operand:V8HF 0 "register_operand" "=v")
+ (vec_merge:V8HF
+ (vec_duplicate:V8HF
+ (unspec:HF [(match_operand:HF 1 "nonimmediate_operand" "vm")]
+ UNSPEC_RSQRT))
+ (match_operand:V8HF 2 "register_operand" "v")
+ (const_int 1)))]
+ "TARGET_AVX512FP16"
+ "vrsqrtsh\t{%1, %2, %0<mask_scalar_operand3>|%0<mask_scalar_operand3>, %2, %w1}"
+ [(set_attr "type" "sse")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "HF")])
+
(define_expand "cond_<code><mode>"
[(set (match_operand:VFH 0 "register_operand")
(vec_merge:VFH
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512fp16 -mavx512vl -ffast-math" } */
+/* { dg-final { scan-assembler "vrcpsh.*\n.*vmulsh" } } */
+/* { dg-final { scan-assembler "vrcpph.*\n.*vmulph" } } */
+/* { dg-final { scan-assembler "vrsqrtsh.*\n.*vmulsh" } } */
+/* { dg-final { scan-assembler "vrsqrtph.*\n.*vmulph" } } */
+/* { dg-final { scan-assembler-not "vsqrtsh" } } */
+/* { dg-final { scan-assembler-not "vsqrtph" } } */
+/* { dg-final { scan-assembler-not "vdivsh" } } */
+/* { dg-final { scan-assembler-not "vdivph" } } */
+
+#define FAST_ATTR \
+ __attribute__((noinline, noclone, optimize("fast-math"), target("recip")))
+
+/* Scalar _Float16 division built with FAST_ATTR. The dg-final
+   directives of this file scan for vrcpsh followed by vmulsh and for
+   the absence of vdivsh.  */
+_Float16 FAST_ATTR
+scalar_hf_rcp_fast (_Float16 a, _Float16 b)
+{
+ return a / b;
+}
+
+/* Scalar a / sqrt (b) on _Float16 built with FAST_ATTR. The dg-final
+   directives of this file scan for vrsqrtsh followed by vmulsh and for
+   the absence of vsqrtsh and vdivsh.  */
+_Float16 FAST_ATTR
+scalar_hf_rsqrt_fast (_Float16 a, _Float16 b)
+{
+ return a / __builtin_sqrtf16 (b);
+}
+
+/* Store a[i] / b[i] into c[i] for i in [0, n), built with FAST_ATTR.
+   The dg-final directives of this file scan for vrcpph followed by
+   vmulph and for the absence of vdivph.  */
+void FAST_ATTR
+vector_hf_rcp_fast (_Float16 * restrict a, _Float16 * restrict b,
+ _Float16 * restrict c, int n)
+{
+ int i;
+ for (i = 0; i < n; i++)
+ c[i] = a[i] / b[i];
+}
+
+/* Store a[i] / sqrt (b[i]) into c[i] for i in [0, n), built with
+   FAST_ATTR. The dg-final directives of this file scan for vrsqrtph
+   followed by vmulph and for the absence of vsqrtph and vdivph.  */
+void FAST_ATTR
+vector_hf_rsqrt_fast (_Float16 * restrict a, _Float16 * restrict b,
+ _Float16 * restrict c, int n)
+{
+ int i;
+ for (i = 0; i < n; i++)
+ c[i] = a[i] / __builtin_sqrtf16(b[i]);
+}
--- /dev/null
+/* { dg-do run { target avx512fp16 } } */
+/* { dg-options "-O3 -mavx512fp16 -mavx512vl -ffast-math" } */
+
+static void recip_op_test (void);
+#define DO_TEST recip_op_test
+#define AVX512FP16
+#define AVX512VL
+#include "avx512f-check.h"
+#include "avx512fp16-recip-1.c"
+
+_Float16 a[32], b[32], vexp[32], vref[32], sa, sb, sexp, sref;
+
+#define NO_FAST_ATTR \
+ __attribute__((noinline, noclone, \
+ optimize("fast-math,trapping-math")))
+
+/* Reference scalar division: same expression as scalar_hf_rcp_fast but
+   built with NO_FAST_ATTR, i.e. fast-math with trapping-math switched
+   back on.  */
+_Float16 NO_FAST_ATTR
+scalar_hf_rcp_no_fast (_Float16 a, _Float16 b)
+{
+ return a / b;
+}
+
+/* Reference scalar a / sqrt (b): same expression as scalar_hf_rsqrt_fast
+   but built with NO_FAST_ATTR, i.e. fast-math with trapping-math
+   switched back on.  */
+_Float16 NO_FAST_ATTR
+scalar_hf_rsqrt_no_fast (_Float16 a, _Float16 b)
+{
+ return a / __builtin_sqrtf16 (b);
+}
+
+/* Reference element-wise division c[i] = a[i] / b[i] for i in [0, n),
+   built with NO_FAST_ATTR (fast-math with trapping-math switched back
+   on).  */
+void NO_FAST_ATTR
+vector_hf_rcp_no_fast (_Float16 * restrict a, _Float16 * restrict b,
+ _Float16 * restrict c, int n)
+{
+ int i;
+ for (i = 0; i < n; i++)
+ c[i] = a[i] / b[i];
+}
+
+/* Reference element-wise c[i] = a[i] / sqrt (b[i]) for i in [0, n),
+   built with NO_FAST_ATTR (fast-math with trapping-math switched back
+   on).  */
+void NO_FAST_ATTR
+vector_hf_rsqrt_no_fast (_Float16 * restrict a, _Float16 * restrict b,
+ _Float16 * restrict c, int n)
+{
+ int i;
+ for (i = 0; i < n; i++)
+ c[i] = a[i] / __builtin_sqrtf16 (b[i]);
+}
+
+/* Reload the global scalar and vector inputs and overwrite every result
+   slot (sexp/sref, vexp/vref) with a placeholder value. Note that b[0]
+   is 0, because b[i] is i * 1.5.  */
+void init()
+{
+ int i;
+ sa = 3.75;
+ sb = 6.25;
+ sexp = sref = 2.75;
+ for (i = 0; i < 32; i++)
+ {
+ a[i] = i + 0.5;
+ b[i] = i * 1.5;
+ vexp[i] = vref[i] = 2.75 * i;
+ }
+}
+
+/* Return 1 if the first SIZE 16-bit elements at A and B are
+   bit-identical, 0 otherwise. The callers pass _Float16 objects, so the
+   storage is inspected byte by byte through unsigned char, which may
+   alias any object type; reading it through unsigned short would
+   violate the effective-type rule, and this file is built at -O3.  */
+int check_cond(void *a, void *b, int size)
+{
+  const unsigned char *pa = (const unsigned char *)a,
+    *pb = (const unsigned char *)b;
+  /* Each element is as wide as the unsigned short used previously.  */
+  int i, nbytes = size * (int) sizeof (unsigned short);
+  for (i = 0; i < nbytes; i++)
+    if (pa[i] != pb[i])
+      return 0;
+  return 1;
+}
+
+/* Test body registered as DO_TEST. For each of the four operations it
+   reinitializes the globals, runs the FAST_ATTR kernel and the
+   NO_FAST_ATTR reference on the same inputs, and aborts unless
+   check_cond reports the compared results as bit-identical.  */
+static void recip_op_test()
+{
+ init ();
+ sexp = scalar_hf_rcp_fast (sa, sb);
+ sref = scalar_hf_rcp_no_fast (sa, sb);
+ if (!check_cond (&sexp, &sref, 1))
+ abort ();
+
+ init ();
+ sexp = scalar_hf_rsqrt_fast (sa, sb);
+ sref = scalar_hf_rsqrt_no_fast (sa, sb);
+ if (!check_cond (&sexp, &sref, 1))
+ abort ();
+
+ init ();
+ vector_hf_rcp_fast (a, b, vexp, 32);
+ vector_hf_rcp_no_fast (a, b, vref, 32);
+ /* NOTE(review): 32 elements are computed but only element 0 is
+    compared (size argument 1), and b[0] is 0 after init -- confirm
+    that this limited check is intentional.  */
+ if (!check_cond (vexp, vref, 1))
+ abort ();
+
+ init ();
+ vector_hf_rsqrt_fast (a, b, vexp, 32);
+ vector_hf_rsqrt_no_fast (a, b, vref, 32);
+ /* NOTE(review): as above, only element 0 of the 32 results is
+    compared.  */
+ if (!check_cond (vexp, vref, 1))
+ abort ();
+}
/* PR target/102464. */
/* { dg-do compile } */
-/* { dg-options "-O2 -mavx512fp16" } */
+/* { dg-options "-O2 -mavx512fp16 -fno-trapping-math" } */
#define FOO(FUNC,SUFFIX) \
_Float16 \