From 3540429be7ad1085af83600483908b621078fb6f Mon Sep 17 00:00:00 2001 From: liuhongt Date: Mon, 27 Sep 2021 14:57:38 +0800 Subject: [PATCH] Support 128/256/512-bit vector plus/smin/smax reduction for _Float16. gcc/ChangeLog: * config/i386/i386-expand.c (emit_reduc_half): Handle V8HF/V16HF/V32HFmode. * config/i386/sse.md (REDUC_SSE_PLUS_MODE): Add V8HF. (REDUC_SSE_SMINMAX_MODE): Ditto. (REDUC_PLUS_MODE): Add V16HF and V32HF. (REDUC_SMINMAX_MODE): Ditto. gcc/testsuite/ChangeLog: * gcc.target/i386/avx512fp16-reduce-op-2.c: New test. * gcc.target/i386/avx512fp16-reduce-op-3.c: New test. --- gcc/config/i386/i386-expand.c | 3 + gcc/config/i386/sse.md | 10 ++- .../gcc.target/i386/avx512fp16-reduce-op-2.c | 96 ++++++++++++++++++++++ .../gcc.target/i386/avx512fp16-reduce-op-3.c | 91 ++++++++++++++++++++ 4 files changed, 198 insertions(+), 2 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-reduce-op-2.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-reduce-op-3.c diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 94ac303..4780b99 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -16045,6 +16045,7 @@ emit_reduc_half (rtx dest, rtx src, int i) break; case E_V16QImode: case E_V8HImode: + case E_V8HFmode: case E_V4SImode: case E_V2DImode: d = gen_reg_rtx (V1TImode); @@ -16066,6 +16067,7 @@ emit_reduc_half (rtx dest, rtx src, int i) break; case E_V32QImode: case E_V16HImode: + case E_V16HFmode: case E_V8SImode: case E_V4DImode: if (i == 256) @@ -16085,6 +16087,7 @@ emit_reduc_half (rtx dest, rtx src, int i) break; case E_V64QImode: case E_V32HImode: + case E_V32HFmode: if (i < 64) { d = gen_reg_rtx (V4TImode); diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index bb7600e..4559b0c 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -3157,7 +3157,8 @@ (set_attr "mode" "V4SF")]) (define_mode_iterator REDUC_SSE_PLUS_MODE - [(V2DF "TARGET_SSE") (V4SF "TARGET_SSE")]) + 
[(V2DF "TARGET_SSE") (V4SF "TARGET_SSE") + (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")]) (define_expand "reduc_plus_scal_" [(plus:REDUC_SSE_PLUS_MODE @@ -3194,7 +3195,9 @@ (define_mode_iterator REDUC_PLUS_MODE [(V4DF "TARGET_AVX") (V8SF "TARGET_AVX") + (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL") (V8DF "TARGET_AVX512F") (V16SF "TARGET_AVX512F") + (V32HF "TARGET_AVX512FP16 && TARGET_AVX512VL") (V32QI "TARGET_AVX") (V64QI "TARGET_AVX512F")]) (define_expand "reduc_plus_scal_" @@ -3214,7 +3217,8 @@ ;; Modes handled by reduc_sm{in,ax}* patterns. (define_mode_iterator REDUC_SSE_SMINMAX_MODE - [(V4SF "TARGET_SSE") (V2DF "TARGET_SSE") + [(V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL") + (V4SF "TARGET_SSE") (V2DF "TARGET_SSE") (V4SI "TARGET_SSE2") (V8HI "TARGET_SSE2") (V16QI "TARGET_SSE2") (V2DI "TARGET_SSE4_2")]) @@ -3233,9 +3237,11 @@ (define_mode_iterator REDUC_SMINMAX_MODE [(V32QI "TARGET_AVX2") (V16HI "TARGET_AVX2") + (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL") (V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2") (V8SF "TARGET_AVX") (V4DF "TARGET_AVX") (V64QI "TARGET_AVX512BW") + (V32HF "TARGET_AVX512FP16 && TARGET_AVX512VL") (V32HI "TARGET_AVX512BW") (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F") (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")]) diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-reduce-op-2.c b/gcc/testsuite/gcc.target/i386/avx512fp16-reduce-op-2.c new file mode 100644 index 0000000..593340e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-reduce-op-2.c @@ -0,0 +1,96 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mprefer-vector-width=512 -fdump-tree-optimized" } */ + +/* { dg-final { scan-tree-dump-times "\.REDUC_PLUS" 3 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "\.REDUC_MIN" 3 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "\.REDUC_MAX" 3 "optimized" } } */ + +_Float16 +__attribute__((noipa, target("avx512fp16,avx512vl"), optimize("Ofast"))) +reduc_add_128 (_Float16* p) +{ + _Float16 sum = 0; + for (int i = 0; 
i != 8; i++) + sum += p[i]; + return sum; +} + +_Float16 +__attribute__((noipa, target("avx512fp16,avx512vl"), optimize("Ofast"))) +reduc_add_256 (_Float16* p) +{ + _Float16 sum = 0; + for (int i = 0; i != 16; i++) + sum += p[i]; + return sum; +} + +_Float16 +__attribute__((noipa, target("avx512fp16,avx512vl"), optimize("Ofast"))) +reduc_add_512 (_Float16* p) +{ + _Float16 sum = 0; + for (int i = 0; i != 32; i++) + sum += p[i]; + return sum; +} + +_Float16 +__attribute__((noipa, target("avx512fp16,avx512vl"), optimize("Ofast"))) +reduc_min_128 (_Float16* p) +{ + _Float16 sum = p[0]; + for (int i = 0; i != 8; i++) + sum = sum > p[i] ? p[i] : sum; + return sum; +} + +_Float16 +__attribute__((noipa, target("avx512fp16,avx512vl"), optimize("Ofast"))) +reduc_min_256 (_Float16* p) +{ + _Float16 sum = p[0]; + for (int i = 0; i != 16; i++) + sum = sum > p[i] ? p[i] : sum; + return sum; +} + +_Float16 +__attribute__((noipa, target("avx512fp16,avx512vl"), optimize("Ofast"))) +reduc_min_512 (_Float16* p) +{ + _Float16 sum = p[0]; + for (int i = 0; i != 32; i++) + sum = sum > p[i] ? p[i] : sum; + return sum; +} + +_Float16 +__attribute__((noipa, target("avx512fp16,avx512vl"), optimize("Ofast"))) +reduc_max_128 (_Float16* p) +{ + _Float16 sum = p[0]; + for (int i = 0; i != 8; i++) + sum = sum < p[i] ? p[i] : sum; + return sum; +} + +_Float16 +__attribute__((noipa, target("avx512fp16,avx512vl"), optimize("Ofast"))) +reduc_max_256 (_Float16* p) +{ + _Float16 sum = p[0]; + for (int i = 0; i != 16; i++) + sum = sum < p[i] ? p[i] : sum; + return sum; +} + +_Float16 +__attribute__((noipa, target("avx512fp16,avx512vl"), optimize("Ofast"))) +reduc_max_512 (_Float16* p) +{ + _Float16 sum = p[0]; + for (int i = 0; i != 32; i++) + sum = sum < p[i] ? 
p[i] : sum; + return sum; +} diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-reduce-op-3.c b/gcc/testsuite/gcc.target/i386/avx512fp16-reduce-op-3.c new file mode 100644 index 0000000..9281a3b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-reduce-op-3.c @@ -0,0 +1,91 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx512fp16" } */ +/* { dg-require-effective-target avx512fp16 } */ +/* { dg-require-effective-target avx512vl } */ + +#define AVX512FP16 +#define AVX512VL + +#include "avx512f-helper.h" + +#include "avx512fp16-reduce-op-2.c" + +void +test_256 (void) +{ + _Float16 a[32]; + int sign = 1; + _Float16 res1 = 0, exp1; + _Float16 res2 = 0, exp2; + _Float16 res3 = 0, exp3; + + for (int i = 0; i != 32; i++) + { + a[i] = sign * (4.0 * i); + sign *= -1; + if (i < 8) + res1 += a[i]; + if (i < 16) + res2 += a[i]; + res3 += a[i]; + } + + exp1 = reduc_add_128 (a); + exp2 = reduc_add_256 (a); + exp3 = reduc_add_512 (a); + if (exp1 != res1 || exp2 != res2 || exp3 != res3) + abort(); +} + +#define MAX(A, B) ((A) > (B) ? (A) : (B)) +#define MIN(A, B) ((A) < (B) ? 
(A) : (B)) + +void +test_128 () +{ + _Float16 a[32]; + int sign = 1; + _Float16 min_res1, min_exp1, max_res1, max_exp1; + _Float16 min_res2, min_exp2, max_res2, max_exp2; + _Float16 min_res3, min_exp3, max_res3, max_exp3; + + for (int i = 0; i != 32; i++) + { + a[i] = sign * (4.9 * i * i - 8.3 * i + 14.8); + sign *= -1; + } + + min_res1 = max_res1 = a[0]; + for (int i = 0 ; i != 8; i++) + { + min_res1 = MIN (min_res1, a[i]); + max_res1 = MAX (max_res1, a[i]); + } + + min_res2 = min_res1; + max_res2 = max_res1; + for (int i = 8 ; i != 16; i++) + { + min_res2 = MIN (min_res2, a[i]); + max_res2 = MAX (max_res2, a[i]); + } + + min_res3 = min_res2; + max_res3 = max_res2; + for (int i = 16 ; i != 32; i++) + { + min_res3 = MIN (min_res3, a[i]); + max_res3 = MAX (max_res3, a[i]); + } + + min_exp1 = reduc_min_128 (a); + min_exp2 = reduc_min_256 (a); + min_exp3 = reduc_min_512 (a); + max_exp1 = reduc_max_128 (a); + max_exp2 = reduc_max_256 (a); + max_exp3 = reduc_max_512 (a); + + if (min_exp1 != min_res1 || min_exp2 != min_res2 || min_exp3 != min_res3 + || max_exp1 != max_res1 || max_exp2 != max_res2 || max_exp3 != max_res3) + abort(); +} -- 2.7.4