From: konglin1
Date: Mon, 1 Nov 2021 02:43:34 +0000 (+0800)
Subject: i386: Optimization for mm512_set1_pch.
X-Git-Tag: upstream/12.2.0~3617
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=60e3179b7a33a62f36a143f9168ba2f777e37864;p=platform%2Fupstream%2Fgcc.git

i386: Optimization for mm512_set1_pch.

This patch is to support fold _mm512_fmadd_pch (a, _mm512_set1_pch(*(b)), c)
to 1 instruction vfmaddcph (%rsp){1to16}, %zmm1, %zmm2;

gcc/ChangeLog:

	* config/i386/sse.md (fma_<complexpairopname>_<mode>_pair):
	Add new define_insn.
	(fma_<mode>_fmaddc_bcst): Add new define_insn_and_split.
	(fma_<mode>_fcmaddc_bcst): Likewise

gcc/testsuite/ChangeLog:

	* gcc.target/i386/avx512fp16vl-complex-broadcast-1.c: New test.
---

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 09f00dc..a58d8e8 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -193,7 +193,9 @@
 
   ;; For AVX512FP16 suppport
   UNSPEC_COMPLEX_FMA
+  UNSPEC_COMPLEX_FMA_PAIR
   UNSPEC_COMPLEX_FCMA
+  UNSPEC_COMPLEX_FCMA_PAIR
   UNSPEC_COMPLEX_FMUL
   UNSPEC_COMPLEX_FCMUL
   UNSPEC_COMPLEX_MASK
@@ -5920,6 +5922,9 @@
 (define_int_iterator UNSPEC_COMPLEX_F_C_MA
         [UNSPEC_COMPLEX_FMA UNSPEC_COMPLEX_FCMA])
 
+(define_int_iterator UNSPEC_COMPLEX_F_C_MA_PAIR
+        [UNSPEC_COMPLEX_FMA_PAIR UNSPEC_COMPLEX_FCMA_PAIR])
+
 (define_int_iterator UNSPEC_COMPLEX_F_C_MUL
         [UNSPEC_COMPLEX_FMUL UNSPEC_COMPLEX_FCMUL])
 
@@ -5929,6 +5934,10 @@
          (UNSPEC_COMPLEX_FMUL "fmulc")
          (UNSPEC_COMPLEX_FCMUL "fcmulc")])
 
+(define_int_attr complexpairopname
+        [(UNSPEC_COMPLEX_FMA_PAIR "fmaddc")
+         (UNSPEC_COMPLEX_FCMA_PAIR "fcmaddc")])
+
 (define_mode_attr complexmove
   [(V32HF "avx512f_loadv16sf")
    (V16HF "avx512vl_loadv8sf")
@@ -6074,6 +6083,59 @@
           [(match_dup 1) (match_dup 2) (match_dup 4)]
            UNSPEC_COMPLEX_F_C_MA))])
 
+(define_insn "fma_<complexpairopname>_<mode>_pair"
+  [(set (match_operand:VF1_AVX512VL 0 "register_operand" "=&v")
+        (unspec:VF1_AVX512VL
+          [(match_operand:VF1_AVX512VL 1 "vector_operand" "%v")
+           (match_operand:VF1_AVX512VL 2 "bcst_vector_operand" "vmBr")
+           (match_operand:VF1_AVX512VL 3 "vector_operand" "0")]
+           UNSPEC_COMPLEX_F_C_MA_PAIR))]
+  "TARGET_AVX512FP16"
+  "v<complexpairopname>ph\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "ssemuladd")])
+
+(define_insn_and_split "fma_<mode>_fmaddc_bcst"
+  [(set (match_operand:VF_AVX512FP16VL 0 "register_operand")
+        (unspec:VF_AVX512FP16VL
+          [(match_operand:VF_AVX512FP16VL 1 "vector_operand")
+           (subreg:VF_AVX512FP16VL
+             (match_operand:<ssePSmode> 2 "bcst_vector_operand") 0)
+           (match_operand:VF_AVX512FP16VL 3 "vector_operand")]
+           UNSPEC_COMPLEX_FMA))]
+  "TARGET_AVX512FP16"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+        (unspec:<ssePSmode>
+          [(match_dup 1) (match_dup 2) (match_dup 3)]
+           UNSPEC_COMPLEX_FMA_PAIR))]
+  {
+    operands[0] = lowpart_subreg (<ssePSmode>mode, operands[0], <MODE>mode);
+    operands[1] = lowpart_subreg (<ssePSmode>mode, operands[1], <MODE>mode);
+    operands[3] = lowpart_subreg (<ssePSmode>mode, operands[3], <MODE>mode);
+  })
+
+(define_insn_and_split "fma_<mode>_fcmaddc_bcst"
+  [(set (match_operand:VF_AVX512FP16VL 0 "register_operand")
+        (unspec:VF_AVX512FP16VL
+          [(match_operand:VF_AVX512FP16VL 1 "vector_operand")
+           (subreg:VF_AVX512FP16VL
+             (match_operand:<ssePSmode> 2 "bcst_vector_operand") 0)
+           (match_operand:VF_AVX512FP16VL 3 "vector_operand")]
+           UNSPEC_COMPLEX_FCMA))]
+  "TARGET_AVX512FP16"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+        (unspec:<ssePSmode>
+          [(match_dup 1) (match_dup 2) (match_dup 3)]
+           UNSPEC_COMPLEX_FCMA_PAIR))]
+  {
+    operands[0] = lowpart_subreg (<ssePSmode>mode, operands[0], <MODE>mode);
+    operands[1] = lowpart_subreg (<ssePSmode>mode, operands[1], <MODE>mode);
+    operands[3] = lowpart_subreg (<ssePSmode>mode, operands[3], <MODE>mode);
+  })
+
 (define_insn "<avx512>_<complexopname>_<mode>_mask<round_name>"
   [(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v")
         (vec_merge:VF_AVX512FP16VL
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16vl-complex-broadcast-1.c b/gcc/testsuite/gcc.target/i386/avx512fp16vl-complex-broadcast-1.c
new file mode 100644
index 0000000..3c8e8423
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16vl-complex-broadcast-1.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to4\\\}" 2 } } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 2 } } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to16\\\}" 2 } } */
+
+#include <immintrin.h>
+
+volatile __m512h res0, a0, c0;
+volatile __m256h res1, a1, c1;
+volatile __m128h res2, a2, c2;
+volatile _Float16 *b;
+
+void extern
+avx_test(void)
+{
+  res0 = _mm512_fmadd_pch (a0, _mm512_set1_pch(*(b + 2 * 6)), c0);
+  res0 = _mm512_fcmadd_pch (a0, _mm512_set1_pch(*(b + 2 * 6)), c0);
+
+  res1 = _mm256_fmadd_pch (a1, _mm256_set1_pch(*(b + 2 * 6)), c1);
+  res1 = _mm256_fcmadd_pch (a1, _mm256_set1_pch(*(b + 2 * 6)), c1);
+
+  res2 = _mm_fmadd_pch (a2, _mm_set1_pch(*(b + 2 * 6)), c2);
+  res2 = _mm_fcmadd_pch (a2, _mm_set1_pch(*(b + 2 * 6)), c2);
+}