From c4d423c74abd470d0161ec2a428fd01de5e1ed76 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Thu, 28 Feb 2019 11:43:30 -0800 Subject: [PATCH] AVX512FP16: Add vmovw/vmovsh. gcc/ChangeLog: * config/i386/avx512fp16intrin.h: (_mm_cvtsi16_si128): New intrinsic. (_mm_cvtsi128_si16): Likewise. (_mm_mask_load_sh): Likewise. (_mm_maskz_load_sh): Likewise. (_mm_mask_store_sh): Likewise. (_mm_move_sh): Likewise. (_mm_mask_move_sh): Likewise. (_mm_maskz_move_sh): Likewise. * config/i386/i386-builtin-types.def: Add corresponding builtin types. * config/i386/i386-builtin.def: Add corresponding new builtins. * config/i386/i386-expand.c (ix86_expand_special_args_builtin): Handle new builtin types. (ix86_expand_vector_init_one_nonzero): Adjust for FP16 target. * config/i386/sse.md (VI2F): New mode iterator. (vec_set_0): Use new mode iterator. (avx512f_mov_mask): Adjust for HF vector mode. (avx512f_store_mask): Ditto. --- gcc/config/i386/avx512fp16intrin.h | 59 ++++++++++++++++++++++++++++++++++ gcc/config/i386/i386-builtin-types.def | 3 ++ gcc/config/i386/i386-builtin.def | 5 +++ gcc/config/i386/i386-expand.c | 11 +++++++ gcc/config/i386/sse.md | 33 ++++++++++--------- 5 files changed, 95 insertions(+), 16 deletions(-) diff --git a/gcc/config/i386/avx512fp16intrin.h b/gcc/config/i386/avx512fp16intrin.h index 5d66ca5..baa5be4 100644 --- a/gcc/config/i386/avx512fp16intrin.h +++ b/gcc/config/i386/avx512fp16intrin.h @@ -2453,6 +2453,65 @@ _mm512_maskz_getmant_round_ph (__mmask32 __U, __m512h __A, #endif /* __OPTIMIZE__ */ +/* Intrinsics vmovw. */ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi16_si128 (short __A) +{ + return _mm_set_epi16 (0, 0, 0, 0, 0, 0, 0, __A); +} + +extern __inline short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi128_si16 (__m128i __A) +{ + return __builtin_ia32_vec_ext_v8hi ((__v8hi)__A, 0); +} + +/* Intrinsics vmovsh. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_load_sh (__m128h __A, __mmask8 __B, _Float16 const* __C) +{ + return __builtin_ia32_loadsh_mask (__C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_load_sh (__mmask8 __A, _Float16 const* __B) +{ + return __builtin_ia32_loadsh_mask (__B, _mm_setzero_ph (), __A); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_store_sh (_Float16 const* __A, __mmask8 __B, __m128h __C) +{ + __builtin_ia32_storesh_mask (__A, __C, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_move_sh (__m128h __A, __m128h __B) +{ + __A[0] = __B[0]; + return __A; +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_move_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_vmovsh_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_move_sh (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_vmovsh_mask (__B, __C, _mm_setzero_ph (), __A); +} + #ifdef __DISABLE_AVX512FP16__ #undef __DISABLE_AVX512FP16__ #pragma GCC pop_options diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def index 78127fa..126cc0c 100644 --- a/gcc/config/i386/i386-builtin-types.def +++ b/gcc/config/i386/i386-builtin-types.def @@ -134,6 +134,7 @@ DEF_POINTER_TYPE (PCVOID, VOID, CONST) DEF_POINTER_TYPE (PVOID, VOID) DEF_POINTER_TYPE (PDOUBLE, DOUBLE) DEF_POINTER_TYPE (PFLOAT, FLOAT) +DEF_POINTER_TYPE (PCFLOAT16, FLOAT16, CONST) DEF_POINTER_TYPE (PSHORT, SHORT) DEF_POINTER_TYPE (PUSHORT, USHORT) DEF_POINTER_TYPE (PINT, INT) @@ -1308,6 +1309,8 @@ DEF_FUNCTION_TYPE (QI, V8HF, INT, UQI) DEF_FUNCTION_TYPE (HI, V16HF, INT, UHI) DEF_FUNCTION_TYPE (SI, V32HF, INT, USI) DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF) +DEF_FUNCTION_TYPE (VOID, PCFLOAT16, V8HF, UQI) +DEF_FUNCTION_TYPE (V8HF, PCFLOAT16, V8HF, UQI) DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, UQI) DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, INT) DEF_FUNCTION_TYPE (V8HF, V8HF, INT, V8HF, UQI) diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index 0292059..b8f782b 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -393,6 +393,10 @@ BDESC (OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_avx512bw_us_truncatev32hiv32qi2_mas BDESC (OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_avx512bw_ss_truncatev32hiv32qi2_mask_store, "__builtin_ia32_pmovswb512mem_mask", IX86_BUILTIN_PMOVSWB512_MEM, UNKNOWN, (int) VOID_FTYPE_PV32QI_V32HI_USI) BDESC (OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_avx512bw_truncatev32hiv32qi2_mask_store, "__builtin_ia32_pmovwb512mem_mask", IX86_BUILTIN_PMOVWB512_MEM, UNKNOWN, (int) VOID_FTYPE_PV32QI_V32HI_USI) +/* AVX512FP16 */ +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_loadhf_mask, "__builtin_ia32_loadsh_mask", IX86_BUILTIN_LOADSH_MASK, UNKNOWN, (int) V8HF_FTYPE_PCFLOAT16_V8HF_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_storehf_mask, "__builtin_ia32_storesh_mask", IX86_BUILTIN_STORESH_MASK, UNKNOWN, (int) VOID_FTYPE_PCFLOAT16_V8HF_UQI) + /* RDPKRU and WRPKRU. */ BDESC (OPTION_MASK_ISA_PKU, 0, CODE_FOR_rdpkru, "__builtin_ia32_rdpkru", IX86_BUILTIN_RDPKRU, UNKNOWN, (int) UNSIGNED_FTYPE_VOID) BDESC (OPTION_MASK_ISA_PKU, 0, CODE_FOR_wrpkru, "__builtin_ia32_wrpkru", IX86_BUILTIN_WRPKRU, UNKNOWN, (int) VOID_FTYPE_UNSIGNED) @@ -2826,6 +2830,7 @@ BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_ BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_getexpv8hf_mask, "__builtin_ia32_getexpph128_mask", IX86_BUILTIN_GETEXPPH128, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_UQI) BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_getmantv16hf_mask, "__builtin_ia32_getmantph256_mask", IX86_BUILTIN_GETMANTPH256, UNKNOWN, (int) V16HF_FTYPE_V16HF_INT_V16HF_UHI) BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_getmantv8hf_mask, "__builtin_ia32_getmantph128_mask", IX86_BUILTIN_GETMANTPH128, UNKNOWN, (int) V8HF_FTYPE_V8HF_INT_V8HF_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_movhf_mask, "__builtin_ia32_vmovsh_mask", IX86_BUILTIN_VMOVSH_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) /* Builtins with rounding support. */ BDESC_END (ARGS, ROUND_ARGS) diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 84acaa2..7e83087 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -11085,6 +11085,7 @@ ix86_expand_special_args_builtin (const struct builtin_description *d, case VOID_FTYPE_PFLOAT_V16SF_UHI: case VOID_FTYPE_PFLOAT_V8SF_UQI: case VOID_FTYPE_PFLOAT_V4SF_UQI: + case VOID_FTYPE_PCFLOAT16_V8HF_UQI: case VOID_FTYPE_PV32QI_V32HI_USI: case VOID_FTYPE_PV16QI_V16HI_UHI: case VOID_FTYPE_PUDI_V8HI_UQI: @@ -11157,6 +11158,7 @@ ix86_expand_special_args_builtin (const struct builtin_description *d, case V16SF_FTYPE_PCFLOAT_V16SF_UHI: case V8SF_FTYPE_PCFLOAT_V8SF_UQI: case V4SF_FTYPE_PCFLOAT_V4SF_UQI: + case V8HF_FTYPE_PCFLOAT16_V8HF_UQI: nargs = 3; klass = load; memory = 0; @@ -14194,6 +14196,8 @@ ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode, break; case E_V8HImode: use_vector_set = TARGET_SSE2; + gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0 + ? gen_vec_setv8hi_0 : NULL; break; case E_V8QImode: use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1; @@ -14205,8 +14209,12 @@ ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode, use_vector_set = TARGET_SSE4_1; break; case E_V32QImode: + use_vector_set = TARGET_AVX; + break; case E_V16HImode: use_vector_set = TARGET_AVX; + gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0 + ? gen_vec_setv16hi_0 : NULL; break; case E_V8SImode: use_vector_set = TARGET_AVX; @@ -14254,6 +14262,9 @@ ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode, use_vector_set = TARGET_AVX512FP16 && one_var == 0; gen_vec_set_0 = gen_vec_setv32hf_0; break; + case E_V32HImode: + use_vector_set = TARGET_AVX512FP16 && one_var == 0; + gen_vec_set_0 = gen_vec_setv32hi_0; default: break; } diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index a1d4192..5dbbed0 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -779,6 +779,7 @@ (V32HF "TARGET_AVX512BW")]) ;; Int-float size matches +(define_mode_iterator VI2F [V8HI V16HI V32HI V8HF V16HF V32HF]) (define_mode_iterator VI4F_128 [V4SI V4SF]) (define_mode_iterator VI8F_128 [V2DI V2DF]) (define_mode_iterator VI4F_256 [V8SI V8SF]) @@ -1361,13 +1362,13 @@ [(set (match_dup 0) (match_dup 1))]) (define_insn "avx512f_mov_mask" - [(set (match_operand:VF_128 0 "register_operand" "=v") - (vec_merge:VF_128 - (vec_merge:VF_128 - (match_operand:VF_128 2 "register_operand" "v") - (match_operand:VF_128 3 "nonimm_or_0_operand" "0C") + [(set (match_operand:VFH_128 0 "register_operand" "=v") + (vec_merge:VFH_128 + (vec_merge:VFH_128 + (match_operand:VFH_128 2 "register_operand" "v") + (match_operand:VFH_128 3 "nonimm_or_0_operand" "0C") (match_operand:QI 4 "register_operand" "Yk")) - (match_operand:VF_128 1 "register_operand" "v") + (match_operand:VFH_128 1 "register_operand" "v") (const_int 1)))] "TARGET_AVX512F" "vmov\t{%2, %1, %0%{%4%}%N3|%0%{%4%}%N3, %1, %2}" @@ -1380,7 +1381,7 @@ (vec_merge: (vec_merge: (vec_duplicate: - (match_operand:MODEF 1 "memory_operand")) + (match_operand:MODEFH 1 "memory_operand")) (match_operand: 2 "nonimm_or_0_operand") (match_operand:QI 3 "register_operand")) (match_dup 4) @@ -1393,7 +1394,7 @@ (vec_merge: (vec_merge: (vec_duplicate: - (match_operand:MODEF 1 "memory_operand" "m")) + (match_operand:MODEFH 1 "memory_operand" "m")) (match_operand: 2 "nonimm_or_0_operand" "0C") (match_operand:QI 3 "register_operand" "Yk")) (match_operand: 4 "const0_operand" "C") @@ -1406,11 +1407,11 @@ (set_attr "mode" "")]) (define_insn "avx512f_store_mask" - [(set (match_operand:MODEF 0 "memory_operand" "=m") - (if_then_else:MODEF + [(set (match_operand:MODEFH 0 "memory_operand" "=m") + (if_then_else:MODEFH (and:QI (match_operand:QI 2 "register_operand" "Yk") (const_int 1)) - (vec_select:MODEF + (vec_select:MODEFH (match_operand: 1 "register_operand" "v") (parallel [(const_int 0)])) (match_dup 0)))] @@ -8818,11 +8819,11 @@ ;; vmovw clears also the higer bits (define_insn "vec_set_0" - [(set (match_operand:VF_AVX512FP16 0 "register_operand" "=v,v") - (vec_merge:VF_AVX512FP16 - (vec_duplicate:VF_AVX512FP16 - (match_operand:HF 2 "nonimmediate_operand" "r,m")) - (match_operand:VF_AVX512FP16 1 "const0_operand" "C,C") + [(set (match_operand:VI2F 0 "register_operand" "=v,v") + (vec_merge:VI2F + (vec_duplicate:VI2F + (match_operand: 2 "nonimmediate_operand" "r,m")) + (match_operand:VI2F 1 "const0_operand" "C,C") (const_int 1)))] "TARGET_AVX512FP16" "@ -- 2.7.4