From 8bc700f4c3fbe405413db02281ef2918bfa831fc Mon Sep 17 00:00:00 2001 From: liuhongt Date: Mon, 17 Jan 2022 10:47:46 +0800 Subject: [PATCH] Enhance vec_pack_trunc for integral mode mask. For the testcase in the PR, the patch supports QI:4 -> HI:16 pack in multiple steps (first pack QI:4 -> QI:8 through vec_pack_sbool_trunc_qi, then pack QI:8 -> HI:16 through vec_pack_trunc_hi). Similarly for QI:2 -> HI:16, which is test4 in mask-pack-prefer128.c. gcc/ChangeLog: PR target/103771 * tree-vect-stmts.cc (supportable_narrowing_operation): Enhance integral mode mask pack by multiple steps, which takes vec_pack_sbool_trunc_optab as the start when the number of elements is less than BITS_PER_UNIT. gcc/testsuite/ChangeLog: * gcc.target/i386/mask-pack-prefer128.c: New test. * gcc.target/i386/mask-pack-prefer256.c: New test. * gcc.target/i386/pr103771.c: New test. --- gcc/testsuite/gcc.target/i386/mask-pack-prefer128.c | 8 ++++++++ gcc/testsuite/gcc.target/i386/mask-pack-prefer256.c | 8 ++++++++ gcc/testsuite/gcc.target/i386/pr103771.c | 18 ++++++++++++++++++ gcc/tree-vect-stmts.cc | 11 +++++++---- 4 files changed, 41 insertions(+), 4 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/mask-pack-prefer128.c create mode 100644 gcc/testsuite/gcc.target/i386/mask-pack-prefer256.c create mode 100644 gcc/testsuite/gcc.target/i386/pr103771.c diff --git a/gcc/testsuite/gcc.target/i386/mask-pack-prefer128.c b/gcc/testsuite/gcc.target/i386/mask-pack-prefer128.c new file mode 100644 index 0000000..c9ea37c --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/mask-pack-prefer128.c @@ -0,0 +1,8 @@ +/* { dg-do compile } */ +/* { dg-options "-march=skylake-avx512 -O3 -fopenmp-simd -fdump-tree-vect-details -mprefer-vector-width=128" } */ +/* Disabling epilogues until we find a better way to deal with scans. 
*/ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 10 "vect" } } */ +/* { dg-final { scan-assembler-not "maskmov" } } */ + +#include "mask-pack.c" diff --git a/gcc/testsuite/gcc.target/i386/mask-pack-prefer256.c b/gcc/testsuite/gcc.target/i386/mask-pack-prefer256.c new file mode 100644 index 0000000..841f51b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/mask-pack-prefer256.c @@ -0,0 +1,8 @@ +/* { dg-do compile } */ +/* { dg-options "-march=skylake-avx512 -O3 -fopenmp-simd -fdump-tree-vect-details -mprefer-vector-width=256" } */ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 10 "vect" } } */ +/* { dg-final { scan-assembler-not "maskmov" } } */ + +#include "mask-pack.c" diff --git a/gcc/testsuite/gcc.target/i386/pr103771.c b/gcc/testsuite/gcc.target/i386/pr103771.c new file mode 100644 index 0000000..a1a9952 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr103771.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-march=cascadelake -O3 -fdump-tree-vect-details -mprefer-vector-width=128" } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ + +typedef unsigned char uint8_t; + +static uint8_t x264_clip_uint8 (int x) +{ + return x & (~255) ? 
(-x) >> 31 : x; +} + +void +mc_weight (uint8_t* __restrict dst, uint8_t* __restrict src, + int i_width,int i_scale) +{ + for(int x = 0; x < i_width; x++) + dst[x] = x264_clip_uint8 (src[x] * i_scale); +} diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 95be4f3..824ebb6 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -12124,6 +12124,7 @@ supportable_narrowing_operation (enum tree_code code, tree intermediate_type, prev_type; machine_mode intermediate_mode, prev_mode; int i; + unsigned HOST_WIDE_INT n_elts; bool uns; *multi_step_cvt = 0; @@ -12133,8 +12134,9 @@ supportable_narrowing_operation (enum tree_code code, c1 = VEC_PACK_TRUNC_EXPR; if (VECTOR_BOOLEAN_TYPE_P (narrow_vectype) && VECTOR_BOOLEAN_TYPE_P (vectype) - && TYPE_MODE (narrow_vectype) == TYPE_MODE (vectype) - && SCALAR_INT_MODE_P (TYPE_MODE (vectype))) + && SCALAR_INT_MODE_P (TYPE_MODE (vectype)) + && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&n_elts) + && n_elts < BITS_PER_UNIT) optab1 = vec_pack_sbool_trunc_optab; else optab1 = optab_for_tree_code (c1, vectype, optab_default); @@ -12225,8 +12227,9 @@ supportable_narrowing_operation (enum tree_code code, = lang_hooks.types.type_for_mode (intermediate_mode, uns); if (VECTOR_BOOLEAN_TYPE_P (intermediate_type) && VECTOR_BOOLEAN_TYPE_P (prev_type) - && intermediate_mode == prev_mode - && SCALAR_INT_MODE_P (prev_mode)) + && SCALAR_INT_MODE_P (prev_mode) + && TYPE_VECTOR_SUBPARTS (intermediate_type).is_constant (&n_elts) + && n_elts < BITS_PER_UNIT) interm_optab = vec_pack_sbool_trunc_optab; else interm_optab -- 2.7.4