From cc1facefe3b4e3b067d95291a7dba834b830ff18 Mon Sep 17 00:00:00 2001 From: Prathamesh Kulkarni Date: Fri, 18 Oct 2019 05:13:26 +0000 Subject: [PATCH] re PR target/86753 (gcc.target/aarch64/sve/vcond_[45].c fail after recent combine patch) 2019-10-18 Prathamesh Kulkarni Richard Sandiford PR target/86753 * tree-vectorizer.h (scalar_cond_masked_key): New struct, and define hashmap traits for it. (loop_vec_info::scalar_cond_masked_set): New member. (vect_record_loop_mask): Adjust prototype. * tree-vectorizer.c (scalar_cond_masked_key::get_cond_ops_from_tree): Implement method. * tree-vect-loop.c (vectorizable_reduction): Pass NULL as last arg to vect_record_loop_mask. (vectorizable_live_operation): Likewise. (vect_record_loop_mask): New param scalar_mask. Add entry cond, loop_mask to scalar_cond_masked_set if scalar_mask is non NULL. * tree-vect-stmts.c (check_load_store_masking): New param scalar_mask. Pass it as last arg to vect_record_loop_mask. (vectorizable_call): Pass scalar_mask as last arg to vect_record_loop_mask. (vectorizable_store): Likewise. (vectorizable_load): Likewise. (vectorizable_condition): Check if another part of vectorized code applies loop_mask to condition or to its inverse, and if yes, apply loop_mask to result of vector comparison. testsuite/ * gcc.target/aarch64/sve/cond_cnot_2.c: Remove XFAIL from { scan-assembler-not {\tsel\t}. * gcc.target/aarch64/sve/cond_convert_1.c: Adjust to make only one load conditional. * gcc.target/aarch64/sve/cond_convert_4.c: Likewise. * gcc.target/aarch64/sve/cond_unary_2.c: Likewise. * gcc.target/aarch64/sve/vcond_4.c: Remove XFAIL's. * gcc.target/aarch64/sve/vcond_5.c: Likewise. 
Co-Authored-By: Richard Sandiford From-SVN: r277141 --- gcc/ChangeLog | 25 ++++++ gcc/testsuite/ChangeLog | 13 +++ gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_2.c | 2 +- .../gcc.target/aarch64/sve/cond_convert_1.c | 5 +- .../gcc.target/aarch64/sve/cond_convert_4.c | 5 +- .../gcc.target/aarch64/sve/cond_unary_2.c | 5 +- gcc/testsuite/gcc.target/aarch64/sve/vcond_4.c | 36 ++++---- gcc/testsuite/gcc.target/aarch64/sve/vcond_5.c | 16 ++-- gcc/tree-vect-loop.c | 16 +++- gcc/tree-vect-stmts.c | 95 ++++++++++++++++++++-- gcc/tree-vectorizer.c | 33 ++++++++ gcc/tree-vectorizer.h | 73 ++++++++++++++++- 12 files changed, 281 insertions(+), 43 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index b9a6c15..273d13c 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,28 @@ +2019-10-18 Prathamesh Kulkarni + Richard Sandiford + + PR target/86753 + * tree-vectorizer.h (scalar_cond_masked_key): New struct, + and define hashmap traits for it. + (loop_vec_info::scalar_cond_masked_set): New member. + (vect_record_loop_mask): Adjust prototype. + * tree-vectorizer.c (scalar_cond_masked_key::get_cond_ops_from_tree): + Implement method. + * tree-vect-loop.c (vectorizable_reduction): Pass NULL as last arg to + vect_record_loop_mask. + (vectorizable_live_operation): Likewise. + (vect_record_loop_mask): New param scalar_mask. Add entry + cond, loop_mask to scalar_cond_masked_set if scalar_mask is non NULL. + * tree-vect-stmts.c (check_load_store_masking): New param scalar_mask. + Pass it as last arg to vect_record_loop_mask. + (vectorizable_call): Pass scalar_mask as last arg to + vect_record_loop_mask. + (vectorizable_store): Likewise. + (vectorizable_load): Likewise. + (vectorizable_condition): Check if another part of vectorized code + applies loop_mask to condition or to its inverse, and if yes, + apply loop_mask to result of vector comparison. + 2019-10-17 John David Anglin * config/pa/pa.c (pa_output_indirect_call): Fix typos in last change. 
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 2d2a274..fc63457 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,16 @@ +2019-10-18 Prathamesh Kulkarni + Richard Sandiford + + PR target/86753 + * gcc.target/aarch64/sve/cond_cnot_2.c: Remove XFAIL + from { scan-assembler-not {\tsel\t}. + * gcc.target/aarch64/sve/cond_convert_1.c: Adjust to make + only one load conditional. + * gcc.target/aarch64/sve/cond_convert_4.c: Likewise. + * gcc.target/aarch64/sve/cond_unary_2.c: Likewise. + * gcc.target/aarch64/sve/vcond_4.c: Remove XFAIL's. + * gcc.target/aarch64/sve/vcond_5.c: Likewise. + 2019-10-18 Jakub Jelinek PR tree-optimization/92056 diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_2.c index d689e21..3df2431 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_2.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_2.c @@ -32,4 +32,4 @@ TEST_ALL (DEF_LOOP) /* { dg-final { scan-assembler-not {\tmov\tz} } } */ /* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ /* Currently we canonicalize the ?: so that !b[i] is the "false" value. */ -/* { dg-final { scan-assembler-not {\tsel\t} { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_1.c index dcc3076..86064eb 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_1.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_1.c @@ -11,7 +11,10 @@ INT_TYPE *__restrict pred, int n) \ { \ for (int i = 0; i < n; ++i) \ - r[i] = pred[i] ? (FLOAT_TYPE) a[i] : b[i]; \ + { \ + FLOAT_TYPE bi = b[i]; \ + r[i] = pred[i] ? 
(FLOAT_TYPE) a[i] : bi; \ + } \ } #define TEST_ALL(T) \ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_4.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_4.c index 7e5f2a7..e3a947b 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_4.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_4.c @@ -11,7 +11,10 @@ INT_TYPE *__restrict pred, int n) \ { \ for (int i = 0; i < n; ++i) \ - r[i] = pred[i] ? (INT_TYPE) a[i] : b[i]; \ + { \ + INT_TYPE bi = b[i]; \ + r[i] = pred[i] ? (INT_TYPE) a[i] : bi; \ + } \ } #define TEST_ALL(T) \ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_2.c index 991ccf0..97d1b8f 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_2.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_2.c @@ -13,7 +13,10 @@ TYPE *__restrict pred, int n) \ { \ for (int i = 0; i < n; ++i) \ - r[i] = pred[i] ? OP (a[i]) : b[i]; \ + { \ + TYPE bi = b[i]; \ + r[i] = pred[i] ? OP (a[i]) : bi; \ + } \ } #define TEST_INT_TYPE(T, TYPE) \ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_4.c b/gcc/testsuite/gcc.target/aarch64/sve/vcond_4.c index 00d8476..b38f23e 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/vcond_4.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_4.c @@ -98,24 +98,24 @@ TEST_CMP (nugt) /* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */ /* 5 for lt, 5 for ult and 5 for nult. */ -/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 } } */ +/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 } } */ /* 5 for le, 5 for ule and 5 for nule. 
*/ -/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 } } */ +/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 } } */ /* 5 for gt, 5 for ugt and 5 for nugt. */ -/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 } } */ +/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 } } */ /* 5 for ge, 5 for uge and 5 for nuge. */ -/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 } } */ +/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 } } */ /* { dg-final { scan-assembler-not {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} } } */ /* 3 loops * 5 invocations for all 12 unordered comparisons. 
*/ -/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 180 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 180 } } */ /* { dg-final { scan-assembler-times {\tfcmeq\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 7 { xfail *-*-* } } } */ /* { dg-final { scan-assembler-times {\tfcmeq\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 14 { xfail *-*-* } } } */ @@ -123,19 +123,19 @@ TEST_CMP (nugt) /* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */ /* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 } } */ +/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 } } */ -/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 } } */ +/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 } } */ -/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 } } */ +/* { dg-final { 
scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 } } */ -/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 } } */ +/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 } } */ /* { dg-final { scan-assembler-not {\tfcmuo\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} } } */ /* 3 loops * 5 invocations, with 2 invocations having ncopies == 2, for all 12 unordered comparisons. */ -/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 252 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 252 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_5.c b/gcc/testsuite/gcc.target/aarch64/sve/vcond_5.c index 23bfb7b..2f16fbf 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/vcond_5.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_5.c @@ -19,16 +19,16 @@ /* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 40 { xfail *-*-* } } } */ /* 5 for le, 5 for ule and 5 for nule. */ -/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 15 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 30 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 15 } } */ +/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 30 } } */ /* 5 for gt, 5 for ugt, 5 for nueq and 5 for nugt. 
*/ /* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 20 { xfail *-*-* } } } */ /* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 40 { xfail *-*-* } } } */ /* 5 for ge, 5 for uge and 5 for nuge. */ -/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 15 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 30 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 15 } } */ +/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 30 } } */ /* { dg-final { scan-assembler-not {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} } } */ /* 3 loops * 5 invocations for ordered, unordered amd ueq. */ @@ -43,14 +43,14 @@ /* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 28 { xfail *-*-* } } } */ /* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 56 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 21 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 42 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 21 } } */ +/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 42 } } */ /* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 28 { xfail *-*-* } } } */ /* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 56 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 21 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, 
z[0-9]+\.d} 42 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 21 } } */ +/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 42 } } */ /* { dg-final { scan-assembler-not {\tfcmuo\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} } } */ /* 3 loops * 5 invocations, with 2 invocations having ncopies == 2, diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index 80db6ab..10920ac 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -6330,7 +6330,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node, } else vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num, - vectype_in); + vectype_in, NULL); } if (dump_enabled_p () && reduction_type == FOLD_LEFT_REDUCTION) @@ -7561,7 +7561,7 @@ vectorizable_live_operation (stmt_vec_info stmt_info, gcc_assert (ncopies == 1 && !slp_node); vect_record_loop_mask (loop_vinfo, &LOOP_VINFO_MASKS (loop_vinfo), - 1, vectype); + 1, vectype, NULL); } } return true; @@ -7760,11 +7760,12 @@ vect_double_mask_nunits (tree type) /* Record that a fully-masked version of LOOP_VINFO would need MASKS to contain a sequence of NVECTORS masks that each control a vector of type - VECTYPE. */ + VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND + these vector masks with the vector version of SCALAR_MASK. 
*/ void vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks, - unsigned int nvectors, tree vectype) + unsigned int nvectors, tree vectype, tree scalar_mask) { gcc_assert (nvectors != 0); if (masks->length () < nvectors) @@ -7775,6 +7776,13 @@ vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks, unsigned int nscalars_per_iter = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype), LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant (); + + if (scalar_mask) + { + scalar_cond_masked_key cond (scalar_mask, nvectors); + loop_vinfo->scalar_cond_masked_set.add (cond); + } + if (rgm->max_nscalars_per_iter < nscalars_per_iter) { rgm->max_nscalars_per_iter = nscalars_per_iter; diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index e7255fb..acdd907 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -1879,7 +1879,8 @@ static tree permute_vec_elements (tree, tree, tree, stmt_vec_info, says how the load or store is going to be implemented and GROUP_SIZE is the number of load or store statements in the containing group. If the access is a gather load or scatter store, GS_INFO describes - its arguments. + its arguments. If the load or store is conditional, SCALAR_MASK is the + condition under which it occurs. Clear LOOP_VINFO_CAN_FULLY_MASK_P if a fully-masked loop is not supported, otherwise record the required mask types. */ @@ -1888,7 +1889,7 @@ static void check_load_store_masking (loop_vec_info loop_vinfo, tree vectype, vec_load_store_type vls_type, int group_size, vect_memory_access_type memory_access_type, - gather_scatter_info *gs_info) + gather_scatter_info *gs_info, tree scalar_mask) { /* Invariant loads need no special support. 
*/ if (memory_access_type == VMAT_INVARIANT) @@ -1912,7 +1913,7 @@ check_load_store_masking (loop_vec_info loop_vinfo, tree vectype, return; } unsigned int ncopies = vect_get_num_copies (loop_vinfo, vectype); - vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype); + vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, scalar_mask); return; } @@ -1936,7 +1937,7 @@ check_load_store_masking (loop_vec_info loop_vinfo, tree vectype, return; } unsigned int ncopies = vect_get_num_copies (loop_vinfo, vectype); - vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype); + vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, scalar_mask); return; } @@ -1974,7 +1975,7 @@ check_load_store_masking (loop_vec_info loop_vinfo, tree vectype, poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); unsigned int nvectors; if (can_div_away_from_zero_p (group_size * vf, nunits, &nvectors)) - vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype); + vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype, scalar_mask); else gcc_unreachable (); } @@ -3436,7 +3437,9 @@ vectorizable_call (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, unsigned int nvectors = (slp_node ? 
SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) : ncopies); - vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_out); + tree scalar_mask = gimple_call_arg (stmt_info->stmt, mask_opno); + vect_record_loop_mask (loop_vinfo, masks, nvectors, + vectype_out, scalar_mask); } return true; } @@ -7390,7 +7393,7 @@ vectorizable_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)) check_load_store_masking (loop_vinfo, vectype, vls_type, group_size, - memory_access_type, &gs_info); + memory_access_type, &gs_info, mask); STMT_VINFO_TYPE (stmt_info) = store_vec_info_type; vect_model_store_cost (stmt_info, ncopies, rhs_dt, memory_access_type, @@ -8637,7 +8640,7 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)) check_load_store_masking (loop_vinfo, vectype, VLS_LOAD, group_size, - memory_access_type, &gs_info); + memory_access_type, &gs_info, mask); STMT_VINFO_TYPE (stmt_info) = load_vec_info_type; vect_model_load_cost (stmt_info, ncopies, memory_access_type, @@ -10007,6 +10010,35 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, /* Handle cond expr. */ for (j = 0; j < ncopies; j++) { + tree loop_mask = NULL_TREE; + bool swap_cond_operands = false; + + /* See whether another part of the vectorized code applies a loop + mask to the condition, or to its inverse. 
*/ + + if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) + { + scalar_cond_masked_key cond (cond_expr, ncopies); + if (loop_vinfo->scalar_cond_masked_set.contains (cond)) + { + vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); + loop_mask = vect_get_loop_mask (gsi, masks, ncopies, vectype, j); + } + else + { + bool honor_nans = HONOR_NANS (TREE_TYPE (cond.op0)); + cond.code = invert_tree_comparison (cond.code, honor_nans); + if (loop_vinfo->scalar_cond_masked_set.contains (cond)) + { + vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); + loop_mask = vect_get_loop_mask (gsi, masks, ncopies, + vectype, j); + cond_code = cond.code; + swap_cond_operands = true; + } + } + } + stmt_vec_info new_stmt_info = NULL; if (j == 0) { @@ -10084,6 +10116,9 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, vec_then_clause = vec_oprnds2[i]; vec_else_clause = vec_oprnds3[i]; + if (swap_cond_operands) + std::swap (vec_then_clause, vec_else_clause); + if (masked) vec_compare = vec_cond_lhs; else @@ -10122,6 +10157,50 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, } } } + + /* If we decided to apply a loop mask to the result of the vector + comparison, AND the comparison with the mask now. Later passes + should then be able to reuse the AND results between multiple + vector statements. + + For example: + for (int i = 0; i < 100; ++i) + x[i] = y[i] ? z[i] : 10; + + results in following optimized GIMPLE: + + mask__35.8_43 = vect__4.7_41 != { 0, ... }; + vec_mask_and_46 = loop_mask_40 & mask__35.8_43; + _19 = &MEM[base: z_12(D), index: ivtmp_56, step: 4, offset: 0B]; + vect_iftmp.11_47 = .MASK_LOAD (_19, 4B, vec_mask_and_46); + vect_iftmp.12_52 = VEC_COND_EXPR <vec_mask_and_46, + vect_iftmp.11_47, { 10, ... }>; + + instead of using masked and unmasked forms of + vec != { 0, ... } (masked in the MASK_LOAD, + unmasked in the VEC_COND_EXPR). 
*/ + + if (loop_mask) + { + if (COMPARISON_CLASS_P (vec_compare)) + { + tree tmp = make_ssa_name (vec_cmp_type); + tree op0 = TREE_OPERAND (vec_compare, 0); + tree op1 = TREE_OPERAND (vec_compare, 1); + gassign *g = gimple_build_assign (tmp, + TREE_CODE (vec_compare), + op0, op1); + vect_finish_stmt_generation (stmt_info, g, gsi); + vec_compare = tmp; + } + + tree tmp2 = make_ssa_name (vec_cmp_type); + gassign *g = gimple_build_assign (tmp2, BIT_AND_EXPR, + vec_compare, loop_mask); + vect_finish_stmt_generation (stmt_info, g, gsi); + vec_compare = tmp2; + } + if (reduction_type == EXTRACT_LAST_REDUCTION) { if (!is_gimple_val (vec_compare)) diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c index 512e2e0..1a0cc93 100644 --- a/gcc/tree-vectorizer.c +++ b/gcc/tree-vectorizer.c @@ -1515,3 +1515,36 @@ make_pass_ipa_increase_alignment (gcc::context *ctxt) { return new pass_ipa_increase_alignment (ctxt); } + +/* If the condition represented by T is a comparison or the SSA name + result of a comparison, extract the comparison's operands. Represent + T as NE_EXPR otherwise. 
*/ + +void +scalar_cond_masked_key::get_cond_ops_from_tree (tree t) +{ + if (TREE_CODE_CLASS (TREE_CODE (t)) == tcc_comparison) + { + this->code = TREE_CODE (t); + this->op0 = TREE_OPERAND (t, 0); + this->op1 = TREE_OPERAND (t, 1); + return; + } + + if (TREE_CODE (t) == SSA_NAME) + if (gassign *stmt = dyn_cast (SSA_NAME_DEF_STMT (t))) + { + tree_code code = gimple_assign_rhs_code (stmt); + if (TREE_CODE_CLASS (code) == tcc_comparison) + { + this->code = code; + this->op0 = gimple_assign_rhs1 (stmt); + this->op1 = gimple_assign_rhs2 (stmt); + return; + } + } + + this->code = NE_EXPR; + this->op0 = t; + this->op1 = build_zero_cst (TREE_TYPE (t)); +} diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index d59ba13..5c3b3c9 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -177,7 +177,75 @@ public: #define SLP_TREE_TWO_OPERATORS(S) (S)->two_operators #define SLP_TREE_DEF_TYPE(S) (S)->def_type +/* Key for map that records association between + scalar conditions and corresponding loop mask, and + is populated by vect_record_loop_mask. 
*/ +struct scalar_cond_masked_key +{ + scalar_cond_masked_key (tree t, unsigned ncopies_) + : ncopies (ncopies_) + { + get_cond_ops_from_tree (t); + } + + void get_cond_ops_from_tree (tree); + + unsigned ncopies; + tree_code code; + tree op0; + tree op1; +}; + +template<> +struct default_hash_traits +{ + typedef scalar_cond_masked_key compare_type; + typedef scalar_cond_masked_key value_type; + + static inline hashval_t + hash (value_type v) + { + inchash::hash h; + h.add_int (v.code); + inchash::add_expr (v.op0, h, 0); + inchash::add_expr (v.op1, h, 0); + h.add_int (v.ncopies); + return h.end (); + } + + static inline bool + equal (value_type existing, value_type candidate) + { + return (existing.ncopies == candidate.ncopies + && existing.code == candidate.code + && operand_equal_p (existing.op0, candidate.op0, 0) + && operand_equal_p (existing.op1, candidate.op1, 0)); + } + + static inline void + mark_empty (value_type &v) + { + v.ncopies = 0; + } + + static inline bool + is_empty (value_type v) + { + return v.ncopies == 0; + } + + static inline void mark_deleted (value_type &) {} + + static inline bool is_deleted (const value_type &) + { + return false; + } + + static inline void remove (value_type &) {} +}; + +typedef hash_set scalar_cond_masked_set_type; /* Describes two objects whose addresses must be unequal for the vectorized loop to be valid. */ @@ -426,6 +494,9 @@ public: on inactive scalars. */ vec_loop_masks masks; + /* Set of scalar conditions that have loop mask applied. */ + scalar_cond_masked_set_type scalar_cond_masked_set; + /* If we are using a loop mask to align memory addresses, this variable contains the number of vector elements that we should skip in the first iteration of the vector loop (i.e. 
the number of leading @@ -1637,7 +1708,7 @@ extern void vect_gen_vector_loop_niters (loop_vec_info, tree, tree *, extern tree vect_halve_mask_nunits (tree); extern tree vect_double_mask_nunits (tree); extern void vect_record_loop_mask (loop_vec_info, vec_loop_masks *, - unsigned int, tree); + unsigned int, tree, tree); extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *, unsigned int, tree, unsigned int); extern stmt_vec_info info_for_reduction (stmt_vec_info); -- 2.7.4