From 144aee70b80de50f96a97ee64edd2f1c237c4906 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Tue, 5 May 2020 11:36:47 +0200 Subject: [PATCH] match.pd: Canonicalize (x + (x << cst)) into (x * cst2) [PR94800] The popcount* testcases show yet another creative way to write popcount, but rather than adjusting the popcount matcher to deal with it, I think we just should canonicalize those (X + (X << C) to X * (1 + (1 << C)) and (X << C1) + (X << C2) to X * ((1 << C1) + (1 << C2)), because for multiplication we already have simplification rules that can handle nested multiplication (X * CST1 * CST2), while the the shifts and adds we have nothing like that. And user could have written the multiplication anyway, so if we don't emit the fastest or smallest code for the multiplication by constant, we should improve that. At least on the testcases seems the emitted code is reasonable according to cost, except that perhaps we could in some cases try to improve expansion of vector multiplication by uniform constant. 2020-05-05 Jakub Jelinek PR tree-optimization/94800 * match.pd (X + (X << C) to X * (1 + (1 << C)), (X << C1) + (X << C2) to X * ((1 << C1) + (1 << C2))): New canonicalizations. * gcc.dg/tree-ssa/pr94800.c: New test. * gcc.dg/tree-ssa/popcount5.c: New test. * gcc.dg/tree-ssa/popcount5l.c: New test. * gcc.dg/tree-ssa/popcount5ll.c: New test. --- gcc/ChangeLog | 5 ++ gcc/match.pd | 35 +++++++++++++ gcc/testsuite/ChangeLog | 6 +++ gcc/testsuite/gcc.dg/tree-ssa/popcount5.c | 22 ++++++++ gcc/testsuite/gcc.dg/tree-ssa/popcount5l.c | 32 ++++++++++++ gcc/testsuite/gcc.dg/tree-ssa/popcount5ll.c | 22 ++++++++ gcc/testsuite/gcc.dg/tree-ssa/pr94800.c | 80 +++++++++++++++++++++++++++++ 7 files changed, 202 insertions(+) create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/popcount5.c create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/popcount5l.c create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/popcount5ll.c create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr94800.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index b796203..cc076c8 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,5 +1,10 @@ 2020-05-05 Jakub Jelinek + PR tree-optimization/94800 + * match.pd (X + (X << C) to X * (1 + (1 << C)), + (X << C1) + (X << C2) to X * ((1 << C1) + (1 << C2))): New + canonicalizations. + PR target/94942 * config/i386/mmx.md (*vec_dupv4hi): Use xYw constraints instead of Yv. diff --git a/gcc/match.pd b/gcc/match.pd index c45e94c..d957517 100644 --- a/gcc/match.pd +++ b/gcc/match.pd @@ -2570,6 +2570,41 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) && single_use (@3)) (mult (plusminus @2 { build_one_cst (type); }) @0)))))) +#if GIMPLE +/* Canonicalize X + (X << C) into X * (1 + (1 << C)) and + (X << C1) + (X << C2) into X * ((1 << C1) + (1 << C2)). */ +(simplify + (plus:c @0 (lshift:s @0 INTEGER_CST@1)) + (if (ANY_INTEGRAL_TYPE_P (TREE_TYPE (@0)) + && tree_fits_uhwi_p (@1) + && tree_to_uhwi (@1) < element_precision (type)) + (with { tree t = type; + if (!TYPE_OVERFLOW_WRAPS (t)) t = unsigned_type_for (t); + wide_int w = wi::set_bit_in_zero (tree_to_uhwi (@1), + element_precision (type)); + w += 1; + tree cst = wide_int_to_tree (VECTOR_TYPE_P (t) ? TREE_TYPE (t) + : t, w); + cst = build_uniform_cst (t, cst); } + (convert (mult (convert:t @0) { cst; }))))) +(simplify + (plus (lshift:s @0 INTEGER_CST@1) (lshift:s @0 INTEGER_CST@2)) + (if (ANY_INTEGRAL_TYPE_P (TREE_TYPE (@0)) + && tree_fits_uhwi_p (@1) + && tree_to_uhwi (@1) < element_precision (type) + && tree_fits_uhwi_p (@2) + && tree_to_uhwi (@2) < element_precision (type)) + (with { tree t = type; + if (!TYPE_OVERFLOW_WRAPS (t)) t = unsigned_type_for (t); + unsigned int prec = element_precision (type); + wide_int w = wi::set_bit_in_zero (tree_to_uhwi (@1), prec); + w += wi::set_bit_in_zero (tree_to_uhwi (@2), prec); + tree cst = wide_int_to_tree (VECTOR_TYPE_P (t) ? TREE_TYPE (t) + : t, w); + cst = build_uniform_cst (t, cst); } + (convert (mult (convert:t @0) { cst; }))))) +#endif + /* Simplifications of MIN_EXPR, MAX_EXPR, fmin() and fmax(). */ (for minmax (min max FMIN_ALL FMAX_ALL) diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 6475b7d..2b48741 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,5 +1,11 @@ 2020-05-05 Jakub Jelinek + PR tree-optimization/94800 + * gcc.dg/tree-ssa/pr94800.c: New test. + * gcc.dg/tree-ssa/popcount5.c: New test. + * gcc.dg/tree-ssa/popcount5l.c: New test. + * gcc.dg/tree-ssa/popcount5ll.c: New test. + PR target/94942 * gcc.target/i386/pr94942.c: New test. diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount5.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount5.c new file mode 100644 index 0000000..6358d4f --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount5.c @@ -0,0 +1,22 @@ +/* PR tree-optimization/94800 */ +/* { dg-do compile } */ +/* { dg-require-effective-target popcount } */ +/* { dg-require-effective-target int32plus } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ + +const unsigned m1 = 0x55555555UL; +const unsigned m2 = 0x33333333UL; +const unsigned m4 = 0x0F0F0F0FUL; +const int shift = 24; + +int popcount64c(unsigned x) +{ + x -= (x >> 1) & m1; + x = (x & m2) + ((x >> 2) & m2); + x = (x + (x >> 4)) & m4; + x += (x << 8); + x += (x << 16); + return x >> shift; +} + +/* { dg-final { scan-tree-dump-times "\.POPCOUNT" 1 "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount5l.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount5l.c new file mode 100644 index 0000000..b761667 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount5l.c @@ -0,0 +1,32 @@ +/* PR tree-optimization/94800 */ +/* { dg-do compile { target int32plus } } */ +/* { dg-require-effective-target popcountl } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ + +#if __SIZEOF_LONG__ == 4 +const unsigned long m1 = 0x55555555UL; +const unsigned long m2 = 0x33333333UL; +const unsigned long m4 = 0x0F0F0F0FUL; +const int shift = 24; +#else +const unsigned long m1 = 0x5555555555555555UL; +const unsigned long m2 = 0x3333333333333333UL; +const unsigned long m4 = 0x0f0f0f0f0f0f0f0fUL; +const int shift = 56; +#endif + + +int popcount64c(unsigned long x) +{ + x -= (x >> 1) & m1; + x = (x & m2) + ((x >> 2) & m2); + x = (x + (x >> 4)) & m4; + x += (x << 8); + x += (x << 16); +#if __SIZEOF_LONG__ != 4 + x += (x << 32); +#endif + return x >> shift; +} + +/* { dg-final { scan-tree-dump-times "\.POPCOUNT" 1 "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount5ll.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount5ll.c new file mode 100644 index 0000000..831d5e17 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount5ll.c @@ -0,0 +1,22 @@ +/* PR tree-optimization/94800 */ +/* { dg-do compile } */ +/* { dg-require-effective-target popcountll } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ + +const unsigned long long m1 = 0x5555555555555555ULL; +const unsigned long long m2 = 0x3333333333333333ULL; +const unsigned long long m4 = 0x0F0F0F0F0F0F0F0FULL; +const int shift = 56; + +int popcount64c(unsigned long long x) +{ + x -= (x >> 1) & m1; + x = (x & m2) + ((x >> 2) & m2); + x = (x + (x >> 4)) & m4; + x += (x << 8); + x += (x << 16); + x += (x << 32); + return x >> shift; +} + +/* { dg-final { scan-tree-dump-times "\.POPCOUNT" 1 "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr94800.c b/gcc/testsuite/gcc.dg/tree-ssa/pr94800.c new file mode 100644 index 0000000..4f92df3 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr94800.c @@ -0,0 +1,80 @@ +/* PR tree-optimization/94800 */ +/* { dg-do compile { target { ilp32 || lp64 } } } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump-times " \* 72340172838076673" 2 "optimized" } } */ +/* { dg-final { scan-tree-dump-times " \* 16843009" 2 "optimized" } } */ +/* { dg-final { scan-tree-dump-times " \* 289360691352306692" 2 "optimized" } } */ +/* { dg-final { scan-tree-dump-times " \* 1229782938247303441" 2 "optimized" } } */ +/* { dg-final { scan-tree-dump-not "<<" "optimized" } } */ + +unsigned long long int +foo (unsigned long long int x) +{ + unsigned long long int a = x + (x << 8); + unsigned long long int b = a + (a << 16); + unsigned long long int c = b + (b << 32); + return c; +} + +unsigned int +bar (unsigned int x) +{ + unsigned int a = x + (x << 8); + unsigned int b = a + (a << 16); + return b; +} + +unsigned long long int +baz (unsigned long long int x) +{ + unsigned long long int a = (x << 2) + (x << 10); + unsigned long long int b = a + (a << 16); + unsigned long long int c = b + (b << 32); + return c; +} + +unsigned long long int +qux (unsigned long long int x) +{ + unsigned long long int a = x + (x << 4); + unsigned long long int b = a + (a << 8); + unsigned long long int c = b + (b << 16); + unsigned long long int d = c + (c << 32); + return d; +} + +long long int +quux (long long int x) +{ + long long int a = x + (x << 8); + long long int b = a + (a << 16); + long long int c = b + (b << 32); + return c; +} + +int +corge (int x) +{ + int a = x + (x << 8); + int b = a + (a << 16); + return b; +} + +long long int +garply (long long int x) +{ + long long int a = (x << 2) + (x << 10); + long long int b = a + (a << 16); + long long int c = b + (b << 32); + return c; +} + +long long int +waldo (long long int x) +{ + long long int a = x + (x << 4); + long long int b = a + (a << 8); + long long int c = b + (b << 16); + long long int d = c + (c << 32); + return d; +} -- 2.7.4