tree-optimization/101178 - handle VEC_PERM in SLP permute propagation
author Richard Biener <rguenther@suse.de>
Wed, 30 Jun 2021 14:28:50 +0000 (16:28 +0200)
committer Richard Biener <rguenther@suse.de>
Thu, 1 Jul 2021 07:47:58 +0000 (09:47 +0200)
This adds handling of VEC_PERM nodes to SLP permute propagation.
Previously VEC_PERM acted as forced materialization of incoming
permutes since it is a good place to do that (with the constraint
of those only appearing for two-operator nodes).  The following
patch, in addition to supporting (but not forcing) this, enables
VEC_PERM nodes acting as "any" permute on the outgoing side since
they also can consume arbitrary permutes on that side.

This again (meh) changes how we represent permutes and materialization
on the graph vertices: they now explicitly carry the common incoming
permute as well as an outgoing permute, and in case both are
different the vertex acts as materialization point of the incoming
permute.

2021-06-30  Richard Biener  <rguenther@suse.de>

PR tree-optimization/101178
* tree-vect-slp.c (slpg_vertex::materialize): Remove.
(slpg_vertex::perm_in): Add.
(slpg_vertex::get_perm_in): Remove.
(slpg_vertex::get_perm_materialized): Add.
(vect_optimize_slp): Handle VEC_PERM nodes more optimally
during permute propagation and materialization.

* gcc.dg/vect/bb-slp-72.c: New testcase.
* gcc.dg/vect/bb-slp-73.c: Likewise.
* gcc.dg/vect/bb-slp-74.c: Likewise.

gcc/testsuite/gcc.dg/vect/bb-slp-72.c [new file with mode: 0644]
gcc/testsuite/gcc.dg/vect/bb-slp-73.c [new file with mode: 0644]
gcc/testsuite/gcc.dg/vect/bb-slp-74.c [new file with mode: 0644]
gcc/tree-vect-slp.c

diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-72.c b/gcc/testsuite/gcc.dg/vect/bb-slp-72.c
new file mode 100644 (file)
index 0000000..5b243fc
--- /dev/null
@@ -0,0 +1,29 @@
+/* { dg-do run } */
+
+#include "tree-vect.h"
+
+double x[2], y[2], z[2], w[2];
+
+void __attribute__((noipa)) foo ()
+{
+  double tem0 = x[1] + y[1];
+  double tem1 = x[0] - y[0];
+  double tem2 = z[1] * tem0;
+  double tem3 = z[0] * tem1;
+  z[0] = tem2 - w[0];
+  z[1] = tem3 + w[1];
+}
+
+int main()
+{
+  check_vect ();
+
+  x[0] = 1.; x[1] = 2.;
+  y[0] = 7.; y[1] = -5.;
+  z[0] = 2.; z[1] = 3.;
+  w[0] = 9.; w[1] = -5.;
+  foo ();
+  if (z[0] != -18. || z[1] != -17.)
+    __builtin_abort ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-73.c b/gcc/testsuite/gcc.dg/vect/bb-slp-73.c
new file mode 100644 (file)
index 0000000..d4c8a51
--- /dev/null
@@ -0,0 +1,29 @@
+/* { dg-do run } */
+
+#include "tree-vect.h"
+
+double x[2], y[2], z[2], w[2];
+
+void __attribute__((noipa)) foo ()
+{
+  double tem0 = x[1] + y[1];
+  double tem1 = x[0] - y[0];
+  double tem2 = z[1] * tem0;
+  double tem3 = z[0] * tem1;
+  z[0] = tem2 - w[1];
+  z[1] = tem3 + w[0];
+}
+
+int main()
+{
+  check_vect ();
+
+  x[0] = 1.; x[1] = 2.;
+  y[0] = 7.; y[1] = -5.;
+  z[0] = 2.; z[1] = 3.;
+  w[0] = 9.; w[1] = -5.;
+  foo ();
+  if (z[0] != -4. || z[1] != -3.)
+    __builtin_abort ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-74.c b/gcc/testsuite/gcc.dg/vect/bb-slp-74.c
new file mode 100644 (file)
index 0000000..d3d5a02
--- /dev/null
@@ -0,0 +1,30 @@
+/* { dg-do run } */
+
+#include "tree-vect.h"
+
+double a[2], b[2], c[2];
+
+void __attribute__((noipa)) foo ()
+{
+  double tem0 = a[1] + b[1];
+  double tem1 = a[0] - b[0];
+  c[0] = 2. * tem0;
+  c[1] = 5. * tem1;
+}
+
+int main()
+{
+  check_vect ();
+
+  a[0] = 1.; a[1] = 3.;
+  b[0] = -5.; b[1] = 13.;
+  foo ();
+  if (c[0] != 32. || c[1] != 30.)
+    __builtin_abort ();
+  return 0;
+}
+
+/* We'd like to see at most one VEC_PERM_EXPR, not one for a blend
+   and one for a permute materialized somewhere else.  But addsub
+   pattern recog can likely get in the way here.  */
+/* { dg-final { scan-tree-dump-times "  \[^ \]\+ = VEC_PERM_EXPR" 1 "slp2" } } */
index 10195d3..966b281 100644 (file)
@@ -3470,16 +3470,19 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
 struct slpg_vertex
 {
   slpg_vertex (slp_tree node_)
-    : node (node_), perm_out (-1), materialize (0) {}
+    : node (node_), perm_in (-1), perm_out (-1) {}
 
-  int get_perm_in () const { return materialize ? materialize : perm_out; }
+  int get_perm_materialized () const
+    { return perm_in != perm_out ? perm_in : 0; }
 
   slp_tree node;
-  /* The permutation on the outgoing lanes (towards SLP parents).  */
+  /* The common permutation on the incoming lanes (towards SLP children).  */
+  int perm_in;
+  /* The permutation on the outgoing lanes (towards SLP parents).  When
+     the node is a materialization point for a permute this differs
+     from perm_in (and is then usually zero).  Materialization happens
+     on the input side.  */
   int perm_out;
-  /* The permutation that is applied by this node.  perm_out is
-     relative to this.  */
-  int materialize;
 };
 
 /* Fill the vertices and leafs vector with all nodes in the SLP graph.  */
@@ -3614,7 +3617,11 @@ vect_optimize_slp (vec_info *vinfo)
       /* Leafs do not change across iterations.  Note leafs also double
         as entries to the reverse graph.  */
       if (!slpg->vertices[idx].succ)
-       vertices[idx].perm_out = 0;
+       {
+         vertices[idx].perm_in = 0;
+         vertices[idx].perm_out = 0;
+       }
+
       /* Loads are the only thing generating permutes.  */
       if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
        continue;
@@ -3663,6 +3670,7 @@ vect_optimize_slp (vec_info *vinfo)
       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
        perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
       perms.safe_push (perm);
+      vertices[idx].perm_in = perms.length () - 1;
       vertices[idx].perm_out = perms.length () - 1;
     }
 
@@ -3702,8 +3710,11 @@ vect_optimize_slp (vec_info *vinfo)
              if (STMT_VINFO_DATA_REF (rep)
                  && DR_IS_WRITE (STMT_VINFO_DATA_REF (rep)))
                {
+                 /* ???  We're forcing materialization in place
+                    of the child here, we'd need special handling
+                    in materialization to leave perm_in -1 here.  */
+                 vertices[idx].perm_in = 0;
                  vertices[idx].perm_out = 0;
-                 continue;
                }
              /* We cannot move a permute across an operation that is
                 not independent on lanes.  Note this is an explicit
@@ -3717,20 +3728,19 @@ vect_optimize_slp (vec_info *vinfo)
                  case CFN_COMPLEX_MUL:
                  case CFN_COMPLEX_MUL_CONJ:
                  case CFN_VEC_ADDSUB:
+                   vertices[idx].perm_in = 0;
                    vertices[idx].perm_out = 0;
-                   continue;
                  default:;
                  }
            }
 
-         int perm;
          if (!slpg->vertices[idx].succ)
            /* Pick up pre-computed leaf values.  */
-           perm = vertices[idx].perm_out;
+           ;
          else
            {
              bool any_succ_perm_out_m1 = false;
-             perm = vertices[idx].get_perm_in ();
+             int perm_in = vertices[idx].perm_in;
              for (graph_edge *succ = slpg->vertices[idx].succ;
                   succ; succ = succ->succ_next)
                {
@@ -3752,18 +3762,18 @@ vect_optimize_slp (vec_info *vinfo)
                        any_succ_perm_out_m1 = true;
                      continue;
                    }
-                 if (perm == -1)
-                   perm = succ_perm;
+                 if (perm_in == -1)
+                   perm_in = succ_perm;
                  else if (succ_perm == 0
-                          || !vect_slp_perms_eq (perms, perm, succ_perm))
+                          || !vect_slp_perms_eq (perms, perm_in, succ_perm))
                    {
-                     perm = 0;
+                     perm_in = 0;
                      break;
                    }
                }
 
              /* Adjust any incoming permutes we treated optimistically.  */
-             if (perm != -1 && any_succ_perm_out_m1)
+             if (perm_in != -1 && any_succ_perm_out_m1)
                {
                  for (graph_edge *succ = slpg->vertices[idx].succ;
                       succ; succ = succ->succ_next)
@@ -3772,24 +3782,36 @@ vect_optimize_slp (vec_info *vinfo)
                      if (vertices[succ->dest].perm_out == -1
                          && SLP_TREE_DEF_TYPE (succ_node) != vect_external_def
                          && SLP_TREE_DEF_TYPE (succ_node) != vect_constant_def)
-                       vertices[succ->dest].perm_out = perm;
+                       {
+                         vertices[succ->dest].perm_out = perm_in;
+                         /* And ensure this propagates.  */
+                         if (vertices[succ->dest].perm_in == -1)
+                           vertices[succ->dest].perm_in = perm_in;
+                       }
                    }
                  changed = true;
                }
 
-             if (!vect_slp_perms_eq (perms, perm,
-                                     vertices[idx].get_perm_in ()))
+             if (!vect_slp_perms_eq (perms, perm_in,
+                                     vertices[idx].perm_in))
                {
                  /* Make sure we eventually converge.  */
-                 gcc_checking_assert (vertices[idx].get_perm_in () == -1
-                                      || perm == 0);
-                 if (perm == 0)
-                   {
-                     vertices[idx].perm_out = 0;
-                     vertices[idx].materialize = 0;
-                   }
-                 if (!vertices[idx].materialize)
-                   vertices[idx].perm_out = perm;
+                 gcc_checking_assert (vertices[idx].perm_in == -1
+                                      || perm_in == 0);
+                 vertices[idx].perm_in = perm_in;
+
+                 /* While we can handle VEC_PERM nodes as transparent
+                    pass-through they can be a cheap materialization
+                    point as well.  In addition they can act as source
+                    of a random permutation as well.
+                    The following ensures that former materialization
+                    points that now have zero incoming permutes no
+                    longer appear as such and that former "any" permutes
+                    get pass-through.  We keep VEC_PERM nodes optimistic
+                    as "any" outgoing permute though.  */
+                 if (vertices[idx].perm_out != 0
+                     && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
+                   vertices[idx].perm_out = perm_in;
                  changed = true;
                }
            }
@@ -3799,25 +3821,19 @@ vect_optimize_slp (vec_info *vinfo)
          if (!do_materialization)
            continue;
 
+         int perm = vertices[idx].perm_out;
          if (perm == 0 || perm == -1)
            continue;
 
          /* Decide on permute materialization.  Look whether there's
             a use (pred) edge that is permuted differently than us.
-            In that case mark ourselves so the permutation is applied.
-            For VEC_PERM_EXPRs the permutation doesn't carry along
-            from children to parents so force materialization at the
-            point of the VEC_PERM_EXPR.  In principle VEC_PERM_EXPRs
-            are a source of an arbitrary permutation again, similar
-            to constants/externals - that's something we do not yet
-            optimally handle.  */
-         bool all_preds_permuted = (SLP_TREE_CODE (node) != VEC_PERM_EXPR
-                                    && slpg->vertices[idx].pred != NULL);
+            In that case mark ourselves so the permutation is applied.  */
+         bool all_preds_permuted = slpg->vertices[idx].pred != NULL;
          if (all_preds_permuted)
            for (graph_edge *pred = slpg->vertices[idx].pred;
                 pred; pred = pred->pred_next)
              {
-               int pred_perm = vertices[pred->src].get_perm_in ();
+               int pred_perm = vertices[pred->src].perm_in;
                gcc_checking_assert (pred_perm != -1);
                if (!vect_slp_perms_eq (perms, perm, pred_perm))
                  {
@@ -3827,10 +3843,8 @@ vect_optimize_slp (vec_info *vinfo)
              }
          if (!all_preds_permuted)
            {
-             if (!vertices[idx].materialize)
-               changed = true;
-             vertices[idx].materialize = perm;
              vertices[idx].perm_out = 0;
+             changed = true;
            }
        }
 
@@ -3848,46 +3862,43 @@ vect_optimize_slp (vec_info *vinfo)
   /* Materialize.  */
   for (i = 0; i < vertices.length (); ++i)
     {
-      int perm = vertices[i].get_perm_in ();
-      if (perm <= 0)
-       continue;
-
+      int perm_in = vertices[i].perm_in;
       slp_tree node = vertices[i].node;
 
-      /* First permute invariant/external original successors.  */
+      /* First permute invariant/external original successors, we handle
+        those optimistically during propagation and duplicate them if
+        they are used with different permutations.  */
       unsigned j;
       slp_tree child;
-      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
-       {
-         if (!child
-             || (SLP_TREE_DEF_TYPE (child) != vect_constant_def
-                 && SLP_TREE_DEF_TYPE (child) != vect_external_def))
-           continue;
+      if (perm_in > 0)
+       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
+         {
+           if (!child
+               || (SLP_TREE_DEF_TYPE (child) != vect_constant_def
+                   && SLP_TREE_DEF_TYPE (child) != vect_external_def))
+             continue;
 
-         /* If the vector is uniform there's nothing to do.  */
-         if (vect_slp_tree_uniform_p (child))
-           continue;
+           /* If the vector is uniform there's nothing to do.  */
+           if (vect_slp_tree_uniform_p (child))
+             continue;
 
-         /* We can end up sharing some externals via two_operator
-            handling.  Be prepared to unshare those.  */
-         if (child->refcnt != 1)
-           {
-             gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
-             SLP_TREE_CHILDREN (node)[j] = child
-               = vect_create_new_slp_node
-                   (SLP_TREE_SCALAR_OPS (child).copy ());
-           }
-         vect_slp_permute (perms[perm],
-                           SLP_TREE_SCALAR_OPS (child), true);
-       }
+           /* We can end up sharing some externals via two_operator
+              handling.  Be prepared to unshare those.  */
+           if (child->refcnt != 1)
+             {
+               gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
+               SLP_TREE_CHILDREN (node)[j] = child
+                 = vect_create_new_slp_node
+                     (SLP_TREE_SCALAR_OPS (child).copy ());
+             }
+           vect_slp_permute (perms[perm_in],
+                             SLP_TREE_SCALAR_OPS (child), true);
+         }
 
-      if (vertices[i].materialize)
+      if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
        {
-         if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
-           /* For loads simply drop the permutation, the load permutation
-              already performs the desired permutation.  */
-           ;
-         else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
+         /* Apply the common permutes to the input vectors.  */
+         if (perm_in > 0)
            {
              /* If the node is already a permute node we can apply
                 the permutation to the lane selection, effectively
@@ -3896,12 +3907,30 @@ vect_optimize_slp (vec_info *vinfo)
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "simplifying permute node %p\n",
                                 node);
-
              for (unsigned k = 0;
                   k < SLP_TREE_LANE_PERMUTATION (node).length (); ++k)
                SLP_TREE_LANE_PERMUTATION (node)[k].second
-                 = perms[perm][SLP_TREE_LANE_PERMUTATION (node)[k].second];
+                 = perms[perm_in][SLP_TREE_LANE_PERMUTATION (node)[k].second];
+           }
+         /* Apply the anticipated output permute to the permute and
+            stmt vectors.  */
+         int perm_out = vertices[i].perm_out;
+         if (perm_out > 0)
+           {
+             vect_slp_permute (perms[perm_out],
+                               SLP_TREE_SCALAR_STMTS (node), true);
+             vect_slp_permute (perms[perm_out],
+                               SLP_TREE_LANE_PERMUTATION (node), true);
            }
+       }
+      else if (vertices[i].get_perm_materialized () != 0)
+       {
+         if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
+           /* For loads simply drop the permutation, the load permutation
+              already performs the desired permutation.  */
+           ;
+         else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
+           gcc_unreachable ();
          else
            {
              if (dump_enabled_p ())
@@ -3916,7 +3945,7 @@ vect_optimize_slp (vec_info *vinfo)
              SLP_TREE_CHILDREN (node) = vNULL;
              SLP_TREE_SCALAR_STMTS (copy)
                = SLP_TREE_SCALAR_STMTS (node).copy ();
-             vect_slp_permute (perms[perm],
+             vect_slp_permute (perms[perm_in],
                                SLP_TREE_SCALAR_STMTS (copy), true);
              gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
              SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
@@ -3936,28 +3965,31 @@ vect_optimize_slp (vec_info *vinfo)
              SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
              for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
                SLP_TREE_LANE_PERMUTATION (node)
-                 .quick_push (std::make_pair (0, perms[perm][j]));
+                 .quick_push (std::make_pair (0, perms[perm_in][j]));
              SLP_TREE_CODE (node) = VEC_PERM_EXPR;
            }
        }
-      else
+      else if (perm_in > 0) /* perm_in == perm_out */
        {
          /* Apply the reverse permutation to our stmts.  */
-         vect_slp_permute (perms[perm],
+         vect_slp_permute (perms[perm_in],
                            SLP_TREE_SCALAR_STMTS (node), true);
-         /* And to the load permutation, which we can simply
+         /* And to the lane/load permutation, which we can simply
             make regular by design.  */
          if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
            {
+             gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
              /* ???  When we handle non-bijective permutes the idea
                 is that we can force the load-permutation to be
                 { min, min + 1, min + 2, ... max }.  But then the
                 scalar defs might no longer match the lane content
                 which means wrong-code with live lane vectorization.
                 So we possibly have to have NULL entries for those.  */
-             vect_slp_permute (perms[perm],
+             vect_slp_permute (perms[perm_in],
                                SLP_TREE_LOAD_PERMUTATION (node), true);
            }
+         else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
+           gcc_unreachable ();
        }
     }
 
@@ -3991,14 +4023,14 @@ vect_optimize_slp (vec_info *vinfo)
            }
          else if (SLP_TREE_LOAD_PERMUTATION (old).exists ()
                   && SLP_TREE_REF_COUNT (old) == 1
-                  && vertices[old->vertex].materialize)
+                  && vertices[old->vertex].get_perm_materialized () != 0)
            {
              /* ???  For loads the situation is more complex since
                 we can't modify the permute in place in case the
                 node is used multiple times.  In fact for loads this
                 should be somehow handled in the propagation engine.  */
              /* Apply the reverse permutation to our stmts.  */
-             int perm = vertices[old->vertex].get_perm_in ();
+             int perm = vertices[old->vertex].get_perm_materialized ();
              vect_slp_permute (perms[perm],
                                SLP_TREE_SCALAR_STMTS (old), true);
              vect_slp_permute (perms[perm],