2015-05-12 Richard Biener <rguenther@suse.de>
author rguenth <rguenth@138bc75d-0d04-0410-961f-82ee72b054a4>
Tue, 12 May 2015 11:55:40 +0000 (11:55 +0000)
committer rguenth <rguenth@138bc75d-0d04-0410-961f-82ee72b054a4>
Tue, 12 May 2015 11:55:40 +0000 (11:55 +0000)
PR tree-optimization/37021
* tree-vectorizer.h (struct _slp_tree): Add two_operators flag.
(SLP_TREE_TWO_OPERATORS): New define.
* tree-vect-slp.c (vect_create_new_slp_node): Initialize
SLP_TREE_TWO_OPERATORS.
(vect_build_slp_tree_1): Allow two mixing plus/minus in an
SLP node.
(vect_build_slp_tree): Adjust.
(vect_analyze_slp_cost_1): Likewise.
(vect_schedule_slp_instance): Vectorize mixing plus/minus by
emitting two vector stmts and mixing the results.

* gcc.target/i386/vect-addsub.c: New testcase.

git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@223059 138bc75d-0d04-0410-961f-82ee72b054a4

gcc/ChangeLog
gcc/testsuite/ChangeLog
gcc/testsuite/gcc.target/i386/vect-addsub.c [new file with mode: 0644]
gcc/tree-vect-slp.c
gcc/tree-vectorizer.h

index 5313de1..adc96c3 100644 (file)
@@ -1,3 +1,17 @@
+2015-05-12  Richard Biener  <rguenther@suse.de>
+
+       PR tree-optimization/37021
+       * tree-vectorizer.h (struct _slp_tree): Add two_operators flag.
+       (SLP_TREE_TWO_OPERATORS): New define.
+       * tree-vect-slp.c (vect_create_new_slp_node): Initialize
+       SLP_TREE_TWO_OPERATORS.
+       (vect_build_slp_tree_1): Allow two mixing plus/minus in an
+       SLP node.
+       (vect_build_slp_tree): Adjust.
+       (vect_analyze_slp_cost_1): Likewise.
+       (vect_schedule_slp_instance): Vectorize mixing plus/minus by
+       emitting two vector stmts and mixing the results.
+
 2015-05-12  Dominik Vogt  <vogt@linux.vnet.ibm.com>
 
        * call.c (print_z_candidates): Remove dead code.
index 0d4b058..9819e2a 100644 (file)
@@ -1,3 +1,8 @@
+2015-05-12  Richard Biener  <rguenther@suse.de>
+
+       PR tree-optimization/37021
+       * gcc.target/i386/vect-addsub.c: New testcase.
+
 2015-05-11  Alexander Monakov  <amonakov@ispras.ru>
 
        * gcc.target/i386/pr65753.c: Use -O2 instead of -O.
diff --git a/gcc/testsuite/gcc.target/i386/vect-addsub.c b/gcc/testsuite/gcc.target/i386/vect-addsub.c
new file mode 100644 (file)
index 0000000..2d75328
--- /dev/null
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msse4 -mtune=generic" } */
+
+/* We need SSE4 so the backend recognizes a { 0, 5, 2, 7 } constant
+   permutation as supported as the vectorizer wants to generate
+
+     vect__6.10_24 = vect__3.6_20 - vect__5.9_23;
+     vect__6.11_25 = vect__3.6_20 + vect__5.9_23;
+     _26 = VEC_PERM_EXPR <vect__6.10_24, vect__6.11_25, { 0, 5, 2, 7 }>;
+
+   See also the ??? comment about using and/andn/or in expand_vec_perm_blend
+   for non-SSE4 targets.  */
+
+void testf (float * __restrict__ p, float * __restrict q)
+{
+  p[0] = p[0] - q[0];
+  p[1] = p[1] + q[1];
+  p[2] = p[2] - q[2];
+  p[3] = p[3] + q[3];
+}
+
+/* { dg-final { scan-assembler "addsubps" } } */
index d6efe94..c675b1c 100644 (file)
@@ -160,6 +160,7 @@ vect_create_new_slp_node (vec<gimple> scalar_stmts)
   SLP_TREE_VEC_STMTS (node).create (0);
   SLP_TREE_CHILDREN (node).create (nops);
   SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
+  SLP_TREE_TWO_OPERATORS (node) = false;
 
   return node;
 }
@@ -472,11 +473,14 @@ static bool
 vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                       vec<gimple> stmts, unsigned int group_size,
                       unsigned nops, unsigned int *max_nunits,
-                      unsigned int vectorization_factor, bool *matches)
+                      unsigned int vectorization_factor, bool *matches,
+                      bool *two_operators)
 {
   unsigned int i;
-  gimple stmt = stmts[0];
-  enum tree_code first_stmt_code = ERROR_MARK, rhs_code = ERROR_MARK;
+  gimple first_stmt = stmts[0], stmt = stmts[0];
+  enum tree_code first_stmt_code = ERROR_MARK;
+  enum tree_code alt_stmt_code = ERROR_MARK;
+  enum tree_code rhs_code = ERROR_MARK;
   enum tree_code first_cond_code = ERROR_MARK;
   tree lhs;
   bool need_same_oprnds = false;
@@ -675,10 +679,20 @@ vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
       else
        {
          if (first_stmt_code != rhs_code
+             && alt_stmt_code == ERROR_MARK)
+           alt_stmt_code = rhs_code;
+         if (first_stmt_code != rhs_code
              && (first_stmt_code != IMAGPART_EXPR
                  || rhs_code != REALPART_EXPR)
              && (first_stmt_code != REALPART_EXPR
                  || rhs_code != IMAGPART_EXPR)
+             /* Handle mismatches in plus/minus by computing both
+                and merging the results.  */
+             && !((first_stmt_code == PLUS_EXPR
+                   || first_stmt_code == MINUS_EXPR)
+                  && (alt_stmt_code == PLUS_EXPR
+                      || alt_stmt_code == MINUS_EXPR)
+                  && rhs_code == alt_stmt_code)
               && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
                    && (first_stmt_code == ARRAY_REF
                        || first_stmt_code == BIT_FIELD_REF
@@ -692,7 +706,10 @@ vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                                   "Build SLP failed: different operation "
                                   "in stmt ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
-                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
+                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                  "original stmt ");
+                 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
+                                   first_stmt, 0);
                }
              /* Mismatch.  */
              continue;
@@ -921,6 +938,43 @@ vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
     if (!matches[i])
       return false;
 
+  /* If we allowed a two-operation SLP node verify the target can cope
+     with the permute we are going to use.  */
+  if (alt_stmt_code != ERROR_MARK
+      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
+    {
+      unsigned char *sel
+       = XALLOCAVEC (unsigned char, TYPE_VECTOR_SUBPARTS (vectype));
+      for (i = 0; i < TYPE_VECTOR_SUBPARTS (vectype); ++i)
+       {
+         sel[i] = i;
+         if (gimple_assign_rhs_code (stmts[i % group_size]) == alt_stmt_code)
+           sel[i] += TYPE_VECTOR_SUBPARTS (vectype);
+       }
+      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+       {
+         for (i = 0; i < group_size; ++i)
+           if (gimple_assign_rhs_code (stmts[i]) == alt_stmt_code)
+             {
+               matches[i] = false;
+               if (dump_enabled_p ())
+                 {
+                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                    "Build SLP failed: different operation "
+                                    "in stmt ");
+                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
+                                     stmts[i], 0);
+                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                    "original stmt ");
+                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
+                                     first_stmt, 0);
+                 }
+             }
+         return false;
+       }
+      *two_operators = true;
+    }
+
   return true;
 }
 
@@ -957,10 +1011,13 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
   else
     return false;
 
+  bool two_operators = false;
   if (!vect_build_slp_tree_1 (loop_vinfo, bb_vinfo,
                              SLP_TREE_SCALAR_STMTS (*node), group_size, nops,
-                             max_nunits, vectorization_factor, matches))
+                             max_nunits, vectorization_factor, matches,
+                             &two_operators))
     return false;
+  SLP_TREE_TWO_OPERATORS (*node) = two_operators;
 
   /* If the SLP node is a load, terminate the recursion.  */
   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
@@ -1519,8 +1576,17 @@ vect_analyze_slp_cost_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
        }
     }
   else
-    record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
-                     stmt_info, 0, vect_body);
+    {
+      record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
+                       stmt_info, 0, vect_body);
+      if (SLP_TREE_TWO_OPERATORS (node))
+       {
+         record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
+                           stmt_info, 0, vect_body);
+         record_stmt_cost (body_cost_vec, ncopies_for_cost, vec_perm,
+                           stmt_info, 0, vect_body);
+       }
+    }
 
   /* Scan operands and account for prologue cost of constants/externals.
      ???  This over-estimates cost for multiple uses and should be
@@ -3352,6 +3418,76 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance,
       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
     }
 
+  /* Handle two-operation SLP nodes by vectorizing the group with
+     both operations and then performing a merge.  */
+  if (SLP_TREE_TWO_OPERATORS (node))
+    {
+      enum tree_code code0 = gimple_assign_rhs_code (stmt);
+      enum tree_code ocode = ERROR_MARK;
+      gimple ostmt;
+      unsigned char *mask = XALLOCAVEC (unsigned char, group_size);
+      bool allsame = true;
+      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt)
+       if (gimple_assign_rhs_code (ostmt) != code0)
+         {
+           mask[i] = 1;
+           allsame = false;
+           ocode = gimple_assign_rhs_code (ostmt);
+         }
+       else
+         mask[i] = 0;
+      if (!allsame)
+       {
+         vec<gimple> v0;
+         vec<gimple> v1;
+         unsigned j;
+         tree tmask = NULL_TREE;
+         vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
+         v0 = SLP_TREE_VEC_STMTS (node).copy ();
+         SLP_TREE_VEC_STMTS (node).truncate (0);
+         gimple_assign_set_rhs_code (stmt, ocode);
+         vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
+         gimple_assign_set_rhs_code (stmt, code0);
+         v1 = SLP_TREE_VEC_STMTS (node).copy ();
+         SLP_TREE_VEC_STMTS (node).truncate (0);
+         tree meltype = build_nonstandard_integer_type
+             (GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (vectype))), 1);
+         tree mvectype = get_same_sized_vectype (meltype, vectype);
+         unsigned k = 0, l;
+         for (j = 0; j < v0.length (); ++j)
+           {
+             tree *melts = XALLOCAVEC (tree, TYPE_VECTOR_SUBPARTS (vectype));
+             for (l = 0; l < TYPE_VECTOR_SUBPARTS (vectype); ++l)
+               {
+                 /* MASK holds GROUP_SIZE entries; wrap K before it
+                    would index one past the end.  */
+                 if (k >= group_size)
+                   k = 0;
+                 melts[l] = build_int_cst
+                     (meltype, mask[k++] * TYPE_VECTOR_SUBPARTS (vectype) + l);
+               }
+             tmask = build_vector (mvectype, melts);
+
+             /* ???  Not all targets support a VEC_PERM_EXPR with a
+                constant mask that would translate to a vec_merge RTX
+                (with their vec_perm_const_ok).  We can either not
+                vectorize in that case or let veclower do its job.
+                Unfortunately that isn't too great and at least for
+                plus/minus we'd eventually like to match the target's
+                vector addsub instructions.  */
+             gimple vstmt;
+             vstmt = gimple_build_assign (make_ssa_name (vectype),
+                                          VEC_PERM_EXPR,
+                                          gimple_assign_lhs (v0[j]),
+                                          gimple_assign_lhs (v1[j]), tmask);
+             vect_finish_stmt_generation (stmt, vstmt, &si);
+             SLP_TREE_VEC_STMTS (node).quick_push (vstmt);
+           }
+         v0.release ();
+         v1.release ();
+         return false;
+       }
+    }
   is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
   return is_store;
 }
index 0796cc1..5a4fdbb 100644 (file)
@@ -111,6 +111,8 @@ struct _slp_tree {
      scalar elements in one scalar iteration (GROUP_SIZE) multiplied by VF
      divided by vector size.  */
   unsigned int vec_stmts_size;
+  /* Whether the scalar computations use two different operators.  */
+  bool two_operators;
 };
 
 
@@ -146,6 +148,7 @@ typedef struct _slp_instance {
 #define SLP_TREE_VEC_STMTS(S)                    (S)->vec_stmts
 #define SLP_TREE_NUMBER_OF_VEC_STMTS(S)          (S)->vec_stmts_size
 #define SLP_TREE_LOAD_PERMUTATION(S)             (S)->load_permutation
+#define SLP_TREE_TWO_OPERATORS(S)               (S)->two_operators
 
 /* This structure is used in creation of an SLP tree.  Each instance
    corresponds to the same operand in a group of scalar stmts in an SLP