re PR tree-optimization/87914 (gcc fails to vectorize bitreverse code)
authorRichard Biener <rguenther@suse.de>
Wed, 7 Nov 2018 15:01:09 +0000 (15:01 +0000)
committerRichard Biener <rguenth@gcc.gnu.org>
Wed, 7 Nov 2018 15:01:09 +0000 (15:01 +0000)
2018-11-07  Richard Biener  <rguenther@suse.de>

PR tree-optimization/87914
* tree-vect-loop.c (vect_is_simple_reduction): Improve detection
of nested cycles.
(vectorizable_reduction): Handle shifts and rotates by dispatching
to vectorizable_shift.
* tree-vect-stmts.c (vect_get_vec_def_for_operand_1): Handle
in-loop uses of vect_nested_cycle defs.  Merge cycle and internal
def cases.
(vectorizable_shift): Export and handle being called as
vect_nested_cycle.
(vect_analyze_stmt): Call vectorizable_shift after
vectorizable_reduction.
* tree-vectorizer.h (vectorizable_shift): Declare.

* lib/target-supports.exp (check_effective_target_vect_var_shift): New.
(check_avx2_available): Likewise.
* g++.dg/vect/pr87914.cc: New testcase.

From-SVN: r265876

gcc/ChangeLog
gcc/testsuite/ChangeLog
gcc/testsuite/g++.dg/vect/pr87914.cc [new file with mode: 0644]
gcc/testsuite/lib/target-supports.exp
gcc/tree-vect-loop.c
gcc/tree-vect-stmts.c
gcc/tree-vectorizer.h

index c893ee2..21ba2ef 100644 (file)
@@ -1,3 +1,19 @@
+2018-11-07  Richard Biener  <rguenther@suse.de>
+
+       PR tree-optimization/87914
+       * tree-vect-loop.c (vect_is_simple_reduction): Improve detection
+       of nested cycles.
+       (vectorizable_reduction): Handle shifts and rotates by dispatching
+       to vectorizable_shift.
+       * tree-vect-stmts.c (vect_get_vec_def_for_operand_1): Handle
+       in-loop uses of vect_nested_cycle defs.  Merge cycle and internal
+       def cases.
+       (vectorizable_shift): Export and handle being called as
+       vect_nested_cycle.
+       (vect_analyze_stmt): Call vectorizable_shift after
+       vectorizable_reduction.
+       * tree-vectorizer.h (vectorizable_shift): Declare.
+
 2018-11-07  Jan Hubicka  <jh@suse.cz>
 
        * ipa-devirt.c (odr_types_equivalent_p): Expect constants
index 79e0fa3..210ad30 100644 (file)
@@ -1,3 +1,10 @@
+2018-11-07  Richard Biener  <rguenther@suse.de>
+
+       PR tree-optimization/87914
+       * lib/target-supports.exp (check_effective_target_vect_var_shift): New.
+       (check_avx2_available): Likewise.
+       * g++.dg/vect/pr87914.cc: New testcase.
+
 2018-11-07  Chenghua Xu  <paul.hua.gm@gmail.com>
 
        * gcc.target/mips/loongson-ctz.c: Fix typo.
diff --git a/gcc/testsuite/g++.dg/vect/pr87914.cc b/gcc/testsuite/g++.dg/vect/pr87914.cc
new file mode 100644 (file)
index 0000000..12fbba3
--- /dev/null
@@ -0,0 +1,49 @@
+// { dg-do run }
+// { dg-additional-options "-fopenmp-simd" }
+// { dg-additional-options "-mavx2" { target { avx2_runtime } } }
+
+extern "C" int memcmp(const void *s1, const void *s2, __SIZE_TYPE__ n);
+extern "C" void abort(void);
+
+template <typename T>
+T reverseBits(T x)
+{
+  unsigned int s = sizeof(x) * 8;
+  T mask = ~T(0);
+  while ((s >>= 1) > 0)
+    {
+      mask ^= (mask << s);
+      x = ((x >> s) & mask) | ((x << s) & ~mask); // unsupported use in stmt
+    }
+  return x;
+}
+
+void __attribute__((noinline,noipa))
+test_reverseBits(unsigned* x)
+{
+#pragma omp simd aligned(x:32)
+  for (int i = 0; i < 16; ++i)
+    x[i] = reverseBits(x[i]); // couldn't vectorize loop
+}
+
+int main()
+{
+  unsigned arr[16] __attribute__((aligned(32)))
+    = { 0x01020304, 0x05060708, 0x0a0b0c0d, 0x0e0f1011,
+        0x11121314, 0x45065708, 0xfa0b3c0du, 0x0e0f1211,
+        0x21222324, 0x55066708, 0xfa0b2c0du, 0x1e0f1011,
+        0x31323334, 0x65067708, 0xfa0b5c0du, 0x0e3f1011 };
+  unsigned arr2[16]
+    = { 0x20c04080, 0x10e060a0, 0xb030d050, 0x8808f070u,
+        0x28c84888, 0x10ea60a2, 0xb03cd05f, 0x8848f070u,
+        0x24c44484, 0x10e660aa, 0xb034d05f, 0x8808f078u, 
+        0x2ccc4c8c, 0x10ee60a6, 0xb03ad05f, 0x8808fc70u };
+
+  test_reverseBits (arr);
+
+  if (memcmp (arr, arr2, sizeof (arr)) != 0)
+    abort ();
+  return 0;
+}
+
+// { dg-final { scan-tree-dump "OUTER LOOP VECTORIZED" "vect" { target { vect_var_shift && vect_int } } } }
index 76c393d..c202a08 100644 (file)
@@ -5329,6 +5329,15 @@ proc check_effective_target_vect_shift { } {
                 && [check_effective_target_s390_vx]) }}]
 }
 
+# Return 1 if the target supports hardware vector shift by register operation.
+
+proc check_effective_target_vect_var_shift { } {
+    return [check_cached_effective_target_indexed vect_var_shift {
+      expr {(([istarget i?86-*-*] || [istarget x86_64-*-*])
+            && [check_avx2_available])
+      }}]
+}
+
 proc check_effective_target_whole_vector_shift { } {
     if { [istarget i?86-*-*] || [istarget x86_64-*-*]
         || [istarget ia64-*-*]
@@ -7163,6 +7172,19 @@ proc check_avx_available { } {
   return 0;
 }
 
+# Return true if we are compiling for AVX2 target.
+
+proc check_avx2_available { } {
+  if { [check_no_compiler_messages avx2_available assembly {
+    #ifndef __AVX2__
+    #error unsupported
+    #endif
+  } ""] } {
+    return 1;
+  }
+  return 0;
+}
+
 # Return true if we are compiling for SSSE3 target.
 
 proc check_ssse3_available { } {
index 41a46c2..5ce203b 100644 (file)
@@ -2843,6 +2843,11 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
           return NULL;
         }
 
+      /* For inner loop reductions in nested vectorization there are no
+         constraints on the number of uses in the inner loop.  */
+      if (loop == vect_loop->inner)
+       continue;
+
       nloop_uses++;
       if (nloop_uses > 1)
         {
@@ -2901,13 +2906,19 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
       else
        /* We can have more than one loop-closed PHI.  */
        lcphis.safe_push (as_a <gphi *> (use_stmt));
-      if (nloop_uses > 1)
-       {
-         if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "reduction used in loop.\n");
-         return NULL;
-       }
+    }
+
+  /* If this isn't a nested cycle or if the nested cycle reduction value
+     is used outside of the inner loop we cannot handle uses of the reduction
+     value.  */
+  bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
+  if ((!nested_in_vect_loop || !lcphis.is_empty ())
+      && nloop_uses > 1)
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "reduction used in loop.\n");
+      return NULL;
     }
 
   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
@@ -2968,9 +2979,15 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
     }
 
   gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
-  bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
   code = orig_code = gimple_assign_rhs_code (def_stmt);
 
+  if (nested_in_vect_loop && !check_reduction)
+    {
+      if (dump_enabled_p ())
+       report_vect_op (MSG_NOTE, def_stmt, "detected nested cycle: ");
+      return def_stmt_info;
+    }
+
   /* We can handle "res -= x[i]", which is non-associative by
      simply rewriting this into "res += -x[i]".  Avoid changing
      gimple instruction for the first simple tests and only do this
@@ -6448,6 +6465,19 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
   vec_mode = TYPE_MODE (vectype_in);
   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
 
+  if (nested_cycle)
+    {
+      def_bb = gimple_bb (reduc_def_phi);
+      def_stmt_loop = def_bb->loop_father;
+      def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
+                                       loop_preheader_edge (def_stmt_loop));
+      stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
+      if (def_arg_stmt_info
+         && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
+             == vect_double_reduction_def))
+        double_reduc = true;
+    }
+
   if (code == COND_EXPR)
     {
       /* Only call during the analysis stage, otherwise we'll lose
@@ -6462,20 +6492,26 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
          return false;
         }
     }
-  else
+  else if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
+          || code == LROTATE_EXPR || code == RROTATE_EXPR)
     {
-      /* 4. Supportable by target?  */
-
-      if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
-         || code == LROTATE_EXPR || code == RROTATE_EXPR)
+      /* Only call during the analysis stage, otherwise we'll lose
+        STMT_VINFO_TYPE.  We only support this for nested cycles
+        without double reductions at the moment.  */
+      if (!nested_cycle
+         || double_reduc
+         || (!vec_stmt && !vectorizable_shift (stmt_info, gsi, NULL,
+                                               NULL, cost_vec)))
        {
-         /* Shifts and rotates are only supported by vectorizable_shifts,
-            not vectorizable_reduction.  */
           if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "unsupported shift or rotation.\n");
+                            "unsupported shift or rotation in reduction\n");
          return false;
        }
+    }
+  else
+    {
+      /* 4. Supportable by target?  */
 
       /* 4.1. check support for the operation in the loop  */
       optab = optab_for_tree_code (code, vectype_in, optab_default);
@@ -6580,19 +6616,6 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
        orig_code = cond_reduc_op_code;
     }
 
-  if (nested_cycle)
-    {
-      def_bb = gimple_bb (reduc_def_phi);
-      def_stmt_loop = def_bb->loop_father;
-      def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
-                                       loop_preheader_edge (def_stmt_loop));
-      stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
-      if (def_arg_stmt_info
-         && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
-             == vect_double_reduction_def))
-        double_reduc = true;
-    }
-
   reduc_fn = IFN_LAST;
 
   if (reduction_type == TREE_CODE_REDUCTION
@@ -6963,6 +6986,12 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
           /* Multiple types are not supported for condition.  */
           break;
         }
+      if (code == LSHIFT_EXPR
+         || code == RSHIFT_EXPR)
+       {
+         vectorizable_shift (stmt_info, gsi, vec_stmt, slp_node, NULL);
+         break;
+       }
 
       /* Handle uses.  */
       if (j == 0)
index 7127c17..8133149 100644 (file)
@@ -1461,6 +1461,16 @@ vect_get_vec_def_for_operand_1 (stmt_vec_info def_stmt_info,
       /* Code should use vect_get_vec_def_for_operand.  */
       gcc_unreachable ();
 
+    /* Operand is defined by a loop header phi.  In case of nested
+       cycles we also may have uses of the backedge def.  */
+    case vect_reduction_def:
+    case vect_double_reduction_def:
+    case vect_nested_cycle:
+    case vect_induction_def:
+      gcc_assert (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
+                 || dt == vect_nested_cycle);
+      /* Fallthru.  */
+
     /* operand is defined inside the loop.  */
     case vect_internal_def:
       {
@@ -1480,23 +1490,6 @@ vect_get_vec_def_for_operand_1 (stmt_vec_info def_stmt_info,
        return vec_oprnd;
       }
 
-    /* operand is defined by a loop header phi.  */
-    case vect_reduction_def:
-    case vect_double_reduction_def:
-    case vect_nested_cycle:
-    case vect_induction_def:
-      {
-       gcc_assert (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI);
-
-       /* Get the def from the vectorized stmt.  */
-       vec_stmt_info = STMT_VINFO_VEC_STMT (def_stmt_info);
-       if (gphi *phi = dyn_cast <gphi *> (vec_stmt_info->stmt))
-         vec_oprnd = PHI_RESULT (phi);
-       else
-         vec_oprnd = gimple_get_lhs (vec_stmt_info->stmt);
-       return vec_oprnd;
-      }
-
     default:
       gcc_unreachable ();
     }
@@ -5363,7 +5356,7 @@ vect_supportable_shift (enum tree_code code, tree scalar_type)
    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
    Return true if STMT_INFO is vectorizable in this way.  */
 
-static bool
+bool
 vectorizable_shift (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
                    stmt_vec_info *vec_stmt, slp_tree slp_node,
                    stmt_vector_for_cost *cost_vec)
@@ -5401,6 +5394,7 @@ vectorizable_shift (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
     return false;
 
   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
+      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle
       && ! vec_stmt)
     return false;
 
@@ -5480,7 +5474,8 @@ vectorizable_shift (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
      shift/rotate amount is a vector, use the vector/vector shift optabs.  */
 
   if ((dt[1] == vect_internal_def
-       || dt[1] == vect_induction_def)
+       || dt[1] == vect_induction_def
+       || dt[1] == vect_nested_cycle)
       && !slp_node)
     scalar_shift_arg = false;
   else if (dt[1] == vect_constant_def
@@ -9540,7 +9535,6 @@ vect_analyze_stmt (stmt_vec_info stmt_info, bool *need_to_vectorize,
          || vectorizable_simd_clone_call (stmt_info, NULL, NULL, node,
                                           cost_vec)
          || vectorizable_conversion (stmt_info, NULL, NULL, node, cost_vec)
-         || vectorizable_shift (stmt_info, NULL, NULL, node, cost_vec)
          || vectorizable_operation (stmt_info, NULL, NULL, node, cost_vec)
          || vectorizable_assignment (stmt_info, NULL, NULL, node, cost_vec)
          || vectorizable_load (stmt_info, NULL, NULL, node, node_instance,
@@ -9549,6 +9543,7 @@ vect_analyze_stmt (stmt_vec_info stmt_info, bool *need_to_vectorize,
          || vectorizable_reduction (stmt_info, NULL, NULL, node,
                                     node_instance, cost_vec)
          || vectorizable_induction (stmt_info, NULL, NULL, node, cost_vec)
+         || vectorizable_shift (stmt_info, NULL, NULL, node, cost_vec)
          || vectorizable_condition (stmt_info, NULL, NULL, NULL, 0, node,
                                     cost_vec)
          || vectorizable_comparison (stmt_info, NULL, NULL, NULL, node,
index e1292aa..e66f28b 100644 (file)
@@ -1483,6 +1483,9 @@ extern opt_result vect_analyze_stmt (stmt_vec_info, bool *, slp_tree,
 extern bool vectorizable_condition (stmt_vec_info, gimple_stmt_iterator *,
                                    stmt_vec_info *, tree, int, slp_tree,
                                    stmt_vector_for_cost *);
+extern bool vectorizable_shift (stmt_vec_info, gimple_stmt_iterator *,
+                               stmt_vec_info *, slp_tree,
+                               stmt_vector_for_cost *);
 extern void vect_get_load_cost (stmt_vec_info, int, bool,
                                unsigned int *, unsigned int *,
                                stmt_vector_for_cost *,