[vect] Add main vectorized loop unrolling

author Andre Vieira <andre.simoesdiasvieira@arm.com>

Tue, 18 Jan 2022 15:57:39 +0000 (15:57 +0000)

committer Andre Vieira <andre.simoesdiasvieira@arm.com>

Tue, 18 Jan 2022 16:23:23 +0000 (16:23 +0000)
author Andre Vieira <andre.simoesdiasvieira@arm.com>
Tue, 18 Jan 2022 15:57:39 +0000 (15:57 +0000)
committer Andre Vieira <andre.simoesdiasvieira@arm.com>
Tue, 18 Jan 2022 16:23:23 +0000 (16:23 +0000)
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc

index f1410b0..0fe3529 100644 (file)
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -154,7 +154,8 @@ along with GCC; see the file COPYING3.  If not see
     http://gcc.gnu.org/projects/tree-ssa/vectorization.html
  */
  
-static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
+static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
+                                               unsigned *);
  static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
                                                bool *, bool *);
  
@@ -831,6 +832,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
      skip_main_loop_edge (nullptr),
      skip_this_loop_edge (nullptr),
      reusable_accumulators (),
+    suggested_unroll_factor (1),
      max_vectorization_factor (0),
      mask_skip_niters (NULL_TREE),
      rgroup_compare_type (NULL_TREE),
@@ -1834,7 +1836,8 @@ vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
     definitely no, or -1 if it's worth retrying.  */
  
  static int
-vect_analyze_loop_costing (loop_vec_info loop_vinfo)
+vect_analyze_loop_costing (loop_vec_info loop_vinfo,
+                          unsigned *suggested_unroll_factor)
  {
    class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
@@ -1868,7 +1871,8 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo)
  
    int min_profitable_iters, min_profitable_estimate;
    vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
-                                     &min_profitable_estimate);
+                                     &min_profitable_estimate,
+                                     suggested_unroll_factor);
  
    if (min_profitable_iters < 0)
      {
@@ -2152,10 +2156,16 @@ vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
          vectors to the epilogue, with the main loop continuing to operate
          on full vectors.
  
+        If we are unrolling we also do not want to use partial vectors.  This
+        is to avoid the overhead of generating multiple masks and also to
+        avoid having to execute entire iterations of FALSE masked instructions
+        when dealing with one or fewer full iterations.
+
          ??? We could then end up failing to use partial vectors if we
          decide to peel iterations into a prologue, and if the main loop
          then ends up processing fewer than VF iterations.  */
-      if (param_vect_partial_vector_usage == 1
+      if ((param_vect_partial_vector_usage == 1
+          || loop_vinfo->suggested_unroll_factor > 1)
           && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
           && !vect_known_niters_smaller_than_vf (loop_vinfo))
         LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
@@ -2222,7 +2232,8 @@ vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
     for it.  The different analyses will record information in the
     loop_vec_info struct.  */
  static opt_result
-vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
+vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
+                    unsigned *suggested_unroll_factor)
  {
    opt_result ok = opt_result::success ();
    int res;
@@ -2382,6 +2393,12 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
       set of rgroups.  */
    gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
  
+  /* Apply the suggested unrolling factor; this was determined by the backend
+     during finish_cost the first time we ran the analysis for this
+     vector mode.  */
+  if (loop_vinfo->suggested_unroll_factor > 1)
+    LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
+
    /* This is the point where we can re-start analysis with SLP forced off.  */
  start_over:
  
@@ -2573,7 +2590,7 @@ start_over:
      return ok;
  
    /* Check the costings of the loop make vectorizing worthwhile.  */
-  res = vect_analyze_loop_costing (loop_vinfo);
+  res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
    if (res < 0)
      {
        ok = opt_result::failure_at (vect_location,
@@ -2851,15 +2868,38 @@ vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
  
    machine_mode vector_mode = vector_modes[mode_i];
    loop_vinfo->vector_mode = vector_mode;
+  unsigned int suggested_unroll_factor = 1;
  
    /* Run the main analysis.  */
-  opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal);
+  opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
+                                       &suggested_unroll_factor);
    if (dump_enabled_p ())
      dump_printf_loc (MSG_NOTE, vect_location,
                      "***** Analysis %s with vector mode %s\n",
                      res ? "succeeded" : " failed",
                      GET_MODE_NAME (loop_vinfo->vector_mode));
  
+  if (!main_loop_vinfo && suggested_unroll_factor > 1)
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "***** Re-trying analysis for unrolling"
+                        " with unroll factor %d.\n",
+                        suggested_unroll_factor);
+      loop_vec_info unroll_vinfo
+       = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
+      unroll_vinfo->vector_mode = vector_mode;
+      unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
+      opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL);
+      if (new_res)
+       {
+         delete loop_vinfo;
+         loop_vinfo = unroll_vinfo;
+       }
+      else
+       delete unroll_vinfo;
+    }
+
    /* Remember the autodetected vector mode.  */
    if (vector_mode == VOIDmode)
      autodetected_vector_mode = loop_vinfo->vector_mode;
@@ -3860,7 +3900,8 @@ vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
  static void
  vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
                                     int *ret_min_profitable_niters,
-                                   int *ret_min_profitable_estimate)
+                                   int *ret_min_profitable_estimate,
+                                   unsigned *suggested_unroll_factor)
  {
    int min_profitable_iters;
    int min_profitable_estimate;
@@ -4227,7 +4268,22 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
  
    /* Complete the target-specific cost calculations.  */
    finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
-              &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
+              &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
+              suggested_unroll_factor);
+
+  if (suggested_unroll_factor && *suggested_unroll_factor > 1
+      && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
+      && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
+                   *suggested_unroll_factor,
+                   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "can't unroll as unrolled vectorization factor larger"
+                        " than maximum vectorization factor: %d\n",
+                        LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
+      *suggested_unroll_factor = 1;
+    }
  
    vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
  
@@ -7194,10 +7250,13 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
  
     This only works when we see both the reduction PHI and its only consumer
     in vectorizable_reduction and there are no intermediate stmts
-   participating.  */
+   participating.  When unrolling we want each unrolled iteration to have its
+   own reduction accumulator since one of the main goals of unrolling a
+   reduction is to reduce the aggregate loop-carried latency.  */
    if (ncopies > 1
        && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
-      && reduc_chain_length == 1)
+      && reduc_chain_length == 1
+      && loop_vinfo->suggested_unroll_factor == 1)
      single_defuse_cycle = true;
  
    if (single_defuse_cycle || lane_reduc_code_p)
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h

index 86f90ae..524c86c 100644 (file)
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -642,6 +642,13 @@ public:
       about the reductions that generated them.  */
    hash_map<tree, vect_reusable_accumulator> reusable_accumulators;
  
+  /* The number of times that the target suggested we unroll the vector loop
+     in order to promote more ILP.  This value will be used to re-analyze the
+     loop for vectorization and if successful the value will be folded into
+     vectorization_factor (and therefore exactly divides
+     vectorization_factor).  */
+  unsigned int suggested_unroll_factor;
+
    /* Maximum runtime vectorization factor, or MAX_VECTORIZATION_FACTOR
       if there is no particular limit.  */
    unsigned HOST_WIDE_INT max_vectorization_factor;
@@ -1465,6 +1472,7 @@ public:
    unsigned int epilogue_cost () const;
    unsigned int outside_cost () const;
    unsigned int total_cost () const;
+  unsigned int suggested_unroll_factor () const;
  
  protected:
    unsigned int record_stmt_cost (stmt_vec_info, vect_cost_model_location,
@@ -1484,6 +1492,9 @@ protected:
    /* The costs of the three regions, indexed by vect_cost_model_location.  */
    unsigned int m_costs[3];
  
+  /* The suggested unrolling factor determined at finish_cost.  */
+  unsigned int m_suggested_unroll_factor;
+
    /* True if finish_cost has been called.  */
    bool m_finished;
  };
@@ -1496,6 +1507,7 @@ vector_costs::vector_costs (vec_info *vinfo, bool costing_for_scalar)
    : m_vinfo (vinfo),
      m_costing_for_scalar (costing_for_scalar),
      m_costs (),
+    m_suggested_unroll_factor(1),
      m_finished (false)
  {
  }
@@ -1544,6 +1556,15 @@ vector_costs::total_cost () const
    return body_cost () + outside_cost ();
  }
  
+/* Return the suggested unroll factor.  */
+
+inline unsigned int
+vector_costs::suggested_unroll_factor () const
+{
+  gcc_checking_assert (m_finished);
+  return m_suggested_unroll_factor;
+}
+
  #define VECT_MAX_COST 1000
  
  /* The maximum number of intermediate steps required in multi-step type
@@ -1720,12 +1741,14 @@ add_stmt_cost (vector_costs *costs, stmt_info_for_cost *i)
  static inline void
  finish_cost (vector_costs *costs, const vector_costs *scalar_costs,
              unsigned *prologue_cost, unsigned *body_cost,
-            unsigned *epilogue_cost)
+            unsigned *epilogue_cost, unsigned *suggested_unroll_factor = NULL)
  {
    costs->finish_cost (scalar_costs);
    *prologue_cost = costs->prologue_cost ();
    *body_cost = costs->body_cost ();
    *epilogue_cost = costs->epilogue_cost ();
+  if (suggested_unroll_factor)
+    *suggested_unroll_factor = costs->suggested_unroll_factor ();
  }
  
  inline void
author	Andre Vieira <andre.simoesdiasvieira@arm.com>
	Tue, 18 Jan 2022 15:57:39 +0000 (15:57 +0000)
committer	Andre Vieira <andre.simoesdiasvieira@arm.com>
	Tue, 18 Jan 2022 16:23:23 +0000 (16:23 +0000)
gcc/tree-vect-loop.cc		patch \| blob \| history
gcc/tree-vectorizer.h		patch \| blob \| history