From 6756706ea636d6f9aab85cef22659cc35143476f Mon Sep 17 00:00:00 2001
From: Richard Sandiford
Date: Fri, 12 Nov 2021 17:33:01 +0000
Subject: [PATCH] aarch64: Use real scalar op counts

Now that vector finish_costs is passed the associated scalar costs,
we can record the scalar issue information while computing the scalar
costs, rather than trying to estimate it while computing the vector
costs.

This simplifies things a little, but the main motivation is to improve
accuracy.

gcc/
	* config/aarch64/aarch64.c (aarch64_vector_costs::m_scalar_ops)
	(aarch64_vector_costs::m_sve_ops): Replace with...
	(aarch64_vector_costs::m_ops): ...this.
	(aarch64_vector_costs::analyze_loop_vinfo): Update accordingly.
	(aarch64_vector_costs::adjust_body_cost_sve): Likewise.
	(aarch64_vector_costs::aarch64_vector_costs): Likewise.  Initialize
	m_vec_flags here rather than in add_stmt_cost.
	(aarch64_vector_costs::count_ops): Test for scalar reductions too.
	Allow vectype to be null.
	(aarch64_vector_costs::add_stmt_cost): Call count_ops for scalar
	code too.  Don't require vectype to be nonnull.
	(aarch64_vector_costs::adjust_body_cost): Take the loop_vec_info
	and scalar costs as parameters.  Use the scalar costs to determine
	the cycles per iteration of the scalar loop, then multiply it by
	the estimated VF.
	(aarch64_vector_costs::finish_cost): Update call accordingly.
---
 gcc/config/aarch64/aarch64.c | 182 +++++++++++++++++++++----------------------
 1 file changed, 88 insertions(+), 94 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index d8bbc66..3944c09 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -14813,7 +14813,8 @@ private:
 					fractional_cost, fractional_cost,
 					bool, unsigned int, unsigned int *,
 					bool *);
-  unsigned int adjust_body_cost (unsigned int);
+  unsigned int adjust_body_cost (loop_vec_info, const aarch64_vector_costs *,
+				 unsigned int);
 
   /* True if we have performed one-time initialization based on the
      vec_info.  */
@@ -14850,22 +14851,16 @@ private:
      iterate, otherwise it is zero.  */
   uint64_t m_num_vector_iterations = 0;
 
-  /* Used only when vectorizing loops.  Estimates the number and kind of scalar
-     operations that would be needed to perform the same work as one iteration
-     of the vector loop.  */
-  aarch64_vec_op_count m_scalar_ops;
+  /* Used only when vectorizing loops.  Estimates the number and kind of
+     operations that would be needed by one iteration of the scalar
+     or vector loop.  */
+  aarch64_vec_op_count m_ops;
 
-  /* Used only when vectorizing loops.  If M_VEC_FLAGS & VEC_ADVSIMD,
-     this structure estimates the number and kind of operations that the
-     vector loop would contain.  If M_VEC_FLAGS & VEC_SVE, the structure
-     estimates what the equivalent Advanced SIMD-only code would need in
-     order to perform the same work as one iteration of the SVE loop.  */
+  /* Used only when vectorizing loops for SVE.  It estimates what the
+     equivalent Advanced SIMD-only code would need in order to perform
+     the same work as one iteration of the SVE loop.  */
   aarch64_vec_op_count m_advsimd_ops;
 
-  /* Used only when vectorizing loops with SVE.  It estimates the number and
-     kind of operations that the SVE loop would contain.  */
-  aarch64_vec_op_count m_sve_ops;
-
   /* Used to detect cases in which we end up costing the same load twice,
      once to account for results that are actually used and once to account
      for unused results.  */
@@ -14875,9 +14870,10 @@ private:
 aarch64_vector_costs::aarch64_vector_costs (vec_info *vinfo,
 					    bool costing_for_scalar)
   : vector_costs (vinfo, costing_for_scalar),
-    m_scalar_ops (aarch64_tune_params.vec_costs->issue_info, 0),
-    m_advsimd_ops (aarch64_tune_params.vec_costs->issue_info, VEC_ADVSIMD),
-    m_sve_ops (aarch64_tune_params.vec_costs->issue_info, VEC_ANY_SVE)
+    m_vec_flags (costing_for_scalar ? 0
+		 : aarch64_classify_vector_mode (vinfo->vector_mode)),
+    m_ops (aarch64_tune_params.vec_costs->issue_info, m_vec_flags),
+    m_advsimd_ops (aarch64_tune_params.vec_costs->issue_info, VEC_ADVSIMD)
 {
 }
 
@@ -15016,7 +15012,7 @@ aarch64_vector_costs::analyze_loop_vinfo (loop_vec_info loop_vinfo)
       FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
 	if (rgm->type)
 	  num_masks += num_vectors_m1 + 1;
-      m_sve_ops.pred_ops += num_masks * issue_info->sve->while_pred_ops;
+      m_ops.pred_ops += num_masks * issue_info->sve->while_pred_ops;
     }
 }
 
@@ -15550,8 +15546,8 @@ aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
 /* COUNT, KIND, STMT_INFO and VECTYPE are the same as for
    vector_costs::add_stmt_cost and they describe an operation in the body
    of a vector loop.  Record issue information relating to the vector
-   operation in OPS, where OPS is one of m_scalar_ops, m_advsimd_ops
-   or m_sve_ops; see the comments above those variables for details.
+   operation in OPS, where OPS is one of m_ops or m_advsimd_ops; see the
+   comments above those variables for details.
 
    FACTOR says how many iterations of the loop described by VEC_FLAGS
    would be needed to match one iteration of the vector loop in VINFO.  */
@@ -15570,14 +15566,14 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
 
   /* Calculate the minimum cycles per iteration imposed by a reduction
      operation.  */
-  if ((kind == vector_stmt || kind == vec_to_scalar)
+  if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
       && vect_is_reduction (stmt_info))
     {
       unsigned int base
	 = aarch64_in_loop_reduction_latency (m_vinfo, stmt_info, vec_flags);
       if (vect_reduc_type (m_vinfo, stmt_info) == FOLD_LEFT_REDUCTION)
 	{
-	  if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
+	  if (vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
 	    {
 	      /* When costing an SVE FADDA, the vectorizer treats vec_to_scalar
 		 as a single operation, whereas for Advanced SIMD it is a
@@ -15744,11 +15740,6 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
   loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
   if (!m_analyzed_vinfo && aarch64_use_new_vector_costs_p ())
     {
-      /* If we're costing the vector code, record whether we're vectorizing
-	 for Advanced SIMD or SVE.  */
-      if (!m_costing_for_scalar)
-	m_vec_flags = aarch64_classify_vector_mode (m_vinfo->vector_mode);
-
       if (loop_vinfo)
 	analyze_loop_vinfo (loop_vinfo);
 
@@ -15793,31 +15784,16 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
	 innermost loop, also estimate the operations that would need to be
	 issued by all relevant implementations of the loop.  */
       if (loop_vinfo
-	  && m_vec_flags
-	  && where == vect_body
+	  && (m_costing_for_scalar || where == vect_body)
	  && (!LOOP_VINFO_LOOP (loop_vinfo)->inner || in_inner_loop_p)
-	  && vectype
	  && stmt_cost != 0)
	{
-	  /* Record estimates for the scalar code.  */
-	  count_ops (count, kind, stmt_info, vectype, &m_scalar_ops,
-		     vect_nunits_for_cost (vectype));
-
-	  if (aarch64_sve_mode_p (m_vinfo->vector_mode)
-	      && m_sve_ops.base_issue_info ())
-	    {
-	      /* Record estimates for a possible Advanced SIMD version
-		 of the SVE code.  */
-	      count_ops (count, kind, stmt_info, vectype, &m_advsimd_ops,
-			 aarch64_estimated_sve_vq ());
-
-	      /* Record estimates for the SVE code itself.  */
-	      count_ops (count, kind, stmt_info, vectype, &m_sve_ops, 1);
-	    }
-	  else
-	    /* Record estimates for the Advanced SIMD code.  Treat SVE like
-	       Advanced SIMD if the CPU has no specific SVE costs.  */
-	    count_ops (count, kind, stmt_info, vectype, &m_advsimd_ops, 1);
+	  count_ops (count, kind, stmt_info, vectype, &m_ops, 1);
+	  if (aarch64_sve_mode_p (m_vinfo->vector_mode))
+	    /* Record estimates for a possible Advanced SIMD version
+	       of the SVE code.  */
+	    count_ops (count, kind, stmt_info, vectype,
+		       &m_advsimd_ops, aarch64_estimated_sve_vq ());
	}
 
       /* If we're applying the SVE vs. Advanced SIMD unrolling heuristic,
@@ -15885,7 +15861,7 @@ adjust_body_cost_sve (const aarch64_vec_issue_info *issue_info,
   /* Estimate the minimum number of cycles per iteration needed to issue
      non-predicate operations.  */
   fractional_cost sve_nonpred_issue_cycles_per_iter
-    = aarch64_estimate_min_cycles_per_iter (&m_sve_ops, issue_info->sve);
+    = aarch64_estimate_min_cycles_per_iter (&m_ops, issue_info->sve);
 
   /* Estimate the minimum number of cycles per iteration needed to rename
      SVE instructions.
@@ -15901,9 +15877,7 @@ adjust_body_cost_sve (const aarch64_vec_issue_info *issue_info,
      ??? This value is very much on the pessimistic side, but seems
      to work pretty well in practice.  */
   sve_rename_cycles_per_iter
-    = { m_sve_ops.general_ops
-	+ m_sve_ops.loads
-	+ m_sve_ops.pred_ops + 1, 5 };
+    = { m_ops.general_ops + m_ops.loads + m_ops.pred_ops + 1, 5 };
 
   /* Combine the rename and non-predicate issue limits into a single value.  */
   fractional_cost sve_nonpred_cycles_per_iter
@@ -15912,7 +15886,7 @@ adjust_body_cost_sve (const aarch64_vec_issue_info *issue_info,
   /* Separately estimate the minimum number of cycles per iteration needed
      to issue the predicate operations.  */
   fractional_cost sve_pred_issue_cycles_per_iter
-    = { m_sve_ops.pred_ops, issue_info->sve->pred_ops_per_cycle };
+    = { m_ops.pred_ops, issue_info->sve->pred_ops_per_cycle };
 
   /* Calculate the overall limit on the number of cycles per iteration.  */
   fractional_cost sve_cycles_per_iter
@@ -15920,15 +15894,15 @@ adjust_body_cost_sve (const aarch64_vec_issue_info *issue_info,
 
   if (dump_enabled_p ())
     {
-      m_sve_ops.dump ();
+      m_ops.dump ();
       dump_printf_loc (MSG_NOTE, vect_location,
		       "  estimated cycles per iteration = %f\n",
		       sve_cycles_per_iter.as_double ());
-      if (m_sve_ops.pred_ops)
+      if (m_ops.pred_ops)
	dump_printf_loc (MSG_NOTE, vect_location,
			 "    predicate issue = %f\n",
			 sve_pred_issue_cycles_per_iter.as_double ());
-      if (m_sve_ops.pred_ops || sve_rename_cycles_per_iter)
+      if (m_ops.pred_ops || sve_rename_cycles_per_iter)
	dump_printf_loc (MSG_NOTE, vect_location,
			 "    non-predicate issue = %f\n",
			 sve_nonpred_issue_cycles_per_iter.as_double ());
@@ -16008,8 +15982,13 @@ adjust_body_cost_sve (const aarch64_vec_issue_info *issue_info,
 
 /* BODY_COST is the cost of a vector loop body.  Adjust the cost as necessary
    and return the new cost.  */
 unsigned int
-aarch64_vector_costs::adjust_body_cost (unsigned int body_cost)
+aarch64_vector_costs::
+adjust_body_cost (loop_vec_info loop_vinfo,
+		  const aarch64_vector_costs *scalar_costs,
+		  unsigned int body_cost)
 {
+  const auto &scalar_ops = scalar_costs->m_ops;
+  unsigned int estimated_vf = vect_vf_for_cost (loop_vinfo);
   unsigned int orig_body_cost = body_cost;
   bool should_disparage = false;
 
@@ -16056,19 +16035,11 @@ aarch64_vector_costs::adjust_body_cost (unsigned int body_cost)
     return body_cost;
 
   fractional_cost scalar_cycles_per_iter
-    = aarch64_estimate_min_cycles_per_iter (&m_scalar_ops,
-					    issue_info->scalar);
-
-  fractional_cost advsimd_cycles_per_iter
-    = aarch64_estimate_min_cycles_per_iter (&m_advsimd_ops,
-					    issue_info->advsimd);
+    = aarch64_estimate_min_cycles_per_iter (&scalar_ops, issue_info->scalar);
+  scalar_cycles_per_iter *= estimated_vf;
 
-  bool could_use_advsimd
-    = ((m_vec_flags & VEC_ADVSIMD)
-       || (aarch64_autovec_preference != 2
-	   && (aarch64_tune_params.extra_tuning_flags
-	       & AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT)
-	   && !m_saw_sve_only_op));
+  fractional_cost vector_cycles_per_iter
+    = aarch64_estimate_min_cycles_per_iter (&m_ops, m_ops.base_issue_info ());
 
   if (dump_enabled_p ())
@@ -16077,32 +16048,40 @@ aarch64_vector_costs::adjust_body_cost (unsigned int body_cost)
			 "Vector loop iterates at most %wd times\n",
			 m_num_vector_iterations);
       dump_printf_loc (MSG_NOTE, vect_location, "Scalar issue estimate:\n");
-      m_scalar_ops.dump ();
+      scalar_ops.dump ();
       dump_printf_loc (MSG_NOTE, vect_location,
-		       "  estimated cycles per iteration = %f\n",
-		       scalar_cycles_per_iter.as_double ());
-      if (could_use_advsimd)
-	{
-	  dump_printf_loc (MSG_NOTE, vect_location,
-			   "Advanced SIMD issue estimate:\n");
-	  m_advsimd_ops.dump ();
-	  dump_printf_loc (MSG_NOTE, vect_location,
-			   "  estimated cycles per iteration = %f\n",
-			   advsimd_cycles_per_iter.as_double ());
-	}
-      else
-	dump_printf_loc (MSG_NOTE, vect_location,
-			 "Loop could not use Advanced SIMD\n");
+		       "  estimated cycles per vector iteration"
+		       " (for VF %d) = %f\n",
+		       estimated_vf, scalar_cycles_per_iter.as_double ());
     }
 
-  fractional_cost vector_cycles_per_iter = advsimd_cycles_per_iter;
-  unsigned int vector_reduction_latency = m_advsimd_ops.reduction_latency;
-
   if ((m_vec_flags & VEC_ANY_SVE) && issue_info->sve)
     {
+      bool could_use_advsimd
+	= (aarch64_autovec_preference != 2
+	   && (aarch64_tune_params.extra_tuning_flags
+	       & AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT)
+	   && !m_saw_sve_only_op);
+
+      fractional_cost advsimd_cycles_per_iter
+	= aarch64_estimate_min_cycles_per_iter (&m_advsimd_ops,
+						issue_info->advsimd);
       if (dump_enabled_p ())
-	dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n");
-      vector_reduction_latency = m_sve_ops.reduction_latency;
+	{
+	  if (could_use_advsimd)
+	    {
+	      dump_printf_loc (MSG_NOTE, vect_location,
+			       "Advanced SIMD issue estimate:\n");
+	      m_advsimd_ops.dump ();
+	      dump_printf_loc (MSG_NOTE, vect_location,
+			       "  estimated cycles per iteration = %f\n",
+			       advsimd_cycles_per_iter.as_double ());
+	    }
+	  else
+	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Loop could not use Advanced SIMD\n");
+	  dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n");
+	}
       vector_cycles_per_iter
	 = adjust_body_cost_sve (issue_info, scalar_cycles_per_iter,
				 advsimd_cycles_per_iter, could_use_advsimd,
@@ -16123,6 +16102,18 @@ aarch64_vector_costs::adjust_body_cost (unsigned int body_cost)
				 &body_cost, &should_disparage);
	}
     }
+  else
+    {
+      if (dump_enabled_p ())
+	{
+	  dump_printf_loc (MSG_NOTE, vect_location,
+			   "Vector issue estimate:\n");
+	  m_ops.dump ();
+	  dump_printf_loc (MSG_NOTE, vect_location,
+			   "  estimated cycles per iteration = %f\n",
+			   vector_cycles_per_iter.as_double ());
+	}
+    }
 
   /* Decide whether to stick to latency-based costs or whether to try
      to take issue rates into account.  */
@@ -16164,8 +16155,8 @@ aarch64_vector_costs::adjust_body_cost (unsigned int body_cost)
      vector code is an improvement, even if adding the other
     (non-loop-carried) latencies tends to hide this saving.  We therefore
     reduce the cost of the vector loop body in proportion to the saving.  */
-  else if (m_scalar_ops.reduction_latency > vector_reduction_latency
-	   && m_scalar_ops.reduction_latency == scalar_cycles_per_iter
+  else if (scalar_ops.reduction_latency > m_ops.reduction_latency
+	   && scalar_ops.reduction_latency == scalar_cycles_per_iter
	   && scalar_cycles_per_iter > vector_cycles_per_iter
	   && !should_disparage)
     {
@@ -16181,13 +16172,16 @@ aarch64_vector_costs::adjust_body_cost (unsigned int body_cost)
 }
 
 void
-aarch64_vector_costs::finish_cost (const vector_costs *scalar_costs)
+aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
 {
+  auto *scalar_costs
+    = static_cast<const aarch64_vector_costs *> (uncast_scalar_costs);
   loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
   if (loop_vinfo
       && m_vec_flags
       && aarch64_use_new_vector_costs_p ())
-    m_costs[vect_body] = adjust_body_cost (m_costs[vect_body]);
+    m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
+					   m_costs[vect_body]);
 
   vector_costs::finish_cost (scalar_costs);
 }
-- 
2.7.4
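
Below is a minimal standalone sketch, not part of the patch and not the GCC
implementation, of the profitability comparison that adjust_body_cost now
performs: estimate the scalar loop's minimum cycles per iteration from its
recorded operation counts, scale that by the estimated VF so it describes the
same amount of work as one vector iteration, and compare it with the vector
loop's estimate.  The types, issue rates and operation counts below are
invented for illustration; only the shape of the calculation follows the
patch.

// Illustrative sketch only: simplified stand-ins for aarch64_vec_op_count
// and the per-core issue information, with made-up numbers.
#include <algorithm>
#include <cstdio>

struct op_count
{
  // Rough per-iteration operation counts (simplified for the sketch).
  double general_ops;
  double loads_stores;
  unsigned int reduction_latency;
};

struct issue_rates
{
  // Operations the core can issue per cycle (simplified for the sketch).
  double general_ops_per_cycle;
  double loads_stores_per_cycle;
};

// Analogous in spirit to aarch64_estimate_min_cycles_per_iter: the loop
// cannot run faster than its most heavily used issue resource allows,
// and a loop-carried reduction imposes its own lower bound.
static double
min_cycles_per_iter (const op_count &ops, const issue_rates &rates)
{
  double cycles = std::max (ops.general_ops / rates.general_ops_per_cycle,
			    ops.loads_stores / rates.loads_stores_per_cycle);
  return std::max (cycles, double (ops.reduction_latency));
}

int
main ()
{
  // Hypothetical scalar loop: 4 general ops + 2 memory ops per iteration.
  op_count scalar_ops = { 4.0, 2.0, 1 };
  issue_rates scalar_rates = { 3.0, 2.0 };

  // Hypothetical vector loop doing VF = 4 scalar iterations per iteration.
  op_count vector_ops = { 5.0, 2.0, 2 };
  issue_rates vector_rates = { 2.0, 2.0 };
  unsigned int estimated_vf = 4;

  // Mirror the patch's approach: take the scalar cycles per iteration from
  // the scalar cost record, then multiply by the estimated VF before
  // comparing against the vector loop's cycles per iteration.
  double scalar_cycles_per_iter
    = min_cycles_per_iter (scalar_ops, scalar_rates) * estimated_vf;
  double vector_cycles_per_iter
    = min_cycles_per_iter (vector_ops, vector_rates);

  printf ("scalar: %.2f cycles per vector iteration (VF %u)\n",
	  scalar_cycles_per_iter, estimated_vf);
  printf ("vector: %.2f cycles per iteration\n", vector_cycles_per_iter);
  printf ("vectorization looks %s\n",
	  vector_cycles_per_iter < scalar_cycles_per_iter
	  ? "profitable" : "unprofitable");
  return 0;
}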