From 1205a8cadb6bd41cdf5b13d7aca8fb44332002e5 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Fri, 26 Mar 2021 16:08:38 +0000 Subject: [PATCH] aarch64: Take issue rate into account for vector loop costs When SVE is enabled, GCC needs to do a three-way comparison between scalar, Advanced SIMD and SVE code. The normal costs tend to be latency-based, which is well-suited to SLP. However, comparing sums of latency costs means that we effectively treat the code as executing sequentially. This can hide the effect of pipeline bubbles or resource contention that in practice are quite important for loop vectorisation. This is particularly true for loops that involve reductions. This patch therefore tries to estimate how quickly each piece of code could issue, using a very (very) simplistic model. It then uses this to adjust the loop vector costs up or down as appropriate. Part of the Advanced SIMD vs. SVE adjustment is opt-in and is not enabled by default even for use_new_vector_costs. Like with the previous patches, this one only becomes active if a CPU selects use_new_vector_costs. It should therefore have a very low impact on other CPUs. The code also mostly ignores CPUs that have no issue information, even if use_new_vector_costs is enabled for some reason. gcc/ * config/aarch64/aarch64.opt (-param=aarch64-loop-vect-issue-rate-niters=): New parameter. * doc/invoke.texi: Document it. * config/aarch64/aarch64-protos.h (aarch64_base_vec_issue_info) (aarch64_scalar_vec_issue_info, aarch64_simd_vec_issue_info) (aarch64_advsimd_vec_issue_info, aarch64_sve_vec_issue_info) (aarch64_vec_issue_info): New structures. (cpu_vector_cost): Write comments above the variables rather than to the side. (cpu_vector_cost::issue_info): New member variable. * config/aarch64/aarch64.c: Include gimple-pretty-print.h and tree-ssa-loop-niter.h. (generic_vector_cost, a64fx_vector_cost, qdf24xx_vector_cost) (thunderx_vector_cost, tsv110_vector_cost, cortexa57_vector_cost) (exynosm1_vector_cost, xgene1_vector_cost, thunderx2t99_vector_cost) (thunderx3t110_vector_cost): Initialize issue_info to null. (neoversev1_scalar_issue_info, neoversev1_advsimd_issue_info) (neoversev1_sve_issue_info, neoversev1_vec_issue_info): New structures. (neoversev1_vector_cost): Use them. (aarch64_vec_op_count, aarch64_sve_op_count): New structures. (aarch64_vector_costs::saw_sve_only_op): New member variable. (aarch64_vector_costs::num_vector_iterations): Likewise. (aarch64_vector_costs::scalar_ops): Likewise. (aarch64_vector_costs::advsimd_ops): Likewise. (aarch64_vector_costs::sve_ops): Likewise. (aarch64_vector_costs::seen_loads): Likewise. (aarch64_simd_vec_costs_for_flags): New function. (aarch64_analyze_loop_vinfo): Initialize num_vector_iterations. Count the number of predicate operations required by SVE WHILE instructions. (aarch64_comparison_type, aarch64_multiply_add_p): New functions. (aarch64_sve_only_stmt_p, aarch64_in_loop_reduction_latency): Likewise. (aarch64_count_ops): Likewise. (aarch64_add_stmt_cost): Record whether we see an SVE operation that cannot currently be implemented using Advanced SIMD. Record issue information about the scalar, Advanced SIMD and (where relevant) SVE versions of a loop. (aarch64_vec_op_count::dump): New function. (aarch64_sve_op_count::dump): Likewise. (aarch64_estimate_min_cycles_per_iter): Likewise. (aarch64_adjust_body_cost): If issue information is available, try to compare the issue rates of the various loop implementations and increase or decrease the vector body cost accordingly.
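For illustration only, not part of the patch itself: the issue model that aarch64_estimate_min_cycles_per_iter implements below boils down to taking the maximum of a few per-resource throughput limits. The following self-contained C sketch mirrors that calculation; the struct layouts and the example operation counts are simplified assumptions, and the issue figures are the Neoverse V1 SVE numbers used in the patch.

  /* Standalone sketch of the per-iteration issue estimate.  */
  #include <stdio.h>

  #define CEIL(a, b) (((a) + (b) - 1) / (b))
  #define MAX(a, b) ((a) > (b) ? (a) : (b))

  /* Simplified stand-ins for aarch64_base_vec_issue_info and
     aarch64_vec_op_count.  */
  struct issue_limits
  {
    unsigned int loads_stores_per_cycle;
    unsigned int stores_per_cycle;
    unsigned int general_ops_per_cycle;
  };

  struct op_counts
  {
    unsigned int loads, stores, general_ops, reduction_latency;
  };

  /* Each resource imposes a lower bound on the cycles per iteration;
     the estimate is the maximum of those bounds.  */
  static unsigned int
  min_cycles_per_iter (struct op_counts ops, struct issue_limits info)
  {
    unsigned int cycles = MAX (ops.reduction_latency, 1);
    cycles = MAX (cycles, CEIL (ops.stores, info.stores_per_cycle));
    cycles = MAX (cycles, CEIL (ops.loads + ops.stores,
                                info.loads_stores_per_cycle));
    cycles = MAX (cycles, CEIL (ops.general_ops, info.general_ops_per_cycle));
    return cycles;
  }

  int
  main (void)
  {
    /* Neoverse V1 SVE: 2 loads/stores, 2 stores and 2 general ops per cycle.  */
    struct issue_limits v1_sve = { 2, 2, 2 };
    /* Hypothetical loop body: 4 loads, 2 stores, 6 general ops and no
       loop-carried reduction.  */
    struct op_counts ops = { 4, 2, 6, 0 };
    /* Prints 3: CEIL (4 + 2, 2) for the load/store pipes and CEIL (6, 2)
       for the general pipes both give 3 cycles.  */
    printf ("estimated cycles per iteration = %u\n",
            min_cycles_per_iter (ops, v1_sve));
    return 0;
  }

The patch applies the same calculation to the scalar, Advanced SIMD and SVE estimates, and for SVE it additionally bounds the issue rate by the number of predicate operations.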
--- gcc/config/aarch64/aarch64-protos.h | 178 +++++++- gcc/config/aarch64/aarch64.c | 798 +++++++++++++++++++++++++++++++++++- gcc/config/aarch64/aarch64.opt | 3 + gcc/doc/invoke.texi | 8 + 4 files changed, 966 insertions(+), 21 deletions(-) diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index 2ffa96e..ca1ed9e 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -281,19 +281,177 @@ struct sve_vec_cost : simd_vec_cost const int scatter_store_elt_cost; }; +/* Base information about how the CPU issues code, containing + information that is relevant to scalar, Advanced SIMD and SVE + operations. + + The structure uses the general term "operation" to refer to + whichever subdivision of an instruction makes sense for the CPU. + These operations would typically be micro operations or macro + operations. + + Note that this structure and the ones derived from it are only + as general as they need to be for the CPUs that currently use them. + They will probably need to be extended or refined as more CPUs are + added. */ +struct aarch64_base_vec_issue_info +{ + /* How many loads and stores can be issued per cycle. */ + const unsigned int loads_stores_per_cycle; + + /* How many stores can be issued per cycle. */ + const unsigned int stores_per_cycle; + + /* How many integer or FP/SIMD operations can be issued per cycle. + + Currently we don't try to distinguish the two. For vector code, + we only really track FP/SIMD operations during vector costing; + we don't for example try to cost arithmetic operations like + address calculations, which are only decided later during ivopts. + + For scalar code, we effectively assume that code operates entirely + on integers or entirely on floating-point values. Again, we don't + try to take address calculations into account. + + This is not very precise, but it's only meant to be a heuristic. + We could certainly try to do better in future if there's an example + of something that would benefit. */ + const unsigned int general_ops_per_cycle; + + /* How many FP/SIMD operations to count for a floating-point or + vector load operation. + + When constructing an Advanced SIMD vector from elements that have + been loaded from memory, these values apply to each individual load. + When using an SVE gather load, the values apply to each element of + the gather. */ + const unsigned int fp_simd_load_general_ops; + + /* How many FP/SIMD operations to count for a floating-point or + vector store operation. + + When storing individual elements of an Advanced SIMD vector out to + memory, these values apply to each individual store. When using an + SVE scatter store, these values apply to each element of the scatter. */ + const unsigned int fp_simd_store_general_ops; +}; + +using aarch64_scalar_vec_issue_info = aarch64_base_vec_issue_info; + +/* Base information about the issue stage for vector operations. + This structure contains information that is relevant to both + Advanced SIMD and SVE. 
*/ +struct aarch64_simd_vec_issue_info : aarch64_base_vec_issue_info +{ + constexpr aarch64_simd_vec_issue_info (aarch64_base_vec_issue_info base, + unsigned int ld2_st2_general_ops, + unsigned int ld3_st3_general_ops, + unsigned int ld4_st4_general_ops) + : aarch64_base_vec_issue_info (base), + ld2_st2_general_ops (ld2_st2_general_ops), + ld3_st3_general_ops (ld3_st3_general_ops), + ld4_st4_general_ops (ld4_st4_general_ops) + {} + + /* How many FP/SIMD operations to count for each vector loaded or + stored by an LD[234] or ST[234] operation, in addition to the + base costs given in the parent class. For example, the full + number of operations for an LD3 would be: + + load ops: 3 + general ops: 3 * (fp_simd_load_general_ops + ld3_st3_general_ops). */ + const unsigned int ld2_st2_general_ops; + const unsigned int ld3_st3_general_ops; + const unsigned int ld4_st4_general_ops; +}; + +using aarch64_advsimd_vec_issue_info = aarch64_simd_vec_issue_info; + +/* Information about the issue stage for SVE. The main thing this adds + is a concept of "predicate operations". */ +struct aarch64_sve_vec_issue_info : aarch64_simd_vec_issue_info +{ + constexpr aarch64_sve_vec_issue_info + (aarch64_simd_vec_issue_info base, + unsigned int pred_ops_per_cycle, + unsigned int while_pred_ops, + unsigned int int_cmp_pred_ops, + unsigned int fp_cmp_pred_ops, + unsigned int gather_scatter_pair_general_ops, + unsigned int gather_scatter_pair_pred_ops) + : aarch64_simd_vec_issue_info (base), + pred_ops_per_cycle (pred_ops_per_cycle), + while_pred_ops (while_pred_ops), + int_cmp_pred_ops (int_cmp_pred_ops), + fp_cmp_pred_ops (fp_cmp_pred_ops), + gather_scatter_pair_general_ops (gather_scatter_pair_general_ops), + gather_scatter_pair_pred_ops (gather_scatter_pair_pred_ops) + {} + + /* How many predicate operations can be issued per cycle. */ + const unsigned int pred_ops_per_cycle; + + /* How many predicate operations are generated by a WHILExx + instruction. */ + const unsigned int while_pred_ops; + + /* How many predicate operations are generated by an integer + comparison instruction. */ + const unsigned int int_cmp_pred_ops; + + /* How many predicate operations are generated by a floating-point + comparison instruction. */ + const unsigned int fp_cmp_pred_ops; + + /* How many general and predicate operations are generated by each pair + of elements in a gather load or scatter store. These values apply + on top of the per-element counts recorded in fp_simd_load_general_ops + and fp_simd_store_general_ops. + + The reason for using pairs is that that is the largest possible + granule size for 128-bit SVE, which can load and store 2 64-bit + elements or 4 32-bit elements. */ + const unsigned int gather_scatter_pair_general_ops; + const unsigned int gather_scatter_pair_pred_ops; +}; + +/* Information related to instruction issue for a particular CPU. */ +struct aarch64_vec_issue_info +{ + const aarch64_base_vec_issue_info *const scalar; + const aarch64_simd_vec_issue_info *const advsimd; + const aarch64_sve_vec_issue_info *const sve; +}; + /* Cost for vector insn classes. */ struct cpu_vector_cost { - const int scalar_int_stmt_cost; /* Cost of any int scalar operation, - excluding load and store. */ - const int scalar_fp_stmt_cost; /* Cost of any fp scalar operation, - excluding load and store. */ - const int scalar_load_cost; /* Cost of scalar load. */ - const int scalar_store_cost; /* Cost of scalar store. */ - const int cond_taken_branch_cost; /* Cost of taken branch. 
*/ - const int cond_not_taken_branch_cost; /* Cost of not taken branch. */ - const advsimd_vec_cost *advsimd; /* Cost of Advanced SIMD operations. */ - const sve_vec_cost *sve; /* Cost of SVE operations. */ + /* Cost of any integer scalar operation, excluding load and store. */ + const int scalar_int_stmt_cost; + + /* Cost of any fp scalar operation, excluding load and store. */ + const int scalar_fp_stmt_cost; + + /* Cost of a scalar load. */ + const int scalar_load_cost; + + /* Cost of a scalar store. */ + const int scalar_store_cost; + + /* Cost of a taken branch. */ + const int cond_taken_branch_cost; + + /* Cost of a not-taken branch. */ + const int cond_not_taken_branch_cost; + + /* Cost of Advanced SIMD operations. */ + const advsimd_vec_cost *advsimd; + + /* Cost of SVE operations, or null if SVE is not implemented. */ + const sve_vec_cost *sve; + + /* Issue information, or null if none is provided. */ + const aarch64_vec_issue_info *const issue_info; }; /* Branch costs. */ diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 6d18d82..6d961be 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -74,6 +74,8 @@ #include "intl.h" #include "expmed.h" #include "function-abi.h" +#include "gimple-pretty-print.h" +#include "tree-ssa-loop-niter.h" /* This file should be included last. */ #include "target-def.h" @@ -652,7 +654,8 @@ static const struct cpu_vector_cost generic_vector_cost = 3, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &generic_advsimd_vector_cost, /* advsimd */ - &generic_sve_vector_cost /* sve */ + &generic_sve_vector_cost, /* sve */ + nullptr /* issue_info */ }; static const advsimd_vec_cost a64fx_advsimd_vector_cost = @@ -719,7 +722,8 @@ static const struct cpu_vector_cost a64fx_vector_cost = 3, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &a64fx_advsimd_vector_cost, /* advsimd */ - &a64fx_sve_vector_cost /* sve */ + &a64fx_sve_vector_cost, /* sve */ + nullptr /* issue_info */ }; static const advsimd_vec_cost qdf24xx_advsimd_vector_cost = @@ -756,7 +760,8 @@ static const struct cpu_vector_cost qdf24xx_vector_cost = 3, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &qdf24xx_advsimd_vector_cost, /* advsimd */ - NULL /* sve */ + nullptr, /* sve */ + nullptr /* issue_info */ }; @@ -794,7 +799,8 @@ static const struct cpu_vector_cost thunderx_vector_cost = 3, /* cond_taken_branch_cost */ 3, /* cond_not_taken_branch_cost */ &thunderx_advsimd_vector_cost, /* advsimd */ - NULL /* sve */ + nullptr, /* sve */ + nullptr /* issue_info */ }; static const advsimd_vec_cost tsv110_advsimd_vector_cost = @@ -830,7 +836,8 @@ static const struct cpu_vector_cost tsv110_vector_cost = 1, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &tsv110_advsimd_vector_cost, /* advsimd */ - NULL, /* sve */ + nullptr, /* sve */ + nullptr /* issue_info */ }; static const advsimd_vec_cost cortexa57_advsimd_vector_cost = @@ -867,7 +874,8 @@ static const struct cpu_vector_cost cortexa57_vector_cost = 1, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &cortexa57_advsimd_vector_cost, /* advsimd */ - NULL /* sve */ + nullptr, /* sve */ + nullptr /* issue_info */ }; static const advsimd_vec_cost exynosm1_advsimd_vector_cost = @@ -903,7 +911,8 @@ static const struct cpu_vector_cost exynosm1_vector_cost = 1, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &exynosm1_advsimd_vector_cost, /* advsimd */ - NULL /* sve */ + nullptr, /* sve */ + nullptr /*
issue_info */ }; static const advsimd_vec_cost xgene1_advsimd_vector_cost = @@ -940,7 +949,8 @@ static const struct cpu_vector_cost xgene1_vector_cost = 2, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &xgene1_advsimd_vector_cost, /* advsimd */ - NULL /* sve */ + nullptr, /* sve */ + nullptr /* issue_info */ }; static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost = @@ -977,7 +987,8 @@ static const struct cpu_vector_cost thunderx2t99_vector_cost = 2, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &thunderx2t99_advsimd_vector_cost, /* advsimd */ - NULL /* sve */ + nullptr, /* sve */ + nullptr /* issue_info */ }; static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost = @@ -1013,7 +1024,8 @@ static const struct cpu_vector_cost thunderx3t110_vector_cost = 2, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &thunderx3t110_advsimd_vector_cost, /* advsimd */ - NULL /* sve */ + nullptr, /* sve */ + nullptr /* issue_info */ }; @@ -1696,6 +1708,58 @@ static const sve_vec_cost neoversev1_sve_vector_cost = 3 /* scatter_store_elt_cost */ }; +static const aarch64_scalar_vec_issue_info neoversev1_scalar_issue_info = +{ + 3, /* loads_stores_per_cycle */ + 2, /* stores_per_cycle */ + 4, /* general_ops_per_cycle */ + 0, /* fp_simd_load_general_ops */ + 1 /* fp_simd_store_general_ops */ +}; + +static const aarch64_advsimd_vec_issue_info neoversev1_advsimd_issue_info = +{ + { + 3, /* loads_stores_per_cycle */ + 2, /* stores_per_cycle */ + 4, /* general_ops_per_cycle */ + 0, /* fp_simd_load_general_ops */ + 1 /* fp_simd_store_general_ops */ + }, + 2, /* ld2_st2_general_ops */ + 2, /* ld3_st3_general_ops */ + 3 /* ld4_st4_general_ops */ +}; + +static const aarch64_sve_vec_issue_info neoversev1_sve_issue_info = +{ + { + { + 2, /* loads_per_cycle */ + 2, /* stores_per_cycle */ + 2, /* general_ops_per_cycle */ + 0, /* fp_simd_load_general_ops */ + 1 /* fp_simd_store_general_ops */ + }, + 2, /* ld2_st2_general_ops */ + 2, /* ld3_st3_general_ops */ + 3 /* ld4_st4_general_ops */ + }, + 1, /* pred_ops_per_cycle */ + 2, /* while_pred_ops */ + 2, /* int_cmp_pred_ops */ + 1, /* fp_cmp_pred_ops */ + 1, /* gather_scatter_pair_general_ops */ + 1 /* gather_scatter_pair_pred_ops */ +}; + +static const aarch64_vec_issue_info neoversev1_vec_issue_info = +{ + &neoversev1_scalar_issue_info, + &neoversev1_advsimd_issue_info, + &neoversev1_sve_issue_info +}; + /* Neoverse V1 costs for vector insn classes. */ static const struct cpu_vector_cost neoversev1_vector_cost = { @@ -1706,7 +1770,8 @@ static const struct cpu_vector_cost neoversev1_vector_cost = 1, /* cond_taken_branch_cost */ 1, /* cond_not_taken_branch_cost */ &neoversev1_advsimd_vector_cost, /* advsimd */ - &neoversev1_sve_vector_cost /* sve */ + &neoversev1_sve_vector_cost, /* sve */ + &neoversev1_vec_issue_info /* issue_info */ }; static const struct tune_params neoversev1_tunings = @@ -14120,6 +14185,38 @@ aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn, /* Vectorizer cost model target hooks. */ +/* Information about how the CPU would issue the scalar, Advanced SIMD + or SVE version of a vector loop, using the scheme defined by the + aarch64_base_vec_issue_info hierarchy of structures. */ +struct aarch64_vec_op_count +{ + void dump () const; + + /* The number of individual "general" operations. See the comments + in aarch64_base_vec_issue_info for details. */ + unsigned int general_ops = 0; + + /* The number of load and store operations, under the same scheme + as above. 
*/ + unsigned int loads = 0; + unsigned int stores = 0; + + /* The minimum number of cycles needed to execute all loop-carried + operations, which in the vector code become associated with + reductions. */ + unsigned int reduction_latency = 0; +}; + +/* Extends aarch64_vec_op_count with SVE-specific information. */ +struct aarch64_sve_op_count : aarch64_vec_op_count +{ + void dump () const; + + /* The number of individual predicate operations. See the comments + in aarch64_sve_vec_issue_info for details. */ + unsigned int pred_ops = 0; +}; + /* Information about vector code that we're in the process of costing. */ struct aarch64_vector_costs { @@ -14138,6 +14235,10 @@ struct aarch64_vector_costs vectorization. */ bool is_loop = false; + /* True if we've seen an SVE operation that we cannot currently vectorize + using Advanced SIMD. */ + bool saw_sve_only_op = false; + /* - If VEC_FLAGS is zero then we're costing the original scalar code. - If VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced SIMD code. @@ -14159,6 +14260,32 @@ struct aarch64_vector_costs the heuristic. */ unsigned HOST_WIDE_INT unrolled_advsimd_niters = 0; unsigned HOST_WIDE_INT unrolled_advsimd_stmts = 0; + + /* If we're vectorizing a loop that executes a constant number of times, + this variable gives the number of times that the vector loop would + iterate, otherwise it is zero. */ + uint64_t num_vector_iterations = 0; + + /* Used only when vectorizing loops. Estimates the number and kind of scalar + operations that would be needed to perform the same work as one iteration + of the vector loop. */ + aarch64_vec_op_count scalar_ops; + + /* Used only when vectorizing loops. If VEC_FLAGS & VEC_ADVSIMD, + this structure estimates the number and kind of operations that the + vector loop would contain. If VEC_FLAGS & VEC_SVE, the structure + estimates what the equivalent Advanced SIMD-only code would need in + order to perform the same work as one iteration of the SVE loop. */ + aarch64_vec_op_count advsimd_ops; + + /* Used only when vectorizing loops with SVE. It estimates the number and + kind of operations that the SVE loop would contain. */ + aarch64_sve_op_count sve_ops; + + /* Used to detect cases in which we end up costing the same load twice, + once to account for results that are actually used and once to account + for unused results. */ + hash_map<nofree_ptr_hash<_stmt_vec_info>, unsigned int> seen_loads; }; /* Implement TARGET_VECTORIZE_INIT_COST. */ @@ -14190,6 +14317,16 @@ aarch64_simd_vec_costs (tree vectype) return costs->advsimd; } +/* Return the appropriate SIMD costs for vectors with VEC_* flags FLAGS. */ +static const simd_vec_cost * +aarch64_simd_vec_costs_for_flags (unsigned int flags) +{ + const cpu_vector_cost *costs = aarch64_tune_params.vec_costs; + if ((flags & VEC_ANY_SVE) && costs->sve) + return costs->sve; + return costs->advsimd; +} + /* Decide whether to use the unrolling heuristic described above aarch64_vector_costs::unrolled_advsimd_niters, updating that field if so. LOOP_VINFO describes the loop that we're vectorizing @@ -14250,6 +14387,19 @@ aarch64_analyze_loop_vinfo (loop_vec_info loop_vinfo, { costs->is_loop = true; + /* Record the number of times that the vector loop would execute, + if known.
*/ + class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + auto scalar_niters = max_stmt_executions_int (loop); + if (scalar_niters >= 0) + { + unsigned int vf = vect_vf_for_cost (loop_vinfo); + if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ()) + costs->num_vector_iterations = scalar_niters / vf; + else + costs->num_vector_iterations = CEIL (scalar_niters, vf); + } + /* Detect whether we're costing the scalar code or the vector code. This is a bit hacky: it would be better if the vectorizer told us directly. @@ -14265,6 +14415,20 @@ aarch64_analyze_loop_vinfo (loop_vec_info loop_vinfo, apply the unrolling heuristic described above aarch64_vector_costs::unrolled_advsimd_niters. */ aarch64_record_potential_advsimd_unrolling (loop_vinfo, costs); + + /* Record the issue information for any SVE WHILE instructions that the + loop needs. */ + auto *issue_info = aarch64_tune_params.vec_costs->issue_info; + if (issue_info && issue_info->sve && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()) + { + unsigned int num_masks = 0; + rgroup_controls *rgm; + unsigned int num_vectors_m1; + FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm) + if (rgm->type) + num_masks += num_vectors_m1 + 1; + costs->sve_ops.pred_ops += num_masks * issue_info->sve->while_pred_ops; + } } /* Do one-time initialization of COSTS given that we're costing the block @@ -14407,6 +14571,17 @@ aarch64_embedded_comparison_type (stmt_vec_info stmt_info) return NULL_TREE; } +/* If STMT_INFO is a comparison or contains an embedded comparison, return the + scalar type of the values being compared. Return null otherwise. */ +static tree +aarch64_comparison_type (stmt_vec_info stmt_info) +{ + if (auto *assign = dyn_cast <gassign *> (stmt_info->stmt)) + if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison) + return TREE_TYPE (gimple_assign_rhs1 (assign)); + return aarch64_embedded_comparison_type (stmt_info); +} + /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD vectors would produce a series of LDP or STP operations. KIND is the kind of statement that STMT_INFO represents. */ @@ -14470,6 +14645,79 @@ aarch64_integer_truncation_p (stmt_vec_info stmt_info) && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type)); } +/* Return true if STMT_INFO is the second part of a two-statement multiply-add + or multiply-subtract sequence that might be suitable for fusing into a + single instruction. */ +static bool +aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info) +{ + gassign *assign = dyn_cast <gassign *> (stmt_info->stmt); + if (!assign) + return false; + tree_code code = gimple_assign_rhs_code (assign); + if (code != PLUS_EXPR && code != MINUS_EXPR) + return false; + + if (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign)) + || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign))) + return false; + + for (int i = 1; i < 3; ++i) + { + tree rhs = gimple_op (assign, i); + /* ??? Should we try to check for a single use as well? */ + if (TREE_CODE (rhs) != SSA_NAME) + continue; + + stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs); + if (!def_stmt_info + || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def) + continue; + gassign *rhs_assign = dyn_cast <gassign *> (def_stmt_info->stmt); + if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR) + continue; + + return true; + } + return false; +} + +/* Return true if the vectorized form of STMT_INFO is something that is only + possible when using SVE instead of Advanced SIMD. VECTYPE is the type of + the vector that STMT_INFO is operating on.
*/ +static bool +aarch64_sve_only_stmt_p (stmt_vec_info stmt_info, tree vectype) +{ + if (!aarch64_sve_mode_p (TYPE_MODE (vectype))) + return false; + + if (STMT_VINFO_DATA_REF (stmt_info)) + { + /* Check for true gathers and scatters (rather than just strided accesses + that we've chosen to implement using gathers and scatters). Although + in principle we could use elementwise accesses for Advanced SIMD, + the vectorizer doesn't yet support that. */ + if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) + return true; + + /* Check for masked loads and stores. */ + if (auto *call = dyn_cast <gcall *> (stmt_info->stmt)) + if (gimple_call_internal_p (call) + && internal_fn_mask_index (gimple_call_internal_fn (call)) >= 0) + return true; + } + + /* Check for 64-bit integer multiplications. */ + auto *assign = dyn_cast <gassign *> (stmt_info->stmt); + if (assign + && gimple_assign_rhs_code (assign) == MULT_EXPR + && GET_MODE_INNER (TYPE_MODE (vectype)) == DImode + && !integer_pow2p (gimple_assign_rhs2 (assign))) + return true; + + return false; +} + /* We are considering implementing STMT_INFO using SVE vector type VECTYPE. If STMT_INFO is an in-loop reduction that SVE supports directly, return its latency in cycles, otherwise return zero. SVE_COSTS specifies the @@ -14507,6 +14755,59 @@ aarch64_sve_in_loop_reduction_latency (vec_info *vinfo, return 0; } +/* STMT_INFO describes a loop-carried operation in the original scalar code + that we are considering implementing as a reduction. Return one of the + following values, depending on VEC_FLAGS: + + - If VEC_FLAGS is zero, return the loop carry latency of the original + scalar operation. + + - If VEC_FLAGS & VEC_ADVSIMD, return the loop carry latency of the + Advanced SIMD implementation. + + - If VEC_FLAGS & VEC_ANY_SVE, return the loop carry latency of the + SVE implementation. + + VECTYPE is the type of vector that the vectorizer is considering using + for STMT_INFO, which might be different from the type of vector described + by VEC_FLAGS. */ +static unsigned int +aarch64_in_loop_reduction_latency (vec_info *vinfo, stmt_vec_info stmt_info, + tree vectype, unsigned int vec_flags) +{ + const cpu_vector_cost *vec_costs = aarch64_tune_params.vec_costs; + const sve_vec_cost *sve_costs = nullptr; + if (vec_flags & VEC_ANY_SVE) + sve_costs = aarch64_tune_params.vec_costs->sve; + + /* If the caller is asking for the SVE latency, check for forms of reduction + that only SVE can handle directly. */ + if (sve_costs) + { + unsigned int latency + = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, vectype, + sve_costs); + if (latency) + return latency; + } + + /* Handle scalar costs. */ + if (vec_flags == 0) + { + if (FLOAT_TYPE_P (vectype)) + return vec_costs->scalar_fp_stmt_cost; + return vec_costs->scalar_int_stmt_cost; + } + + /* Otherwise, the loop body just contains normal integer or FP operations, + with a vector reduction outside the loop. */ + const simd_vec_cost *simd_costs + = aarch64_simd_vec_costs_for_flags (vec_flags); + if (FLOAT_TYPE_P (vectype)) + return simd_costs->fp_stmt_cost; + return simd_costs->int_stmt_cost; +} + /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost for STMT_INFO, which has cost kind KIND.
If this is a scalar operation, try to subdivide the target-independent categorization provided by KIND @@ -14729,6 +15030,203 @@ aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info, return stmt_cost; } +/* VINFO, COSTS, COUNT, KIND, STMT_INFO and VECTYPE are the same as for + TARGET_VECTORIZE_ADD_STMT_COST and they describe an operation in the + body of a vector loop. Record issue information relating to the vector + operation in OPS, where OPS is one of COSTS->scalar_ops, COSTS->advsimd_ops + or COSTS->sve_ops; see the comments above those variables for details. + In addition: + + - VEC_FLAGS is zero if OPS is COSTS->scalar_ops. + + - VEC_FLAGS & VEC_ADVSIMD is nonzero if OPS is COSTS->advsimd_ops. + + - VEC_FLAGS & VEC_ANY_SVE is nonzero if OPS is COSTS->sve_ops. + + ISSUE_INFO provides the scalar, Advanced SIMD or SVE issue information + associated with OPS and VEC_FLAGS. FACTOR says how many iterations of + the loop described by VEC_FLAGS would be needed to match one iteration + of the vector loop in VINFO. */ +static void +aarch64_count_ops (class vec_info *vinfo, aarch64_vector_costs *costs, + unsigned int count, enum vect_cost_for_stmt kind, + _stmt_vec_info *stmt_info, tree vectype, + unsigned int vec_flags, aarch64_vec_op_count *ops, + const aarch64_base_vec_issue_info *issue_info, + unsigned int factor) +{ + if (!issue_info) + return; + + const aarch64_simd_vec_issue_info *simd_issue = nullptr; + if (vec_flags) + simd_issue = static_cast<const aarch64_simd_vec_issue_info *> (issue_info); + + const aarch64_sve_vec_issue_info *sve_issue = nullptr; + if (vec_flags & VEC_ANY_SVE) + sve_issue = static_cast<const aarch64_sve_vec_issue_info *> (issue_info); + + /* Calculate the minimum cycles per iteration imposed by a reduction + operation. */ + if ((kind == vector_stmt || kind == vec_to_scalar) + && aarch64_is_reduction (stmt_info)) + { + unsigned int base + = aarch64_in_loop_reduction_latency (vinfo, stmt_info, vectype, + vec_flags); + if (aarch64_reduc_type (vinfo, stmt_info) == FOLD_LEFT_REDUCTION) + { + if (aarch64_sve_mode_p (TYPE_MODE (vectype))) + { + /* When costing an SVE FADDA, the vectorizer treats vec_to_scalar + as a single operation, whereas for Advanced SIMD it is a + per-element one. Increase the factor accordingly, both for + the reduction_latency calculation and for the op counting. */ + if (vec_flags & VEC_ADVSIMD) + factor = vect_nunits_for_cost (vectype); + } + else + /* An Advanced SIMD fold-left reduction is the same as a + scalar one and the vectorizer therefore treats vec_to_scalar + as a per-element cost. There is no extra factor to apply for + scalar code, either for reduction_latency or for the op + counting below. */ + factor = 1; + } + + /* ??? Ideally for vector code we'd do COUNT * FACTOR reductions in + parallel, but unfortunately that's not yet the case. */ + ops->reduction_latency = MAX (ops->reduction_latency, + base * count * factor); + } + + /* Assume that multiply-adds will become a single operation. */ + if (stmt_info && aarch64_multiply_add_p (vinfo, stmt_info)) + return; + + /* When costing scalar statements in vector code, the count already + includes the number of scalar elements in the vector, so we don't + need to apply the factor as well. */ + if (kind == scalar_load || kind == scalar_store || kind == scalar_stmt) + factor = 1; + + /* This can go negative with the load handling below. */ + int num_copies = count * factor; + + /* Count the basic operation cost associated with KIND.
*/ + switch (kind) + { + case cond_branch_taken: + case cond_branch_not_taken: + case vector_gather_load: + case vector_scatter_store: + /* We currently don't expect these to be used in a loop body. */ + break; + + case vec_perm: + case vec_promote_demote: + case vec_construct: + case vec_to_scalar: + case scalar_to_vec: + /* Assume that these operations have no overhead in the original + scalar code. */ + if (!vec_flags) + break; + /* Fallthrough. */ + case vector_stmt: + case scalar_stmt: + ops->general_ops += num_copies; + break; + + case scalar_load: + case vector_load: + case unaligned_load: + /* When costing scalars, detect cases in which we are called twice for + the same load. This happens for LD[234] operations if only some of + the results are used. The first time represents the cost of loading + the unused vectors, while the second time represents the cost of + loading the useful parts. Only the latter should count towards the + scalar costs. */ + if (stmt_info && !vec_flags) + { + bool existed = false; + unsigned int &prev_count + = costs->seen_loads.get_or_insert (stmt_info, &existed); + if (existed) + num_copies -= prev_count; + else + prev_count = num_copies; + } + ops->loads += num_copies; + if (vec_flags || FLOAT_TYPE_P (vectype)) + ops->general_ops += issue_info->fp_simd_load_general_ops * num_copies; + break; + + case vector_store: + case unaligned_store: + case scalar_store: + ops->stores += num_copies; + if (vec_flags || FLOAT_TYPE_P (vectype)) + ops->general_ops += issue_info->fp_simd_store_general_ops * num_copies; + break; + } + + /* Add any embedded comparison operations. */ + if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar) + && aarch64_embedded_comparison_type (stmt_info)) + ops->general_ops += num_copies; + + /* Detect COND_REDUCTIONs and things that would need to become + COND_REDUCTIONs if they were implemented using Advanced SIMD. + There are then two sets of VEC_COND_EXPRs, whereas so far we + have only accounted for one. */ + if (vec_flags && (kind == vector_stmt || kind == vec_to_scalar)) + { + int reduc_type = aarch64_reduc_type (vinfo, stmt_info); + if ((reduc_type == EXTRACT_LAST_REDUCTION && (vec_flags & VEC_ADVSIMD)) + || reduc_type == COND_REDUCTION) + ops->general_ops += num_copies; + } + + /* Count the predicate operations needed by an SVE comparison. */ + if (sve_issue && (kind == vector_stmt || kind == vec_to_scalar)) + if (tree type = aarch64_comparison_type (stmt_info)) + { + unsigned int base = (FLOAT_TYPE_P (type) + ? sve_issue->fp_cmp_pred_ops + : sve_issue->int_cmp_pred_ops); + costs->sve_ops.pred_ops += base * num_copies; + } + + /* Add any extra overhead associated with LD[234] and ST[234] operations. */ + if (simd_issue) + switch (aarch64_ld234_st234_vectors (kind, stmt_info)) + { + case 2: + ops->general_ops += simd_issue->ld2_st2_general_ops * num_copies; + break; + + case 3: + ops->general_ops += simd_issue->ld3_st3_general_ops * num_copies; + break; + + case 4: + ops->general_ops += simd_issue->ld4_st4_general_ops * num_copies; + break; + } + + /* Add any overhead associated with gather loads and scatter stores. */ + if (sve_issue + && (kind == scalar_load || kind == scalar_store) + && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER) + { + unsigned int pairs = CEIL (count, 2); + costs->sve_ops.pred_ops + += sve_issue->gather_scatter_pair_pred_ops * pairs; + ops->general_ops += sve_issue->gather_scatter_pair_general_ops * pairs; + } +} + /* Implement targetm.vectorize.add_stmt_cost. 
*/ static unsigned aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count, @@ -14760,6 +15258,9 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count, of just looking at KIND. */ if (stmt_info && aarch64_use_new_vector_costs_p ()) { + if (vectype && aarch64_sve_only_stmt_p (stmt_info, vectype)) + costs->saw_sve_only_op = true; + stmt_cost = aarch64_detect_scalar_stmt_subtype (vinfo, kind, stmt_info, stmt_cost); @@ -14781,6 +15282,44 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count, stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype, stmt_cost); + /* If we're recording a nonzero vector loop body cost, also estimate + the operations that would need to be issued by all relevant + implementations of the loop. */ + auto *issue_info = aarch64_tune_params.vec_costs->issue_info; + if (loop_vinfo + && issue_info + && costs->vec_flags + && where == vect_body + && vectype + && stmt_cost != 0) + { + /* Record estimates for the scalar code. */ + aarch64_count_ops (vinfo, costs, count, kind, stmt_info, vectype, + 0, &costs->scalar_ops, issue_info->scalar, + vect_nunits_for_cost (vectype)); + + if (aarch64_sve_mode_p (vinfo->vector_mode) && issue_info->sve) + { + /* Record estimates for a possible Advanced SIMD version + of the SVE code. */ + aarch64_count_ops (vinfo, costs, count, kind, stmt_info, + vectype, VEC_ADVSIMD, &costs->advsimd_ops, + issue_info->advsimd, + aarch64_estimated_sve_vq ()); + + /* Record estimates for the SVE code itself. */ + aarch64_count_ops (vinfo, costs, count, kind, stmt_info, + vectype, VEC_ANY_SVE, &costs->sve_ops, + issue_info->sve, 1); + } + else + /* Record estimates for the Advanced SIMD code. Treat SVE like + Advanced SIMD if the CPU has no specific SVE costs. */ + aarch64_count_ops (vinfo, costs, count, kind, stmt_info, + vectype, VEC_ADVSIMD, &costs->advsimd_ops, + issue_info->advsimd, 1); + } + /* If we're applying the SVE vs. Advanced SIMD unrolling heuristic, estimate the number of statements in the unrolled Advanced SIMD loop. For simplicity, we assume that one iteration of the @@ -14805,12 +15344,56 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count, return retval; } +/* Dump information about the structure. */ +void +aarch64_vec_op_count::dump () const +{ + dump_printf_loc (MSG_NOTE, vect_location, + " load operations = %d\n", loads); + dump_printf_loc (MSG_NOTE, vect_location, + " store operations = %d\n", stores); + dump_printf_loc (MSG_NOTE, vect_location, + " general operations = %d\n", general_ops); + dump_printf_loc (MSG_NOTE, vect_location, + " reduction latency = %d\n", reduction_latency); +} + +/* Dump information about the structure. */ +void +aarch64_sve_op_count::dump () const +{ + aarch64_vec_op_count::dump (); + dump_printf_loc (MSG_NOTE, vect_location, + " predicate operations = %d\n", pred_ops); +} + +/* Use ISSUE_INFO to estimate the minimum number of cycles needed to issue + the operations described by OPS. This is a very simplistic model!
*/ +static unsigned int +aarch64_estimate_min_cycles_per_iter + (const aarch64_vec_op_count *ops, + const aarch64_base_vec_issue_info *issue_info) +{ + unsigned int cycles = MAX (ops->reduction_latency, 1); + cycles = MAX (cycles, CEIL (ops->stores, issue_info->stores_per_cycle)); + cycles = MAX (cycles, CEIL (ops->loads + ops->stores, + issue_info->loads_stores_per_cycle)); + cycles = MAX (cycles, CEIL (ops->general_ops, + issue_info->general_ops_per_cycle)); + return cycles; +} + /* BODY_COST is the cost of a vector loop body recorded in COSTS. Adjust the cost as necessary and return the new cost. */ static unsigned int aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost) { unsigned int orig_body_cost = body_cost; + bool should_disparage = false; + + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Original vector body cost = %d\n", body_cost); if (costs->unrolled_advsimd_stmts) { @@ -14841,10 +15424,203 @@ aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost) "Increasing body cost to %d to account for" " unrolling\n", min_cost); body_cost = min_cost; + should_disparage = true; } } } + auto *issue_info = aarch64_tune_params.vec_costs->issue_info; + if (!issue_info) + return body_cost; + + unsigned int scalar_cycles_per_iter + = aarch64_estimate_min_cycles_per_iter (&costs->scalar_ops, + issue_info->scalar); + unsigned int advsimd_cycles_per_iter + = aarch64_estimate_min_cycles_per_iter (&costs->advsimd_ops, + issue_info->advsimd); + bool could_use_advsimd + = ((costs->vec_flags & VEC_ADVSIMD) + || (aarch64_autovec_preference != 2 + && (aarch64_tune_params.extra_tuning_flags + & AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT) + && !costs->saw_sve_only_op)); + + if (dump_enabled_p ()) + { + if (IN_RANGE (costs->num_vector_iterations, 0, 65536)) + dump_printf_loc (MSG_NOTE, vect_location, + "Vector loop iterates at most %wd times\n", + costs->num_vector_iterations); + dump_printf_loc (MSG_NOTE, vect_location, "Scalar issue estimate:\n"); + costs->scalar_ops.dump (); + dump_printf_loc (MSG_NOTE, vect_location, + " estimated cycles per iteration = %d\n", + scalar_cycles_per_iter); + if (could_use_advsimd) + { + dump_printf_loc (MSG_NOTE, vect_location, + "Advanced SIMD issue estimate:\n"); + costs->advsimd_ops.dump (); + dump_printf_loc (MSG_NOTE, vect_location, + " estimated cycles per iteration = %d\n", + advsimd_cycles_per_iter); + } + else + dump_printf_loc (MSG_NOTE, vect_location, + "Loop could not use Advanced SIMD\n"); + } + + uint64_t vector_cycles_per_iter = advsimd_cycles_per_iter; + unsigned int vector_reduction_latency = costs->advsimd_ops.reduction_latency; + if ((costs->vec_flags & VEC_ANY_SVE) && issue_info->sve) + { + /* Estimate the minimum number of cycles per iteration needed to issue + non-predicate operations. */ + unsigned int sve_cycles_per_iter + = aarch64_estimate_min_cycles_per_iter (&costs->sve_ops, + issue_info->sve); + + /* Separately estimate the minimum number of cycles per iteration needed + to issue the predicate operations. 
*/ + unsigned int pred_cycles_per_iter + = CEIL (costs->sve_ops.pred_ops, issue_info->sve->pred_ops_per_cycle); + + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n"); + costs->sve_ops.dump (); + dump_printf_loc (MSG_NOTE, vect_location, + " estimated cycles per iteration for non-predicate" + " operations = %d\n", sve_cycles_per_iter); + if (costs->sve_ops.pred_ops) + dump_printf_loc (MSG_NOTE, vect_location, " estimated cycles per" + " iteration for predicate operations = %d\n", + pred_cycles_per_iter); + } + + vector_cycles_per_iter = MAX (sve_cycles_per_iter, pred_cycles_per_iter); + vector_reduction_latency = costs->sve_ops.reduction_latency; + + /* If the scalar version of the loop could issue at least as + quickly as the predicate parts of the SVE loop, make the SVE loop + prohibitively expensive. In this case vectorization is adding an + overhead that the original scalar code didn't have. + + This is mostly intended to detect cases in which WHILELOs dominate + for very tight loops, which is something that normal latency-based + costs would not model. Adding this kind of cliffedge would be + too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter; + code later in the function handles that case in a more + conservative way. */ + uint64_t sve_estimate = pred_cycles_per_iter + 1; + if (scalar_cycles_per_iter < sve_estimate) + { + unsigned int min_cost + = orig_body_cost * estimated_poly_value (BYTES_PER_SVE_VECTOR); + if (body_cost < min_cost) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Increasing body cost to %d because the" + " scalar code could issue within the limit" + " imposed by predicate operations\n", + min_cost); + body_cost = min_cost; + should_disparage = true; + } + } + + /* If it appears that the Advanced SIMD version of a loop could issue + more quickly than the SVE one, increase the SVE cost in proportion + to the difference. The intention is to make Advanced SIMD preferable + in cases where an Advanced SIMD version exists, without increasing + the costs so much that SVE won't be used at all. + + The reasoning is similar to the scalar vs. predicate comparison above: + if the issue rate of the SVE code is limited by predicate operations + (i.e. if pred_cycles_per_iter > sve_cycles_per_iter), and if the + Advanced SIMD code could issue within the limit imposed by the + predicate operations, the predicate operations are adding an + overhead that the original code didn't have and so we should prefer + the Advanced SIMD version. However, if the predicate operations + do not dominate in this way, we should only increase the cost of + the SVE code if sve_cycles_per_iter is strictly greater than + advsimd_cycles_per_iter. Given rounding effects, this should mean + that Advanced SIMD is either better or at least no worse. */ + if (sve_cycles_per_iter >= pred_cycles_per_iter) + sve_estimate = sve_cycles_per_iter; + if (could_use_advsimd && advsimd_cycles_per_iter < sve_estimate) + { + /* This ensures that min_cost > orig_body_cost * 2. */ + unsigned int min_cost + = orig_body_cost * CEIL (sve_estimate, advsimd_cycles_per_iter) + 1; + if (body_cost < min_cost) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Increasing body cost to %d because Advanced" + " SIMD code could issue as quickly\n", + min_cost); + body_cost = min_cost; + should_disparage = true; + } + } + } + + /* Decide whether to stick to latency-based costs or whether to try to + take issue rates into account. 
*/ + unsigned int threshold = aarch64_loop_vect_issue_rate_niters; + if (costs->vec_flags & VEC_ANY_SVE) + threshold = CEIL (threshold, aarch64_estimated_sve_vq ()); + + if (costs->num_vector_iterations >= 1 + && costs->num_vector_iterations < threshold) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Low iteration count, so using pure latency" + " costs\n"); + } + /* Increase the cost of the vector code if it looks like the scalar code + could issue more quickly. These values are only rough estimates, + so minor differences should only result in minor changes. */ + else if (scalar_cycles_per_iter < vector_cycles_per_iter) + { + body_cost = CEIL (body_cost * vector_cycles_per_iter, + scalar_cycles_per_iter); + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Increasing body cost to %d because scalar code" + " would issue more quickly\n", body_cost); + } + /* In general, it's expected that the proposed vector code would be able + to issue more quickly than the original scalar code. This should + already be reflected to some extent in the latency-based costs. + + However, the latency-based costs effectively assume that the scalar + code and the vector code execute serially, which tends to underplay + one important case: if the real (non-serialized) execution time of + a scalar iteration is dominated by loop-carried dependencies, + and if the vector code is able to reduce both the length of + the loop-carried dependencies *and* the number of cycles needed + to issue the code in general, we can be more confident that the + vector code is an improvement, even if adding the other (non-loop-carried) + latencies tends to hide this saving. We therefore reduce the cost of the + vector loop body in proportion to the saving. */ + else if (costs->scalar_ops.reduction_latency > vector_reduction_latency + && costs->scalar_ops.reduction_latency == scalar_cycles_per_iter + && scalar_cycles_per_iter > vector_cycles_per_iter + && !should_disparage) + { + body_cost = CEIL (body_cost * vector_cycles_per_iter, + scalar_cycles_per_iter); + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Decreasing body cost to %d to account for smaller" + " reduction latency\n", body_cost); + } + return body_cost; } diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt index 91e5c61..32191cf 100644 --- a/gcc/config/aarch64/aarch64.opt +++ b/gcc/config/aarch64/aarch64.opt @@ -277,3 +277,6 @@ The number of Newton iterations for calculating the reciprocal for double type. -param=aarch64-autovec-preference= Target Joined UInteger Var(aarch64_autovec_preference) Init(0) IntegerRange(0, 4) Param + +-param=aarch64-loop-vect-issue-rate-niters= +Target Joined UInteger Var(aarch64_loop_vect_issue_rate_niters) Init(6) IntegerRange(0, 65536) Param diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index ca204c7..7f12313 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -14417,6 +14417,14 @@ Use both Advanced SIMD and SVE. Prefer SVE when the costs are deemed equal. @end table The default value is 0. +@item aarch64-loop-vect-issue-rate-niters +The tuning for some AArch64 CPUs tries to take both latencies and issue +rates into account when deciding whether a loop should be vectorized +using SVE, vectorized using Advanced SIMD, or not vectorized at all. +If this parameter is set to @var{n}, GCC will not use this heuristic +for loops that are known to execute in fewer than @var{n} Advanced +SIMD iterations. + @end table @end table -- 2.7.4
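(Usage note, not part of the patch: on a CPU whose tuning provides issue information, such as Neoverse V1 in this series, the new parameter can be varied per compilation to experiment with the heuristic. For example, a hypothetical invocation such as

  gcc -O3 -mcpu=neoverse-v1 --param=aarch64-loop-vect-issue-rate-niters=0 -c foo.c

where foo.c is just a placeholder, sets the threshold to 0 so that the issue-rate comparison applies to every vectorized loop, while values larger than the default of 6 make GCC fall back to pure latency-based costs for more loops whose iteration counts are known to be small.)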