unsigned int adjust_body_cost (loop_vec_info, const aarch64_vector_costs *,
unsigned int);
bool prefer_unrolled_loop () const;
+ unsigned int determine_suggested_unroll_factor ();
/* True if we have performed one-time initialization based on the
vec_info. */
bool m_analyzed_vinfo = false;
+ /* This loop uses an average operation that is not supported by SVE, but is
+ supported by Advanced SIMD and SVE2. */
+ bool m_has_avg = false;
+
/* - If M_VEC_FLAGS is zero then we're costing the original scalar code.
- If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
SIMD code.
as one iteration of the SVE loop. */
if (where == vect_body && m_unrolled_advsimd_niters)
m_unrolled_advsimd_stmts += count * m_unrolled_advsimd_niters;
+
+ /* Detect the use of an averaging operation. */
+ gimple *stmt = stmt_info->stmt;
+ if (is_gimple_call (stmt)
+ && gimple_call_internal_p (stmt))
+ {
+ switch (gimple_call_internal_fn (stmt))
+ {
+ case IFN_AVG_FLOOR:
+ case IFN_AVG_CEIL:
+ m_has_avg = true;
+ default:
+ break;
+ }
+ }
}
return record_stmt_cost (stmt_info, where, (count * stmt_cost).ceil ());
}
return sve_cycles_per_iter;
}
+/* Return the unroll factor to suggest for the vector loop costed so far.
+   The result is always a power of two; 1 means "do not unroll".  */
+unsigned int
+aarch64_vector_costs::determine_suggested_unroll_factor ()
+{
+ bool sve = m_vec_flags & VEC_ANY_SVE;
+ /* If we are trying to unroll an Advanced SIMD main loop that contains
+ an averaging operation that we do not support with SVE and we might use a
+ predicated epilogue, we need to be conservative and block unrolling as
+ this might lead to a less optimal loop for the first and only epilogue
+ using the original loop's vectorization factor.
+ TODO: Remove this constraint when we add support for multiple epilogue
+ vectorization. */
+ if (!sve && !TARGET_SVE2 && m_has_avg)
+ return 1;
+
+ unsigned int max_unroll_factor = 1;
+ /* Each entry of m_ops describes one candidate issue scheme; keep the
+ largest unroll factor any of them can sustain.  */
+ for (auto vec_ops : m_ops)
+ {
+ aarch64_simd_vec_issue_info const *vec_issue
+ = vec_ops.simd_issue_info ();
+ /* No issue information for this scheme: nothing to base an unroll
+ decision on, so stay conservative.  */
+ if (!vec_issue)
+ return 1;
+ /* Limit unroll factor to a value adjustable by the user, the default
+ value is 4. */
+ unsigned int unroll_factor = aarch64_vect_unroll_limit;
+ /* FACTOR is the latency of a loop-carried reduction, or 1 if there is
+ no meaningful reduction latency to hide.  */
+ unsigned int factor
+ = vec_ops.reduction_latency > 1 ? vec_ops.reduction_latency : 1;
+ unsigned int temp;
+
+ /* Sanity check, this should never happen. */
+ if ((vec_ops.stores + vec_ops.loads + vec_ops.general_ops) == 0)
+ return 1;
+
+ /* For each resource below, roughly FACTOR * per_cycle / ops
+ iterations' worth of that resource can issue while one reduction
+ completes, so unrolling further cannot expose more parallelism
+ (presumed intent of the formula — confirm against the tuning
+ structures' documentation).  Take the minimum over all resources
+ the loop actually uses.  */
+
+ /* Check stores. */
+ if (vec_ops.stores > 0)
+ {
+ temp = CEIL (factor * vec_issue->stores_per_cycle,
+ vec_ops.stores);
+ unroll_factor = MIN (unroll_factor, temp);
+ }
+
+ /* Check loads + stores. */
+ if (vec_ops.loads > 0)
+ {
+ temp = CEIL (factor * vec_issue->loads_stores_per_cycle,
+ vec_ops.loads + vec_ops.stores);
+ unroll_factor = MIN (unroll_factor, temp);
+ }
+
+ /* Check general ops. */
+ if (vec_ops.general_ops > 0)
+ {
+ temp = CEIL (factor * vec_issue->general_ops_per_cycle,
+ vec_ops.general_ops);
+ unroll_factor = MIN (unroll_factor, temp);
+ }
+ max_unroll_factor = MAX (max_unroll_factor, unroll_factor);
+ }
+
+ /* Make sure unroll factor is power of 2. */
+ return 1 << ceil_log2 (max_unroll_factor);
+}
+
/* BODY_COST is the cost of a vector loop body. Adjust the cost as necessary
and return the new cost. */
unsigned int
if (loop_vinfo
&& m_vec_flags
&& aarch64_use_new_vector_costs_p ())
- m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
- m_costs[vect_body]);
+ {
+ m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
+ m_costs[vect_body]);
+ m_suggested_unroll_factor = determine_suggested_unroll_factor ();
+ }
/* Apply the heuristic described above m_stp_sequence_cost. Prefer
the scalar code in the event of a tie, since there is more chance