+2012-11-06 Jan Hubicka <jh@suse.cz>
+
+ * tree-vect-loop-manip.c (vect_do_peeling_for_loop_bound,
+ vect_do_peeling_for_alignment): Fix loop bound computation.
+ * tree-vect-loop.c (vect_transform_loop): Maintain loop bounds.
+
2012-11-06 Oleg Endo <olegendo@gcc.gnu.org>
PR target/54089
+2012-11-06 Jan Hubicka <jh@suse.cz>
+
+ * gcc.target/i386/l_fma_float_?.c: Update.
+ * gcc.target/i386/l_fma_double_?.c: Update.
+
2012-11-06 Oleg Endo <olegendo@gcc.gnu.org>
PR target/54089
/* { dg-final { scan-assembler-times "vfnmadd231pd" 4 } } */
/* { dg-final { scan-assembler-times "vfnmsub132pd" 4 } } */
/* { dg-final { scan-assembler-times "vfnmsub231pd" 4 } } */
-/* { dg-final { scan-assembler-times "vfmadd132sd" 20 } } */
-/* { dg-final { scan-assembler-times "vfmadd213sd" 20 } } */
-/* { dg-final { scan-assembler-times "vfmsub132sd" 20 } } */
-/* { dg-final { scan-assembler-times "vfmsub213sd" 20 } } */
-/* { dg-final { scan-assembler-times "vfnmadd132sd" 20 } } */
-/* { dg-final { scan-assembler-times "vfnmadd213sd" 20 } } */
-/* { dg-final { scan-assembler-times "vfnmsub132sd" 20 } } */
-/* { dg-final { scan-assembler-times "vfnmsub213sd" 20 } } */
+/* { dg-final { scan-assembler-times "vfmadd132sd" 16 } } */
+/* { dg-final { scan-assembler-times "vfmadd213sd" 16 } } */
+/* { dg-final { scan-assembler-times "vfmsub132sd" 16 } } */
+/* { dg-final { scan-assembler-times "vfmsub213sd" 16 } } */
+/* { dg-final { scan-assembler-times "vfnmadd132sd" 16 } } */
+/* { dg-final { scan-assembler-times "vfnmadd213sd" 16 } } */
+/* { dg-final { scan-assembler-times "vfnmsub132sd" 16 } } */
+/* { dg-final { scan-assembler-times "vfnmsub213sd" 16 } } */
/* { dg-final { scan-assembler-times "vfmsub132pd" 8 } } */
/* { dg-final { scan-assembler-times "vfnmadd132pd" 8 } } */
/* { dg-final { scan-assembler-times "vfnmsub132pd" 8 } } */
-/* { dg-final { scan-assembler-times "vfmadd132sd" 40 } } */
-/* { dg-final { scan-assembler-times "vfmsub132sd" 40 } } */
-/* { dg-final { scan-assembler-times "vfnmadd132sd" 40 } } */
-/* { dg-final { scan-assembler-times "vfnmsub132sd" 40 } } */
+/* { dg-final { scan-assembler-times "vfmadd132sd" 32 } } */
+/* { dg-final { scan-assembler-times "vfmsub132sd" 32 } } */
+/* { dg-final { scan-assembler-times "vfnmadd132sd" 32 } } */
+/* { dg-final { scan-assembler-times "vfnmsub132sd" 32 } } */
/* { dg-final { scan-assembler-times "vfnmadd231pd" 4 } } */
/* { dg-final { scan-assembler-times "vfnmsub132pd" 4 } } */
/* { dg-final { scan-assembler-times "vfnmsub231pd" 4 } } */
-/* { dg-final { scan-assembler-times "vfmadd132sd" 20 } } */
-/* { dg-final { scan-assembler-times "vfmadd213sd" 20 } } */
-/* { dg-final { scan-assembler-times "vfmsub132sd" 20 } } */
-/* { dg-final { scan-assembler-times "vfmsub213sd" 20 } } */
-/* { dg-final { scan-assembler-times "vfnmadd132sd" 20 } } */
-/* { dg-final { scan-assembler-times "vfnmadd213sd" 20 } } */
-/* { dg-final { scan-assembler-times "vfnmsub132sd" 20 } } */
-/* { dg-final { scan-assembler-times "vfnmsub213sd" 20 } } */
+/* { dg-final { scan-assembler-times "vfmadd132sd" 16 } } */
+/* { dg-final { scan-assembler-times "vfmadd213sd" 16 } } */
+/* { dg-final { scan-assembler-times "vfmsub132sd" 16 } } */
+/* { dg-final { scan-assembler-times "vfmsub213sd" 16 } } */
+/* { dg-final { scan-assembler-times "vfnmadd132sd" 16 } } */
+/* { dg-final { scan-assembler-times "vfnmadd213sd" 16 } } */
+/* { dg-final { scan-assembler-times "vfnmsub132sd" 16 } } */
+/* { dg-final { scan-assembler-times "vfnmsub213sd" 16 } } */
/* { dg-final { scan-assembler-times "vfmsub132pd" 8 } } */
/* { dg-final { scan-assembler-times "vfnmadd132pd" 8 } } */
/* { dg-final { scan-assembler-times "vfnmsub132pd" 8 } } */
-/* { dg-final { scan-assembler-times "vfmadd132sd" 40 } } */
-/* { dg-final { scan-assembler-times "vfmsub132sd" 40 } } */
-/* { dg-final { scan-assembler-times "vfnmadd132sd" 40 } } */
-/* { dg-final { scan-assembler-times "vfnmsub132sd" 40 } } */
+/* { dg-final { scan-assembler-times "vfmadd132sd" 32 } } */
+/* { dg-final { scan-assembler-times "vfmsub132sd" 32 } } */
+/* { dg-final { scan-assembler-times "vfnmadd132sd" 32 } } */
+/* { dg-final { scan-assembler-times "vfnmsub132sd" 32 } } */
/* { dg-final { scan-assembler-times "vfmsub132pd" 8 } } */
/* { dg-final { scan-assembler-times "vfnmadd132pd" 8 } } */
/* { dg-final { scan-assembler-times "vfnmsub132pd" 8 } } */
-/* { dg-final { scan-assembler-times "vfmadd132sd" 40 } } */
-/* { dg-final { scan-assembler-times "vfmsub132sd" 40 } } */
-/* { dg-final { scan-assembler-times "vfnmadd132sd" 40 } } */
-/* { dg-final { scan-assembler-times "vfnmsub132sd" 40 } } */
+/* { dg-final { scan-assembler-times "vfmadd132sd" 32 } } */
+/* { dg-final { scan-assembler-times "vfmsub132sd" 32 } } */
+/* { dg-final { scan-assembler-times "vfnmadd132sd" 32 } } */
+/* { dg-final { scan-assembler-times "vfnmsub132sd" 32 } } */
/* { dg-final { scan-assembler-times "vfmsub132pd" 8 } } */
/* { dg-final { scan-assembler-times "vfnmadd132pd" 8 } } */
/* { dg-final { scan-assembler-times "vfnmsub132pd" 8 } } */
-/* { dg-final { scan-assembler-times "vfmadd132sd" 40 } } */
-/* { dg-final { scan-assembler-times "vfmsub132sd" 40 } } */
-/* { dg-final { scan-assembler-times "vfnmadd132sd" 40 } } */
-/* { dg-final { scan-assembler-times "vfnmsub132sd" 40 } } */
+/* { dg-final { scan-assembler-times "vfmadd132sd" 32 } } */
+/* { dg-final { scan-assembler-times "vfmsub132sd" 32 } } */
+/* { dg-final { scan-assembler-times "vfnmadd132sd" 32 } } */
+/* { dg-final { scan-assembler-times "vfnmsub132sd" 32 } } */
/* { dg-final { scan-assembler-times "vfnmadd231ps" 4 } } */
/* { dg-final { scan-assembler-times "vfnmsub132ps" 4 } } */
/* { dg-final { scan-assembler-times "vfnmsub231ps" 4 } } */
-/* { dg-final { scan-assembler-times "vfmadd132ss" 36 } } */
-/* { dg-final { scan-assembler-times "vfmadd213ss" 36 } } */
-/* { dg-final { scan-assembler-times "vfmsub132ss" 36 } } */
-/* { dg-final { scan-assembler-times "vfmsub213ss" 36 } } */
-/* { dg-final { scan-assembler-times "vfnmadd132ss" 36 } } */
-/* { dg-final { scan-assembler-times "vfnmadd213ss" 36 } } */
-/* { dg-final { scan-assembler-times "vfnmsub132ss" 36 } } */
-/* { dg-final { scan-assembler-times "vfnmsub213ss" 36 } } */
+/* { dg-final { scan-assembler-times "vfmadd132ss" 32 } } */
+/* { dg-final { scan-assembler-times "vfmadd213ss" 32 } } */
+/* { dg-final { scan-assembler-times "vfmsub132ss" 32 } } */
+/* { dg-final { scan-assembler-times "vfmsub213ss" 32 } } */
+/* { dg-final { scan-assembler-times "vfnmadd132ss" 32 } } */
+/* { dg-final { scan-assembler-times "vfnmadd213ss" 32 } } */
+/* { dg-final { scan-assembler-times "vfnmsub132ss" 32 } } */
+/* { dg-final { scan-assembler-times "vfnmsub213ss" 32 } } */
/* { dg-final { scan-assembler-times "vfmsub132ps" 8 } } */
/* { dg-final { scan-assembler-times "vfnmadd132ps" 8 } } */
/* { dg-final { scan-assembler-times "vfnmsub132ps" 8 } } */
-/* { dg-final { scan-assembler-times "vfmadd132ss" 72 } } */
-/* { dg-final { scan-assembler-times "vfmsub132ss" 72 } } */
-/* { dg-final { scan-assembler-times "vfnmadd132ss" 72 } } */
-/* { dg-final { scan-assembler-times "vfnmsub132ss" 72 } } */
+/* { dg-final { scan-assembler-times "vfmadd132ss" 64 } } */
+/* { dg-final { scan-assembler-times "vfmsub132ss" 64 } } */
+/* { dg-final { scan-assembler-times "vfnmadd132ss" 64 } } */
+/* { dg-final { scan-assembler-times "vfnmsub132ss" 64 } } */
/* { dg-final { scan-assembler-times "vfnmadd231ps" 4 } } */
/* { dg-final { scan-assembler-times "vfnmsub132ps" 4 } } */
/* { dg-final { scan-assembler-times "vfnmsub231ps" 4 } } */
-/* { dg-final { scan-assembler-times "vfmadd132ss" 36 } } */
-/* { dg-final { scan-assembler-times "vfmadd213ss" 36 } } */
-/* { dg-final { scan-assembler-times "vfmsub132ss" 36 } } */
-/* { dg-final { scan-assembler-times "vfmsub213ss" 36 } } */
-/* { dg-final { scan-assembler-times "vfnmadd132ss" 36 } } */
-/* { dg-final { scan-assembler-times "vfnmadd213ss" 36 } } */
-/* { dg-final { scan-assembler-times "vfnmsub132ss" 36 } } */
-/* { dg-final { scan-assembler-times "vfnmsub213ss" 36 } } */
+/* { dg-final { scan-assembler-times "vfmadd132ss" 32 } } */
+/* { dg-final { scan-assembler-times "vfmadd213ss" 32 } } */
+/* { dg-final { scan-assembler-times "vfmsub132ss" 32 } } */
+/* { dg-final { scan-assembler-times "vfmsub213ss" 32 } } */
+/* { dg-final { scan-assembler-times "vfnmadd132ss" 32 } } */
+/* { dg-final { scan-assembler-times "vfnmadd213ss" 32 } } */
+/* { dg-final { scan-assembler-times "vfnmsub132ss" 32 } } */
+/* { dg-final { scan-assembler-times "vfnmsub213ss" 32 } } */
/* { dg-final { scan-assembler-times "vfmsub132ps" 8 } } */
/* { dg-final { scan-assembler-times "vfnmadd132ps" 8 } } */
/* { dg-final { scan-assembler-times "vfnmsub132ps" 8 } } */
-/* { dg-final { scan-assembler-times "vfmadd132ss" 72 } } */
-/* { dg-final { scan-assembler-times "vfmsub132ss" 72 } } */
-/* { dg-final { scan-assembler-times "vfnmadd132ss" 72 } } */
-/* { dg-final { scan-assembler-times "vfnmsub132ss" 72 } } */
+/* { dg-final { scan-assembler-times "vfmadd132ss" 64 } } */
+/* { dg-final { scan-assembler-times "vfmsub132ss" 64 } } */
+/* { dg-final { scan-assembler-times "vfnmadd132ss" 64 } } */
+/* { dg-final { scan-assembler-times "vfnmsub132ss" 64 } } */
/* { dg-final { scan-assembler-times "vfmsub132ps" 8 } } */
/* { dg-final { scan-assembler-times "vfnmadd132ps" 8 } } */
/* { dg-final { scan-assembler-times "vfnmsub132ps" 8 } } */
-/* { dg-final { scan-assembler-times "vfmadd132ss" 72 } } */
-/* { dg-final { scan-assembler-times "vfmsub132ss" 72 } } */
-/* { dg-final { scan-assembler-times "vfnmadd132ss" 72 } } */
-/* { dg-final { scan-assembler-times "vfnmsub132ss" 72 } } */
+/* { dg-final { scan-assembler-times "vfmadd132ss" 64 } } */
+/* { dg-final { scan-assembler-times "vfmsub132ss" 64 } } */
+/* { dg-final { scan-assembler-times "vfnmadd132ss" 64 } } */
+/* { dg-final { scan-assembler-times "vfnmsub132ss" 64 } } */
/* { dg-final { scan-assembler-times "vfmsub132ps" 8 } } */
/* { dg-final { scan-assembler-times "vfnmadd132ps" 8 } } */
/* { dg-final { scan-assembler-times "vfnmsub132ps" 8 } } */
-/* { dg-final { scan-assembler-times "vfmadd132ss" 72 } } */
-/* { dg-final { scan-assembler-times "vfmsub132ss" 72 } } */
-/* { dg-final { scan-assembler-times "vfnmadd132ss" 72 } } */
-/* { dg-final { scan-assembler-times "vfnmsub132ss" 72 } } */
+/* { dg-final { scan-assembler-times "vfmadd132ss" 64 } } */
+/* { dg-final { scan-assembler-times "vfmsub132ss" 64 } } */
+/* { dg-final { scan-assembler-times "vfnmadd132ss" 64 } } */
+/* { dg-final { scan-assembler-times "vfnmsub132ss" 64 } } */
by ratio_mult_vf_name steps. */
vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
- max_iter = LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1;
+ /* For vectorization factor N, we need to copy the last N-1 values in the
+ epilogue, and this means N-2 loopback edge executions.
+
+ PEELING_FOR_GAPS works by subtracting the last iteration, and thus the
+ epilogue will execute at least LOOP_VINFO_VECT_FACTOR times. */
+ max_iter = (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+ ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) * 2
+ : LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 2;
if (check_profitability)
- max_iter = MAX (max_iter, (int) th);
+ max_iter = MAX (max_iter, (int) th - 1);
record_niter_bound (new_loop, double_int::from_shwi (max_iter), false, true);
dump_printf (MSG_OPTIMIZED_LOCATIONS,
"Setting upper bound of nb iterations for epilogue "
#ifdef ENABLE_CHECKING
slpeel_verify_cfg_after_peeling (new_loop, loop);
#endif
- max_iter = LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1;
+ /* For vectorization factor N, we need to copy at most N-1 values
+ for alignment and this means N-2 loopback edge executions. */
+ max_iter = LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 2;
if (check_profitability)
- max_iter = MAX (max_iter, (int) th);
+ max_iter = MAX (max_iter, (int) th - 1);
record_niter_bound (new_loop, double_int::from_shwi (max_iter), false, true);
dump_printf (MSG_OPTIMIZED_LOCATIONS,
"Setting upper bound of nb iterations for prologue "
bool transform_pattern_stmt = false;
bool check_profitability = false;
int th;
+ /* Record number of iterations before we started tampering with the profile. */
+ gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===");
+ /* If the profile is imprecise, we have a chance to fix it up. */
+ if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+ expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
+
/* Use the more conservative vectorization threshold. If the number
of iterations is constant assume the cost check has been performed
by our caller. If the threshold makes all loops profitable that
slpeel_make_loop_iterate_ntimes (loop, ratio);
+ /* Reduce loop iterations by the vectorization factor. */
+ scale_loop_profile (loop, RDIV (REG_BR_PROB_BASE , vectorization_factor),
+ expected_iterations / vectorization_factor);
+ loop->nb_iterations_upper_bound
+ = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (vectorization_factor),
+ FLOOR_DIV_EXPR);
+ if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+ && loop->nb_iterations_upper_bound != double_int_zero)
+ loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - double_int_one;
+ if (loop->any_estimate)
+ {
+ loop->nb_iterations_estimate
+ = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (vectorization_factor),
+ FLOOR_DIV_EXPR);
+ if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+ && loop->nb_iterations_estimate != double_int_zero)
+ loop->nb_iterations_estimate = loop->nb_iterations_estimate - double_int_one;
+ }
+
/* The memory tags and pointers in vectorized statements need to
have their SSA forms updated. FIXME, why can't this be delayed
until all the loops have been transformed? */