gimple.h: Remove all includes.
[platform/upstream/gcc.git] / gcc / tree-vect-loop.c
index 4499dac..c91c2e1 100644 (file)
@@ -1,6 +1,5 @@
 /* Loop Vectorization
-   Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012,
-   2013 Free Software Foundation, Inc.
+   Copyright (C) 2003-2013 Free Software Foundation, Inc.
    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
    Ira Rosen <irar@il.ibm.com>
 
@@ -25,11 +24,26 @@ along with GCC; see the file COPYING3.  If not see
 #include "coretypes.h"
 #include "dumpfile.h"
 #include "tm.h"
-#include "ggc.h"
 #include "tree.h"
+#include "stor-layout.h"
 #include "basic-block.h"
 #include "gimple-pretty-print.h"
-#include "tree-flow.h"
+#include "tree-ssa-alias.h"
+#include "internal-fn.h"
+#include "gimple-expr.h"
+#include "is-a.h"
+#include "gimple.h"
+#include "gimplify.h"
+#include "gimple-iterator.h"
+#include "gimplify-me.h"
+#include "gimple-ssa.h"
+#include "tree-phinodes.h"
+#include "ssa-iterators.h"
+#include "stringpool.h"
+#include "tree-ssanames.h"
+#include "tree-ssa-loop-ivopts.h"
+#include "tree-ssa-loop-manip.h"
+#include "tree-ssa-loop-niter.h"
 #include "tree-pass.h"
 #include "cfgloop.h"
 #include "expr.h"
@@ -189,7 +203,7 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
-                     "=== vect_determine_vectorization_factor ===");
+                     "=== vect_determine_vectorization_factor ===\n");
 
   for (i = 0; i < nbbs; i++)
     {
@@ -203,6 +217,7 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
            {
              dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
+             dump_printf (MSG_NOTE, "\n");
            }
 
          gcc_assert (stmt_info);
@@ -217,6 +232,7 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
                  dump_printf_loc (MSG_NOTE, vect_location,
                                    "get vectype for scalar type:  ");
                  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
+                  dump_printf (MSG_NOTE, "\n");
                }
 
              vectype = get_vectype_for_scalar_type (scalar_type);
@@ -229,6 +245,7 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
                                        "data-type ");
                      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                          scalar_type);
+                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  return false;
                }
@@ -238,11 +255,13 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
                {
                  dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
                  dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
+                  dump_printf (MSG_NOTE, "\n");
                }
 
              nunits = TYPE_VECTOR_SUBPARTS (vectype);
              if (dump_enabled_p ())
-               dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d", nunits);
+               dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
+                                 nunits);
 
              if (!vectorization_factor
                  || (nunits > vectorization_factor))
@@ -266,13 +285,15 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
              dump_printf_loc (MSG_NOTE, vect_location,
                                "==> examining statement: ");
              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
+              dump_printf (MSG_NOTE, "\n");
            }
 
          gcc_assert (stmt_info);
 
          /* Skip stmts which do not need to be vectorized.  */
-         if (!STMT_VINFO_RELEVANT_P (stmt_info)
-             && !STMT_VINFO_LIVE_P (stmt_info))
+         if ((!STMT_VINFO_RELEVANT_P (stmt_info)
+              && !STMT_VINFO_LIVE_P (stmt_info))
+             || gimple_clobber_p (stmt))
             {
               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
@@ -286,12 +307,13 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
                       dump_printf_loc (MSG_NOTE, vect_location,
                                        "==> examining pattern statement: ");
                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
+                      dump_printf (MSG_NOTE, "\n");
                     }
                 }
               else
                {
                  if (dump_enabled_p ())
-                   dump_printf_loc (MSG_NOTE, vect_location, "skip.");
+                   dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
                   gsi_next (&si);
                  continue;
                 }
@@ -336,6 +358,7 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
                                            "==> examining pattern def stmt: ");
                          dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
                                             pattern_def_stmt, 0);
+                          dump_printf (MSG_NOTE, "\n");
                        }
 
                      stmt = pattern_def_stmt;
@@ -359,6 +382,7 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
                                    "not vectorized: irregular stmt.");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
                                     0);
+                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              return false;
            }
@@ -370,6 +394,7 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                    "not vectorized: vector stmt in loop:");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
+                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              return false;
            }
@@ -394,6 +419,7 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
                  dump_printf_loc (MSG_NOTE, vect_location,
                                    "get vectype for scalar type:  ");
                  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
+                  dump_printf (MSG_NOTE, "\n");
                }
              vectype = get_vectype_for_scalar_type (scalar_type);
              if (!vectype)
@@ -405,11 +431,19 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
                                        "data-type ");
                      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                          scalar_type);
+                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  return false;
                }
 
              STMT_VINFO_VECTYPE (stmt_info) = vectype;
+
+             if (dump_enabled_p ())
+               {
+                 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
+                 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
+                  dump_printf (MSG_NOTE, "\n");
+               }
             }
 
          /* The vectorization factor is according to the smallest
@@ -422,6 +456,7 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
              dump_printf_loc (MSG_NOTE, vect_location,
                                "get vectype for scalar type:  ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
+              dump_printf (MSG_NOTE, "\n");
            }
          vf_vectype = get_vectype_for_scalar_type (scalar_type);
          if (!vf_vectype)
@@ -432,6 +467,7 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
                                    "not vectorized: unsupported data-type ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                      scalar_type);
+                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              return false;
            }
@@ -449,6 +485,7 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
                  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                      vf_vectype);
+                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              return false;
            }
@@ -457,11 +494,12 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
            {
              dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
+              dump_printf (MSG_NOTE, "\n");
            }
 
          nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
          if (dump_enabled_p ())
-           dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d", nunits);
+           dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
          if (!vectorization_factor
              || (nunits > vectorization_factor))
            vectorization_factor = nunits;
@@ -476,13 +514,13 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 
   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
   if (dump_enabled_p ())
-    dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d",
+    dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
                      vectorization_factor);
   if (vectorization_factor <= 1)
     {
       if (dump_enabled_p ())
         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                         "not vectorized: unsupported data-type");
+                         "not vectorized: unsupported data-type\n");
       return false;
     }
   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
@@ -494,7 +532,7 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 /* Function vect_is_simple_iv_evolution.
 
    FORNOW: A simple evolution of an induction variables in the loop is
-   considered a polynomial evolution with constant step.  */
+   considered a polynomial evolution.  */
 
 static bool
 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
@@ -503,6 +541,7 @@ vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
   tree init_expr;
   tree step_expr;
   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
+  basic_block bb;
 
   /* When there is no evolution in this loop, the evolution function
      is not "simple".  */
@@ -523,16 +562,25 @@ vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
       dump_printf (MSG_NOTE, ",  init: ");
       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
+      dump_printf (MSG_NOTE, "\n");
     }
 
   *init = init_expr;
   *step = step_expr;
 
-  if (TREE_CODE (step_expr) != INTEGER_CST)
+  if (TREE_CODE (step_expr) != INTEGER_CST
+      && (TREE_CODE (step_expr) != SSA_NAME
+         || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
+             && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
+         || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
+             && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
+                 || !flag_associative_math)))
+      && (TREE_CODE (step_expr) != REAL_CST
+         || !flag_associative_math))
     {
       if (dump_enabled_p ())
         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                         "step unknown.");
+                         "step unknown.\n");
       return false;
     }
 
@@ -550,15 +598,14 @@ static void
 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 {
   basic_block bb = loop->header;
-  tree dumy;
-  vec<gimple> worklist;
-  worklist.create (64);
+  tree init, step;
+  stack_vec<gimple, 64> worklist;
   gimple_stmt_iterator gsi;
   bool double_reduc;
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
-                     "=== vect_analyze_scalar_cycles ===");
+                     "=== vect_analyze_scalar_cycles ===\n");
 
   /* First - identify all inductions.  Reduction detection assumes that all the
      inductions have been identified, therefore, this order must not be
@@ -574,6 +621,7 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
        {
          dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
+          dump_printf (MSG_NOTE, "\n");
        }
 
       /* Skip virtual phi's.  The data dependences that are associated with
@@ -593,13 +641,16 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
              dump_printf_loc (MSG_NOTE, vect_location,
                                "Access function of PHI: ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
+              dump_printf (MSG_NOTE, "\n");
            }
          STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
            = evolution_part_in_loop_num (access_fn, loop->num);
        }
 
       if (!access_fn
-         || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
+         || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
+         || (LOOP_VINFO_LOOP (loop_vinfo) != loop
+             && TREE_CODE (step) != INTEGER_CST))
        {
          worklist.safe_push (phi);
          continue;
@@ -608,7 +659,7 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 
       if (dump_enabled_p ())
-       dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.");
+       dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
     }
 
@@ -626,6 +677,7 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
         {
           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
+          dump_printf (MSG_NOTE, "\n");
         }
 
       gcc_assert (!virtual_operand_p (def)
@@ -640,7 +692,7 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
             {
               if (dump_enabled_p ())
                 dump_printf_loc (MSG_NOTE, vect_location,
-                                "Detected double reduction.");
+                                "Detected double reduction.\n");
 
               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
@@ -652,7 +704,7 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
                 {
                   if (dump_enabled_p ())
                     dump_printf_loc (MSG_NOTE, vect_location,
-                                    "Detected vectorizable nested cycle.");
+                                    "Detected vectorizable nested cycle.\n");
 
                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
@@ -662,7 +714,7 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
                 {
                   if (dump_enabled_p ())
                     dump_printf_loc (MSG_NOTE, vect_location,
-                                    "Detected reduction.");
+                                    "Detected reduction.\n");
 
                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
@@ -676,10 +728,8 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
       else
         if (dump_enabled_p ())
           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                          "Unknown def-use cycle pattern.");
+                          "Unknown def-use cycle pattern.\n");
     }
-
-  worklist.release ();
 }
 
 
@@ -724,11 +774,12 @@ vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 }
 
+
 /* Function vect_get_loop_niters.
 
-   Determine how many iterations the loop is executed.
-   If an expression that represents the number of iterations
-   can be constructed, place it in NUMBER_OF_ITERATIONS.
+   Determine how many iterations the loop is executed and place it
+   in NUMBER_OF_ITERATIONS.
+
    Return the loop exit condition.  */
 
 static gimple
@@ -738,20 +789,17 @@ vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
-                    "=== get_loop_niters ===");
-  niters = number_of_exit_cond_executions (loop);
-
-  if (niters != NULL_TREE
-      && niters != chrec_dont_know)
-    {
-      *number_of_iterations = niters;
-
-      if (dump_enabled_p ())
-        {
-          dump_printf_loc (MSG_NOTE, vect_location, "==> get_loop_niters:");
-          dump_generic_expr (MSG_NOTE, TDF_SLIM, *number_of_iterations);
-        }
-    }
+                    "=== get_loop_niters ===\n");
+
+  niters = number_of_latch_executions (loop);
+  /* We want the number of loop header executions which is the number
+     of latch executions plus one.
+     ???  For UINT_MAX latch executions this number overflows to zero
+     for loops like do { n++; } while (n != 0);  */
+  if (niters && !chrec_contains_undetermined (niters))
+    niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), niters,
+                         build_int_cst (TREE_TYPE (niters), 1));
+  *number_of_iterations = niters;
 
   return get_loop_exit_condition (loop);
 }
@@ -859,7 +907,7 @@ new_loop_vec_info (struct loop *loop)
   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
   LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
-  LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
+  LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
   LOOP_VINFO_VECT_FACTOR (res) = 0;
   LOOP_VINFO_LOOP_NEST (res).create (3);
   LOOP_VINFO_DATAREFS (res).create (10);
@@ -874,9 +922,9 @@ new_loop_vec_info (struct loop *loop)
   LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
   LOOP_VINFO_SLP_INSTANCES (res).create (10);
   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
-  LOOP_VINFO_PEELING_HTAB (res) = NULL;
   LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
   LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
+  LOOP_VINFO_PEELING_FOR_NITER (res) = false;
   LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
 
   return res;
@@ -906,23 +954,9 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
   loop = LOOP_VINFO_LOOP (loop_vinfo);
 
   bbs = LOOP_VINFO_BBS (loop_vinfo);
-  nbbs = loop->num_nodes;
+  nbbs = clean_stmts ? loop->num_nodes : 0;
   swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
 
-  if (!clean_stmts)
-    {
-      free (LOOP_VINFO_BBS (loop_vinfo));
-      free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
-      free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
-      LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
-      LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
-      LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
-
-      free (loop_vinfo);
-      loop->aux = NULL;
-      return;
-    }
-
   for (j = 0; j < nbbs; j++)
     {
       basic_block bb = bbs[j];
@@ -943,9 +977,9 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
                   || code == POINTER_PLUS_EXPR
                   || code == MULT_EXPR)
                  && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
-               swap_tree_operands (stmt,
-                                   gimple_assign_rhs1_ptr (stmt),
-                                   gimple_assign_rhs2_ptr (stmt));
+               swap_ssa_operands (stmt,
+                                  gimple_assign_rhs1_ptr (stmt),
+                                  gimple_assign_rhs2_ptr (stmt));
            }
 
          /* Free stmt_vec_info.  */
@@ -955,7 +989,7 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
     }
 
   free (LOOP_VINFO_BBS (loop_vinfo));
-  free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
+  vect_destroy_datarefs (loop_vinfo, NULL);
   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
   LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
@@ -969,8 +1003,8 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
   LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
   LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
 
-  if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
-    htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));
+  if (LOOP_VINFO_PEELING_HTAB (loop_vinfo).is_created ())
+    LOOP_VINFO_PEELING_HTAB (loop_vinfo).dispose ();
 
   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
 
@@ -994,7 +1028,7 @@ vect_analyze_loop_1 (struct loop *loop)
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
-                    "===== analyze_loop_nest_1 =====");
+                    "===== analyze_loop_nest_1 =====\n");
 
   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.  */
 
@@ -1003,7 +1037,7 @@ vect_analyze_loop_1 (struct loop *loop)
     {
       if (dump_enabled_p ())
         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "bad inner-loop form.");
+                        "bad inner-loop form.\n");
       return NULL;
     }
 
@@ -1029,7 +1063,7 @@ vect_analyze_loop_form (struct loop *loop)
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
-                    "=== vect_analyze_loop_form ===");
+                    "=== vect_analyze_loop_form ===\n");
 
   /* Different restrictions apply when we are considering an inner-most loop,
      vs. an outer (nested) loop.
@@ -1053,17 +1087,17 @@ vect_analyze_loop_form (struct loop *loop)
         {
           if (dump_enabled_p ())
             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "not vectorized: control flow in loop.");
+                            "not vectorized: control flow in loop.\n");
           return NULL;
         }
 
       if (empty_block_p (loop->header))
-    {
-          if (dump_enabled_p ())
-            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "not vectorized: empty loop.");
-      return NULL;
-    }
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "not vectorized: empty loop.\n");
+         return NULL;
+       }
     }
   else
     {
@@ -1091,7 +1125,7 @@ vect_analyze_loop_form (struct loop *loop)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "not vectorized: multiple nested loops.");
+                            "not vectorized: multiple nested loops.\n");
          return NULL;
        }
 
@@ -1101,7 +1135,7 @@ vect_analyze_loop_form (struct loop *loop)
        {
          if (dump_enabled_p ())
             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "not vectorized: Bad inner loop.");
+                            "not vectorized: Bad inner loop.\n");
          return NULL;
        }
 
@@ -1109,8 +1143,9 @@ vect_analyze_loop_form (struct loop *loop)
                                        LOOP_VINFO_NITERS (inner_loop_vinfo)))
        {
          if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
-                            "not vectorized: inner-loop count not invariant.");
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "not vectorized: inner-loop count not"
+                             " invariant.\n");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }
@@ -1119,7 +1154,7 @@ vect_analyze_loop_form (struct loop *loop)
         {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "not vectorized: control flow in loop.");
+                            "not vectorized: control flow in loop.\n");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
         }
@@ -1134,15 +1169,15 @@ vect_analyze_loop_form (struct loop *loop)
          || single_exit (innerloop)->dest !=  EDGE_PRED (loop->latch, 0)->src)
        {
          if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
-                            "not vectorized: unsupported outerloop form.");
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "not vectorized: unsupported outerloop form.\n");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }
 
       if (dump_enabled_p ())
         dump_printf_loc (MSG_NOTE, vect_location,
-                        "Considering outer-loop vectorization.");
+                        "Considering outer-loop vectorization.\n");
     }
 
   if (!single_exit (loop)
@@ -1152,10 +1187,10 @@ vect_analyze_loop_form (struct loop *loop)
         {
           if (!single_exit (loop))
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "not vectorized: multiple exits.");
+                            "not vectorized: multiple exits.\n");
           else if (EDGE_COUNT (loop->header->preds) != 2)
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
-                            "not vectorized: too many incoming edges.");
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "not vectorized: too many incoming edges.\n");
         }
       if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
@@ -1171,7 +1206,7 @@ vect_analyze_loop_form (struct loop *loop)
     {
       if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "not vectorized: latch block not empty.");
+                        "not vectorized: latch block not empty.\n");
       if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
       return NULL;
@@ -1185,13 +1220,13 @@ vect_analyze_loop_form (struct loop *loop)
        {
          split_loop_exit_edge (e);
          if (dump_enabled_p ())
-           dump_printf (MSG_NOTE, "split exit edge.");
+           dump_printf (MSG_NOTE, "split exit edge.\n");
        }
       else
        {
          if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
-                            "not vectorized: abnormal loop exit edge.");
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "not vectorized: abnormal loop exit edge.\n");
          if (inner_loop_vinfo)
            destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
@@ -1202,56 +1237,49 @@ vect_analyze_loop_form (struct loop *loop)
   if (!loop_cond)
     {
       if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
-                        "not vectorized: complicated exit condition.");
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "not vectorized: complicated exit condition.\n");
       if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
       return NULL;
     }
 
-  if (!number_of_iterations)
+  if (!number_of_iterations
+      || chrec_contains_undetermined (number_of_iterations))
     {
       if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: number of iterations cannot be "
-                        "computed.");
+                        "computed.\n");
       if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
       return NULL;
     }
 
-  if (chrec_contains_undetermined (number_of_iterations))
+  if (integer_zerop (number_of_iterations))
     {
       if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "Infinite number of iterations.");
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "not vectorized: number of iterations = 0.\n");
       if (inner_loop_vinfo)
-       destroy_loop_vec_info (inner_loop_vinfo, true);
+        destroy_loop_vec_info (inner_loop_vinfo, true);
       return NULL;
     }
 
-  if (!NITERS_KNOWN_P (number_of_iterations))
+  loop_vinfo = new_loop_vec_info (loop);
+  LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
+  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
+
+  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
     {
       if (dump_enabled_p ())
         {
           dump_printf_loc (MSG_NOTE, vect_location,
                           "Symbolic number of iterations is ");
          dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
+          dump_printf (MSG_NOTE, "\n");
         }
     }
-  else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
-    {
-      if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "not vectorized: number of iterations = 0.");
-      if (inner_loop_vinfo)
-        destroy_loop_vec_info (inner_loop_vinfo, false);
-      return NULL;
-    }
-
-  loop_vinfo = new_loop_vec_info (loop);
-  LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
-  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
 
   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
 
@@ -1291,7 +1319,7 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
-                    "=== vect_analyze_loop_operations ===");
+                    "=== vect_analyze_loop_operations ===\n");
 
   gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
@@ -1327,7 +1355,7 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
       LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
       if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
-                        "Updating vectorization factor to %d ",
+                        "Updating vectorization factor to %d\n",
                         vectorization_factor);
     }
 
@@ -1345,6 +1373,7 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
             {
               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
+              dump_printf (MSG_NOTE, "\n");
             }
 
           /* Inner-loop loop-closed exit phi in outer-loop vectorization
@@ -1361,9 +1390,9 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
                      != vect_double_reduction_def)
                 {
                   if (dump_enabled_p ())
-                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
+                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Unsupported loop-closed phi in "
-                                    "outer-loop.");
+                                    "outer-loop.\n");
                   return false;
                 }
 
@@ -1382,7 +1411,7 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
                     return false;
 
                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
-                 if (!op_def_stmt
+                 if (gimple_nop_p (op_def_stmt)
                      || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
                      || !vinfo_for_stmt (op_def_stmt))
                     return false;
@@ -1404,7 +1433,7 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
               /* FORNOW: not yet supported.  */
               if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                "not vectorized: value used after loop.");
+                                "not vectorized: value used after loop.\n");
               return false;
             }
 
@@ -1413,8 +1442,8 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
             {
               /* A scalar-dependence cycle that we don't support.  */
               if (dump_enabled_p ())
-               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
-                                "not vectorized: scalar dependence cycle.");
+               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                "not vectorized: scalar dependence cycle.\n");
               return false;
             }
 
@@ -1429,10 +1458,11 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
             {
               if (dump_enabled_p ())
                 {
-                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
+                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "not vectorized: relevant phi not "
                                   "supported: ");
                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
+                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                 }
              return false;
             }
@@ -1441,7 +1471,8 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
         {
           gimple stmt = gsi_stmt (si);
-         if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
+         if (!gimple_clobber_p (stmt)
+             && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
            return false;
         }
     } /* bbs */
@@ -1455,18 +1486,18 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
     {
       if (dump_enabled_p ())
         dump_printf_loc (MSG_NOTE, vect_location,
-                        "All the computation can be taken out of the loop.");
+                        "All the computation can be taken out of the loop.\n");
       if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: redundant loop. no profit to "
-                        "vectorize.");
+                        "vectorize.\n");
       return false;
     }
 
   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
                     "vectorization_factor = %d, niters = "
-                    HOST_WIDE_INT_PRINT_DEC, vectorization_factor,
+                    HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
                     LOOP_VINFO_INT_NITERS (loop_vinfo));
 
   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
@@ -1476,11 +1507,11 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
     {
       if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "not vectorized: iteration count too small.");
+                        "not vectorized: iteration count too small.\n");
       if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: iteration count smaller than "
-                        "vectorization factor.");
+                        "vectorization factor.\n");
       return false;
     }
 
@@ -1498,11 +1529,11 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
     {
       if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "not vectorized: vectorization not profitable.");
+                        "not vectorized: vectorization not profitable.\n");
       if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: vector version will never be "
-                        "profitable.");
+                        "profitable.\n");
       return false;
     }
 
@@ -1524,12 +1555,12 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
     {
       if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "not vectorized: vectorization not profitable.");
+                        "not vectorized: vectorization not profitable.\n");
       if (dump_enabled_p ())
         dump_printf_loc (MSG_NOTE, vect_location,
                         "not vectorized: iteration count smaller than user "
                         "specified loop bound parameter or minimum profitable "
-                        "iterations (whichever is more conservative).");
+                        "iterations (whichever is more conservative).\n");
       return false;
     }
 
@@ -1540,38 +1571,16 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
       if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: estimated iteration count too "
-                         "small.");
+                         "small.\n");
       if (dump_enabled_p ())
         dump_printf_loc (MSG_NOTE, vect_location,
                         "not vectorized: estimated iteration count smaller "
                          "than specified loop bound parameter or minimum "
                          "profitable iterations (whichever is more "
-                         "conservative).");
+                         "conservative).\n");
       return false;
     }
 
-  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-      || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
-      || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
-    {
-      if (dump_enabled_p ())
-        dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required.");
-      if (!vect_can_advance_ivs_p (loop_vinfo))
-        {
-          if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
-                            "not vectorized: can't create epilog loop 1.");
-          return false;
-        }
-      if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
-        {
-          if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
-                            "not vectorized: can't create epilog loop 2.");
-          return false;
-        }
-    }
-
   return true;
 }
 
@@ -1600,7 +1609,19 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo)
     {
       if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "bad data references.");
+                        "bad data references.\n");
+      return false;
+    }
+
+  /* Analyze the access patterns of the data-refs in the loop (consecutive,
+     complex, etc.). FORNOW: Only handle consecutive access pattern.  */
+
+  ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
+  if (!ok)
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "bad data access.\n");
       return false;
     }
 
@@ -1618,7 +1639,7 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo)
     {
       if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "unexpected pattern.");
+                        "unexpected pattern.\n");
       return false;
     }
 
@@ -1627,13 +1648,13 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo)
      the dependences.
      FORNOW: fail at the first data dependence that we encounter.  */
 
-  ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf);
+  ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
   if (!ok
       || max_vf < min_vf)
     {
       if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "bad data dependence.");
+                            "bad data dependence.\n");
       return false;
     }
 
@@ -1642,14 +1663,14 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo)
     {
       if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "can't determine vectorization factor.");
+                        "can't determine vectorization factor.\n");
       return false;
     }
   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
     {
       if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "bad data dependence.");
+                        "bad data dependence.\n");
       return false;
     }
 
@@ -1661,19 +1682,7 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo)
     {
       if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "bad data alignment.");
-      return false;
-    }
-
-  /* Analyze the access patterns of the data-refs in the loop (consecutive,
-     complex, etc.). FORNOW: Only handle consecutive access pattern.  */
-
-  ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
-  if (!ok)
-    {
-      if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "bad data access.");
+                        "bad data alignment.\n");
       return false;
     }
 
@@ -1686,7 +1695,7 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo)
       if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "too long list of versioning for alias "
-                        "run-time tests.");
+                        "run-time tests.\n");
       return false;
     }
 
@@ -1698,7 +1707,7 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo)
     {
       if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "bad data alignment.");
+                        "bad data alignment.\n");
       return false;
     }
 
@@ -1723,10 +1732,44 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo)
     {
       if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "bad operation or unsupported loop bound.");
+                        "bad operation or unsupported loop bound.\n");
       return false;
     }
 
+  /* Decide whether we need to create an epilogue loop to handle
+     remaining scalar iterations.  */
+  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+      && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
+    {
+      if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
+                  - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
+         < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
+       LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
+    }
+  else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
+          || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
+              < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
+    LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
+
+  /* If an epilogue loop is required make sure we can create one.  */
+  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+      || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
+    {
+      if (dump_enabled_p ())
+        dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
+      if (!vect_can_advance_ivs_p (loop_vinfo)
+         || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
+                                          single_exit (LOOP_VINFO_LOOP
+                                                        (loop_vinfo))))
+        {
+          if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "not vectorized: can't create required "
+                            "epilog loop\n");
+          return false;
+        }
+    }
+
   return true;
 }
 
@@ -1747,7 +1790,7 @@ vect_analyze_loop (struct loop *loop)
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
-                    "===== analyze_loop_nest =====");
+                    "===== analyze_loop_nest =====\n");
 
   if (loop_outer (loop)
       && loop_vec_info_for_loop (loop_outer (loop))
@@ -1755,7 +1798,7 @@ vect_analyze_loop (struct loop *loop)
     {
       if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
-                        "outer-loop already vectorized.");
+                        "outer-loop already vectorized.\n");
       return NULL;
     }
 
@@ -1767,7 +1810,7 @@ vect_analyze_loop (struct loop *loop)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "bad loop form.");
+                            "bad loop form.\n");
          return NULL;
        }
 
@@ -1848,6 +1891,7 @@ report_vect_op (int msg_type, gimple stmt, const char *msg)
 {
   dump_printf_loc (msg_type, vect_location, "%s", msg);
   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
+  dump_printf (msg_type, "\n");
 }
 
 
@@ -2023,11 +2067,12 @@ vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
                {
                  dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
                  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
+                  dump_printf (MSG_NOTE, "\n");
                }
 
-             swap_tree_operands (next_stmt,
-                                 gimple_assign_rhs1_ptr (next_stmt),
-                                  gimple_assign_rhs2_ptr (next_stmt));
+             swap_ssa_operands (next_stmt,
+                                gimple_assign_rhs1_ptr (next_stmt),
+                                 gimple_assign_rhs2_ptr (next_stmt));
              update_stmt (next_stmt);
 
              if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
@@ -2060,6 +2105,13 @@ vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
      a3 = ...
      a2 = operation (a3, a1)
 
+   or
+
+   a3 = ...
+   loop_header:
+     a1 = phi < a0, a2 >
+     a2 = operation (a3, a1)
+
    such that:
    1. operation is commutative and associative and it is safe to
       change the order of the computation (if CHECK_REDUCTION is true)
@@ -2123,7 +2175,7 @@ vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
         {
           if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "intermediate value used outside loop.");
+                            "intermediate value used outside loop.\n");
 
           return NULL;
         }
@@ -2135,7 +2187,7 @@ vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
         {
           if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "reduction used in loop.");
+                            "reduction used in loop.\n");
           return NULL;
         }
     }
@@ -2147,6 +2199,7 @@ vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "reduction: not ssa_name: ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
+          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
       return NULL;
     }
@@ -2156,14 +2209,17 @@ vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
     {
       if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "reduction: no def_stmt.");
+                        "reduction: no def_stmt.\n");
       return NULL;
     }
 
   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
     {
       if (dump_enabled_p ())
-        dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
+        {
+          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
+          dump_printf (MSG_NOTE, "\n");
+        }
       return NULL;
     }
 
@@ -2192,7 +2248,7 @@ vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "reduction used in loop.");
+                            "reduction used in loop.\n");
          return NULL;
        }
     }
@@ -2208,7 +2264,7 @@ vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
         {
           if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "unsupported phi node definition.");
+                            "unsupported phi node definition.\n");
 
           return NULL;
         }
@@ -2331,6 +2387,7 @@ vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
               dump_generic_expr (MSG_NOTE, TDF_SLIM,
                                 TREE_TYPE (op4));
             }
+          dump_printf (MSG_NOTE, "\n");
         }
 
       return NULL;
@@ -2415,6 +2472,7 @@ vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
   if (def2 && def2 == phi
       && (code == COND_EXPR
          || !def1 || gimple_nop_p (def1)
+         || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
               && (is_gimple_assign (def1)
                  || is_gimple_call (def1)
@@ -2433,6 +2491,7 @@ vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
   if (def1 && def1 == phi
       && (code == COND_EXPR
          || !def2 || gimple_nop_p (def2)
+         || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
              && (is_gimple_assign (def2)
                  || is_gimple_call (def2)
@@ -2452,8 +2511,8 @@ vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
            report_vect_op (MSG_NOTE, def_stmt,
                            "detected reduction: need to swap operands: ");
 
-          swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
-                             gimple_assign_rhs2_ptr (def_stmt));
+          swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
+                            gimple_assign_rhs2_ptr (def_stmt));
 
          if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
            LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
@@ -2588,7 +2647,7 @@ vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
       if (dump_enabled_p ())
         dump_printf_loc (MSG_NOTE, vect_location,
                         "cost model: epilogue peel iters set to vf/2 "
-                        "because loop iterations are unknown .");
+                        "because loop iterations are unknown .\n");
 
       /* If peeled iterations are known but number of scalar loop
          iterations are unknown, count a taken branch per peeled loop.  */
@@ -2640,13 +2699,13 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
   int scalar_single_iter_cost = 0;
   int scalar_outside_cost = 0;
   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
-  int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
+  int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
 
   /* Cost model disabled.  */
-  if (!flag_vect_cost_model)
+  if (unlimited_cost_model ())
     {
-      dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.");
+      dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
       *ret_min_profitable_niters = 0;
       *ret_min_profitable_estimate = 0;
       return;
@@ -2704,14 +2763,14 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
     {
       peel_iters_prologue = vf/2;
       dump_printf (MSG_NOTE, "cost model: "
-                   "prologue peel iters set to vf/2.");
+                   "prologue peel iters set to vf/2.\n");
 
       /* If peeling for alignment is unknown, loop bound of main loop becomes
          unknown.  */
       peel_iters_epilogue = vf/2;
       dump_printf (MSG_NOTE, "cost model: "
                    "epilogue peel iters set to vf/2 because "
-                   "peeling for alignment is unknown.");
+                   "peeling for alignment is unknown.\n");
 
       /* If peeled iterations are unknown, count a taken branch and a not taken
          branch per peeled loop. Even if scalar loop iterations are known,
@@ -2831,7 +2890,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
       else
        {
          /* Cost model check occurs at prologue generation.  */
-         if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
+         if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
            scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
              + vect_get_stmt_cost (cond_branch_not_taken); 
          /* Cost model check occurs at epilogue generation.  */
@@ -2881,7 +2940,8 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "cost model: the vector iteration cost = %d "
                         "divided by the scalar iteration cost = %d "
-                        "is greater or equal to the vectorization factor = %d.",
+                        "is greater or equal to the vectorization factor = %d"
+                         ".\n",
                         vec_inside_cost, scalar_single_iter_cost, vf);
       *ret_min_profitable_niters = -1;
       *ret_min_profitable_estimate = -1;
@@ -2907,9 +2967,10 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
                    peel_iters_prologue);
       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
                    peel_iters_epilogue);
-      dump_printf (MSG_NOTE, 
+      dump_printf (MSG_NOTE,
                    "  Calculated minimum iters for profitability: %d\n",
                    min_profitable_iters);
+      dump_printf (MSG_NOTE, "\n");
     }
 
   min_profitable_iters =
@@ -2922,7 +2983,8 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
-                     "  Runtime profitability threshold = %d\n", min_profitable_iters);
+                     "  Runtime profitability threshold = %d\n",
+                     min_profitable_iters);
 
   *ret_min_profitable_niters = min_profitable_iters;
 
@@ -3011,6 +3073,7 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
                           "unsupported data-type ");
           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                             TREE_TYPE (reduction_op));
+          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
         }
       return false;
    }
@@ -3043,10 +3106,10 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
        }
       else
        {
-         int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
+         int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
          tree bitsize =
            TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
-         int element_bitsize = tree_low_cst (bitsize, 1);
+         int element_bitsize = tree_to_uhwi (bitsize);
          int nelements = vec_size_in_bits / element_bitsize;
 
          optab = optab_for_tree_code (code, vectype, optab_default);
@@ -3079,7 +3142,7 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
   if (dump_enabled_p ())
     dump_printf (MSG_NOTE, 
                  "vect_model_reduction_cost: inside_cost = %d, "
-                 "prologue_cost = %d, epilogue_cost = %d .", inside_cost,
+                 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
                  prologue_cost, epilogue_cost);
 
   return true;
@@ -3108,7 +3171,7 @@ vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
                      "vect_model_induction_cost: inside_cost = %d, "
-                     "prologue_cost = %d .", inside_cost, prologue_cost);
+                     "prologue_cost = %d .\n", inside_cost, prologue_cost);
 }
 
 
@@ -3130,7 +3193,6 @@ get_initial_def_for_induction (gimple iv_phi)
   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
-  tree scalar_type;
   tree vectype;
   int nunits;
   edge pe = loop_preheader_edge (loop);
@@ -3182,8 +3244,7 @@ get_initial_def_for_induction (gimple iv_phi)
   gcc_assert (ok);
   pe = loop_preheader_edge (iv_loop);
 
-  scalar_type = TREE_TYPE (init_expr);
-  vectype = get_vectype_for_scalar_type (scalar_type);
+  vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
   resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
   gcc_assert (vectype);
   nunits = TYPE_VECTOR_SUBPARTS (vectype);
@@ -3204,6 +3265,21 @@ get_initial_def_for_induction (gimple iv_phi)
       tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
                                            loop_preheader_edge (iv_loop));
       vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
+      /* If the initial value is not of proper type, convert it.  */
+      if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
+       {
+         new_stmt = gimple_build_assign_with_ops
+             (VIEW_CONVERT_EXPR,
+              vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"),
+              build1 (VIEW_CONVERT_EXPR, vectype, vec_init), NULL_TREE);
+         vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
+         gimple_assign_set_lhs (new_stmt, vec_init);
+         new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
+                                                new_stmt);
+         gcc_assert (!new_bb);
+         set_vinfo_for_stmt (new_stmt,
+                             new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
+       }
     }
   else
     {
@@ -3211,8 +3287,11 @@ get_initial_def_for_induction (gimple iv_phi)
 
       /* iv_loop is the loop to be vectorized. Create:
         vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
-      new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
-      new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
+      new_var = vect_get_new_vect_var (TREE_TYPE (vectype),
+                                      vect_scalar_var, "var_");
+      new_name = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
+                                                    init_expr),
+                                      &stmts, false, new_var);
       if (stmts)
        {
          new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
@@ -3220,30 +3299,36 @@ get_initial_def_for_induction (gimple iv_phi)
        }
 
       vec_alloc (v, nunits);
+      bool constant_p = is_gimple_min_invariant (new_name);
       CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
       for (i = 1; i < nunits; i++)
        {
          /* Create: new_name_i = new_name + step_expr  */
-         enum tree_code code = POINTER_TYPE_P (scalar_type)
-                               ? POINTER_PLUS_EXPR : PLUS_EXPR;
-         init_stmt = gimple_build_assign_with_ops (code, new_var,
-                                                   new_name, step_expr);
-         new_name = make_ssa_name (new_var, init_stmt);
-         gimple_assign_set_lhs (init_stmt, new_name);
-
-         new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
-         gcc_assert (!new_bb);
-
-         if (dump_enabled_p ())
+         new_name = fold_build2 (PLUS_EXPR, TREE_TYPE (new_name),
+                                 new_name, step_expr);
+         if (!is_gimple_min_invariant (new_name))
            {
-             dump_printf_loc (MSG_NOTE, vect_location,
-                              "created new init_stmt: ");
-             dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
+             init_stmt = gimple_build_assign (new_var, new_name);
+             new_name = make_ssa_name (new_var, init_stmt);
+             gimple_assign_set_lhs (init_stmt, new_name);
+             new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
+             gcc_assert (!new_bb);
+             if (dump_enabled_p ())
+               {
+                 dump_printf_loc (MSG_NOTE, vect_location,
+                                  "created new init_stmt: ");
+                 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
+                  dump_printf (MSG_NOTE, "\n");
+               }
+             constant_p = false;
            }
          CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
        }
       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
-      new_vec = build_constructor (vectype, v);
+      if (constant_p)
+       new_vec = build_vector_from_ctor (vectype, v);
+      else
+       new_vec = build_constructor (vectype, v);
       vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
     }
 
@@ -3257,13 +3342,23 @@ get_initial_def_for_induction (gimple iv_phi)
     {
       /* iv_loop is the loop to be vectorized. Generate:
          vec_step = [VF*S, VF*S, VF*S, VF*S]  */
-      expr = build_int_cst (TREE_TYPE (step_expr), vf);
+      if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
+       {
+         expr = build_int_cst (integer_type_node, vf);
+         expr = fold_convert (TREE_TYPE (step_expr), expr);
+       }
+      else
+       expr = build_int_cst (TREE_TYPE (step_expr), vf);
       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
                              expr, step_expr);
+      if (TREE_CODE (step_expr) == SSA_NAME)
+       new_name = vect_init_vector (iv_phi, new_name,
+                                    TREE_TYPE (step_expr), NULL);
     }
 
   t = unshare_expr (new_name);
-  gcc_assert (CONSTANT_CLASS_P (new_name));
+  gcc_assert (CONSTANT_CLASS_P (new_name)
+             || TREE_CODE (new_name) == SSA_NAME);
   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
   gcc_assert (stepvectype);
   new_vec = build_vector_from_val (stepvectype, t);
@@ -3316,11 +3411,21 @@ get_initial_def_for_induction (gimple iv_phi)
       gcc_assert (!nested_in_vect_loop);
 
       /* Create the vector that holds the step of the induction.  */
-      expr = build_int_cst (TREE_TYPE (step_expr), nunits);
+      if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
+       {
+         expr = build_int_cst (integer_type_node, nunits);
+         expr = fold_convert (TREE_TYPE (step_expr), expr);
+       }
+      else
+       expr = build_int_cst (TREE_TYPE (step_expr), nunits);
       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
                              expr, step_expr);
+      if (TREE_CODE (step_expr) == SSA_NAME)
+       new_name = vect_init_vector (iv_phi, new_name,
+                                    TREE_TYPE (step_expr), NULL);
       t = unshare_expr (new_name);
-      gcc_assert (CONSTANT_CLASS_P (new_name));
+      gcc_assert (CONSTANT_CLASS_P (new_name)
+                 || TREE_CODE (new_name) == SSA_NAME);
       new_vec = build_vector_from_val (stepvectype, t);
       vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
 
@@ -3382,6 +3487,7 @@ get_initial_def_for_induction (gimple iv_phi)
              dump_printf_loc (MSG_NOTE, vect_location,
                               "vector of inductions after inner-loop:");
              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
+              dump_printf (MSG_NOTE, "\n");
            }
        }
     }
@@ -3395,6 +3501,7 @@ get_initial_def_for_induction (gimple iv_phi)
       dump_printf (MSG_NOTE, "\n");
       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
                        SSA_NAME_DEF_STMT (vec_def), 0);
+      dump_printf (MSG_NOTE, "\n");
     }
 
   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
@@ -3698,14 +3805,14 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
   bool extract_scalar_result = false;
   gimple use_stmt, orig_stmt, reduction_phi = NULL;
   bool nested_in_vect_loop = false;
-  vec<gimple> new_phis = vNULL;
-  vec<gimple> inner_phis = vNULL;
+  auto_vec<gimple> new_phis;
+  auto_vec<gimple> inner_phis;
   enum vect_def_type dt = vect_unknown_def_type;
   int j, i;
-  vec<tree> scalar_results = vNULL;
+  auto_vec<tree> scalar_results;
   unsigned int group_size = 1, k, ratio;
-  vec<tree> vec_initial_defs = vNULL;
-  vec<gimple> phis;
+  auto_vec<tree> vec_initial_defs;
+  auto_vec<gimple> phis;
   bool slp_reduc = false;
   tree new_phi_result;
   gimple inner_phi = NULL;
@@ -3802,14 +3909,13 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
               dump_printf (MSG_NOTE, "\n");
               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
+              dump_printf (MSG_NOTE, "\n");
             }
 
           phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
         }
     }
 
-  vec_initial_defs.release ();
-
   /* 2. Create epilog code.
         The reduction epilog code operates across the elements of the vector
         of partial results computed by the vectorized loop.
@@ -3998,7 +4104,7 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
 
       if (dump_enabled_p ())
         dump_printf_loc (MSG_NOTE, vect_location,
-                        "Reduce using direct vector reduction.");
+                        "Reduce using direct vector reduction.\n");
 
       vec_dest = vect_create_destination_var (scalar_dest, vectype);
       tmp = build1 (reduc_code, vectype, new_phi_result);
@@ -4014,8 +4120,8 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
       enum tree_code shift_code = ERROR_MARK;
       bool have_whole_vector_shift = true;
       int bit_offset;
-      int element_bitsize = tree_low_cst (bitsize, 1);
-      int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
+      int element_bitsize = tree_to_uhwi (bitsize);
+      int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
       tree vec_temp;
 
       if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
@@ -4049,7 +4155,7 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
 
           if (dump_enabled_p ())
             dump_printf_loc (MSG_NOTE, vect_location,
-                            "Reduce using vector shifts");
+                            "Reduce using vector shifts\n");
 
           vec_dest = vect_create_destination_var (scalar_dest, vectype);
           new_temp = new_phi_result;
@@ -4090,9 +4196,9 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
 
           if (dump_enabled_p ())
             dump_printf_loc (MSG_NOTE, vect_location,
-                            "Reduce using scalar code. ");
+                            "Reduce using scalar code.\n");
 
-          vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
+          vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
             {
               if (gimple_code (new_phi) == GIMPLE_PHI)
@@ -4181,7 +4287,7 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
 
       if (dump_enabled_p ())
         dump_printf_loc (MSG_NOTE, vect_location,
-                        "extract scalar result");
+                        "extract scalar result\n");
 
       if (BYTES_BIG_ENDIAN)
         bitpos = size_binop (MULT_EXPR,
@@ -4229,7 +4335,6 @@ vect_finalize_reduction:
       epilog_stmt = gimple_build_assign (new_dest, expr);
       new_temp = make_ssa_name (new_dest, epilog_stmt);
       gimple_assign_set_lhs (epilog_stmt, new_temp);
-      SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
       if (nested_in_vect_loop)
         {
@@ -4326,12 +4431,12 @@ vect_finalize_reduction:
          result.  (The reduction result is expected to have two immediate uses -
          one at the latch block, and one at the loop exit).  */
       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
-        if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
+        if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
+           && !is_gimple_debug (USE_STMT (use_p)))
           phis.safe_push (USE_STMT (use_p));
 
-      /* We expect to have found an exit_phi because of loop-closed-ssa
-         form.  */
-      gcc_assert (!phis.is_empty ());
+      /* While we expect to have found an exit_phi because of loop-closed-ssa
+         form we can end up without one if the scalar cycle is dead.  */
 
       FOR_EACH_VEC_ELT (phis, i, exit_phi)
         {
@@ -4421,6 +4526,7 @@ vect_finalize_reduction:
                       dump_printf_loc (MSG_NOTE, vect_location,
                                       "created double reduction phi node: ");
                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
+                      dump_printf (MSG_NOTE, "\n");
                     }
 
                   vect_phi_res = PHI_RESULT (vect_phi);
@@ -4456,7 +4562,10 @@ vect_finalize_reduction:
       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
         {
           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
-            phis.safe_push (USE_STMT (use_p));
+           {
+             if (!is_gimple_debug (USE_STMT (use_p)))
+               phis.safe_push (USE_STMT (use_p));
+           }
           else
             {
               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
@@ -4466,7 +4575,8 @@ vect_finalize_reduction:
                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
                     {
                       if (!flow_bb_inside_loop_p (loop,
-                                             gimple_bb (USE_STMT (phi_use_p))))
+                                             gimple_bb (USE_STMT (phi_use_p)))
+                         && !is_gimple_debug (USE_STMT (phi_use_p)))
                         phis.safe_push (USE_STMT (phi_use_p));
                     }
                 }
@@ -4485,10 +4595,7 @@ vect_finalize_reduction:
 
       phis.release ();
     }
-
-  scalar_results.release ();
-  new_phis.release ();
-} 
+}
 
 
 /* Function vectorizable_reduction.
@@ -4575,10 +4682,10 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
   struct loop * def_stmt_loop, *outer_loop = NULL;
   tree def_arg;
   gimple def_arg_stmt;
-  vec<tree> vec_oprnds0 = vNULL;
-  vec<tree> vec_oprnds1 = vNULL;
-  vec<tree> vect_defs = vNULL;
-  vec<gimple> phis = vNULL;
+  auto_vec<tree> vec_oprnds0;
+  auto_vec<tree> vec_oprnds1;
+  auto_vec<tree> vect_defs;
+  auto_vec<gimple> phis;
   int vec_num;
   tree def0, def1, tem, op0, op1 = NULL_TREE;
 
@@ -4693,7 +4800,7 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
      The last use is the reduction variable.  In case of nested cycle this
      assumption is not true: we use reduc_index to record the index of the
      reduction variable.  */
-  for (i = 0; i < op_type-1; i++)
+  for (i = 0; i < op_type - 1; i++)
     {
       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
       if (i == 0 && code == COND_EXPR)
@@ -4725,11 +4832,18 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
   if (!vectype_in)
     vectype_in = tem;
   gcc_assert (is_simple_use);
-  gcc_assert (dt == vect_reduction_def
-              || dt == vect_nested_cycle
-              || ((dt == vect_internal_def || dt == vect_external_def
-                   || dt == vect_constant_def || dt == vect_induction_def)
-                   && nested_cycle && found_nested_cycle_def));
+  if (!(dt == vect_reduction_def
+       || dt == vect_nested_cycle
+       || ((dt == vect_internal_def || dt == vect_external_def
+            || dt == vect_constant_def || dt == vect_induction_def)
+           && nested_cycle && found_nested_cycle_def)))
+    {
+      /* For pattern recognized stmts, orig_stmt might be a reduction,
+        but some helper statements for the pattern might not, or
+        might be COND_EXPRs with reduction uses in the condition.  */
+      gcc_assert (orig_stmt);
+      return false;
+    }
   if (!found_nested_cycle_def)
     reduc_def_stmt = def_stmt;
 
@@ -4768,7 +4882,7 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
         {
           if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "unsupported condition in reduction");
+                            "unsupported condition in reduction\n");
 
             return false;
         }
@@ -4777,13 +4891,24 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
     {
       /* 4. Supportable by target?  */
 
+      if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
+         || code == LROTATE_EXPR || code == RROTATE_EXPR)
+       {
+         /* Shifts and rotates are only supported by vectorizable_shifts,
+            not vectorizable_reduction.  */
+          if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "unsupported shift or rotation.\n");
+         return false;
+       }
+
       /* 4.1. check support for the operation in the loop  */
       optab = optab_for_tree_code (code, vectype_in, optab_default);
       if (!optab)
         {
           if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "no optab.");
+                            "no optab.\n");
 
           return false;
         }
@@ -4791,7 +4916,7 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
         {
           if (dump_enabled_p ())
-            dump_printf (MSG_NOTE, "op not supported by target.");
+            dump_printf (MSG_NOTE, "op not supported by target.\n");
 
           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
@@ -4799,7 +4924,7 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
             return false;
 
           if (dump_enabled_p ())
-           dump_printf (MSG_NOTE, "proceeding using word mode.");
+           dump_printf (MSG_NOTE, "proceeding using word mode.\n");
         }
 
       /* Worthwhile without SIMD support?  */
@@ -4809,7 +4934,7 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
         {
           if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "not worthwhile without SIMD support.");
+                            "not worthwhile without SIMD support.\n");
 
           return false;
         }
@@ -4890,7 +5015,7 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
         {
           if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "no optab for reduction.");
+                            "no optab for reduction.\n");
 
           epilog_reduc_code = ERROR_MARK;
         }
@@ -4900,7 +5025,7 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
         {
           if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "reduc op not supported by target.");
+                            "reduc op not supported by target.\n");
 
           epilog_reduc_code = ERROR_MARK;
         }
@@ -4911,7 +5036,7 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
         {
           if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "no reduc code for scalar code.");
+                            "no reduc code for scalar code.\n");
 
           return false;
         }
@@ -4921,7 +5046,7 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
     {
       if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "multiple types in double reduction");
+                        "multiple types in double reduction\n");
 
       return false;
     }
@@ -4940,7 +5065,7 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
         {
           if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "invalid types in dot-prod");
+                            "invalid types in dot-prod\n");
 
           return false;
         }
@@ -4957,7 +5082,7 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
   /** Transform.  **/
 
   if (dump_enabled_p ())
-    dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.");
+    dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
 
   /* FORNOW: Multiple types are not supported for condition.  */
   if (code == COND_EXPR)
@@ -5176,10 +5301,6 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
                                     epilog_reduc_code, phis, reduc_index,
                                     double_reduc, slp_node);
 
-  phis.release ();
-  vec_oprnds0.release ();
-  vec_oprnds1.release ();
-
   return true;
 }
 
@@ -5243,7 +5364,7 @@ vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "multiple types in nested loop.");
+                            "multiple types in nested loop.\n");
          return false;
        }
 
@@ -5266,9 +5387,9 @@ vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
                && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
            {
              if (dump_enabled_p ())
-               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
+               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "inner-loop induction only used outside "
-                                "of the outer vectorized loop.");
+                                "of the outer vectorized loop.\n");
              return false;
            }
        }
@@ -5291,7 +5412,7 @@ vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
       if (dump_enabled_p ())
         dump_printf_loc (MSG_NOTE, vect_location,
-                         "=== vectorizable_induction ===");
+                         "=== vectorizable_induction ===\n");
       vect_model_induction_cost (stmt_info, ncopies);
       return true;
     }
@@ -5299,7 +5420,7 @@ vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
   /** Transform.  **/
 
   if (dump_enabled_p ())
-    dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.");
+    dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
 
   vec_def = get_initial_def_for_induction (phi);
   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
@@ -5314,7 +5435,7 @@ vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
 bool
 vectorizable_live_operation (gimple stmt,
                             gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
-                            gimple *vec_stmt ATTRIBUTE_UNUSED)
+                            gimple *vec_stmt)
 {
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
@@ -5334,7 +5455,41 @@ vectorizable_live_operation (gimple stmt,
     return false;
 
   if (!is_gimple_assign (stmt))
-    return false;
+    {
+      if (gimple_call_internal_p (stmt)
+         && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
+         && gimple_call_lhs (stmt)
+         && loop->simduid
+         && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
+         && loop->simduid
+            == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
+       {
+         edge e = single_exit (loop);
+         basic_block merge_bb = e->dest;
+         imm_use_iterator imm_iter;
+         use_operand_p use_p;
+         tree lhs = gimple_call_lhs (stmt);
+
+         FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
+           {
+             gimple use_stmt = USE_STMT (use_p);
+             if (gimple_code (use_stmt) == GIMPLE_PHI
+                 || gimple_bb (use_stmt) == merge_bb)
+               {
+                 if (vec_stmt)
+                   {
+                     tree vfm1
+                       = build_int_cst (unsigned_type_node,
+                                        loop_vinfo->vectorization_factor - 1);
+                     SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
+                   }
+                 return true;
+               }
+           }
+       }
+
+      return false;
+    }
 
   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
     return false;
@@ -5365,7 +5520,7 @@ vectorizable_live_operation (gimple stmt,
         {
           if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "use not simple.");
+                            "use not simple.\n");
           return false;
         }
 
@@ -5404,7 +5559,7 @@ vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
-                                     "killing debug use");
+                                     "killing debug use\n");
 
                  gimple_debug_bind_reset_value (ustmt);
                  update_stmt (ustmt);
@@ -5416,6 +5571,111 @@ vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
     }
 }
 
+
+/* Return a tree holding the iteration count of LOOP_VINFO's loop.  A
+   constant count is returned directly; a symbolic one is gimplified, with
+   any statements that requires placed on the loop preheader edge.  */
+
+static tree
+vect_build_loop_niters (loop_vec_info loop_vinfo)
+{
+  tree niters = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
+
+  /* An INTEGER_CST can be used directly; nothing has to be emitted.  */
+  if (TREE_CODE (niters) == INTEGER_CST)
+    return niters;
+
+  gimple_seq seq = NULL;
+  edge preheader = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
+  tree tmp = create_tmp_var (TREE_TYPE (niters), "niters");
+  tree niters_name = force_gimple_operand (niters, &seq, false, tmp);
+
+  /* Gimplification may have produced no statements to insert.  */
+  if (seq)
+    gsi_insert_seq_on_edge_immediate (preheader, seq);
+  return niters_name;
+}
+
+
+/* Given NI_NAME, the number of iterations of LOOP_VINFO's loop, build:
+
+   ratio = ni_name >> log2 (vf)
+   ratio_mult_vf_name = ratio << log2 (vf)
+
+   and place any statements they need on the loop preheader edge.  If
+   peeling for gaps is required, one iteration is subtracted from
+   NI_NAME before RATIO is computed.  RATIO is stored through
+   RATIO_NAME_PTR, which is dereferenced unconditionally; the second
+   value is built and stored through RATIO_MULT_VF_NAME_PTR only when
+   that pointer is non-NULL.  */
+
+static void
+vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
+                                tree ni_name,
+                                tree *ratio_mult_vf_name_ptr,
+                                tree *ratio_name_ptr)
+{
+  tree ni_minus_gap_name;
+  tree var;
+  tree ratio_name;
+  tree ratio_mult_vf_name;
+  tree ni = LOOP_VINFO_NITERS (loop_vinfo);
+  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+  edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
+
+  /* NOTE(review): presumably VF is a power of two here -- confirm.  */
+  tree log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
+
+  /* If epilogue loop is required because of data accesses with gaps, we
+     subtract one iteration from the total number of iterations here for
+     correct calculation of RATIO.  */
+  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
+    {
+      ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
+                                      ni_name,
+                                      build_one_cst (TREE_TYPE (ni_name)));
+      if (!is_gimple_val (ni_minus_gap_name))
+       {
+         var = create_tmp_var (TREE_TYPE (ni), "ni_gap");
+         gimple_seq stmts = NULL;
+         ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
+                                                   true, var);
+         gsi_insert_seq_on_edge_immediate (pe, stmts);
+       }
+    }
+  else
+    ni_minus_gap_name = ni_name;
+
+  /* Create: ratio = ni >> log2(vf) */
+  ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_minus_gap_name),
+                           ni_minus_gap_name, log_vf);
+  if (!is_gimple_val (ratio_name))
+    {
+      var = create_tmp_var (TREE_TYPE (ni), "bnd");
+      gimple_seq stmts = NULL;
+      ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
+      gsi_insert_seq_on_edge_immediate (pe, stmts);
+    }
+  *ratio_name_ptr = ratio_name;
+
+  /* Create: ratio_mult_vf = ratio << log2 (vf).  */
+  if (ratio_mult_vf_name_ptr)
+    {
+      ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
+                                       ratio_name, log_vf);
+      if (!is_gimple_val (ratio_mult_vf_name))
+       {
+         var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
+         gimple_seq stmts = NULL;
+         ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
+                                                    true, var);
+         gsi_insert_seq_on_edge_immediate (pe, stmts);
+       }
+      *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
+    }
+}
+
+
 /* Function vect_transform_loop.
 
    The analysis phase has determined that the loop is vectorizable.
@@ -5445,7 +5705,7 @@ vect_transform_loop (loop_vec_info loop_vinfo)
   gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
 
   if (dump_enabled_p ())
-    dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===");
+    dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
 
   /* If profile is inprecise, we have chance to fix it up.  */
   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
@@ -5464,24 +5724,35 @@ vect_transform_loop (loop_vec_info loop_vinfo)
     {
       if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
-                        "Profitability threshold is %d loop iterations.", th);
+                        "Profitability threshold is %d loop iterations.\n",
+                         th);
       check_profitability = true;
     }
 
-  /* Peel the loop if there are data refs with unknown alignment.
-     Only one data ref with unknown store is allowed.  */
+  /* Version the loop first, if required, so the profitability check
+     comes first.  */
 
-  if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
+  if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
+      || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
     {
-      vect_do_peeling_for_alignment (loop_vinfo, th, check_profitability);
+      vect_loop_versioning (loop_vinfo, th, check_profitability);
       check_profitability = false;
     }
 
-  if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
-      || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
+  tree ni_name = vect_build_loop_niters (loop_vinfo);
+  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
+
+  /* Peel the loop if there are data refs with unknown alignment.
+     Only one data ref with unknown store is allowed.  */
+
+  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
     {
-      vect_loop_versioning (loop_vinfo, th, check_profitability);
+      vect_do_peeling_for_alignment (loop_vinfo, ni_name,
+                                    th, check_profitability);
       check_profitability = false;
+      /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
+        be re-computed.  */
+      ni_name = NULL_TREE;
     }
 
   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
@@ -5492,15 +5763,26 @@ vect_transform_loop (loop_vec_info loop_vinfo)
      will remain scalar and will compute the remaining (n%VF) iterations.
      (VF is the vectorization factor).  */
 
-  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-       || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-          && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)
-       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
-    vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
-                                   th, check_profitability);
-  else
+  if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
+      || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
+    {
+      tree ratio_mult_vf;
+      if (!ni_name)
+       ni_name = vect_build_loop_niters (loop_vinfo);
+      vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
+                                      &ratio);
+      vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
+                                     th, check_profitability);
+    }
+  else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
                LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
+  else
+    {
+      if (!ni_name)
+       ni_name = vect_build_loop_niters (loop_vinfo);
+      vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
+    }
 
   /* 1) Make sure the loop header has exactly two entries
      2) Make sure we have a preheader basic block.  */
@@ -5528,6 +5810,7 @@ vect_transform_loop (loop_vec_info loop_vinfo)
              dump_printf_loc (MSG_NOTE, vect_location,
                                "------>vectorizing phi: ");
              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
+              dump_printf (MSG_NOTE, "\n");
            }
          stmt_info = vinfo_for_stmt (phi);
          if (!stmt_info)
@@ -5543,12 +5826,12 @@ vect_transform_loop (loop_vec_info loop_vinfo)
          if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
                != (unsigned HOST_WIDE_INT) vectorization_factor)
              && dump_enabled_p ())
-           dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.");
+           dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
 
          if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
            {
              if (dump_enabled_p ())
-               dump_printf_loc (MSG_NOTE, vect_location, "transform phi.");
+               dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
              vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
            }
        }
@@ -5561,13 +5844,24 @@ vect_transform_loop (loop_vec_info loop_vinfo)
           if (transform_pattern_stmt)
            stmt = pattern_stmt;
           else
-            stmt = gsi_stmt (si);
+           {
+             stmt = gsi_stmt (si);
+             /* During vectorization remove existing clobber stmts.  */
+             if (gimple_clobber_p (stmt))
+               {
+                 unlink_stmt_vdef (stmt);
+                 gsi_remove (&si, true);
+                 release_defs (stmt);
+                 continue;
+               }
+           }
 
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "------>vectorizing statement: ");
              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
+              dump_printf (MSG_NOTE, "\n");
            }
 
          stmt_info = vinfo_for_stmt (stmt);
@@ -5642,6 +5936,7 @@ vect_transform_loop (loop_vec_info loop_vinfo)
                                           "stmt: ");
                          dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
                                            pattern_def_stmt, 0);
+                          dump_printf (MSG_NOTE, "\n");
                        }
 
                      stmt = pattern_def_stmt;
@@ -5666,7 +5961,7 @@ vect_transform_loop (loop_vec_info loop_vinfo)
            /* For SLP VF is set according to unrolling factor, and not to
               vector size, hence for SLP this print is not valid.  */
             dump_printf_loc (MSG_NOTE, vect_location,
-                            "multiple-types.");
+                            "multiple-types.\n");
 
          /* SLP. Schedule all the SLP instances when the first SLP stmt is
             reached.  */
@@ -5678,7 +5973,7 @@ vect_transform_loop (loop_vec_info loop_vinfo)
 
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
-                                    "=== scheduling SLP instances ===");
+                                    "=== scheduling SLP instances ===\n");
 
                  vect_schedule_slp (loop_vinfo, NULL);
                }
@@ -5697,7 +5992,7 @@ vect_transform_loop (loop_vec_info loop_vinfo)
 
          /* -------- vectorize statement ------------ */
          if (dump_enabled_p ())
-           dump_printf_loc (MSG_NOTE, vect_location, "transform statement.");
+           dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
 
          grouped_store = false;
          is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
@@ -5735,7 +6030,7 @@ vect_transform_loop (loop_vec_info loop_vinfo)
   slpeel_make_loop_iterate_ntimes (loop, ratio);
 
   /* Reduce loop iterations by the vectorization factor.  */
-  scale_loop_profile (loop, RDIV (REG_BR_PROB_BASE , vectorization_factor),
+  scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
                      expected_iterations / vectorization_factor);
   loop->nb_iterations_upper_bound
     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (vectorization_factor),
@@ -5753,14 +6048,13 @@ vect_transform_loop (loop_vec_info loop_vinfo)
         loop->nb_iterations_estimate = loop->nb_iterations_estimate - double_int_one;
     }
 
-  /* The memory tags and pointers in vectorized statements need to
-     have their SSA forms updated.  FIXME, why can't this be delayed
-     until all the loops have been transformed?  */
-  update_ssa (TODO_update_ssa);
-
   if (dump_enabled_p ())
-    dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, "LOOP VECTORIZED.");
-  if (loop->inner && dump_enabled_p ())
-    dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
-                    "OUTER LOOP VECTORIZED.");
+    {
+      dump_printf_loc (MSG_NOTE, vect_location,
+                      "LOOP VECTORIZED\n");
+      if (loop->inner)
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "OUTER LOOP VECTORIZED\n");
+      dump_printf (MSG_NOTE, "\n");
+    }
 }