diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index f9b9639..491b239 100644
@@ -1,5 +1,5 @@
 /* Statement Analysis and Transformation for Vectorization
-   Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
+   Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
    Free Software Foundation, Inc.
    Contributed by Dorit Naishlos <dorit@il.ibm.com>
    and Ira Rosen <irar@il.ibm.com>
@@ -28,19 +28,79 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree.h"
 #include "target.h"
 #include "basic-block.h"
-#include "tree-pretty-print.h"
 #include "gimple-pretty-print.h"
 #include "tree-flow.h"
-#include "tree-dump.h"
 #include "cfgloop.h"
-#include "cfglayout.h"
 #include "expr.h"
-#include "recog.h"
+#include "recog.h"             /* FIXME: for insn_data */
 #include "optabs.h"
 #include "diagnostic-core.h"
 #include "tree-vectorizer.h"
+#include "dumpfile.h"
+
+/* For lang_hooks.types.type_for_mode.  */
 #include "langhooks.h"
 
+/* Return the vectorized type for the given statement.  */
+
+tree
+stmt_vectype (struct _stmt_vec_info *stmt_info)
+{
+  return STMT_VINFO_VECTYPE (stmt_info);
+}
+
+/* Return TRUE iff the given statement is in an inner loop relative to
+   the loop being vectorized.  */
+bool
+stmt_in_inner_loop_p (struct _stmt_vec_info *stmt_info)
+{
+  gimple stmt = STMT_VINFO_STMT (stmt_info);
+  basic_block bb = gimple_bb (stmt);
+  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+  struct loop* loop;
+
+  if (!loop_vinfo)
+    return false;
+
+  loop = LOOP_VINFO_LOOP (loop_vinfo);
+
+  return (bb->loop_father == loop->inner);
+}
+
+/* Record the cost of a statement, either by directly informing the 
+   target model or by saving it in a vector for later processing.
+   Return a preliminary estimate of the statement's cost.  */
+
+unsigned
+record_stmt_cost (stmt_vector_for_cost *body_cost_vec, int count,
+                 enum vect_cost_for_stmt kind, stmt_vec_info stmt_info,
+                 int misalign, enum vect_cost_model_location where)
+{
+  if (body_cost_vec)
+    {
+      tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
+      add_stmt_info_to_vec (body_cost_vec, count, kind,
+                           stmt_info ? STMT_VINFO_STMT (stmt_info) : NULL,
+                           misalign);
+      return (unsigned)
+       (builtin_vectorization_cost (kind, vectype, misalign) * count);
+        
+    }
+  else
+    {
+      loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+      bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
+      void *target_cost_data;
+
+      if (loop_vinfo)
+       target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
+      else
+       target_cost_data = BB_VINFO_TARGET_COST_DATA (bb_vinfo);
+
+      return add_stmt_cost (target_cost_data, count, kind, stmt_info,
+                           misalign, where);
+    }
+}
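
Editor's note (illustrative, not part of the patch): a minimal sketch of the two
modes of record_stmt_cost above.  The variable names (body_costs, ncopies,
stmt_info) are hypothetical, and allocation/freeing of the cost vector is
omitted.

      /* Deferred mode: the cost is remembered in a caller-owned vector and
         only a preliminary estimate from builtin_vectorization_cost is
         returned.  */
      stmt_vector_for_cost body_costs = NULL;  /* hypothetical; grown on use */
      unsigned estimate = record_stmt_cost (&body_costs, ncopies, vector_stmt,
                                            stmt_info, 0, vect_body);

      /* Immediate mode: with no vector, the cost goes straight to the
         target's cost-model data via add_stmt_cost.  */
      unsigned cost = record_stmt_cost (NULL, 1, scalar_to_vec,
                                        stmt_info, 0, vect_prologue);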
 
 /* Return a variable of type ELEM_TYPE[NELEMS].  */
 
@@ -74,7 +134,6 @@ read_vector_array (gimple stmt, gimple_stmt_iterator *gsi, tree scalar_dest,
   vect_name = make_ssa_name (vect, new_stmt);
   gimple_assign_set_lhs (new_stmt, vect_name);
   vect_finish_stmt_generation (stmt, new_stmt, gsi);
-  mark_symbols_for_renaming (new_stmt);
 
   return vect_name;
 }
@@ -96,7 +155,6 @@ write_vector_array (gimple stmt, gimple_stmt_iterator *gsi, tree vect,
 
   new_stmt = gimple_build_assign (array_ref, vect);
   vect_finish_stmt_generation (stmt, new_stmt, gsi);
-  mark_symbols_for_renaming (new_stmt);
 }
 
 /* PTR is a pointer to an array of type TYPE.  Return a representation
@@ -106,15 +164,12 @@ write_vector_array (gimple stmt, gimple_stmt_iterator *gsi, tree vect,
 static tree
 create_array_ref (tree type, tree ptr, struct data_reference *first_dr)
 {
-  struct ptr_info_def *pi;
   tree mem_ref, alias_ptr_type;
 
   alias_ptr_type = reference_alias_ptr_type (DR_REF (first_dr));
   mem_ref = build2 (MEM_REF, type, ptr, build_int_cst (alias_ptr_type, 0));
   /* Arrays have the same alignment as their type.  */
-  pi = get_ptr_info (ptr);
-  pi->align = TYPE_ALIGN_UNIT (type);
-  pi->misalign = 0;
+  set_ptr_info_alignment (get_ptr_info (ptr), TYPE_ALIGN_UNIT (type), 0);
   return mem_ref;
 }
 
@@ -126,33 +181,78 @@ create_array_ref (tree type, tree ptr, struct data_reference *first_dr)
 
 static void
 vect_mark_relevant (VEC(gimple,heap) **worklist, gimple stmt,
-                   enum vect_relevant relevant, bool live_p)
+                   enum vect_relevant relevant, bool live_p,
+                   bool used_in_pattern)
 {
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
   enum vect_relevant save_relevant = STMT_VINFO_RELEVANT (stmt_info);
   bool save_live_p = STMT_VINFO_LIVE_P (stmt_info);
+  gimple pattern_stmt;
 
   if (vect_print_dump_info (REPORT_DETAILS))
     fprintf (vect_dump, "mark relevant %d, live %d.", relevant, live_p);
 
+  /* If this stmt is an original stmt in a pattern, we might need to mark its
+     related pattern stmt instead of the original stmt.  However, such stmts
+     may have their own uses that are not in any pattern; in such cases the
+     stmt itself should be marked.  */
   if (STMT_VINFO_IN_PATTERN_P (stmt_info))
     {
-      gimple pattern_stmt;
+      bool found = false;
+      if (!used_in_pattern)
+        {
+          imm_use_iterator imm_iter;
+          use_operand_p use_p;
+          gimple use_stmt;
+          tree lhs;
+         loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+         struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 
-      /* This is the last stmt in a sequence that was detected as a
-         pattern that can potentially be vectorized.  Don't mark the stmt
-         as relevant/live because it's not going to be vectorized.
-         Instead mark the pattern-stmt that replaces it.  */
+          if (is_gimple_assign (stmt))
+            lhs = gimple_assign_lhs (stmt);
+          else
+            lhs = gimple_call_lhs (stmt);
 
-      pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
+          /* This use is outside the pattern.  If LHS has other uses that are
+             pattern uses, we should mark the stmt itself, and not the pattern
+             stmt.  */
+         if (TREE_CODE (lhs) == SSA_NAME)
+           FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
+             {
+               if (is_gimple_debug (USE_STMT (use_p)))
+                 continue;
+               use_stmt = USE_STMT (use_p);
 
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "last stmt in pattern. don't mark relevant/live.");
-      stmt_info = vinfo_for_stmt (pattern_stmt);
-      gcc_assert (STMT_VINFO_RELATED_STMT (stmt_info) == stmt);
-      save_relevant = STMT_VINFO_RELEVANT (stmt_info);
-      save_live_p = STMT_VINFO_LIVE_P (stmt_info);
-      stmt = pattern_stmt;
+               if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
+                 continue;
+
+               if (vinfo_for_stmt (use_stmt)
+                   && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
+                 {
+                   found = true;
+                   break;
+                 }
+             }
+        }
+
+      if (!found)
+        {
+          /* This is the last stmt in a sequence that was detected as a
+             pattern that can potentially be vectorized.  Don't mark the stmt
+             as relevant/live because it's not going to be vectorized.
+             Instead mark the pattern-stmt that replaces it.  */
+
+          pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
+
+          if (vect_print_dump_info (REPORT_DETAILS))
+            fprintf (vect_dump, "last stmt in pattern. don't mark"
+                                " relevant/live.");
+          stmt_info = vinfo_for_stmt (pattern_stmt);
+          gcc_assert (STMT_VINFO_RELATED_STMT (stmt_info) == stmt);
+          save_relevant = STMT_VINFO_RELEVANT (stmt_info);
+          save_live_p = STMT_VINFO_LIVE_P (stmt_info);
+          stmt = pattern_stmt;
+        }
     }
 
   STMT_VINFO_LIVE_P (stmt_info) |= live_p;
@@ -292,6 +392,8 @@ exist_non_indexing_operands_for_use_p (tree use, gimple stmt)
    - LIVE_P, RELEVANT - enum values to be set in the STMT_VINFO of the stmt
      that defined USE.  This is done by calling mark_relevant and passing it
      the WORKLIST (to add DEF_STMT to the WORKLIST in case it is relevant).
+   - FORCE is true if the exist_non_indexing_operands_for_use_p check
+     shouldn't be performed.
 
    Outputs:
    Generally, LIVE_P and RELEVANT are used to define the liveness and
@@ -311,7 +413,8 @@ exist_non_indexing_operands_for_use_p (tree use, gimple stmt)
 
 static bool
 process_use (gimple stmt, tree use, loop_vec_info loop_vinfo, bool live_p,
-            enum vect_relevant relevant, VEC(gimple,heap) **worklist)
+            enum vect_relevant relevant, VEC(gimple,heap) **worklist,
+            bool force)
 {
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
@@ -323,10 +426,10 @@ process_use (gimple stmt, tree use, loop_vec_info loop_vinfo, bool live_p,
 
   /* case 1: we are only interested in uses that need to be vectorized.  Uses
      that are used for address computation are not considered relevant.  */
-  if (!exist_non_indexing_operands_for_use_p (use, stmt))
+  if (!force && !exist_non_indexing_operands_for_use_p (use, stmt))
      return true;
 
-  if (!vect_is_simple_use (use, loop_vinfo, NULL, &def_stmt, &def, &dt))
+  if (!vect_is_simple_use (use, stmt, loop_vinfo, NULL, &def_stmt, &def, &dt))
     {
       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
         fprintf (vect_dump, "not vectorized: unsupported use in stmt.");
@@ -437,7 +540,8 @@ process_use (gimple stmt, tree use, loop_vec_info loop_vinfo, bool live_p,
         }
     }
 
-  vect_mark_relevant (worklist, def_stmt, relevant, live_p);
+  vect_mark_relevant (worklist, def_stmt, relevant, live_p,
+                      is_pattern_stmt_p (stmt_vinfo));
   return true;
 }
 
@@ -494,7 +598,7 @@ vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo)
            }
 
          if (vect_stmt_relevant_p (phi, loop_vinfo, &relevant, &live_p))
-           vect_mark_relevant (&worklist, phi, relevant, live_p);
+           vect_mark_relevant (&worklist, phi, relevant, live_p, false);
        }
       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
@@ -506,7 +610,7 @@ vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo)
            }
 
          if (vect_stmt_relevant_p (stmt, loop_vinfo, &relevant, &live_p))
-            vect_mark_relevant (&worklist, stmt, relevant, live_p);
+            vect_mark_relevant (&worklist, stmt, relevant, live_p, false);
        }
     }
 
@@ -605,18 +709,34 @@ vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo)
             break;
         }
 
-      if (is_pattern_stmt_p (vinfo_for_stmt (stmt)))
+      if (is_pattern_stmt_p (stmt_vinfo))
         {
           /* Pattern statements are not inserted into the code, so
              FOR_EACH_PHI_OR_STMT_USE optimizes their operands out, and we
              have to scan the RHS or function arguments instead.  */
           if (is_gimple_assign (stmt))
             {
-              for (i = 1; i < gimple_num_ops (stmt); i++)
+             enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
+             tree op = gimple_assign_rhs1 (stmt);
+
+             i = 1;
+             if (rhs_code == COND_EXPR && COMPARISON_CLASS_P (op))
+               {
+                 if (!process_use (stmt, TREE_OPERAND (op, 0), loop_vinfo,
+                                   live_p, relevant, &worklist, false)
+                     || !process_use (stmt, TREE_OPERAND (op, 1), loop_vinfo,
+                                      live_p, relevant, &worklist, false))
+                   {
+                     VEC_free (gimple, heap, worklist);
+                     return false;
+                   }
+                 i = 2;
+               }
+             for (; i < gimple_num_ops (stmt); i++)
                 {
-                  tree op = gimple_op (stmt, i);
+                 op = gimple_op (stmt, i);
                   if (!process_use (stmt, op, loop_vinfo, live_p, relevant,
-                                    &worklist))
+                                   &worklist, false))
                     {
                       VEC_free (gimple, heap, worklist);
                       return false;
@@ -629,7 +749,7 @@ vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo)
                 {
                   tree arg = gimple_call_arg (stmt, i);
                   if (!process_use (stmt, arg, loop_vinfo, live_p, relevant,
-                                    &worklist))
+                                   &worklist, false))
                     {
                       VEC_free (gimple, heap, worklist);
                       return false;
@@ -642,12 +762,25 @@ vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo)
           {
             tree op = USE_FROM_PTR (use_p);
             if (!process_use (stmt, op, loop_vinfo, live_p, relevant,
-                              &worklist))
+                             &worklist, false))
               {
                 VEC_free (gimple, heap, worklist);
                 return false;
               }
           }
+
+      if (STMT_VINFO_GATHER_P (stmt_vinfo))
+       {
+         tree off;
+         tree decl = vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
+         gcc_assert (decl);
+         if (!process_use (stmt, off, loop_vinfo, live_p, relevant,
+                           &worklist, true))
+           {
+             VEC_free (gimple, heap, worklist);
+             return false;
+           }
+       }
     } /* while worklist */
 
   VEC_free (gimple, heap, worklist);
@@ -655,48 +788,6 @@ vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo)
 }
 
 
-/* Get cost by calling cost target builtin.  */
-
-static inline
-int vect_get_stmt_cost (enum vect_cost_for_stmt type_of_cost)
-{
-  tree dummy_type = NULL;
-  int dummy = 0;
-
-  return targetm.vectorize.builtin_vectorization_cost (type_of_cost,
-                                                       dummy_type, dummy);
-}
-
-
-/* Get cost for STMT.  */
-
-int
-cost_for_stmt (gimple stmt)
-{
-  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
-
-  switch (STMT_VINFO_TYPE (stmt_info))
-  {
-  case load_vec_info_type:
-    return vect_get_stmt_cost (scalar_load);
-  case store_vec_info_type:
-    return vect_get_stmt_cost (scalar_store);
-  case op_vec_info_type:
-  case condition_vec_info_type:
-  case assignment_vec_info_type:
-  case reduc_vec_info_type:
-  case induc_vec_info_type:
-  case type_promotion_vec_info_type:
-  case type_demotion_vec_info_type:
-  case type_conversion_vec_info_type:
-  case call_vec_info_type:
-    return vect_get_stmt_cost (scalar_stmt);
-  case undef_vec_info_type:
-  default:
-    gcc_unreachable ();
-  }
-}
-
 /* Function vect_model_simple_cost.
 
    Models cost for simple operations, i.e. those that only emit ncopies of a
@@ -705,42 +796,85 @@ cost_for_stmt (gimple stmt)
 
 void
 vect_model_simple_cost (stmt_vec_info stmt_info, int ncopies,
-                       enum vect_def_type *dt, slp_tree slp_node)
+                       enum vect_def_type *dt,
+                       stmt_vector_for_cost *prologue_cost_vec,
+                       stmt_vector_for_cost *body_cost_vec)
 {
   int i;
-  int inside_cost = 0, outside_cost = 0;
+  int inside_cost = 0, prologue_cost = 0;
 
   /* The SLP costs were already calculated during SLP tree build.  */
   if (PURE_SLP_STMT (stmt_info))
     return;
 
-  inside_cost = ncopies * vect_get_stmt_cost (vector_stmt); 
-
   /* FORNOW: Assuming maximum 2 args per stmts.  */
   for (i = 0; i < 2; i++)
-    {
-      if (dt[i] == vect_constant_def || dt[i] == vect_external_def)
-       outside_cost += vect_get_stmt_cost (vector_stmt); 
-    }
+    if (dt[i] == vect_constant_def || dt[i] == vect_external_def)
+      prologue_cost += record_stmt_cost (prologue_cost_vec, 1, vector_stmt,
+                                        stmt_info, 0, vect_prologue);
+
+  /* Pass the inside-of-loop statements to the target-specific cost model.  */
+  inside_cost = record_stmt_cost (body_cost_vec, ncopies, vector_stmt,
+                                 stmt_info, 0, vect_body);
 
   if (vect_print_dump_info (REPORT_COST))
     fprintf (vect_dump, "vect_model_simple_cost: inside_cost = %d, "
-             "outside_cost = %d .", inside_cost, outside_cost);
-
-  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
-  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
-  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
+             "prologue_cost = %d .", inside_cost, prologue_cost);
 }
 
 
-/* Function vect_cost_strided_group_size
+/* Model cost for type demotion and promotion operations.  PWR is normally
+   zero for single-step promotions and demotions.  It will be one if 
+   two-step promotion/demotion is required, and so on.  Each additional
+   step doubles the number of instructions required.  */
+
+static void
+vect_model_promotion_demotion_cost (stmt_vec_info stmt_info,
+                                   enum vect_def_type *dt, int pwr)
+{
+  int i, tmp;
+  int inside_cost = 0, prologue_cost = 0;
+  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
+  void *target_cost_data;
+
+  /* The SLP costs were already calculated during SLP tree build.  */
+  if (PURE_SLP_STMT (stmt_info))
+    return;
+
+  if (loop_vinfo)
+    target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
+  else
+    target_cost_data = BB_VINFO_TARGET_COST_DATA (bb_vinfo);
+
+  for (i = 0; i < pwr + 1; i++)
+    {
+      tmp = (STMT_VINFO_TYPE (stmt_info) == type_promotion_vec_info_type) ?
+       (i + 1) : i;
+      inside_cost += add_stmt_cost (target_cost_data, vect_pow2 (tmp),
+                                   vec_promote_demote, stmt_info, 0,
+                                   vect_body);
+    }
+
+  /* FORNOW: Assuming maximum 2 args per stmts.  */
+  for (i = 0; i < 2; i++)
+    if (dt[i] == vect_constant_def || dt[i] == vect_external_def)
+      prologue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
+                                     stmt_info, 0, vect_prologue);
+
+  if (vect_print_dump_info (REPORT_COST))
+    fprintf (vect_dump, "vect_model_promotion_demotion_cost: inside_cost = %d, "
+             "prologue_cost = %d .", inside_cost, prologue_cost);
+}
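
Editorial worked example (assuming vect_pow2 (N) returns 2**N): for a two-step
promotion, PWR is 1, so the loop above charges vect_pow2 (1) + vect_pow2 (2)
= 2 + 4 = 6 vec_promote_demote statements to the loop body, while the
corresponding two-step demotion charges vect_pow2 (0) + vect_pow2 (1)
= 1 + 2 = 3; each additional step doubles the count, as the function comment
states.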
+
+/* Function vect_cost_group_size
 
-   For strided load or store, return the group_size only if it is the first
+   For grouped load or store, return the group_size only if it is the first
    load or store of a group, else return 1.  This ensures that group size is
    only returned once per group.  */
 
 static int
-vect_cost_strided_group_size (stmt_vec_info stmt_info)
+vect_cost_group_size (stmt_vec_info stmt_info)
 {
   gimple first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
 
@@ -753,16 +887,18 @@ vect_cost_strided_group_size (stmt_vec_info stmt_info)
 
 /* Function vect_model_store_cost
 
-   Models cost for stores.  In the case of strided accesses, one access
-   has the overhead of the strided access attributed to it.  */
+   Models cost for stores.  In the case of grouped accesses, one access
+   has the overhead of the grouped access attributed to it.  */
 
 void
 vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
                       bool store_lanes_p, enum vect_def_type dt,
-                      slp_tree slp_node)
+                      slp_tree slp_node,
+                      stmt_vector_for_cost *prologue_cost_vec,
+                      stmt_vector_for_cost *body_cost_vec)
 {
   int group_size;
-  unsigned int inside_cost = 0, outside_cost = 0;
+  unsigned int inside_cost = 0, prologue_cost = 0;
   struct data_reference *first_dr;
   gimple first_stmt;
 
@@ -771,10 +907,11 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
     return;
 
   if (dt == vect_constant_def || dt == vect_external_def)
-    outside_cost = vect_get_stmt_cost (scalar_to_vec); 
+    prologue_cost += record_stmt_cost (prologue_cost_vec, 1, scalar_to_vec,
+                                      stmt_info, 0, vect_prologue);
 
-  /* Strided access?  */
-  if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
+  /* Grouped access?  */
+  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
     {
       if (slp_node)
         {
@@ -784,12 +921,12 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
       else
         {
           first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
-          group_size = vect_cost_strided_group_size (stmt_info);
+          group_size = vect_cost_group_size (stmt_info);
         }
 
       first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
     }
-  /* Not a strided access.  */
+  /* Not a grouped access.  */
   else
     {
       group_size = 1;
@@ -797,46 +934,48 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
     }
 
   /* We assume that the cost of a single store-lanes instruction is
-     equivalent to the cost of GROUP_SIZE separate stores.  If a strided
+     equivalent to the cost of GROUP_SIZE separate stores.  If a grouped
      access is instead being provided by a permute-and-store operation,
      include the cost of the permutes.  */
   if (!store_lanes_p && group_size > 1)
     {
       /* Uses a high and low interleave operation for each needed permute.  */
-      inside_cost = ncopies * exact_log2(group_size) * group_size
-        * vect_get_stmt_cost (vector_stmt);
+      
+      int nstmts = ncopies * exact_log2 (group_size) * group_size;
+      inside_cost = record_stmt_cost (body_cost_vec, nstmts, vec_perm,
+                                     stmt_info, 0, vect_body);
 
       if (vect_print_dump_info (REPORT_COST))
         fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .",
                  group_size);
-
     }
 
   /* Costs of the stores.  */
-  vect_get_store_cost (first_dr, ncopies, &inside_cost);
+  vect_get_store_cost (first_dr, ncopies, &inside_cost, body_cost_vec);
 
   if (vect_print_dump_info (REPORT_COST))
     fprintf (vect_dump, "vect_model_store_cost: inside_cost = %d, "
-             "outside_cost = %d .", inside_cost, outside_cost);
-
-  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
-  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
-  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
+             "prologue_cost = %d .", inside_cost, prologue_cost);
 }
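
Editorial worked example for the permute cost above (figures chosen for
illustration only): for a grouped store with group_size = 4 and ncopies = 1,
nstmts = 1 * exact_log2 (4) * 4 = 8, so eight vec_perm statements are recorded
in the body cost vector, in addition to the store costs added by
vect_get_store_cost.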
 
 
 /* Calculate cost of DR's memory access.  */
 void
 vect_get_store_cost (struct data_reference *dr, int ncopies,
-                     unsigned int *inside_cost)
+                    unsigned int *inside_cost,
+                    stmt_vector_for_cost *body_cost_vec)
 {
   int alignment_support_scheme = vect_supportable_dr_alignment (dr, false);
+  gimple stmt = DR_STMT (dr);
+  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 
   switch (alignment_support_scheme)
     {
     case dr_aligned:
       {
-        *inside_cost += ncopies * vect_get_stmt_cost (vector_store);
+       *inside_cost += record_stmt_cost (body_cost_vec, ncopies,
+                                         vector_store, stmt_info, 0,
+                                         vect_body);
 
         if (vect_print_dump_info (REPORT_COST))
           fprintf (vect_dump, "vect_model_store_cost: aligned.");
@@ -846,14 +985,10 @@ vect_get_store_cost (struct data_reference *dr, int ncopies,
 
     case dr_unaligned_supported:
       {
-        gimple stmt = DR_STMT (dr);
-        stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
-        tree vectype = STMT_VINFO_VECTYPE (stmt_info);
-
         /* Here, we assign an additional cost for the unaligned store.  */
-        *inside_cost += ncopies
-          * targetm.vectorize.builtin_vectorization_cost (unaligned_store,
-                                 vectype, DR_MISALIGNMENT (dr));
+       *inside_cost += record_stmt_cost (body_cost_vec, ncopies,
+                                         unaligned_store, stmt_info,
+                                         DR_MISALIGNMENT (dr), vect_body);
 
         if (vect_print_dump_info (REPORT_COST))
           fprintf (vect_dump, "vect_model_store_cost: unaligned supported by "
@@ -862,6 +997,16 @@ vect_get_store_cost (struct data_reference *dr, int ncopies,
         break;
       }
 
+    case dr_unaligned_unsupported:
+      {
+        *inside_cost = VECT_MAX_COST;
+
+        if (vect_print_dump_info (REPORT_COST))
+          fprintf (vect_dump, "vect_model_store_cost: unsupported access.");
+
+        break;
+      }
+
     default:
       gcc_unreachable ();
     }
@@ -870,32 +1015,34 @@ vect_get_store_cost (struct data_reference *dr, int ncopies,
 
 /* Function vect_model_load_cost
 
-   Models cost for loads.  In the case of strided accesses, the last access
-   has the overhead of the strided access attributed to it.  Since unaligned
+   Models cost for loads.  In the case of grouped accesses, the last access
+   has the overhead of the grouped access attributed to it.  Since unaligned
    accesses are supported for loads, we also account for the costs of the
    access scheme chosen.  */
 
 void
-vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, bool load_lanes_p,
-                     slp_tree slp_node)
+vect_model_load_cost (stmt_vec_info stmt_info, int ncopies,
+                     bool load_lanes_p, slp_tree slp_node,
+                     stmt_vector_for_cost *prologue_cost_vec,
+                     stmt_vector_for_cost *body_cost_vec)
 {
   int group_size;
   gimple first_stmt;
   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
-  unsigned int inside_cost = 0, outside_cost = 0;
+  unsigned int inside_cost = 0, prologue_cost = 0;
 
   /* The SLP costs were already calculated during SLP tree build.  */
   if (PURE_SLP_STMT (stmt_info))
     return;
 
-  /* Strided accesses?  */
+  /* Grouped accesses?  */
   first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
-  if (STMT_VINFO_STRIDED_ACCESS (stmt_info) && first_stmt && !slp_node)
+  if (STMT_VINFO_GROUPED_ACCESS (stmt_info) && first_stmt && !slp_node)
     {
-      group_size = vect_cost_strided_group_size (stmt_info);
+      group_size = vect_cost_group_size (stmt_info);
       first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
     }
-  /* Not a strided access.  */
+  /* Not a grouped access.  */
   else
     {
       group_size = 1;
@@ -903,14 +1050,15 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, bool load_lanes_p,
     }
 
   /* We assume that the cost of a single load-lanes instruction is
-     equivalent to the cost of GROUP_SIZE separate loads.  If a strided
+     equivalent to the cost of GROUP_SIZE separate loads.  If a grouped
      access is instead being provided by a load-and-permute operation,
      include the cost of the permutes.  */
   if (!load_lanes_p && group_size > 1)
     {
       /* Uses an even and odd extract operations for each needed permute.  */
-      inside_cost = ncopies * exact_log2(group_size) * group_size
-       * vect_get_stmt_cost (vector_stmt);
+      int nstmts = ncopies * exact_log2 (group_size) * group_size;
+      inside_cost += record_stmt_cost (body_cost_vec, nstmts, vec_perm,
+                                      stmt_info, 0, vect_body);
 
       if (vect_print_dump_info (REPORT_COST))
         fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d .",
@@ -918,34 +1066,48 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, bool load_lanes_p,
     }
 
   /* The loads themselves.  */
-  vect_get_load_cost (first_dr, ncopies,
-         ((!STMT_VINFO_STRIDED_ACCESS (stmt_info)) || group_size > 1
-          || slp_node),
-         &inside_cost, &outside_cost);
+  if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
+    {
+      /* N scalar loads plus gathering them into a vector.  */
+      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+      inside_cost += record_stmt_cost (body_cost_vec,
+                                      ncopies * TYPE_VECTOR_SUBPARTS (vectype),
+                                      scalar_load, stmt_info, 0, vect_body);
+      inside_cost += record_stmt_cost (body_cost_vec, ncopies, vec_construct,
+                                      stmt_info, 0, vect_body);
+    }
+  else
+    vect_get_load_cost (first_dr, ncopies,
+                       ((!STMT_VINFO_GROUPED_ACCESS (stmt_info))
+                        || group_size > 1 || slp_node),
+                       &inside_cost, &prologue_cost, 
+                       prologue_cost_vec, body_cost_vec, true);
 
   if (vect_print_dump_info (REPORT_COST))
     fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, "
-             "outside_cost = %d .", inside_cost, outside_cost);
-
-  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
-  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
-  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
+             "prologue_cost = %d .", inside_cost, prologue_cost);
 }
 
 
 /* Calculate cost of DR's memory access.  */
 void
 vect_get_load_cost (struct data_reference *dr, int ncopies,
-                    bool add_realign_cost, unsigned int *inside_cost,
-                    unsigned int *outside_cost)
+                   bool add_realign_cost, unsigned int *inside_cost,
+                   unsigned int *prologue_cost,
+                   stmt_vector_for_cost *prologue_cost_vec,
+                   stmt_vector_for_cost *body_cost_vec,
+                   bool record_prologue_costs)
 {
   int alignment_support_scheme = vect_supportable_dr_alignment (dr, false);
+  gimple stmt = DR_STMT (dr);
+  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 
   switch (alignment_support_scheme)
     {
     case dr_aligned:
       {
-        *inside_cost += ncopies * vect_get_stmt_cost (vector_load); 
+       *inside_cost += record_stmt_cost (body_cost_vec, ncopies, vector_load,
+                                         stmt_info, 0, vect_body);
 
         if (vect_print_dump_info (REPORT_COST))
           fprintf (vect_dump, "vect_model_load_cost: aligned.");
@@ -954,14 +1116,11 @@ vect_get_load_cost (struct data_reference *dr, int ncopies,
       }
     case dr_unaligned_supported:
       {
-        gimple stmt = DR_STMT (dr);
-        stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
-        tree vectype = STMT_VINFO_VECTYPE (stmt_info);
-
         /* Here, we assign an additional cost for the unaligned load.  */
-        *inside_cost += ncopies
-          * targetm.vectorize.builtin_vectorization_cost (unaligned_load,
-                                           vectype, DR_MISALIGNMENT (dr));
+       *inside_cost += record_stmt_cost (body_cost_vec, ncopies,
+                                         unaligned_load, stmt_info,
+                                         DR_MISALIGNMENT (dr), vect_body);
+
         if (vect_print_dump_info (REPORT_COST))
           fprintf (vect_dump, "vect_model_load_cost: unaligned supported by "
                    "hardware.");
@@ -970,14 +1129,20 @@ vect_get_load_cost (struct data_reference *dr, int ncopies,
       }
     case dr_explicit_realign:
       {
-        *inside_cost += ncopies * (2 * vect_get_stmt_cost (vector_load)
-           + vect_get_stmt_cost (vector_stmt));
+       *inside_cost += record_stmt_cost (body_cost_vec, ncopies * 2,
+                                         vector_load, stmt_info, 0, vect_body);
+       *inside_cost += record_stmt_cost (body_cost_vec, ncopies,
+                                         vec_perm, stmt_info, 0, vect_body);
 
         /* FIXME: If the misalignment remains fixed across the iterations of
            the containing loop, the following cost should be added to the
-           outside costs.  */
+           prologue costs.  */
         if (targetm.vectorize.builtin_mask_for_load)
-          *inside_cost += vect_get_stmt_cost (vector_stmt);
+         *inside_cost += record_stmt_cost (body_cost_vec, 1, vector_stmt,
+                                           stmt_info, 0, vect_body);
+
+        if (vect_print_dump_info (REPORT_COST))
+          fprintf (vect_dump, "vect_model_load_cost: explicit realign");
 
         break;
       }
@@ -989,20 +1154,41 @@ vect_get_load_cost (struct data_reference *dr, int ncopies,
 
         /* Unaligned software pipeline has a load of an address, an initial
            load, and possibly a mask operation to "prime" the loop.  However,
-           if this is an access in a group of loads, which provide strided
+           if this is an access in a group of loads, which provide grouped
            access, then the above cost should only be considered for one
            access in the group.  Inside the loop, there is a load op
            and a realignment op.  */
 
-        if (add_realign_cost)
+        if (add_realign_cost && record_prologue_costs)
           {
-            *outside_cost = 2 * vect_get_stmt_cost (vector_stmt);
+           *prologue_cost += record_stmt_cost (prologue_cost_vec, 2,
+                                               vector_stmt, stmt_info,
+                                               0, vect_prologue);
             if (targetm.vectorize.builtin_mask_for_load)
-              *outside_cost += vect_get_stmt_cost (vector_stmt);
+             *prologue_cost += record_stmt_cost (prologue_cost_vec, 1,
+                                                 vector_stmt, stmt_info,
+                                                 0, vect_prologue);
           }
 
-        *inside_cost += ncopies * (vect_get_stmt_cost (vector_load)
-          + vect_get_stmt_cost (vector_stmt));
+       *inside_cost += record_stmt_cost (body_cost_vec, ncopies, vector_load,
+                                         stmt_info, 0, vect_body);
+       *inside_cost += record_stmt_cost (body_cost_vec, ncopies, vec_perm,
+                                         stmt_info, 0, vect_body);
+
+        if (vect_print_dump_info (REPORT_COST))
+          fprintf (vect_dump,
+                  "vect_model_load_cost: explicit realign optimized");
+
+        break;
+      }
+
+    case dr_unaligned_unsupported:
+      {
+        *inside_cost = VECT_MAX_COST;
+
+        if (vect_print_dump_info (REPORT_COST))
+          fprintf (vect_dump, "vect_model_load_cost: unsupported access.");
+
         break;
       }
 
@@ -1011,48 +1197,30 @@ vect_get_load_cost (struct data_reference *dr, int ncopies,
     }
 }
 
+/* Insert the new stmt NEW_STMT at *GSI or at the appropriate place in
+   the loop preheader for the vectorized stmt STMT.  */
 
-/* Function vect_init_vector.
-
-   Insert a new stmt (INIT_STMT) that initializes a new vector variable with
-   the vector elements of VECTOR_VAR.  Place the initialization at BSI if it
-   is not NULL.  Otherwise, place the initialization at the loop preheader.
-   Return the DEF of INIT_STMT.
-   It will be used in the vectorization of STMT.  */
-
-tree
-vect_init_vector (gimple stmt, tree vector_var, tree vector_type,
-                 gimple_stmt_iterator *gsi)
+static void
+vect_init_vector_1 (gimple stmt, gimple new_stmt, gimple_stmt_iterator *gsi)
 {
-  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
-  tree new_var;
-  gimple init_stmt;
-  tree vec_oprnd;
-  edge pe;
-  tree new_temp;
-  basic_block new_bb;
-
-  new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
-  add_referenced_var (new_var);
-  init_stmt = gimple_build_assign  (new_var, vector_var);
-  new_temp = make_ssa_name (new_var, init_stmt);
-  gimple_assign_set_lhs (init_stmt, new_temp);
-
   if (gsi)
-    vect_finish_stmt_generation (stmt, init_stmt, gsi);
+    vect_finish_stmt_generation (stmt, new_stmt, gsi);
   else
     {
+      stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
       loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
 
       if (loop_vinfo)
         {
           struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+         basic_block new_bb;
+         edge pe;
 
           if (nested_in_vect_loop_p (loop, stmt))
             loop = loop->inner;
 
          pe = loop_preheader_edge (loop);
-          new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
+          new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
           gcc_assert (!new_bb);
        }
       else
@@ -1064,16 +1232,60 @@ vect_init_vector (gimple stmt, tree vector_var, tree vector_type,
           gcc_assert (bb_vinfo);
           bb = BB_VINFO_BB (bb_vinfo);
           gsi_bb_start = gsi_after_labels (bb);
-          gsi_insert_before (&gsi_bb_start, init_stmt, GSI_SAME_STMT);
+          gsi_insert_before (&gsi_bb_start, new_stmt, GSI_SAME_STMT);
        }
     }
 
   if (vect_print_dump_info (REPORT_DETAILS))
     {
       fprintf (vect_dump, "created new init_stmt: ");
-      print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
+      print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
+    }
+}
+
+/* Function vect_init_vector.
+
+   Insert a new stmt (INIT_STMT) that initializes a new variable of type
+   TYPE with the value VAL.  If TYPE is a vector type and VAL does not have
+   vector type, a vector with all elements equal to VAL is created first.
+   Place the initialization at BSI if it is not NULL.  Otherwise, place the
+   initialization at the loop preheader.
+   Return the DEF of INIT_STMT.
+   It will be used in the vectorization of STMT.  */
+
+tree
+vect_init_vector (gimple stmt, tree val, tree type, gimple_stmt_iterator *gsi)
+{
+  tree new_var;
+  gimple init_stmt;
+  tree vec_oprnd;
+  tree new_temp;
+
+  if (TREE_CODE (type) == VECTOR_TYPE
+      && TREE_CODE (TREE_TYPE (val)) != VECTOR_TYPE)
+    {
+      if (!types_compatible_p (TREE_TYPE (type), TREE_TYPE (val)))
+       {
+         if (CONSTANT_CLASS_P (val))
+           val = fold_unary (VIEW_CONVERT_EXPR, TREE_TYPE (type), val);
+         else
+           {
+             new_temp = make_ssa_name (TREE_TYPE (type), NULL);
+             init_stmt = gimple_build_assign_with_ops (NOP_EXPR,
+                                                       new_temp, val,
+                                                       NULL_TREE);
+             vect_init_vector_1 (stmt, init_stmt, gsi);
+             val = new_temp;
+           }
+       }
+      val = build_vector_from_val (type, val);
     }
 
+  new_var = vect_get_new_vect_var (type, vect_simple_var, "cst_");
+  init_stmt = gimple_build_assign  (new_var, val);
+  new_temp = make_ssa_name (new_var, init_stmt);
+  gimple_assign_set_lhs (init_stmt, new_temp);
+  vect_init_vector_1 (stmt, init_stmt, gsi);
   vec_oprnd = gimple_assign_lhs (init_stmt);
   return vec_oprnd;
 }
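
Editor's note (hypothetical names, not taken from the patch): with the reworked
interface a caller no longer builds the splatted vector itself; passing the
scalar value is enough.  A minimal sketch for a loop-invariant scalar X:

      tree vectype = get_vectype_for_scalar_type (TREE_TYPE (x));
      /* Converts X to the vector element type if needed, broadcasts it with
         build_vector_from_val, and inserts the init statement at GSI, or on
         the loop-preheader edge when GSI is NULL.  */
      tree vec_def = vect_init_vector (stmt, x, vectype, NULL);
      /* vec_def is the SSA name of roughly:  cst_N = { x, x, x, x };  */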
@@ -1100,11 +1312,7 @@ vect_get_vec_def_for_operand (tree op, gimple stmt, tree *scalar_def)
   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
   unsigned int nunits;
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
-  tree vec_inv;
-  tree vec_cst;
-  tree t = NULL_TREE;
   tree def;
-  int i;
   enum vect_def_type dt;
   bool is_simple_use;
   tree vector_type;
@@ -1115,8 +1323,8 @@ vect_get_vec_def_for_operand (tree op, gimple stmt, tree *scalar_def)
       print_generic_expr (vect_dump, op, TDF_SLIM);
     }
 
-  is_simple_use = vect_is_simple_use (op, loop_vinfo, NULL, &def_stmt, &def,
-                                      &dt);
+  is_simple_use = vect_is_simple_use (op, stmt, loop_vinfo, NULL,
+                                     &def_stmt, &def, &dt);
   gcc_assert (is_simple_use);
   if (vect_print_dump_info (REPORT_DETAILS))
     {
@@ -1148,8 +1356,7 @@ vect_get_vec_def_for_operand (tree op, gimple stmt, tree *scalar_def)
         if (vect_print_dump_info (REPORT_DETAILS))
           fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);
 
-        vec_cst = build_vector_from_val (vector_type, op);
-        return vect_init_vector (stmt, vec_cst, vector_type, NULL);
+        return vect_init_vector (stmt, op, vector_type, NULL);
       }
 
     /* Case 2: operand is defined outside the loop - loop invariant.  */
@@ -1157,7 +1364,6 @@ vect_get_vec_def_for_operand (tree op, gimple stmt, tree *scalar_def)
       {
        vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
        gcc_assert (vector_type);
-       nunits = TYPE_VECTOR_SUBPARTS (vector_type);
 
        if (scalar_def)
          *scalar_def = def;
@@ -1166,14 +1372,7 @@ vect_get_vec_def_for_operand (tree op, gimple stmt, tree *scalar_def)
         if (vect_print_dump_info (REPORT_DETAILS))
           fprintf (vect_dump, "Create vector_inv.");
 
-        for (i = nunits - 1; i >= 0; --i)
-          {
-            t = tree_cons (NULL_TREE, def, t);
-          }
-
-       /* FIXME: use build_constructor directly.  */
-        vec_inv = build_constructor_from_list (vector_type, t);
-        return vect_init_vector (stmt, vec_inv, vector_type, NULL);
+        return vect_init_vector (stmt, def, vector_type, NULL);
       }
 
     /* Case 3: operand is defined inside the loop.  */
@@ -1184,7 +1383,14 @@ vect_get_vec_def_for_operand (tree op, gimple stmt, tree *scalar_def)
 
         /* Get the def from the vectorized stmt.  */
         def_stmt_info = vinfo_for_stmt (def_stmt);
+
         vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
+        /* Get vectorized pattern statement.  */
+        if (!vec_stmt
+            && STMT_VINFO_IN_PATTERN_P (def_stmt_info)
+            && !STMT_VINFO_RELEVANT (def_stmt_info))
+          vec_stmt = STMT_VINFO_VEC_STMT (vinfo_for_stmt (
+                       STMT_VINFO_RELATED_STMT (def_stmt_info)));
         gcc_assert (vec_stmt);
        if (gimple_code (vec_stmt) == GIMPLE_PHI)
          vec_oprnd = PHI_RESULT (vec_stmt);
@@ -1333,16 +1539,35 @@ vect_get_vec_defs_for_stmt_copy (enum vect_def_type *dt,
 }
 
 
-/* Get vectorized definitions for OP0 and OP1, or SLP_NODE if it is not
-   NULL.  */
+/* Get vectorized definitions for OP0 and OP1.
+   REDUC_INDEX is the index of the reduction operand in case of reduction,
+   and -1 otherwise.  */
 
-static void
+void
 vect_get_vec_defs (tree op0, tree op1, gimple stmt,
-                  VEC(tree,heap) **vec_oprnds0, VEC(tree,heap) **vec_oprnds1,
-                  slp_tree slp_node)
+                  VEC (tree, heap) **vec_oprnds0,
+                  VEC (tree, heap) **vec_oprnds1,
+                  slp_tree slp_node, int reduc_index)
 {
   if (slp_node)
-    vect_get_slp_defs (op0, op1, slp_node, vec_oprnds0, vec_oprnds1, -1);
+    {
+      int nops = (op1 == NULL_TREE) ? 1 : 2;
+      VEC (tree, heap) *ops = VEC_alloc (tree, heap, nops);
+      VEC (slp_void_p, heap) *vec_defs = VEC_alloc (slp_void_p, heap, nops);
+
+      VEC_quick_push (tree, ops, op0);
+      if (op1)
+        VEC_quick_push (tree, ops, op1);
+
+      vect_get_slp_defs (ops, slp_node, &vec_defs, reduc_index);
+
+      *vec_oprnds0 = (VEC (tree, heap) *) VEC_index (slp_void_p, vec_defs, 0);
+      if (op1)
+        *vec_oprnds1 = (VEC (tree, heap) *) VEC_index (slp_void_p, vec_defs, 1);
+
+      VEC_free (tree, heap, ops);
+      VEC_free (slp_void_p, heap, vec_defs);
+    }
   else
     {
       tree vec_oprnd;
@@ -1386,7 +1611,7 @@ vect_finish_stmt_generation (gimple stmt, gimple vec_stmt,
       print_gimple_stmt (vect_dump, vec_stmt, 0, TDF_SLIM);
     }
 
-  gimple_set_location (vec_stmt, gimple_location (gsi_stmt (*gsi)));
+  gimple_set_location (vec_stmt, gimple_location (stmt));
 }
 
 /* Checks if CALL can be vectorized in type VECTYPE.  Returns
@@ -1420,7 +1645,8 @@ vectorizable_function (gimple call, tree vectype_out, tree vectype_in)
    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
 
 static bool
-vectorizable_call (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt)
+vectorizable_call (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
+                  slp_tree slp_node)
 {
   tree vec_dest;
   tree scalar_dest;
@@ -1431,6 +1657,7 @@ vectorizable_call (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt)
   int nunits_in;
   int nunits_out;
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
   tree fndecl, new_temp, def, rhs_type;
   gimple def_stmt;
   enum vect_def_type dt[3]
@@ -1442,19 +1669,12 @@ vectorizable_call (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt)
   size_t i, nargs;
   tree lhs;
 
-  /* FORNOW: unsupported in basic block SLP.  */
-  gcc_assert (loop_vinfo);
-
-  if (!STMT_VINFO_RELEVANT_P (stmt_info))
+  if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
     return false;
 
   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
     return false;
 
-  /* FORNOW: SLP not supported.  */
-  if (STMT_SLP_TYPE (stmt_info))
-    return false;
-
   /* Is STMT a vectorizable call?   */
   if (!is_gimple_call (stmt))
     return false;
@@ -1495,7 +1715,7 @@ vectorizable_call (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt)
       if (!rhs_type)
        rhs_type = TREE_TYPE (op);
 
-      if (!vect_is_simple_use_1 (op, loop_vinfo, NULL,
+      if (!vect_is_simple_use_1 (op, stmt, loop_vinfo, bb_vinfo,
                                 &def_stmt, &def, &dt[i], &opvectype))
        {
          if (vect_print_dump_info (REPORT_DETAILS))
@@ -1557,7 +1777,9 @@ vectorizable_call (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt)
 
   gcc_assert (!gimple_vuse (stmt));
 
-  if (modifier == NARROW)
+  if (slp_node || PURE_SLP_STMT (stmt_info))
+    ncopies = 1;
+  else if (modifier == NARROW)
     ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
   else
     ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
@@ -1571,7 +1793,7 @@ vectorizable_call (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt)
       STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
       if (vect_print_dump_info (REPORT_DETAILS))
         fprintf (vect_dump, "=== vectorizable_call ===");
-      vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
+      vect_model_simple_cost (stmt_info, ncopies, dt, NULL, NULL);
       return true;
     }
 
@@ -1596,6 +1818,49 @@ vectorizable_call (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt)
          else
            VEC_truncate (tree, vargs, 0);
 
+         if (slp_node)
+           {
+             VEC (slp_void_p, heap) *vec_defs
+               = VEC_alloc (slp_void_p, heap, nargs);
+             VEC (tree, heap) *vec_oprnds0;
+
+             for (i = 0; i < nargs; i++)
+               VEC_quick_push (tree, vargs, gimple_call_arg (stmt, i));
+             vect_get_slp_defs (vargs, slp_node, &vec_defs, -1);
+             vec_oprnds0
+               = (VEC (tree, heap) *) VEC_index (slp_void_p, vec_defs, 0);
+
+             /* Arguments are ready.  Create the new vector stmt.  */
+             FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, vec_oprnd0)
+               {
+                 size_t k;
+                 for (k = 0; k < nargs; k++)
+                   {
+                     VEC (tree, heap) *vec_oprndsk
+                       = (VEC (tree, heap) *)
+                         VEC_index (slp_void_p, vec_defs, k);
+                     VEC_replace (tree, vargs, k,
+                                  VEC_index (tree, vec_oprndsk, i));
+                   }
+                 new_stmt = gimple_build_call_vec (fndecl, vargs);
+                 new_temp = make_ssa_name (vec_dest, new_stmt);
+                 gimple_call_set_lhs (new_stmt, new_temp);
+                 vect_finish_stmt_generation (stmt, new_stmt, gsi);
+                 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node),
+                                 new_stmt);
+               }
+
+             for (i = 0; i < nargs; i++)
+               {
+                 VEC (tree, heap) *vec_oprndsi
+                   = (VEC (tree, heap) *)
+                     VEC_index (slp_void_p, vec_defs, i);
+                 VEC_free (tree, heap, vec_oprndsi);
+               }
+             VEC_free (slp_void_p, heap, vec_defs);
+             continue;
+           }
+
          for (i = 0; i < nargs; i++)
            {
              op = gimple_call_arg (stmt, i);
@@ -1615,9 +1880,7 @@ vectorizable_call (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt)
          new_stmt = gimple_build_call_vec (fndecl, vargs);
          new_temp = make_ssa_name (vec_dest, new_stmt);
          gimple_call_set_lhs (new_stmt, new_temp);
-
          vect_finish_stmt_generation (stmt, new_stmt, gsi);
-         mark_symbols_for_renaming (new_stmt);
 
          if (j == 0)
            STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
@@ -1638,6 +1901,53 @@ vectorizable_call (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt)
          else
            VEC_truncate (tree, vargs, 0);
 
+         if (slp_node)
+           {
+             VEC (slp_void_p, heap) *vec_defs
+               = VEC_alloc (slp_void_p, heap, nargs);
+             VEC (tree, heap) *vec_oprnds0;
+
+             for (i = 0; i < nargs; i++)
+               VEC_quick_push (tree, vargs, gimple_call_arg (stmt, i));
+             vect_get_slp_defs (vargs, slp_node, &vec_defs, -1);
+             vec_oprnds0
+               = (VEC (tree, heap) *) VEC_index (slp_void_p, vec_defs, 0);
+
+             /* Arguments are ready.  Create the new vector stmt.  */
+             for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vec_oprnd0);
+                  i += 2)
+               {
+                 size_t k;
+                 VEC_truncate (tree, vargs, 0);
+                 for (k = 0; k < nargs; k++)
+                   {
+                     VEC (tree, heap) *vec_oprndsk
+                       = (VEC (tree, heap) *)
+                         VEC_index (slp_void_p, vec_defs, k);
+                     VEC_quick_push (tree, vargs,
+                                     VEC_index (tree, vec_oprndsk, i));
+                     VEC_quick_push (tree, vargs,
+                                     VEC_index (tree, vec_oprndsk, i + 1));
+                   }
+                 new_stmt = gimple_build_call_vec (fndecl, vargs);
+                 new_temp = make_ssa_name (vec_dest, new_stmt);
+                 gimple_call_set_lhs (new_stmt, new_temp);
+                 vect_finish_stmt_generation (stmt, new_stmt, gsi);
+                 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node),
+                                 new_stmt);
+               }
+
+             for (i = 0; i < nargs; i++)
+               {
+                 VEC (tree, heap) *vec_oprndsi
+                   = (VEC (tree, heap) *)
+                     VEC_index (slp_void_p, vec_defs, i);
+                 VEC_free (tree, heap, vec_oprndsi);
+               }
+             VEC_free (slp_void_p, heap, vec_defs);
+             continue;
+           }
+
          for (i = 0; i < nargs; i++)
            {
              op = gimple_call_arg (stmt, i);
@@ -1650,7 +1960,7 @@ vectorizable_call (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt)
                }
              else
                {
-                 vec_oprnd1 = gimple_call_arg (new_stmt, 2*i);
+                 vec_oprnd1 = gimple_call_arg (new_stmt, 2*i + 1);
                  vec_oprnd0
                    = vect_get_vec_def_for_stmt_copy (dt[i], vec_oprnd1);
                  vec_oprnd1
@@ -1664,9 +1974,7 @@ vectorizable_call (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt)
          new_stmt = gimple_build_call_vec (fndecl, vargs);
          new_temp = make_ssa_name (vec_dest, new_stmt);
          gimple_call_set_lhs (new_stmt, new_temp);
-
          vect_finish_stmt_generation (stmt, new_stmt, gsi);
-         mark_symbols_for_renaming (new_stmt);
 
          if (j == 0)
            STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
@@ -1696,6 +2004,9 @@ vectorizable_call (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt)
      it defines is mapped to the new definition.  So just replace
      rhs of the statement with something harmless.  */
 
+  if (slp_node)
+    return true;
+
   type = TREE_TYPE (scalar_dest);
   if (is_pattern_stmt_p (stmt_info))
     lhs = gimple_call_lhs (STMT_VINFO_RELATED_STMT (stmt_info));
@@ -1759,48 +2070,207 @@ vect_gen_widened_results_half (enum tree_code code,
 }
 
 
-/* Check if STMT performs a conversion operation, that can be vectorized.
-   If VEC_STMT is also passed, vectorize the STMT: create a vectorized
-   stmt to replace it, put it in VEC_STMT, and insert it at BSI.
-   Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
+/* Get vectorized definitions for loop-based vectorization.  For the first
+   operand we call vect_get_vec_def_for_operand() (with OPRND containing
+   the scalar operand), and for the rest we get a copy with
+   vect_get_vec_def_for_stmt_copy() using the previous vector definition
+   (stored in OPRND). See vect_get_vec_def_for_stmt_copy() for details.
+   The vectors are collected into VEC_OPRNDS.  */
 
-static bool
-vectorizable_conversion (gimple stmt, gimple_stmt_iterator *gsi,
-                        gimple *vec_stmt, slp_tree slp_node)
+static void
+vect_get_loop_based_defs (tree *oprnd, gimple stmt, enum vect_def_type dt,
+                         VEC (tree, heap) **vec_oprnds, int multi_step_cvt)
 {
-  tree vec_dest;
-  tree scalar_dest;
-  tree op0;
-  tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
-  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
-  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
-  enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
-  tree decl1 = NULL_TREE, decl2 = NULL_TREE;
-  tree new_temp;
-  tree def;
-  gimple def_stmt;
-  enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
-  gimple new_stmt = NULL;
-  stmt_vec_info prev_stmt_info;
-  int nunits_in;
-  int nunits_out;
+  tree vec_oprnd;
+
+  /* Get first vector operand.  */
+  /* All the vector operands except the very first one (that is scalar oprnd)
+     are stmt copies.  */
+  if (TREE_CODE (TREE_TYPE (*oprnd)) != VECTOR_TYPE)
+    vec_oprnd = vect_get_vec_def_for_operand (*oprnd, stmt, NULL);
+  else
+    vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, *oprnd);
+
+  VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
+
+  /* Get second vector operand.  */
+  vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, vec_oprnd);
+  VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
+
+  *oprnd = vec_oprnd;
+
+  /* For conversion in multiple steps, continue to get operands
+     recursively.  */
+  if (multi_step_cvt)
+    vect_get_loop_based_defs (oprnd, stmt, dt, vec_oprnds,  multi_step_cvt - 1);
+}
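
Editorial worked example: each invocation above pushes two vector defs, so with
MULTI_STEP_CVT = 1 the outer call plus its single recursive call collect
2 * (MULTI_STEP_CVT + 1) = 4 defs in VEC_OPRNDS, which is exactly what a
two-step conversion consumes.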
+
+
+/* Create vectorized demotion statements for vector operands from VEC_OPRNDS.
+   For multi-step conversions store the resulting vectors and call the function
+   recursively.  */
+
+static void
+vect_create_vectorized_demotion_stmts (VEC (tree, heap) **vec_oprnds,
+                                      int multi_step_cvt, gimple stmt,
+                                      VEC (tree, heap) *vec_dsts,
+                                      gimple_stmt_iterator *gsi,
+                                      slp_tree slp_node, enum tree_code code,
+                                      stmt_vec_info *prev_stmt_info)
+{
+  unsigned int i;
+  tree vop0, vop1, new_tmp, vec_dest;
+  gimple new_stmt;
+  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+
+  vec_dest = VEC_pop (tree, vec_dsts);
+
+  for (i = 0; i < VEC_length (tree, *vec_oprnds); i += 2)
+    {
+      /* Create demotion operation.  */
+      vop0 = VEC_index (tree, *vec_oprnds, i);
+      vop1 = VEC_index (tree, *vec_oprnds, i + 1);
+      new_stmt = gimple_build_assign_with_ops (code, vec_dest, vop0, vop1);
+      new_tmp = make_ssa_name (vec_dest, new_stmt);
+      gimple_assign_set_lhs (new_stmt, new_tmp);
+      vect_finish_stmt_generation (stmt, new_stmt, gsi);
+
+      if (multi_step_cvt)
+       /* Store the resulting vector for next recursive call.  */
+       VEC_replace (tree, *vec_oprnds, i/2, new_tmp);
+      else
+       {
+         /* This is the last step of the conversion sequence. Store the
+            vectors in SLP_NODE or in vector info of the scalar statement
+            (or in STMT_VINFO_RELATED_STMT chain).  */
+         if (slp_node)
+           VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
+         else
+           {
+             if (!*prev_stmt_info)
+               STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
+             else
+               STMT_VINFO_RELATED_STMT (*prev_stmt_info) = new_stmt;
+
+             *prev_stmt_info = vinfo_for_stmt (new_stmt);
+           }
+       }
+    }
+
+  /* For multi-step demotion operations we first generate demotion operations
+     from the source type to the intermediate types, and then combine the
+     results (stored in VEC_OPRNDS) in a demotion operation to the destination
+     type.  */
+  if (multi_step_cvt)
+    {
+      /* At each level of recursion we have half of the operands we had at the
+        previous level.  */
+      VEC_truncate (tree, *vec_oprnds, (i+1)/2);
+      vect_create_vectorized_demotion_stmts (vec_oprnds, multi_step_cvt - 1,
+                                            stmt, vec_dsts, gsi, slp_node,
+                                            VEC_PACK_TRUNC_EXPR,
+                                            prev_stmt_info);
+    }
+
+  VEC_quick_push (tree, vec_dsts, vec_dest);
+}
+
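For intuition, here is a standalone C sketch (illustrative only, not GCC internals) of the pairwise packing performed by the loop above: operands i and i+1 are combined into one narrower vector at each level, so a two-step demotion turns four int vectors into two short vectors and then into one char vector.

#include <stdio.h>

/* Mimics one VEC_PACK_TRUNC-style step: two 4-element int "vectors" become
   one 8-element short "vector".  */
static void
pack_trunc_int_to_short (const int *a, const int *b, short *out)
{
  int i;
  for (i = 0; i < 4; i++)
    {
      out[i] = (short) a[i];
      out[i + 4] = (short) b[i];
    }
}

/* Second level: two 8-element short "vectors" become 16 chars.  */
static void
pack_trunc_short_to_char (const short *a, const short *b, signed char *out)
{
  int i;
  for (i = 0; i < 8; i++)
    {
      out[i] = (signed char) a[i];
      out[i + 8] = (signed char) b[i];
    }
}

int
main (void)
{
  int v0[4] = { 1, 2, 3, 4 }, v1[4] = { 5, 6, 7, 8 };
  int v2[4] = { 9, 10, 11, 12 }, v3[4] = { 13, 14, 15, 16 };
  short s0[8], s1[8];
  signed char c[16];
  int i;

  /* First recursion level: four int vectors -> two short vectors.  */
  pack_trunc_int_to_short (v0, v1, s0);
  pack_trunc_int_to_short (v2, v3, s1);
  /* Last level: two short vectors -> one char vector.  */
  pack_trunc_short_to_char (s0, s1, c);

  for (i = 0; i < 16; i++)
    printf ("%d ", c[i]);
  printf ("\n");
  return 0;
}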
+
+/* Create vectorized promotion statements for vector operands from VEC_OPRNDS0
+   and VEC_OPRNDS1 (for binary operations).  For multi-step conversions store
+   the resulting vectors and call the function recursively.  */
+
+static void
+vect_create_vectorized_promotion_stmts (VEC (tree, heap) **vec_oprnds0,
+                                       VEC (tree, heap) **vec_oprnds1,
+                                       gimple stmt, tree vec_dest,
+                                       gimple_stmt_iterator *gsi,
+                                       enum tree_code code1,
+                                       enum tree_code code2, tree decl1,
+                                       tree decl2, int op_type)
+{
+  int i;
+  tree vop0, vop1, new_tmp1, new_tmp2;
+  gimple new_stmt1, new_stmt2;
+  VEC (tree, heap) *vec_tmp = NULL;
+
+  vec_tmp = VEC_alloc (tree, heap, VEC_length (tree, *vec_oprnds0) * 2);
+  FOR_EACH_VEC_ELT (tree, *vec_oprnds0, i, vop0)
+    {
+      if (op_type == binary_op)
+       vop1 = VEC_index (tree, *vec_oprnds1, i);
+      else
+       vop1 = NULL_TREE;
+
+      /* Generate the two halves of the promotion operation.  */
+      new_stmt1 = vect_gen_widened_results_half (code1, decl1, vop0, vop1,
+                                                op_type, vec_dest, gsi, stmt);
+      new_stmt2 = vect_gen_widened_results_half (code2, decl2, vop0, vop1,
+                                                op_type, vec_dest, gsi, stmt);
+      if (is_gimple_call (new_stmt1))
+       {
+         new_tmp1 = gimple_call_lhs (new_stmt1);
+         new_tmp2 = gimple_call_lhs (new_stmt2);
+       }
+      else
+       {
+         new_tmp1 = gimple_assign_lhs (new_stmt1);
+         new_tmp2 = gimple_assign_lhs (new_stmt2);
+       }
+
+      /* Store the results for the next step.  */
+      VEC_quick_push (tree, vec_tmp, new_tmp1);
+      VEC_quick_push (tree, vec_tmp, new_tmp2);
+    }
+
+  VEC_free (tree, heap, *vec_oprnds0);
+  *vec_oprnds0 = vec_tmp;
+}
+
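A standalone illustration (plain C, not the GCC API) of why VEC_TMP ends up twice as long as the input list: each widening step emits a "lo" and a "hi" statement, one covering the low half and one covering the high half of the input elements.

#include <stdio.h>

static void
widen_lo (const short *in, int *out)    /* widens elements 0..3 */
{
  int i;
  for (i = 0; i < 4; i++)
    out[i] = in[i];
}

static void
widen_hi (const short *in, int *out)    /* widens elements 4..7 */
{
  int i;
  for (i = 0; i < 4; i++)
    out[i] = in[i + 4];
}

int
main (void)
{
  short v[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  int lo[4], hi[4], i;

  /* One 8-element input vector produces two 4-element result vectors.  */
  widen_lo (v, lo);
  widen_hi (v, hi);
  for (i = 0; i < 4; i++)
    printf ("lo[%d]=%d hi[%d]=%d\n", i, lo[i], i, hi[i]);
  return 0;
}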
+
+/* Check if STMT performs a conversion operation that can be vectorized.
+   If VEC_STMT is also passed, vectorize the STMT: create a vectorized
+   stmt to replace it, put it in VEC_STMT, and insert it at GSI.
+   Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
+
+static bool
+vectorizable_conversion (gimple stmt, gimple_stmt_iterator *gsi,
+                        gimple *vec_stmt, slp_tree slp_node)
+{
+  tree vec_dest;
+  tree scalar_dest;
+  tree op0, op1 = NULL_TREE;
+  tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
+  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+  enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
+  enum tree_code codecvt1 = ERROR_MARK, codecvt2 = ERROR_MARK;
+  tree decl1 = NULL_TREE, decl2 = NULL_TREE;
+  tree new_temp;
+  tree def;
+  gimple def_stmt;
+  enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
+  gimple new_stmt = NULL;
+  stmt_vec_info prev_stmt_info;
+  int nunits_in;
+  int nunits_out;
   tree vectype_out, vectype_in;
-  int ncopies, j;
-  tree rhs_type;
-  tree builtin_decl;
+  int ncopies, i, j;
+  tree lhs_type, rhs_type;
   enum { NARROW, NONE, WIDEN } modifier;
-  int i;
-  VEC(tree,heap) *vec_oprnds0 = NULL;
+  VEC (tree,heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
   tree vop0;
-  VEC(tree,heap) *dummy = NULL;
-  int dummy_int;
+  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
+  int multi_step_cvt = 0;
+  VEC (tree, heap) *vec_dsts = NULL, *interm_types = NULL;
+  tree last_oprnd, intermediate_type, cvt_type = NULL_TREE;
+  int op_type;
+  enum machine_mode rhs_mode;
+  unsigned short fltsz;
 
   /* Is STMT a vectorizable conversion?   */
 
-  /* FORNOW: unsupported in basic block SLP.  */
-  gcc_assert (loop_vinfo);
-
-  if (!STMT_VINFO_RELEVANT_P (stmt_info))
+  if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
     return false;
 
   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
@@ -1813,23 +2283,74 @@ vectorizable_conversion (gimple stmt, gimple_stmt_iterator *gsi,
     return false;
 
   code = gimple_assign_rhs_code (stmt);
-  if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
+  if (!CONVERT_EXPR_CODE_P (code)
+      && code != FIX_TRUNC_EXPR
+      && code != FLOAT_EXPR
+      && code != WIDEN_MULT_EXPR
+      && code != WIDEN_LSHIFT_EXPR)
     return false;
 
+  op_type = TREE_CODE_LENGTH (code);
+
   /* Check types of lhs and rhs.  */
   scalar_dest = gimple_assign_lhs (stmt);
+  lhs_type = TREE_TYPE (scalar_dest);
   vectype_out = STMT_VINFO_VECTYPE (stmt_info);
 
   op0 = gimple_assign_rhs1 (stmt);
   rhs_type = TREE_TYPE (op0);
+
+  if ((code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
+      && !((INTEGRAL_TYPE_P (lhs_type)
+           && INTEGRAL_TYPE_P (rhs_type))
+          || (SCALAR_FLOAT_TYPE_P (lhs_type)
+              && SCALAR_FLOAT_TYPE_P (rhs_type))))
+    return false;
+
+  if ((INTEGRAL_TYPE_P (lhs_type)
+       && (TYPE_PRECISION (lhs_type)
+          != GET_MODE_PRECISION (TYPE_MODE (lhs_type))))
+      || (INTEGRAL_TYPE_P (rhs_type)
+         && (TYPE_PRECISION (rhs_type)
+             != GET_MODE_PRECISION (TYPE_MODE (rhs_type)))))
+    {
+      if (vect_print_dump_info (REPORT_DETAILS))
+       fprintf (vect_dump,
+                "type conversion to/from bit-precision unsupported.");
+      return false;
+    }
+
   /* Check the operands of the operation.  */
-  if (!vect_is_simple_use_1 (op0, loop_vinfo, NULL,
+  if (!vect_is_simple_use_1 (op0, stmt, loop_vinfo, bb_vinfo,
                             &def_stmt, &def, &dt[0], &vectype_in))
     {
       if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "use not simple.");
       return false;
     }
+  if (op_type == binary_op)
+    {
+      bool ok;
+
+      op1 = gimple_assign_rhs2 (stmt);
+      gcc_assert (code == WIDEN_MULT_EXPR || code == WIDEN_LSHIFT_EXPR);
+      /* For WIDEN_MULT_EXPR, if OP0 is a constant, use the type of
+        OP1.  */
+      if (CONSTANT_CLASS_P (op0))
+       ok = vect_is_simple_use_1 (op1, stmt, loop_vinfo, bb_vinfo,
+                                  &def_stmt, &def, &dt[1], &vectype_in);
+      else
+       ok = vect_is_simple_use (op1, stmt, loop_vinfo, bb_vinfo, &def_stmt,
+                                &def, &dt[1]);
+
+      if (!ok)
+       {
+         if (vect_print_dump_info (REPORT_DETAILS))
+           fprintf (vect_dump, "use not simple.");
+         return false;
+       }
+    }
+
   /* If op0 is an external or constant defs use a vector type of
      the same size as the output vector type.  */
   if (!vectype_in)
@@ -1839,82 +2360,227 @@ vectorizable_conversion (gimple stmt, gimple_stmt_iterator *gsi,
   if (!vectype_in)
     {
       if (vect_print_dump_info (REPORT_DETAILS))
-        {
-          fprintf (vect_dump, "no vectype for scalar type ");
-          print_generic_expr (vect_dump, rhs_type, TDF_SLIM);
-        }
+       {
+         fprintf (vect_dump, "no vectype for scalar type ");
+         print_generic_expr (vect_dump, rhs_type, TDF_SLIM);
+       }
 
       return false;
     }
 
-  /* FORNOW */
   nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
   nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
-  if (nunits_in == nunits_out / 2)
+  if (nunits_in < nunits_out)
     modifier = NARROW;
   else if (nunits_out == nunits_in)
     modifier = NONE;
-  else if (nunits_out == nunits_in / 2)
-    modifier = WIDEN;
-  else
-    return false;
-
-  if (modifier == NARROW)
-    ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
   else
-    ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
+    modifier = WIDEN;
 
   /* Multiple types in SLP are handled by creating the appropriate number of
      vectorized stmts for each SLP node.  Hence, NCOPIES is always 1 in
      case of SLP.  */
   if (slp_node || PURE_SLP_STMT (stmt_info))
     ncopies = 1;
+  else if (modifier == NARROW)
+    ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
+  else
+    ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
 
   /* Sanity check: make sure that at least one copy of the vectorized stmt
      needs to be generated.  */
   gcc_assert (ncopies >= 1);
 
   /* Supportable by target?  */
-  if ((modifier == NONE
-       && !targetm.vectorize.builtin_conversion (code, vectype_out, vectype_in))
-      || (modifier == WIDEN
-         && !supportable_widening_operation (code, stmt,
-                                             vectype_out, vectype_in,
-                                             &decl1, &decl2,
-                                             &code1, &code2,
-                                              &dummy_int, &dummy))
-      || (modifier == NARROW
-         && !supportable_narrowing_operation (code, vectype_out, vectype_in,
-                                              &code1, &dummy_int, &dummy)))
+  switch (modifier)
     {
+    case NONE:
+      if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
+       return false;
+      if (supportable_convert_operation (code, vectype_out, vectype_in,
+                                        &decl1, &code1))
+       break;
+      /* FALLTHRU */
+    unsupported:
       if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "conversion not supported by target.");
+       fprintf (vect_dump, "conversion not supported by target.");
       return false;
-    }
 
-  if (modifier != NONE)
-    {
-      /* FORNOW: SLP not supported.  */
-      if (STMT_SLP_TYPE (stmt_info))
-       return false;
+    case WIDEN:
+      if (supportable_widening_operation (code, stmt, vectype_out, vectype_in,
+                                         &code1, &code2, &multi_step_cvt,
+                                         &interm_types))
+       {
+         /* A binary widening operation can only be supported directly by the
+            architecture.  */
+         gcc_assert (!(multi_step_cvt && op_type == binary_op));
+         break;
+       }
+
+      if (code != FLOAT_EXPR
+         || (GET_MODE_SIZE (TYPE_MODE (lhs_type))
+             <= GET_MODE_SIZE (TYPE_MODE (rhs_type))))
+       goto unsupported;
+
+      rhs_mode = TYPE_MODE (rhs_type);
+      fltsz = GET_MODE_SIZE (TYPE_MODE (lhs_type));
+      for (rhs_mode = GET_MODE_2XWIDER_MODE (TYPE_MODE (rhs_type));
+          rhs_mode != VOIDmode && GET_MODE_SIZE (rhs_mode) <= fltsz;
+          rhs_mode = GET_MODE_2XWIDER_MODE (rhs_mode))
+       {
+         cvt_type
+           = build_nonstandard_integer_type (GET_MODE_BITSIZE (rhs_mode), 0);
+         cvt_type = get_same_sized_vectype (cvt_type, vectype_in);
+         if (cvt_type == NULL_TREE)
+           goto unsupported;
+
+         if (GET_MODE_SIZE (rhs_mode) == fltsz)
+           {
+             if (!supportable_convert_operation (code, vectype_out,
+                                                 cvt_type, &decl1, &codecvt1))
+               goto unsupported;
+           }
+         else if (!supportable_widening_operation (code, stmt, vectype_out,
+                                                   cvt_type, &codecvt1,
+                                                   &codecvt2, &multi_step_cvt,
+                                                   &interm_types))
+           continue;
+         else
+           gcc_assert (multi_step_cvt == 0);
+
+         if (supportable_widening_operation (NOP_EXPR, stmt, cvt_type,
+                                             vectype_in, &code1, &code2,
+                                             &multi_step_cvt, &interm_types))
+           break;
+       }
+
+      if (rhs_mode == VOIDmode || GET_MODE_SIZE (rhs_mode) > fltsz)
+       goto unsupported;
+
+      if (GET_MODE_SIZE (rhs_mode) == fltsz)
+       codecvt2 = ERROR_MARK;
+      else
+       {
+         multi_step_cvt++;
+         VEC_safe_push (tree, heap, interm_types, cvt_type);
+         cvt_type = NULL_TREE;
+       }
+      break;
+
+    case NARROW:
+      gcc_assert (op_type == unary_op);
+      if (supportable_narrowing_operation (code, vectype_out, vectype_in,
+                                          &code1, &multi_step_cvt,
+                                          &interm_types))
+       break;
+
+      if (code != FIX_TRUNC_EXPR
+         || (GET_MODE_SIZE (TYPE_MODE (lhs_type))
+             >= GET_MODE_SIZE (TYPE_MODE (rhs_type))))
+       goto unsupported;
+
+      rhs_mode = TYPE_MODE (rhs_type);
+      cvt_type
+       = build_nonstandard_integer_type (GET_MODE_BITSIZE (rhs_mode), 0);
+      cvt_type = get_same_sized_vectype (cvt_type, vectype_in);
+      if (cvt_type == NULL_TREE)
+       goto unsupported;
+      if (!supportable_convert_operation (code, cvt_type, vectype_in,
+                                         &decl1, &codecvt1))
+       goto unsupported;
+      if (supportable_narrowing_operation (NOP_EXPR, vectype_out, cvt_type,
+                                          &code1, &multi_step_cvt,
+                                          &interm_types))
+       break;
+      goto unsupported;
+
+    default:
+      gcc_unreachable ();
     }
 
   if (!vec_stmt)               /* transformation not required.  */
     {
-      STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
+      if (vect_print_dump_info (REPORT_DETAILS))
+       fprintf (vect_dump, "=== vectorizable_conversion ===");
+      if (code == FIX_TRUNC_EXPR || code == FLOAT_EXPR)
+        {
+         STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
+         vect_model_simple_cost (stmt_info, ncopies, dt, NULL, NULL);
+       }
+      else if (modifier == NARROW)
+       {
+         STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
+         vect_model_promotion_demotion_cost (stmt_info, dt, multi_step_cvt);
+       }
+      else
+       {
+         STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
+         vect_model_promotion_demotion_cost (stmt_info, dt, multi_step_cvt);
+       }
+      VEC_free (tree, heap, interm_types);
       return true;
     }
 
   /** Transform.  **/
   if (vect_print_dump_info (REPORT_DETAILS))
-    fprintf (vect_dump, "transform conversion.");
+    fprintf (vect_dump, "transform conversion. ncopies = %d.", ncopies);
 
-  /* Handle def.  */
-  vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
+  if (op_type == binary_op)
+    {
+      if (CONSTANT_CLASS_P (op0))
+       op0 = fold_convert (TREE_TYPE (op1), op0);
+      else if (CONSTANT_CLASS_P (op1))
+       op1 = fold_convert (TREE_TYPE (op0), op1);
+    }
+
+  /* In case of multi-step conversion, we first generate conversion operations
+     to the intermediate types, and then from those types to the final one.
+     We create vector destinations for the intermediate types (TYPES) received
+     from supportable_*_operation, and store them in the correct order
+     for future use in vect_create_vectorized_*_stmts ().  */
+  vec_dsts = VEC_alloc (tree, heap, multi_step_cvt + 1);
+  vec_dest = vect_create_destination_var (scalar_dest,
+                                         (cvt_type && modifier == WIDEN)
+                                         ? cvt_type : vectype_out);
+  VEC_quick_push (tree, vec_dsts, vec_dest);
+
+  if (multi_step_cvt)
+    {
+      for (i = VEC_length (tree, interm_types) - 1;
+          VEC_iterate (tree, interm_types, i, intermediate_type); i--)
+       {
+         vec_dest = vect_create_destination_var (scalar_dest,
+                                                 intermediate_type);
+         VEC_quick_push (tree, vec_dsts, vec_dest);
+       }
+    }
+
+  if (cvt_type)
+    vec_dest = vect_create_destination_var (scalar_dest,
+                                           modifier == WIDEN
+                                           ? vectype_out : cvt_type);
 
-  if (modifier == NONE && !slp_node)
-    vec_oprnds0 = VEC_alloc (tree, heap, 1);
+  if (!slp_node)
+    {
+      if (modifier == NONE)
+       vec_oprnds0 = VEC_alloc (tree, heap, 1);
+      else if (modifier == WIDEN)
+       {
+         vec_oprnds0 = VEC_alloc (tree, heap,
+                                  (multi_step_cvt
+                                   ? vect_pow2 (multi_step_cvt) : 1));
+         if (op_type == binary_op)
+           vec_oprnds1 = VEC_alloc (tree, heap, 1);
+       }
+      else
+       vec_oprnds0 = VEC_alloc (tree, heap,
+                                2 * (multi_step_cvt
+                                     ? vect_pow2 (multi_step_cvt) : 1));
+    }
+  else if (code == WIDEN_LSHIFT_EXPR)
+    vec_oprnds1 = VEC_alloc (tree, heap, slp_node->vec_stmts_size);
 
+  last_oprnd = op0;
   prev_stmt_info = NULL;
   switch (modifier)
     {
@@ -1922,22 +2588,33 @@ vectorizable_conversion (gimple stmt, gimple_stmt_iterator *gsi,
       for (j = 0; j < ncopies; j++)
        {
          if (j == 0)
-           vect_get_vec_defs (op0, NULL, stmt, &vec_oprnds0, NULL, slp_node);
+           vect_get_vec_defs (op0, NULL, stmt, &vec_oprnds0, NULL, slp_node,
+                              -1);
          else
            vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, NULL);
 
-         builtin_decl =
-           targetm.vectorize.builtin_conversion (code,
-                                                 vectype_out, vectype_in);
          FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, vop0)
            {
-             /* Arguments are ready. create the new vector stmt.  */
-             new_stmt = gimple_build_call (builtin_decl, 1, vop0);
-             new_temp = make_ssa_name (vec_dest, new_stmt);
-             gimple_call_set_lhs (new_stmt, new_temp);
-             vect_finish_stmt_generation (stmt, new_stmt, gsi);
+             /* Arguments are ready, create the new vector stmt.  */
+             if (code1 == CALL_EXPR)
+               {
+                 new_stmt = gimple_build_call (decl1, 1, vop0);
+                 new_temp = make_ssa_name (vec_dest, new_stmt);
+                 gimple_call_set_lhs (new_stmt, new_temp);
+               }
+             else
+               {
+                 gcc_assert (TREE_CODE_LENGTH (code1) == unary_op);
+                 new_stmt = gimple_build_assign_with_ops (code1, vec_dest,
+                                                          vop0, NULL);
+                 new_temp = make_ssa_name (vec_dest, new_stmt);
+                 gimple_assign_set_lhs (new_stmt, new_temp);
+               }
+
+             vect_finish_stmt_generation (stmt, new_stmt, gsi);
              if (slp_node)
-               VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
+               VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node),
+                               new_stmt);
            }
 
          if (j == 0)
@@ -1955,30 +2632,117 @@ vectorizable_conversion (gimple stmt, gimple_stmt_iterator *gsi,
         the vector stmt by a factor VF/nunits.  */
       for (j = 0; j < ncopies; j++)
        {
+         /* Handle uses.  */
          if (j == 0)
-           vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
-         else
-           vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
+           {
+             if (slp_node)
+               {
+                 if (code == WIDEN_LSHIFT_EXPR)
+                   {
+                     unsigned int k;
 
-         /* Generate first half of the widened result:  */
-         new_stmt
-           = vect_gen_widened_results_half (code1, decl1,
-                                            vec_oprnd0, vec_oprnd1,
-                                            unary_op, vec_dest, gsi, stmt);
-         if (j == 0)
-           STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
+                     vec_oprnd1 = op1;
+                     /* Store vec_oprnd1 for every vector stmt to be created
+                        for SLP_NODE.  We check during the analysis that all
+                        the shift arguments are the same.  */
+                     for (k = 0; k < slp_node->vec_stmts_size - 1; k++)
+                       VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
+
+                     vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL,
+                                        slp_node, -1);
+                   }
+                 else
+                   vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0,
+                                      &vec_oprnds1, slp_node, -1);
+               }
+             else
+               {
+                 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
+                 VEC_quick_push (tree, vec_oprnds0, vec_oprnd0);
+                 if (op_type == binary_op)
+                   {
+                     if (code == WIDEN_LSHIFT_EXPR)
+                       vec_oprnd1 = op1;
+                     else
+                       vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt,
+                                                                  NULL);
+                     VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
+                   }
+               }
+           }
          else
-           STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
-         prev_stmt_info = vinfo_for_stmt (new_stmt);
+           {
+             vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
+             VEC_truncate (tree, vec_oprnds0, 0);
+             VEC_quick_push (tree, vec_oprnds0, vec_oprnd0);
+             if (op_type == binary_op)
+               {
+                 if (code == WIDEN_LSHIFT_EXPR)
+                   vec_oprnd1 = op1;
+                 else
+                   vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[1],
+                                                                vec_oprnd1);
+                 VEC_truncate (tree, vec_oprnds1, 0);
+                 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
+               }
+           }
 
-         /* Generate second half of the widened result:  */
-         new_stmt
-           = vect_gen_widened_results_half (code2, decl2,
-                                            vec_oprnd0, vec_oprnd1,
-                                            unary_op, vec_dest, gsi, stmt);
-         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
-         prev_stmt_info = vinfo_for_stmt (new_stmt);
+         /* Arguments are ready.  Create the new vector stmts.  */
+         for (i = multi_step_cvt; i >= 0; i--)
+           {
+             tree this_dest = VEC_index (tree, vec_dsts, i);
+             enum tree_code c1 = code1, c2 = code2;
+             if (i == 0 && codecvt2 != ERROR_MARK)
+               {
+                 c1 = codecvt1;
+                 c2 = codecvt2;
+               }
+             vect_create_vectorized_promotion_stmts (&vec_oprnds0,
+                                                     &vec_oprnds1,
+                                                     stmt, this_dest, gsi,
+                                                     c1, c2, decl1, decl2,
+                                                     op_type);
+           }
+
+         FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, vop0)
+           {
+             if (cvt_type)
+               {
+                 if (codecvt1 == CALL_EXPR)
+                   {
+                     new_stmt = gimple_build_call (decl1, 1, vop0);
+                     new_temp = make_ssa_name (vec_dest, new_stmt);
+                     gimple_call_set_lhs (new_stmt, new_temp);
+                   }
+                 else
+                   {
+                     gcc_assert (TREE_CODE_LENGTH (codecvt1) == unary_op);
+                     new_temp = make_ssa_name (vec_dest, NULL);
+                     new_stmt = gimple_build_assign_with_ops (codecvt1,
+                                                              new_temp,
+                                                              vop0, NULL);
+                   }
+
+                 vect_finish_stmt_generation (stmt, new_stmt, gsi);
+               }
+             else
+               new_stmt = SSA_NAME_DEF_STMT (vop0);
+
+             if (slp_node)
+               VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node),
+                               new_stmt);
+             else
+               {
+                 if (!prev_stmt_info)
+                   STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
+                 else
+                   STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
+                 prev_stmt_info = vinfo_for_stmt (new_stmt);
+               }
+           }
        }
+
+      *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
       break;
 
     case NARROW:
@@ -1989,37 +2753,52 @@ vectorizable_conversion (gimple stmt, gimple_stmt_iterator *gsi,
       for (j = 0; j < ncopies; j++)
        {
          /* Handle uses.  */
-         if (j == 0)
-           {
-             vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
-             vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
-           }
+         if (slp_node)
+           vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL,
+                              slp_node, -1);
          else
            {
-             vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
-             vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
+             VEC_truncate (tree, vec_oprnds0, 0);
+             vect_get_loop_based_defs (&last_oprnd, stmt, dt[0], &vec_oprnds0,
+                                       vect_pow2 (multi_step_cvt) - 1);
            }
 
-         /* Arguments are ready.  Create the new vector stmt.  */
-         new_stmt = gimple_build_assign_with_ops (code1, vec_dest, vec_oprnd0,
-                                                  vec_oprnd1);
-         new_temp = make_ssa_name (vec_dest, new_stmt);
-         gimple_assign_set_lhs (new_stmt, new_temp);
-         vect_finish_stmt_generation (stmt, new_stmt, gsi);
+         /* Arguments are ready.  Create the new vector stmts.  */
+         if (cvt_type)
+           FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, vop0)
+             {
+               if (codecvt1 == CALL_EXPR)
+                 {
+                   new_stmt = gimple_build_call (decl1, 1, vop0);
+                   new_temp = make_ssa_name (vec_dest, new_stmt);
+                   gimple_call_set_lhs (new_stmt, new_temp);
+                 }
+               else
+                 {
+                   gcc_assert (TREE_CODE_LENGTH (codecvt1) == unary_op);
+                   new_temp = make_ssa_name (vec_dest, NULL);
+                   new_stmt = gimple_build_assign_with_ops (codecvt1, new_temp,
+                                                            vop0, NULL);
+                 }
 
-         if (j == 0)
-           STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
-         else
-           STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
+               vect_finish_stmt_generation (stmt, new_stmt, gsi);
+               VEC_replace (tree, vec_oprnds0, i, new_temp);
+             }
 
-         prev_stmt_info = vinfo_for_stmt (new_stmt);
+         vect_create_vectorized_demotion_stmts (&vec_oprnds0, multi_step_cvt,
+                                                stmt, vec_dsts, gsi,
+                                                slp_node, code1,
+                                                &prev_stmt_info);
        }
 
       *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
+      break;
     }
 
-  if (vec_oprnds0)
-    VEC_free (tree, heap, vec_oprnds0);
+  VEC_free (tree, heap, vec_oprnds0);
+  VEC_free (tree, heap, vec_oprnds1);
+  VEC_free (tree, heap, vec_dsts);
+  VEC_free (tree, heap, interm_types);
 
   return true;
 }
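A source-level example (illustrative only) of the kind of loop the WIDEN/FLOAT_EXPR handling above targets: converting a narrow integer to a wider floating type, which on some targets needs an intermediate integer widening step, the case the GET_MODE_2XWIDER_MODE search for cvt_type probes for.

/* Plain C; whether an intermediate short -> int step is used depends on the
   widening operations the target supports.  */
void
short_to_double (const short *restrict in, double *restrict out, int n)
{
  int i;
  for (i = 0; i < n; i++)
    out[i] = (double) in[i];    /* FLOAT_EXPR whose lhs is wider than its rhs */
}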
@@ -2092,7 +2871,7 @@ vectorizable_assignment (gimple stmt, gimple_stmt_iterator *gsi,
   if (code == VIEW_CONVERT_EXPR)
     op = TREE_OPERAND (op, 0);
 
-  if (!vect_is_simple_use_1 (op, loop_vinfo, bb_vinfo,
+  if (!vect_is_simple_use_1 (op, stmt, loop_vinfo, bb_vinfo,
                             &def_stmt, &def, &dt[0], &vectype_in))
     {
       if (vect_print_dump_info (REPORT_DETAILS))
@@ -2110,12 +2889,31 @@ vectorizable_assignment (gimple stmt, gimple_stmt_iterator *gsi,
              != GET_MODE_SIZE (TYPE_MODE (vectype_in)))))
     return false;
 
+  /* We do not handle bit-precision changes.  */
+  if ((CONVERT_EXPR_CODE_P (code)
+       || code == VIEW_CONVERT_EXPR)
+      && INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
+      && ((TYPE_PRECISION (TREE_TYPE (scalar_dest))
+          != GET_MODE_PRECISION (TYPE_MODE (TREE_TYPE (scalar_dest))))
+         || ((TYPE_PRECISION (TREE_TYPE (op))
+              != GET_MODE_PRECISION (TYPE_MODE (TREE_TYPE (op))))))
+      /* But a conversion that does not change the bit-pattern is ok.  */
+      && !((TYPE_PRECISION (TREE_TYPE (scalar_dest))
+           > TYPE_PRECISION (TREE_TYPE (op)))
+          && TYPE_UNSIGNED (TREE_TYPE (op))))
+    {
+      if (vect_print_dump_info (REPORT_DETAILS))
+        fprintf (vect_dump, "type conversion to/from bit-precision "
+                "unsupported.");
+      return false;
+    }
+
   if (!vec_stmt) /* transformation not required.  */
     {
       STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
       if (vect_print_dump_info (REPORT_DETAILS))
         fprintf (vect_dump, "=== vectorizable_assignment ===");
-      vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
+      vect_model_simple_cost (stmt_info, ncopies, dt, NULL, NULL);
       return true;
     }
 
@@ -2131,7 +2929,7 @@ vectorizable_assignment (gimple stmt, gimple_stmt_iterator *gsi,
     {
       /* Handle uses.  */
       if (j == 0)
-        vect_get_vec_defs (op, NULL, stmt, &vec_oprnds, NULL, slp_node);
+        vect_get_vec_defs (op, NULL, stmt, &vec_oprnds, NULL, slp_node, -1);
       else
         vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds, NULL);
 
@@ -2165,6 +2963,42 @@ vectorizable_assignment (gimple stmt, gimple_stmt_iterator *gsi,
 }
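As a hedged source-level sketch of what the bit-precision guard above protects against (illustrative; whether a given statement reaches this exact check depends on earlier analysis): a signed bit-field has a TYPE_PRECISION smaller than the precision of its storage mode, and sign-extending from it is not a pure bit-pattern copy, so such a conversion is refused.

/* Illustrative only: F has TYPE_PRECISION 3, smaller than its mode
   precision, and the conversion below sign-extends, so it does not qualify
   for the "does not change the bit-pattern" exception.  */
struct s { int f : 3; };

void
copy_field (const struct s *restrict p, signed char *restrict out, int n)
{
  int i;
  for (i = 0; i < n; i++)
    out[i] = p[i].f;            /* conversion from the 3-bit field type */
}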
 
 
+/* Return TRUE if CODE (a shift operation) is supported for SCALAR_TYPE
+   either as a shift by a scalar or as a shift by a vector.  */
+
+bool
+vect_supportable_shift (enum tree_code code, tree scalar_type)
+{
+  enum machine_mode vec_mode;
+  optab optab;
+  int icode;
+  tree vectype;
+
+  vectype = get_vectype_for_scalar_type (scalar_type);
+  if (!vectype)
+    return false;
+
+  optab = optab_for_tree_code (code, vectype, optab_scalar);
+  if (!optab
+      || optab_handler (optab, TYPE_MODE (vectype)) == CODE_FOR_nothing)
+    {
+      optab = optab_for_tree_code (code, vectype, optab_vector);
+      if (!optab
+          || (optab_handler (optab, TYPE_MODE (vectype))
+                      == CODE_FOR_nothing))
+        return false;
+    }
+
+  vec_mode = TYPE_MODE (vectype);
+  icode = (int) optab_handler (optab, vec_mode);
+  if (icode == CODE_FOR_nothing)
+    return false;
+
+  return true;
+}
+
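A source-level illustration (not part of the patch) of the two shift forms this helper checks for: a count that is the same for every element can use the vector-shift-by-scalar optab (optab_scalar), while a per-element count needs the vector-shift-by-vector optab (optab_vector).

/* Plain C examples of the two shapes; which one vectorizes depends on the
   optabs the target provides.  */
void
shift_by_scalar (int *restrict a, int n, int s)
{
  int i;
  for (i = 0; i < n; i++)
    a[i] >>= s;                 /* one shift count for all elements */
}

void
shift_by_vector (int *restrict a, const int *restrict s, int n)
{
  int i;
  for (i = 0; i < n; i++)
    a[i] >>= s[i];              /* per-element shift count */
}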
+
 /* Function vectorizable_shift.
 
    Check if STMT performs a shift operation that can be vectorized.
@@ -2197,6 +3031,7 @@ vectorizable_shift (gimple stmt, gimple_stmt_iterator *gsi,
   int nunits_in;
   int nunits_out;
   tree vectype_out;
+  tree op1_vectype;
   int ncopies;
   int j, i;
   VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
@@ -2227,9 +3062,16 @@ vectorizable_shift (gimple stmt, gimple_stmt_iterator *gsi,
 
   scalar_dest = gimple_assign_lhs (stmt);
   vectype_out = STMT_VINFO_VECTYPE (stmt_info);
+  if (TYPE_PRECISION (TREE_TYPE (scalar_dest))
+      != GET_MODE_PRECISION (TYPE_MODE (TREE_TYPE (scalar_dest))))
+    {
+      if (vect_print_dump_info (REPORT_DETAILS))
+        fprintf (vect_dump, "bit-precision shifts not supported.");
+      return false;
+    }
 
   op0 = gimple_assign_rhs1 (stmt);
-  if (!vect_is_simple_use_1 (op0, loop_vinfo, bb_vinfo,
+  if (!vect_is_simple_use_1 (op0, stmt, loop_vinfo, bb_vinfo,
                              &def_stmt, &def, &dt[0], &vectype))
     {
       if (vect_print_dump_info (REPORT_DETAILS))
@@ -2259,7 +3101,8 @@ vectorizable_shift (gimple stmt, gimple_stmt_iterator *gsi,
     return false;
 
   op1 = gimple_assign_rhs2 (stmt);
-  if (!vect_is_simple_use (op1, loop_vinfo, bb_vinfo, &def_stmt, &def, &dt[1]))
+  if (!vect_is_simple_use_1 (op1, stmt, loop_vinfo, bb_vinfo, &def_stmt,
+                            &def, &dt[1], &op1_vectype))
     {
       if (vect_print_dump_info (REPORT_DETAILS))
         fprintf (vect_dump, "use not simple.");
@@ -2316,6 +3159,16 @@ vectorizable_shift (gimple stmt, gimple_stmt_iterator *gsi,
       optab = optab_for_tree_code (code, vectype, optab_vector);
       if (vect_print_dump_info (REPORT_DETAILS))
         fprintf (vect_dump, "vector/vector shift/rotate found.");
+      if (!op1_vectype)
+       op1_vectype = get_same_sized_vectype (TREE_TYPE (op1), vectype_out);
+      if (op1_vectype == NULL_TREE
+         || TYPE_MODE (op1_vectype) != TYPE_MODE (vectype))
+       {
+         if (vect_print_dump_info (REPORT_DETAILS))
+           fprintf (vect_dump, "unusable type for last operand in"
+                               " vector/vector shift/rotate.");
+         return false;
+       }
     }
   /* See if the machine has a vector shifted by scalar insn and if not
      then see if it has a vector shifted by vector insn.  */
@@ -2343,9 +3196,28 @@ vectorizable_shift (gimple stmt, gimple_stmt_iterator *gsi,
               /* Unlike the other binary operators, shifts/rotates have
                  the rhs being int, instead of the same type as the lhs,
                  so make sure the scalar is the right type if we are
-                 dealing with vectors of short/char.  */
+                dealing with vectors of long long/long/short/char.  */
               if (dt[1] == vect_constant_def)
                 op1 = fold_convert (TREE_TYPE (vectype), op1);
+             else if (!useless_type_conversion_p (TREE_TYPE (vectype),
+                                                  TREE_TYPE (op1)))
+               {
+                 if (slp_node
+                     && TYPE_MODE (TREE_TYPE (vectype))
+                        != TYPE_MODE (TREE_TYPE (op1)))
+                   {
+                     if (vect_print_dump_info (REPORT_DETAILS))
+                     if (vect_print_dump_info (REPORT_DETAILS))
+                       fprintf (vect_dump, "unusable type for last operand in"
+                                           " vector/vector shift/rotate.");
+                     return false;
+                 if (vec_stmt && !slp_node)
+                   {
+                     op1 = fold_convert (TREE_TYPE (vectype), op1);
+                     op1 = vect_init_vector (stmt, op1,
+                                             TREE_TYPE (vectype), NULL);
+                   }
+               }
             }
         }
     }
@@ -2387,7 +3259,7 @@ vectorizable_shift (gimple stmt, gimple_stmt_iterator *gsi,
       STMT_VINFO_TYPE (stmt_info) = shift_vec_info_type;
       if (vect_print_dump_info (REPORT_DETAILS))
         fprintf (vect_dump, "=== vectorizable_shift ===");
-      vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
+      vect_model_simple_cost (stmt_info, ncopies, dt, NULL, NULL);
       return true;
     }
 
@@ -2446,806 +3318,99 @@ vectorizable_shift (gimple stmt, gimple_stmt_iterator *gsi,
                 }
             }
 
-          /* vec_oprnd1 is available if operand 1 should be of a scalar-type
-             (a special case for certain kind of vector shifts); otherwise,
-             operand 1 should be of a vector type (the usual case).  */
-          if (vec_oprnd1)
-            vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL,
-                               slp_node);
-          else
-            vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
-                               slp_node);
-        }
-      else
-        vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, &vec_oprnds1);
-
-      /* Arguments are ready.  Create the new vector stmt.  */
-      FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, vop0)
-        {
-          vop1 = VEC_index (tree, vec_oprnds1, i);
-          new_stmt = gimple_build_assign_with_ops (code, vec_dest, vop0, vop1);
-          new_temp = make_ssa_name (vec_dest, new_stmt);
-          gimple_assign_set_lhs (new_stmt, new_temp);
-          vect_finish_stmt_generation (stmt, new_stmt, gsi);
-          if (slp_node)
-            VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
-        }
-
-      if (slp_node)
-        continue;
-
-      if (j == 0)
-        STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
-      else
-        STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
-      prev_stmt_info = vinfo_for_stmt (new_stmt);
-    }
-
-  VEC_free (tree, heap, vec_oprnds0);
-  VEC_free (tree, heap, vec_oprnds1);
-
-  return true;
-}
-
-
-/* Function vectorizable_operation.
-
-   Check if STMT performs a binary, unary or ternary operation that can
-   be vectorized.
-   If VEC_STMT is also passed, vectorize the STMT: create a vectorized
-   stmt to replace it, put it in VEC_STMT, and insert it at BSI.
-   Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
-
-static bool
-vectorizable_operation (gimple stmt, gimple_stmt_iterator *gsi,
-                       gimple *vec_stmt, slp_tree slp_node)
-{
-  tree vec_dest;
-  tree scalar_dest;
-  tree op0, op1 = NULL_TREE, op2 = NULL_TREE;
-  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
-  tree vectype;
-  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
-  enum tree_code code;
-  enum machine_mode vec_mode;
-  tree new_temp;
-  int op_type;
-  optab optab;
-  int icode;
-  tree def;
-  gimple def_stmt;
-  enum vect_def_type dt[3]
-    = {vect_unknown_def_type, vect_unknown_def_type, vect_unknown_def_type};
-  gimple new_stmt = NULL;
-  stmt_vec_info prev_stmt_info;
-  int nunits_in;
-  int nunits_out;
-  tree vectype_out;
-  int ncopies;
-  int j, i;
-  VEC(tree,heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vec_oprnds2 = NULL;
-  tree vop0, vop1, vop2;
-  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
-  int vf;
-
-  if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
-    return false;
-
-  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
-    return false;
-
-  /* Is STMT a vectorizable binary/unary operation?   */
-  if (!is_gimple_assign (stmt))
-    return false;
-
-  if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
-    return false;
-
-  code = gimple_assign_rhs_code (stmt);
-
-  /* For pointer addition, we should use the normal plus for
-     the vector addition.  */
-  if (code == POINTER_PLUS_EXPR)
-    code = PLUS_EXPR;
-
-  /* Support only unary or binary operations.  */
-  op_type = TREE_CODE_LENGTH (code);
-  if (op_type != unary_op && op_type != binary_op && op_type != ternary_op)
-    {
-      if (vect_print_dump_info (REPORT_DETAILS))
-       fprintf (vect_dump, "num. args = %d (not unary/binary/ternary op).",
-                op_type);
-      return false;
-    }
-
-  scalar_dest = gimple_assign_lhs (stmt);
-  vectype_out = STMT_VINFO_VECTYPE (stmt_info);
-
-  op0 = gimple_assign_rhs1 (stmt);
-  if (!vect_is_simple_use_1 (op0, loop_vinfo, bb_vinfo,
-                            &def_stmt, &def, &dt[0], &vectype))
-    {
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "use not simple.");
-      return false;
-    }
-  /* If op0 is an external or constant def use a vector type with
-     the same size as the output vector type.  */
-  if (!vectype)
-    vectype = get_same_sized_vectype (TREE_TYPE (op0), vectype_out);
-  if (vec_stmt)
-    gcc_assert (vectype);
-  if (!vectype)
-    {
-      if (vect_print_dump_info (REPORT_DETAILS))
-        {
-          fprintf (vect_dump, "no vectype for scalar type ");
-          print_generic_expr (vect_dump, TREE_TYPE (op0), TDF_SLIM);
-        }
-
-      return false;
-    }
-
-  nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
-  nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
-  if (nunits_out != nunits_in)
-    return false;
-
-  if (op_type == binary_op || op_type == ternary_op)
-    {
-      op1 = gimple_assign_rhs2 (stmt);
-      if (!vect_is_simple_use (op1, loop_vinfo, bb_vinfo, &def_stmt, &def,
-                               &dt[1]))
-       {
-         if (vect_print_dump_info (REPORT_DETAILS))
-           fprintf (vect_dump, "use not simple.");
-         return false;
-       }
-    }
-  if (op_type == ternary_op)
-    {
-      op2 = gimple_assign_rhs3 (stmt);
-      if (!vect_is_simple_use (op2, loop_vinfo, bb_vinfo, &def_stmt, &def,
-                               &dt[2]))
-       {
-         if (vect_print_dump_info (REPORT_DETAILS))
-           fprintf (vect_dump, "use not simple.");
-         return false;
-       }
-    }
-
-  if (loop_vinfo)
-    vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
-  else
-    vf = 1;
-
-  /* Multiple types in SLP are handled by creating the appropriate number of
-     vectorized stmts for each SLP node.  Hence, NCOPIES is always 1 in
-     case of SLP.  */
-  if (slp_node || PURE_SLP_STMT (stmt_info))
-    ncopies = 1;
-  else
-    ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
-
-  gcc_assert (ncopies >= 1);
-
-  /* Shifts are handled in vectorizable_shift ().  */
-  if (code == LSHIFT_EXPR || code == RSHIFT_EXPR || code == LROTATE_EXPR
-      || code == RROTATE_EXPR)
-   return false;
-
-  optab = optab_for_tree_code (code, vectype, optab_default);
-
-  /* Supportable by target?  */
-  if (!optab)
-    {
-      if (vect_print_dump_info (REPORT_DETAILS))
-       fprintf (vect_dump, "no optab.");
-      return false;
-    }
-  vec_mode = TYPE_MODE (vectype);
-  icode = (int) optab_handler (optab, vec_mode);
-  if (icode == CODE_FOR_nothing)
-    {
-      if (vect_print_dump_info (REPORT_DETAILS))
-       fprintf (vect_dump, "op not supported by target.");
-      /* Check only during analysis.  */
-      if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
-         || (vf < vect_min_worthwhile_factor (code)
-              && !vec_stmt))
-        return false;
-      if (vect_print_dump_info (REPORT_DETAILS))
-       fprintf (vect_dump, "proceeding using word mode.");
-    }
-
-  /* Worthwhile without SIMD support?  Check only during analysis.  */
-  if (!VECTOR_MODE_P (TYPE_MODE (vectype))
-      && vf < vect_min_worthwhile_factor (code)
-      && !vec_stmt)
-    {
-      if (vect_print_dump_info (REPORT_DETAILS))
-       fprintf (vect_dump, "not worthwhile without SIMD support.");
-      return false;
-    }
-
-  if (!vec_stmt) /* transformation not required.  */
-    {
-      STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "=== vectorizable_operation ===");
-      vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
-      return true;
-    }
-
-  /** Transform.  **/
-
-  if (vect_print_dump_info (REPORT_DETAILS))
-    fprintf (vect_dump, "transform binary/unary operation.");
-
-  /* Handle def.  */
-  vec_dest = vect_create_destination_var (scalar_dest, vectype);
-
-  /* Allocate VECs for vector operands.  In case of SLP, vector operands are
-     created in the previous stages of the recursion, so no allocation is
-     needed, except for the case of shift with scalar shift argument.  In that
-     case we store the scalar operand in VEC_OPRNDS1 for every vector stmt to
-     be created to vectorize the SLP group, i.e., SLP_NODE->VEC_STMTS_SIZE.
-     In case of loop-based vectorization we allocate VECs of size 1.  We
-     allocate VEC_OPRNDS1 only in case of binary operation.  */
-  if (!slp_node)
-    {
-      vec_oprnds0 = VEC_alloc (tree, heap, 1);
-      if (op_type == binary_op || op_type == ternary_op)
-        vec_oprnds1 = VEC_alloc (tree, heap, 1);
-      if (op_type == ternary_op)
-        vec_oprnds2 = VEC_alloc (tree, heap, 1);
-    }
-
-  /* In case the vectorization factor (VF) is bigger than the number
-     of elements that we can fit in a vectype (nunits), we have to generate
-     more than one vector stmt - i.e - we need to "unroll" the
-     vector stmt by a factor VF/nunits.  In doing so, we record a pointer
-     from one copy of the vector stmt to the next, in the field
-     STMT_VINFO_RELATED_STMT.  This is necessary in order to allow following
-     stages to find the correct vector defs to be used when vectorizing
-     stmts that use the defs of the current stmt.  The example below
-     illustrates the vectorization process when VF=16 and nunits=4 (i.e.,
-     we need to create 4 vectorized stmts):
-
-     before vectorization:
-                                RELATED_STMT    VEC_STMT
-        S1:     x = memref      -               -
-        S2:     z = x + 1       -               -
-
-     step 1: vectorize stmt S1 (done in vectorizable_load. See more details
-             there):
-                                RELATED_STMT    VEC_STMT
-        VS1_0:  vx0 = memref0   VS1_1           -
-        VS1_1:  vx1 = memref1   VS1_2           -
-        VS1_2:  vx2 = memref2   VS1_3           -
-        VS1_3:  vx3 = memref3   -               -
-        S1:     x = load        -               VS1_0
-        S2:     z = x + 1       -               -
-
-     step2: vectorize stmt S2 (done here):
-        To vectorize stmt S2 we first need to find the relevant vector
-        def for the first operand 'x'.  This is, as usual, obtained from
-        the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
-        that defines 'x' (S1).  This way we find the stmt VS1_0, and the
-        relevant vector def 'vx0'.  Having found 'vx0' we can generate
-        the vector stmt VS2_0, and as usual, record it in the
-        STMT_VINFO_VEC_STMT of stmt S2.
-        When creating the second copy (VS2_1), we obtain the relevant vector
-        def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
-        stmt VS1_0.  This way we find the stmt VS1_1 and the relevant
-        vector def 'vx1'.  Using 'vx1' we create stmt VS2_1 and record a
-        pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
-        Similarly when creating stmts VS2_2 and VS2_3.  This is the resulting
-        chain of stmts and pointers:
-                                RELATED_STMT    VEC_STMT
-        VS1_0:  vx0 = memref0   VS1_1           -
-        VS1_1:  vx1 = memref1   VS1_2           -
-        VS1_2:  vx2 = memref2   VS1_3           -
-        VS1_3:  vx3 = memref3   -               -
-        S1:     x = load        -               VS1_0
-        VS2_0:  vz0 = vx0 + v1  VS2_1           -
-        VS2_1:  vz1 = vx1 + v1  VS2_2           -
-        VS2_2:  vz2 = vx2 + v1  VS2_3           -
-        VS2_3:  vz3 = vx3 + v1  -               -
-        S2:     z = x + 1       -               VS2_0  */
-
-  prev_stmt_info = NULL;
-  for (j = 0; j < ncopies; j++)
-    {
-      /* Handle uses.  */
-      if (j == 0)
-       {
-         if (op_type == binary_op || op_type == ternary_op)
-           vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
-                              slp_node);
-         else
-           vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL,
-                              slp_node);
-         if (op_type == ternary_op)
-           {
-             vec_oprnds2 = VEC_alloc (tree, heap, 1);
-             VEC_quick_push (tree, vec_oprnds2,
-                             vect_get_vec_def_for_operand (op2, stmt, NULL));
-           }
-       }
-      else
-       {
-         vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, &vec_oprnds1);
-         if (op_type == ternary_op)
-           {
-             tree vec_oprnd = VEC_pop (tree, vec_oprnds2);
-             VEC_quick_push (tree, vec_oprnds2,
-                             vect_get_vec_def_for_stmt_copy (dt[2],
-                                                             vec_oprnd));
-           }
-       }
-
-      /* Arguments are ready.  Create the new vector stmt.  */
-      FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, vop0)
-        {
-         vop1 = ((op_type == binary_op || op_type == ternary_op)
-                 ? VEC_index (tree, vec_oprnds1, i) : NULL_TREE);
-         vop2 = ((op_type == ternary_op)
-                 ? VEC_index (tree, vec_oprnds2, i) : NULL_TREE);
-         new_stmt = gimple_build_assign_with_ops3 (code, vec_dest,
-                                                   vop0, vop1, vop2);
-         new_temp = make_ssa_name (vec_dest, new_stmt);
-         gimple_assign_set_lhs (new_stmt, new_temp);
-         vect_finish_stmt_generation (stmt, new_stmt, gsi);
-          if (slp_node)
-           VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
-        }
-
-      if (slp_node)
-        continue;
-
-      if (j == 0)
-       STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
-      else
-       STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
-      prev_stmt_info = vinfo_for_stmt (new_stmt);
-    }
-
-  VEC_free (tree, heap, vec_oprnds0);
-  if (vec_oprnds1)
-    VEC_free (tree, heap, vec_oprnds1);
-  if (vec_oprnds2)
-    VEC_free (tree, heap, vec_oprnds2);
-
-  return true;
-}
-
-
-/* Get vectorized definitions for loop-based vectorization.  For the first
-   operand we call vect_get_vec_def_for_operand() (with OPRND containing
-   scalar operand), and for the rest we get a copy with
-   vect_get_vec_def_for_stmt_copy() using the previous vector definition
-   (stored in OPRND). See vect_get_vec_def_for_stmt_copy() for details.
-   The vectors are collected into VEC_OPRNDS.  */
-
-static void
-vect_get_loop_based_defs (tree *oprnd, gimple stmt, enum vect_def_type dt,
-                          VEC (tree, heap) **vec_oprnds, int multi_step_cvt)
-{
-  tree vec_oprnd;
-
-  /* Get first vector operand.  */
-  /* All the vector operands except the very first one (that is scalar oprnd)
-     are stmt copies.  */
-  if (TREE_CODE (TREE_TYPE (*oprnd)) != VECTOR_TYPE)
-    vec_oprnd = vect_get_vec_def_for_operand (*oprnd, stmt, NULL);
-  else
-    vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, *oprnd);
-
-  VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
-
-  /* Get second vector operand.  */
-  vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, vec_oprnd);
-  VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
-
-  *oprnd = vec_oprnd;
-
-  /* For conversion in multiple steps, continue to get operands
-     recursively.  */
-  if (multi_step_cvt)
-    vect_get_loop_based_defs (oprnd, stmt, dt, vec_oprnds,  multi_step_cvt - 1);
-}
-
-
-/* Create vectorized demotion statements for vector operands from VEC_OPRNDS.
-   For multi-step conversions store the resulting vectors and call the function
-   recursively.  */
-
-static void
-vect_create_vectorized_demotion_stmts (VEC (tree, heap) **vec_oprnds,
-                                       int multi_step_cvt, gimple stmt,
-                                       VEC (tree, heap) *vec_dsts,
-                                       gimple_stmt_iterator *gsi,
-                                       slp_tree slp_node, enum tree_code code,
-                                       stmt_vec_info *prev_stmt_info)
-{
-  unsigned int i;
-  tree vop0, vop1, new_tmp, vec_dest;
-  gimple new_stmt;
-  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
-
-  vec_dest = VEC_pop (tree, vec_dsts);
-
-  for (i = 0; i < VEC_length (tree, *vec_oprnds); i += 2)
-    {
-      /* Create demotion operation.  */
-      vop0 = VEC_index (tree, *vec_oprnds, i);
-      vop1 = VEC_index (tree, *vec_oprnds, i + 1);
-      new_stmt = gimple_build_assign_with_ops (code, vec_dest, vop0, vop1);
-      new_tmp = make_ssa_name (vec_dest, new_stmt);
-      gimple_assign_set_lhs (new_stmt, new_tmp);
-      vect_finish_stmt_generation (stmt, new_stmt, gsi);
-
-      if (multi_step_cvt)
-        /* Store the resulting vector for next recursive call.  */
-        VEC_replace (tree, *vec_oprnds, i/2, new_tmp);
-      else
-        {
-          /* This is the last step of the conversion sequence. Store the
-             vectors in SLP_NODE or in vector info of the scalar statement
-             (or in STMT_VINFO_RELATED_STMT chain).  */
-          if (slp_node)
-            VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
-          else
-            {
-              if (!*prev_stmt_info)
-                STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
-              else
-                STMT_VINFO_RELATED_STMT (*prev_stmt_info) = new_stmt;
-
-              *prev_stmt_info = vinfo_for_stmt (new_stmt);
-            }
-        }
-    }
-
-  /* For multi-step demotion operations we first generate demotion operations
-     from the source type to the intermediate types, and then combine the
-     results (stored in VEC_OPRNDS) in demotion operation to the destination
-     type.  */
-  if (multi_step_cvt)
-    {
-      /* At each level of recursion we have have of the operands we had at the
-         previous level.  */
-      VEC_truncate (tree, *vec_oprnds, (i+1)/2);
-      vect_create_vectorized_demotion_stmts (vec_oprnds, multi_step_cvt - 1,
-                                             stmt, vec_dsts, gsi, slp_node,
-                                             code, prev_stmt_info);
-    }
-}
-
-
-/* Function vectorizable_type_demotion
-
-   Check if STMT performs a binary or unary operation that involves
-   type demotion, and if it can be vectorized.
-   If VEC_STMT is also passed, vectorize the STMT: create a vectorized
-   stmt to replace it, put it in VEC_STMT, and insert it at BSI.
-   Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
-
-static bool
-vectorizable_type_demotion (gimple stmt, gimple_stmt_iterator *gsi,
-                           gimple *vec_stmt, slp_tree slp_node)
-{
-  tree vec_dest;
-  tree scalar_dest;
-  tree op0;
-  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
-  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
-  enum tree_code code, code1 = ERROR_MARK;
-  tree def;
-  gimple def_stmt;
-  enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
-  stmt_vec_info prev_stmt_info;
-  int nunits_in;
-  int nunits_out;
-  tree vectype_out;
-  int ncopies;
-  int j, i;
-  tree vectype_in;
-  int multi_step_cvt = 0;
-  VEC (tree, heap) *vec_oprnds0 = NULL;
-  VEC (tree, heap) *vec_dsts = NULL, *interm_types = NULL, *tmp_vec_dsts = NULL;
-  tree last_oprnd, intermediate_type;
-
-  /* FORNOW: not supported by basic block SLP vectorization.  */
-  gcc_assert (loop_vinfo);
-
-  if (!STMT_VINFO_RELEVANT_P (stmt_info))
-    return false;
-
-  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
-    return false;
-
-  /* Is STMT a vectorizable type-demotion operation?  */
-  if (!is_gimple_assign (stmt))
-    return false;
-
-  if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
-    return false;
-
-  code = gimple_assign_rhs_code (stmt);
-  if (!CONVERT_EXPR_CODE_P (code))
-    return false;
-
-  scalar_dest = gimple_assign_lhs (stmt);
-  vectype_out = STMT_VINFO_VECTYPE (stmt_info);
-
-  /* Check the operands of the operation.  */
-  op0 = gimple_assign_rhs1 (stmt);
-  if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
-         && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
-        || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
-            && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
-            && CONVERT_EXPR_CODE_P (code))))
-    return false;
-  if (!vect_is_simple_use_1 (op0, loop_vinfo, NULL,
-                            &def_stmt, &def, &dt[0], &vectype_in))
-    {
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "use not simple.");
-      return false;
-    }
-  /* If op0 is an external def use a vector type with the
-     same size as the output vector type if possible.  */
-  if (!vectype_in)
-    vectype_in = get_same_sized_vectype (TREE_TYPE (op0), vectype_out);
-  if (vec_stmt)
-    gcc_assert (vectype_in);
-  if (!vectype_in)
-    {
-      if (vect_print_dump_info (REPORT_DETAILS))
-        {
-          fprintf (vect_dump, "no vectype for scalar type ");
-          print_generic_expr (vect_dump, TREE_TYPE (op0), TDF_SLIM);
-        }
-
-      return false;
-    }
-
-  nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
-  nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
-  if (nunits_in >= nunits_out)
-    return false;
-
-  /* Multiple types in SLP are handled by creating the appropriate number of
-     vectorized stmts for each SLP node.  Hence, NCOPIES is always 1 in
-     case of SLP.  */
-  if (slp_node || PURE_SLP_STMT (stmt_info))
-    ncopies = 1;
-  else
-    ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
-  gcc_assert (ncopies >= 1);
-
-  /* Supportable by target?  */
-  if (!supportable_narrowing_operation (code, vectype_out, vectype_in,
-                                       &code1, &multi_step_cvt, &interm_types))
-    return false;
-
-  if (!vec_stmt) /* transformation not required.  */
-    {
-      STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "=== vectorizable_demotion ===");
-      vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
-      return true;
-    }
-
-  /** Transform.  **/
-  if (vect_print_dump_info (REPORT_DETAILS))
-    fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
-            ncopies);
-
-  /* In case of multi-step demotion, we first generate demotion operations to
-     the intermediate types, and then from those types to the final one.
-     We create vector destinations for the intermediate type (TYPES) received
-     from supportable_narrowing_operation, and store them in the correct order
-     for future use in vect_create_vectorized_demotion_stmts().  */
-  if (multi_step_cvt)
-    vec_dsts = VEC_alloc (tree, heap, multi_step_cvt + 1);
-  else
-    vec_dsts = VEC_alloc (tree, heap, 1);
-
-  vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
-  VEC_quick_push (tree, vec_dsts, vec_dest);
+          /* vec_oprnd1 is available if operand 1 should be of a scalar-type
+             (a special case for certain kinds of vector shifts); otherwise,
+             operand 1 should be of a vector type (the usual case).  */
+          if (vec_oprnd1)
+            vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL,
+                               slp_node, -1);
+          else
+            vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
+                               slp_node, -1);
+        }
+      else
+        vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, &vec_oprnds1);
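+
+      /* A sketch of the operand-1 handling above, with hypothetical loops:
+         for  a[i] = b[i] << 3  the shift amount is loop-invariant, so
+         operand 1 can stay a single scalar and only operand 0 needs vector
+         defs, whereas for  a[i] = b[i] << c[i]  a vector of shift amounts
+         must be obtained for operand 1 just like for operand 0.  */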
 
-  if (multi_step_cvt)
-    {
-      for (i = VEC_length (tree, interm_types) - 1;
-           VEC_iterate (tree, interm_types, i, intermediate_type); i--)
+      /* Arguments are ready.  Create the new vector stmt.  */
+      FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, vop0)
         {
-          vec_dest = vect_create_destination_var (scalar_dest,
-                                                  intermediate_type);
-          VEC_quick_push (tree, vec_dsts, vec_dest);
+          vop1 = VEC_index (tree, vec_oprnds1, i);
+          new_stmt = gimple_build_assign_with_ops (code, vec_dest, vop0, vop1);
+          new_temp = make_ssa_name (vec_dest, new_stmt);
+          gimple_assign_set_lhs (new_stmt, new_temp);
+          vect_finish_stmt_generation (stmt, new_stmt, gsi);
+          if (slp_node)
+            VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
         }
-    }
 
-  /* In case the vectorization factor (VF) is bigger than the number
-     of elements that we can fit in a vectype (nunits), we have to generate
-     more than one vector stmt - i.e - we need to "unroll" the
-     vector stmt by a factor VF/nunits.   */
-  last_oprnd = op0;
-  prev_stmt_info = NULL;
-  for (j = 0; j < ncopies; j++)
-    {
-      /* Handle uses.  */
       if (slp_node)
-        vect_get_slp_defs (op0, NULL_TREE, slp_node, &vec_oprnds0, NULL, -1);
-      else
-        {
-          VEC_free (tree, heap, vec_oprnds0);
-          vec_oprnds0 = VEC_alloc (tree, heap,
-                        (multi_step_cvt ? vect_pow2 (multi_step_cvt) * 2 : 2));
-          vect_get_loop_based_defs (&last_oprnd, stmt, dt[0], &vec_oprnds0,
-                                    vect_pow2 (multi_step_cvt) - 1);
-        }
+        continue;
 
-      /* Arguments are ready.  Create the new vector stmts.  */
-      tmp_vec_dsts = VEC_copy (tree, heap, vec_dsts);
-      vect_create_vectorized_demotion_stmts (&vec_oprnds0,
-                                             multi_step_cvt, stmt, tmp_vec_dsts,
-                                             gsi, slp_node, code1,
-                                             &prev_stmt_info);
+      if (j == 0)
+        STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
+      else
+        STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
+      prev_stmt_info = vinfo_for_stmt (new_stmt);
     }
 
   VEC_free (tree, heap, vec_oprnds0);
-  VEC_free (tree, heap, vec_dsts);
-  VEC_free (tree, heap, tmp_vec_dsts);
-  VEC_free (tree, heap, interm_types);
+  VEC_free (tree, heap, vec_oprnds1);
 
-  *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
   return true;
 }
 
 
-/* Create vectorized promotion statements for vector operands from VEC_OPRNDS0
-   and VEC_OPRNDS1 (for binary operations).  For multi-step conversions store
-   the resulting vectors and call the function recursively.  */
-
-static void
-vect_create_vectorized_promotion_stmts (VEC (tree, heap) **vec_oprnds0,
-                                        VEC (tree, heap) **vec_oprnds1,
-                                        int multi_step_cvt, gimple stmt,
-                                        VEC (tree, heap) *vec_dsts,
-                                        gimple_stmt_iterator *gsi,
-                                        slp_tree slp_node, enum tree_code code1,
-                                        enum tree_code code2, tree decl1,
-                                        tree decl2, int op_type,
-                                        stmt_vec_info *prev_stmt_info)
-{
-  int i;
-  tree vop0, vop1, new_tmp1, new_tmp2, vec_dest;
-  gimple new_stmt1, new_stmt2;
-  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
-  VEC (tree, heap) *vec_tmp;
-
-  vec_dest = VEC_pop (tree, vec_dsts);
-  vec_tmp = VEC_alloc (tree, heap, VEC_length (tree, *vec_oprnds0) * 2);
-
-  FOR_EACH_VEC_ELT (tree, *vec_oprnds0, i, vop0)
-    {
-      if (op_type == binary_op)
-        vop1 = VEC_index (tree, *vec_oprnds1, i);
-      else
-        vop1 = NULL_TREE;
-
-      /* Generate the two halves of promotion operation.  */
-      new_stmt1 = vect_gen_widened_results_half (code1, decl1, vop0, vop1,
-                                                 op_type, vec_dest, gsi, stmt);
-      new_stmt2 = vect_gen_widened_results_half (code2, decl2, vop0, vop1,
-                                                 op_type, vec_dest, gsi, stmt);
-      if (is_gimple_call (new_stmt1))
-        {
-          new_tmp1 = gimple_call_lhs (new_stmt1);
-          new_tmp2 = gimple_call_lhs (new_stmt2);
-        }
-      else
-        {
-          new_tmp1 = gimple_assign_lhs (new_stmt1);
-          new_tmp2 = gimple_assign_lhs (new_stmt2);
-        }
-
-      if (multi_step_cvt)
-        {
-          /* Store the results for the recursive call.  */
-          VEC_quick_push (tree, vec_tmp, new_tmp1);
-          VEC_quick_push (tree, vec_tmp, new_tmp2);
-        }
-      else
-        {
-          /* Last step of promotion sequence - store the results.  */
-          if (slp_node)
-            {
-              VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt1);
-              VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt2);
-            }
-          else
-            {
-              if (!*prev_stmt_info)
-                STMT_VINFO_VEC_STMT (stmt_info) = new_stmt1;
-              else
-                STMT_VINFO_RELATED_STMT (*prev_stmt_info) = new_stmt1;
-
-              *prev_stmt_info = vinfo_for_stmt (new_stmt1);
-              STMT_VINFO_RELATED_STMT (*prev_stmt_info) = new_stmt2;
-              *prev_stmt_info = vinfo_for_stmt (new_stmt2);
-            }
-        }
-    }
-
-  if (multi_step_cvt)
-    {
-      /* For a multi-step promotion operation we call the function
-         recursively for every stage.  We start from the input type,
-         create promotion operations to the intermediate types, and then
-         create promotions to the output type.  */
-      *vec_oprnds0 = VEC_copy (tree, heap, vec_tmp);
-      vect_create_vectorized_promotion_stmts (vec_oprnds0, vec_oprnds1,
-                                              multi_step_cvt - 1, stmt,
-                                              vec_dsts, gsi, slp_node, code1,
-                                              code2, decl2, decl2, op_type,
-                                              prev_stmt_info);
-    }
-
-  VEC_free (tree, heap, vec_tmp);
-}
+static tree permute_vec_elements (tree, tree, tree, gimple,
+                                 gimple_stmt_iterator *);
 
 
-/* Function vectorizable_type_promotion
+/* Function vectorizable_operation.
 
-   Check if STMT performs a binary or unary operation that involves
-   type promotion, and if it can be vectorized.
+   Check if STMT performs a binary, unary or ternary operation that can
+   be vectorized.
    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
    stmt to replace it, put it in VEC_STMT, and insert it at BSI.
    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
 
 static bool
-vectorizable_type_promotion (gimple stmt, gimple_stmt_iterator *gsi,
-                            gimple *vec_stmt, slp_tree slp_node)
+vectorizable_operation (gimple stmt, gimple_stmt_iterator *gsi,
+                       gimple *vec_stmt, slp_tree slp_node)
 {
   tree vec_dest;
   tree scalar_dest;
-  tree op0, op1 = NULL;
-  tree vec_oprnd0=NULL, vec_oprnd1=NULL;
+  tree op0, op1 = NULL_TREE, op2 = NULL_TREE;
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+  tree vectype;
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
-  enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
-  tree decl1 = NULL_TREE, decl2 = NULL_TREE;
+  enum tree_code code;
+  enum machine_mode vec_mode;
+  tree new_temp;
   int op_type;
+  optab optab;
+  int icode;
   tree def;
   gimple def_stmt;
-  enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
+  enum vect_def_type dt[3]
+    = {vect_unknown_def_type, vect_unknown_def_type, vect_unknown_def_type};
+  gimple new_stmt = NULL;
   stmt_vec_info prev_stmt_info;
   int nunits_in;
   int nunits_out;
   tree vectype_out;
   int ncopies;
   int j, i;
-  tree vectype_in;
-  tree intermediate_type = NULL_TREE;
-  int multi_step_cvt = 0;
-  VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
-  VEC (tree, heap) *vec_dsts = NULL, *interm_types = NULL, *tmp_vec_dsts = NULL;
-
-  /* FORNOW: not supported by basic block SLP vectorization.  */
-  gcc_assert (loop_vinfo);
+  VEC(tree,heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vec_oprnds2 = NULL;
+  tree vop0, vop1, vop2;
+  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
+  int vf;
 
-  if (!STMT_VINFO_RELEVANT_P (stmt_info))
+  if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
     return false;
 
   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
     return false;
 
-  /* Is STMT a vectorizable type-promotion operation?  */
+  /* Is STMT a vectorizable binary/unary operation?   */
   if (!is_gimple_assign (stmt))
     return false;
 
@@ -3253,62 +3418,54 @@ vectorizable_type_promotion (gimple stmt, gimple_stmt_iterator *gsi,
     return false;
 
   code = gimple_assign_rhs_code (stmt);
-  if (!CONVERT_EXPR_CODE_P (code)
-      && code != WIDEN_MULT_EXPR)
-    return false;
+
+  /* For pointer addition, we should use the normal plus for
+     the vector addition.  */
+  if (code == POINTER_PLUS_EXPR)
+    code = PLUS_EXPR;
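+
+  /* A sketch with a hypothetical statement: a scalar
+       q_2 = p_1 + 4;    <-- POINTER_PLUS_EXPR
+     becomes an ordinary element-wise PLUS_EXPR on the vector of pointer
+     values, e.g.  vect_q = vect_p + { 4, 4, 4, 4 };  */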
+
+  /* Support only unary, binary or ternary operations.  */
+  op_type = TREE_CODE_LENGTH (code);
+  if (op_type != unary_op && op_type != binary_op && op_type != ternary_op)
+    {
+      if (vect_print_dump_info (REPORT_DETAILS))
+       fprintf (vect_dump, "num. args = %d (not unary/binary/ternary op).",
+                op_type);
+      return false;
+    }
 
   scalar_dest = gimple_assign_lhs (stmt);
   vectype_out = STMT_VINFO_VECTYPE (stmt_info);
 
-  /* Check the operands of the operation.  */
-  op0 = gimple_assign_rhs1 (stmt);
-  if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
-         && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
-        || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
-            && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
-            && CONVERT_EXPR_CODE_P (code))))
-    return false;
-  if (!vect_is_simple_use_1 (op0, loop_vinfo, NULL,
-                            &def_stmt, &def, &dt[0], &vectype_in))
+  /* Most operations cannot handle bit-precision types without extra
+     truncations.  */
+  if ((TYPE_PRECISION (TREE_TYPE (scalar_dest))
+       != GET_MODE_PRECISION (TYPE_MODE (TREE_TYPE (scalar_dest))))
+      /* Exceptions are bitwise binary operations.  */
+      && code != BIT_IOR_EXPR
+      && code != BIT_XOR_EXPR
+      && code != BIT_AND_EXPR)
     {
       if (vect_print_dump_info (REPORT_DETAILS))
-       fprintf (vect_dump, "use not simple.");
+        fprintf (vect_dump, "bit-precision arithmetic not supported.");
       return false;
     }
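+
+  /* A sketch with a hypothetical type: a 3-bit bit-field has TYPE_PRECISION 3
+     but lives in QImode, whose precision is 8.  A vector PLUS_EXPR on such
+     elements could set bits above bit 2 and would need a masking step after
+     every operation, so it is rejected here; BIT_AND/BIT_IOR/BIT_XOR never
+     set a bit that is not already set in one of the operands, hence the
+     exception.  */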
 
-  op_type = TREE_CODE_LENGTH (code);
-  if (op_type == binary_op)
+  op0 = gimple_assign_rhs1 (stmt);
+  if (!vect_is_simple_use_1 (op0, stmt, loop_vinfo, bb_vinfo,
+                            &def_stmt, &def, &dt[0], &vectype))
     {
-      bool ok;
-
-      op1 = gimple_assign_rhs2 (stmt);
-      if (code == WIDEN_MULT_EXPR)
-        {
-         /* For WIDEN_MULT_EXPR, if OP0 is a constant, use the type of
-            OP1.  */
-          if (CONSTANT_CLASS_P (op0))
-            ok = vect_is_simple_use_1 (op1, loop_vinfo, NULL,
-                             &def_stmt, &def, &dt[1], &vectype_in);
-          else
-            ok = vect_is_simple_use (op1, loop_vinfo, NULL, &def_stmt, &def,
-                                     &dt[1]);
-
-          if (!ok)
-            {
-             if (vect_print_dump_info (REPORT_DETAILS))
-               fprintf (vect_dump, "use not simple.");
-              return false;
-            }
-        }        
+      if (vect_print_dump_info (REPORT_DETAILS))
+        fprintf (vect_dump, "use not simple.");
+      return false;
     }
-
   /* If op0 is an external or constant def use a vector type with
      the same size as the output vector type.  */
-  if (!vectype_in)
-    vectype_in = get_same_sized_vectype (TREE_TYPE (op0), vectype_out);
+  if (!vectype)
+    vectype = get_same_sized_vectype (TREE_TYPE (op0), vectype_out);
   if (vec_stmt)
-    gcc_assert (vectype_in);
-  if (!vectype_in)
+    gcc_assert (vectype);
+  if (!vectype)
     {
       if (vect_print_dump_info (REPORT_DETAILS))
         {
@@ -3319,11 +3476,39 @@ vectorizable_type_promotion (gimple stmt, gimple_stmt_iterator *gsi,
       return false;
     }
 
-  nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
   nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
-  if (nunits_in <= nunits_out)
+  nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
+  if (nunits_out != nunits_in)
     return false;
 
+  if (op_type == binary_op || op_type == ternary_op)
+    {
+      op1 = gimple_assign_rhs2 (stmt);
+      if (!vect_is_simple_use (op1, stmt, loop_vinfo, bb_vinfo, &def_stmt,
+                              &def, &dt[1]))
+       {
+         if (vect_print_dump_info (REPORT_DETAILS))
+           fprintf (vect_dump, "use not simple.");
+         return false;
+       }
+    }
+  if (op_type == ternary_op)
+    {
+      op2 = gimple_assign_rhs3 (stmt);
+      if (!vect_is_simple_use (op2, stmt, loop_vinfo, bb_vinfo, &def_stmt,
+                              &def, &dt[2]))
+       {
+         if (vect_print_dump_info (REPORT_DETAILS))
+           fprintf (vect_dump, "use not simple.");
+         return false;
+       }
+    }
+
+  if (loop_vinfo)
+    vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+  else
+    vf = 1;
+
   /* Multiple types in SLP are handled by creating the appropriate number of
      vectorized stmts for each SLP node.  Hence, NCOPIES is always 1 in
      case of SLP.  */
@@ -3334,126 +3519,204 @@ vectorizable_type_promotion (gimple stmt, gimple_stmt_iterator *gsi,
 
   gcc_assert (ncopies >= 1);
 
+  /* Shifts are handled in vectorizable_shift ().  */
+  if (code == LSHIFT_EXPR || code == RSHIFT_EXPR || code == LROTATE_EXPR
+      || code == RROTATE_EXPR)
+    return false;
+
   /* Supportable by target?  */
-  if (!supportable_widening_operation (code, stmt, vectype_out, vectype_in,
-                                      &decl1, &decl2, &code1, &code2,
-                                       &multi_step_cvt, &interm_types))
-    return false;
 
-  /* Binary widening operation can only be supported directly by the
-     architecture.  */
-  gcc_assert (!(multi_step_cvt && op_type == binary_op));
+  vec_mode = TYPE_MODE (vectype);
+  if (code == MULT_HIGHPART_EXPR)
+    {
+      if (can_mult_highpart_p (vec_mode, TYPE_UNSIGNED (vectype)))
+       icode = LAST_INSN_CODE;
+      else
+       icode = CODE_FOR_nothing;
+    }
+  else
+    {
+      optab = optab_for_tree_code (code, vectype, optab_default);
+      if (!optab)
+       {
+         if (vect_print_dump_info (REPORT_DETAILS))
+           fprintf (vect_dump, "no optab.");
+         return false;
+       }
+      icode = (int) optab_handler (optab, vec_mode);
+    }
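+
+  /* A sketch of the lookup above for a hypothetical case: with code
+     PLUS_EXPR and a V4SI vectype, optab_for_tree_code returns add_optab and
+     optab_handler looks up the target's addv4si3 pattern; icode is its insn
+     code if the pattern exists and CODE_FOR_nothing otherwise, in which case
+     the word-mode fallback below is considered.  */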
+
+  if (icode == CODE_FOR_nothing)
+    {
+      if (vect_print_dump_info (REPORT_DETAILS))
+       fprintf (vect_dump, "op not supported by target.");
+      /* Check only during analysis.  */
+      if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
+         || (!vec_stmt && vf < vect_min_worthwhile_factor (code)))
+        return false;
+      if (vect_print_dump_info (REPORT_DETAILS))
+       fprintf (vect_dump, "proceeding using word mode.");
+    }
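+
+  /* A sketch of the word-mode fallback, with a hypothetical target: on a
+     64-bit machine without a vector AND instruction, a BIT_AND_EXPR on an
+     8-byte vector still fits in one word-sized AND, so the analysis may
+     proceed when GET_MODE_SIZE (vec_mode) == UNITS_PER_WORD and the
+     vectorization factor makes it worthwhile.  */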
+
+  /* Worthwhile without SIMD support?  Check only during analysis.  */
+  if (!VECTOR_MODE_P (vec_mode)
+      && !vec_stmt
+      && vf < vect_min_worthwhile_factor (code))
+    {
+      if (vect_print_dump_info (REPORT_DETAILS))
+       fprintf (vect_dump, "not worthwhile without SIMD support.");
+      return false;
+    }
 
   if (!vec_stmt) /* transformation not required.  */
     {
-      STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
+      STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
       if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "=== vectorizable_promotion ===");
-      vect_model_simple_cost (stmt_info, 2*ncopies, dt, NULL);
+        fprintf (vect_dump, "=== vectorizable_operation ===");
+      vect_model_simple_cost (stmt_info, ncopies, dt, NULL, NULL);
       return true;
     }
 
   /** Transform.  **/
 
   if (vect_print_dump_info (REPORT_DETAILS))
-    fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
-                        ncopies);
-
-  if (code == WIDEN_MULT_EXPR)
-    {
-      if (CONSTANT_CLASS_P (op0))
-       op0 = fold_convert (TREE_TYPE (op1), op0);
-      else if (CONSTANT_CLASS_P (op1))
-       op1 = fold_convert (TREE_TYPE (op0), op1);
-    }
+    fprintf (vect_dump, "transform binary/unary operation.");
 
   /* Handle def.  */
-  /* In case of multi-step promotion, we first generate promotion operations
-     to the intermediate types, and then from those types to the final one.
-     We store vector destination in VEC_DSTS in the correct order for
-     recursive creation of promotion operations in
-     vect_create_vectorized_promotion_stmts(). Vector destinations are created
-     according to TYPES received from supportable_widening_operation().   */
-  if (multi_step_cvt)
-    vec_dsts = VEC_alloc (tree, heap, multi_step_cvt + 1);
-  else
-    vec_dsts = VEC_alloc (tree, heap, 1);
-
-  vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
-  VEC_quick_push (tree, vec_dsts, vec_dest);
-
-  if (multi_step_cvt)
-    {
-      for (i = VEC_length (tree, interm_types) - 1;
-           VEC_iterate (tree, interm_types, i, intermediate_type); i--)
-        {
-          vec_dest = vect_create_destination_var (scalar_dest,
-                                                  intermediate_type);
-          VEC_quick_push (tree, vec_dsts, vec_dest);
-        }
-    }
+  vec_dest = vect_create_destination_var (scalar_dest, vectype);
 
+  /* Allocate VECs for vector operands.  In case of SLP, vector operands are
+     created in the previous stages of the recursion, so no allocation is
+     needed.  In case of loop-based vectorization we allocate VECs of size 1:
+     VEC_OPRNDS1 only for binary and ternary operations, and VEC_OPRNDS2 only
+     for ternary operations.  */
   if (!slp_node)
     {
-      vec_oprnds0 = VEC_alloc (tree, heap,
-                            (multi_step_cvt ? vect_pow2 (multi_step_cvt) : 1));
-      if (op_type == binary_op)
+      vec_oprnds0 = VEC_alloc (tree, heap, 1);
+      if (op_type == binary_op || op_type == ternary_op)
         vec_oprnds1 = VEC_alloc (tree, heap, 1);
+      if (op_type == ternary_op)
+        vec_oprnds2 = VEC_alloc (tree, heap, 1);
     }
 
   /* In case the vectorization factor (VF) is bigger than the number
      of elements that we can fit in a vectype (nunits), we have to generate
      more than one vector stmt - i.e - we need to "unroll" the
-     vector stmt by a factor VF/nunits.   */
+     vector stmt by a factor VF/nunits.  In doing so, we record a pointer
+     from one copy of the vector stmt to the next, in the field
+     STMT_VINFO_RELATED_STMT.  This is necessary in order to allow following
+     stages to find the correct vector defs to be used when vectorizing
+     stmts that use the defs of the current stmt.  The example below
+     illustrates the vectorization process when VF=16 and nunits=4 (i.e.,
+     we need to create 4 vectorized stmts):
+
+     before vectorization:
+                                RELATED_STMT    VEC_STMT
+        S1:     x = memref      -               -
+        S2:     z = x + 1       -               -
+
+     step 1: vectorize stmt S1 (done in vectorizable_load. See more details
+             there):
+                                RELATED_STMT    VEC_STMT
+        VS1_0:  vx0 = memref0   VS1_1           -
+        VS1_1:  vx1 = memref1   VS1_2           -
+        VS1_2:  vx2 = memref2   VS1_3           -
+        VS1_3:  vx3 = memref3   -               -
+        S1:     x = load        -               VS1_0
+        S2:     z = x + 1       -               -
+
+     step2: vectorize stmt S2 (done here):
+        To vectorize stmt S2 we first need to find the relevant vector
+        def for the first operand 'x'.  This is, as usual, obtained from
+        the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
+        that defines 'x' (S1).  This way we find the stmt VS1_0, and the
+        relevant vector def 'vx0'.  Having found 'vx0' we can generate
+        the vector stmt VS2_0, and as usual, record it in the
+        STMT_VINFO_VEC_STMT of stmt S2.
+        When creating the second copy (VS2_1), we obtain the relevant vector
+        def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
+        stmt VS1_0.  This way we find the stmt VS1_1 and the relevant
+        vector def 'vx1'.  Using 'vx1' we create stmt VS2_1 and record a
+        pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
+        Similarly when creating stmts VS2_2 and VS2_3.  This is the resulting
+        chain of stmts and pointers:
+                                RELATED_STMT    VEC_STMT
+        VS1_0:  vx0 = memref0   VS1_1           -
+        VS1_1:  vx1 = memref1   VS1_2           -
+        VS1_2:  vx2 = memref2   VS1_3           -
+        VS1_3:  vx3 = memref3   -               -
+        S1:     x = load        -               VS1_0
+        VS2_0:  vz0 = vx0 + v1  VS2_1           -
+        VS2_1:  vz1 = vx1 + v1  VS2_2           -
+        VS2_2:  vz2 = vx2 + v1  VS2_3           -
+        VS2_3:  vz3 = vx3 + v1  -               -
+        S2:     z = x + 1       -               VS2_0  */
 
   prev_stmt_info = NULL;
   for (j = 0; j < ncopies; j++)
     {
       /* Handle uses.  */
       if (j == 0)
-        {
-          if (slp_node)
-              vect_get_slp_defs (op0, op1, slp_node, &vec_oprnds0,
-                                 &vec_oprnds1, -1);
-          else
-            {
-              vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
-              VEC_quick_push (tree, vec_oprnds0, vec_oprnd0);
-              if (op_type == binary_op)
-                {
-                  vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
-                  VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
-                }
-            }
-        }
+       {
+         if (op_type == binary_op || op_type == ternary_op)
+           vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
+                              slp_node, -1);
+         else
+           vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL,
+                              slp_node, -1);
+         if (op_type == ternary_op)
+           {
+             vec_oprnds2 = VEC_alloc (tree, heap, 1);
+             VEC_quick_push (tree, vec_oprnds2,
+                             vect_get_vec_def_for_operand (op2, stmt, NULL));
+           }
+       }
       else
+       {
+         vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, &vec_oprnds1);
+         if (op_type == ternary_op)
+           {
+             tree vec_oprnd = VEC_pop (tree, vec_oprnds2);
+             VEC_quick_push (tree, vec_oprnds2,
+                             vect_get_vec_def_for_stmt_copy (dt[2],
+                                                             vec_oprnd));
+           }
+       }
+
+      /* Arguments are ready.  Create the new vector stmt.  */
+      FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, vop0)
         {
-          vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
-          VEC_replace (tree, vec_oprnds0, 0, vec_oprnd0);
-          if (op_type == binary_op)
-            {
-              vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd1);
-              VEC_replace (tree, vec_oprnds1, 0, vec_oprnd1);
-            }
+         vop1 = ((op_type == binary_op || op_type == ternary_op)
+                 ? VEC_index (tree, vec_oprnds1, i) : NULL_TREE);
+         vop2 = ((op_type == ternary_op)
+                 ? VEC_index (tree, vec_oprnds2, i) : NULL_TREE);
+         new_stmt = gimple_build_assign_with_ops3 (code, vec_dest,
+                                                   vop0, vop1, vop2);
+         new_temp = make_ssa_name (vec_dest, new_stmt);
+         gimple_assign_set_lhs (new_stmt, new_temp);
+         vect_finish_stmt_generation (stmt, new_stmt, gsi);
+          if (slp_node)
+           VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
         }
 
-      /* Arguments are ready.  Create the new vector stmts.  */
-      tmp_vec_dsts = VEC_copy (tree, heap, vec_dsts);
-      vect_create_vectorized_promotion_stmts (&vec_oprnds0, &vec_oprnds1,
-                                              multi_step_cvt, stmt,
-                                              tmp_vec_dsts,
-                                              gsi, slp_node, code1, code2,
-                                              decl1, decl2, op_type,
-                                              &prev_stmt_info);
+      if (slp_node)
+        continue;
+
+      if (j == 0)
+       STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
+      else
+       STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
+      prev_stmt_info = vinfo_for_stmt (new_stmt);
     }
 
-  VEC_free (tree, heap, vec_dsts);
-  VEC_free (tree, heap, tmp_vec_dsts);
-  VEC_free (tree, heap, interm_types);
   VEC_free (tree, heap, vec_oprnds0);
-  VEC_free (tree, heap, vec_oprnds1);
+  if (vec_oprnds1)
+    VEC_free (tree, heap, vec_oprnds1);
+  if (vec_oprnds2)
+    VEC_free (tree, heap, vec_oprnds2);
 
-  *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
   return true;
 }
 
@@ -3492,7 +3755,7 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
   int ncopies;
   int j;
   gimple next_stmt, first_stmt = NULL;
-  bool strided_store = false;
+  bool grouped_store = false;
   bool store_lanes_p = false;
   unsigned int group_size, i;
   VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
@@ -3536,6 +3799,9 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
     return false;
 
   scalar_dest = gimple_assign_lhs (stmt);
+  if (TREE_CODE (scalar_dest) == VIEW_CONVERT_EXPR
+      && is_pattern_stmt_p (stmt_info))
+    scalar_dest = TREE_OPERAND (scalar_dest, 0);
   if (TREE_CODE (scalar_dest) != ARRAY_REF
       && TREE_CODE (scalar_dest) != INDIRECT_REF
       && TREE_CODE (scalar_dest) != COMPONENT_REF
@@ -3546,24 +3812,17 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
 
   gcc_assert (gimple_assign_single_p (stmt));
   op = gimple_assign_rhs1 (stmt);
-  if (!vect_is_simple_use (op, loop_vinfo, bb_vinfo, &def_stmt, &def, &dt))
+  if (!vect_is_simple_use (op, stmt, loop_vinfo, bb_vinfo, &def_stmt,
+                          &def, &dt))
     {
       if (vect_print_dump_info (REPORT_DETAILS))
         fprintf (vect_dump, "use not simple.");
       return false;
     }
 
-  /* The scalar rhs type needs to be trivially convertible to the vector
-     component type.  This should always be the case.  */
   elem_type = TREE_TYPE (vectype);
-  if (!useless_type_conversion_p (elem_type, TREE_TYPE (op)))
-    {
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "???  operands of different types");
-      return false;
-    }
-
   vec_mode = TYPE_MODE (vectype);
+
   /* FORNOW. In some cases can vectorize even if data-type not supported
      (e.g. - array initialization with 0).  */
   if (optab_handler (mov_optab, vec_mode) == CODE_FOR_nothing)
@@ -3572,23 +3831,25 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
   if (!STMT_VINFO_DATA_REF (stmt_info))
     return false;
 
-  if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
+  if (tree_int_cst_compare (loop && nested_in_vect_loop_p (loop, stmt)
+                           ? STMT_VINFO_DR_STEP (stmt_info) : DR_STEP (dr),
+                           size_zero_node) < 0)
     {
       if (vect_print_dump_info (REPORT_DETAILS))
         fprintf (vect_dump, "negative step for store.");
       return false;
     }
 
-  if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
+  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
     {
-      strided_store = true;
+      grouped_store = true;
       first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
       if (!slp && !PURE_SLP_STMT (stmt_info))
        {
          group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
          if (vect_store_lanes_supported (vectype, group_size))
            store_lanes_p = true;
-         else if (!vect_strided_store_supported (vectype, group_size))
+         else if (!vect_grouped_store_supported (vectype, group_size))
            return false;
        }
 
@@ -3601,8 +3862,8 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
             {
              gcc_assert (gimple_assign_single_p (next_stmt));
              op = gimple_assign_rhs1 (next_stmt);
-              if (!vect_is_simple_use (op, loop_vinfo, bb_vinfo, &def_stmt,
-                                       &def, &dt))
+              if (!vect_is_simple_use (op, next_stmt, loop_vinfo, bb_vinfo,
+                                      &def_stmt, &def, &dt))
                 {
                   if (vect_print_dump_info (REPORT_DETAILS))
                     fprintf (vect_dump, "use not simple.");
@@ -3616,13 +3877,14 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
   if (!vec_stmt) /* transformation not required.  */
     {
       STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
-      vect_model_store_cost (stmt_info, ncopies, store_lanes_p, dt, NULL);
+      vect_model_store_cost (stmt_info, ncopies, store_lanes_p, dt,
+                            NULL, NULL, NULL);
       return true;
     }
 
   /** Transform.  **/
 
-  if (strided_store)
+  if (grouped_store)
     {
       first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
       group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
@@ -3644,12 +3906,13 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
 
       if (slp)
         {
-          strided_store = false;
+          grouped_store = false;
           /* VEC_NUM is the number of vect stmts to be created for this 
              group.  */
           vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
           first_stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (slp_node), 0); 
           first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
+         op = gimple_assign_rhs1 (first_stmt);
         } 
       else
         /* VEC_NUM is the number of vect stmts to be created for this 
@@ -3688,7 +3951,7 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
      vector stmt by a factor VF/nunits.  For more details see documentation in
      vect_get_vec_def_for_copy_stmt.  */
 
-  /* In case of interleaving (non-unit strided access):
+  /* In case of interleaving (non-unit grouped access):
 
         S1:  &base + 2 = x2
         S2:  &base = x0
@@ -3706,8 +3969,8 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
 
      Then permutation statements are generated:
 
-        VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
-        VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
+       VS5: vx5 = VEC_PERM_EXPR < vx0, vx3, {0, 8, 1, 9, 2, 10, 3, 11} >
+       VS6: vx6 = VEC_PERM_EXPR < vx0, vx3, {4, 12, 5, 13, 6, 14, 7, 15} >
        ...
 
      And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
@@ -3732,8 +3995,8 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
           if (slp)
             {
              /* Get vectorized arguments for SLP_NODE.  */
-              vect_get_slp_defs (NULL_TREE, NULL_TREE, slp_node, &vec_oprnds,
-                                 NULL, -1);
+              vect_get_vec_defs (op, NULL_TREE, stmt, &vec_oprnds,
+                                 NULL, slp_node, -1);
 
               vec_oprnd = VEC_index (tree, vec_oprnds, 0);
             }
@@ -3744,7 +4007,7 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
                 used as an input to vect_permute_store_chain(), and OPRNDS as
                 an input to vect_get_vec_def_for_stmt_copy() for the next copy.
 
-                If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
+                If the store is not grouped, GROUP_SIZE is 1, and DR_CHAIN and
                 OPRNDS are of size 1.  */
              next_stmt = first_stmt;
              for (i = 0; i < group_size; i++)
@@ -3781,13 +4044,13 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
             DR_CHAIN is then used as an input to vect_permute_store_chain(),
             and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
             next copy.
-            If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
+            If the store is not grouped, GROUP_SIZE is 1, and DR_CHAIN and
             OPRNDS are of size 1.  */
          for (i = 0; i < group_size; i++)
            {
              op = VEC_index (tree, oprnds, i);
-             vect_is_simple_use (op, loop_vinfo, bb_vinfo, &def_stmt, &def,
-                                 &dt);
+             vect_is_simple_use (op, NULL, loop_vinfo, bb_vinfo, &def_stmt,
+                                 &def, &dt);
              vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, op);
              VEC_replace(tree, dr_chain, i, vec_oprnd);
              VEC_replace(tree, oprnds, i, vec_oprnd);
@@ -3814,12 +4077,11 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
          new_stmt = gimple_build_call_internal (IFN_STORE_LANES, 1, vec_array);
          gimple_call_set_lhs (new_stmt, data_ref);
          vect_finish_stmt_generation (stmt, new_stmt, gsi);
-         mark_symbols_for_renaming (new_stmt);
        }
       else
        {
          new_stmt = NULL;
-         if (strided_store)
+         if (grouped_store)
            {
              result_chain = VEC_alloc (tree, heap, group_size);
              /* Permute.  */
@@ -3830,7 +4092,7 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
          next_stmt = first_stmt;
          for (i = 0; i < vec_num; i++)
            {
-             struct ptr_info_def *pi;
+             unsigned align, misalign;
 
              if (i > 0)
                /* Bump the vector pointer.  */
@@ -3839,38 +4101,38 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
 
              if (slp)
                vec_oprnd = VEC_index (tree, vec_oprnds, i);
-             else if (strided_store)
-               /* For strided stores vectorized defs are interleaved in
+             else if (grouped_store)
+               /* For grouped stores vectorized defs are interleaved in
                   vect_permute_store_chain().  */
                vec_oprnd = VEC_index (tree, result_chain, i);
 
              data_ref = build2 (MEM_REF, TREE_TYPE (vec_oprnd), dataref_ptr,
                                 build_int_cst (reference_alias_ptr_type
                                                (DR_REF (first_dr)), 0));
-             pi = get_ptr_info (dataref_ptr);
-             pi->align = TYPE_ALIGN_UNIT (vectype);
+             align = TYPE_ALIGN_UNIT (vectype);
              if (aligned_access_p (first_dr))
-               pi->misalign = 0;
+               misalign = 0;
              else if (DR_MISALIGNMENT (first_dr) == -1)
                {
                  TREE_TYPE (data_ref)
                    = build_aligned_type (TREE_TYPE (data_ref),
                                          TYPE_ALIGN (elem_type));
-                 pi->align = TYPE_ALIGN_UNIT (elem_type);
-                 pi->misalign = 0;
+                 align = TYPE_ALIGN_UNIT (elem_type);
+                 misalign = 0;
                }
              else
                {
                  TREE_TYPE (data_ref)
                    = build_aligned_type (TREE_TYPE (data_ref),
                                          TYPE_ALIGN (elem_type));
-                 pi->misalign = DR_MISALIGNMENT (first_dr);
+                 misalign = DR_MISALIGNMENT (first_dr);
                }
+             set_ptr_info_alignment (get_ptr_info (dataref_ptr), align,
+                                     misalign);
 
              /* Arguments are ready.  Create the new vector stmt.  */
              new_stmt = gimple_build_assign (data_ref, vec_oprnd);
              vect_finish_stmt_generation (stmt, new_stmt, gsi);
-             mark_symbols_for_renaming (new_stmt);
 
              if (slp)
                continue;
@@ -3900,75 +4162,71 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
   return true;
 }
 
-/* Given a vector type VECTYPE returns a builtin DECL to be used
-   for vector permutation and stores a mask into *MASK that implements
-   reversal of the vector elements.  If that is impossible to do
-   returns NULL (and *MASK is unchanged).  */
+/* Given a vector type VECTYPE and permutation SEL returns
+   the VECTOR_CST mask that implements the permutation of the
+   vector elements.  If that is impossible to do, returns NULL.  */
 
-static tree
-perm_mask_for_reverse (tree vectype, tree *mask)
+tree
+vect_gen_perm_mask (tree vectype, unsigned char *sel)
 {
-  tree builtin_decl;
-  tree mask_element_type, mask_type;
-  tree mask_vec = NULL;
-  int i;
-  int nunits;
-  if (!targetm.vectorize.builtin_vec_perm)
-    return NULL;
+  tree mask_elt_type, mask_type, mask_vec, *mask_elts;
+  int i, nunits;
+
+  nunits = TYPE_VECTOR_SUBPARTS (vectype);
 
-  builtin_decl = targetm.vectorize.builtin_vec_perm (vectype,
-                                                     &mask_element_type);
-  if (!builtin_decl || !mask_element_type)
+  if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
     return NULL;
 
-  mask_type = get_vectype_for_scalar_type (mask_element_type);
+  mask_elt_type = lang_hooks.types.type_for_mode
+                   (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))), 1);
+  mask_type = get_vectype_for_scalar_type (mask_elt_type);
+
+  mask_elts = XALLOCAVEC (tree, nunits);
+  for (i = nunits - 1; i >= 0; i--)
+    mask_elts[i] = build_int_cst (mask_elt_type, sel[i]);
+  mask_vec = build_vector (mask_type, mask_elts);
+
+  return mask_vec;
+}
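+
+/* A sketch of vect_gen_perm_mask with hypothetical values: for a 4-element
+   VECTYPE and SEL = {3, 2, 1, 0}, the returned mask (when the target supports
+   the permutation) is the VECTOR_CST {3, 2, 1, 0}; a VEC_PERM_EXPR <x, y, mask>
+   selects element mask[i] from the concatenation of x and y, so
+   <x, x, {3, 2, 1, 0}> reverses x.  */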
+
+/* Given a vector type VECTYPE returns the VECTOR_CST mask that implements
+   reversal of the vector elements.  If that is impossible to do,
+   returns NULL.  */
+
+static tree
+perm_mask_for_reverse (tree vectype)
+{
+  int i, nunits;
+  unsigned char *sel;
+
   nunits = TYPE_VECTOR_SUBPARTS (vectype);
-  if (!mask_type
-      || TYPE_VECTOR_SUBPARTS (vectype) != TYPE_VECTOR_SUBPARTS (mask_type))
-    return NULL;
+  sel = XALLOCAVEC (unsigned char, nunits);
 
-  for (i = 0; i < nunits; i++)
-    mask_vec = tree_cons (NULL, build_int_cst (mask_element_type, i), mask_vec);
-  mask_vec = build_vector (mask_type, mask_vec);
+  for (i = 0; i < nunits; ++i)
+    sel[i] = nunits - 1 - i;
 
-  if (!targetm.vectorize.builtin_vec_perm_ok (vectype, mask_vec))
-    return NULL;
-  if (mask)
-    *mask = mask_vec;
-  return builtin_decl;
+  return vect_gen_perm_mask (vectype, sel);
 }
 
-/* Given a vector variable X, that was generated for the scalar LHS of
-   STMT, generate instructions to reverse the vector elements of X,
-   insert them a *GSI and return the permuted vector variable.  */
+/* Given vector variables X and Y, generated for the scalar
+   STMT, generate instructions to permute the vector elements of X and Y
+   using permutation mask MASK_VEC, insert them at *GSI and return the
+   permuted vector variable.  */
 
 static tree
-reverse_vec_elements (tree x, gimple stmt, gimple_stmt_iterator *gsi)
+permute_vec_elements (tree x, tree y, tree mask_vec, gimple stmt,
+                     gimple_stmt_iterator *gsi)
 {
   tree vectype = TREE_TYPE (x);
-  tree mask_vec, builtin_decl;
   tree perm_dest, data_ref;
   gimple perm_stmt;
 
-  builtin_decl = perm_mask_for_reverse (vectype, &mask_vec);
-
   perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
+  data_ref = make_ssa_name (perm_dest, NULL);
 
   /* Generate the permute statement.  */
-  perm_stmt = gimple_build_call (builtin_decl, 3, x, x, mask_vec);
-  if (!useless_type_conversion_p (vectype,
-                                 TREE_TYPE (TREE_TYPE (builtin_decl))))
-    {
-      tree tem = create_tmp_reg (TREE_TYPE (TREE_TYPE (builtin_decl)), NULL);
-      tem = make_ssa_name (tem, perm_stmt);
-      gimple_call_set_lhs (perm_stmt, tem);
-      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
-      perm_stmt = gimple_build_assign (NULL_TREE,
-                                      build1 (VIEW_CONVERT_EXPR,
-                                              vectype, tem));
-    }
-  data_ref = make_ssa_name (perm_dest, perm_stmt);
-  gimple_set_lhs (perm_stmt, data_ref);
+  perm_stmt = gimple_build_assign_with_ops3 (VEC_PERM_EXPR, data_ref,
+                                            x, y, mask_vec);
   vect_finish_stmt_generation (stmt, perm_stmt, gsi);
 
   return data_ref;
@@ -4013,12 +4271,11 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
   tree realignment_token = NULL_TREE;
   gimple phi = NULL;
   VEC(tree,heap) *dr_chain = NULL;
-  bool strided_load = false;
+  bool grouped_load = false;
   bool load_lanes_p = false;
   gimple first_stmt;
-  tree scalar_type;
   bool inv_p;
-  bool negative;
+  bool negative = false;
   bool compute_in_loop = false;
   struct loop *at_loop;
   int vec_num;
@@ -4028,6 +4285,11 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
   int vf;
   tree aggr_type;
+  tree gather_base = NULL_TREE, gather_off = NULL_TREE;
+  tree gather_off_vectype = NULL_TREE, gather_decl = NULL_TREE;
+  tree stride_base, stride_step;
+  int gather_scale = 1;
+  enum vect_def_type gather_dt = vect_unknown_def_type;
 
   if (loop_vinfo)
     {
@@ -4076,21 +4338,14 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
       && code != COMPONENT_REF
       && code != IMAGPART_EXPR
       && code != REALPART_EXPR
-      && code != MEM_REF)
+      && code != MEM_REF
+      && TREE_CODE_CLASS (code) != tcc_declaration)
     return false;
 
   if (!STMT_VINFO_DATA_REF (stmt_info))
     return false;
 
-  negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
-  if (negative && ncopies > 1)
-    {
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "multiple types with negative step.");
-      return false;
-    }
-
-  scalar_type = TREE_TYPE (DR_REF (dr));
+  elem_type = TREE_TYPE (vectype);
   mode = TYPE_MODE (vectype);
 
   /* FORNOW. In some cases can vectorize even if data-type not supported
@@ -4102,68 +4357,348 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
       return false;
     }
 
-  /* The vector component type needs to be trivially convertible to the
-     scalar lhs.  This should always be the case.  */
-  elem_type = TREE_TYPE (vectype);
-  if (!useless_type_conversion_p (TREE_TYPE (scalar_dest), elem_type))
+  /* Check if the load is a part of an interleaving chain.  */
+  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
     {
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "???  operands of different types");
-      return false;
+      grouped_load = true;
+      /* FORNOW */
+      gcc_assert (! nested_in_vect_loop && !STMT_VINFO_GATHER_P (stmt_info));
+
+      first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
+      if (!slp && !PURE_SLP_STMT (stmt_info))
+       {
+         group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
+         if (vect_load_lanes_supported (vectype, group_size))
+           load_lanes_p = true;
+         else if (!vect_grouped_load_supported (vectype, group_size))
+           return false;
+       }
+    }
+
+
+  if (STMT_VINFO_GATHER_P (stmt_info))
+    {
+      gimple def_stmt;
+      tree def;
+      gather_decl = vect_check_gather (stmt, loop_vinfo, &gather_base,
+                                      &gather_off, &gather_scale);
+      gcc_assert (gather_decl);
+      if (!vect_is_simple_use_1 (gather_off, NULL, loop_vinfo, bb_vinfo,
+                                &def_stmt, &def, &gather_dt,
+                                &gather_off_vectype))
+       {
+         if (vect_print_dump_info (REPORT_DETAILS))
+           fprintf (vect_dump, "gather index use not simple.");
+         return false;
+       }
+    }
+  else if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
+    {
+      if (!vect_check_strided_load (stmt, loop_vinfo,
+                                   &stride_base, &stride_step))
+       return false;
+    }
+  else
+    {
+      negative = tree_int_cst_compare (nested_in_vect_loop
+                                      ? STMT_VINFO_DR_STEP (stmt_info)
+                                      : DR_STEP (dr),
+                                      size_zero_node) < 0;
+      if (negative && ncopies > 1)
+       {
+         if (vect_print_dump_info (REPORT_DETAILS))
+           fprintf (vect_dump, "multiple types with negative step.");
+         return false;
+       }
+
+      if (negative)
+       {
+         gcc_assert (!grouped_load);
+         alignment_support_scheme = vect_supportable_dr_alignment (dr, false);
+         if (alignment_support_scheme != dr_aligned
+             && alignment_support_scheme != dr_unaligned_supported)
+           {
+             if (vect_print_dump_info (REPORT_DETAILS))
+               fprintf (vect_dump, "negative step but alignment required.");
+             return false;
+           }
+         if (!perm_mask_for_reverse (vectype))
+           {
+             if (vect_print_dump_info (REPORT_DETAILS))
+               fprintf (vect_dump, "negative step and reversing not supported.");
+             return false;
+           }
+       }
+    }
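+
+  /* A sketch of the negative-step case with a hypothetical loop:
+       for (i = n - 1; i >= 0; i--)  ... = a[i];
+     With a 4-element vectype each vector load covers a[i-3] .. a[i], and a
+     VEC_PERM_EXPR with the mask {3, 2, 1, 0} from perm_mask_for_reverse puts
+     the elements back into the scalar iteration order.  */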
+
+  if (!vec_stmt) /* transformation not required.  */
+    {
+      STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
+      vect_model_load_cost (stmt_info, ncopies, load_lanes_p, NULL, NULL, NULL);
+      return true;
+    }
+
+  if (vect_print_dump_info (REPORT_DETAILS))
+    fprintf (vect_dump, "transform load. ncopies = %d", ncopies);
+
+  /** Transform.  **/
+
+  if (STMT_VINFO_GATHER_P (stmt_info))
+    {
+      tree vec_oprnd0 = NULL_TREE, op;
+      tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gather_decl));
+      tree rettype, srctype, ptrtype, idxtype, masktype, scaletype;
+      tree ptr, mask, var, scale, perm_mask = NULL_TREE, prev_res = NULL_TREE;
+      edge pe = loop_preheader_edge (loop);
+      gimple_seq seq;
+      basic_block new_bb;
+      enum { NARROW, NONE, WIDEN } modifier;
+      int gather_off_nunits = TYPE_VECTOR_SUBPARTS (gather_off_vectype);
+
+      if (nunits == gather_off_nunits)
+       modifier = NONE;
+      else if (nunits == gather_off_nunits / 2)
+       {
+         unsigned char *sel = XALLOCAVEC (unsigned char, gather_off_nunits);
+         modifier = WIDEN;
+
+         for (i = 0; i < gather_off_nunits; ++i)
+           sel[i] = i | nunits;
+
+         perm_mask = vect_gen_perm_mask (gather_off_vectype, sel);
+         gcc_assert (perm_mask != NULL_TREE);
+       }
+      else if (nunits == gather_off_nunits * 2)
+       {
+         unsigned char *sel = XALLOCAVEC (unsigned char, nunits);
+         modifier = NARROW;
+
+         for (i = 0; i < nunits; ++i)
+           sel[i] = i < gather_off_nunits
+                    ? i : i + nunits - gather_off_nunits;
+
+         perm_mask = vect_gen_perm_mask (vectype, sel);
+         gcc_assert (perm_mask != NULL_TREE);
+         ncopies *= 2;
+       }
+      else
+       gcc_unreachable ();
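+
+      /* A sketch with hypothetical vector sizes: with a 4-element data
+         vector and an 8-element offset vector, modifier is WIDEN and
+         sel = {4, 5, 6, 7, 4, 5, 6, 7}, so odd copies reuse the high half of
+         the offset vector; with an 8-element data vector and a 4-element
+         offset vector, modifier is NARROW, ncopies is doubled, and
+         sel = {0, 1, 2, 3, 8, 9, 10, 11} is later used to combine two
+         intermediate gather results into one data vector.  */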
+
+      rettype = TREE_TYPE (TREE_TYPE (gather_decl));
+      srctype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
+      ptrtype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
+      idxtype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
+      masktype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
+      scaletype = TREE_VALUE (arglist);
+      gcc_checking_assert (types_compatible_p (srctype, rettype)
+                          && types_compatible_p (srctype, masktype));
+
+      vec_dest = vect_create_destination_var (scalar_dest, vectype);
+
+      ptr = fold_convert (ptrtype, gather_base);
+      if (!is_gimple_min_invariant (ptr))
+       {
+         ptr = force_gimple_operand (ptr, &seq, true, NULL_TREE);
+         new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
+         gcc_assert (!new_bb);
+       }
+
+      /* Currently we support only unconditional gather loads,
+        so mask should be all ones.  */
+      if (TREE_CODE (TREE_TYPE (masktype)) == INTEGER_TYPE)
+       mask = build_int_cst (TREE_TYPE (masktype), -1);
+      else if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (masktype)))
+       {
+         REAL_VALUE_TYPE r;
+         long tmp[6];
+         for (j = 0; j < 6; ++j)
+           tmp[j] = -1;
+         real_from_target (&r, tmp, TYPE_MODE (TREE_TYPE (masktype)));
+         mask = build_real (TREE_TYPE (masktype), r);
+       }
+      else
+       gcc_unreachable ();
+      mask = build_vector_from_val (masktype, mask);
+      mask = vect_init_vector (stmt, mask, masktype, NULL);
+
+      scale = build_int_cst (scaletype, gather_scale);
+
+      prev_stmt_info = NULL;
+      for (j = 0; j < ncopies; ++j)
+       {
+         if (modifier == WIDEN && (j & 1))
+           op = permute_vec_elements (vec_oprnd0, vec_oprnd0,
+                                      perm_mask, stmt, gsi);
+         else if (j == 0)
+           op = vec_oprnd0
+             = vect_get_vec_def_for_operand (gather_off, stmt, NULL);
+         else
+           op = vec_oprnd0
+             = vect_get_vec_def_for_stmt_copy (gather_dt, vec_oprnd0);
+
+         if (!useless_type_conversion_p (idxtype, TREE_TYPE (op)))
+           {
+             gcc_assert (TYPE_VECTOR_SUBPARTS (TREE_TYPE (op))
+                         == TYPE_VECTOR_SUBPARTS (idxtype));
+             var = vect_get_new_vect_var (idxtype, vect_simple_var, NULL);
+             var = make_ssa_name (var, NULL);
+             op = build1 (VIEW_CONVERT_EXPR, idxtype, op);
+             new_stmt
+               = gimple_build_assign_with_ops (VIEW_CONVERT_EXPR, var,
+                                               op, NULL_TREE);
+             vect_finish_stmt_generation (stmt, new_stmt, gsi);
+             op = var;
+           }
+
+         new_stmt
+           = gimple_build_call (gather_decl, 5, mask, ptr, op, mask, scale);
+
+         if (!useless_type_conversion_p (vectype, rettype))
+           {
+             gcc_assert (TYPE_VECTOR_SUBPARTS (vectype)
+                         == TYPE_VECTOR_SUBPARTS (rettype));
+             var = vect_get_new_vect_var (rettype, vect_simple_var, NULL);
+             op = make_ssa_name (var, new_stmt);
+             gimple_call_set_lhs (new_stmt, op);
+             vect_finish_stmt_generation (stmt, new_stmt, gsi);
+             var = make_ssa_name (vec_dest, NULL);
+             op = build1 (VIEW_CONVERT_EXPR, vectype, op);
+             new_stmt
+               = gimple_build_assign_with_ops (VIEW_CONVERT_EXPR, var, op,
+                                               NULL_TREE);
+           }
+         else
+           {
+             var = make_ssa_name (vec_dest, new_stmt);
+             gimple_call_set_lhs (new_stmt, var);
+           }
+
+         vect_finish_stmt_generation (stmt, new_stmt, gsi);
+
+         if (modifier == NARROW)
+           {
+             if ((j & 1) == 0)
+               {
+                 prev_res = var;
+                 continue;
+               }
+             var = permute_vec_elements (prev_res, var,
+                                         perm_mask, stmt, gsi);
+             new_stmt = SSA_NAME_DEF_STMT (var);
+           }
+
+         if (prev_stmt_info == NULL)
+           STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
+         else
+           STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
+         prev_stmt_info = vinfo_for_stmt (new_stmt);
+       }
+      return true;
     }
-
-  /* Check if the load is a part of an interleaving chain.  */
-  if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
+  else if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
     {
-      strided_load = true;
-      /* FORNOW */
-      gcc_assert (! nested_in_vect_loop);
+      gimple_stmt_iterator incr_gsi;
+      bool insert_after;
+      gimple incr;
+      tree offvar;
+      tree ref = DR_REF (dr);
+      tree ivstep;
+      tree running_off;
+      VEC(constructor_elt, gc) *v = NULL;
+      gimple_seq stmts = NULL;
 
-      first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
-      if (!slp && !PURE_SLP_STMT (stmt_info))
-       {
-         group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
-         if (vect_load_lanes_supported (vectype, group_size))
-           load_lanes_p = true;
-         else if (!vect_strided_load_supported (vectype, group_size))
-           return false;
-       }
-    }
+      gcc_assert (stride_base && stride_step);
 
-  if (negative)
-    {
-      gcc_assert (!strided_load);
-      alignment_support_scheme = vect_supportable_dr_alignment (dr, false);
-      if (alignment_support_scheme != dr_aligned
-         && alignment_support_scheme != dr_unaligned_supported)
-       {
-         if (vect_print_dump_info (REPORT_DETAILS))
-           fprintf (vect_dump, "negative step but alignment required.");
-         return false;
-       }
-      if (!perm_mask_for_reverse (vectype, NULL))
+      /* For a load with loop-invariant (but other than power-of-2)
+         stride (i.e. not a grouped access) like so:
+
+          for (i = 0; i < n; i += stride)
+            ... = array[i];
+
+        we generate a new induction variable and new accesses to
+        form a new vector (or vectors, depending on ncopies):
+
+          for (j = 0; ; j += VF*stride)
+            tmp1 = array[j];
+            tmp2 = array[j + stride];
+            ...
+            vectemp = {tmp1, tmp2, ...}
+         */
+
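+      /* As an illustration only (assuming nunits == 4 and ncopies == 1),
+        the code below emits per vector iteration roughly:
+
+          tmp1 = array[off];  off = off + stride;
+          tmp2 = array[off];  off = off + stride;
+          tmp3 = array[off];  off = off + stride;
+          tmp4 = array[off];  off = off + stride;
+          vectemp = {tmp1, tmp2, tmp3, tmp4};
+
+        where OFF is the induction variable created below, advancing by
+        VF*stride per iteration of the vectorized loop.  */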
+      ivstep = stride_step;
+      ivstep = fold_build2 (MULT_EXPR, TREE_TYPE (ivstep), ivstep,
+                           build_int_cst (TREE_TYPE (ivstep), vf));
+
+      standard_iv_increment_position (loop, &incr_gsi, &insert_after);
+
+      create_iv (stride_base, ivstep, NULL,
+                loop, &incr_gsi, insert_after,
+                &offvar, NULL);
+      incr = gsi_stmt (incr_gsi);
+      set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
+
+      stride_step = force_gimple_operand (stride_step, &stmts, true, NULL_TREE);
+      if (stmts)
+       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
+
+      prev_stmt_info = NULL;
+      running_off = offvar;
+      for (j = 0; j < ncopies; j++)
        {
-         if (vect_print_dump_info (REPORT_DETAILS))
-           fprintf (vect_dump, "negative step and reversing not supported.");
-         return false;
-       }
-    }
+         tree vec_inv;
 
-  if (!vec_stmt) /* transformation not required.  */
-    {
-      STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
-      vect_model_load_cost (stmt_info, ncopies, load_lanes_p, NULL);
-      return true;
-    }
+         v = VEC_alloc (constructor_elt, gc, nunits);
+         for (i = 0; i < nunits; i++)
+           {
+             tree newref, newoff;
+             gimple incr;
+             if (TREE_CODE (ref) == ARRAY_REF)
+               newref = build4 (ARRAY_REF, TREE_TYPE (ref),
+                                unshare_expr (TREE_OPERAND (ref, 0)),
+                                running_off,
+                                NULL_TREE, NULL_TREE);
+             else
+               newref = build2 (MEM_REF, TREE_TYPE (ref),
+                                running_off,
+                                TREE_OPERAND (ref, 1));
+
+             newref = force_gimple_operand_gsi (gsi, newref, true,
+                                                NULL_TREE, true,
+                                                GSI_SAME_STMT);
+             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, newref);
+             newoff = copy_ssa_name (running_off, NULL);
+             if (POINTER_TYPE_P (TREE_TYPE (newoff)))
+               incr = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, newoff,
+                                                    running_off, stride_step);
+             else
+               incr = gimple_build_assign_with_ops (PLUS_EXPR, newoff,
+                                                    running_off, stride_step);
+             vect_finish_stmt_generation (stmt, incr, gsi);
 
-  if (vect_print_dump_info (REPORT_DETAILS))
-    fprintf (vect_dump, "transform load. ncopies = %d", ncopies);
+             running_off = newoff;
+           }
 
-  /** Transform.  **/
+         vec_inv = build_constructor (vectype, v);
+         new_temp = vect_init_vector (stmt, vec_inv, vectype, gsi);
+         new_stmt = SSA_NAME_DEF_STMT (new_temp);
+
+         if (j == 0)
+           STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
+         else
+           STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
+         prev_stmt_info = vinfo_for_stmt (new_stmt);
+       }
+      return true;
+    }
 
-  if (strided_load)
+  if (grouped_load)
     {
       first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
+      if (slp
+          && !SLP_INSTANCE_LOAD_PERMUTATION (slp_node_instance)
+         && first_stmt != VEC_index (gimple, SLP_TREE_SCALAR_STMTS (slp_node), 0))
+        first_stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (slp_node), 0);
+
       /* Check if the chain of loads is already vectorized.  */
       if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
        {
@@ -4176,7 +4711,7 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
       /* VEC_NUM is the number of vect stmts to be created for this group.  */
       if (slp)
        {
-         strided_load = false;
+         grouped_load = false;
          vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
           if (SLP_INSTANCE_LOAD_PERMUTATION (slp_node_instance))
             slp_perm = true;
@@ -4234,7 +4769,7 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
      information we recorded in RELATED_STMT field is used to vectorize
      stmt S2.  */
 
-  /* In case of interleaving (non-unit strided access):
+  /* In case of interleaving (non-unit grouped access):
 
      S1:  x2 = &base + 2
      S2:  x0 = &base
@@ -4251,8 +4786,8 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
 
      Then permutation statements are generated:
 
-     VS5: vx5 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
-     VS6: vx6 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
+     VS5: vx5 = VEC_PERM_EXPR < vx0, vx1, { 0, 2, ..., i*2 } >
+     VS6: vx6 = VEC_PERM_EXPR < vx0, vx1, { 1, 3, ..., i*2+1 } >
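+     (e.g. with 4-element vectors the two selection masks above would be
+     { 0, 2, 4, 6 } and { 1, 3, 5, 7 })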
        ...
 
      And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
@@ -4260,7 +4795,7 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
      corresponds to the order of scalar stmts in the interleaving chain - see
      the documentation of vect_permute_load_chain()).
      The generation of permutation stmts and recording them in
-     STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
+     STMT_VINFO_VEC_STMT is done in vect_transform_grouped_load().
 
      In case of both multiple types and interleaving, the vector loads and
      permutation stmts above are created for every copy.  The result vector
@@ -4302,7 +4837,7 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
      This can only occur when vectorizing memory accesses in the inner-loop
      nested within an outer-loop that is being vectorized.  */
 
-  if (loop && nested_in_vect_loop_p (loop, stmt)
+  if (nested_in_vect_loop
       && (TREE_INT_CST_LOW (DR_STEP (dr))
          % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
     {
@@ -4346,7 +4881,7 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
         dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
                                       TYPE_SIZE_UNIT (aggr_type));
 
-      if (strided_load || slp_perm)
+      if (grouped_load || slp_perm)
        dr_chain = VEC_alloc (tree, heap, vec_num);
 
       if (load_lanes_p)
@@ -4361,7 +4896,6 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
          new_stmt = gimple_build_call_internal (IFN_LOAD_LANES, 1, data_ref);
          gimple_call_set_lhs (new_stmt, vec_array);
          vect_finish_stmt_generation (stmt, new_stmt, gsi);
-         mark_symbols_for_renaming (new_stmt);
 
          /* Extract each vector into an SSA_NAME.  */
          for (i = 0; i < vec_num; i++)
@@ -4372,7 +4906,7 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
            }
 
          /* Record the mapping between SSA_NAMEs and statements.  */
-         vect_record_strided_load_vectors (stmt, dr_chain);
+         vect_record_grouped_load_vectors (stmt, dr_chain);
        }
       else
        {
@@ -4388,33 +4922,35 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
                case dr_aligned:
                case dr_unaligned_supported:
                  {
-                   struct ptr_info_def *pi;
+                   unsigned int align, misalign;
+
                    data_ref
                      = build2 (MEM_REF, vectype, dataref_ptr,
                                build_int_cst (reference_alias_ptr_type
                                               (DR_REF (first_dr)), 0));
-                   pi = get_ptr_info (dataref_ptr);
-                   pi->align = TYPE_ALIGN_UNIT (vectype);
+                   align = TYPE_ALIGN_UNIT (vectype);
                    if (alignment_support_scheme == dr_aligned)
                      {
                        gcc_assert (aligned_access_p (first_dr));
-                       pi->misalign = 0;
+                       misalign = 0;
                      }
                    else if (DR_MISALIGNMENT (first_dr) == -1)
                      {
                        TREE_TYPE (data_ref)
                          = build_aligned_type (TREE_TYPE (data_ref),
                                                TYPE_ALIGN (elem_type));
-                       pi->align = TYPE_ALIGN_UNIT (elem_type);
-                       pi->misalign = 0;
+                       align = TYPE_ALIGN_UNIT (elem_type);
+                       misalign = 0;
                      }
                    else
                      {
                        TREE_TYPE (data_ref)
                          = build_aligned_type (TREE_TYPE (data_ref),
                                                TYPE_ALIGN (elem_type));
-                       pi->misalign = DR_MISALIGNMENT (first_dr);
+                       misalign = DR_MISALIGNMENT (first_dr);
                      }
+                   set_ptr_info_alignment (get_ptr_info (dataref_ptr),
+                                           align, misalign);
                    break;
                  }
                case dr_explicit_realign:
@@ -4430,13 +4966,12 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
                                                    dr_explicit_realign,
                                                    dataref_ptr, NULL);
 
+                   ptr = copy_ssa_name (dataref_ptr, NULL);
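+                   /* Mask off the low address bits (e.g. & -16 for a
+                      vector type with 16-byte alignment) to form the
+                      aligned load address.  */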
                    new_stmt = gimple_build_assign_with_ops
-                                (BIT_AND_EXPR, NULL_TREE, dataref_ptr,
+                                (BIT_AND_EXPR, ptr, dataref_ptr,
                                  build_int_cst
                                  (TREE_TYPE (dataref_ptr),
                                   -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
-                   ptr = make_ssa_name (SSA_NAME_VAR (dataref_ptr), new_stmt);
-                   gimple_assign_set_lhs (new_stmt, ptr);
                    vect_finish_stmt_generation (stmt, new_stmt, gsi);
                    data_ref
                      = build2 (MEM_REF, vectype, ptr,
@@ -4453,14 +4988,14 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
                    msq = new_temp;
 
                    bump = size_binop (MULT_EXPR, vs_minus_1,
-                                      TYPE_SIZE_UNIT (scalar_type));
+                                      TYPE_SIZE_UNIT (elem_type));
                    ptr = bump_vector_ptr (dataref_ptr, NULL, gsi, stmt, bump);
                    new_stmt = gimple_build_assign_with_ops
                                 (BIT_AND_EXPR, NULL_TREE, ptr,
                                  build_int_cst
                                  (TREE_TYPE (ptr),
                                   -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
-                   ptr = make_ssa_name (SSA_NAME_VAR (dataref_ptr), new_stmt);
+                   ptr = copy_ssa_name (dataref_ptr, new_stmt);
                    gimple_assign_set_lhs (new_stmt, ptr);
                    vect_finish_stmt_generation (stmt, new_stmt, gsi);
                    data_ref
@@ -4470,14 +5005,12 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
                    break;
                  }
                case dr_explicit_realign_optimized:
+                 new_temp = copy_ssa_name (dataref_ptr, NULL);
                  new_stmt = gimple_build_assign_with_ops
-                              (BIT_AND_EXPR, NULL_TREE, dataref_ptr,
+                              (BIT_AND_EXPR, new_temp, dataref_ptr,
                                build_int_cst
                                  (TREE_TYPE (dataref_ptr),
                                   -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
-                 new_temp = make_ssa_name (SSA_NAME_VAR (dataref_ptr),
-                                           new_stmt);
-                 gimple_assign_set_lhs (new_stmt, new_temp);
                  vect_finish_stmt_generation (stmt, new_stmt, gsi);
                  data_ref
                    = build2 (MEM_REF, vectype, new_temp,
@@ -4492,7 +5025,6 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
              new_temp = make_ssa_name (vec_dest, new_stmt);
              gimple_assign_set_lhs (new_stmt, new_temp);
              vect_finish_stmt_generation (stmt, new_stmt, gsi);
-             mark_symbols_for_renaming (new_stmt);
 
              /* 3. Handle explicit realignment if necessary/supported.
                 Create in loop:
@@ -4526,46 +5058,25 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
              /* 4. Handle invariant-load.  */
              if (inv_p && !bb_vinfo)
                {
-                 gcc_assert (!strided_load);
-                 gcc_assert (nested_in_vect_loop_p (loop, stmt));
-                 if (j == 0)
-                   {
-                     int k;
-                     tree t = NULL_TREE;
-                     tree vec_inv, bitpos, bitsize = TYPE_SIZE (scalar_type);
-
-                     /* CHECKME: bitpos depends on endianess?  */
-                     bitpos = bitsize_zero_node;
-                     vec_inv = build3 (BIT_FIELD_REF, scalar_type, new_temp,
-                                       bitsize, bitpos);
-                     vec_dest = vect_create_destination_var (scalar_dest,
-                                                             NULL_TREE);
-                     new_stmt = gimple_build_assign (vec_dest, vec_inv);
-                     new_temp = make_ssa_name (vec_dest, new_stmt);
-                     gimple_assign_set_lhs (new_stmt, new_temp);
-                     vect_finish_stmt_generation (stmt, new_stmt, gsi);
-
-                     for (k = nunits - 1; k >= 0; --k)
-                       t = tree_cons (NULL_TREE, new_temp, t);
-                     /* FIXME: use build_constructor directly.  */
-                     vec_inv = build_constructor_from_list (vectype, t);
-                     new_temp = vect_init_vector (stmt, vec_inv,
-                                                  vectype, gsi);
-                     new_stmt = SSA_NAME_DEF_STMT (new_temp);
-                   }
-                 else
-                   gcc_unreachable (); /* FORNOW. */
+                 gimple_stmt_iterator gsi2 = *gsi;
+                 gcc_assert (!grouped_load);
+                 gsi_next (&gsi2);
+                 new_temp = vect_init_vector (stmt, scalar_dest,
+                                              vectype, &gsi2);
+                 new_stmt = SSA_NAME_DEF_STMT (new_temp);
                }
 
              if (negative)
                {
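+                 /* Reverse the vector elements with a VEC_PERM_EXPR whose
+                    mask is { nunits-1, ..., 1, 0 }, e.g. { 3, 2, 1, 0 }
+                    for a 4-element vector.  */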
-                 new_temp = reverse_vec_elements (new_temp, stmt, gsi);
+                 tree perm_mask = perm_mask_for_reverse (vectype);
+                 new_temp = permute_vec_elements (new_temp, new_temp,
+                                                  perm_mask, stmt, gsi);
                  new_stmt = SSA_NAME_DEF_STMT (new_temp);
                }
 
              /* Collect vector loads and later create their permutation in
-                vect_transform_strided_load ().  */
-             if (strided_load || slp_perm)
+                vect_transform_grouped_load ().  */
+             if (grouped_load || slp_perm)
                VEC_quick_push (tree, dr_chain, new_temp);
 
              /* Store vector loads in the corresponding SLP_NODE.  */
@@ -4589,10 +5100,10 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
         }
       else
         {
-          if (strided_load)
+          if (grouped_load)
            {
              if (!load_lanes_p)
-               vect_transform_strided_load (stmt, dr_chain, group_size, gsi);
+               vect_transform_grouped_load (stmt, dr_chain, group_size, gsi);
              *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
            }
           else
@@ -4617,15 +5128,20 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
    LOOP - the loop that is being vectorized.
    COND - Condition that is checked for simple use.
 
+   Output:
+   *COMP_VECTYPE - the vector type for the comparison.
+
    Returns whether a COND can be vectorized.  Checks whether
   condition operands are supportable using vect_is_simple_use.  */
 
 static bool
-vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
+vect_is_simple_cond (tree cond, gimple stmt, loop_vec_info loop_vinfo,
+                    bb_vec_info bb_vinfo, tree *comp_vectype)
 {
   tree lhs, rhs;
   tree def;
   enum vect_def_type dt;
+  tree vectype1 = NULL_TREE, vectype2 = NULL_TREE;
 
   if (!COMPARISON_CLASS_P (cond))
     return false;
@@ -4636,8 +5152,8 @@ vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
   if (TREE_CODE (lhs) == SSA_NAME)
     {
       gimple lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
-      if (!vect_is_simple_use (lhs, loop_vinfo, NULL, &lhs_def_stmt, &def,
-                               &dt))
+      if (!vect_is_simple_use_1 (lhs, stmt, loop_vinfo, bb_vinfo,
+                                &lhs_def_stmt, &def, &dt, &vectype1))
        return false;
     }
   else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST
@@ -4647,14 +5163,15 @@ vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
   if (TREE_CODE (rhs) == SSA_NAME)
     {
       gimple rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
-      if (!vect_is_simple_use (rhs, loop_vinfo, NULL, &rhs_def_stmt, &def,
-                               &dt))
+      if (!vect_is_simple_use_1 (rhs, stmt, loop_vinfo, bb_vinfo,
+                                &rhs_def_stmt, &def, &dt, &vectype2))
        return false;
     }
-  else if (TREE_CODE (rhs) != INTEGER_CST  && TREE_CODE (rhs) != REAL_CST
+  else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST
           && TREE_CODE (rhs) != FIXED_CST)
     return false;
 
+  *comp_vectype = vectype1 ? vectype1 : vectype2;
   return true;
 }
 
@@ -4673,40 +5190,44 @@ vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
 
 bool
 vectorizable_condition (gimple stmt, gimple_stmt_iterator *gsi,
-                       gimple *vec_stmt, tree reduc_def, int reduc_index)
+                       gimple *vec_stmt, tree reduc_def, int reduc_index,
+                       slp_tree slp_node)
 {
   tree scalar_dest = NULL_TREE;
   tree vec_dest = NULL_TREE;
-  tree op = NULL_TREE;
   tree cond_expr, then_clause, else_clause;
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+  tree comp_vectype = NULL_TREE;
   tree vec_cond_lhs = NULL_TREE, vec_cond_rhs = NULL_TREE;
   tree vec_then_clause = NULL_TREE, vec_else_clause = NULL_TREE;
   tree vec_compare, vec_cond_expr;
   tree new_temp;
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
-  enum machine_mode vec_mode;
   tree def;
   enum vect_def_type dt, dts[4];
   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
-  int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
+  int ncopies;
   enum tree_code code;
   stmt_vec_info prev_stmt_info = NULL;
-  int j;
-
-  /* FORNOW: unsupported in basic block SLP.  */
-  gcc_assert (loop_vinfo);
+  int i, j;
+  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
+  VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
+  VEC (tree, heap) *vec_oprnds2 = NULL, *vec_oprnds3 = NULL;
 
-  /* FORNOW: SLP not supported.  */
-  if (STMT_SLP_TYPE (stmt_info))
-    return false;
+  if (slp_node || PURE_SLP_STMT (stmt_info))
+    ncopies = 1;
+  else
+    ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
 
   gcc_assert (ncopies >= 1);
   if (reduc_index && ncopies > 1)
     return false; /* FORNOW */
 
-  if (!STMT_VINFO_RELEVANT_P (stmt_info))
+  if (reduc_index && STMT_SLP_TYPE (stmt_info))
+    return false;
+
+  if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
     return false;
 
   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
@@ -4731,25 +5252,19 @@ vectorizable_condition (gimple stmt, gimple_stmt_iterator *gsi,
   if (code != COND_EXPR)
     return false;
 
-  gcc_assert (gimple_assign_single_p (stmt));
-  op = gimple_assign_rhs1 (stmt);
-  cond_expr = TREE_OPERAND (op, 0);
-  then_clause = TREE_OPERAND (op, 1);
-  else_clause = TREE_OPERAND (op, 2);
-
-  if (!vect_is_simple_cond (cond_expr, loop_vinfo))
-    return false;
+  cond_expr = gimple_assign_rhs1 (stmt);
+  then_clause = gimple_assign_rhs2 (stmt);
+  else_clause = gimple_assign_rhs3 (stmt);
 
-  /* We do not handle two different vector types for the condition
-     and the values.  */
-  if (!types_compatible_p (TREE_TYPE (TREE_OPERAND (cond_expr, 0)),
-                          TREE_TYPE (vectype)))
+  if (!vect_is_simple_cond (cond_expr, stmt, loop_vinfo, bb_vinfo,
+                           &comp_vectype)
+      || !comp_vectype)
     return false;
 
   if (TREE_CODE (then_clause) == SSA_NAME)
     {
       gimple then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
-      if (!vect_is_simple_use (then_clause, loop_vinfo, NULL,
+      if (!vect_is_simple_use (then_clause, stmt, loop_vinfo, bb_vinfo,
                               &then_def_stmt, &def, &dt))
        return false;
     }
@@ -4761,7 +5276,7 @@ vectorizable_condition (gimple stmt, gimple_stmt_iterator *gsi,
   if (TREE_CODE (else_clause) == SSA_NAME)
     {
       gimple else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
-      if (!vect_is_simple_use (else_clause, loop_vinfo, NULL,
+      if (!vect_is_simple_use (else_clause, stmt, loop_vinfo, bb_vinfo,
                               &else_def_stmt, &def, &dt))
        return false;
     }
@@ -4770,16 +5285,21 @@ vectorizable_condition (gimple stmt, gimple_stmt_iterator *gsi,
           && TREE_CODE (else_clause) != FIXED_CST)
     return false;
 
-
-  vec_mode = TYPE_MODE (vectype);
-
   if (!vec_stmt)
     {
       STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
-      return expand_vec_cond_expr_p (TREE_TYPE (op), vec_mode);
+      return expand_vec_cond_expr_p (vectype, comp_vectype);
     }
 
-  /* Transform */
+  /* Transform.  */
+
+  if (!slp_node)
+    {
+      vec_oprnds0 = VEC_alloc (tree, heap, 1);
+      vec_oprnds1 = VEC_alloc (tree, heap, 1);
+      vec_oprnds2 = VEC_alloc (tree, heap, 1);
+      vec_oprnds3 = VEC_alloc (tree, heap, 1);
+    }
 
   /* Handle def.  */
   scalar_dest = gimple_assign_lhs (stmt);
@@ -4788,67 +5308,118 @@ vectorizable_condition (gimple stmt, gimple_stmt_iterator *gsi,
   /* Handle cond expr.  */
   for (j = 0; j < ncopies; j++)
     {
-      gimple new_stmt;
+      gimple new_stmt = NULL;
       if (j == 0)
        {
-         gimple gtemp;
-         vec_cond_lhs =
+          if (slp_node)
+            {
+              VEC (tree, heap) *ops = VEC_alloc (tree, heap, 4);
+              VEC (slp_void_p, heap) *vec_defs;
+
+             vec_defs = VEC_alloc (slp_void_p, heap, 4);
+              VEC_safe_push (tree, heap, ops, TREE_OPERAND (cond_expr, 0));
+              VEC_safe_push (tree, heap, ops, TREE_OPERAND (cond_expr, 1));
+              VEC_safe_push (tree, heap, ops, then_clause);
+              VEC_safe_push (tree, heap, ops, else_clause);
+              vect_get_slp_defs (ops, slp_node, &vec_defs, -1);
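+              /* vect_get_slp_defs returns the defs in the same order as
+                 OPS, so pop them here last-to-first.  */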
+              vec_oprnds3 = (VEC (tree, heap) *) VEC_pop (slp_void_p, vec_defs);
+              vec_oprnds2 = (VEC (tree, heap) *) VEC_pop (slp_void_p, vec_defs);
+              vec_oprnds1 = (VEC (tree, heap) *) VEC_pop (slp_void_p, vec_defs);
+              vec_oprnds0 = (VEC (tree, heap) *) VEC_pop (slp_void_p, vec_defs);
+
+              VEC_free (tree, heap, ops);
+              VEC_free (slp_void_p, heap, vec_defs);
+            }
+          else
+            {
+             gimple gtemp;
+             vec_cond_lhs =
              vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0),
                                            stmt, NULL);
-         vect_is_simple_use (TREE_OPERAND (cond_expr, 0), loop_vinfo,
-                             NULL, &gtemp, &def, &dts[0]);
-         vec_cond_rhs =
-             vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1),
-                                           stmt, NULL);
-         vect_is_simple_use (TREE_OPERAND (cond_expr, 1), loop_vinfo,
-                             NULL, &gtemp, &def, &dts[1]);
-         if (reduc_index == 1)
-           vec_then_clause = reduc_def;
-         else
-           {
-             vec_then_clause = vect_get_vec_def_for_operand (then_clause,
-                                                             stmt, NULL);
-             vect_is_simple_use (then_clause, loop_vinfo,
-                                 NULL, &gtemp, &def, &dts[2]);
-           }
-         if (reduc_index == 2)
-           vec_else_clause = reduc_def;
-         else
-           {
-             vec_else_clause = vect_get_vec_def_for_operand (else_clause,
+             vect_is_simple_use (TREE_OPERAND (cond_expr, 0), stmt,
+                                 loop_vinfo, NULL, &gtemp, &def, &dts[0]);
+
+             vec_cond_rhs =
+               vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1),
+                                               stmt, NULL);
+             vect_is_simple_use (TREE_OPERAND (cond_expr, 1), stmt,
+                                 loop_vinfo, NULL, &gtemp, &def, &dts[1]);
+             if (reduc_index == 1)
+               vec_then_clause = reduc_def;
+             else
+               {
+                 vec_then_clause = vect_get_vec_def_for_operand (then_clause,
+                                                             stmt, NULL);
+                 vect_is_simple_use (then_clause, stmt, loop_vinfo,
+                                         NULL, &gtemp, &def, &dts[2]);
+               }
+             if (reduc_index == 2)
+               vec_else_clause = reduc_def;
+             else
+               {
+                 vec_else_clause = vect_get_vec_def_for_operand (else_clause,
                                                              stmt, NULL);
-             vect_is_simple_use (else_clause, loop_vinfo,
+                 vect_is_simple_use (else_clause, stmt, loop_vinfo,
                                  NULL, &gtemp, &def, &dts[3]);
+               }
            }
        }
       else
        {
-         vec_cond_lhs = vect_get_vec_def_for_stmt_copy (dts[0], vec_cond_lhs);
-         vec_cond_rhs = vect_get_vec_def_for_stmt_copy (dts[1], vec_cond_rhs);
+         vec_cond_lhs = vect_get_vec_def_for_stmt_copy (dts[0],
+                                               VEC_pop (tree, vec_oprnds0));
+         vec_cond_rhs = vect_get_vec_def_for_stmt_copy (dts[1],
+                                               VEC_pop (tree, vec_oprnds1));
          vec_then_clause = vect_get_vec_def_for_stmt_copy (dts[2],
-                                                           vec_then_clause);
+                                               VEC_pop (tree, vec_oprnds2));
          vec_else_clause = vect_get_vec_def_for_stmt_copy (dts[3],
-                                                           vec_else_clause);
+                                               VEC_pop (tree, vec_oprnds3));
+       }
+
+      if (!slp_node)
+        {
+         VEC_quick_push (tree, vec_oprnds0, vec_cond_lhs);
+         VEC_quick_push (tree, vec_oprnds1, vec_cond_rhs);
+         VEC_quick_push (tree, vec_oprnds2, vec_then_clause);
+         VEC_quick_push (tree, vec_oprnds3, vec_else_clause);
        }
 
       /* Arguments are ready.  Create the new vector stmt.  */
-      vec_compare = build2 (TREE_CODE (cond_expr), vectype,
-                           vec_cond_lhs, vec_cond_rhs);
-      vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
-                             vec_compare, vec_then_clause, vec_else_clause);
+      FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, vec_cond_lhs)
+        {
+          vec_cond_rhs = VEC_index (tree, vec_oprnds1, i);
+          vec_then_clause = VEC_index (tree, vec_oprnds2, i);
+          vec_else_clause = VEC_index (tree, vec_oprnds3, i);
 
-      new_stmt = gimple_build_assign (vec_dest, vec_cond_expr);
-      new_temp = make_ssa_name (vec_dest, new_stmt);
-      gimple_assign_set_lhs (new_stmt, new_temp);
-      vect_finish_stmt_generation (stmt, new_stmt, gsi);
-      if (j == 0)
-        STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
-      else
-        STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
+          vec_compare = build2 (TREE_CODE (cond_expr), vectype,
+                              vec_cond_lhs, vec_cond_rhs);
+          vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
+                        vec_compare, vec_then_clause, vec_else_clause);
 
-      prev_stmt_info = vinfo_for_stmt (new_stmt);
+          new_stmt = gimple_build_assign (vec_dest, vec_cond_expr);
+          new_temp = make_ssa_name (vec_dest, new_stmt);
+          gimple_assign_set_lhs (new_stmt, new_temp);
+          vect_finish_stmt_generation (stmt, new_stmt, gsi);
+          if (slp_node)
+            VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
+        }
+
+      if (slp_node)
+        continue;
+
+      if (j == 0)
+        STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
+      else
+        STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
+
+      prev_stmt_info = vinfo_for_stmt (new_stmt);
     }
 
+  VEC_free (tree, heap, vec_oprnds0);
+  VEC_free (tree, heap, vec_oprnds1);
+  VEC_free (tree, heap, vec_oprnds2);
+  VEC_free (tree, heap, vec_oprnds3);
+
   return true;
 }
 
@@ -4863,6 +5434,8 @@ vect_analyze_stmt (gimple stmt, bool *need_to_vectorize, slp_tree node)
   enum vect_relevant relevance = STMT_VINFO_RELEVANT (stmt_info);
   bool ok;
   tree scalar_type, vectype;
+  gimple pattern_stmt;
+  gimple_seq pattern_def_seq;
 
   if (vect_print_dump_info (REPORT_DETAILS))
     {
@@ -4884,16 +5457,24 @@ vect_analyze_stmt (gimple stmt, bool *need_to_vectorize, slp_tree node)
      - any LABEL_EXPRs in the loop
      - computations that are used only for array indexing or loop control.
      In basic blocks we only analyze statements that are a part of some SLP
-     instance, therefore, all the statements are relevant.  */
+     instance, therefore, all the statements are relevant.
 
+     A pattern statement needs to be analyzed instead of the original
+     statement if the original statement is not relevant.  Otherwise, we
+     analyze both statements.  In basic blocks we are called from some SLP
+     instance traversal; there we don't analyze pattern stmts instead of the
+     originals, since the pattern stmts will already be part of the SLP
+     instance.  */
+
+  pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
   if (!STMT_VINFO_RELEVANT_P (stmt_info)
       && !STMT_VINFO_LIVE_P (stmt_info))
     {
-      gimple pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
       if (STMT_VINFO_IN_PATTERN_P (stmt_info)
+          && pattern_stmt
           && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
               || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
         {
+          /* Analyze PATTERN_STMT instead of the original stmt.  */
           stmt = pattern_stmt;
           stmt_info = vinfo_for_stmt (pattern_stmt);
           if (vect_print_dump_info (REPORT_DETAILS))
@@ -4910,6 +5491,48 @@ vect_analyze_stmt (gimple stmt, bool *need_to_vectorize, slp_tree node)
           return true;
         }
     }
+  else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
+          && node == NULL
+           && pattern_stmt
+           && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
+               || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
+    {
+      /* Analyze PATTERN_STMT too.  */
+      if (vect_print_dump_info (REPORT_DETAILS))
+        {
+          fprintf (vect_dump, "==> examining pattern statement: ");
+          print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+        }
+
+      if (!vect_analyze_stmt (pattern_stmt, need_to_vectorize, node))
+        return false;
+   }
+
+  if (is_pattern_stmt_p (stmt_info)
+      && node == NULL
+      && (pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info)))
+    {
+      gimple_stmt_iterator si;
+
+      for (si = gsi_start (pattern_def_seq); !gsi_end_p (si); gsi_next (&si))
+       {
+         gimple pattern_def_stmt = gsi_stmt (si);
+         if (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_def_stmt))
+             || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_def_stmt)))
+           {
+             /* Analyze def stmt of STMT if it's a pattern stmt.  */
+             if (vect_print_dump_info (REPORT_DETAILS))
+               {
+                 fprintf (vect_dump, "==> examining pattern def statement: ");
+                 print_gimple_stmt (vect_dump, pattern_def_stmt, 0, TDF_SLIM);
+               }
+
+             if (!vect_analyze_stmt (pattern_def_stmt,
+                                     need_to_vectorize, node))
+               return false;
+           }
+       }
+    }
 
   switch (STMT_VINFO_DEF_TYPE (stmt_info))
     {
@@ -4973,25 +5596,26 @@ vect_analyze_stmt (gimple stmt, bool *need_to_vectorize, slp_tree node)
    if (!bb_vinfo
        && (STMT_VINFO_RELEVANT_P (stmt_info)
            || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def))
-      ok = (vectorizable_type_promotion (stmt, NULL, NULL, NULL)
-            || vectorizable_type_demotion (stmt, NULL, NULL, NULL)
-            || vectorizable_conversion (stmt, NULL, NULL, NULL)
+      ok = (vectorizable_conversion (stmt, NULL, NULL, NULL)
             || vectorizable_shift (stmt, NULL, NULL, NULL)
             || vectorizable_operation (stmt, NULL, NULL, NULL)
             || vectorizable_assignment (stmt, NULL, NULL, NULL)
             || vectorizable_load (stmt, NULL, NULL, NULL, NULL)
-            || vectorizable_call (stmt, NULL, NULL)
+           || vectorizable_call (stmt, NULL, NULL, NULL)
             || vectorizable_store (stmt, NULL, NULL, NULL)
             || vectorizable_reduction (stmt, NULL, NULL, NULL)
-            || vectorizable_condition (stmt, NULL, NULL, NULL, 0));
+            || vectorizable_condition (stmt, NULL, NULL, NULL, 0, NULL));
     else
       {
         if (bb_vinfo)
-          ok = (vectorizable_shift (stmt, NULL, NULL, node)
+         ok = (vectorizable_conversion (stmt, NULL, NULL, node)
+               || vectorizable_shift (stmt, NULL, NULL, node)
                 || vectorizable_operation (stmt, NULL, NULL, node)
                 || vectorizable_assignment (stmt, NULL, NULL, node)
                 || vectorizable_load (stmt, NULL, NULL, node, NULL)
-                || vectorizable_store (stmt, NULL, NULL, node));
+               || vectorizable_call (stmt, NULL, NULL, node)
+                || vectorizable_store (stmt, NULL, NULL, node)
+                || vectorizable_condition (stmt, NULL, NULL, NULL, 0, node));
       }
 
   if (!ok)
@@ -5037,27 +5661,18 @@ vect_analyze_stmt (gimple stmt, bool *need_to_vectorize, slp_tree node)
 
 bool
 vect_transform_stmt (gimple stmt, gimple_stmt_iterator *gsi,
-                    bool *strided_store, slp_tree slp_node,
+                    bool *grouped_store, slp_tree slp_node,
                      slp_instance slp_node_instance)
 {
   bool is_store = false;
   gimple vec_stmt = NULL;
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
-  gimple orig_stmt_in_pattern, orig_scalar_stmt = stmt;
   bool done;
 
   switch (STMT_VINFO_TYPE (stmt_info))
     {
     case type_demotion_vec_info_type:
-      done = vectorizable_type_demotion (stmt, gsi, &vec_stmt, slp_node);
-      gcc_assert (done);
-      break;
-
     case type_promotion_vec_info_type:
-      done = vectorizable_type_promotion (stmt, gsi, &vec_stmt, slp_node);
-      gcc_assert (done);
-      break;
-
     case type_conversion_vec_info_type:
       done = vectorizable_conversion (stmt, gsi, &vec_stmt, slp_node);
       gcc_assert (done);
@@ -5093,13 +5708,13 @@ vect_transform_stmt (gimple stmt, gimple_stmt_iterator *gsi,
     case store_vec_info_type:
       done = vectorizable_store (stmt, gsi, &vec_stmt, slp_node);
       gcc_assert (done);
-      if (STMT_VINFO_STRIDED_ACCESS (stmt_info) && !slp_node)
+      if (STMT_VINFO_GROUPED_ACCESS (stmt_info) && !slp_node)
        {
          /* In case of interleaving, the whole chain is vectorized when the
             last store in the chain is reached.  Store stmts before the last
             one are skipped, and there vec_stmt_info shouldn't be freed
             meanwhile.  */
-         *strided_store = true;
+         *grouped_store = true;
          if (STMT_VINFO_VEC_STMT (stmt_info))
            is_store = true;
          }
@@ -5108,14 +5723,12 @@ vect_transform_stmt (gimple stmt, gimple_stmt_iterator *gsi,
       break;
 
     case condition_vec_info_type:
-      gcc_assert (!slp_node);
-      done = vectorizable_condition (stmt, gsi, &vec_stmt, NULL, 0);
+      done = vectorizable_condition (stmt, gsi, &vec_stmt, NULL, 0, slp_node);
       gcc_assert (done);
       break;
 
     case call_vec_info_type:
-      gcc_assert (!slp_node);
-      done = vectorizable_call (stmt, gsi, &vec_stmt);
+      done = vectorizable_call (stmt, gsi, &vec_stmt, slp_node);
       stmt = gsi_stmt (*gsi);
       break;
 
@@ -5182,25 +5795,7 @@ vect_transform_stmt (gimple stmt, gimple_stmt_iterator *gsi,
     }
 
   if (vec_stmt)
-    {
-      STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
-      orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
-      if (orig_stmt_in_pattern)
-       {
-         stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
-         /* STMT was inserted by the vectorizer to replace a computation idiom.
-            ORIG_STMT_IN_PATTERN is a stmt in the original sequence that
-            computed this idiom.  We need to record a pointer to VEC_STMT in
-            the stmt_info of ORIG_STMT_IN_PATTERN.  See more details in the
-            documentation of vect_pattern_recog.  */
-         if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
-           {
-             gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo)
-                           == orig_scalar_stmt);
-             STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
-           }
-       }
-    }
+    STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
 
   return is_store;
 }
@@ -5218,10 +5813,16 @@ vect_remove_stores (gimple first_stmt)
 
   while (next)
     {
+      stmt_vec_info stmt_info = vinfo_for_stmt (next);
+
+      tmp = GROUP_NEXT_ELEMENT (stmt_info);
+      if (is_pattern_stmt_p (stmt_info))
+       next = STMT_VINFO_RELATED_STMT (stmt_info);
       /* Free the attached stmt_vec_info and remove the stmt.  */
       next_si = gsi_for_stmt (next);
+      unlink_stmt_vdef (next);
       gsi_remove (&next_si, true);
-      tmp = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
+      release_defs (next);
       free_stmt_vec_info (next);
       next = tmp;
     }
@@ -5250,6 +5851,7 @@ new_stmt_vec_info (gimple stmt, loop_vec_info loop_vinfo,
   STMT_VINFO_VECTORIZABLE (res) = true;
   STMT_VINFO_IN_PATTERN_P (res) = false;
   STMT_VINFO_RELATED_STMT (res) = NULL;
+  STMT_VINFO_PATTERN_DEF_SEQ (res) = NULL;
   STMT_VINFO_DATA_REF (res) = NULL;
 
   STMT_VINFO_DR_BASE_ADDRESS (res) = NULL;
@@ -5265,8 +5867,6 @@ new_stmt_vec_info (gimple stmt, loop_vec_info loop_vinfo,
     STMT_VINFO_DEF_TYPE (res) = vect_internal_def;
 
   STMT_VINFO_SAME_ALIGN_REFS (res) = VEC_alloc (dr_p, heap, 5);
-  STMT_VINFO_INSIDE_OF_LOOP_COST (res) = 0;
-  STMT_VINFO_OUTSIDE_OF_LOOP_COST (res) = 0;
   STMT_SLP_TYPE (res) = loop_vect;
   GROUP_FIRST_ELEMENT (res) = NULL;
   GROUP_NEXT_ELEMENT (res) = NULL;
@@ -5310,6 +5910,27 @@ free_stmt_vec_info (gimple stmt)
   if (!stmt_info)
     return;
 
+  /* Check if this statement has a related "pattern stmt"
+     (introduced by the vectorizer during the pattern recognition
+     pass).  Free the pattern stmt's stmt_vec_info and the stmt_vec_infos
+     of its pattern def seq stmts too.  */
+  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
+    {
+      stmt_vec_info patt_info
+       = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
+      if (patt_info)
+       {
+         gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (patt_info);
+         if (seq)
+           {
+             gimple_stmt_iterator si;
+             for (si = gsi_start (seq); !gsi_end_p (si); gsi_next (&si))
+               free_stmt_vec_info (gsi_stmt (si));
+           }
+         free_stmt_vec_info (STMT_VINFO_RELATED_STMT (stmt_info));
+       }
+    }
+
   VEC_free (dr_p, heap, STMT_VINFO_SAME_ALIGN_REFS (stmt_info));
   set_vinfo_for_stmt (stmt, NULL);
   free (stmt_info);
@@ -5333,22 +5954,35 @@ get_vectype_for_scalar_type_and_size (tree scalar_type, unsigned size)
   if (nbytes == 0)
     return NULL_TREE;
 
+  if (GET_MODE_CLASS (inner_mode) != MODE_INT
+      && GET_MODE_CLASS (inner_mode) != MODE_FLOAT)
+    return NULL_TREE;
+
   /* We can't build a vector type of elements with alignment bigger than
      their size.  */
   if (nbytes < TYPE_ALIGN_UNIT (scalar_type))
     return NULL_TREE;
 
-  /* If we'd build a vector type of elements whose mode precision doesn't
-     match their types precision we'll get mismatched types on vector
-     extracts via BIT_FIELD_REFs.  This effectively means we disable
-     vectorization of bool and/or enum types in some languages.  */
+  /* For vector types of elements whose mode precision doesn't
+     match their type's precision we use an element type of mode
+     precision.  The vectorization routines will have to make sure
+     they support the proper result truncation/extension.
+     We also make sure to build vector types with INTEGER_TYPE
+     component type only.  */
   if (INTEGRAL_TYPE_P (scalar_type)
-      && GET_MODE_BITSIZE (inner_mode) != TYPE_PRECISION (scalar_type))
-    return NULL_TREE;
-
-  if (GET_MODE_CLASS (inner_mode) != MODE_INT
-      && GET_MODE_CLASS (inner_mode) != MODE_FLOAT)
-    return NULL_TREE;
+      && (GET_MODE_BITSIZE (inner_mode) != TYPE_PRECISION (scalar_type)
+         || TREE_CODE (scalar_type) != INTEGER_TYPE))
+    scalar_type = build_nonstandard_integer_type (GET_MODE_BITSIZE (inner_mode),
+                                                 TYPE_UNSIGNED (scalar_type));
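+  /* E.g. a _Bool (precision 1, QImode) is replaced by the 8-bit unsigned
+     integer type here, and an enum by the integer type of its mode.  */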
+
+  /* We shouldn't end up building VECTOR_TYPEs of non-scalar components.
+     When the component mode passes the above test, simply use a type
+     corresponding to that mode.  The theory is that any use that
+     would cause problems with this will disable vectorization anyway.  */
+  if (!SCALAR_FLOAT_TYPE_P (scalar_type)
+      && !INTEGRAL_TYPE_P (scalar_type)
+      && !POINTER_TYPE_P (scalar_type))
+    scalar_type = lang_hooks.types.type_for_mode (inner_mode, 1);
 
   /* If no size was supplied use the mode the target prefers.   Otherwise
      lookup a vector mode of the specified size.  */
@@ -5423,7 +6057,7 @@ get_same_sized_vectype (tree scalar_type, tree vector_type)
    Input:
    LOOP_VINFO - the vect info of the loop that is being vectorized.
    BB_VINFO - the vect info of the basic block that is being vectorized.
-   OPERAND - operand of a stmt in the loop or bb.
+   OPERAND - operand of STMT in the loop or bb.
    DEF - the defining stmt in case OPERAND is an SSA_NAME.
 
    Returns whether a stmt with OPERAND can be vectorized.
@@ -5435,7 +6069,7 @@ get_same_sized_vectype (tree scalar_type, tree vector_type)
    For now, operands defined outside the basic block are not supported.  */
 
 bool
-vect_is_simple_use (tree operand, loop_vec_info loop_vinfo,
+vect_is_simple_use (tree operand, gimple stmt, loop_vec_info loop_vinfo,
                     bb_vec_info bb_vinfo, gimple *def_stmt,
                    tree *def, enum vect_def_type *dt)
 {
@@ -5455,7 +6089,7 @@ vect_is_simple_use (tree operand, loop_vec_info loop_vinfo,
       print_generic_expr (vect_dump, operand, TDF_SLIM);
     }
 
-  if (TREE_CODE (operand) == INTEGER_CST || TREE_CODE (operand) == REAL_CST)
+  if (CONSTANT_CLASS_P (operand))
     {
       *dt = vect_constant_def;
       return true;
@@ -5517,7 +6151,10 @@ vect_is_simple_use (tree operand, loop_vec_info loop_vinfo,
       *dt = STMT_VINFO_DEF_TYPE (stmt_vinfo);
     }
 
-  if (*dt == vect_unknown_def_type)
+  if (*dt == vect_unknown_def_type
+      || (stmt
+         && *dt == vect_double_reduction_def
+         && gimple_code (stmt) != GIMPLE_PHI))
     {
       if (vect_print_dump_info (REPORT_DETAILS))
         fprintf (vect_dump, "Unsupported pattern.");
@@ -5561,11 +6198,12 @@ vect_is_simple_use (tree operand, loop_vec_info loop_vinfo,
    scalar operand.  */
 
 bool
-vect_is_simple_use_1 (tree operand, loop_vec_info loop_vinfo,
+vect_is_simple_use_1 (tree operand, gimple stmt, loop_vec_info loop_vinfo,
                      bb_vec_info bb_vinfo, gimple *def_stmt,
                      tree *def, enum vect_def_type *dt, tree *vectype)
 {
-  if (!vect_is_simple_use (operand, loop_vinfo, bb_vinfo, def_stmt, def, dt))
+  if (!vect_is_simple_use (operand, stmt, loop_vinfo, bb_vinfo, def_stmt,
+                          def, dt))
     return false;
 
   /* Now get a vector type if the def is internal, otherwise supply
@@ -5578,8 +6216,12 @@ vect_is_simple_use_1 (tree operand, loop_vec_info loop_vinfo,
       || *dt == vect_nested_cycle)
     {
       stmt_vec_info stmt_info = vinfo_for_stmt (*def_stmt);
-      if (STMT_VINFO_IN_PATTERN_P (stmt_info))
+
+      if (STMT_VINFO_IN_PATTERN_P (stmt_info)
+          && !STMT_VINFO_RELEVANT (stmt_info)
+          && !STMT_VINFO_LIVE_P (stmt_info))
        stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
+
       *vectype = STMT_VINFO_VECTYPE (stmt_info);
       gcc_assert (*vectype != NULL_TREE);
     }
@@ -5609,9 +6251,6 @@ vect_is_simple_use_1 (tree operand, loop_vec_info loop_vinfo,
    Output:
    - CODE1 and CODE2 are codes of vector operations to be used when
    vectorizing the operation, if available.
-   - DECL1 and DECL2 are decls of target builtin functions to be used
-   when vectorizing the operation, if available.  In this case,
-   CODE1 and CODE2 are CALL_EXPR.
    - MULTI_STEP_CVT determines the number of required intermediate steps in
    case of multi-step conversion (like char->short->int - in that case
    MULTI_STEP_CVT will be 1).
@@ -5621,103 +6260,95 @@ vect_is_simple_use_1 (tree operand, loop_vec_info loop_vinfo,
 bool
 supportable_widening_operation (enum tree_code code, gimple stmt,
                                tree vectype_out, tree vectype_in,
-                                tree *decl1, tree *decl2,
                                 enum tree_code *code1, enum tree_code *code2,
                                 int *multi_step_cvt,
                                 VEC (tree, heap) **interm_types)
 {
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
   loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
-  struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
-  bool ordered_p;
+  struct loop *vect_loop = NULL;
   enum machine_mode vec_mode;
   enum insn_code icode1, icode2;
   optab optab1, optab2;
   tree vectype = vectype_in;
   tree wide_vectype = vectype_out;
   enum tree_code c1, c2;
+  int i;
+  tree prev_type, intermediate_type;
+  enum machine_mode intermediate_mode, prev_mode;
+  optab optab3, optab4;
 
-  /* The result of a vectorized widening operation usually requires two vectors
-     (because the widened results do not fit int one vector). The generated
-     vector results would normally be expected to be generated in the same
-     order as in the original scalar computation, i.e. if 8 results are
-     generated in each vector iteration, they are to be organized as follows:
-        vect1: [res1,res2,res3,res4], vect2: [res5,res6,res7,res8].
-
-     However, in the special case that the result of the widening operation is
-     used in a reduction computation only, the order doesn't matter (because
-     when vectorizing a reduction we change the order of the computation).
-     Some targets can take advantage of this and generate more efficient code.
-     For example, targets like Altivec, that support widen_mult using a sequence
-     of {mult_even,mult_odd} generate the following vectors:
-        vect1: [res1,res3,res5,res7], vect2: [res2,res4,res6,res8].
-
-     When vectorizing outer-loops, we execute the inner-loop sequentially
-     (each vectorized inner-loop iteration contributes to VF outer-loop
-     iterations in parallel).  We therefore don't allow to change the order
-     of the computation in the inner-loop during outer-loop vectorization.  */
-
-   if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_by_reduction
-       && !nested_in_vect_loop_p (vect_loop, stmt))
-     ordered_p = false;
-   else
-     ordered_p = true;
-
-  if (!ordered_p
-      && code == WIDEN_MULT_EXPR
-      && targetm.vectorize.builtin_mul_widen_even
-      && targetm.vectorize.builtin_mul_widen_even (vectype)
-      && targetm.vectorize.builtin_mul_widen_odd
-      && targetm.vectorize.builtin_mul_widen_odd (vectype))
-    {
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "Unordered widening operation detected.");
-
-      *code1 = *code2 = CALL_EXPR;
-      *decl1 = targetm.vectorize.builtin_mul_widen_even (vectype);
-      *decl2 = targetm.vectorize.builtin_mul_widen_odd (vectype);
-      return true;
-    }
+  *multi_step_cvt = 0;
+  if (loop_info)
+    vect_loop = LOOP_VINFO_LOOP (loop_info);
 
   switch (code)
     {
     case WIDEN_MULT_EXPR:
-      if (BYTES_BIG_ENDIAN)
-        {
-          c1 = VEC_WIDEN_MULT_HI_EXPR;
-          c2 = VEC_WIDEN_MULT_LO_EXPR;
-        }
-      else
-        {
-          c2 = VEC_WIDEN_MULT_HI_EXPR;
-          c1 = VEC_WIDEN_MULT_LO_EXPR;
-        }
+      /* The result of a vectorized widening operation usually requires
+        two vectors (because the widened results do not fit into one vector).
+        The generated vector results would normally be expected to be
+        generated in the same order as in the original scalar computation,
+        i.e. if 8 results are generated in each vector iteration, they are
+        to be organized as follows:
+               vect1: [res1,res2,res3,res4],
+               vect2: [res5,res6,res7,res8].
+
+        However, in the special case that the result of the widening
+        operation is used in a reduction computation only, the order doesn't
+        matter (because when vectorizing a reduction we change the order of
+        the computation).  Some targets can take advantage of this and
+        generate more efficient code.  For example, targets like Altivec,
+        that support widen_mult using a sequence of {mult_even,mult_odd}
+        generate the following vectors:
+               vect1: [res1,res3,res5,res7],
+               vect2: [res2,res4,res6,res8].
+
+        When vectorizing outer-loops, we execute the inner-loop sequentially
+        (each vectorized inner-loop iteration contributes to VF outer-loop
+        iterations in parallel).  We therefore don't allow to change the
+        order of the computation in the inner-loop during outer-loop
+        vectorization.  */
+      /* TODO: Another case in which order doesn't *really* matter is when we
+        widen and then contract again, e.g. (short)((int)x * y >> 8).
+        Normally, pack_trunc performs an even/odd permute, whereas the 
+        repack from an even/odd expansion would be an interleave, which
+        would be significantly simpler for e.g. AVX2.  */
+      /* In any case, in order to avoid duplicating the code below, recurse
+        on VEC_WIDEN_MULT_EVEN_EXPR.  If it succeeds, all the return values
+        are properly set up for the caller.  If we fail, we'll continue with
+        a VEC_WIDEN_MULT_LO/HI_EXPR check.  */
+      if (vect_loop
+         && STMT_VINFO_RELEVANT (stmt_info) == vect_used_by_reduction
+         && !nested_in_vect_loop_p (vect_loop, stmt)
+         && supportable_widening_operation (VEC_WIDEN_MULT_EVEN_EXPR,
+                                            stmt, vectype_out, vectype_in,
+                                            code1, code2, multi_step_cvt,
+                                            interm_types))
+       return true;
+      c1 = VEC_WIDEN_MULT_LO_EXPR;
+      c2 = VEC_WIDEN_MULT_HI_EXPR;
+      break;
+
+    case VEC_WIDEN_MULT_EVEN_EXPR:
+      /* Support the recursion induced just above.  */
+      c1 = VEC_WIDEN_MULT_EVEN_EXPR;
+      c2 = VEC_WIDEN_MULT_ODD_EXPR;
+      break;
+
+    case WIDEN_LSHIFT_EXPR:
+      c1 = VEC_WIDEN_LSHIFT_LO_EXPR;
+      c2 = VEC_WIDEN_LSHIFT_HI_EXPR;
       break;
 
     CASE_CONVERT:
-      if (BYTES_BIG_ENDIAN)
-        {
-          c1 = VEC_UNPACK_HI_EXPR;
-          c2 = VEC_UNPACK_LO_EXPR;
-        }
-      else
-        {
-          c2 = VEC_UNPACK_HI_EXPR;
-          c1 = VEC_UNPACK_LO_EXPR;
-        }
+      c1 = VEC_UNPACK_LO_EXPR;
+      c2 = VEC_UNPACK_HI_EXPR;
       break;
 
     case FLOAT_EXPR:
-      if (BYTES_BIG_ENDIAN)
-        {
-          c1 = VEC_UNPACK_FLOAT_HI_EXPR;
-          c2 = VEC_UNPACK_FLOAT_LO_EXPR;
-        }
-      else
-        {
-          c2 = VEC_UNPACK_FLOAT_HI_EXPR;
-          c1 = VEC_UNPACK_FLOAT_LO_EXPR;
-        }
+      c1 = VEC_UNPACK_FLOAT_LO_EXPR;
+      c2 = VEC_UNPACK_FLOAT_HI_EXPR;
       break;
 
     case FIX_TRUNC_EXPR:
@@ -5730,6 +6361,13 @@ supportable_widening_operation (enum tree_code code, gimple stmt,
       gcc_unreachable ();
     }
 
+  if (BYTES_BIG_ENDIAN && c1 != VEC_WIDEN_MULT_EVEN_EXPR)
+    {
+      enum tree_code ctmp = c1;
+      c1 = c2;
+      c2 = ctmp;
+    }
+
   if (code == FIX_TRUNC_EXPR)
     {
       /* The signedness is determined from output operand.  */
@@ -5750,65 +6388,60 @@ supportable_widening_operation (enum tree_code code, gimple stmt,
        || (icode2 = optab_handler (optab2, vec_mode)) == CODE_FOR_nothing)
     return false;
 
+  *code1 = c1;
+  *code2 = c2;
+
+  if (insn_data[icode1].operand[0].mode == TYPE_MODE (wide_vectype)
+      && insn_data[icode2].operand[0].mode == TYPE_MODE (wide_vectype))
+    return true;
+
   /* Check if it's a multi-step conversion that can be done using intermediate
      types.  */
-  if (insn_data[icode1].operand[0].mode != TYPE_MODE (wide_vectype)
-       || insn_data[icode2].operand[0].mode != TYPE_MODE (wide_vectype))
-    {
-      int i;
-      tree prev_type = vectype, intermediate_type;
-      enum machine_mode intermediate_mode, prev_mode = vec_mode;
-      optab optab3, optab4;
 
-      if (!CONVERT_EXPR_CODE_P (code))
-        return false;
+  prev_type = vectype;
+  prev_mode = vec_mode;
 
-      *code1 = c1;
-      *code2 = c2;
+  if (!CONVERT_EXPR_CODE_P (code))
+    return false;
 
-      /* We assume here that there will not be more than MAX_INTERM_CVT_STEPS
-         intermediate steps in promotion sequence.  We try
-         MAX_INTERM_CVT_STEPS to get to NARROW_VECTYPE, and fail if we do
-         not.  */
-      *interm_types = VEC_alloc (tree, heap, MAX_INTERM_CVT_STEPS);
-      for (i = 0; i < 3; i++)
-        {
-          intermediate_mode = insn_data[icode1].operand[0].mode;
-          intermediate_type = lang_hooks.types.type_for_mode (intermediate_mode,
-                                                     TYPE_UNSIGNED (prev_type));
-          optab3 = optab_for_tree_code (c1, intermediate_type, optab_default);
-          optab4 = optab_for_tree_code (c2, intermediate_type, optab_default);
-
-          if (!optab3 || !optab4
-              || ((icode1 = optab_handler (optab1, prev_mode))
-                 == CODE_FOR_nothing)
-              || insn_data[icode1].operand[0].mode != intermediate_mode
-              || ((icode2 = optab_handler (optab2, prev_mode))
-                 == CODE_FOR_nothing)
-              || insn_data[icode2].operand[0].mode != intermediate_mode
-              || ((icode1 = optab_handler (optab3, intermediate_mode))
-                 == CODE_FOR_nothing)
-              || ((icode2 = optab_handler (optab4, intermediate_mode))
-                 == CODE_FOR_nothing))
-            return false;
-
-          VEC_quick_push (tree, *interm_types, intermediate_type);
-          (*multi_step_cvt)++;
-
-          if (insn_data[icode1].operand[0].mode == TYPE_MODE (wide_vectype)
-              && insn_data[icode2].operand[0].mode == TYPE_MODE (wide_vectype))
-            return true;
-
-          prev_type = intermediate_type;
-          prev_mode = intermediate_mode;
-        }
+  /* We assume here that there will not be more than MAX_INTERM_CVT_STEPS
+     intermediate steps in the promotion sequence.  We try
+     MAX_INTERM_CVT_STEPS to get to WIDE_VECTYPE, and fail if we do
+     not.  */
+  *interm_types = VEC_alloc (tree, heap, MAX_INTERM_CVT_STEPS);
+  for (i = 0; i < MAX_INTERM_CVT_STEPS; i++)
+    {
+      intermediate_mode = insn_data[icode1].operand[0].mode;
+      intermediate_type
+       = lang_hooks.types.type_for_mode (intermediate_mode,
+                                         TYPE_UNSIGNED (prev_type));
+      optab3 = optab_for_tree_code (c1, intermediate_type, optab_default);
+      optab4 = optab_for_tree_code (c2, intermediate_type, optab_default);
+
+      if (!optab3 || !optab4
+          || (icode1 = optab_handler (optab1, prev_mode)) == CODE_FOR_nothing
+         || insn_data[icode1].operand[0].mode != intermediate_mode
+         || (icode2 = optab_handler (optab2, prev_mode)) == CODE_FOR_nothing
+         || insn_data[icode2].operand[0].mode != intermediate_mode
+         || ((icode1 = optab_handler (optab3, intermediate_mode))
+             == CODE_FOR_nothing)
+         || ((icode2 = optab_handler (optab4, intermediate_mode))
+             == CODE_FOR_nothing))
+       break;
 
-       return false;
+      VEC_quick_push (tree, *interm_types, intermediate_type);
+      (*multi_step_cvt)++;
+
+      if (insn_data[icode1].operand[0].mode == TYPE_MODE (wide_vectype)
+         && insn_data[icode2].operand[0].mode == TYPE_MODE (wide_vectype))
+       return true;
+
+      prev_type = intermediate_type;
+      prev_mode = intermediate_mode;
     }
 
-  *code1 = c1;
-  *code2 = c2;
-  return true;
+  VEC_free (tree, heap, *interm_types);
+  return false;
 }
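For a concrete feel of what the intermediate-type loop above searches for: a single unpack only doubles the element width, so promoting char elements to int takes two steps, one intermediate type (short) is pushed onto *interm_types, and *multi_step_cvt ends up as 1. A minimal scalar sketch of that two-step chain (illustrative only, not GCC internals; the element count is arbitrary, and in the vector form each step also doubles the number of result vectors):

/* Illustrative only: a two-step promotion chain, char -> short -> int,
   mirroring one pass through the multi-step widening loop.  */
#include <stdio.h>

int
main (void)
{
  signed char src[4] = { -1, 2, -3, 4 };
  short mid[4];                 /* the single recorded intermediate type */
  int dst[4];
  int i;

  for (i = 0; i < 4; i++)       /* first unpack step: char -> short */
    mid[i] = src[i];
  for (i = 0; i < 4; i++)       /* second unpack step: short -> int */
    dst[i] = mid[i];

  for (i = 0; i < 4; i++)
    printf ("%d -> %d -> %d\n", src[i], mid[i], dst[i]);
  return 0;
}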
 
 
@@ -5844,9 +6477,12 @@ supportable_narrowing_operation (enum tree_code code,
   tree vectype = vectype_in;
   tree narrow_vectype = vectype_out;
   enum tree_code c1;
-  tree intermediate_type, prev_type;
+  tree intermediate_type;
+  enum machine_mode intermediate_mode, prev_mode;
   int i;
+  bool uns;
 
+  *multi_step_cvt = 0;
   switch (code)
     {
     CASE_CONVERT:
@@ -5879,47 +6515,70 @@ supportable_narrowing_operation (enum tree_code code,
   if ((icode1 = optab_handler (optab1, vec_mode)) == CODE_FOR_nothing)
     return false;
 
+  *code1 = c1;
+
+  if (insn_data[icode1].operand[0].mode == TYPE_MODE (narrow_vectype))
+    return true;
+
   /* Check if it's a multi-step conversion that can be done using intermediate
      types.  */
-  if (insn_data[icode1].operand[0].mode != TYPE_MODE (narrow_vectype))
-    {
-      enum machine_mode intermediate_mode, prev_mode = vec_mode;
-
-      *code1 = c1;
-      prev_type = vectype;
-      /* We assume here that there will not be more than MAX_INTERM_CVT_STEPS
-         intermediate steps in promotion sequence.  We try
-         MAX_INTERM_CVT_STEPS to get to NARROW_VECTYPE, and fail if we do
-         not.  */
-      *interm_types = VEC_alloc (tree, heap, MAX_INTERM_CVT_STEPS);
-      for (i = 0; i < 3; i++)
-        {
-          intermediate_mode = insn_data[icode1].operand[0].mode;
-          intermediate_type = lang_hooks.types.type_for_mode (intermediate_mode,
-                                                     TYPE_UNSIGNED (prev_type));
-          interm_optab = optab_for_tree_code (c1, intermediate_type,
-                                              optab_default);
-          if (!interm_optab
-              || ((icode1 = optab_handler (optab1, prev_mode))
-                 == CODE_FOR_nothing)
-              || insn_data[icode1].operand[0].mode != intermediate_mode
-              || ((icode1 = optab_handler (interm_optab, intermediate_mode))
-                 == CODE_FOR_nothing))
-            return false;
-
-          VEC_quick_push (tree, *interm_types, intermediate_type);
-          (*multi_step_cvt)++;
-
-          if (insn_data[icode1].operand[0].mode == TYPE_MODE (narrow_vectype))
-            return true;
-
-          prev_type = intermediate_type;
-          prev_mode = intermediate_mode;
-        }
+  prev_mode = vec_mode;
+  if (code == FIX_TRUNC_EXPR)
+    uns = TYPE_UNSIGNED (vectype_out);
+  else
+    uns = TYPE_UNSIGNED (vectype);
+
+  /* For multi-step FIX_TRUNC_EXPR prefer a signed floating-point to integer
+     conversion over an unsigned one, as unsigned FIX_TRUNC_EXPR is often
+     more costly than signed.  */
+  if (code == FIX_TRUNC_EXPR && uns)
+    {
+      enum insn_code icode2;
+
+      intermediate_type
+       = lang_hooks.types.type_for_mode (TYPE_MODE (vectype_out), 0);
+      interm_optab
+       = optab_for_tree_code (c1, intermediate_type, optab_default);
+      if (interm_optab != unknown_optab
+         && (icode2 = optab_handler (optab1, vec_mode)) != CODE_FOR_nothing
+         && insn_data[icode1].operand[0].mode
+            == insn_data[icode2].operand[0].mode)
+       {
+         uns = false;
+         optab1 = interm_optab;
+         icode1 = icode2;
+       }
+    }
 
-      return false;
+  /* We assume here that there will not be more than MAX_INTERM_CVT_STEPS
+     intermediate steps in the narrowing sequence.  We try
+     MAX_INTERM_CVT_STEPS to get to NARROW_VECTYPE, and fail if we do not.  */
+  *interm_types = VEC_alloc (tree, heap, MAX_INTERM_CVT_STEPS);
+  for (i = 0; i < MAX_INTERM_CVT_STEPS; i++)
+    {
+      intermediate_mode = insn_data[icode1].operand[0].mode;
+      intermediate_type
+       = lang_hooks.types.type_for_mode (intermediate_mode, uns);
+      interm_optab
+       = optab_for_tree_code (VEC_PACK_TRUNC_EXPR, intermediate_type,
+                              optab_default);
+      if (!interm_optab
+         || ((icode1 = optab_handler (optab1, prev_mode)) == CODE_FOR_nothing)
+         || insn_data[icode1].operand[0].mode != intermediate_mode
+         || ((icode1 = optab_handler (interm_optab, intermediate_mode))
+             == CODE_FOR_nothing))
+       break;
+
+      VEC_quick_push (tree, *interm_types, intermediate_type);
+      (*multi_step_cvt)++;
+
+      if (insn_data[icode1].operand[0].mode == TYPE_MODE (narrow_vectype))
+       return true;
+
+      prev_mode = intermediate_mode;
+      optab1 = interm_optab;
     }
 
-  *code1 = c1;
-  return true;
+  VEC_free (tree, heap, *interm_types);
+  return false;
 }
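The narrowing loop mirrors the widening one: each iteration looks for VEC_PACK_TRUNC_EXPR on the next intermediate type until the result mode matches NARROW_VECTYPE, and, per the comment above, an unsigned multi-step FIX_TRUNC_EXPR may start from the signed conversion when the target handles it at the same mode. A minimal scalar sketch of a two-step demotion chain (illustrative only, not GCC internals; types and values are arbitrary, and in the vector form each pack step also combines two input vectors into one):

/* Illustrative only: a two-step demotion chain, int -> short -> char,
   mirroring one pass through the multi-step narrowing loop.  */
#include <stdio.h>

int
main (void)
{
  int src[4] = { 100, -2, 30, -40 };
  short mid[4];                  /* the single recorded intermediate type */
  signed char dst[4];
  int i;

  for (i = 0; i < 4; i++)        /* first pack/truncate step: int -> short */
    mid[i] = (short) src[i];
  for (i = 0; i < 4; i++)        /* second pack/truncate step: short -> char */
    dst[i] = (signed char) mid[i];

  for (i = 0; i < 4; i++)
    printf ("%d -> %d -> %d\n", src[i], mid[i], dst[i]);
  return 0;
}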