tree-vect-slp.c (vect_get_and_check_slp_defs): Remove code dealing with cost.
[platform/upstream/gcc.git] / gcc / tree-vect-slp.c
1 /* SLP - Basic Block Vectorization
2    Copyright (C) 2007-2013 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
4    and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "ggc.h"
28 #include "tree.h"
29 #include "target.h"
30 #include "basic-block.h"
31 #include "gimple-pretty-print.h"
32 #include "tree-flow.h"
33 #include "tree-pass.h"
34 #include "cfgloop.h"
35 #include "expr.h"
36 #include "recog.h"              /* FIXME: for insn_data */
37 #include "optabs.h"
38 #include "tree-vectorizer.h"
39 #include "langhooks.h"
40
41 /* Extract the location of the basic block in the source code.
42    Return the basic block location if succeed and NULL if not.  */
43
44 LOC
45 find_bb_location (basic_block bb)
46 {
47   gimple stmt = NULL;
48   gimple_stmt_iterator si;
49
50   if (!bb)
51     return UNKNOWN_LOC;
52
53   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
54     {
55       stmt = gsi_stmt (si);
56       if (gimple_location (stmt) != UNKNOWN_LOC)
57         return gimple_location (stmt);
58     }
59
60   return UNKNOWN_LOC;
61 }
62
63
64 /* Recursively free the memory allocated for the SLP tree rooted at NODE.  */
65
66 static void
67 vect_free_slp_tree (slp_tree node)
68 {
69   int i;
70   slp_tree child;
71
72   if (!node)
73     return;
74
75   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
76     vect_free_slp_tree (child);
77
78   SLP_TREE_CHILDREN (node).release ();
79   SLP_TREE_SCALAR_STMTS (node).release ();
80   SLP_TREE_VEC_STMTS (node).release ();
81
82   free (node);
83 }
84
85
86 /* Free the memory allocated for the SLP instance.  */
87
88 void
89 vect_free_slp_instance (slp_instance instance)
90 {
91   vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
92   SLP_INSTANCE_LOAD_PERMUTATION (instance).release ();
93   SLP_INSTANCE_LOADS (instance).release ();
94   SLP_INSTANCE_BODY_COST_VEC (instance).release ();
95   free (instance);
96 }
97
98
99 /* Create an SLP node for SCALAR_STMTS.  */
100
101 static slp_tree
102 vect_create_new_slp_node (vec<gimple> scalar_stmts)
103 {
104   slp_tree node;
105   gimple stmt = scalar_stmts[0];
106   unsigned int nops;
107
108   if (is_gimple_call (stmt))
109     nops = gimple_call_num_args (stmt);
110   else if (is_gimple_assign (stmt))
111     {
112       nops = gimple_num_ops (stmt) - 1;
113       if (gimple_assign_rhs_code (stmt) == COND_EXPR)
114         nops++;
115     }
116   else
117     return NULL;
118
119   node = XNEW (struct _slp_tree);
120   SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
121   SLP_TREE_VEC_STMTS (node).create (0);
122   SLP_TREE_CHILDREN (node).create (nops);
123
124   return node;
125 }
126
127
128 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
129    operand.  */
130 static vec<slp_oprnd_info> 
131 vect_create_oprnd_info (int nops, int group_size)
132 {
133   int i;
134   slp_oprnd_info oprnd_info;
135   vec<slp_oprnd_info> oprnds_info;
136
137   oprnds_info.create (nops);
138   for (i = 0; i < nops; i++)
139     {
140       oprnd_info = XNEW (struct _slp_oprnd_info);
141       oprnd_info->def_stmts.create (group_size);
142       oprnd_info->first_dt = vect_uninitialized_def;
143       oprnd_info->first_def_type = NULL_TREE;
144       oprnd_info->first_const_oprnd = NULL_TREE;
145       oprnd_info->first_pattern = false;
146       oprnds_info.quick_push (oprnd_info);
147     }
148
149   return oprnds_info;
150 }
151
152
153 /* Free operands info.  */
154
155 static void
156 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
157 {
158   int i;
159   slp_oprnd_info oprnd_info;
160
161   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
162     {
163       oprnd_info->def_stmts.release ();
164       XDELETE (oprnd_info);
165     }
166
167   oprnds_info.release ();
168 }
169
170
171 /* Find the place of the data-ref in STMT in the interleaving chain that starts
172    from FIRST_STMT.  Return -1 if the data-ref is not a part of the chain.  */
173
174 static int
175 vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
176 {
177   gimple next_stmt = first_stmt;
178   int result = 0;
179
180   if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
181     return -1;
182
183   do
184     {
185       if (next_stmt == stmt)
186         return result;
187       result++;
188       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
189     }
190   while (next_stmt);
191
192   return -1;
193 }
194
195
196 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
197    they are of a valid type and that they match the defs of the first stmt of
198    the SLP group (stored in OPRNDS_INFO).  */
199
200 static bool
201 vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
202                              gimple stmt, bool first,
203                              vec<slp_oprnd_info> *oprnds_info)
204 {
205   tree oprnd;
206   unsigned int i, number_of_oprnds;
207   tree def, def_op0 = NULL_TREE;
208   gimple def_stmt;
209   enum vect_def_type dt = vect_uninitialized_def;
210   enum vect_def_type dt_op0 = vect_uninitialized_def;
211   struct loop *loop = NULL;
212   enum tree_code rhs_code;
213   bool different_types = false;
214   bool pattern = false;
215   slp_oprnd_info oprnd_info, oprnd0_info, oprnd1_info;
216   int op_idx = 1;
217   tree compare_rhs = NULL_TREE;
218
219   if (loop_vinfo)
220     loop = LOOP_VINFO_LOOP (loop_vinfo);
221
222   if (is_gimple_call (stmt))
223     {
224       number_of_oprnds = gimple_call_num_args (stmt);
225       op_idx = 3;
226     }
227   else if (is_gimple_assign (stmt))
228     {
229       number_of_oprnds = gimple_num_ops (stmt) - 1;
230       if (gimple_assign_rhs_code (stmt) == COND_EXPR)
231         number_of_oprnds++;
232     }
233   else
234     return false;
235
236   for (i = 0; i < number_of_oprnds; i++)
237     {
238       if (compare_rhs)
239         {
240           oprnd = compare_rhs;
241           compare_rhs = NULL_TREE;
242         }
243       else
244         oprnd = gimple_op (stmt, op_idx++);
245
246       oprnd_info = (*oprnds_info)[i];
247
248       if (COMPARISON_CLASS_P (oprnd))
249         {
250           compare_rhs = TREE_OPERAND (oprnd, 1);
251           oprnd = TREE_OPERAND (oprnd, 0);
252         }
253
254       if (!vect_is_simple_use (oprnd, NULL, loop_vinfo, bb_vinfo, &def_stmt,
255                                &def, &dt)
256           || (!def_stmt && dt != vect_constant_def))
257         {
258           if (dump_enabled_p ())
259             {
260               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
261                                "Build SLP failed: can't find def for ");
262               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
263             }
264
265           return false;
266         }
267
268       /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
269          from the pattern.  Check that all the stmts of the node are in the
270          pattern.  */
271       if (def_stmt && gimple_bb (def_stmt)
272           && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
273               || (!loop && gimple_bb (def_stmt) == BB_VINFO_BB (bb_vinfo)
274                   && gimple_code (def_stmt) != GIMPLE_PHI))
275           && vinfo_for_stmt (def_stmt)
276           && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
277           && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
278           && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
279         {
280           pattern = true;
281           if (!first && !oprnd_info->first_pattern)
282             {
283               if (dump_enabled_p ())
284                 {
285                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
286                                    "Build SLP failed: some of the stmts"
287                                    " are in a pattern, and others are not ");
288                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
289                 }
290
291               return false;
292             }
293
294           def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
295           dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));
296
297           if (dt == vect_unknown_def_type)
298             {
299               if (dump_enabled_p ())
300                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
301                                  "Unsupported pattern.");
302               return false;
303             }
304
305           switch (gimple_code (def_stmt))
306             {
307               case GIMPLE_PHI:
308                 def = gimple_phi_result (def_stmt);
309                 break;
310
311               case GIMPLE_ASSIGN:
312                 def = gimple_assign_lhs (def_stmt);
313                 break;
314
315               default:
316                 if (dump_enabled_p ())
317                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
318                                    "unsupported defining stmt: ");
319                 return false;
320             }
321         }
322
323       if (first)
324         {
325           oprnd_info->first_dt = dt;
326           oprnd_info->first_pattern = pattern;
327           if (def)
328             {
329               oprnd_info->first_def_type = TREE_TYPE (def);
330               oprnd_info->first_const_oprnd = NULL_TREE;
331             }
332           else
333             {
334               oprnd_info->first_def_type = NULL_TREE;
335               oprnd_info->first_const_oprnd = oprnd;
336             }
337
338           if (i == 0)
339             {
340               def_op0 = def;
341               dt_op0 = dt;
342             }
343         }
344       else
345         {
346           /* Not first stmt of the group, check that the def-stmt/s match
347              the def-stmt/s of the first stmt.  Allow different definition
348              types for reduction chains: the first stmt must be a
349              vect_reduction_def (a phi node), and the rest
350              vect_internal_def.  */
351           if (((oprnd_info->first_dt != dt
352                 && !(oprnd_info->first_dt == vect_reduction_def
353                      && dt == vect_internal_def))
354                || (oprnd_info->first_def_type != NULL_TREE
355                    && def
356                    && !types_compatible_p (oprnd_info->first_def_type,
357                                            TREE_TYPE (def))))
358                || (!def
359                    && !types_compatible_p (TREE_TYPE (oprnd_info->first_const_oprnd),
360                                            TREE_TYPE (oprnd)))
361                || different_types)
362             {
363               if (number_of_oprnds != 2)
364                 {
365                   if (dump_enabled_p ())
366                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
367                                      "Build SLP failed: different types ");
368
369                   return false;
370                 }
371
372               /* Try to swap operands in case of binary operation.  */
373               if (i == 0)
374                 different_types = true;
375               else
376                 {
377                   oprnd0_info = (*oprnds_info)[0];
378                   if (is_gimple_assign (stmt)
379                       && (rhs_code = gimple_assign_rhs_code (stmt))
380                       && TREE_CODE_CLASS (rhs_code) == tcc_binary
381                       && commutative_tree_code (rhs_code)
382                       && oprnd0_info->first_dt == dt
383                       && oprnd_info->first_dt == dt_op0
384                       && def_op0 && def
385                       && !(oprnd0_info->first_def_type
386                            && !types_compatible_p (oprnd0_info->first_def_type,
387                                                    TREE_TYPE (def)))
388                       && !(oprnd_info->first_def_type
389                            && !types_compatible_p (oprnd_info->first_def_type,
390                                                    TREE_TYPE (def_op0))))
391                     {
392                       if (dump_enabled_p ())
393                         {
394                           dump_printf_loc (MSG_NOTE, vect_location,
395                                            "Swapping operands of ");
396                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
397                         }
398
399                       swap_tree_operands (stmt, gimple_assign_rhs1_ptr (stmt),
400                                           gimple_assign_rhs2_ptr (stmt));
401                     }
402                   else
403                     {
404                       if (dump_enabled_p ())
405                         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
406                                          "Build SLP failed: different types ");
407
408                       return false;
409                     }
410                 }
411             }
412         }
413
414       /* Check the types of the definitions.  */
415       switch (dt)
416         {
417         case vect_constant_def:
418         case vect_external_def:
419         case vect_reduction_def:
420           break;
421
422         case vect_internal_def:
423           if (different_types)
424             {
425               oprnd0_info = (*oprnds_info)[0];
426               oprnd1_info = (*oprnds_info)[0];
427               if (i == 0)
428                 oprnd1_info->def_stmts.quick_push (def_stmt);
429               else
430                 oprnd0_info->def_stmts.quick_push (def_stmt);
431             }
432           else
433             oprnd_info->def_stmts.quick_push (def_stmt);
434
435           break;
436
437         default:
438           /* FORNOW: Not supported.  */
439           if (dump_enabled_p ())
440             {
441               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
442                                "Build SLP failed: illegal type of def ");
443               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def);
444             }
445
446           return false;
447         }
448     }
449
450   return true;
451 }
452
453
454 /* Recursively build an SLP tree starting from NODE.
455    Fail (and return FALSE) if def-stmts are not isomorphic, require data
456    permutation or are of unsupported types of operation.  Otherwise, return
457    TRUE.  */
458
459 static bool
460 vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
461                      slp_tree *node, unsigned int group_size,
462                      unsigned int *max_nunits,
463                      vec<int> *load_permutation,
464                      vec<slp_tree> *loads,
465                      unsigned int vectorization_factor, bool *loads_permuted)
466 {
467   unsigned int i;
468   vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (*node);
469   gimple stmt = stmts[0];
470   enum tree_code first_stmt_code = ERROR_MARK, rhs_code = ERROR_MARK;
471   enum tree_code first_cond_code = ERROR_MARK;
472   tree lhs;
473   bool stop_recursion = false, need_same_oprnds = false;
474   tree vectype, scalar_type, first_op1 = NULL_TREE;
475   optab optab;
476   int icode;
477   enum machine_mode optab_op2_mode;
478   enum machine_mode vec_mode;
479   struct data_reference *first_dr;
480   HOST_WIDE_INT dummy;
481   bool permutation = false;
482   unsigned int load_place;
483   gimple first_load = NULL, prev_first_load = NULL, old_first_load = NULL;
484   vec<slp_oprnd_info> oprnds_info;
485   unsigned int nops;
486   slp_oprnd_info oprnd_info;
487   tree cond;
488
489   if (is_gimple_call (stmt))
490     nops = gimple_call_num_args (stmt);
491   else if (is_gimple_assign (stmt))
492     {
493       nops = gimple_num_ops (stmt) - 1;
494       if (gimple_assign_rhs_code (stmt) == COND_EXPR)
495         nops++;
496     }
497   else
498     return false;
499
500   oprnds_info = vect_create_oprnd_info (nops, group_size);
501
502   /* For every stmt in NODE find its def stmt/s.  */
503   FOR_EACH_VEC_ELT (stmts, i, stmt)
504     {
505       if (dump_enabled_p ())
506         {
507           dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
508           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
509         }
510
511       /* Fail to vectorize statements marked as unvectorizable.  */
512       if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
513         {
514           if (dump_enabled_p ())
515             {
516               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
517                                "Build SLP failed: unvectorizable statement ");
518               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
519             }
520
521           vect_free_oprnd_info (oprnds_info);
522           return false;
523         }
524
525       lhs = gimple_get_lhs (stmt);
526       if (lhs == NULL_TREE)
527         {
528           if (dump_enabled_p ())
529             {
530               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
531                                "Build SLP failed: not GIMPLE_ASSIGN nor "
532                                "GIMPLE_CALL ");
533               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
534             }
535
536           vect_free_oprnd_info (oprnds_info);
537           return false;
538         }
539
540        if (is_gimple_assign (stmt)
541            && gimple_assign_rhs_code (stmt) == COND_EXPR
542            && (cond = gimple_assign_rhs1 (stmt))
543            && !COMPARISON_CLASS_P (cond))
544         {
545           if (dump_enabled_p ())
546             {
547               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
548                                "Build SLP failed: condition is not "
549                                "comparison ");
550               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
551             }
552
553           vect_free_oprnd_info (oprnds_info);
554           return false;
555         }
556
557       scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
558       vectype = get_vectype_for_scalar_type (scalar_type);
559       if (!vectype)
560         {
561           if (dump_enabled_p ())
562             {
563               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
564                                "Build SLP failed: unsupported data-type ");
565               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
566                                  scalar_type);
567             }
568
569           vect_free_oprnd_info (oprnds_info);
570           return false;
571         }
572
573       /* In case of multiple types we need to detect the smallest type.  */
574       if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
575         {
576           *max_nunits = TYPE_VECTOR_SUBPARTS (vectype);
577           if (bb_vinfo)
578             vectorization_factor = *max_nunits;
579         }
580
581       if (is_gimple_call (stmt))
582         {
583           rhs_code = CALL_EXPR;
584           if (gimple_call_internal_p (stmt)
585               || gimple_call_tail_p (stmt)
586               || gimple_call_noreturn_p (stmt)
587               || !gimple_call_nothrow_p (stmt)
588               || gimple_call_chain (stmt))
589             {
590               if (dump_enabled_p ())
591                 {
592                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
593                                    "Build SLP failed: unsupported call type ");
594                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
595                 }
596
597               vect_free_oprnd_info (oprnds_info);
598               return false;
599             }
600         }
601       else
602         rhs_code = gimple_assign_rhs_code (stmt);
603
604       /* Check the operation.  */
605       if (i == 0)
606         {
607           first_stmt_code = rhs_code;
608
609           /* Shift arguments should be equal in all the packed stmts for a
610              vector shift with scalar shift operand.  */
611           if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
612               || rhs_code == LROTATE_EXPR
613               || rhs_code == RROTATE_EXPR)
614             {
615               vec_mode = TYPE_MODE (vectype);
616
617               /* First see if we have a vector/vector shift.  */
618               optab = optab_for_tree_code (rhs_code, vectype,
619                                            optab_vector);
620
621               if (!optab
622                   || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
623                 {
624                   /* No vector/vector shift, try for a vector/scalar shift.  */
625                   optab = optab_for_tree_code (rhs_code, vectype,
626                                                optab_scalar);
627
628                   if (!optab)
629                     {
630                       if (dump_enabled_p ())
631                         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
632                                          "Build SLP failed: no optab.");
633                       vect_free_oprnd_info (oprnds_info);
634                       return false;
635                     }
636                   icode = (int) optab_handler (optab, vec_mode);
637                   if (icode == CODE_FOR_nothing)
638                     {
639                       if (dump_enabled_p ())
640                         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
641                                          "Build SLP failed: "
642                                          "op not supported by target.");
643                       vect_free_oprnd_info (oprnds_info);
644                       return false;
645                     }
646                   optab_op2_mode = insn_data[icode].operand[2].mode;
647                   if (!VECTOR_MODE_P (optab_op2_mode))
648                     {
649                       need_same_oprnds = true;
650                       first_op1 = gimple_assign_rhs2 (stmt);
651                     }
652                 }
653             }
654           else if (rhs_code == WIDEN_LSHIFT_EXPR)
655             {
656               need_same_oprnds = true;
657               first_op1 = gimple_assign_rhs2 (stmt);
658             }
659         }
660       else
661         {
662           if (first_stmt_code != rhs_code
663               && (first_stmt_code != IMAGPART_EXPR
664                   || rhs_code != REALPART_EXPR)
665               && (first_stmt_code != REALPART_EXPR
666                   || rhs_code != IMAGPART_EXPR)
667               && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
668                    && (first_stmt_code == ARRAY_REF
669                        || first_stmt_code == BIT_FIELD_REF
670                        || first_stmt_code == INDIRECT_REF
671                        || first_stmt_code == COMPONENT_REF
672                        || first_stmt_code == MEM_REF)))
673             {
674               if (dump_enabled_p ())
675                 {
676                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
677                                    "Build SLP failed: different operation "
678                                    "in stmt ");
679                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
680                 }
681
682               vect_free_oprnd_info (oprnds_info);
683               return false;
684             }
685
686           if (need_same_oprnds
687               && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
688             {
689               if (dump_enabled_p ())
690                 {
691                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
692                                    "Build SLP failed: different shift "
693                                    "arguments in ");
694                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
695                 }
696
697               vect_free_oprnd_info (oprnds_info);
698               return false;
699             }
700
701           if (rhs_code == CALL_EXPR)
702             {
703               gimple first_stmt = stmts[0];
704               if (gimple_call_num_args (stmt) != nops
705                   || !operand_equal_p (gimple_call_fn (first_stmt),
706                                        gimple_call_fn (stmt), 0)
707                   || gimple_call_fntype (first_stmt)
708                      != gimple_call_fntype (stmt))
709                 {
710                   if (dump_enabled_p ())
711                     {
712                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
713                                        "Build SLP failed: different calls in ");
714                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
715                                         stmt, 0);
716                     }
717
718                   vect_free_oprnd_info (oprnds_info);
719                   return false;
720                 }
721             }
722         }
723
724       /* Grouped store or load.  */
725       if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
726         {
727           if (REFERENCE_CLASS_P (lhs))
728             {
729               /* Store.  */
730               if (!vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo,
731                                                 stmt, (i == 0), &oprnds_info))
732                 {
733                   vect_free_oprnd_info (oprnds_info);
734                   return false;
735                 }
736             }
737           else
738             {
739               /* Load.  */
740               unsigned unrolling_factor
741                 = least_common_multiple
742                     (*max_nunits, group_size) / group_size;
743               /* FORNOW: Check that there is no gap between the loads
744                  and no gap between the groups when we need to load
745                  multiple groups at once.
746                  ???  We should enhance this to only disallow gaps
747                  inside vectors.  */
748               if ((unrolling_factor > 1
749                    && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
750                    && GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
751                   || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt
752                       && GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
753                 {
754                   if (dump_enabled_p ())
755                     {
756                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
757                                        "Build SLP failed: grouped "
758                                        "loads have gaps ");
759                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
760                                         stmt, 0);
761                     }
762
763                   vect_free_oprnd_info (oprnds_info);
764                   return false;
765                 }
766
767               /* Check that the size of interleaved loads group is not
768                  greater than the SLP group size.  */
769               unsigned ncopies
770                 = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
771               if (loop_vinfo
772                   && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
773                   && ((GROUP_SIZE (vinfo_for_stmt (stmt))
774                        - GROUP_GAP (vinfo_for_stmt (stmt)))
775                       > ncopies * group_size))
776                 {
777                   if (dump_enabled_p ())
778                     {
779                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
780                                        "Build SLP failed: the number "
781                                        "of interleaved loads is greater than "
782                                        "the SLP group size ");
783                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
784                                         stmt, 0);
785                     }
786
787                   vect_free_oprnd_info (oprnds_info);
788                   return false;
789                 }
790
791               old_first_load = first_load;
792               first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
793               if (prev_first_load)
794                 {
795                   /* Check that there are no loads from different interleaving
796                      chains in the same node.  The only exception is complex
797                      numbers.  */
798                   if (prev_first_load != first_load
799                       && rhs_code != REALPART_EXPR
800                       && rhs_code != IMAGPART_EXPR)
801                     {
802                       if (dump_enabled_p ())
803                         {
804                           dump_printf_loc (MSG_MISSED_OPTIMIZATION,
805                                            vect_location, 
806                                            "Build SLP failed: different "
807                                            "interleaving chains in one node ");
808                           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
809                                             stmt, 0);
810                         }
811
812                       vect_free_oprnd_info (oprnds_info);
813                       return false;
814                     }
815                 }
816               else
817                 prev_first_load = first_load;
818
819               /* In some cases a group of loads is just the same load
820                  repeated N times.  Only analyze its cost once.  */
821               if (first_load == stmt && old_first_load != first_load)
822                 {
823                   first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
824                   if (vect_supportable_dr_alignment (first_dr, false)
825                       == dr_unaligned_unsupported)
826                     {
827                       if (dump_enabled_p ())
828                         {
829                           dump_printf_loc (MSG_MISSED_OPTIMIZATION,
830                                            vect_location, 
831                                            "Build SLP failed: unsupported "
832                                            "unaligned load ");
833                           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
834                                             stmt, 0);
835                         }
836
837                       vect_free_oprnd_info (oprnds_info);
838                       return false;
839                     }
840                 }
841
842               /* Store the place of this load in the interleaving chain.  In
843                  case that permutation is needed we later decide if a specific
844                  permutation is supported.  */
845               load_place = vect_get_place_in_interleaving_chain (stmt,
846                                                                  first_load);
847               if (load_place != i)
848                 permutation = true;
849
850               load_permutation->safe_push (load_place);
851
852               /* We stop the tree when we reach a group of loads.  */
853               stop_recursion = true;
854              continue;
855            }
856         } /* Grouped access.  */
857       else
858         {
859           if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
860             {
861               /* Not grouped load.  */
862               if (dump_enabled_p ())
863                 {
864                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
865                                    "Build SLP failed: not grouped load ");
866                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
867                 }
868
869               /* FORNOW: Not grouped loads are not supported.  */
870               vect_free_oprnd_info (oprnds_info);
871               return false;
872             }
873
874           /* Not memory operation.  */
875           if (TREE_CODE_CLASS (rhs_code) != tcc_binary
876               && TREE_CODE_CLASS (rhs_code) != tcc_unary
877               && rhs_code != COND_EXPR
878               && rhs_code != CALL_EXPR)
879             {
880               if (dump_enabled_p ())
881                 {
882                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
883                                    "Build SLP failed: operation");
884                   dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
885                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
886                 }
887
888               vect_free_oprnd_info (oprnds_info);
889               return false;
890             }
891
892           if (rhs_code == COND_EXPR)
893             {
894               tree cond_expr = gimple_assign_rhs1 (stmt);
895
896               if (i == 0)
897                 first_cond_code = TREE_CODE (cond_expr);
898               else if (first_cond_code != TREE_CODE (cond_expr))
899                 {
900                   if (dump_enabled_p ())
901                     {
902                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
903                                        "Build SLP failed: different"
904                                        " operation");
905                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
906                                         stmt, 0);
907                     }
908
909                   vect_free_oprnd_info (oprnds_info);
910                   return false;
911                 }
912             }
913
914           /* Find the def-stmts.  */
915           if (!vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo, stmt,
916                                             (i == 0), &oprnds_info))
917             {
918               vect_free_oprnd_info (oprnds_info);
919               return false;
920             }
921         }
922     }
923
924   /* Grouped loads were reached - stop the recursion.  */
925   if (stop_recursion)
926     {
927       loads->safe_push (*node);
928       if (permutation)
929         *loads_permuted = true;
930       else
931         {
932           /* We don't check here complex numbers chains, so we set
933              LOADS_PERMUTED for further check in
934              vect_supported_load_permutation_p.  */
935           if (rhs_code == REALPART_EXPR || rhs_code == IMAGPART_EXPR)
936             *loads_permuted = true;
937         }
938
939       vect_free_oprnd_info (oprnds_info);
940       return true;
941     }
942
943   /* Create SLP_TREE nodes for the definition node/s.  */
944   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
945     {
946       slp_tree child;
947
948       if (oprnd_info->first_dt != vect_internal_def)
949         continue;
950
951       child = vect_create_new_slp_node (oprnd_info->def_stmts);
952       if (!child
953           || !vect_build_slp_tree (loop_vinfo, bb_vinfo, &child, group_size,
954                                    max_nunits, load_permutation, loads,
955                                    vectorization_factor, loads_permuted))
956         {
957           if (child)
958             oprnd_info->def_stmts = vNULL;
959           vect_free_slp_tree (child);
960           vect_free_oprnd_info (oprnds_info);
961           return false;
962         }
963
964       oprnd_info->def_stmts.create (0);
965       SLP_TREE_CHILDREN (*node).quick_push (child);
966     }
967
968   vect_free_oprnd_info (oprnds_info);
969   return true;
970 }
971
972 /* Dump a slp tree NODE using flags specified in DUMP_KIND.  */
973
974 static void
975 vect_print_slp_tree (int dump_kind, slp_tree node)
976 {
977   int i;
978   gimple stmt;
979   slp_tree child;
980
981   if (!node)
982     return;
983
984   dump_printf (dump_kind, "node ");
985   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
986     {
987       dump_printf (dump_kind, "\n\tstmt %d ", i);
988       dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
989     }
990   dump_printf (dump_kind, "\n");
991
992   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
993     vect_print_slp_tree (dump_kind, child);
994 }
995
996
997 /* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
998    If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
999    J).  Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
1000    stmts in NODE are to be marked.  */
1001
1002 static void
1003 vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
1004 {
1005   int i;
1006   gimple stmt;
1007   slp_tree child;
1008
1009   if (!node)
1010     return;
1011
1012   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1013     if (j < 0 || i == j)
1014       STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;
1015
1016   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1017     vect_mark_slp_stmts (child, mark, j);
1018 }
1019
1020
1021 /* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */
1022
1023 static void
1024 vect_mark_slp_stmts_relevant (slp_tree node)
1025 {
1026   int i;
1027   gimple stmt;
1028   stmt_vec_info stmt_info;
1029   slp_tree child;
1030
1031   if (!node)
1032     return;
1033
1034   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1035     {
1036       stmt_info = vinfo_for_stmt (stmt);
1037       gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
1038                   || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
1039       STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
1040     }
1041
1042   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1043     vect_mark_slp_stmts_relevant (child);
1044 }
1045
1046
/* Check if the permutation required by the SLP INSTANCE is supported.
   Reorganize the SLP nodes stored in SLP_INSTANCE_LOADS if needed.  */

static bool
vect_supported_slp_permutation_p (slp_instance instance)
{
  /* Use the first load node / first scalar stmt as the representative of
     the interleaving chain all the load nodes must belong to.  */
  slp_tree node = SLP_INSTANCE_LOADS (instance)[0];
  gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  gimple first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
  vec<slp_tree> sorted_loads = vNULL;
  int index;
  slp_tree *tmp_loads = NULL;
  int group_size = SLP_INSTANCE_GROUP_SIZE (instance), i, j;
  slp_tree load;

  /* FORNOW: The only supported loads permutation is loads from the same
     location in all the loads in the node, when the data-refs in
     nodes of LOADS constitute an interleaving chain.
     Sort the nodes according to the order of accesses in the chain.  */
  tmp_loads = (slp_tree *) xmalloc (sizeof (slp_tree) * group_size);
  /* Each load node contributes GROUP_SIZE consecutive entries to the
     instance's load permutation, so step I by GROUP_SIZE to read only the
     first entry of every node while J walks the nodes one by one.  */
  for (i = 0, j = 0;
       SLP_INSTANCE_LOAD_PERMUTATION (instance).iterate (i, &index)
       && SLP_INSTANCE_LOADS (instance).iterate (j, &load);
       i += group_size, j++)
    {
      gimple scalar_stmt = SLP_TREE_SCALAR_STMTS (load)[0];
      /* Check that the loads are all in the same interleaving chain.  */
      if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (scalar_stmt)) != first_load)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unsupported data "
                               "permutation ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                scalar_stmt, 0);
            }

          free (tmp_loads);
          return false;
        }

      /* Place the node at its position in the interleaving chain; the
	 sorted copy below then reads them back in chain order.  */
      tmp_loads[index] = load;
    }

  sorted_loads.create (group_size);
  for (i = 0; i < group_size; i++)
     sorted_loads.safe_push (tmp_loads[i]);

  /* Replace the instance's load list with the sorted one; the old vector
     is released here, so ownership transfers to the instance.  */
  SLP_INSTANCE_LOADS (instance).release ();
  SLP_INSTANCE_LOADS (instance) = sorted_loads;
  free (tmp_loads);

  /* Final check: ask the target whether the permutation itself can be
     code-generated (last argument true presumably means analysis-only,
     no code emitted -- confirm against vect_transform_slp_perm_load).  */
  if (!vect_transform_slp_perm_load (stmt, vNULL, NULL,
                                     SLP_INSTANCE_UNROLLING_FACTOR (instance),
                                     instance, true))
    return false;

  return true;
}
1107
1108
1109 /* Rearrange the statements of NODE according to PERMUTATION.  */
1110
1111 static void
1112 vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
1113                           vec<int> permutation)
1114 {
1115   gimple stmt;
1116   vec<gimple> tmp_stmts;
1117   unsigned int i;
1118   slp_tree child;
1119
1120   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1121     vect_slp_rearrange_stmts (child, group_size, permutation);
1122
1123   gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
1124   tmp_stmts.create (group_size);
1125   tmp_stmts.quick_grow_cleared (group_size);
1126
1127   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1128     tmp_stmts[permutation[i]] = stmt;
1129
1130   SLP_TREE_SCALAR_STMTS (node).release ();
1131   SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
1132 }
1133
1134
/* Check if the required load permutation is supported.
   LOAD_PERMUTATION contains a list of indices of the loads.
   In SLP this permutation is relative to the order of grouped stores that are
   the base of the SLP instance.  */

static bool
vect_supported_load_permutation_p (slp_instance slp_instn, int group_size,
                                   vec<int> load_permutation)
{
  int i = 0, j, prev = -1, next, k, number_of_groups;
  bool supported, bad_permutation = false;
  sbitmap load_index;
  slp_tree node, other_complex_node;
  gimple stmt, first = NULL, other_node_first, load, next_load, first_load;
  unsigned complex_numbers = 0;
  struct data_reference *dr;
  bb_vec_info bb_vinfo;

  /* FORNOW: permutations are only supported in SLP.  */
  if (!slp_instn)
    return false;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (load_permutation, i, next)
        dump_printf (MSG_NOTE, "%d ", next);
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this 
     permutation).  */

  /* Check that all the load nodes are of the same size.  Also count how
     many of them start with a REALPART/IMAGPART access -- those are the
     complex-number chains handled specially below.  */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
        return false;

      stmt = SLP_TREE_SCALAR_STMTS (node)[0];
      if (is_gimple_assign (stmt) 
          && (gimple_assign_rhs_code (stmt) == REALPART_EXPR
              || gimple_assign_rhs_code (stmt) == IMAGPART_EXPR))
        complex_numbers++;
    }

  /* Complex operands can be swapped as following:
      real_c = real_b + real_a;
      imag_c = imag_a + imag_b;
     i.e., we have {real_b, imag_a} and {real_a, imag_b} instead of 
     {real_a, imag_a} and {real_b, imag_b}.  We check here that if interleaving
     chains are mixed, they match the above pattern.  */
  if (complex_numbers)
    {
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, stmt)
            {
              if (j == 0)
                first = stmt;
              else
                {
                  /* A stmt from a different chain is only acceptable in the
                     exact two-node complex pattern above.  */
                  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != first)
                    {
                      if (complex_numbers != 2)
                        return false;

                      /* K indexes the "other" of the two load nodes.  */
                      if (i == 0)
                        k = 1;
                      else
                        k = 0;
 
                      other_complex_node = SLP_INSTANCE_LOADS (slp_instn)[k];
                      other_node_first = 
                                SLP_TREE_SCALAR_STMTS (other_complex_node)[0];

                      /* The odd stmt must come from the chain headed by the
                         other node's first stmt.  */
                      if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
                          != other_node_first)
                       return false;
                    }
                }
            }
        }
    }

  /* We checked that this case ok, so there is no need to proceed with 
     permutation tests.  */
  if (complex_numbers == 2
      && SLP_INSTANCE_LOADS (slp_instn).length () == 2)
    {
      SLP_INSTANCE_LOADS (slp_instn).release ();
      SLP_INSTANCE_LOAD_PERMUTATION (slp_instn).release ();
      return true;
    }
                   
  node = SLP_INSTANCE_TREE (slp_instn);
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  /* LOAD_PERMUTATION is a list of indices of all the loads of the SLP
     instance, not all the loads belong to the same node or interleaving
     group.  Hence, we need to divide them into groups according to
     GROUP_SIZE.  */
  number_of_groups = load_permutation.length () / group_size;

  /* Reduction (there are no data-refs in the root).
     In reduction chain the order of the loads is important.  */
  if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
      && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      int first_group_load_index;

      /* Compare all the permutation sequences to the first one.  All the
         GROUP_SIZE-long subsequences must be identical.  */
      for (i = 1; i < number_of_groups; i++)
        {
          k = 0;
          for (j = i * group_size; j < i * group_size + group_size; j++)
            {
              next = load_permutation[j];
              first_group_load_index = load_permutation[k];

              if (next != first_group_load_index)
                {
                  bad_permutation = true;
                  break;
                }

              k++;
            }

          if (bad_permutation)
            break;
        }

      if (!bad_permutation)
        {
          /* Check that the loads in the first sequence are different and there
             are no gaps between them.  */
          load_index = sbitmap_alloc (group_size);
          bitmap_clear (load_index);
          for (k = 0; k < group_size; k++)
            {
              first_group_load_index = load_permutation[k];
              /* A repeated index means the same load appears twice.  */
              if (bitmap_bit_p (load_index, first_group_load_index))
                {
                  bad_permutation = true;
                  break;
                }

              bitmap_set_bit (load_index, first_group_load_index);
            }

          /* An unset bit means some chain position is never loaded, i.e.
             there is a gap.  */
          if (!bad_permutation)
            for (k = 0; k < group_size; k++)
              if (!bitmap_bit_p (load_index, k))
                {
                  bad_permutation = true;
                  break;
                }

          sbitmap_free (load_index);
        }

      if (!bad_permutation)
        {
          /* This permutation is valid for reduction.  Since the order of the
             statements in the nodes is not important unless they are memory
             accesses, we can rearrange the statements in all the nodes 
             according to the order of the loads.  */
          vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
                                    load_permutation);
          SLP_INSTANCE_LOAD_PERMUTATION (slp_instn).release ();
          return true;
        }
    }

  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment compications.  */
  bb_vinfo = STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt));
  bad_permutation = false;
  /* Check that for every node in the instance the loads form a subchain:
     same chain head throughout, and each load is the chain successor of
     the previous one.  */
  if (bb_vinfo)
    {
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          next_load = NULL;
          first_load = NULL;
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
            {
              if (!first_load)
                first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (load));
              else if (first_load
                         != GROUP_FIRST_ELEMENT (vinfo_for_stmt (load)))
                {
                  bad_permutation = true;
                  break;
                }

              /* Consecutive loads in the node must be consecutive in the
                 interleaving chain.  */
              if (j != 0 && next_load != load)
                {
                  bad_permutation = true;
                  break;
                }

              next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
            }

          if (bad_permutation)
            break;
        }

      /* Check that the alignment of the first load in every subchain, i.e.,
         the first statement in every load node, is supported.  */
      if (!bad_permutation)
        {
          FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
            {
              first_load = SLP_TREE_SCALAR_STMTS (node)[0];
              /* Only subchains that do not start at the chain head need the
                 alignment check; the head's alignment was checked when the
                 tree was built.  */
              if (first_load
                    != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load)))
                {
                  dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load));
                  if (vect_supportable_dr_alignment (dr, false)
                       == dr_unaligned_unsupported)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location, 
                                           "unsupported unaligned load ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            first_load, 0);
                        }
                      bad_permutation = true;
                      break;
                    }
                }
            }

          if (!bad_permutation)
            {
              SLP_INSTANCE_LOAD_PERMUTATION (slp_instn).release ();
              return true;
            }
        }
    }

  /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
     GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
     well (unless it's reduction).  */
  if (load_permutation.length ()
      != (unsigned int) (group_size * group_size))
    return false;

  supported = true;
  load_index = sbitmap_alloc (group_size);
  bitmap_clear (load_index);
  /* Verify that each GROUP_SIZE-long run of the permutation repeats a single
     index, and that across runs each index appears exactly once.  */
  for (j = 0; j < group_size; j++)
    {
      for (i = j * group_size, k = 0;
           load_permutation.iterate (i, &next) && k < group_size;
           i++, k++)
       {
         if (i != j * group_size && next != prev)
          {
            supported = false;
            break;
          }

         prev = next;
       }

      /* The run's index must not have been used by an earlier run.  */
      if (bitmap_bit_p (load_index, prev))
        {
          supported = false;
          break;
        }

      bitmap_set_bit (load_index, prev);
    }
 
  /* Every index in 0..GROUP_SIZE-1 must be covered (no gaps).  */
  for (j = 0; j < group_size; j++)
    if (!bitmap_bit_p (load_index, j))
      {
        sbitmap_free (load_index);
        return false;
      }

  sbitmap_free (load_index);

  /* I == GROUP_SIZE * GROUP_SIZE confirms the scan consumed the whole
     permutation; finally let vect_supported_slp_permutation_p sort the
     load nodes and query the target.  */
  if (supported && i == group_size * group_size
      && vect_supported_slp_permutation_p (slp_instn))
    return true;

  return false;
}
1433
1434
1435 /* Find the first load in the loop that belongs to INSTANCE.
1436    When loads are in several SLP nodes, there can be a case in which the first
1437    load does not appear in the first SLP node to be transformed, causing
1438    incorrect order of statements.  Since we generate all the loads together,
1439    they must be inserted before the first load of the SLP instance and not
1440    before the first load of the first node of the instance.  */
1441
1442 static gimple
1443 vect_find_first_load_in_slp_instance (slp_instance instance)
1444 {
1445   int i, j;
1446   slp_tree load_node;
1447   gimple first_load = NULL, load;
1448
1449   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
1450     FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
1451       first_load = get_earlier_stmt (load, first_load);
1452
1453   return first_load;
1454 }
1455
1456
1457 /* Find the last store in SLP INSTANCE.  */
1458
1459 static gimple
1460 vect_find_last_store_in_slp_instance (slp_instance instance)
1461 {
1462   int i;
1463   slp_tree node;
1464   gimple last_store = NULL, store;
1465
1466   node = SLP_INSTANCE_TREE (instance);
1467   for (i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &store); i++)
1468     last_store = get_later_stmt (store, last_store);
1469
1470   return last_store;
1471 }
1472
/* Compute the cost for the SLP node NODE in the SLP instance INSTANCE.
   Body costs accumulate into the instance's body cost vector; prologue
   costs (for constants/externals) go into PROLOGUE_COST_VEC so they can
   be committed only after SLP analysis succeeds.  NCOPIES_FOR_COST is the
   number of vector stmts that will be generated per scalar stmt.  */

static void
vect_analyze_slp_cost_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                         slp_instance instance, slp_tree node,
                         stmt_vector_for_cost *prologue_cost_vec,
                         unsigned ncopies_for_cost)
{
  stmt_vector_for_cost *body_cost_vec = &SLP_INSTANCE_BODY_COST_VEC (instance);

  unsigned i;
  slp_tree child;
  gimple stmt, s;
  stmt_vec_info stmt_info;
  tree lhs;
  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  /* Recurse down the SLP tree.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_analyze_slp_cost_1 (loop_vinfo, bb_vinfo,
                             instance, child, prologue_cost_vec,
                             ncopies_for_cost);

  /* Look at the first scalar stmt to determine the cost.  */
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* Grouped memory access: model as a store or a load.  */
      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
        vect_model_store_cost (stmt_info, ncopies_for_cost, false,
                               vect_uninitialized_def,
                               node, prologue_cost_vec, body_cost_vec);
      else
        {
          int i;  /* Shadows the outer I intentionally for the scan below.  */
          gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
          vect_model_load_cost (stmt_info, ncopies_for_cost, false,
                                node, prologue_cost_vec, body_cost_vec);
          /* If the load is permuted record the cost for the permutation.
             ???  Loads from multiple chains are let through here only
             for a single special case involving complex numbers where
             in the end no permutation is necessary.  */
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, s)
            if ((STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo_for_stmt (s))
                 == STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info))
                && vect_get_place_in_interleaving_chain
                     (s, STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info)) != i)
              {
                /* One permutation cost for the whole node is enough.  */
                record_stmt_cost (body_cost_vec, group_size, vec_perm,
                                  stmt_info, 0, vect_body);
                break;
              }
        }
    }
  else
    /* Non-memory stmt: one generic vector stmt per copy.  */
    record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
                      stmt_info, 0, vect_body);

  /* Scan operands and account for prologue cost of constants/externals.
     ???  This over-estimates cost for multiple uses and should be
     re-engineered.  */
  lhs = gimple_get_lhs (stmt);
  for (i = 0; i < gimple_num_ops (stmt); ++i)
    {
      tree def, op = gimple_op (stmt, i);
      gimple def_stmt;
      enum vect_def_type dt;
      /* Skip absent operands and the lhs -- only inputs need invariant
	 setup code.  */
      if (!op || op == lhs)
        continue;
      if (vect_is_simple_use (op, NULL, loop_vinfo, bb_vinfo,
                              &def_stmt, &def, &dt)
          && (dt == vect_constant_def || dt == vect_external_def))
        record_stmt_cost (prologue_cost_vec, 1, vector_stmt,
                          stmt_info, 0, vect_prologue);
    }
}
1549
/* Compute the cost for the SLP instance INSTANCE, given NUNITS elements
   per vector.  Fills the instance's body cost vector and commits the
   prologue costs to the target cost model immediately.  */

static void
vect_analyze_slp_cost (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                       slp_instance instance, unsigned nunits)
{
  stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
  unsigned ncopies_for_cost;
  stmt_info_for_cost *si;
  unsigned i;

  /* Calculate the number of vector stmts to create based on the unrolling
     factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
     GROUP_SIZE / NUNITS otherwise.  */
  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
  ncopies_for_cost = least_common_multiple (nunits, group_size) / nunits;

  prologue_cost_vec.create (10);
  body_cost_vec.create (10);
  /* The body cost vector is owned by the instance from here on; the
     recursion below appends to it through the instance.  */
  SLP_INSTANCE_BODY_COST_VEC (instance) = body_cost_vec;
  vect_analyze_slp_cost_1 (loop_vinfo, bb_vinfo,
                           instance, SLP_INSTANCE_TREE (instance),
                           &prologue_cost_vec, ncopies_for_cost);

  /* Record the prologue costs, which were delayed until we were
     sure that SLP was successful.  Unlike the body costs, we know
     the final values now regardless of the loop vectorization factor.  */
  void *data = (loop_vinfo ? LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
                : BB_VINFO_TARGET_COST_DATA (bb_vinfo));
  FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
    {
      struct _stmt_vec_info *stmt_info
        = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
      (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
                            si->misalign, vect_prologue);
    }

  prologue_cost_vec.release ();
}
1589
1590 /* Analyze an SLP instance starting from a group of grouped stores.  Call
1591    vect_build_slp_tree to build a tree of packed stmts if possible.
1592    Return FALSE if it's impossible to SLP any stmt in the loop.  */
1593
1594 static bool
1595 vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
1596                            gimple stmt)
1597 {
1598   slp_instance new_instance;
1599   slp_tree node;
1600   unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
1601   unsigned int unrolling_factor = 1, nunits;
1602   tree vectype, scalar_type = NULL_TREE;
1603   gimple next;
1604   unsigned int vectorization_factor = 0;
1605   int i;
1606   unsigned int max_nunits = 0;
1607   vec<int> load_permutation;
1608   vec<slp_tree> loads;
1609   struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
1610   bool loads_permuted = false;
1611   vec<gimple> scalar_stmts;
1612
1613   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1614     {
1615       if (dr)
1616         {
1617           scalar_type = TREE_TYPE (DR_REF (dr));
1618           vectype = get_vectype_for_scalar_type (scalar_type);
1619         }
1620       else
1621         {
1622           gcc_assert (loop_vinfo);
1623           vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1624         }
1625
1626       group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
1627     }
1628   else
1629     {
1630       gcc_assert (loop_vinfo);
1631       vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1632       group_size = LOOP_VINFO_REDUCTIONS (loop_vinfo).length ();
1633     }
1634
1635   if (!vectype)
1636     {
1637       if (dump_enabled_p ())
1638         {
1639           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1640                            "Build SLP failed: unsupported data-type ");
1641           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
1642         }
1643
1644       return false;
1645     }
1646
1647   nunits = TYPE_VECTOR_SUBPARTS (vectype);
1648   if (loop_vinfo)
1649     vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1650   else
1651     vectorization_factor = nunits;
1652
1653   /* Calculate the unrolling factor.  */
1654   unrolling_factor = least_common_multiple (nunits, group_size) / group_size;
1655   if (unrolling_factor != 1 && !loop_vinfo)
1656     {
1657       if (dump_enabled_p ())
1658         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
1659                          "Build SLP failed: unrolling required in basic"
1660                          " block SLP");
1661
1662       return false;
1663     }
1664
1665   /* Create a node (a root of the SLP tree) for the packed grouped stores.  */
1666   scalar_stmts.create (group_size);
1667   next = stmt;
1668   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1669     {
1670       /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
1671       while (next)
1672         {
1673           if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
1674               && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
1675             scalar_stmts.safe_push (
1676                   STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
1677           else
1678             scalar_stmts.safe_push (next);
1679           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
1680         }
1681     }
1682   else
1683     {
1684       /* Collect reduction statements.  */
1685       vec<gimple> reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
1686       for (i = 0; reductions.iterate (i, &next); i++)
1687         scalar_stmts.safe_push (next);
1688     }
1689
1690   node = vect_create_new_slp_node (scalar_stmts);
1691
1692   load_permutation.create (group_size * group_size);
1693   loads.create (group_size);
1694
1695   /* Build the tree for the SLP instance.  */
1696   if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &node, group_size,
1697                            &max_nunits, &load_permutation, &loads,
1698                            vectorization_factor, &loads_permuted))
1699     {
1700       /* Calculate the unrolling factor based on the smallest type.  */
1701       if (max_nunits > nunits)
1702         unrolling_factor = least_common_multiple (max_nunits, group_size)
1703                            / group_size;
1704
1705       if (unrolling_factor != 1 && !loop_vinfo)
1706         {
1707           if (dump_enabled_p ())
1708             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
1709                              "Build SLP failed: unrolling required in basic"
1710                              " block SLP");
1711           vect_free_slp_tree (node);
1712           load_permutation.release ();
1713           loads.release ();
1714           return false;
1715         }
1716
1717       /* Create a new SLP instance.  */
1718       new_instance = XNEW (struct _slp_instance);
1719       SLP_INSTANCE_TREE (new_instance) = node;
1720       SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
1721       SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
1722       SLP_INSTANCE_BODY_COST_VEC (new_instance) = vNULL;
1723       SLP_INSTANCE_LOADS (new_instance) = loads;
1724       SLP_INSTANCE_FIRST_LOAD_STMT (new_instance) = NULL;
1725       SLP_INSTANCE_LOAD_PERMUTATION (new_instance) = load_permutation;
1726
1727       if (loads_permuted)
1728         {
1729           if (!vect_supported_load_permutation_p (new_instance, group_size,
1730                                                   load_permutation))
1731             {
1732               if (dump_enabled_p ())
1733                 {
1734                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
1735                                    "Build SLP failed: unsupported load "
1736                                    "permutation ");
1737                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
1738                 }
1739
1740               vect_free_slp_instance (new_instance);
1741               return false;
1742             }
1743
1744           SLP_INSTANCE_FIRST_LOAD_STMT (new_instance)
1745              = vect_find_first_load_in_slp_instance (new_instance);
1746         }
1747       else
1748         SLP_INSTANCE_LOAD_PERMUTATION (new_instance).release ();
1749
1750       /* Compute the costs of this SLP instance.  */
1751       vect_analyze_slp_cost (loop_vinfo, bb_vinfo,
1752                              new_instance, TYPE_VECTOR_SUBPARTS (vectype));
1753
1754       if (loop_vinfo)
1755         LOOP_VINFO_SLP_INSTANCES (loop_vinfo).safe_push (new_instance);
1756       else
1757         BB_VINFO_SLP_INSTANCES (bb_vinfo).safe_push (new_instance);
1758
1759       if (dump_enabled_p ())
1760         vect_print_slp_tree (MSG_NOTE, node);
1761
1762       return true;
1763     }
1764
1765   /* Failed to SLP.  */
1766   /* Free the allocated memory.  */
1767   vect_free_slp_tree (node);
1768   load_permutation.release ();
1769   loads.release ();
1770
1771   return false;
1772 }
1773
1774
/* Check if there are stmts in the loop can be vectorized using SLP.  Build SLP
   trees of packed scalar stmts if SLP is possible.  */

bool
vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
{
  unsigned int i;
  vec<gimple> grouped_stores;
  vec<gimple> reductions = vNULL;
  vec<gimple> reduc_chains = vNULL;
  gimple first_element;
  bool ok = false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===");

  /* Collect the SLP candidate groups from either the loop info or the
     basic-block info; only loop vectorization has reductions.  */
  if (loop_vinfo)
    {
      grouped_stores = LOOP_VINFO_GROUPED_STORES (loop_vinfo);
      reduc_chains = LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo);
      reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
    }
  else
    grouped_stores = BB_VINFO_GROUPED_STORES (bb_vinfo);

  /* Find SLP sequences starting from groups of grouped stores.  */
  FOR_EACH_VEC_ELT (grouped_stores, i, first_element)
    if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element))
      ok = true;

  /* In the basic-block case SLP is all we can do, so fail outright if no
     instance was built from the stores.  */
  if (bb_vinfo && !ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Failed to SLP the basic block.");

      return false;
    }

  if (loop_vinfo
      && LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).length () > 0)
    {
      /* Find SLP sequences starting from reduction chains.  A failure for
         any chain aborts the whole analysis.  */
      FOR_EACH_VEC_ELT (reduc_chains, i, first_element)
        if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element))
          ok = true;
        else
          return false;

      /* Don't try to vectorize SLP reductions if reduction chain was
         detected.  */
      return ok;
    }

  /* Find SLP sequences starting from groups of reductions.  Only the first
     reduction stmt is passed; the instance builder collects the rest.  */
  if (loop_vinfo && LOOP_VINFO_REDUCTIONS (loop_vinfo).length () > 1
      && vect_analyze_slp_instance (loop_vinfo, bb_vinfo, reductions[0]))
    ok = true;

  /* NOTE(review): OK is dead past this point -- in the loop case the
     function deliberately returns true even when no SLP instance was
     built; confirm callers do not rely on a failure indication here.  */
  return true;
}
1836
1837
1838 /* For each possible SLP instance decide whether to SLP it and calculate overall
1839    unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
1840    least one instance.  */
1841
1842 bool
1843 vect_make_slp_decision (loop_vec_info loop_vinfo)
1844 {
1845   unsigned int i, unrolling_factor = 1;
1846   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1847   slp_instance instance;
1848   int decided_to_slp = 0;
1849
1850   if (dump_enabled_p ())
1851     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ===");
1852
1853   FOR_EACH_VEC_ELT (slp_instances, i, instance)
1854     {
1855       /* FORNOW: SLP if you can.  */
1856       if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance))
1857         unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance);
1858
1859       /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
1860          call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
1861          loop-based vectorization.  Such stmts will be marked as HYBRID.  */
1862       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
1863       decided_to_slp++;
1864     }
1865
1866   LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
1867
1868   if (decided_to_slp && dump_enabled_p ())
1869     dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
1870                      "Decided to SLP %d instances. Unrolling factor %d",
1871                      decided_to_slp, unrolling_factor);
1872
1873   return (decided_to_slp > 0);
1874 }
1875
1876
1877 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
1878    can't be SLPed) in the tree rooted at NODE.  Mark such stmts as HYBRID.  */
1879
1880 static void
1881 vect_detect_hybrid_slp_stmts (slp_tree node)
1882 {
1883   int i;
1884   vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (node);
1885   gimple stmt = stmts[0];
1886   imm_use_iterator imm_iter;
1887   gimple use_stmt;
1888   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1889   slp_tree child;
1890   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1891   struct loop *loop = NULL;
1892   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_vinfo);
1893   basic_block bb = NULL;
1894
1895   if (!node)
1896     return;
1897
1898   if (loop_vinfo)
1899     loop = LOOP_VINFO_LOOP (loop_vinfo);
1900   else
1901     bb = BB_VINFO_BB (bb_vinfo);
1902
1903   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1904     if (PURE_SLP_STMT (vinfo_for_stmt (stmt))
1905         && TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME)
1906       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0))
1907         if (gimple_bb (use_stmt)
1908             && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1909                  || bb == gimple_bb (use_stmt))
1910             && (stmt_vinfo = vinfo_for_stmt (use_stmt))
1911             && !STMT_SLP_TYPE (stmt_vinfo)
1912             && (STMT_VINFO_RELEVANT (stmt_vinfo)
1913                 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo)))
1914             && !(gimple_code (use_stmt) == GIMPLE_PHI
1915                  && STMT_VINFO_DEF_TYPE (stmt_vinfo)
1916                   == vect_reduction_def))
1917           vect_mark_slp_stmts (node, hybrid, i);
1918
1919   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1920     vect_detect_hybrid_slp_stmts (child);
1921 }
1922
1923
1924 /* Find stmts that must be both vectorized and SLPed.  */
1925
1926 void
1927 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
1928 {
1929   unsigned int i;
1930   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1931   slp_instance instance;
1932
1933   if (dump_enabled_p ())
1934     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ===");
1935
1936   FOR_EACH_VEC_ELT (slp_instances, i, instance)
1937     vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance));
1938 }
1939
1940
1941 /* Create and initialize a new bb_vec_info struct for BB, as well as
1942    stmt_vec_info structs for all the stmts in it.  */
1943
1944 static bb_vec_info
1945 new_bb_vec_info (basic_block bb)
1946 {
1947   bb_vec_info res = NULL;
1948   gimple_stmt_iterator gsi;
1949
1950   res = (bb_vec_info) xcalloc (1, sizeof (struct _bb_vec_info));
1951   BB_VINFO_BB (res) = bb;
1952
1953   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1954     {
1955       gimple stmt = gsi_stmt (gsi);
1956       gimple_set_uid (stmt, 0);
1957       set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, NULL, res));
1958     }
1959
1960   BB_VINFO_GROUPED_STORES (res).create (10);
1961   BB_VINFO_SLP_INSTANCES (res).create (2);
1962   BB_VINFO_TARGET_COST_DATA (res) = init_cost (NULL);
1963
1964   bb->aux = res;
1965   return res;
1966 }
1967
1968
1969 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
1970    stmts in the basic block.  */
1971
1972 static void
1973 destroy_bb_vec_info (bb_vec_info bb_vinfo)
1974 {
1975   vec<slp_instance> slp_instances;
1976   slp_instance instance;
1977   basic_block bb;
1978   gimple_stmt_iterator si;
1979   unsigned i;
1980
1981   if (!bb_vinfo)
1982     return;
1983
1984   bb = BB_VINFO_BB (bb_vinfo);
1985
1986   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1987     {
1988       gimple stmt = gsi_stmt (si);
1989       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1990
1991       if (stmt_info)
1992         /* Free stmt_vec_info.  */
1993         free_stmt_vec_info (stmt);
1994     }
1995
1996   free_data_refs (BB_VINFO_DATAREFS (bb_vinfo));
1997   free_dependence_relations (BB_VINFO_DDRS (bb_vinfo));
1998   BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
1999   slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2000   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2001     vect_free_slp_instance (instance);
2002   BB_VINFO_SLP_INSTANCES (bb_vinfo).release ();
2003   destroy_cost_data (BB_VINFO_TARGET_COST_DATA (bb_vinfo));
2004   free (bb_vinfo);
2005   bb->aux = NULL;
2006 }
2007
2008
2009 /* Analyze statements contained in SLP tree node after recursively analyzing
2010    the subtree. Return TRUE if the operations are supported.  */
2011
2012 static bool
2013 vect_slp_analyze_node_operations (bb_vec_info bb_vinfo, slp_tree node)
2014 {
2015   bool dummy;
2016   int i;
2017   gimple stmt;
2018   slp_tree child;
2019
2020   if (!node)
2021     return true;
2022
2023   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2024     if (!vect_slp_analyze_node_operations (bb_vinfo, child))
2025       return false;
2026
2027   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2028     {
2029       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2030       gcc_assert (stmt_info);
2031       gcc_assert (PURE_SLP_STMT (stmt_info));
2032
2033       if (!vect_analyze_stmt (stmt, &dummy, node))
2034         return false;
2035     }
2036
2037   return true;
2038 }
2039
2040
2041 /* Analyze statements in SLP instances of the basic block.  Return TRUE if the
2042    operations are supported. */
2043
2044 static bool
2045 vect_slp_analyze_operations (bb_vec_info bb_vinfo)
2046 {
2047   vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2048   slp_instance instance;
2049   int i;
2050
2051   for (i = 0; slp_instances.iterate (i, &instance); )
2052     {
2053       if (!vect_slp_analyze_node_operations (bb_vinfo,
2054                                              SLP_INSTANCE_TREE (instance)))
2055         {
2056           vect_free_slp_instance (instance);
2057           slp_instances.ordered_remove (i);
2058         }
2059       else
2060         i++;
2061     }
2062
2063   if (!slp_instances.length ())
2064     return false;
2065
2066   return true;
2067 }
2068
/* Check if vectorization of the basic block is profitable.  Compares the
   target-model vector cost of all SLP instances (body + prologue +
   epilogue) against the summed scalar cost of the stmts they replace.  */

static bool
vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
{
  vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
  slp_instance instance;
  int i, j;
  unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
  unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
  unsigned int stmt_cost;
  gimple stmt;
  gimple_stmt_iterator si;
  basic_block bb = BB_VINFO_BB (bb_vinfo);
  void *target_cost_data = BB_VINFO_TARGET_COST_DATA (bb_vinfo);
  stmt_vec_info stmt_info = NULL;
  stmt_vector_for_cost body_cost_vec;
  stmt_info_for_cost *ci;

  /* Calculate vector costs.  Each recorded body-cost entry of every
     instance is fed to the target cost model; the totals come back from
     finish_cost below.  */
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);

      FOR_EACH_VEC_ELT (body_cost_vec, j, ci)
        {
          /* CI->stmt may be NULL for costs not tied to a stmt.  */
          stmt_info = ci->stmt ? vinfo_for_stmt (ci->stmt) : NULL;
          (void) add_stmt_cost (target_cost_data, ci->count, ci->kind,
                                stmt_info, ci->misalign, vect_body);
        }
    }

  /* Calculate scalar cost.  Only stmts that will actually be replaced
     (vectorizable, pure SLP) are counted.  */
  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      stmt = gsi_stmt (si);
      stmt_info = vinfo_for_stmt (stmt);

      if (!stmt_info || !STMT_VINFO_VECTORIZABLE (stmt_info)
          || !PURE_SLP_STMT (stmt_info))
        continue;

      /* Loads, stores and plain stmts have distinct scalar costs.  */
      if (STMT_VINFO_DATA_REF (stmt_info))
        {
          if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
            stmt_cost = vect_get_stmt_cost (scalar_load);
          else
            stmt_cost = vect_get_stmt_cost (scalar_store);
        }
      else
        stmt_cost = vect_get_stmt_cost (scalar_stmt);

      scalar_cost += stmt_cost;
    }

  /* Complete the target-specific cost calculation.  */
  finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
               &vec_inside_cost, &vec_epilogue_cost);

  vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
      dump_printf (MSG_NOTE, "  Vector inside of basic block cost: %d\n",
                   vec_inside_cost);
      dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n", vec_prologue_cost);
      dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n", vec_epilogue_cost);
      dump_printf (MSG_NOTE, "  Scalar cost of basic block: %d", scalar_cost);
    }

  /* Vectorization is profitable if its cost is less than the cost of scalar
     version.  */
  if (vec_outside_cost + vec_inside_cost >= scalar_cost)
    return false;

  return true;
}
2147
/* Check if the basic block can be vectorized.  Runs the full analysis
   pipeline (data refs, accesses, patterns, dependences, alignment, SLP
   discovery, operation check, cost model) at the current vector size.
   Returns the populated bb_vec_info on success; on any failure the
   bb_vec_info is destroyed and NULL is returned.  */

static bb_vec_info
vect_slp_analyze_bb_1 (basic_block bb)
{
  bb_vec_info bb_vinfo;
  vec<slp_instance> slp_instances;
  slp_instance instance;
  int i;
  int min_vf = 2;

  bb_vinfo = new_bb_vec_info (bb);
  if (!bb_vinfo)
    return NULL;

  /* Find and analyze the data references in the block.  */
  if (!vect_analyze_data_refs (NULL, bb_vinfo, &min_vf))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: unhandled data-ref in basic "
                         "block.\n");

      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  /* SLP needs at least two data refs to group.  */
  if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: not enough data-refs in "
                         "basic block.\n");

      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  if (!vect_analyze_data_ref_accesses (NULL, bb_vinfo))
    {
     if (dump_enabled_p ())
       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                        "not vectorized: unhandled data access in "
                        "basic block.\n");

      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  /* Recognize computation idioms before checking dependences.  */
  vect_pattern_recog (NULL, bb_vinfo);

  if (!vect_slp_analyze_data_ref_dependences (bb_vinfo))
     {
       if (dump_enabled_p ())
         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                          "not vectorized: unhandled data dependence "
                          "in basic block.\n");

       destroy_bb_vec_info (bb_vinfo);
       return NULL;
     }

  if (!vect_analyze_data_refs_alignment (NULL, bb_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: bad data alignment in basic "
                         "block.\n");

      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  /* Check the SLP opportunities in the basic block, analyze and build SLP
     trees.  */
  if (!vect_analyze_slp (NULL, bb_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
                         "not vectorized: failed to find SLP opportunities "
                         "in basic block.\n");

      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);

  /* Mark all the statements that we want to vectorize as pure SLP and
     relevant.  */
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
      vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
    }

  /* Re-verify alignment now that the stmts to vectorize are known.  */
  if (!vect_verify_datarefs_alignment (NULL, bb_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: unsupported alignment in basic "
                         "block.\n");
      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  if (!vect_slp_analyze_operations (bb_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
                         "not vectorized: bad operation in basic block.\n");

      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  /* Cost model: check if the vectorization is worthwhile.  */
  if (flag_vect_cost_model
      && !vect_bb_vectorization_profitable_p (bb_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: vectorization is not "
                         "profitable.\n");

      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "Basic block will be vectorized using SLP\n");

  return bb_vinfo;
}
2282
2283
2284 bb_vec_info
2285 vect_slp_analyze_bb (basic_block bb)
2286 {
2287   bb_vec_info bb_vinfo;
2288   int insns = 0;
2289   gimple_stmt_iterator gsi;
2290   unsigned int vector_sizes;
2291
2292   if (dump_enabled_p ())
2293     dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");
2294
2295   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2296     {
2297       gimple stmt = gsi_stmt (gsi);
2298       if (!is_gimple_debug (stmt)
2299           && !gimple_nop_p (stmt)
2300           && gimple_code (stmt) != GIMPLE_LABEL)
2301         insns++;
2302     }
2303
2304   if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2305     {
2306       if (dump_enabled_p ())
2307         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2308                          "not vectorized: too many instructions in "
2309                          "basic block.\n");
2310
2311       return NULL;
2312     }
2313
2314   /* Autodetect first vector size we try.  */
2315   current_vector_size = 0;
2316   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2317
2318   while (1)
2319     {
2320       bb_vinfo = vect_slp_analyze_bb_1 (bb);
2321       if (bb_vinfo)
2322         return bb_vinfo;
2323
2324       destroy_bb_vec_info (bb_vinfo);
2325
2326       vector_sizes &= ~current_vector_size;
2327       if (vector_sizes == 0
2328           || current_vector_size == 0)
2329         return NULL;
2330
2331       /* Try the next biggest vector size.  */
2332       current_vector_size = 1 << floor_log2 (vector_sizes);
2333       if (dump_enabled_p ())
2334         dump_printf_loc (MSG_NOTE, vect_location,
2335                          "***** Re-trying analysis with "
2336                          "vector size %d\n", current_vector_size);
2337     }
2338 }
2339
2340
2341 /* SLP costs are calculated according to SLP instance unrolling factor (i.e.,
2342    the number of created vector stmts depends on the unrolling factor).
2343    However, the actual number of vector stmts for every SLP node depends on
2344    VF which is set later in vect_analyze_operations ().  Hence, SLP costs
2345    should be updated.  In this function we assume that the inside costs
2346    calculated in vect_model_xxx_cost are linear in ncopies.  */
2347
2348 void
2349 vect_update_slp_costs_according_to_vf (loop_vec_info loop_vinfo)
2350 {
2351   unsigned int i, j, vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2352   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2353   slp_instance instance;
2354   stmt_vector_for_cost body_cost_vec;
2355   stmt_info_for_cost *si;
2356   void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2357
2358   if (dump_enabled_p ())
2359     dump_printf_loc (MSG_NOTE, vect_location,
2360                      "=== vect_update_slp_costs_according_to_vf ===");
2361
2362   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2363     {
2364       /* We assume that costs are linear in ncopies.  */
2365       int ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (instance);
2366
2367       /* Record the instance's instructions in the target cost model.
2368          This was delayed until here because the count of instructions
2369          isn't known beforehand.  */
2370       body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
2371
2372       FOR_EACH_VEC_ELT (body_cost_vec, j, si)
2373         (void) add_stmt_cost (data, si->count * ncopies, si->kind,
2374                               vinfo_for_stmt (si->stmt), si->misalign,
2375                               vect_body);
2376     }
2377 }
2378
2379
2380 /* For constant and loop invariant defs of SLP_NODE this function returns
2381    (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
2382    OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
2383    scalar stmts.  NUMBER_OF_VECTORS is the number of vector defs to create.
2384    REDUC_INDEX is the index of the reduction operand in the statements, unless
2385    it is -1.  */
2386
2387 static void
2388 vect_get_constant_vectors (tree op, slp_tree slp_node,
2389                            vec<tree> *vec_oprnds,
2390                            unsigned int op_num, unsigned int number_of_vectors,
2391                            int reduc_index)
2392 {
2393   vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2394   gimple stmt = stmts[0];
2395   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2396   unsigned nunits;
2397   tree vec_cst;
2398   tree *elts;
2399   unsigned j, number_of_places_left_in_vector;
2400   tree vector_type;
2401   tree vop;
2402   int group_size = stmts.length ();
2403   unsigned int vec_num, i;
2404   unsigned number_of_copies = 1;
2405   vec<tree> voprnds;
2406   voprnds.create (number_of_vectors);
2407   bool constant_p, is_store;
2408   tree neutral_op = NULL;
2409   enum tree_code code = gimple_expr_code (stmt);
2410   gimple def_stmt;
2411   struct loop *loop;
2412   gimple_seq ctor_seq = NULL;
2413
2414   if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
2415       && reduc_index != -1)
2416     {
2417       op_num = reduc_index - 1;
2418       op = gimple_op (stmt, reduc_index);
2419       /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
2420          we need either neutral operands or the original operands.  See
2421          get_initial_def_for_reduction() for details.  */
2422       switch (code)
2423         {
2424           case WIDEN_SUM_EXPR:
2425           case DOT_PROD_EXPR:
2426           case PLUS_EXPR:
2427           case MINUS_EXPR:
2428           case BIT_IOR_EXPR:
2429           case BIT_XOR_EXPR:
2430              if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2431                neutral_op = build_real (TREE_TYPE (op), dconst0);
2432              else
2433                neutral_op = build_int_cst (TREE_TYPE (op), 0);
2434
2435              break;
2436
2437           case MULT_EXPR:
2438              if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2439                neutral_op = build_real (TREE_TYPE (op), dconst1);
2440              else
2441                neutral_op = build_int_cst (TREE_TYPE (op), 1);
2442
2443              break;
2444
2445           case BIT_AND_EXPR:
2446             neutral_op = build_int_cst (TREE_TYPE (op), -1);
2447             break;
2448
2449           case MAX_EXPR:
2450           case MIN_EXPR:
2451             def_stmt = SSA_NAME_DEF_STMT (op);
2452             loop = (gimple_bb (stmt))->loop_father;
2453             neutral_op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2454                                                 loop_preheader_edge (loop));
2455             break;
2456
2457           default:
2458             neutral_op = NULL;
2459         }
2460     }
2461
2462   if (STMT_VINFO_DATA_REF (stmt_vinfo))
2463     {
2464       is_store = true;
2465       op = gimple_assign_rhs1 (stmt);
2466     }
2467   else
2468     is_store = false;
2469
2470   gcc_assert (op);
2471
2472   if (CONSTANT_CLASS_P (op))
2473     constant_p = true;
2474   else
2475     constant_p = false;
2476
2477   vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
2478   gcc_assert (vector_type);
2479   nunits = TYPE_VECTOR_SUBPARTS (vector_type);
2480
2481   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
2482      created vectors. It is greater than 1 if unrolling is performed.
2483
2484      For example, we have two scalar operands, s1 and s2 (e.g., group of
2485      strided accesses of size two), while NUNITS is four (i.e., four scalars
2486      of this type can be packed in a vector).  The output vector will contain
2487      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
2488      will be 2).
2489
2490      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
2491      containing the operands.
2492
2493      For example, NUNITS is four as before, and the group size is 8
2494      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
2495      {s5, s6, s7, s8}.  */
2496
2497   number_of_copies = least_common_multiple (nunits, group_size) / group_size;
2498
2499   number_of_places_left_in_vector = nunits;
2500   elts = XALLOCAVEC (tree, nunits);
2501   for (j = 0; j < number_of_copies; j++)
2502     {
2503       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
2504         {
2505           if (is_store)
2506             op = gimple_assign_rhs1 (stmt);
2507           else
2508             {
2509               switch (code)
2510                 {
2511                   case COND_EXPR:
2512                     if (op_num == 0 || op_num == 1)
2513                       {
2514                         tree cond = gimple_assign_rhs1 (stmt);
2515                         op = TREE_OPERAND (cond, op_num);
2516                       }
2517                     else
2518                       {
2519                         if (op_num == 2)
2520                           op = gimple_assign_rhs2 (stmt);
2521                         else
2522                           op = gimple_assign_rhs3 (stmt);
2523                       }
2524                     break;
2525
2526                   case CALL_EXPR:
2527                     op = gimple_call_arg (stmt, op_num);
2528                     break;
2529
2530                   case LSHIFT_EXPR:
2531                   case RSHIFT_EXPR:
2532                   case LROTATE_EXPR:
2533                   case RROTATE_EXPR:
2534                     op = gimple_op (stmt, op_num + 1);
2535                     /* Unlike the other binary operators, shifts/rotates have
2536                        the shift count being int, instead of the same type as
2537                        the lhs, so make sure the scalar is the right type if
2538                        we are dealing with vectors of
2539                        long long/long/short/char.  */
2540                     if (op_num == 1 && constant_p)
2541                       op = fold_convert (TREE_TYPE (vector_type), op);
2542                     break;
2543
2544                   default:
2545                     op = gimple_op (stmt, op_num + 1);
2546                     break;
2547                 }
2548             }
2549
2550           if (reduc_index != -1)
2551             {
2552               loop = (gimple_bb (stmt))->loop_father;
2553               def_stmt = SSA_NAME_DEF_STMT (op);
2554
2555               gcc_assert (loop);
2556
2557               /* Get the def before the loop.  In reduction chain we have only
2558                  one initial value.  */
2559               if ((j != (number_of_copies - 1)
2560                    || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2561                        && i != 0))
2562                   && neutral_op)
2563                 op = neutral_op;
2564               else
2565                 op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2566                                             loop_preheader_edge (loop));
2567             }
2568
2569           /* Create 'vect_ = {op0,op1,...,opn}'.  */
2570           number_of_places_left_in_vector--;
2571           if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
2572             {
2573               if (constant_p)
2574                 {
2575                   op = fold_unary (VIEW_CONVERT_EXPR,
2576                                    TREE_TYPE (vector_type), op);
2577                   gcc_assert (op && CONSTANT_CLASS_P (op));
2578                 }
2579               else
2580                 {
2581                   tree new_temp
2582                     = make_ssa_name (TREE_TYPE (vector_type), NULL);
2583                   gimple init_stmt;
2584                   op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
2585                                op);               
2586                   init_stmt
2587                     = gimple_build_assign_with_ops (VIEW_CONVERT_EXPR,
2588                                                     new_temp, op, NULL_TREE);
2589                   gimple_seq_add_stmt (&ctor_seq, init_stmt);
2590                   op = new_temp;
2591                 }
2592             }
2593           elts[number_of_places_left_in_vector] = op;
2594
2595           if (number_of_places_left_in_vector == 0)
2596             {
2597               number_of_places_left_in_vector = nunits;
2598
2599               if (constant_p)
2600                 vec_cst = build_vector (vector_type, elts);
2601               else
2602                 {
2603                   vec<constructor_elt, va_gc> *v;
2604                   unsigned k;
2605                   vec_alloc (v, nunits);
2606                   for (k = 0; k < nunits; ++k)
2607                     CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
2608                   vec_cst = build_constructor (vector_type, v);
2609                 }
2610               voprnds.quick_push (vect_init_vector (stmt, vec_cst,
2611                                                     vector_type, NULL));
2612               if (ctor_seq != NULL)
2613                 {
2614                   gimple init_stmt = SSA_NAME_DEF_STMT (voprnds.last ());
2615                   gimple_stmt_iterator gsi = gsi_for_stmt (init_stmt);
2616                   gsi_insert_seq_before_without_update (&gsi, ctor_seq,
2617                                                         GSI_SAME_STMT);
2618                   ctor_seq = NULL;
2619                 }
2620             }
2621         }
2622     }
2623
2624   /* Since the vectors are created in the reverse order, we should invert
2625      them.  */
2626   vec_num = voprnds.length ();
2627   for (j = vec_num; j != 0; j--)
2628     {
2629       vop = voprnds[j - 1];
2630       vec_oprnds->quick_push (vop);
2631     }
2632
2633   voprnds.release ();
2634
2635   /* In case that VF is greater than the unrolling factor needed for the SLP
2636      group of stmts, NUMBER_OF_VECTORS to be created is greater than
2637      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
2638      to replicate the vectors.  */
2639   while (number_of_vectors > vec_oprnds->length ())
2640     {
2641       tree neutral_vec = NULL;
2642
2643       if (neutral_op)
2644         {
2645           if (!neutral_vec)
2646             neutral_vec = build_vector_from_val (vector_type, neutral_op);
2647
2648           vec_oprnds->quick_push (neutral_vec);
2649         }
2650       else
2651         {
2652           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
2653             vec_oprnds->quick_push (vop);
2654         }
2655     }
2656 }
2657
2658
2659 /* Get vectorized definitions from SLP_NODE that contains corresponding
2660    vectorized def-stmts.  */
2661
2662 static void
2663 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
2664 {
2665   tree vec_oprnd;
2666   gimple vec_def_stmt;
2667   unsigned int i;
2668
2669   gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
2670
2671   FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
2672     {
2673       gcc_assert (vec_def_stmt);
2674       vec_oprnd = gimple_get_lhs (vec_def_stmt);
2675       vec_oprnds->quick_push (vec_oprnd);
2676     }
2677 }
2678
2679
/* Get vectorized definitions for SLP_NODE.
   If the scalar definitions are loop invariants or constants, collect them and
   call vect_get_constant_vectors() to create vector stmts.
   Otherwise, the def-stmts must be already vectorized and the vectorized stmts
   must be stored in the corresponding child of SLP_NODE, and we call
   vect_get_slp_vect_defs () to retrieve them.
   OPS are the scalar operands of the stmts in SLP_NODE; for each of them one
   vector of defs is pushed onto VEC_OPRNDS.  REDUC_INDEX is the index of the
   reduction operand, or -1 if there is none.  */

void
vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
                   vec<vec<tree> > *vec_oprnds, int reduc_index)
{
  gimple first_stmt;
  int number_of_vects = 0, i;
  unsigned int child_index = 0;
  HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
  slp_tree child = NULL;
  vec<tree> vec_defs;
  tree oprnd;
  bool vectorized_defs;

  first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
  FOR_EACH_VEC_ELT (ops, i, oprnd)
    {
      /* For each operand we check if it has vectorized definitions in a child
         node or we need to create them (for invariants and constants).  We
         check if the LHS of the first stmt of the next child matches OPRND.
         If it does, we found the correct child.  Otherwise, we call
         vect_get_constant_vectors (), and not advance CHILD_INDEX in order
         to check this child node for the next operand.  */
      vectorized_defs = false;
      if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
        {
          child = (slp_tree) SLP_TREE_CHILDREN (slp_node)[child_index];

          /* We have to check both pattern and original def, if available.  */
          gimple first_def = SLP_TREE_SCALAR_STMTS (child)[0];
          gimple related = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));

          if (operand_equal_p (oprnd, gimple_get_lhs (first_def), 0)
              || (related
                  && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
            {
              /* The number of vector defs is determined by the number of
                 vector statements in the node from which we get those
                 statements.  */
              number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
              vectorized_defs = true;
              child_index++;
            }
        }

      if (!vectorized_defs)
        {
          if (i == 0)
            {
              number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
              /* Number of vector stmts was calculated according to LHS in
                 vect_schedule_slp_instance (), fix it by replacing LHS with
                 RHS, if necessary.  See vect_get_smallest_scalar_type () for
                 details.  */
              vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
                                             &rhs_size_unit);
              if (rhs_size_unit != lhs_size_unit)
                {
                  number_of_vects *= rhs_size_unit;
                  number_of_vects /= lhs_size_unit;
                }
            }
          /* For I > 0 NUMBER_OF_VECTS keeps the value computed for the
             first operand or for the last matched child.  */
        }

      /* Allocate memory for vectorized defs.  */
      vec_defs = vNULL;
      vec_defs.create (number_of_vects);

      /* For reduction defs we call vect_get_constant_vectors (), since we are
         looking for initial loop invariant values.  */
      if (vectorized_defs && reduc_index == -1)
        /* The defs are already vectorized.  */
        vect_get_slp_vect_defs (child, &vec_defs);
      else
        /* Build vectors from scalar defs.  */
        vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
                                   number_of_vects, reduc_index);

      vec_oprnds->quick_push (vec_defs);

      /* For reductions, we only need initial values.  */
      if (reduc_index != -1)
        return;
    }
}
2771
2772
/* Create NCOPIES permutation statements using the mask MASK_BYTES (by
   building a vector of type MASK_TYPE from it) and two input vectors placed in
   DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy and
   shifting by STRIDE elements of DR_CHAIN for every copy.
   (STRIDE is the number of vectorized stmts for NODE divided by the number of
   copies).
   VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE, where
   the created stmts must be inserted.  */

static inline void
vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
                           tree mask, int first_vec_indx, int second_vec_indx,
                           gimple_stmt_iterator *gsi, slp_tree node,
                           tree vectype, vec<tree> dr_chain,
                           int ncopies, int vect_stmts_counter)
{
  tree perm_dest;
  gimple perm_stmt = NULL;
  stmt_vec_info next_stmt_info;
  int i, stride;
  tree first_vec, second_vec, data_ref;

  /* Distance in DR_CHAIN between the input vectors of consecutive copies.  */
  stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;

  /* Initialize the vect stmts of NODE to properly insert the generated
     stmts later.  */
  for (i = SLP_TREE_VEC_STMTS (node).length ();
       i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
    SLP_TREE_VEC_STMTS (node).quick_push (NULL);

  /* PERM_DEST is the base variable from which an SSA name is created for
     every generated permute statement.  */
  perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
  for (i = 0; i < ncopies; i++)
    {
      first_vec = dr_chain[first_vec_indx];
      second_vec = dr_chain[second_vec_indx];

      /* Generate the permute statement.  */
      perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, perm_dest,
                                                first_vec, second_vec, mask);
      data_ref = make_ssa_name (perm_dest, perm_stmt);
      gimple_set_lhs (perm_stmt, data_ref);
      vect_finish_stmt_generation (stmt, perm_stmt, gsi);

      /* Store the vector statement in NODE.  */
      SLP_TREE_VEC_STMTS (node)[stride * i + vect_stmts_counter] = perm_stmt;

      first_vec_indx += stride;
      second_vec_indx += stride;
    }

  /* Mark the scalar stmt as vectorized (with the last generated permute).  */
  next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
  STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
}
2827
2828
/* Given FIRST_MASK_ELEMENT - the mask element in element representation,
   return in CURRENT_MASK_ELEMENT its equivalent in target specific
   representation.  Check that the mask is valid and return FALSE if not.
   Return TRUE in NEED_NEXT_VECTOR if the permutation requires to move to
   the next vector, i.e., the current first vector is not needed.
   M is added to FIRST_MASK_ELEMENT; INDEX is the position of the element
   within the mask being built (MASK, of MASK_NUNITS elements).
   NUMBER_OF_MASK_FIXES, MASK_FIXED and NEEDS_FIRST_VECTOR carry state
   between consecutive calls while one mask is being accumulated.  */

static bool
vect_get_mask_element (gimple stmt, int first_mask_element, int m,
                       int mask_nunits, bool only_one_vec, int index,
                       unsigned char *mask, int *current_mask_element,
                       bool *need_next_vector, int *number_of_mask_fixes,
                       bool *mask_fixed, bool *needs_first_vector)
{
  int i;

  /* Convert to target specific representation.  */
  *current_mask_element = first_mask_element + m;
  /* Adjust the value in case it's a mask for second and third vectors.  */
  *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1);

  if (*current_mask_element < mask_nunits)
    *needs_first_vector = true;

  /* We have only one input vector to permute but the mask accesses values in
     the next vector as well.  */
  if (only_one_vec && *current_mask_element >= mask_nunits)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "permutation requires at least two vectors ");
          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
        }

      return false;
    }

  /* The mask requires the next vector.  */
  if (*current_mask_element >= mask_nunits * 2)
    {
      if (*needs_first_vector || *mask_fixed)
        {
          /* We either need the first vector too or have already moved to the
             next vector. In both cases, this permutation needs three
             vectors.  */
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "permutation requires at "
                               "least three vectors ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
            }

          return false;
        }

      /* We move to the next vector, dropping the first one and working with
         the second and the third - we need to adjust the values of the mask
         accordingly.  */
      *current_mask_element -= mask_nunits * *number_of_mask_fixes;

      /* The elements already stored in MASK refer to the dropped vector
         too; shift them by the same amount.  */
      for (i = 0; i < index; i++)
        mask[i] -= mask_nunits * *number_of_mask_fixes;

      (*number_of_mask_fixes)++;
      *mask_fixed = true;
    }

  *need_next_vector = *mask_fixed;

  /* This was the last element of this mask. Start a new one.  */
  if (index == mask_nunits - 1)
    {
      *number_of_mask_fixes = 1;
      *mask_fixed = false;
      *needs_first_vector = false;
    }

  return true;
}
2909
2910
2911 /* Generate vector permute statements from a list of loads in DR_CHAIN.
2912    If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
2913    permute statements for SLP_NODE_INSTANCE.  */
2914 bool
2915 vect_transform_slp_perm_load (gimple stmt, vec<tree> dr_chain,
2916                               gimple_stmt_iterator *gsi, int vf,
2917                               slp_instance slp_node_instance, bool analyze_only)
2918 {
2919   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2920   tree mask_element_type = NULL_TREE, mask_type;
2921   int i, j, k, nunits, vec_index = 0, scalar_index;
2922   slp_tree node;
2923   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2924   gimple next_scalar_stmt;
2925   int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
2926   int first_mask_element;
2927   int index, unroll_factor, current_mask_element, ncopies;
2928   unsigned char *mask;
2929   bool only_one_vec = false, need_next_vector = false;
2930   int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
2931   int number_of_mask_fixes = 1;
2932   bool mask_fixed = false;
2933   bool needs_first_vector = false;
2934   enum machine_mode mode;
2935
2936   mode = TYPE_MODE (vectype);
2937
2938   if (!can_vec_perm_p (mode, false, NULL))
2939     {
2940       if (dump_enabled_p ())
2941         {
2942           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2943                            "no vect permute for ");
2944           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2945         }
2946       return false;
2947     }
2948
2949   /* The generic VEC_PERM_EXPR code always uses an integral type of the
2950      same size as the vector element being permuted.  */
2951   mask_element_type = lang_hooks.types.type_for_mode
2952                 (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))), 1);
2953   mask_type = get_vectype_for_scalar_type (mask_element_type);
2954   nunits = TYPE_VECTOR_SUBPARTS (vectype);
2955   mask = XALLOCAVEC (unsigned char, nunits);
2956   unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
2957
2958   /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
2959      unrolling factor.  */
2960   orig_vec_stmts_num = group_size *
2961                 SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
2962   if (orig_vec_stmts_num == 1)
2963     only_one_vec = true;
2964
2965   /* Number of copies is determined by the final vectorization factor
2966      relatively to SLP_NODE_INSTANCE unrolling factor.  */
2967   ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
2968
2969   /* Generate permutation masks for every NODE. Number of masks for each NODE
2970      is equal to GROUP_SIZE.
2971      E.g., we have a group of three nodes with three loads from the same
2972      location in each node, and the vector size is 4. I.e., we have a
2973      a0b0c0a1b1c1... sequence and we need to create the following vectors:
2974      for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
2975      for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
2976      ...
2977
2978      The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
2979      The last mask is illegal since we assume two operands for permute
2980      operation, and the mask element values can't be outside that range.
2981      Hence, the last mask must be converted into {2,5,5,5}.
2982      For the first two permutations we need the first and the second input
2983      vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
2984      we need the second and the third vectors: {b1,c1,a2,b2} and
2985      {c2,a3,b3,c3}.  */
2986
2987   FOR_EACH_VEC_ELT  (SLP_INSTANCE_LOADS (slp_node_instance), i, node)
2988     {
2989       scalar_index = 0;
2990       index = 0;
2991       vect_stmts_counter = 0;
2992       vec_index = 0;
2993       first_vec_index = vec_index++;
2994       if (only_one_vec)
2995         second_vec_index = first_vec_index;
2996       else
2997         second_vec_index =  vec_index++;
2998
2999       for (j = 0; j < unroll_factor; j++)
3000         {
3001           for (k = 0; k < group_size; k++)
3002             {
3003               first_mask_element = i + j * group_size;
3004               if (!vect_get_mask_element (stmt, first_mask_element, 0,
3005                                           nunits, only_one_vec, index,
3006                                           mask, &current_mask_element,
3007                                           &need_next_vector,
3008                                           &number_of_mask_fixes, &mask_fixed,
3009                                           &needs_first_vector))
3010                 return false;
3011               mask[index++] = current_mask_element;
3012
3013               if (index == nunits)
3014                 {
3015                   tree mask_vec, *mask_elts;
3016                   int l;
3017
3018                   if (!can_vec_perm_p (mode, false, mask))
3019                     {
3020                       if (dump_enabled_p ())
3021                         {
3022                           dump_printf_loc (MSG_MISSED_OPTIMIZATION,
3023                                            vect_location, 
3024                                            "unsupported vect permute { ");
3025                           for (i = 0; i < nunits; ++i)
3026                             dump_printf (MSG_MISSED_OPTIMIZATION, "%d ",
3027                                          mask[i]);
3028                           dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
3029                         }
3030                       return false;
3031                     }
3032
3033                   mask_elts = XALLOCAVEC (tree, nunits);
3034                   for (l = 0; l < nunits; ++l)
3035                     mask_elts[l] = build_int_cst (mask_element_type, mask[l]);
3036                   mask_vec = build_vector (mask_type, mask_elts);
3037                   index = 0;
3038
3039                   if (!analyze_only)
3040                     {
3041                       if (need_next_vector)
3042                         {
3043                           first_vec_index = second_vec_index;
3044                           second_vec_index = vec_index;
3045                         }
3046
3047                       next_scalar_stmt
3048                           = SLP_TREE_SCALAR_STMTS (node)[scalar_index++];
3049
3050                       vect_create_mask_and_perm (stmt, next_scalar_stmt,
3051                                mask_vec, first_vec_index, second_vec_index,
3052                                gsi, node, vectype, dr_chain,
3053                                ncopies, vect_stmts_counter++);
3054                     }
3055                 }
3056             }
3057         }
3058     }
3059
3060   return true;
3061 }
3062
3063
3064
/* Vectorize SLP instance tree in postorder.  Return the IS_STORE result
   of vect_transform_stmt for this node's first scalar stmt.  */

static bool
vect_schedule_slp_instance (slp_tree node, slp_instance instance,
                            unsigned int vectorization_factor)
{
  gimple stmt;
  bool grouped_store, is_store;
  gimple_stmt_iterator si;
  stmt_vec_info stmt_info;
  unsigned int vec_stmts_size, nunits, group_size;
  tree vectype;
  int i;
  slp_tree loads_node;
  slp_tree child;

  if (!node)
    return false;

  /* Postorder: vectorize all children before this node itself.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_schedule_slp_instance (child, instance, vectorization_factor);

  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);

  /* VECTYPE is the type of the destination.  */
  vectype = STMT_VINFO_VECTYPE (stmt_info);
  nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
  group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  /* For each SLP instance calculate number of vector stmts to be created
     for the scalar stmts in each node of the SLP tree.  Number of vector
     elements in one vector iteration is the number of scalar elements in
     one scalar iteration (GROUP_SIZE) multiplied by VF divided by vector
     size.  */
  vec_stmts_size = (vectorization_factor * group_size) / nunits;

  /* In case of load permutation we have to allocate vectorized statements for
     all the nodes that participate in that permutation.  */
  if (SLP_INSTANCE_LOAD_PERMUTATION (instance).exists ())
    {
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, loads_node)
        {
          if (!SLP_TREE_VEC_STMTS (loads_node).exists ())
            {
              SLP_TREE_VEC_STMTS (loads_node).create (vec_stmts_size);
              SLP_TREE_NUMBER_OF_VEC_STMTS (loads_node) = vec_stmts_size;
            }
        }
    }

  if (!SLP_TREE_VEC_STMTS (node).exists ())
    {
      SLP_TREE_VEC_STMTS (node).create (vec_stmts_size);
      SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE,vect_location,
                       "------>vectorizing SLP node starting from: ");
      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
    }

  /* Loads should be inserted before the first load.  */
  if (SLP_INSTANCE_FIRST_LOAD_STMT (instance)
      && STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && !REFERENCE_CLASS_P (gimple_get_lhs (stmt))
      && SLP_INSTANCE_LOAD_PERMUTATION (instance).exists ())
    si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance));
  else if (is_pattern_stmt_p (stmt_info))
    /* For pattern stmts, insert at the original (related) stmt.  */
    si = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
  else
    si = gsi_for_stmt (stmt);

  /* Stores should be inserted just before the last store.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && REFERENCE_CLASS_P (gimple_get_lhs (stmt)))
    {
      gimple last_store = vect_find_last_store_in_slp_instance (instance);
      if (is_pattern_stmt_p (vinfo_for_stmt (last_store)))
       last_store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (last_store));
      si = gsi_for_stmt (last_store);
    }

  /* Mark the first element of the reduction chain as reduction to properly
     transform the node.  In the analysis phase only the last element of the
     chain is marked as reduction.  */
  if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
    {
      STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
      STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
    }

  is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
  return is_store;
}
3163
3164 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
3165    For loop vectorization this is done in vectorizable_call, but for SLP
3166    it needs to be deferred until end of vect_schedule_slp, because multiple
3167    SLP instances may refer to the same scalar stmt.  */
3168
3169 static void
3170 vect_remove_slp_scalar_calls (slp_tree node)
3171 {
3172   gimple stmt, new_stmt;
3173   gimple_stmt_iterator gsi;
3174   int i;
3175   slp_tree child;
3176   tree lhs;
3177   stmt_vec_info stmt_info;
3178
3179   if (!node)
3180     return;
3181
3182   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3183     vect_remove_slp_scalar_calls (child);
3184
3185   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
3186     {
3187       if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
3188         continue;
3189       stmt_info = vinfo_for_stmt (stmt);
3190       if (stmt_info == NULL
3191           || is_pattern_stmt_p (stmt_info)
3192           || !PURE_SLP_STMT (stmt_info))
3193         continue;
3194       lhs = gimple_call_lhs (stmt);
3195       new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
3196       set_vinfo_for_stmt (new_stmt, stmt_info);
3197       set_vinfo_for_stmt (stmt, NULL);
3198       STMT_VINFO_STMT (stmt_info) = new_stmt;
3199       gsi = gsi_for_stmt (stmt);
3200       gsi_replace (&gsi, new_stmt, false);
3201       SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
3202     }
3203 }
3204
/* Generate vector code for all SLP instances in the loop/basic block.
   LOOP_VINFO is used when non-NULL, otherwise BB_VINFO.  Return the
   IS_STORE result of the last scheduled instance.  */

bool
vect_schedule_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
{
  vec<slp_instance> slp_instances;
  slp_instance instance;
  slp_tree loads_node;
  unsigned int i, j, vf;
  bool is_store = false;

  if (loop_vinfo)
    {
      slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
      vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
    }
  else
    {
      slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
      /* Basic-block SLP is never unrolled.  */
      vf = 1;
    }

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* Schedule the tree of INSTANCE.  */
      is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
                                             instance, vf);

      /* Clear STMT_VINFO_VEC_STMT of all loads.  With shared loads
         between SLP instances we fail to properly initialize the
         vectorized SLP stmts and confuse different load permutations.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, loads_node)
        STMT_VINFO_VEC_STMT
          (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (loads_node)[0])) = NULL;

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "vectorizing stmts using SLP.");
    }

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree root = SLP_INSTANCE_TREE (instance);
      gimple store;
      unsigned int j;
      gimple_stmt_iterator gsi;

      /* Remove scalar call stmts.  Do not do this for basic-block
         vectorization as not all uses may be vectorized.
         ???  Why should this be necessary?  DCE should be able to
         remove the stmts itself.
         ???  For BB vectorization we can as well remove scalar
         stmts starting from the SLP tree root if they have no
         uses.  */
      if (loop_vinfo)
        vect_remove_slp_scalar_calls (root);

      /* Remove the now-vectorized scalar stmts of the root group; stop at
         the first stmt that has no data reference attached.  */
      for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
                  && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
        {
          if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
            break;

         if (is_pattern_stmt_p (vinfo_for_stmt (store)))
           store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
          /* Free the attached stmt_vec_info and remove the stmt.  */
          gsi = gsi_for_stmt (store);
          unlink_stmt_vdef (store);
          gsi_remove (&gsi, true);
          release_defs (store);
          free_stmt_vec_info (store);
        }
    }

  return is_store;
}
3281
3282
3283 /* Vectorize the basic block.  */
3284
3285 void
3286 vect_slp_transform_bb (basic_block bb)
3287 {
3288   bb_vec_info bb_vinfo = vec_info_for_bb (bb);
3289   gimple_stmt_iterator si;
3290
3291   gcc_assert (bb_vinfo);
3292
3293   if (dump_enabled_p ())
3294     dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB\n");
3295
3296   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
3297     {
3298       gimple stmt = gsi_stmt (si);
3299       stmt_vec_info stmt_info;
3300
3301       if (dump_enabled_p ())
3302         {
3303           dump_printf_loc (MSG_NOTE, vect_location,
3304                            "------>SLPing statement: ");
3305           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3306         }
3307
3308       stmt_info = vinfo_for_stmt (stmt);
3309       gcc_assert (stmt_info);
3310
3311       /* Schedule all the SLP instances when the first SLP stmt is reached.  */
3312       if (STMT_SLP_TYPE (stmt_info))
3313         {
3314           vect_schedule_slp (NULL, bb_vinfo);
3315           break;
3316         }
3317     }
3318
3319   if (dump_enabled_p ())
3320     dump_printf (MSG_OPTIMIZED_LOCATIONS, "BASIC BLOCK VECTORIZED\n");
3321
3322   destroy_bb_vec_info (bb_vinfo);
3323 }