re PR tree-optimization/65935 (433.milc in SPEC CPU 2006 is miscompiled)
[platform/upstream/gcc.git] / gcc / tree-vect-slp.c
1 /* SLP - Basic Block Vectorization
2    Copyright (C) 2007-2015 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
4    and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "hash-set.h"
28 #include "machmode.h"
29 #include "vec.h"
30 #include "double-int.h"
31 #include "input.h"
32 #include "alias.h"
33 #include "symtab.h"
34 #include "wide-int.h"
35 #include "inchash.h"
36 #include "tree.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "target.h"
40 #include "predict.h"
41 #include "hard-reg-set.h"
42 #include "function.h"
43 #include "basic-block.h"
44 #include "gimple-pretty-print.h"
45 #include "tree-ssa-alias.h"
46 #include "internal-fn.h"
47 #include "gimple-expr.h"
48 #include "is-a.h"
49 #include "gimple.h"
50 #include "gimple-iterator.h"
51 #include "gimple-ssa.h"
52 #include "tree-phinodes.h"
53 #include "ssa-iterators.h"
54 #include "stringpool.h"
55 #include "tree-ssanames.h"
56 #include "tree-pass.h"
57 #include "cfgloop.h"
58 #include "hashtab.h"
59 #include "rtl.h"
60 #include "flags.h"
61 #include "statistics.h"
62 #include "real.h"
63 #include "fixed-value.h"
64 #include "insn-config.h"
65 #include "expmed.h"
66 #include "dojump.h"
67 #include "explow.h"
68 #include "calls.h"
69 #include "emit-rtl.h"
70 #include "varasm.h"
71 #include "stmt.h"
72 #include "expr.h"
73 #include "recog.h"              /* FIXME: for insn_data */
74 #include "insn-codes.h"
75 #include "optabs.h"
76 #include "tree-vectorizer.h"
77 #include "langhooks.h"
78 #include "gimple-walk.h"
79
80 /* Extract the location of the basic block in the source code.
81    Return the basic block location if succeed and NULL if not.  */
82
83 source_location
84 find_bb_location (basic_block bb)
85 {
86   gimple stmt = NULL;
87   gimple_stmt_iterator si;
88
89   if (!bb)
90     return UNKNOWN_LOCATION;
91
92   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
93     {
94       stmt = gsi_stmt (si);
95       if (gimple_location (stmt) != UNKNOWN_LOCATION)
96         return gimple_location (stmt);
97     }
98
99   return UNKNOWN_LOCATION;
100 }
101
102
103 /* Recursively free the memory allocated for the SLP tree rooted at NODE.  */
104
105 static void
106 vect_free_slp_tree (slp_tree node)
107 {
108   int i;
109   slp_tree child;
110
111   if (!node)
112     return;
113
114   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
115     vect_free_slp_tree (child);
116
117   SLP_TREE_CHILDREN (node).release ();
118   SLP_TREE_SCALAR_STMTS (node).release ();
119   SLP_TREE_VEC_STMTS (node).release ();
120   SLP_TREE_LOAD_PERMUTATION (node).release ();
121
122   free (node);
123 }
124
125
126 /* Free the memory allocated for the SLP instance.  */
127
128 void
129 vect_free_slp_instance (slp_instance instance)
130 {
131   vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
132   SLP_INSTANCE_LOADS (instance).release ();
133   SLP_INSTANCE_BODY_COST_VEC (instance).release ();
134   free (instance);
135 }
136
137
138 /* Create an SLP node for SCALAR_STMTS.  */
139
140 static slp_tree
141 vect_create_new_slp_node (vec<gimple> scalar_stmts)
142 {
143   slp_tree node;
144   gimple stmt = scalar_stmts[0];
145   unsigned int nops;
146
147   if (is_gimple_call (stmt))
148     nops = gimple_call_num_args (stmt);
149   else if (is_gimple_assign (stmt))
150     {
151       nops = gimple_num_ops (stmt) - 1;
152       if (gimple_assign_rhs_code (stmt) == COND_EXPR)
153         nops++;
154     }
155   else
156     return NULL;
157
158   node = XNEW (struct _slp_tree);
159   SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
160   SLP_TREE_VEC_STMTS (node).create (0);
161   SLP_TREE_CHILDREN (node).create (nops);
162   SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
163
164   return node;
165 }
166
167
168 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
169    operand.  */
170 static vec<slp_oprnd_info> 
171 vect_create_oprnd_info (int nops, int group_size)
172 {
173   int i;
174   slp_oprnd_info oprnd_info;
175   vec<slp_oprnd_info> oprnds_info;
176
177   oprnds_info.create (nops);
178   for (i = 0; i < nops; i++)
179     {
180       oprnd_info = XNEW (struct _slp_oprnd_info);
181       oprnd_info->def_stmts.create (group_size);
182       oprnd_info->first_dt = vect_uninitialized_def;
183       oprnd_info->first_op_type = NULL_TREE;
184       oprnd_info->first_pattern = false;
185       oprnds_info.quick_push (oprnd_info);
186     }
187
188   return oprnds_info;
189 }
190
191
192 /* Free operands info.  */
193
194 static void
195 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
196 {
197   int i;
198   slp_oprnd_info oprnd_info;
199
200   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
201     {
202       oprnd_info->def_stmts.release ();
203       XDELETE (oprnd_info);
204     }
205
206   oprnds_info.release ();
207 }
208
209
/* Find the place of the data-ref in STMT in the interleaving chain that starts
   from FIRST_STMT.  Return -1 if the data-ref is not a part of the chain.  */

static int
vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
{
  gimple next_stmt = first_stmt;
  int result = 0;

  /* STMT can only be found in the chain that it records as its own
     group head.  */
  if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    return -1;

  do
    {
      if (next_stmt == stmt)
        return result;
      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
      /* Advance the position by the gap recorded on the next chain
         element, so RESULT reflects the element's offset in the
         interleaving, not just its ordinal index.  */
      if (next_stmt)
        result += GROUP_GAP (vinfo_for_stmt (next_stmt));
    }
  while (next_stmt);

  /* STMT names FIRST_STMT as its group head but was not reached by
     walking the chain.  */
  return -1;
}
234
235
/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  If there was a fatal error
   return -1, if the error could be corrected by swapping operands of the
   operation return 1, if everything is ok return 0.  */

static int 
vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                             gimple stmt, bool first,
                             vec<slp_oprnd_info> *oprnds_info)
{
  tree oprnd;
  unsigned int i, number_of_oprnds;
  tree def;
  gimple def_stmt;
  enum vect_def_type dt = vect_uninitialized_def;
  struct loop *loop = NULL;
  bool pattern = false;
  slp_oprnd_info oprnd_info;
  /* Index of the first rhs operand within the gimple ops array.  */
  int first_op_idx = 1;
  bool commutative = false;
  /* True when operands 0/1 are the embedded comparison of a COND_EXPR.  */
  bool first_op_cond = false;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  if (is_gimple_call (stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      /* Call arguments start at gimple op 3.  */
      first_op_idx = 3;
    }
  else if (is_gimple_assign (stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        {
          /* Treat the two operands of the embedded comparison as extra
             (swappable) operands of the COND_EXPR.  */
          first_op_cond = true;
          commutative = true;
          number_of_oprnds++;
        }
      else
        commutative = commutative_tree_code (code);
    }
  else
    return -1;

  bool swapped = false;
  for (i = 0; i < number_of_oprnds; i++)
    {
again:
      if (first_op_cond)
        {
          /* For a COND_EXPR, operands 0 and 1 live inside the embedded
             comparison tree; the remaining ones are regular gimple
             operands (shifted by one).  */
          if (i == 0 || i == 1)
            oprnd = TREE_OPERAND (gimple_op (stmt, first_op_idx),
                                  swapped ? !i : i);
          else
            oprnd = gimple_op (stmt, first_op_idx + i - 1);
        }
      else
        oprnd = gimple_op (stmt, first_op_idx + (swapped ? !i : i));

      oprnd_info = (*oprnds_info)[i];

      if (!vect_is_simple_use (oprnd, NULL, loop_vinfo, bb_vinfo, &def_stmt,
                               &def, &dt)
          || (!def_stmt && dt != vect_constant_def))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: can't find def for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }

      /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
         from the pattern.  Check that all the stmts of the node are in the
         pattern.  */
      if (def_stmt && gimple_bb (def_stmt)
          && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
              || (!loop && gimple_bb (def_stmt) == BB_VINFO_BB (bb_vinfo)
                  && gimple_code (def_stmt) != GIMPLE_PHI))
          && vinfo_for_stmt (def_stmt)
          && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
        {
          pattern = true;
          if (!first && !oprnd_info->first_pattern)
            {
              /* Pattern/non-pattern mismatch with the first stmt of the
                 group: for a commutative op, retry once with the first
                 two operands swapped before giving up.  */
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: some of the stmts"
                                   " are in a pattern, and others are not ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }

              return 1;
            }

          /* Follow the pattern: analyze the pattern stmt instead of the
             original def stmt.  */
          def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
          dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));

          if (dt == vect_unknown_def_type)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Unsupported pattern.\n");
              return -1;
            }

          switch (gimple_code (def_stmt))
            {
              case GIMPLE_PHI:
                def = gimple_phi_result (def_stmt);
                break;

              case GIMPLE_ASSIGN:
                def = gimple_assign_lhs (def_stmt);
                break;

              default:
                if (dump_enabled_p ())
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "unsupported defining stmt:\n");
                return -1;
            }
        }

      if (first)
        {
          /* First stmt of the group: record its defs as the reference
             that every later stmt must match.  */
          oprnd_info->first_dt = dt;
          oprnd_info->first_pattern = pattern;
          oprnd_info->first_op_type = TREE_TYPE (oprnd);
        }
      else
        {
          /* Not first stmt of the group, check that the def-stmt/s match
             the def-stmt/s of the first stmt.  Allow different definition
             types for reduction chains: the first stmt must be a
             vect_reduction_def (a phi node), and the rest
             vect_internal_def.  */
          if (((oprnd_info->first_dt != dt
                && !(oprnd_info->first_dt == vect_reduction_def
                     && dt == vect_internal_def)
                && !((oprnd_info->first_dt == vect_external_def
                      || oprnd_info->first_dt == vect_constant_def)
                     && (dt == vect_external_def
                         || dt == vect_constant_def)))
               || !types_compatible_p (oprnd_info->first_op_type,
                                       TREE_TYPE (oprnd))))
            {
              /* Try swapping operands if we got a mismatch.  */
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types\n");

              return 1;
            }
        }

      /* Check the types of the definitions.  */
      switch (dt)
        {
        case vect_constant_def:
        case vect_external_def:
        case vect_reduction_def:
          break;

        case vect_internal_def:
          /* Only internal defs have a def stmt to recurse into later.  */
          oprnd_info->def_stmts.quick_push (def_stmt);
          break;

        default:
          /* FORNOW: Not supported.  */
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: illegal type of def ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (first_op_cond)
        {
          /* The swapped operands are those of the embedded comparison,
             so the comparison code must be inverted as well.  */
          tree cond = gimple_assign_rhs1 (stmt);
          swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
                             &TREE_OPERAND (cond, 1));
          TREE_SET_CODE (cond, swap_tree_comparison (TREE_CODE (cond)));
        }
      else
        swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
                           gimple_assign_rhs2_ptr (stmt));
    }

  return 0;
}
462
463
464 /* Verify if the scalar stmts STMTS are isomorphic, require data
465    permutation or are of unsupported types of operation.  Return
466    true if they are, otherwise return false and indicate in *MATCHES
467    which stmts are not isomorphic to the first one.  If MATCHES[0]
468    is false then this indicates the comparison could not be
469    carried out or the stmts will never be vectorized by SLP.  */
470
471 static bool
472 vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
473                        vec<gimple> stmts, unsigned int group_size,
474                        unsigned nops, unsigned int *max_nunits,
475                        unsigned int vectorization_factor, bool *matches)
476 {
477   unsigned int i;
478   gimple stmt = stmts[0];
479   enum tree_code first_stmt_code = ERROR_MARK, rhs_code = ERROR_MARK;
480   enum tree_code first_cond_code = ERROR_MARK;
481   tree lhs;
482   bool need_same_oprnds = false;
483   tree vectype, scalar_type, first_op1 = NULL_TREE;
484   optab optab;
485   int icode;
486   machine_mode optab_op2_mode;
487   machine_mode vec_mode;
488   struct data_reference *first_dr;
489   HOST_WIDE_INT dummy;
490   gimple first_load = NULL, prev_first_load = NULL, old_first_load = NULL;
491   tree cond;
492
493   /* For every stmt in NODE find its def stmt/s.  */
494   FOR_EACH_VEC_ELT (stmts, i, stmt)
495     {
496       matches[i] = false;
497
498       if (dump_enabled_p ())
499         {
500           dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
501           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
502           dump_printf (MSG_NOTE, "\n");
503         }
504
505       /* Fail to vectorize statements marked as unvectorizable.  */
506       if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
507         {
508           if (dump_enabled_p ())
509             {
510               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
511                                "Build SLP failed: unvectorizable statement ");
512               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
513               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
514             }
515           /* Fatal mismatch.  */
516           matches[0] = false;
517           return false;
518         }
519
520       lhs = gimple_get_lhs (stmt);
521       if (lhs == NULL_TREE)
522         {
523           if (dump_enabled_p ())
524             {
525               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
526                                "Build SLP failed: not GIMPLE_ASSIGN nor "
527                                "GIMPLE_CALL ");
528               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
529               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
530             }
531           /* Fatal mismatch.  */
532           matches[0] = false;
533           return false;
534         }
535
536        if (is_gimple_assign (stmt)
537            && gimple_assign_rhs_code (stmt) == COND_EXPR
538            && (cond = gimple_assign_rhs1 (stmt))
539            && !COMPARISON_CLASS_P (cond))
540         {
541           if (dump_enabled_p ())
542             {
543               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
544                                "Build SLP failed: condition is not "
545                                "comparison ");
546               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
547               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
548             }
549           /* Fatal mismatch.  */
550           matches[0] = false;
551           return false;
552         }
553
554       scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
555       vectype = get_vectype_for_scalar_type (scalar_type);
556       if (!vectype)
557         {
558           if (dump_enabled_p ())
559             {
560               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
561                                "Build SLP failed: unsupported data-type ");
562               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
563                                  scalar_type);
564               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
565             }
566           /* Fatal mismatch.  */
567           matches[0] = false;
568           return false;
569         }
570
571       /* In case of multiple types we need to detect the smallest type.  */
572       if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
573         {
574           *max_nunits = TYPE_VECTOR_SUBPARTS (vectype);
575           if (bb_vinfo)
576             vectorization_factor = *max_nunits;
577         }
578
579       if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
580         {
581           rhs_code = CALL_EXPR;
582           if (gimple_call_internal_p (call_stmt)
583               || gimple_call_tail_p (call_stmt)
584               || gimple_call_noreturn_p (call_stmt)
585               || !gimple_call_nothrow_p (call_stmt)
586               || gimple_call_chain (call_stmt))
587             {
588               if (dump_enabled_p ())
589                 {
590                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
591                                    "Build SLP failed: unsupported call type ");
592                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
593                                     call_stmt, 0);
594                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
595                 }
596               /* Fatal mismatch.  */
597               matches[0] = false;
598               return false;
599             }
600         }
601       else
602         rhs_code = gimple_assign_rhs_code (stmt);
603
604       /* Check the operation.  */
605       if (i == 0)
606         {
607           first_stmt_code = rhs_code;
608
609           /* Shift arguments should be equal in all the packed stmts for a
610              vector shift with scalar shift operand.  */
611           if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
612               || rhs_code == LROTATE_EXPR
613               || rhs_code == RROTATE_EXPR)
614             {
615               vec_mode = TYPE_MODE (vectype);
616
617               /* First see if we have a vector/vector shift.  */
618               optab = optab_for_tree_code (rhs_code, vectype,
619                                            optab_vector);
620
621               if (!optab
622                   || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
623                 {
624                   /* No vector/vector shift, try for a vector/scalar shift.  */
625                   optab = optab_for_tree_code (rhs_code, vectype,
626                                                optab_scalar);
627
628                   if (!optab)
629                     {
630                       if (dump_enabled_p ())
631                         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
632                                          "Build SLP failed: no optab.\n");
633                       /* Fatal mismatch.  */
634                       matches[0] = false;
635                       return false;
636                     }
637                   icode = (int) optab_handler (optab, vec_mode);
638                   if (icode == CODE_FOR_nothing)
639                     {
640                       if (dump_enabled_p ())
641                         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
642                                          "Build SLP failed: "
643                                          "op not supported by target.\n");
644                       /* Fatal mismatch.  */
645                       matches[0] = false;
646                       return false;
647                     }
648                   optab_op2_mode = insn_data[icode].operand[2].mode;
649                   if (!VECTOR_MODE_P (optab_op2_mode))
650                     {
651                       need_same_oprnds = true;
652                       first_op1 = gimple_assign_rhs2 (stmt);
653                     }
654                 }
655             }
656           else if (rhs_code == WIDEN_LSHIFT_EXPR)
657             {
658               need_same_oprnds = true;
659               first_op1 = gimple_assign_rhs2 (stmt);
660             }
661         }
662       else
663         {
664           if (first_stmt_code != rhs_code
665               && (first_stmt_code != IMAGPART_EXPR
666                   || rhs_code != REALPART_EXPR)
667               && (first_stmt_code != REALPART_EXPR
668                   || rhs_code != IMAGPART_EXPR)
669               && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
670                    && (first_stmt_code == ARRAY_REF
671                        || first_stmt_code == BIT_FIELD_REF
672                        || first_stmt_code == INDIRECT_REF
673                        || first_stmt_code == COMPONENT_REF
674                        || first_stmt_code == MEM_REF)))
675             {
676               if (dump_enabled_p ())
677                 {
678                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
679                                    "Build SLP failed: different operation "
680                                    "in stmt ");
681                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
682                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
683                 }
684               /* Mismatch.  */
685               continue;
686             }
687
688           if (need_same_oprnds
689               && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
690             {
691               if (dump_enabled_p ())
692                 {
693                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
694                                    "Build SLP failed: different shift "
695                                    "arguments in ");
696                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
697                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
698                 }
699               /* Mismatch.  */
700               continue;
701             }
702
703           if (rhs_code == CALL_EXPR)
704             {
705               gimple first_stmt = stmts[0];
706               if (gimple_call_num_args (stmt) != nops
707                   || !operand_equal_p (gimple_call_fn (first_stmt),
708                                        gimple_call_fn (stmt), 0)
709                   || gimple_call_fntype (first_stmt)
710                      != gimple_call_fntype (stmt))
711                 {
712                   if (dump_enabled_p ())
713                     {
714                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
715                                        "Build SLP failed: different calls in ");
716                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
717                                         stmt, 0);
718                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
719                     }
720                   /* Mismatch.  */
721                   continue;
722                 }
723             }
724         }
725
726       /* Grouped store or load.  */
727       if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
728         {
729           if (REFERENCE_CLASS_P (lhs))
730             {
731               /* Store.  */
732               ;
733             }
734           else
735             {
736               /* Load.  */
737               unsigned unrolling_factor
738                 = least_common_multiple
739                     (*max_nunits, group_size) / group_size;
740               /* FORNOW: Check that there is no gap between the loads
741                  and no gap between the groups when we need to load
742                  multiple groups at once.
743                  ???  We should enhance this to only disallow gaps
744                  inside vectors.  */
745               if ((unrolling_factor > 1
746                    && ((GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
747                         && GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
748                        /* If the group is split up then GROUP_GAP
749                           isn't correct here, nor is GROUP_FIRST_ELEMENT.  */
750                        || GROUP_SIZE (vinfo_for_stmt (stmt)) > group_size))
751                   || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt
752                       && GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
753                 {
754                   if (dump_enabled_p ())
755                     {
756                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
757                                        "Build SLP failed: grouped "
758                                        "loads have gaps ");
759                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
760                                         stmt, 0);
761                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
762                     }
763                   /* Fatal mismatch.  */
764                   matches[0] = false;
765                   return false;
766                 }
767
768               /* Check that the size of interleaved loads group is not
769                  greater than the SLP group size.  */
770               unsigned ncopies
771                 = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
772               if (loop_vinfo
773                   && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
774                   && ((GROUP_SIZE (vinfo_for_stmt (stmt))
775                        - GROUP_GAP (vinfo_for_stmt (stmt)))
776                       > ncopies * group_size))
777                 {
778                   if (dump_enabled_p ())
779                     {
780                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
781                                        "Build SLP failed: the number "
782                                        "of interleaved loads is greater than "
783                                        "the SLP group size ");
784                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
785                                         stmt, 0);
786                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
787                     }
788                   /* Fatal mismatch.  */
789                   matches[0] = false;
790                   return false;
791                 }
792
793               old_first_load = first_load;
794               first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
795               if (prev_first_load)
796                 {
797                   /* Check that there are no loads from different interleaving
798                      chains in the same node.  */
799                   if (prev_first_load != first_load)
800                     {
801                       if (dump_enabled_p ())
802                         {
803                           dump_printf_loc (MSG_MISSED_OPTIMIZATION,
804                                            vect_location, 
805                                            "Build SLP failed: different "
806                                            "interleaving chains in one node ");
807                           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
808                                             stmt, 0);
809                           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
810                         }
811                       /* Mismatch.  */
812                       continue;
813                     }
814                 }
815               else
816                 prev_first_load = first_load;
817
818               /* In some cases a group of loads is just the same load
819                  repeated N times.  Only analyze its cost once.  */
820               if (first_load == stmt && old_first_load != first_load)
821                 {
822                   first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
823                   if (vect_supportable_dr_alignment (first_dr, false)
824                       == dr_unaligned_unsupported)
825                     {
826                       if (dump_enabled_p ())
827                         {
828                           dump_printf_loc (MSG_MISSED_OPTIMIZATION,
829                                            vect_location, 
830                                            "Build SLP failed: unsupported "
831                                            "unaligned load ");
832                           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
833                                             stmt, 0);
834                           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
835                         }
836                       /* Fatal mismatch.  */
837                       matches[0] = false;
838                       return false;
839                     }
840                 }
841            }
842         } /* Grouped access.  */
843       else
844         {
845           if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
846             {
847               /* Not grouped load.  */
848               if (dump_enabled_p ())
849                 {
850                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
851                                    "Build SLP failed: not grouped load ");
852                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
853                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
854                 }
855
856               /* FORNOW: Not grouped loads are not supported.  */
857               /* Fatal mismatch.  */
858               matches[0] = false;
859               return false;
860             }
861
862           /* Not memory operation.  */
863           if (TREE_CODE_CLASS (rhs_code) != tcc_binary
864               && TREE_CODE_CLASS (rhs_code) != tcc_unary
865               && rhs_code != COND_EXPR
866               && rhs_code != CALL_EXPR)
867             {
868               if (dump_enabled_p ())
869                 {
870                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
871                                    "Build SLP failed: operation");
872                   dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
873                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
874                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
875                 }
876               /* Fatal mismatch.  */
877               matches[0] = false;
878               return false;
879             }
880
881           if (rhs_code == COND_EXPR)
882             {
883               tree cond_expr = gimple_assign_rhs1 (stmt);
884
885               if (i == 0)
886                 first_cond_code = TREE_CODE (cond_expr);
887               else if (first_cond_code != TREE_CODE (cond_expr))
888                 {
889                   if (dump_enabled_p ())
890                     {
891                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
892                                        "Build SLP failed: different"
893                                        " operation");
894                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
895                                         stmt, 0);
896                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
897                     }
898                   /* Mismatch.  */
899                   continue;
900                 }
901             }
902         }
903
904       matches[i] = true;
905     }
906
907   for (i = 0; i < group_size; ++i)
908     if (!matches[i])
909       return false;
910
911   return true;
912 }
913
/* Recursively build an SLP tree starting from NODE.
   Fail (and return FALSE) if def-stmts are not isomorphic, require data
   permutation or are of unsupported types of operation.  Otherwise
   return TRUE.  MATCHES[I] records whether scalar stmt I matched the
   group's representative stmt; MATCHES[0] == false signals a fatal
   mismatch.  */
920
static bool
vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                     slp_tree *node, unsigned int group_size,
                     unsigned int *max_nunits,
                     vec<slp_tree> *loads,
                     unsigned int vectorization_factor,
                     bool *matches, unsigned *npermutes, unsigned *tree_size,
                     unsigned max_tree_size)
{
  unsigned nops, i, this_tree_size = 0;
  gimple stmt;

  /* Default to a fatal mismatch; overwritten with per-stmt results
     below.  */
  matches[0] = false;

  /* Determine the number of operands from the representative (first)
     scalar stmt of the group.  */
  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];
  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      /* A COND_EXPR carries the condition as an extra operand.  */
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else
    return false;

  /* Verify the stmts of this group are isomorphic to each other.  */
  if (!vect_build_slp_tree_1 (loop_vinfo, bb_vinfo,
                              SLP_TREE_SCALAR_STMTS (*node), group_size, nops,
                              max_nunits, vectorization_factor, matches))
    return false;

  /* If the SLP node is a load, terminate the recursion.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
      && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
    {
      loads->safe_push (*node);
      return true;
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (*node), i, stmt)
    {
      switch (vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo,
                                           stmt, (i == 0), &oprnds_info))
        {
        case 0:
          break;
        case -1:
          /* Fatal mismatch.  */
          matches[0] = false;
          vect_free_oprnd_info (oprnds_info);
          return false;
        case 1:
          /* Mismatch only for this stmt; a later operand swap may
             still salvage the node.  */
          matches[i] = false;
          break;
        }
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
        vect_free_oprnd_info (oprnds_info);
        return false;
      }

  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_nloads = loads->length ();
      unsigned old_max_nunits = *max_nunits;

      /* Only internal defs get a child node; constants and externals
         are materialized directly from the scalars.  */
      if (oprnd_info->first_dt != vect_internal_def)
        continue;

      /* Honor the tree-size limit to bound the amount of work.  */
      if (++this_tree_size > max_tree_size)
        {
          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      child = vect_create_new_slp_node (oprnd_info->def_stmts);
      if (!child)
        {
          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
                               group_size, max_nunits, loads,
                               vectorization_factor, matches,
                               npermutes, &this_tree_size, max_tree_size))
        {
          /* Ownership of def_stmts transferred to CHILD.  */
          oprnd_info->def_stmts = vNULL;
          SLP_TREE_CHILDREN (*node).quick_push (child);
          continue;
        }

      /* If the SLP build failed fatally and we analyze a basic-block
         simply treat nodes we fail to build as externally defined
         (and thus build vectors from the scalar defs).
         The cost model will reject outright expensive cases.
         ???  This doesn't treat cases where permutation ultimately
         fails (or we don't try permutation below).  Ideally we'd
         even compute a permutation that will end up with the maximum
         SLP tree size...  */
      if (bb_vinfo
          && !matches[0]
          /* ???  Rejecting patterns this way doesn't work.  We'd have to
             do extra work to cancel the pattern so the uses see the
             scalar version.  */
          && !is_pattern_stmt_p (vinfo_for_stmt (stmt)))
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Building vector operands from scalars\n");
          oprnd_info->def_stmts = vNULL;
          vect_free_slp_tree (child);
          /* A NULL child marks an externally-built operand.  */
          SLP_TREE_CHILDREN (*node).quick_push (NULL);
          continue;
        }

      /* If the SLP build for operand zero failed and operand zero
         and one can be commutated try that for the scalar stmts
         that failed the match.  */
      if (i == 0
          /* A first scalar stmt mismatch signals a fatal mismatch.  */
          && matches[0]
          /* ???  For COND_EXPRs we can swap the comparison operands
             as well as the arms under some constraints.  */
          && nops == 2
          && oprnds_info[1]->first_dt == vect_internal_def
          && is_gimple_assign (stmt)
          && commutative_tree_code (gimple_assign_rhs_code (stmt))
          /* Do so only if the number of not successful permutes was no
             more than a cut-off as re-trying the recursive match on
             possibly each level of the tree would expose exponential
             behavior.  */
          && *npermutes < 4)
        {
          unsigned int j;
          slp_tree grandchild;

          /* Roll back.  */
          *max_nunits = old_max_nunits;
          loads->truncate (old_nloads);
          FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
            vect_free_slp_tree (grandchild);
          SLP_TREE_CHILDREN (child).truncate (0);

          /* Swap mismatched definition stmts.  */
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Re-trying with swapped operands of stmts ");
          for (j = 0; j < group_size; ++j)
            if (!matches[j])
              {
                gimple tem = oprnds_info[0]->def_stmts[j];
                oprnds_info[0]->def_stmts[j] = oprnds_info[1]->def_stmts[j];
                oprnds_info[1]->def_stmts[j] = tem;
                dump_printf (MSG_NOTE, "%d ", j);
              }
          dump_printf (MSG_NOTE, "\n");
          /* And try again with scratch 'matches' ... */
          bool *tem = XALLOCAVEC (bool, group_size);
          if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
                                   group_size, max_nunits, loads,
                                   vectorization_factor,
                                   tem, npermutes, &this_tree_size,
                                   max_tree_size))
            {
              /* ... so if successful we can apply the operand swapping
                 to the GIMPLE IL.  This is necessary because for example
                 vect_get_slp_defs uses operand indexes and thus expects
                 canonical operand order.  */
              for (j = 0; j < group_size; ++j)
                if (!matches[j])
                  {
                    gimple stmt = SLP_TREE_SCALAR_STMTS (*node)[j];
                    swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
                                       gimple_assign_rhs2_ptr (stmt));
                  }
              oprnd_info->def_stmts = vNULL;
              SLP_TREE_CHILDREN (*node).quick_push (child);
              continue;
            }

          ++*npermutes;
        }

      /* This child could not be built; fail the whole node.  */
      oprnd_info->def_stmts = vNULL;
      vect_free_slp_tree (child);
      vect_free_oprnd_info (oprnds_info);
      return false;
    }

  if (tree_size)
    *tree_size += this_tree_size;

  vect_free_oprnd_info (oprnds_info);
  return true;
}
1123
1124 /* Dump a slp tree NODE using flags specified in DUMP_KIND.  */
1125
1126 static void
1127 vect_print_slp_tree (int dump_kind, slp_tree node)
1128 {
1129   int i;
1130   gimple stmt;
1131   slp_tree child;
1132
1133   if (!node)
1134     return;
1135
1136   dump_printf (dump_kind, "node ");
1137   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1138     {
1139       dump_printf (dump_kind, "\n\tstmt %d ", i);
1140       dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
1141     }
1142   dump_printf (dump_kind, "\n");
1143
1144   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1145     vect_print_slp_tree (dump_kind, child);
1146 }
1147
1148
1149 /* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
1150    If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
1151    J).  Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
1152    stmts in NODE are to be marked.  */
1153
1154 static void
1155 vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
1156 {
1157   int i;
1158   gimple stmt;
1159   slp_tree child;
1160
1161   if (!node)
1162     return;
1163
1164   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1165     if (j < 0 || i == j)
1166       STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;
1167
1168   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1169     vect_mark_slp_stmts (child, mark, j);
1170 }
1171
1172
1173 /* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */
1174
1175 static void
1176 vect_mark_slp_stmts_relevant (slp_tree node)
1177 {
1178   int i;
1179   gimple stmt;
1180   stmt_vec_info stmt_info;
1181   slp_tree child;
1182
1183   if (!node)
1184     return;
1185
1186   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1187     {
1188       stmt_info = vinfo_for_stmt (stmt);
1189       gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
1190                   || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
1191       STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
1192     }
1193
1194   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1195     vect_mark_slp_stmts_relevant (child);
1196 }
1197
1198
1199 /* Rearrange the statements of NODE according to PERMUTATION.  */
1200
1201 static void
1202 vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
1203                           vec<unsigned> permutation)
1204 {
1205   gimple stmt;
1206   vec<gimple> tmp_stmts;
1207   unsigned int i;
1208   slp_tree child;
1209
1210   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1211     vect_slp_rearrange_stmts (child, group_size, permutation);
1212
1213   gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
1214   tmp_stmts.create (group_size);
1215   tmp_stmts.quick_grow_cleared (group_size);
1216
1217   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1218     tmp_stmts[permutation[i]] = stmt;
1219
1220   SLP_TREE_SCALAR_STMTS (node).release ();
1221   SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
1222 }
1223
1224
1225 /* Check if the required load permutations in the SLP instance
1226    SLP_INSTN are supported.  */
1227
static bool
vect_supported_load_permutation_p (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j, k, next;
  sbitmap load_index;
  slp_tree node;
  gimple stmt, load, next_load, first_load;
  struct data_reference *dr;

  /* Dump the load permutation of every load node; nodes without an
     explicit permutation are printed as the identity.  */
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        if (node->load_permutation.exists ())
          FOR_EACH_VEC_ELT (node->load_permutation, j, next)
            dump_printf (MSG_NOTE, "%d ", next);
        else
          for (k = 0; k < group_size; ++k)
            dump_printf (MSG_NOTE, "%d ", k);
      dump_printf (MSG_NOTE, "\n");
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this 
     permutation).  */

  /* Check that all the load nodes are of the same size.  */
  /* ???  Can't we assert this? */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
      return false;

  node = SLP_INSTANCE_TREE (slp_instn);
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];

  /* Reduction (there are no data-refs in the root).
     In reduction chain the order of the loads is important.  */
  if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
      && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      slp_tree load;
      unsigned int lidx;

      /* Compare all the permutation sequences to the first one.  We know
         that at least one load is permuted.  */
      node = SLP_INSTANCE_LOADS (slp_instn)[0];
      if (!node->load_permutation.exists ())
        return false;
      for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
        {
          if (!load->load_permutation.exists ())
            return false;
          FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
            if (lidx != node->load_permutation[j])
              return false;
        }

      /* Check that the loads in the first sequence are different and there
         are no gaps between them.  */
      load_index = sbitmap_alloc (group_size);
      bitmap_clear (load_index);
      FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
        {
          /* A repeated index means a duplicated load.  */
          if (bitmap_bit_p (load_index, lidx))
            {
              sbitmap_free (load_index);
              return false;
            }
          bitmap_set_bit (load_index, lidx);
        }
      /* A missing index means a gap in the sequence.  */
      for (i = 0; i < group_size; i++)
        if (!bitmap_bit_p (load_index, i))
          {
            sbitmap_free (load_index);
            return false;
          }
      sbitmap_free (load_index);

      /* This permutation is valid for reduction.  Since the order of the
         statements in the nodes is not important unless they are memory
         accesses, we can rearrange the statements in all the nodes
         according to the order of the loads.  */
      vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
                                node->load_permutation);

      /* We are done, no actual permutations need to be generated.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        SLP_TREE_LOAD_PERMUTATION (node).release ();
      return true;
    }

  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment complications.  */
  if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
    {
      /* Check that for every node in the instance the loads
         form a subchain.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          next_load = NULL;
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
            {
              /* Each load must be the interleaving-chain successor of
                 the previous one.  */
              if (j != 0 && next_load != load)
                return false;
              next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
            }
        }

      /* Check that the alignment of the first load in every subchain, i.e.,
         the first statement in every load node, is supported.
         ???  This belongs in alignment checking.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          first_load = SLP_TREE_SCALAR_STMTS (node)[0];
          if (first_load != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load)))
            {
              dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load));
              if (vect_supportable_dr_alignment (dr, false)
                  == dr_unaligned_unsupported)
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                       vect_location,
                                       "unsupported unaligned load ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        first_load, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  return false;
                }
            }
        }

      /* We are done, no actual permutations need to be generated.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        SLP_TREE_LOAD_PERMUTATION (node).release ();
      return true;
    }

  /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
     GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
     well (unless it's reduction).  */
  if (SLP_INSTANCE_LOADS (slp_instn).length () != group_size)
    return false;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (!node->load_permutation.exists ())
      return false;

  load_index = sbitmap_alloc (group_size);
  bitmap_clear (load_index);
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      /* Each node must use a distinct index, and every element of its
         permutation must be that same index.  */
      unsigned int lidx = node->load_permutation[0];
      if (bitmap_bit_p (load_index, lidx))
        {
          sbitmap_free (load_index);
          return false;
        }
      bitmap_set_bit (load_index, lidx);
      FOR_EACH_VEC_ELT (node->load_permutation, j, k)
        if (k != lidx)
          {
            sbitmap_free (load_index);
            return false;
          }
    }
  /* All indices 0..GROUP_SIZE-1 must be covered.  */
  for (i = 0; i < group_size; i++)
    if (!bitmap_bit_p (load_index, i))
      {
        sbitmap_free (load_index);
        return false;
      }
  sbitmap_free (load_index);

  /* Finally verify the target can actually perform each required
     permutation (analysis-only call, no code is emitted).  */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (node->load_permutation.exists ()
        && !vect_transform_slp_perm_load
              (node, vNULL, NULL,
               SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true))
      return false;
  return true;
}
1416
1417
/* Find the scalar stmt appearing last in the basic block among the scalar
   stmts in NODE (for a pattern stmt the original stmt it replaces is
   considered); used to locate the last store of an SLP instance.  */
1419
1420 static gimple
1421 vect_find_last_scalar_stmt_in_slp (slp_tree node)
1422 {
1423   gimple last = NULL, stmt;
1424
1425   for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt); i++)
1426     {
1427       stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1428       if (is_pattern_stmt_p (stmt_vinfo))
1429         last = get_later_stmt (STMT_VINFO_RELATED_STMT (stmt_vinfo), last);
1430       else
1431         last = get_later_stmt (stmt, last);
1432     }
1433
1434   return last;
1435 }
1436
1437 /* Compute the cost for the SLP node NODE in the SLP instance INSTANCE.  */
1438
static void
vect_analyze_slp_cost_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                         slp_instance instance, slp_tree node,
                         stmt_vector_for_cost *prologue_cost_vec,
                         unsigned ncopies_for_cost)
{
  /* Body costs accumulate into the instance's cost vector.  */
  stmt_vector_for_cost *body_cost_vec = &SLP_INSTANCE_BODY_COST_VEC (instance);

  unsigned i;
  slp_tree child;
  gimple stmt, s;
  stmt_vec_info stmt_info;
  tree lhs;
  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  /* Recurse down the SLP tree.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_analyze_slp_cost_1 (loop_vinfo, bb_vinfo,
                               instance, child, prologue_cost_vec,
                               ncopies_for_cost);

  /* Look at the first scalar stmt to determine the cost.  */
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
        vect_model_store_cost (stmt_info, ncopies_for_cost, false,
                               vect_uninitialized_def,
                               node, prologue_cost_vec, body_cost_vec);
      else
        {
          int i;  /* NOTE: shadows the outer unsigned I for this branch.  */
          gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
          vect_model_load_cost (stmt_info, ncopies_for_cost, false,
                                node, prologue_cost_vec, body_cost_vec);
          /* If the load is permuted record the cost for the permutation.
             ???  Loads from multiple chains are let through here only
             for a single special case involving complex numbers where
             in the end no permutation is necessary.  */
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, s)
            if ((STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo_for_stmt (s))
                 == STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info))
                && vect_get_place_in_interleaving_chain
                     (s, STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info)) != i)
              {
                /* One permutation cost per node is enough.  */
                record_stmt_cost (body_cost_vec, group_size, vec_perm,
                                  stmt_info, 0, vect_body);
                break;
              }
        }
    }
  else
    /* Not a grouped access: a plain vector stmt per copy.  */
    record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
                      stmt_info, 0, vect_body);

  /* Scan operands and account for prologue cost of constants/externals.
     ???  This over-estimates cost for multiple uses and should be
     re-engineered.  */
  lhs = gimple_get_lhs (stmt);
  for (i = 0; i < gimple_num_ops (stmt); ++i)
    {
      tree def, op = gimple_op (stmt, i);
      gimple def_stmt;
      enum vect_def_type dt;
      /* Skip the LHS and absent operands.  */
      if (!op || op == lhs)
        continue;
      if (vect_is_simple_use (op, NULL, loop_vinfo, bb_vinfo,
                              &def_stmt, &def, &dt))
        {
          /* Without looking at the actual initializer a vector of
             constants can be implemented as load from the constant pool.
             ???  We need to pass down stmt_info for a vector type
             even if it points to the wrong stmt.  */
          if (dt == vect_constant_def)
            record_stmt_cost (prologue_cost_vec, 1, vector_load,
                              stmt_info, 0, vect_prologue);
          else if (dt == vect_external_def)
            record_stmt_cost (prologue_cost_vec, 1, vec_construct,
                              stmt_info, 0, vect_prologue);
        }
    }
}
1523
1524 /* Compute the cost for the SLP instance INSTANCE.  */
1525
static void
vect_analyze_slp_cost (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                       slp_instance instance, unsigned nunits)
{
  stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
  unsigned ncopies_for_cost;
  stmt_info_for_cost *si;
  unsigned i;

  /* Calculate the number of vector stmts to create based on the unrolling
     factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
     GROUP_SIZE / NUNITS otherwise.  */
  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
  ncopies_for_cost = least_common_multiple (nunits, group_size) / nunits;

  /* Walk the SLP tree accumulating body costs (kept on the instance)
     and prologue costs (collected locally).  */
  prologue_cost_vec.create (10);
  body_cost_vec.create (10);
  SLP_INSTANCE_BODY_COST_VEC (instance) = body_cost_vec;
  vect_analyze_slp_cost_1 (loop_vinfo, bb_vinfo,
                           instance, SLP_INSTANCE_TREE (instance),
                           &prologue_cost_vec, ncopies_for_cost);

  /* Record the prologue costs, which were delayed until we were
     sure that SLP was successful.  Unlike the body costs, we know
     the final values now regardless of the loop vectorization factor.  */
  void *data = (loop_vinfo ? LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
                : BB_VINFO_TARGET_COST_DATA (bb_vinfo));
  FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
    {
      struct _stmt_vec_info *stmt_info
        = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
      (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
                            si->misalign, vect_prologue);
    }

  prologue_cost_vec.release ();
}
1563
1564 /* Analyze an SLP instance starting from a group of grouped stores.  Call
1565    vect_build_slp_tree to build a tree of packed stmts if possible.
1566    Return FALSE if it's impossible to SLP any stmt in the loop.  */
1567
1568 static bool
1569 vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
1570                            gimple stmt, unsigned max_tree_size)
1571 {
1572   slp_instance new_instance;
1573   slp_tree node;
1574   unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
1575   unsigned int unrolling_factor = 1, nunits;
1576   tree vectype, scalar_type = NULL_TREE;
1577   gimple next;
1578   unsigned int vectorization_factor = 0;
1579   int i;
1580   unsigned int max_nunits = 0;
1581   vec<slp_tree> loads;
1582   struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
1583   vec<gimple> scalar_stmts;
1584
1585   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1586     {
1587       if (dr)
1588         {
1589           scalar_type = TREE_TYPE (DR_REF (dr));
1590           vectype = get_vectype_for_scalar_type (scalar_type);
1591         }
1592       else
1593         {
1594           gcc_assert (loop_vinfo);
1595           vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1596         }
1597
1598       group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
1599     }
1600   else
1601     {
1602       gcc_assert (loop_vinfo);
1603       vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1604       group_size = LOOP_VINFO_REDUCTIONS (loop_vinfo).length ();
1605     }
1606
1607   if (!vectype)
1608     {
1609       if (dump_enabled_p ())
1610         {
1611           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1612                            "Build SLP failed: unsupported data-type ");
1613           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
1614           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1615         }
1616
1617       return false;
1618     }
1619
1620   nunits = TYPE_VECTOR_SUBPARTS (vectype);
1621   if (loop_vinfo)
1622     vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1623   else
1624     vectorization_factor = nunits;
1625
1626   /* Calculate the unrolling factor.  */
1627   unrolling_factor = least_common_multiple (nunits, group_size) / group_size;
1628   if (unrolling_factor != 1 && !loop_vinfo)
1629     {
1630       if (dump_enabled_p ())
1631         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1632                          "Build SLP failed: unrolling required in basic"
1633                          " block SLP\n");
1634
1635       return false;
1636     }
1637
1638   /* Create a node (a root of the SLP tree) for the packed grouped stores.  */
1639   scalar_stmts.create (group_size);
1640   next = stmt;
1641   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1642     {
1643       /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
1644       while (next)
1645         {
1646           if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
1647               && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
1648             scalar_stmts.safe_push (
1649                   STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
1650           else
1651             scalar_stmts.safe_push (next);
1652           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
1653         }
1654     }
1655   else
1656     {
1657       /* Collect reduction statements.  */
1658       vec<gimple> reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
1659       for (i = 0; reductions.iterate (i, &next); i++)
1660         scalar_stmts.safe_push (next);
1661     }
1662
1663   node = vect_create_new_slp_node (scalar_stmts);
1664
1665   loads.create (group_size);
1666
1667   /* Build the tree for the SLP instance.  */
1668   bool *matches = XALLOCAVEC (bool, group_size);
1669   unsigned npermutes = 0;
1670   if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &node, group_size,
1671                            &max_nunits, &loads,
1672                            vectorization_factor, matches, &npermutes, NULL,
1673                            max_tree_size))
1674     {
1675       /* Calculate the unrolling factor based on the smallest type.  */
1676       if (max_nunits > nunits)
1677         unrolling_factor = least_common_multiple (max_nunits, group_size)
1678                            / group_size;
1679
1680       if (unrolling_factor != 1 && !loop_vinfo)
1681         {
1682           if (dump_enabled_p ())
1683             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1684                              "Build SLP failed: unrolling required in basic"
1685                              " block SLP\n");
1686           vect_free_slp_tree (node);
1687           loads.release ();
1688           return false;
1689         }
1690
1691       /* Create a new SLP instance.  */
1692       new_instance = XNEW (struct _slp_instance);
1693       SLP_INSTANCE_TREE (new_instance) = node;
1694       SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
1695       SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
1696       SLP_INSTANCE_BODY_COST_VEC (new_instance) = vNULL;
1697       SLP_INSTANCE_LOADS (new_instance) = loads;
1698
1699       /* Compute the load permutation.  */
1700       slp_tree load_node;
1701       bool loads_permuted = false;
1702       FOR_EACH_VEC_ELT (loads, i, load_node)
1703         {
1704           vec<unsigned> load_permutation;
1705           int j;
1706           gimple load, first_stmt;
1707           bool this_load_permuted = false;
1708           load_permutation.create (group_size);
1709           first_stmt = GROUP_FIRST_ELEMENT
1710               (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
1711           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
1712             {
1713               int load_place
1714                 = vect_get_place_in_interleaving_chain (load, first_stmt);
1715               gcc_assert (load_place != -1);
1716               if (load_place != j)
1717                 this_load_permuted = true;
1718               load_permutation.safe_push (load_place);
1719             }
1720           if (!this_load_permuted)
1721             {
1722               load_permutation.release ();
1723               continue;
1724             }
1725           SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
1726           loads_permuted = true;
1727         }
1728
1729       if (loads_permuted)
1730         {
1731           if (!vect_supported_load_permutation_p (new_instance))
1732             {
1733               if (dump_enabled_p ())
1734                 {
1735                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1736                                    "Build SLP failed: unsupported load "
1737                                    "permutation ");
1738                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
1739                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1740                 }
1741               vect_free_slp_instance (new_instance);
1742               return false;
1743             }
1744         }
1745
1746
1747       if (loop_vinfo)
1748         {
1749           /* Compute the costs of this SLP instance.  Delay this for BB
1750              vectorization as we don't have vector types computed yet.  */
1751           vect_analyze_slp_cost (loop_vinfo, bb_vinfo,
1752                                  new_instance, TYPE_VECTOR_SUBPARTS (vectype));
1753           LOOP_VINFO_SLP_INSTANCES (loop_vinfo).safe_push (new_instance);
1754         }
1755       else
1756         BB_VINFO_SLP_INSTANCES (bb_vinfo).safe_push (new_instance);
1757
1758       if (dump_enabled_p ())
1759         vect_print_slp_tree (MSG_NOTE, node);
1760
1761       return true;
1762     }
1763
1764   /* Failed to SLP.  */
1765   /* Free the allocated memory.  */
1766   vect_free_slp_tree (node);
1767   loads.release ();
1768
1769   return false;
1770 }
1771
1772
1773 /* Check if there are stmts in the loop can be vectorized using SLP.  Build SLP
1774    trees of packed scalar stmts if SLP is possible.  */
1775
1776 bool
1777 vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
1778                   unsigned max_tree_size)
1779 {
1780   unsigned int i;
1781   vec<gimple> grouped_stores;
1782   vec<gimple> reductions = vNULL;
1783   vec<gimple> reduc_chains = vNULL;
1784   gimple first_element;
1785   bool ok = false;
1786
1787   if (dump_enabled_p ())
1788     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===\n");
1789
1790   if (loop_vinfo)
1791     {
1792       grouped_stores = LOOP_VINFO_GROUPED_STORES (loop_vinfo);
1793       reduc_chains = LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo);
1794       reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
1795     }
1796   else
1797     grouped_stores = BB_VINFO_GROUPED_STORES (bb_vinfo);
1798
1799   /* Find SLP sequences starting from groups of grouped stores.  */
1800   FOR_EACH_VEC_ELT (grouped_stores, i, first_element)
1801     if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
1802                                    max_tree_size))
1803       ok = true;
1804
1805   if (bb_vinfo && !ok)
1806     {
1807       if (dump_enabled_p ())
1808         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1809                          "Failed to SLP the basic block.\n");
1810
1811       return false;
1812     }
1813
1814   if (loop_vinfo
1815       && LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).length () > 0)
1816     {
1817       /* Find SLP sequences starting from reduction chains.  */
1818       FOR_EACH_VEC_ELT (reduc_chains, i, first_element)
1819         if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
1820                                        max_tree_size))
1821           ok = true;
1822         else
1823           return false;
1824
1825       /* Don't try to vectorize SLP reductions if reduction chain was
1826          detected.  */
1827       return ok;
1828     }
1829
1830   /* Find SLP sequences starting from groups of reductions.  */
1831   if (loop_vinfo && LOOP_VINFO_REDUCTIONS (loop_vinfo).length () > 1
1832       && vect_analyze_slp_instance (loop_vinfo, bb_vinfo, reductions[0],
1833                                     max_tree_size))
1834     ok = true;
1835
1836   return true;
1837 }
1838
1839
1840 /* For each possible SLP instance decide whether to SLP it and calculate overall
1841    unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
1842    least one instance.  */
1843
1844 bool
1845 vect_make_slp_decision (loop_vec_info loop_vinfo)
1846 {
1847   unsigned int i, unrolling_factor = 1;
1848   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1849   slp_instance instance;
1850   int decided_to_slp = 0;
1851
1852   if (dump_enabled_p ())
1853     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ==="
1854                      "\n");
1855
1856   FOR_EACH_VEC_ELT (slp_instances, i, instance)
1857     {
1858       /* FORNOW: SLP if you can.  */
1859       if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance))
1860         unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance);
1861
1862       /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
1863          call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
1864          loop-based vectorization.  Such stmts will be marked as HYBRID.  */
1865       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
1866       decided_to_slp++;
1867     }
1868
1869   LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
1870
1871   if (decided_to_slp && dump_enabled_p ())
1872     dump_printf_loc (MSG_NOTE, vect_location,
1873                      "Decided to SLP %d instances. Unrolling factor %d\n",
1874                      decided_to_slp, unrolling_factor);
1875
1876   return (decided_to_slp > 0);
1877 }
1878
1879
/* Find stmts that must be both vectorized and SLPed (since they feed stmts that
   can't be SLPed) in the tree rooted at NODE.  Mark such stmts as HYBRID.

   I is the index of the scalar lane inspected in each node of the SLP
   tree; STYPE is the SLP type established so far on the path from the
   root - once it becomes HYBRID the whole subtree below is forced to
   HYBRID as well.  */

static void
vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
{
  gimple stmt = SLP_TREE_SCALAR_STMTS (node)[i];
  imm_use_iterator imm_iter;
  gimple use_stmt;
  stmt_vec_info use_vinfo, stmt_vinfo = vinfo_for_stmt (stmt);
  slp_tree child;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  int j;

  /* Propagate hybrid down the SLP tree.  */
  if (stype == hybrid)
    ;
  else if (HYBRID_SLP_STMT (stmt_vinfo))
    stype = hybrid;
  else
    {
      /* Check if a pure SLP stmt has uses in non-SLP stmts.  */
      gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
      if (TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME)
        FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0))
          /* A use turns STMT hybrid when it is inside the loop, has a
             stmt_vec_info (note the assignment to USE_VINFO inside the
             condition), is not itself part of any SLP instance, and will
             be vectorized by the loop vectorizer: either relevant, a
             vectorizable cycle def, or a pattern stmt whose replacement
             stmt is not SLPed.  Uses in a reduction PHI are excluded;
             those are handled by the reduction epilogue.  */
          if (gimple_bb (use_stmt)
              && flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
              && (use_vinfo = vinfo_for_stmt (use_stmt))
              && !STMT_SLP_TYPE (use_vinfo)
              && (STMT_VINFO_RELEVANT (use_vinfo)
                  || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo))
                  || (STMT_VINFO_IN_PATTERN_P (use_vinfo)
                      && STMT_VINFO_RELATED_STMT (use_vinfo)
                      && !STMT_SLP_TYPE (vinfo_for_stmt
                            (STMT_VINFO_RELATED_STMT (use_vinfo)))))
              && !(gimple_code (use_stmt) == GIMPLE_PHI
                   && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
            stype = hybrid;
    }

  if (stype == hybrid)
    STMT_SLP_TYPE (stmt_vinfo) = hybrid;

  /* Recurse into the children, propagating the (possibly updated) type.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
    if (child)
      vect_detect_hybrid_slp_stmts (child, i, stype);
}
1928
1929 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses.  */
1930
1931 static tree
1932 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
1933 {
1934   walk_stmt_info *wi = (walk_stmt_info *)data;
1935   struct loop *loopp = (struct loop *)wi->info;
1936
1937   if (wi->is_lhs)
1938     return NULL_TREE;
1939
1940   if (TREE_CODE (*tp) == SSA_NAME
1941       && !SSA_NAME_IS_DEFAULT_DEF (*tp))
1942     {
1943       gimple def_stmt = SSA_NAME_DEF_STMT (*tp);
1944       if (flow_bb_inside_loop_p (loopp, gimple_bb (def_stmt))
1945           && PURE_SLP_STMT (vinfo_for_stmt (def_stmt)))
1946         STMT_SLP_TYPE (vinfo_for_stmt (def_stmt)) = hybrid;
1947     }
1948
1949   return NULL_TREE;
1950 }
1951
1952 static tree
1953 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
1954                           walk_stmt_info *)
1955 {
1956   /* If the stmt is in a SLP instance then this isn't a reason
1957      to mark use definitions in other SLP instances as hybrid.  */
1958   if (STMT_SLP_TYPE (vinfo_for_stmt (gsi_stmt (*gsi))) != loop_vect)
1959     *handled = true;
1960   return NULL_TREE;
1961 }
1962
1963 /* Find stmts that must be both vectorized and SLPed.  */
1964
1965 void
1966 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
1967 {
1968   unsigned int i;
1969   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1970   slp_instance instance;
1971
1972   if (dump_enabled_p ())
1973     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ==="
1974                      "\n");
1975
1976   /* First walk all pattern stmt in the loop and mark defs of uses as
1977      hybrid because immediate uses in them are not recorded.  */
1978   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
1979     {
1980       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
1981       for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
1982            gsi_next (&gsi))
1983         {
1984           gimple stmt = gsi_stmt (gsi);
1985           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1986           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
1987             {
1988               walk_stmt_info wi;
1989               memset (&wi, 0, sizeof (wi));
1990               wi.info = LOOP_VINFO_LOOP (loop_vinfo);
1991               gimple_stmt_iterator gsi2
1992                 = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
1993               walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
1994                                 vect_detect_hybrid_slp_1, &wi);
1995               walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
1996                                vect_detect_hybrid_slp_2,
1997                                vect_detect_hybrid_slp_1, &wi);
1998             }
1999         }
2000     }
2001
2002   /* Then walk the SLP instance trees marking stmts with uses in
2003      non-SLP stmts as hybrid, also propagating hybrid down the
2004      SLP tree, collecting the above info on-the-fly.  */
2005   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2006     {
2007       for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
2008         vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
2009                                       i, pure_slp);
2010     }
2011 }
2012
2013
2014 /* Create and initialize a new bb_vec_info struct for BB, as well as
2015    stmt_vec_info structs for all the stmts in it.  */
2016
2017 static bb_vec_info
2018 new_bb_vec_info (basic_block bb)
2019 {
2020   bb_vec_info res = NULL;
2021   gimple_stmt_iterator gsi;
2022
2023   res = (bb_vec_info) xcalloc (1, sizeof (struct _bb_vec_info));
2024   BB_VINFO_BB (res) = bb;
2025
2026   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2027     {
2028       gimple stmt = gsi_stmt (gsi);
2029       gimple_set_uid (stmt, 0);
2030       set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, NULL, res));
2031     }
2032
2033   BB_VINFO_GROUPED_STORES (res).create (10);
2034   BB_VINFO_SLP_INSTANCES (res).create (2);
2035   BB_VINFO_TARGET_COST_DATA (res) = init_cost (NULL);
2036
2037   bb->aux = res;
2038   return res;
2039 }
2040
2041
2042 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2043    stmts in the basic block.  */
2044
2045 static void
2046 destroy_bb_vec_info (bb_vec_info bb_vinfo)
2047 {
2048   vec<slp_instance> slp_instances;
2049   slp_instance instance;
2050   basic_block bb;
2051   gimple_stmt_iterator si;
2052   unsigned i;
2053
2054   if (!bb_vinfo)
2055     return;
2056
2057   bb = BB_VINFO_BB (bb_vinfo);
2058
2059   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2060     {
2061       gimple stmt = gsi_stmt (si);
2062       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2063
2064       if (stmt_info)
2065         /* Free stmt_vec_info.  */
2066         free_stmt_vec_info (stmt);
2067     }
2068
2069   vect_destroy_datarefs (NULL, bb_vinfo);
2070   free_dependence_relations (BB_VINFO_DDRS (bb_vinfo));
2071   BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
2072   slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2073   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2074     vect_free_slp_instance (instance);
2075   BB_VINFO_SLP_INSTANCES (bb_vinfo).release ();
2076   destroy_cost_data (BB_VINFO_TARGET_COST_DATA (bb_vinfo));
2077   free (bb_vinfo);
2078   bb->aux = NULL;
2079 }
2080
2081
2082 /* Analyze statements contained in SLP tree node after recursively analyzing
2083    the subtree. Return TRUE if the operations are supported.  */
2084
2085 static bool
2086 vect_slp_analyze_node_operations (bb_vec_info bb_vinfo, slp_tree node)
2087 {
2088   bool dummy;
2089   int i;
2090   gimple stmt;
2091   slp_tree child;
2092
2093   if (!node)
2094     return true;
2095
2096   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2097     if (!vect_slp_analyze_node_operations (bb_vinfo, child))
2098       return false;
2099
2100   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2101     {
2102       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2103       gcc_assert (stmt_info);
2104       gcc_assert (PURE_SLP_STMT (stmt_info));
2105
2106       if (!vect_analyze_stmt (stmt, &dummy, node))
2107         return false;
2108     }
2109
2110   return true;
2111 }
2112
2113
2114 /* Analyze statements in SLP instances of the basic block.  Return TRUE if the
2115    operations are supported. */
2116
2117 static bool
2118 vect_slp_analyze_operations (bb_vec_info bb_vinfo)
2119 {
2120   vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2121   slp_instance instance;
2122   int i;
2123
2124   for (i = 0; slp_instances.iterate (i, &instance); )
2125     {
2126       if (!vect_slp_analyze_node_operations (bb_vinfo,
2127                                              SLP_INSTANCE_TREE (instance)))
2128         {
2129           vect_free_slp_instance (instance);
2130           slp_instances.ordered_remove (i);
2131         }
2132       else
2133         i++;
2134     }
2135
2136   if (!slp_instances.length ())
2137     return false;
2138
2139   return true;
2140 }
2141
2142
/* Compute the scalar cost of the SLP node NODE and its children
   and return it.  Do not account defs that are marked in LIFE and
   update LIFE according to uses of NODE.  LIFE has one entry per
   lane (group member) and is shared across the whole recursion, so
   a lane found live here is also skipped in all children.  */

static unsigned
vect_bb_slp_scalar_cost (basic_block bb,
                         slp_tree node, vec<bool, va_heap> *life)
{
  unsigned scalar_cost = 0;
  unsigned i;
  gimple stmt;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      unsigned stmt_cost;
      ssa_op_iter op_iter;
      def_operand_p def_p;
      stmt_vec_info stmt_info;

      /* Lane already known to stay live - not accounted.  */
      if ((*life)[i])
        continue;

      /* If there is a non-vectorized use of the defs then the scalar
         stmt is kept live in which case we do not account it or any
         required defs in the SLP children in the scalar cost.  This
         way we make the vectorization more costly when compared to
         the scalar cost.  */
      FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
        {
          imm_use_iterator use_iter;
          gimple use_stmt;
          /* A PHI use, a use outside BB or a use in a stmt that will
             not be vectorized keeps the scalar stmt live.  */
          FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
            if (!is_gimple_debug (use_stmt)
                && (gimple_code (use_stmt) == GIMPLE_PHI
                    || gimple_bb (use_stmt) != bb
                    || !STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (use_stmt))))
              {
                (*life)[i] = true;
                /* Must leave the imm-use iteration via this macro to
                   keep the iterator state consistent.  */
                BREAK_FROM_IMM_USE_STMT (use_iter);
              }
        }
      if ((*life)[i])
        continue;

      /* Pick the scalar cost kind matching the stmt class.  */
      stmt_info = vinfo_for_stmt (stmt);
      if (STMT_VINFO_DATA_REF (stmt_info))
        {
          if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
            stmt_cost = vect_get_stmt_cost (scalar_load);
          else
            stmt_cost = vect_get_stmt_cost (scalar_store);
        }
      else
        stmt_cost = vect_get_stmt_cost (scalar_stmt);

      scalar_cost += stmt_cost;
    }

  /* Recurse, accumulating the children's scalar cost.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      scalar_cost += vect_bb_slp_scalar_cost (bb, child, life);

  return scalar_cost;
}
2208
2209 /* Check if vectorization of the basic block is profitable.  */
2210
2211 static bool
2212 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2213 {
2214   vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2215   slp_instance instance;
2216   int i, j;
2217   unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2218   unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2219   void *target_cost_data = BB_VINFO_TARGET_COST_DATA (bb_vinfo);
2220   stmt_vec_info stmt_info = NULL;
2221   stmt_vector_for_cost body_cost_vec;
2222   stmt_info_for_cost *ci;
2223
2224   /* Calculate vector costs.  */
2225   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2226     {
2227       body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
2228
2229       FOR_EACH_VEC_ELT (body_cost_vec, j, ci)
2230         {
2231           stmt_info = ci->stmt ? vinfo_for_stmt (ci->stmt) : NULL;
2232           (void) add_stmt_cost (target_cost_data, ci->count, ci->kind,
2233                                 stmt_info, ci->misalign, vect_body);
2234         }
2235     }
2236
2237   /* Calculate scalar cost.  */
2238   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2239     {
2240       auto_vec<bool, 20> life;
2241       life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
2242       scalar_cost += vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2243                                               SLP_INSTANCE_TREE (instance),
2244                                               &life);
2245     }
2246
2247   /* Complete the target-specific cost calculation.  */
2248   finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2249                &vec_inside_cost, &vec_epilogue_cost);
2250
2251   vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2252
2253   if (dump_enabled_p ())
2254     {
2255       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2256       dump_printf (MSG_NOTE, "  Vector inside of basic block cost: %d\n",
2257                    vec_inside_cost);
2258       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n", vec_prologue_cost);
2259       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n", vec_epilogue_cost);
2260       dump_printf (MSG_NOTE, "  Scalar cost of basic block: %d\n", scalar_cost);
2261     }
2262
2263   /* Vectorization is profitable if its cost is less than the cost of scalar
2264      version.  */
2265   if (vec_outside_cost + vec_inside_cost >= scalar_cost)
2266     return false;
2267
2268   return true;
2269 }
2270
/* Check if the basic block can be vectorized.  Returns a filled-in
   bb_vec_info on success; on any failure the bb_vec_info built so far
   is destroyed and NULL is returned.  Analysis proceeds in a fixed
   pipeline: data refs -> accesses -> pattern recog -> alignment ->
   SLP tree building -> dependences -> operations -> cost model.  */

static bb_vec_info
vect_slp_analyze_bb_1 (basic_block bb)
{
  bb_vec_info bb_vinfo;
  vec<slp_instance> slp_instances;
  slp_instance instance;
  int i;
  int min_vf = 2;
  unsigned n_stmts = 0;

  bb_vinfo = new_bb_vec_info (bb);
  if (!bb_vinfo)
    return NULL;

  /* Find and analyze the data references in BB; N_STMTS is set to the
     number of stmts seen and bounds the later SLP tree size.  */
  if (!vect_analyze_data_refs (NULL, bb_vinfo, &min_vf, &n_stmts))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: unhandled data-ref in basic "
                         "block.\n");

      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  /* SLP needs at least a pair of data refs to group.  */
  if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: not enough data-refs in "
                         "basic block.\n");

      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  if (!vect_analyze_data_ref_accesses (NULL, bb_vinfo))
    {
     if (dump_enabled_p ())
       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                        "not vectorized: unhandled data access in "
                        "basic block.\n");

      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  vect_pattern_recog (NULL, bb_vinfo);

  if (!vect_analyze_data_refs_alignment (NULL, bb_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: bad data alignment in basic "
                         "block.\n");

      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  /* Check the SLP opportunities in the basic block, analyze and build SLP
     trees.  */
  if (!vect_analyze_slp (NULL, bb_vinfo, n_stmts))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: failed to find SLP opportunities "
                         "in basic block.\n");

      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);

  /* Mark all the statements that we want to vectorize as pure SLP and
     relevant.  */
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
      vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
    }

  /* Mark all the statements that we do not want to vectorize.  */
  for (gimple_stmt_iterator gsi = gsi_start_bb (BB_VINFO_BB (bb_vinfo));
       !gsi_end_p (gsi); gsi_next (&gsi))
    {
      stmt_vec_info vinfo = vinfo_for_stmt (gsi_stmt (gsi));
      if (STMT_SLP_TYPE (vinfo) != pure_slp)
        STMT_VINFO_VECTORIZABLE (vinfo) = false;
    }

  /* Analyze dependences.  At this point all stmts not participating in
     vectorization have to be marked.  Dependence analysis assumes
     that we either vectorize all SLP instances or none at all.  */
  if (!vect_slp_analyze_data_ref_dependences (bb_vinfo))
     {
       if (dump_enabled_p ())
         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                          "not vectorized: unhandled data dependence "
                          "in basic block.\n");

       destroy_bb_vec_info (bb_vinfo);
       return NULL;
     }

  if (!vect_verify_datarefs_alignment (NULL, bb_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: unsupported alignment in basic "
                         "block.\n");
      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  if (!vect_slp_analyze_operations (bb_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: bad operation in basic block.\n");

      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  /* Compute the costs of the SLP instances.  This was delayed until
     here because the vector types were not known earlier.  */
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      gimple stmt = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
      tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
      vect_analyze_slp_cost (NULL, bb_vinfo,
                             instance, TYPE_VECTOR_SUBPARTS (vectype));
    }

  /* Cost model: check if the vectorization is worthwhile.  */
  if (!unlimited_cost_model (NULL)
      && !vect_bb_vectorization_profitable_p (bb_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: vectorization is not "
                         "profitable.\n");

      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "Basic block will be vectorized using SLP\n");

  return bb_vinfo;
}
2427
2428
2429 bb_vec_info
2430 vect_slp_analyze_bb (basic_block bb)
2431 {
2432   bb_vec_info bb_vinfo;
2433   int insns = 0;
2434   gimple_stmt_iterator gsi;
2435   unsigned int vector_sizes;
2436
2437   if (dump_enabled_p ())
2438     dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");
2439
2440   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2441     {
2442       gimple stmt = gsi_stmt (gsi);
2443       if (!is_gimple_debug (stmt)
2444           && !gimple_nop_p (stmt)
2445           && gimple_code (stmt) != GIMPLE_LABEL)
2446         insns++;
2447     }
2448
2449   if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2450     {
2451       if (dump_enabled_p ())
2452         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2453                          "not vectorized: too many instructions in "
2454                          "basic block.\n");
2455
2456       return NULL;
2457     }
2458
2459   /* Autodetect first vector size we try.  */
2460   current_vector_size = 0;
2461   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2462
2463   while (1)
2464     {
2465       bb_vinfo = vect_slp_analyze_bb_1 (bb);
2466       if (bb_vinfo)
2467         return bb_vinfo;
2468
2469       destroy_bb_vec_info (bb_vinfo);
2470
2471       vector_sizes &= ~current_vector_size;
2472       if (vector_sizes == 0
2473           || current_vector_size == 0)
2474         return NULL;
2475
2476       /* Try the next biggest vector size.  */
2477       current_vector_size = 1 << floor_log2 (vector_sizes);
2478       if (dump_enabled_p ())
2479         dump_printf_loc (MSG_NOTE, vect_location,
2480                          "***** Re-trying analysis with "
2481                          "vector size %d\n", current_vector_size);
2482     }
2483 }
2484
2485
2486 /* SLP costs are calculated according to SLP instance unrolling factor (i.e.,
2487    the number of created vector stmts depends on the unrolling factor).
2488    However, the actual number of vector stmts for every SLP node depends on
2489    VF which is set later in vect_analyze_operations ().  Hence, SLP costs
2490    should be updated.  In this function we assume that the inside costs
2491    calculated in vect_model_xxx_cost are linear in ncopies.  */
2492
2493 void
2494 vect_update_slp_costs_according_to_vf (loop_vec_info loop_vinfo)
2495 {
2496   unsigned int i, j, vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2497   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2498   slp_instance instance;
2499   stmt_vector_for_cost body_cost_vec;
2500   stmt_info_for_cost *si;
2501   void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2502
2503   if (dump_enabled_p ())
2504     dump_printf_loc (MSG_NOTE, vect_location,
2505                      "=== vect_update_slp_costs_according_to_vf ===\n");
2506
2507   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2508     {
2509       /* We assume that costs are linear in ncopies.  */
2510       int ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (instance);
2511
2512       /* Record the instance's instructions in the target cost model.
2513          This was delayed until here because the count of instructions
2514          isn't known beforehand.  */
2515       body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
2516
2517       FOR_EACH_VEC_ELT (body_cost_vec, j, si)
2518         (void) add_stmt_cost (data, si->count * ncopies, si->kind,
2519                               vinfo_for_stmt (si->stmt), si->misalign,
2520                               vect_body);
2521     }
2522 }
2523
2524
2525 /* For constant and loop invariant defs of SLP_NODE this function returns
2526    (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
2527    OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
2528    scalar stmts.  NUMBER_OF_VECTORS is the number of vector defs to create.
2529    REDUC_INDEX is the index of the reduction operand in the statements, unless
2530    it is -1.  */
2531
2532 static void
2533 vect_get_constant_vectors (tree op, slp_tree slp_node,
2534                            vec<tree> *vec_oprnds,
2535                            unsigned int op_num, unsigned int number_of_vectors,
2536                            int reduc_index)
2537 {
2538   vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2539   gimple stmt = stmts[0];
2540   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2541   unsigned nunits;
2542   tree vec_cst;
2543   tree *elts;
2544   unsigned j, number_of_places_left_in_vector;
2545   tree vector_type;
2546   tree vop;
2547   int group_size = stmts.length ();
2548   unsigned int vec_num, i;
2549   unsigned number_of_copies = 1;
2550   vec<tree> voprnds;
2551   voprnds.create (number_of_vectors);
2552   bool constant_p, is_store;
2553   tree neutral_op = NULL;
2554   enum tree_code code = gimple_expr_code (stmt);
2555   gimple def_stmt;
2556   struct loop *loop;
2557   gimple_seq ctor_seq = NULL;
2558
2559   if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
2560       && reduc_index != -1)
2561     {
2562       op_num = reduc_index - 1;
2563       op = gimple_op (stmt, reduc_index);
2564       /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
2565          we need either neutral operands or the original operands.  See
2566          get_initial_def_for_reduction() for details.  */
2567       switch (code)
2568         {
2569           case WIDEN_SUM_EXPR:
2570           case DOT_PROD_EXPR:
2571           case PLUS_EXPR:
2572           case MINUS_EXPR:
2573           case BIT_IOR_EXPR:
2574           case BIT_XOR_EXPR:
2575              if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2576                neutral_op = build_real (TREE_TYPE (op), dconst0);
2577              else
2578                neutral_op = build_int_cst (TREE_TYPE (op), 0);
2579
2580              break;
2581
2582           case MULT_EXPR:
2583              if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2584                neutral_op = build_real (TREE_TYPE (op), dconst1);
2585              else
2586                neutral_op = build_int_cst (TREE_TYPE (op), 1);
2587
2588              break;
2589
2590           case BIT_AND_EXPR:
2591             neutral_op = build_int_cst (TREE_TYPE (op), -1);
2592             break;
2593
2594           /* For MIN/MAX we don't have an easy neutral operand but
2595              the initial values can be used fine here.  Only for
2596              a reduction chain we have to force a neutral element.  */
2597           case MAX_EXPR:
2598           case MIN_EXPR:
2599             if (!GROUP_FIRST_ELEMENT (stmt_vinfo))
2600               neutral_op = NULL;
2601             else
2602               {
2603                 def_stmt = SSA_NAME_DEF_STMT (op);
2604                 loop = (gimple_bb (stmt))->loop_father;
2605                 neutral_op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2606                                                     loop_preheader_edge (loop));
2607               }
2608             break;
2609
2610           default:
2611             neutral_op = NULL;
2612         }
2613     }
2614
2615   if (STMT_VINFO_DATA_REF (stmt_vinfo))
2616     {
2617       is_store = true;
2618       op = gimple_assign_rhs1 (stmt);
2619     }
2620   else
2621     is_store = false;
2622
2623   gcc_assert (op);
2624
2625   if (CONSTANT_CLASS_P (op))
2626     constant_p = true;
2627   else
2628     constant_p = false;
2629
2630   vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
2631   gcc_assert (vector_type);
2632   nunits = TYPE_VECTOR_SUBPARTS (vector_type);
2633
2634   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
2635      created vectors. It is greater than 1 if unrolling is performed.
2636
2637      For example, we have two scalar operands, s1 and s2 (e.g., group of
2638      strided accesses of size two), while NUNITS is four (i.e., four scalars
2639      of this type can be packed in a vector).  The output vector will contain
2640      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
2641      will be 2).
2642
2643      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
2644      containing the operands.
2645
2646      For example, NUNITS is four as before, and the group size is 8
2647      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
2648      {s5, s6, s7, s8}.  */
2649
2650   number_of_copies = least_common_multiple (nunits, group_size) / group_size;
2651
2652   number_of_places_left_in_vector = nunits;
2653   elts = XALLOCAVEC (tree, nunits);
2654   bool place_after_defs = false;
2655   for (j = 0; j < number_of_copies; j++)
2656     {
2657       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
2658         {
2659           if (is_store)
2660             op = gimple_assign_rhs1 (stmt);
2661           else
2662             {
2663               switch (code)
2664                 {
2665                   case COND_EXPR:
2666                     if (op_num == 0 || op_num == 1)
2667                       {
2668                         tree cond = gimple_assign_rhs1 (stmt);
2669                         op = TREE_OPERAND (cond, op_num);
2670                       }
2671                     else
2672                       {
2673                         if (op_num == 2)
2674                           op = gimple_assign_rhs2 (stmt);
2675                         else
2676                           op = gimple_assign_rhs3 (stmt);
2677                       }
2678                     break;
2679
2680                   case CALL_EXPR:
2681                     op = gimple_call_arg (stmt, op_num);
2682                     break;
2683
2684                   case LSHIFT_EXPR:
2685                   case RSHIFT_EXPR:
2686                   case LROTATE_EXPR:
2687                   case RROTATE_EXPR:
2688                     op = gimple_op (stmt, op_num + 1);
2689                     /* Unlike the other binary operators, shifts/rotates have
2690                        the shift count being int, instead of the same type as
2691                        the lhs, so make sure the scalar is the right type if
2692                        we are dealing with vectors of
2693                        long long/long/short/char.  */
2694                     if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
2695                       op = fold_convert (TREE_TYPE (vector_type), op);
2696                     break;
2697
2698                   default:
2699                     op = gimple_op (stmt, op_num + 1);
2700                     break;
2701                 }
2702             }
2703
2704           if (reduc_index != -1)
2705             {
2706               loop = (gimple_bb (stmt))->loop_father;
2707               def_stmt = SSA_NAME_DEF_STMT (op);
2708
2709               gcc_assert (loop);
2710
2711               /* Get the def before the loop.  In reduction chain we have only
2712                  one initial value.  */
2713               if ((j != (number_of_copies - 1)
2714                    || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2715                        && i != 0))
2716                   && neutral_op)
2717                 op = neutral_op;
2718               else
2719                 op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2720                                             loop_preheader_edge (loop));
2721             }
2722
2723           /* Create 'vect_ = {op0,op1,...,opn}'.  */
2724           number_of_places_left_in_vector--;
2725           tree orig_op = op;
2726           if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
2727             {
2728               if (CONSTANT_CLASS_P (op))
2729                 {
2730                   op = fold_unary (VIEW_CONVERT_EXPR,
2731                                    TREE_TYPE (vector_type), op);
2732                   gcc_assert (op && CONSTANT_CLASS_P (op));
2733                 }
2734               else
2735                 {
2736                   tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
2737                   gimple init_stmt;
2738                   op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type), op);
2739                   init_stmt
2740                     = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR, op);
2741                   gimple_seq_add_stmt (&ctor_seq, init_stmt);
2742                   op = new_temp;
2743                 }
2744             }
2745           elts[number_of_places_left_in_vector] = op;
2746           if (!CONSTANT_CLASS_P (op))
2747             constant_p = false;
2748           if (TREE_CODE (orig_op) == SSA_NAME
2749               && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
2750               && STMT_VINFO_BB_VINFO (stmt_vinfo)
2751               && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
2752                   == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
2753             place_after_defs = true;
2754
2755           if (number_of_places_left_in_vector == 0)
2756             {
2757               number_of_places_left_in_vector = nunits;
2758
2759               if (constant_p)
2760                 vec_cst = build_vector (vector_type, elts);
2761               else
2762                 {
2763                   vec<constructor_elt, va_gc> *v;
2764                   unsigned k;
2765                   vec_alloc (v, nunits);
2766                   for (k = 0; k < nunits; ++k)
2767                     CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
2768                   vec_cst = build_constructor (vector_type, v);
2769                 }
2770               tree init;
2771               gimple_stmt_iterator gsi;
2772               if (place_after_defs)
2773                 {
2774                   gsi = gsi_for_stmt
2775                           (vect_find_last_scalar_stmt_in_slp (slp_node));
2776                   init = vect_init_vector (stmt, vec_cst, vector_type, &gsi);
2777                 }
2778               else
2779                 init = vect_init_vector (stmt, vec_cst, vector_type, NULL);
2780               if (ctor_seq != NULL)
2781                 {
2782                   gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
2783                   gsi_insert_seq_before_without_update (&gsi, ctor_seq,
2784                                                         GSI_SAME_STMT);
2785                   ctor_seq = NULL;
2786                 }
2787               voprnds.quick_push (init);
2788               place_after_defs = false;
2789             }
2790         }
2791     }
2792
2793   /* Since the vectors are created in the reverse order, we should invert
2794      them.  */
2795   vec_num = voprnds.length ();
2796   for (j = vec_num; j != 0; j--)
2797     {
2798       vop = voprnds[j - 1];
2799       vec_oprnds->quick_push (vop);
2800     }
2801
2802   voprnds.release ();
2803
2804   /* In case that VF is greater than the unrolling factor needed for the SLP
2805      group of stmts, NUMBER_OF_VECTORS to be created is greater than
2806      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
2807      to replicate the vectors.  */
2808   while (number_of_vectors > vec_oprnds->length ())
2809     {
2810       tree neutral_vec = NULL;
2811
2812       if (neutral_op)
2813         {
2814           if (!neutral_vec)
2815             neutral_vec = build_vector_from_val (vector_type, neutral_op);
2816
2817           vec_oprnds->quick_push (neutral_vec);
2818         }
2819       else
2820         {
2821           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
2822             vec_oprnds->quick_push (vop);
2823         }
2824     }
2825 }
2826
2827
2828 /* Get vectorized definitions from SLP_NODE that contains corresponding
2829    vectorized def-stmts.  */
2830
2831 static void
2832 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
2833 {
2834   tree vec_oprnd;
2835   gimple vec_def_stmt;
2836   unsigned int i;
2837
2838   gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
2839
2840   FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
2841     {
2842       gcc_assert (vec_def_stmt);
2843       vec_oprnd = gimple_get_lhs (vec_def_stmt);
2844       vec_oprnds->quick_push (vec_oprnd);
2845     }
2846 }
2847
2848
/* Get vectorized definitions for SLP_NODE.
   If the scalar definitions are loop invariants or constants, collect them and
   call vect_get_constant_vectors() to create vector stmts.
   Otherwise, the def-stmts must be already vectorized and the vectorized stmts
   must be stored in the corresponding child of SLP_NODE, and we call
   vect_get_slp_vect_defs () to retrieve them.

   OPS are the scalar operands of the first stmt in the node; one vec<tree>
   of vector defs is pushed onto VEC_OPRNDS per operand.  REDUC_INDEX is the
   reduction-operand index (or -1); for reductions only the first operand's
   initial values are produced.  */

void
vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
                   vec<vec<tree> > *vec_oprnds, int reduc_index)
{
  gimple first_stmt;
  int number_of_vects = 0, i;
  unsigned int child_index = 0;
  HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
  slp_tree child = NULL;
  vec<tree> vec_defs;
  tree oprnd;
  bool vectorized_defs;

  first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
  FOR_EACH_VEC_ELT (ops, i, oprnd)
    {
      /* For each operand we check if it has vectorized definitions in a child
         node or we need to create them (for invariants and constants).  We
         check if the LHS of the first stmt of the next child matches OPRND.
         If it does, we found the correct child.  Otherwise, we call
         vect_get_constant_vectors (), and not advance CHILD_INDEX in order
         to check this child node for the next operand.  */
      vectorized_defs = false;
      if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
        {
          child = SLP_TREE_CHILDREN (slp_node)[child_index];

          /* We have to check both pattern and original def, if available.  */
          if (child)
            {
              gimple first_def = SLP_TREE_SCALAR_STMTS (child)[0];
              gimple related
                = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));

              if (operand_equal_p (oprnd, gimple_get_lhs (first_def), 0)
                  || (related
                      && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
                {
                  /* The number of vector defs is determined by the number of
                     vector statements in the node from which we get those
                     statements.  */
                  number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
                  vectorized_defs = true;
                  child_index++;
                }
            }
          else
            /* NULL child (external def) - skip it but keep looking at the
               next child for the next operand.  */
            child_index++;
        }

      if (!vectorized_defs)
        {
          if (i == 0)
            {
              number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
              /* Number of vector stmts was calculated according to LHS in
                 vect_schedule_slp_instance (), fix it by replacing LHS with
                 RHS, if necessary.  See vect_get_smallest_scalar_type () for
                 details.  */
              vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
                                             &rhs_size_unit);
              if (rhs_size_unit != lhs_size_unit)
                {
                  number_of_vects *= rhs_size_unit;
                  number_of_vects /= lhs_size_unit;
                }
            }
        }

      /* Allocate memory for vectorized defs.  */
      vec_defs = vNULL;
      vec_defs.create (number_of_vects);

      /* For reduction defs we call vect_get_constant_vectors (), since we are
         looking for initial loop invariant values.  */
      if (vectorized_defs && reduc_index == -1)
        /* The defs are already vectorized.  */
        vect_get_slp_vect_defs (child, &vec_defs);
      else
        /* Build vectors from scalar defs.  */
        vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
                                   number_of_vects, reduc_index);

      vec_oprnds->quick_push (vec_defs);

      /* For reductions, we only need initial values.  */
      if (reduc_index != -1)
        return;
    }
}
2946
2947
/* Create NCOPIES permutation statements using the mask MASK_BYTES (by
   building a vector of type MASK_TYPE from it) and two input vectors placed in
   DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy and
   shifting by STRIDE elements of DR_CHAIN for every copy.
   (STRIDE is the number of vectorized stmts for NODE divided by the number of
   copies).
   VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE, where
   the created stmts must be inserted.  */

static inline void
vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
                           tree mask, int first_vec_indx, int second_vec_indx,
                           gimple_stmt_iterator *gsi, slp_tree node,
                           tree vectype, vec<tree> dr_chain,
                           int ncopies, int vect_stmts_counter)
{
  tree perm_dest;
  gimple perm_stmt = NULL;
  stmt_vec_info next_stmt_info;
  int i, stride;
  tree first_vec, second_vec, data_ref;

  /* Each copy's stmts occupy one "stride"-sized slice of the node's
     vector stmt array; the permute for copy I goes into slot
     STRIDE * I + VECT_STMTS_COUNTER.  */
  stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;

  /* Initialize the vect stmts of NODE to properly insert the generated
     stmts later.  */
  for (i = SLP_TREE_VEC_STMTS (node).length ();
       i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
    SLP_TREE_VEC_STMTS (node).quick_push (NULL);

  perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
  for (i = 0; i < ncopies; i++)
    {
      first_vec = dr_chain[first_vec_indx];
      second_vec = dr_chain[second_vec_indx];

      /* Generate the permute statement:
	 data_ref = VEC_PERM_EXPR <first_vec, second_vec, mask>.  */
      perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
                                       first_vec, second_vec, mask);
      data_ref = make_ssa_name (perm_dest, perm_stmt);
      gimple_set_lhs (perm_stmt, data_ref);
      vect_finish_stmt_generation (stmt, perm_stmt, gsi);

      /* Store the vector statement in NODE.  */
      SLP_TREE_VEC_STMTS (node)[stride * i + vect_stmts_counter] = perm_stmt;

      /* Advance to this copy's input vectors for the next iteration.  */
      first_vec_indx += stride;
      second_vec_indx += stride;
    }

  /* Mark the scalar stmt as vectorized (the last permute generated
     becomes its vectorized stmt).  */
  next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
  STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
}
3002
3003
/* Given FIRST_MASK_ELEMENT - the mask element in element representation,
   return in CURRENT_MASK_ELEMENT its equivalent in target specific
   representation.  Check that the mask is valid and return FALSE if not.
   Return TRUE in NEED_NEXT_VECTOR if the permutation requires to move to
   the next vector, i.e., the current first vector is not needed.

   NUMBER_OF_MASK_FIXES, MASK_FIXED and NEEDS_FIRST_VECTOR carry state
   between consecutive calls while one mask of MASK_NUNITS elements is
   being filled (INDEX is the position being filled); MASK is the
   partially built mask, whose earlier entries may be re-adjusted here.  */

static bool
vect_get_mask_element (gimple stmt, int first_mask_element, int m,
                       int mask_nunits, bool only_one_vec, int index,
                       unsigned char *mask, int *current_mask_element,
                       bool *need_next_vector, int *number_of_mask_fixes,
                       bool *mask_fixed, bool *needs_first_vector)
{
  int i;

  /* Convert to target specific representation.  */
  *current_mask_element = first_mask_element + m;
  /* Adjust the value in case it's a mask for second and third vectors.  */
  *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1);

  if (*current_mask_element < 0)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "permutation requires past vector ");
          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      return false;
    }

  if (*current_mask_element < mask_nunits)
    *needs_first_vector = true;

  /* We have only one input vector to permute but the mask accesses values in
     the next vector as well.  */
  if (only_one_vec && *current_mask_element >= mask_nunits)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "permutation requires at least two vectors ");
          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      return false;
    }

  /* The mask requires the next vector.  */
  while (*current_mask_element >= mask_nunits * 2)
    {
      if (*needs_first_vector || *mask_fixed)
        {
          /* We either need the first vector too or have already moved to the
             next vector. In both cases, this permutation needs three
             vectors.  */
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "permutation requires at "
                               "least three vectors ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return false;
        }

      /* We move to the next vector, dropping the first one and working with
         the second and the third - we need to adjust the values of the mask
         accordingly.  */
      *current_mask_element -= mask_nunits * *number_of_mask_fixes;

      /* Earlier entries of the current mask referred to the dropped
	 vector; shift them down by the same amount.  */
      for (i = 0; i < index; i++)
        mask[i] -= mask_nunits * *number_of_mask_fixes;

      (*number_of_mask_fixes)++;
      *mask_fixed = true;
    }

  *need_next_vector = *mask_fixed;

  /* This was the last element of this mask. Start a new one.  */
  if (index == mask_nunits - 1)
    {
      *number_of_mask_fixes = 1;
      *mask_fixed = false;
      *needs_first_vector = false;
    }

  return true;
}
3098
3099
/* Generate vector permute statements from a list of loads in DR_CHAIN.
   If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
   permute statements for the SLP node NODE of the SLP instance
   SLP_NODE_INSTANCE.  VF is the final vectorization factor; GSI is where
   generated stmts are inserted.  Returns false if the target cannot
   support the required permutations.  */

bool
vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
                              gimple_stmt_iterator *gsi, int vf,
                              slp_instance slp_node_instance, bool analyze_only)
{
  gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree mask_element_type = NULL_TREE, mask_type;
  int i, j, k, nunits, vec_index = 0, scalar_index;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  gimple next_scalar_stmt;
  int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
  int first_mask_element;
  int index, unroll_factor, current_mask_element, ncopies;
  unsigned char *mask;
  bool only_one_vec = false, need_next_vector = false;
  int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
  int number_of_mask_fixes = 1;
  bool mask_fixed = false;
  bool needs_first_vector = false;
  machine_mode mode;

  mode = TYPE_MODE (vectype);

  /* Bail out early if the target has no variable permute at all for
     this mode; individual masks are checked with can_vec_perm_p below.  */
  if (!can_vec_perm_p (mode, false, NULL))
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "no vect permute for ");
          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      return false;
    }

  /* The generic VEC_PERM_EXPR code always uses an integral type of the
     same size as the vector element being permuted.  */
  mask_element_type = lang_hooks.types.type_for_mode
                (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))), 1);
  mask_type = get_vectype_for_scalar_type (mask_element_type);
  nunits = TYPE_VECTOR_SUBPARTS (vectype);
  mask = XALLOCAVEC (unsigned char, nunits);
  unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);

  /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
     unrolling factor.  */
  orig_vec_stmts_num = group_size *
                SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
  if (orig_vec_stmts_num == 1)
    only_one_vec = true;

  /* Number of copies is determined by the final vectorization factor
     relatively to SLP_NODE_INSTANCE unrolling factor.  */
  ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);

  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
    return false;

  /* Generate permutation masks for every NODE. Number of masks for each NODE
     is equal to GROUP_SIZE.
     E.g., we have a group of three nodes with three loads from the same
     location in each node, and the vector size is 4. I.e., we have a
     a0b0c0a1b1c1... sequence and we need to create the following vectors:
     for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
     for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
     ...

     The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
     The last mask is illegal since we assume two operands for permute
     operation, and the mask element values can't be outside that range.
     Hence, the last mask must be converted into {2,5,5,5}.
     For the first two permutations we need the first and the second input
     vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
     we need the second and the third vectors: {b1,c1,a2,b2} and
     {c2,a3,b3,c3}.  */

    {
      scalar_index = 0;
      index = 0;
      vect_stmts_counter = 0;
      vec_index = 0;
      first_vec_index = vec_index++;
      if (only_one_vec)
        second_vec_index = first_vec_index;
      else
        second_vec_index =  vec_index++;

      for (j = 0; j < unroll_factor; j++)
        {
          for (k = 0; k < group_size; k++)
            {
	      /* Load permutation entry K gives the source lane; offset it
		 by the unroll iteration to get the element-space index.  */
              i = SLP_TREE_LOAD_PERMUTATION (node)[k];
              first_mask_element = i + j * group_size;
              if (!vect_get_mask_element (stmt, first_mask_element, 0,
                                          nunits, only_one_vec, index,
                                          mask, &current_mask_element,
                                          &need_next_vector,
                                          &number_of_mask_fixes, &mask_fixed,
                                          &needs_first_vector))
                return false;
              gcc_assert (current_mask_element >= 0
                          && current_mask_element < 2 * nunits);
              mask[index++] = current_mask_element;

	      /* A full mask has been accumulated - validate it and, unless
		 only analyzing, emit the permute stmts.  */
              if (index == nunits)
                {
                  index = 0;
                  if (!can_vec_perm_p (mode, false, mask))
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location, 
                                           "unsupported vect permute { ");
                          for (i = 0; i < nunits; ++i)
                            dump_printf (MSG_MISSED_OPTIMIZATION, "%d ",
                                         mask[i]);
                          dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
                        }
                      return false;
                    }

                  if (!analyze_only)
                    {
                      int l;
                      tree mask_vec, *mask_elts;
		      /* Materialize the byte mask as a VECTOR_CST of the
			 integral mask element type.  */
                      mask_elts = XALLOCAVEC (tree, nunits);
                      for (l = 0; l < nunits; ++l)
                        mask_elts[l] = build_int_cst (mask_element_type,
                                                      mask[l]);
                      mask_vec = build_vector (mask_type, mask_elts);

                      if (need_next_vector)
                        {
                          first_vec_index = second_vec_index;
                          second_vec_index = vec_index;
                        }

                      next_scalar_stmt
                          = SLP_TREE_SCALAR_STMTS (node)[scalar_index++];

                      vect_create_mask_and_perm (stmt, next_scalar_stmt,
                               mask_vec, first_vec_index, second_vec_index,
                               gsi, node, vectype, dr_chain,
                               ncopies, vect_stmts_counter++);
                    }
                }
            }
        }
    }

  return true;
}
3259
3260
3261
/* Vectorize SLP instance tree in postorder.  Returns true if the root
   stmt of NODE is a store (the caller uses this to remove the scalar
   stores afterwards).  */

static bool
vect_schedule_slp_instance (slp_tree node, slp_instance instance,
                            unsigned int vectorization_factor)
{
  gimple stmt;
  bool grouped_store, is_store;
  gimple_stmt_iterator si;
  stmt_vec_info stmt_info;
  unsigned int vec_stmts_size, nunits, group_size;
  tree vectype;
  int i;
  slp_tree child;

  if (!node)
    return false;

  /* Postorder: children first, so their vector defs exist when this
     node's stmts are transformed.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_schedule_slp_instance (child, instance, vectorization_factor);

  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);

  /* VECTYPE is the type of the destination.  */
  vectype = STMT_VINFO_VECTYPE (stmt_info);
  nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
  group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  /* For each SLP instance calculate number of vector stmts to be created
     for the scalar stmts in each node of the SLP tree.  Number of vector
     elements in one vector iteration is the number of scalar elements in
     one scalar iteration (GROUP_SIZE) multiplied by VF divided by vector
     size.  */
  vec_stmts_size = (vectorization_factor * group_size) / nunits;

  if (!SLP_TREE_VEC_STMTS (node).exists ())
    {
      SLP_TREE_VEC_STMTS (node).create (vec_stmts_size);
      SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE,vect_location,
                       "------>vectorizing SLP node starting from: ");
      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
      dump_printf (MSG_NOTE, "\n");
    }

  /* Vectorized stmts go before the last scalar stmt which is where
     all uses are ready.  */
  si = gsi_for_stmt (vect_find_last_scalar_stmt_in_slp (node));

  /* Mark the first element of the reduction chain as reduction to properly
     transform the node.  In the analysis phase only the last element of the
     chain is marked as reduction.  */
  if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
    {
      STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
      STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
    }

  is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
  return is_store;
}
3329
3330 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
3331    For loop vectorization this is done in vectorizable_call, but for SLP
3332    it needs to be deferred until end of vect_schedule_slp, because multiple
3333    SLP instances may refer to the same scalar stmt.  */
3334
3335 static void
3336 vect_remove_slp_scalar_calls (slp_tree node)
3337 {
3338   gimple stmt, new_stmt;
3339   gimple_stmt_iterator gsi;
3340   int i;
3341   slp_tree child;
3342   tree lhs;
3343   stmt_vec_info stmt_info;
3344
3345   if (!node)
3346     return;
3347
3348   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3349     vect_remove_slp_scalar_calls (child);
3350
3351   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
3352     {
3353       if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
3354         continue;
3355       stmt_info = vinfo_for_stmt (stmt);
3356       if (stmt_info == NULL
3357           || is_pattern_stmt_p (stmt_info)
3358           || !PURE_SLP_STMT (stmt_info))
3359         continue;
3360       lhs = gimple_call_lhs (stmt);
3361       new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
3362       set_vinfo_for_stmt (new_stmt, stmt_info);
3363       set_vinfo_for_stmt (stmt, NULL);
3364       STMT_VINFO_STMT (stmt_info) = new_stmt;
3365       gsi = gsi_for_stmt (stmt);
3366       gsi_replace (&gsi, new_stmt, false);
3367       SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
3368     }
3369 }
3370
3371 /* Generate vector code for all SLP instances in the loop/basic block.  */
3372
3373 bool
3374 vect_schedule_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
3375 {
3376   vec<slp_instance> slp_instances;
3377   slp_instance instance;
3378   unsigned int i, vf;
3379   bool is_store = false;
3380
3381   if (loop_vinfo)
3382     {
3383       slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
3384       vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3385     }
3386   else
3387     {
3388       slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
3389       vf = 1;
3390     }
3391
3392   FOR_EACH_VEC_ELT (slp_instances, i, instance)
3393     {
3394       /* Schedule the tree of INSTANCE.  */
3395       is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
3396                                              instance, vf);
3397       if (dump_enabled_p ())
3398         dump_printf_loc (MSG_NOTE, vect_location,
3399                          "vectorizing stmts using SLP.\n");
3400     }
3401
3402   FOR_EACH_VEC_ELT (slp_instances, i, instance)
3403     {
3404       slp_tree root = SLP_INSTANCE_TREE (instance);
3405       gimple store;
3406       unsigned int j;
3407       gimple_stmt_iterator gsi;
3408
3409       /* Remove scalar call stmts.  Do not do this for basic-block
3410          vectorization as not all uses may be vectorized.
3411          ???  Why should this be necessary?  DCE should be able to
3412          remove the stmts itself.
3413          ???  For BB vectorization we can as well remove scalar
3414          stmts starting from the SLP tree root if they have no
3415          uses.  */
3416       if (loop_vinfo)
3417         vect_remove_slp_scalar_calls (root);
3418
3419       for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
3420                   && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
3421         {
3422           if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
3423             break;
3424
3425          if (is_pattern_stmt_p (vinfo_for_stmt (store)))
3426            store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
3427           /* Free the attached stmt_vec_info and remove the stmt.  */
3428           gsi = gsi_for_stmt (store);
3429           unlink_stmt_vdef (store);
3430           gsi_remove (&gsi, true);
3431           release_defs (store);
3432           free_stmt_vec_info (store);
3433         }
3434     }
3435
3436   return is_store;
3437 }
3438
3439
3440 /* Vectorize the basic block.  */
3441
3442 void
3443 vect_slp_transform_bb (basic_block bb)
3444 {
3445   bb_vec_info bb_vinfo = vec_info_for_bb (bb);
3446   gimple_stmt_iterator si;
3447
3448   gcc_assert (bb_vinfo);
3449
3450   if (dump_enabled_p ())
3451     dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB\n");
3452
3453   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
3454     {
3455       gimple stmt = gsi_stmt (si);
3456       stmt_vec_info stmt_info;
3457
3458       if (dump_enabled_p ())
3459         {
3460           dump_printf_loc (MSG_NOTE, vect_location,
3461                            "------>SLPing statement: ");
3462           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3463           dump_printf (MSG_NOTE, "\n");
3464         }
3465
3466       stmt_info = vinfo_for_stmt (stmt);
3467       gcc_assert (stmt_info);
3468
3469       /* Schedule all the SLP instances when the first SLP stmt is reached.  */
3470       if (STMT_SLP_TYPE (stmt_info))
3471         {
3472           vect_schedule_slp (NULL, bb_vinfo);
3473           break;
3474         }
3475     }
3476
3477   if (dump_enabled_p ())
3478     dump_printf_loc (MSG_NOTE, vect_location,
3479                      "BASIC BLOCK VECTORIZED\n");
3480
3481   destroy_bb_vec_info (bb_vinfo);
3482 }