/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2015 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "dumpfile.h"
#include "tm.h"
#include "hash-set.h"
#include "machmode.h"
#include "vec.h"
#include "double-int.h"
#include "input.h"
#include "alias.h"
#include "symtab.h"
#include "wide-int.h"
#include "inchash.h"
#include "tree.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "target.h"
#include "predict.h"
#include "hard-reg-set.h"
#include "function.h"
#include "basic-block.h"
#include "gimple-pretty-print.h"
#include "tree-ssa-alias.h"
#include "internal-fn.h"
#include "gimple-expr.h"
#include "is-a.h"
#include "gimple.h"
#include "gimple-iterator.h"
#include "gimple-ssa.h"
#include "tree-phinodes.h"
#include "ssa-iterators.h"
#include "stringpool.h"
#include "tree-ssanames.h"
#include "tree-pass.h"
#include "cfgloop.h"
#include "hashtab.h"
#include "rtl.h"
#include "flags.h"
#include "statistics.h"
#include "real.h"
#include "fixed-value.h"
#include "insn-config.h"
#include "expmed.h"
#include "dojump.h"
#include "explow.h"
#include "calls.h"
#include "emit-rtl.h"
#include "varasm.h"
#include "stmt.h"
#include "expr.h"
#include "recog.h"              /* FIXME: for insn_data */
#include "insn-codes.h"
#include "optabs.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"

/* Extract the location of the basic block in the source code.
   Return the basic block location if one is found and UNKNOWN_LOCATION
   otherwise.  */

source_location
find_bb_location (basic_block bb)
{
  gimple stmt = NULL;
  gimple_stmt_iterator si;

  if (!bb)
    return UNKNOWN_LOCATION;

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      stmt = gsi_stmt (si);
      if (gimple_location (stmt) != UNKNOWN_LOCATION)
        return gimple_location (stmt);
    }

  return UNKNOWN_LOCATION;
}


/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

static void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child);

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}


/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  SLP_INSTANCE_BODY_COST_VEC (instance).release ();
  free (instance);
}


/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<gimple> scalar_stmts)
{
  slp_tree node;
  gimple stmt = scalar_stmts[0];
  unsigned int nops;

  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;

  return node;
}


/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->first_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}


/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}


/* Find the place of the data-ref in STMT in the interleaving chain that starts
   from FIRST_STMT.  Return -1 if the data-ref is not a part of the chain.  */
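/* For illustration (a hypothetical chain): if FIRST_STMT heads the
   interleaving chain built for the grouped accesses a[0], a[1], a[2], a[3],
   the places returned for the corresponding loads are 0, 1, 2 and 3.  */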

static int
vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
{
  gimple next_stmt = first_stmt;
  int result = 0;

  if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    return -1;

  do
    {
      if (next_stmt == stmt)
        return result;
      result++;
      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
    }
  while (next_stmt);

  return -1;
}


/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  Return -1 on a fatal error, 1 if
   the error could be corrected by swapping operands of the operation, and 0
   if everything is OK.  */
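/* For illustration (hypothetical SSA names): for the group
   { x0 = a0 + b0; x1 = b1 + a1; } where the a* are internal defs and the
   b* are constants, operand 0 of the second stmt mismatches the first stmt;
   as PLUS_EXPR is commutative the operands are re-checked in swapped order,
   match, and are physically swapped in the stmt before returning 0.  */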

static int
vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                             gimple stmt, bool first,
                             vec<slp_oprnd_info> *oprnds_info)
{
  tree oprnd;
  unsigned int i, number_of_oprnds;
  tree def;
  gimple def_stmt;
  enum vect_def_type dt = vect_uninitialized_def;
  struct loop *loop = NULL;
  bool pattern = false;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  bool commutative = false;
  bool first_op_cond = false;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  if (is_gimple_call (stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
    }
  else if (is_gimple_assign (stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        {
          first_op_cond = true;
          commutative = true;
          number_of_oprnds++;
        }
      else
        commutative = commutative_tree_code (code);
    }
  else
    return -1;

  bool swapped = false;
  for (i = 0; i < number_of_oprnds; i++)
    {
again:
      if (first_op_cond)
        {
          if (i == 0 || i == 1)
            oprnd = TREE_OPERAND (gimple_op (stmt, first_op_idx),
                                  swapped ? !i : i);
          else
            oprnd = gimple_op (stmt, first_op_idx + i - 1);
        }
      else
        oprnd = gimple_op (stmt, first_op_idx + (swapped ? !i : i));

      oprnd_info = (*oprnds_info)[i];

      if (!vect_is_simple_use (oprnd, NULL, loop_vinfo, bb_vinfo, &def_stmt,
                               &def, &dt)
          || (!def_stmt && dt != vect_constant_def))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: can't find def for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }

      /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
         from the pattern.  Check that all the stmts of the node are in the
         pattern.  */
      if (def_stmt && gimple_bb (def_stmt)
          && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
              || (!loop && gimple_bb (def_stmt) == BB_VINFO_BB (bb_vinfo)
                  && gimple_code (def_stmt) != GIMPLE_PHI))
          && vinfo_for_stmt (def_stmt)
          && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
        {
          pattern = true;
          if (!first && !oprnd_info->first_pattern)
            {
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: some of the stmts"
                                   " are in a pattern, and others are not ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }

              return 1;
            }

          def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
          dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));

          if (dt == vect_unknown_def_type)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Unsupported pattern.\n");
              return -1;
            }

          switch (gimple_code (def_stmt))
            {
              case GIMPLE_PHI:
                def = gimple_phi_result (def_stmt);
                break;

              case GIMPLE_ASSIGN:
                def = gimple_assign_lhs (def_stmt);
                break;

              default:
                if (dump_enabled_p ())
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "unsupported defining stmt:\n");
                return -1;
            }
        }

      if (first)
        {
          oprnd_info->first_dt = dt;
          oprnd_info->first_pattern = pattern;
          oprnd_info->first_op_type = TREE_TYPE (oprnd);
        }
      else
        {
          /* Not first stmt of the group, check that the def-stmt/s match
             the def-stmt/s of the first stmt.  Allow different definition
             types for reduction chains: the first stmt must be a
             vect_reduction_def (a phi node), and the rest
             vect_internal_def.  */
          if (((oprnd_info->first_dt != dt
                && !(oprnd_info->first_dt == vect_reduction_def
                     && dt == vect_internal_def)
                && !((oprnd_info->first_dt == vect_external_def
                      || oprnd_info->first_dt == vect_constant_def)
                     && (dt == vect_external_def
                         || dt == vect_constant_def)))
               || !types_compatible_p (oprnd_info->first_op_type,
                                       TREE_TYPE (oprnd))))
            {
              /* Try swapping operands if we got a mismatch.  */
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types\n");

              return 1;
            }
        }

      /* Check the types of the definitions.  */
      switch (dt)
        {
        case vect_constant_def:
        case vect_external_def:
        case vect_reduction_def:
          break;

        case vect_internal_def:
          oprnd_info->def_stmts.quick_push (def_stmt);
          break;

        default:
          /* FORNOW: Not supported.  */
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: illegal type of def ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (first_op_cond)
        {
          tree cond = gimple_assign_rhs1 (stmt);
          swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
                             &TREE_OPERAND (cond, 1));
          TREE_SET_CODE (cond, swap_tree_comparison (TREE_CODE (cond)));
        }
      else
        swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
                           gimple_assign_rhs2_ptr (stmt));
    }

  return 0;
}


/* Verify that the scalar stmts STMTS are isomorphic, do not require data
   permutation, and are of supported types of operation.  Return true if
   they are, otherwise return false and indicate in *MATCHES which stmts
   are not isomorphic to the first one.  If MATCHES[0] is false then this
   indicates that the comparison could not be carried out or the stmts
   will never be vectorized by SLP.  */

static bool
vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                       vec<gimple> stmts, unsigned int group_size,
                       unsigned nops, unsigned int *max_nunits,
                       unsigned int vectorization_factor, bool *matches)
{
  unsigned int i;
  gimple stmt = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK, rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype, scalar_type, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  struct data_reference *first_dr;
  HOST_WIDE_INT dummy;
  gimple first_load = NULL, prev_first_load = NULL, old_first_load = NULL;
  tree cond;

  /* For every stmt in NODE find its def stmt/s.  */
  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      matches[i] = false;

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
          dump_printf (MSG_NOTE, "\n");
        }

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unvectorizable statement ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: not GIMPLE_ASSIGN nor "
                               "GIMPLE_CALL ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      if (is_gimple_assign (stmt)
          && gimple_assign_rhs_code (stmt) == COND_EXPR
          && (cond = gimple_assign_rhs1 (stmt))
          && !COMPARISON_CLASS_P (cond))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: condition is not "
                               "comparison ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
      vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vectype)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unsupported data-type ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 scalar_type);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      /* In case of multiple types we need to detect the smallest type.  */
      if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
        {
          *max_nunits = TYPE_VECTOR_SUBPARTS (vectype);
          if (bb_vinfo)
            vectorization_factor = *max_nunits;
        }

      if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
        {
          rhs_code = CALL_EXPR;
          if (gimple_call_internal_p (call_stmt)
              || gimple_call_tail_p (call_stmt)
              || gimple_call_noreturn_p (call_stmt)
              || !gimple_call_nothrow_p (call_stmt)
              || gimple_call_chain (call_stmt))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: unsupported call type ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                    call_stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }
        }
      else
        rhs_code = gimple_assign_rhs_code (stmt);

      /* Check the operation.  */
      if (i == 0)
        {
          first_stmt_code = rhs_code;

          /* Shift arguments should be equal in all the packed stmts for a
             vector shift with scalar shift operand.  */
          if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
              || rhs_code == LROTATE_EXPR
              || rhs_code == RROTATE_EXPR)
            {
              vec_mode = TYPE_MODE (vectype);

              /* First see if we have a vector/vector shift.  */
              optab = optab_for_tree_code (rhs_code, vectype,
                                           optab_vector);

              if (!optab
                  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
                {
                  /* No vector/vector shift, try for a vector/scalar shift.  */
                  optab = optab_for_tree_code (rhs_code, vectype,
                                               optab_scalar);

                  if (!optab)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: no optab.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  icode = (int) optab_handler (optab, vec_mode);
                  if (icode == CODE_FOR_nothing)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: "
                                         "op not supported by target.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  optab_op2_mode = insn_data[icode].operand[2].mode;
                  if (!VECTOR_MODE_P (optab_op2_mode))
                    {
                      need_same_oprnds = true;
                      first_op1 = gimple_assign_rhs2 (stmt);
                    }
                }
            }
          else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
        }
      else
        {
          if (first_stmt_code != rhs_code
              && (first_stmt_code != IMAGPART_EXPR
                  || rhs_code != REALPART_EXPR)
              && (first_stmt_code != REALPART_EXPR
                  || rhs_code != IMAGPART_EXPR)
              && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
                   && (first_stmt_code == ARRAY_REF
                       || first_stmt_code == BIT_FIELD_REF
                       || first_stmt_code == INDIRECT_REF
                       || first_stmt_code == COMPONENT_REF
                       || first_stmt_code == MEM_REF)))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different operation "
                                   "in stmt ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Mismatch.  */
              continue;
            }

          if (need_same_oprnds
              && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different shift "
                                   "arguments in ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Mismatch.  */
              continue;
            }

          if (rhs_code == CALL_EXPR)
            {
              gimple first_stmt = stmts[0];
              if (gimple_call_num_args (stmt) != nops
                  || !operand_equal_p (gimple_call_fn (first_stmt),
                                       gimple_call_fn (stmt), 0)
                  || gimple_call_fntype (first_stmt)
                     != gimple_call_fntype (stmt))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different calls in ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
        {
          if (REFERENCE_CLASS_P (lhs))
            {
              /* Store.  */
              ;
            }
          else
            {
              /* Load.  */
              unsigned unrolling_factor
                = least_common_multiple
                    (*max_nunits, group_size) / group_size;
              /* FORNOW: Check that there is no gap between the loads
                 and no gap between the groups when we need to load
                 multiple groups at once.
                 ???  We should enhance this to only disallow gaps
                 inside vectors.  */
              if ((unrolling_factor > 1
                   && ((GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
                        && GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
                       /* If the group is split up then GROUP_GAP
                          isn't correct here, nor is GROUP_FIRST_ELEMENT.  */
                       || GROUP_SIZE (vinfo_for_stmt (stmt)) > group_size))
                  || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt
                      && GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: grouped "
                                       "loads have gaps ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Fatal mismatch.  */
                  matches[0] = false;
                  return false;
                }

              /* Check that the size of interleaved loads group is not
                 greater than the SLP group size.  */
              unsigned ncopies
                = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
              if (loop_vinfo
                  && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
                  && ((GROUP_SIZE (vinfo_for_stmt (stmt))
                       - GROUP_GAP (vinfo_for_stmt (stmt)))
                      > ncopies * group_size))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: the number "
                                       "of interleaved loads is greater than "
                                       "the SLP group size ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Fatal mismatch.  */
                  matches[0] = false;
                  return false;
                }

              old_first_load = first_load;
              first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  */
                  if (prev_first_load != first_load)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "Build SLP failed: different "
                                           "interleaving chains in one node ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            stmt, 0);
                          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                        }
                      /* Mismatch.  */
                      continue;
                    }
                }
              else
                prev_first_load = first_load;

              /* In some cases a group of loads is just the same load
                 repeated N times.  Only analyze its cost once.  */
              if (first_load == stmt && old_first_load != first_load)
                {
                  first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
                  if (vect_supportable_dr_alignment (first_dr, false)
                      == dr_unaligned_unsupported)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "Build SLP failed: unsupported "
                                           "unaligned load ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            stmt, 0);
                          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                        }
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                }
            }
        } /* Grouped access.  */
      else
        {
          if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
            {
              /* Not grouped load.  */
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: not grouped load ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }

              /* FORNOW: Not grouped loads are not supported.  */
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          /* Not memory operation.  */
          if (TREE_CODE_CLASS (rhs_code) != tcc_binary
              && TREE_CODE_CLASS (rhs_code) != tcc_unary
              && rhs_code != COND_EXPR
              && rhs_code != CALL_EXPR)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: operation");
                  dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);

              if (i == 0)
                first_cond_code = TREE_CODE (cond_expr);
              else if (first_cond_code != TREE_CODE (cond_expr))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different"
                                       " operation");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  return true;
}

/* Recursively build an SLP tree starting from NODE.
   Fail (and return FALSE) if def-stmts are not isomorphic, require data
   permutation or are of unsupported types of operation.  Otherwise,
   return TRUE.  */

static bool
vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                     slp_tree *node, unsigned int group_size,
                     unsigned int *max_nunits,
                     vec<slp_tree> *loads,
                     unsigned int vectorization_factor,
                     bool *matches, unsigned *npermutes, unsigned *tree_size,
                     unsigned max_tree_size)
{
  unsigned nops, i, this_npermutes = 0, this_tree_size = 0;
  gimple stmt;

  if (!matches)
    matches = XALLOCAVEC (bool, group_size);
  if (!npermutes)
    npermutes = &this_npermutes;

  matches[0] = false;

  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];
  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else
    return false;

  if (!vect_build_slp_tree_1 (loop_vinfo, bb_vinfo,
                              SLP_TREE_SCALAR_STMTS (*node), group_size, nops,
                              max_nunits, vectorization_factor, matches))
    return false;

  /* If the SLP node is a load, terminate the recursion.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
      && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
    {
      loads->safe_push (*node);
      return true;
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (*node), i, stmt)
    {
      switch (vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo,
                                           stmt, (i == 0), &oprnds_info))
        {
        case 0:
          break;
        case -1:
          matches[0] = false;
          vect_free_oprnd_info (oprnds_info);
          return false;
        case 1:
          matches[i] = false;
          break;
        }
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
        vect_free_oprnd_info (oprnds_info);
        return false;
      }

  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_nloads = loads->length ();
      unsigned old_max_nunits = *max_nunits;

      if (oprnd_info->first_dt != vect_internal_def)
        continue;

      if (++this_tree_size > max_tree_size)
        {
          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      child = vect_create_new_slp_node (oprnd_info->def_stmts);
      if (!child)
        {
          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      bool *matches = XALLOCAVEC (bool, group_size);
      if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
                               group_size, max_nunits, loads,
                               vectorization_factor, matches,
                               npermutes, &this_tree_size, max_tree_size))
        {
          oprnd_info->def_stmts = vNULL;
          SLP_TREE_CHILDREN (*node).quick_push (child);
          continue;
        }

      /* If the SLP build for operand zero failed and operand zero
         and one can be commuted, try that for the scalar stmts
         that failed the match.  */
      if (i == 0
          /* A first scalar stmt mismatch signals a fatal mismatch.  */
          && matches[0]
          /* ???  For COND_EXPRs we can swap the comparison operands
             as well as the arms under some constraints.  */
          && nops == 2
          && oprnds_info[1]->first_dt == vect_internal_def
          && is_gimple_assign (stmt)
          && commutative_tree_code (gimple_assign_rhs_code (stmt))
          /* Do so only if the number of unsuccessful permutes was no more
             than a cut-off, as re-trying the recursive match on
             possibly each level of the tree would expose exponential
             behavior.  */
          && *npermutes < 4)
        {
          /* Roll back.  */
          *max_nunits = old_max_nunits;
          loads->truncate (old_nloads);
          /* Swap mismatched definition stmts.  */
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Re-trying with swapped operands of stmts ");
          for (unsigned j = 0; j < group_size; ++j)
            if (!matches[j])
              {
                gimple tem = oprnds_info[0]->def_stmts[j];
                oprnds_info[0]->def_stmts[j] = oprnds_info[1]->def_stmts[j];
                oprnds_info[1]->def_stmts[j] = tem;
                dump_printf (MSG_NOTE, "%d ", j);
              }
          dump_printf (MSG_NOTE, "\n");
          /* And try again ... */
          if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
                                   group_size, max_nunits, loads,
                                   vectorization_factor,
                                   matches, npermutes, &this_tree_size,
                                   max_tree_size))
            {
              oprnd_info->def_stmts = vNULL;
              SLP_TREE_CHILDREN (*node).quick_push (child);
              continue;
            }

          ++*npermutes;
        }

      oprnd_info->def_stmts = vNULL;
      vect_free_slp_tree (child);
      vect_free_oprnd_info (oprnds_info);
      return false;
    }

  if (tree_size)
    *tree_size += this_tree_size;

  vect_free_oprnd_info (oprnds_info);
  return true;
}

/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (int dump_kind, slp_tree node)
{
  int i;
  gimple stmt;
  slp_tree child;

  if (!node)
    return;

  dump_printf (dump_kind, "node ");
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      dump_printf (dump_kind, "\n\tstmt %d ", i);
      dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
    }
  dump_printf (dump_kind, "\n");

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, child);
}


/* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
   If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
   J).  Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
   stmts in NODE are to be marked.  */

static void
vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
{
  int i;
  gimple stmt;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    if (j < 0 || i == j)
      STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, mark, j);
}


/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  int i;
  gimple stmt;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
                  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child);
}


/* Rearrange the statements of NODE according to PERMUTATION.  */
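/* For illustration: with PERMUTATION == { 1, 0 } the statement at index 0
   is moved to index 1 and vice versa, in NODE and recursively in all of
   its children.  */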

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
                          vec<unsigned> permutation)
{
  gimple stmt;
  vec<gimple> tmp_stmts;
  unsigned int i;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation);

  gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
  tmp_stmts.create (group_size);
  tmp_stmts.quick_grow_cleared (group_size);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    tmp_stmts[permutation[i]] = stmt;

  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
}


/* Check if the required load permutations in the SLP instance
   SLP_INSTN are supported.  */

static bool
vect_supported_load_permutation_p (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j, k, next;
  sbitmap load_index;
  slp_tree node;
  gimple stmt, load, next_load, first_load;
  struct data_reference *dr;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        if (node->load_permutation.exists ())
          FOR_EACH_VEC_ELT (node->load_permutation, j, next)
            dump_printf (MSG_NOTE, "%d ", next);
        else
          for (k = 0; k < group_size; ++k)
            dump_printf (MSG_NOTE, "%d ", k);
      dump_printf (MSG_NOTE, "\n");
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this
     permutation).  */

  /* Check that all the load nodes are of the same size.  */
  /* ???  Can't we assert this? */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
      return false;

  node = SLP_INSTANCE_TREE (slp_instn);
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];

  /* Reduction (there are no data-refs in the root).
     In a reduction chain the order of the loads is important.  */
  if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
      && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      slp_tree load;
      unsigned int lidx;

      /* Compare all the permutation sequences to the first one.  We know
         that at least one load is permuted.  */
      node = SLP_INSTANCE_LOADS (slp_instn)[0];
      if (!node->load_permutation.exists ())
        return false;
      for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
        {
          if (!load->load_permutation.exists ())
            return false;
          FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
            if (lidx != node->load_permutation[j])
              return false;
        }

      /* Check that the loads in the first sequence are different and there
         are no gaps between them.  */
      load_index = sbitmap_alloc (group_size);
      bitmap_clear (load_index);
      FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
        {
          if (bitmap_bit_p (load_index, lidx))
            {
              sbitmap_free (load_index);
              return false;
            }
          bitmap_set_bit (load_index, lidx);
        }
      for (i = 0; i < group_size; i++)
        if (!bitmap_bit_p (load_index, i))
          {
            sbitmap_free (load_index);
            return false;
          }
      sbitmap_free (load_index);

      /* This permutation is valid for reduction.  Since the order of the
         statements in the nodes is not important unless they are memory
         accesses, we can rearrange the statements in all the nodes
         according to the order of the loads.  */
      vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
                                node->load_permutation);

      /* We are done, no actual permutations need to be generated.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        SLP_TREE_LOAD_PERMUTATION (node).release ();
      return true;
    }

  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment complications.  */
  if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
    {
      /* Check that for every node in the instance the loads
         form a subchain.  */
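      /* For illustration: for the interleaving chain a[0], a[1], a[2], a[3]
         the node { a[1], a[2] } forms a valid subchain, whereas
         { a[0], a[2] } does not, since the chain element that follows a[0]
         is a[1].  */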
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          next_load = NULL;
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
            {
              if (j != 0 && next_load != load)
                return false;
              next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
            }
        }

      /* Check that the alignment of the first load in every subchain, i.e.,
         the first statement in every load node, is supported.
         ???  This belongs in alignment checking.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          first_load = SLP_TREE_SCALAR_STMTS (node)[0];
          if (first_load != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load)))
            {
              dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load));
              if (vect_supportable_dr_alignment (dr, false)
                  == dr_unaligned_unsupported)
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                       vect_location,
                                       "unsupported unaligned load ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        first_load, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  return false;
                }
            }
        }

      /* We are done, no actual permutations need to be generated.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        SLP_TREE_LOAD_PERMUTATION (node).release ();
      return true;
    }

  /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
     GROUP_SIZE, where each sequence of the same dr is also of GROUP_SIZE
     length (unless this is a reduction).  */
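  /* For illustration: with GROUP_SIZE == 2 the supported case is two load
     nodes whose permutations are { 0, 0 } and { 1, 1 }, i.e. each node
     repeats a single element of the interleaving chain.  */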
  if (SLP_INSTANCE_LOADS (slp_instn).length () != group_size)
    return false;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (!node->load_permutation.exists ())
      return false;

  load_index = sbitmap_alloc (group_size);
  bitmap_clear (load_index);
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      unsigned int lidx = node->load_permutation[0];
      if (bitmap_bit_p (load_index, lidx))
        {
          sbitmap_free (load_index);
          return false;
        }
      bitmap_set_bit (load_index, lidx);
      FOR_EACH_VEC_ELT (node->load_permutation, j, k)
        if (k != lidx)
          {
            sbitmap_free (load_index);
            return false;
          }
    }
  for (i = 0; i < group_size; i++)
    if (!bitmap_bit_p (load_index, i))
      {
        sbitmap_free (load_index);
        return false;
      }
  sbitmap_free (load_index);

  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (node->load_permutation.exists ()
        && !vect_transform_slp_perm_load
              (node, vNULL, NULL,
               SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true))
      return false;
  return true;
}


/* Find the first load in the loop that belongs to INSTANCE.
   When loads are in several SLP nodes, there can be a case in which the first
   load does not appear in the first SLP node to be transformed, causing
   incorrect order of statements.  Since we generate all the loads together,
   they must be inserted before the first load of the SLP instance and not
   before the first load of the first node of the instance.  */

static gimple
vect_find_first_load_in_slp_instance (slp_instance instance)
{
  int i, j;
  slp_tree load_node;
  gimple first_load = NULL, load;

  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
      first_load = get_earlier_stmt (load, first_load);

  return first_load;
}


/* Find the last store in SLP INSTANCE.  */

static gimple
vect_find_last_store_in_slp_instance (slp_instance instance)
{
  int i;
  slp_tree node;
  gimple last_store = NULL, store;

  node = SLP_INSTANCE_TREE (instance);
  for (i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &store); i++)
    last_store = get_later_stmt (store, last_store);

  return last_store;
}

/* Compute the cost for the SLP node NODE in the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                         slp_instance instance, slp_tree node,
                         stmt_vector_for_cost *prologue_cost_vec,
                         unsigned ncopies_for_cost)
{
  stmt_vector_for_cost *body_cost_vec = &SLP_INSTANCE_BODY_COST_VEC (instance);

  unsigned i;
  slp_tree child;
  gimple stmt, s;
  stmt_vec_info stmt_info;
  tree lhs;
  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  /* Recurse down the SLP tree.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_analyze_slp_cost_1 (loop_vinfo, bb_vinfo,
                             instance, child, prologue_cost_vec,
                             ncopies_for_cost);

  /* Look at the first scalar stmt to determine the cost.  */
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
        vect_model_store_cost (stmt_info, ncopies_for_cost, false,
                               vect_uninitialized_def,
                               node, prologue_cost_vec, body_cost_vec);
      else
        {
          int i;
          gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
          vect_model_load_cost (stmt_info, ncopies_for_cost, false,
                                node, prologue_cost_vec, body_cost_vec);
          /* If the load is permuted record the cost for the permutation.
             ???  Loads from multiple chains are let through here only
             for a single special case involving complex numbers where
             in the end no permutation is necessary.  */
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, s)
            if ((STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo_for_stmt (s))
                 == STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info))
                && vect_get_place_in_interleaving_chain
                     (s, STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info)) != i)
              {
                record_stmt_cost (body_cost_vec, group_size, vec_perm,
                                  stmt_info, 0, vect_body);
                break;
              }
        }
    }
  else
    record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
                      stmt_info, 0, vect_body);

  /* Scan operands and account for prologue cost of constants/externals.
     ???  This over-estimates cost for multiple uses and should be
     re-engineered.  */
  lhs = gimple_get_lhs (stmt);
  for (i = 0; i < gimple_num_ops (stmt); ++i)
    {
      tree def, op = gimple_op (stmt, i);
      gimple def_stmt;
      enum vect_def_type dt;
      if (!op || op == lhs)
        continue;
      if (vect_is_simple_use (op, NULL, loop_vinfo, bb_vinfo,
                              &def_stmt, &def, &dt)
          && (dt == vect_constant_def || dt == vect_external_def))
        record_stmt_cost (prologue_cost_vec, 1, vector_stmt,
                          stmt_info, 0, vect_prologue);
    }
}

/* Compute the cost for the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                       slp_instance instance, unsigned nunits)
{
  stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
  unsigned ncopies_for_cost;
  stmt_info_for_cost *si;
  unsigned i;

1507   /* Calculate the number of vector stmts to create based on the unrolling
1508      factor (the number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
1509      least_common_multiple (GROUP_SIZE, NUNITS) / NUNITS otherwise).  */
1510   unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
1511   ncopies_for_cost = least_common_multiple (nunits, group_size) / nunits;
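
  /* A worked example with assumed values, for illustration only: with
     NUNITS == 4 and GROUP_SIZE == 6, least_common_multiple (4, 6) == 12,
     so NCOPIES_FOR_COST == 12 / 4 == 3; three vector stmts cover the
     two unrolled copies of the group.  */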
1512
1513   prologue_cost_vec.create (10);
1514   body_cost_vec.create (10);
1515   SLP_INSTANCE_BODY_COST_VEC (instance) = body_cost_vec;
1516   vect_analyze_slp_cost_1 (loop_vinfo, bb_vinfo,
1517                            instance, SLP_INSTANCE_TREE (instance),
1518                            &prologue_cost_vec, ncopies_for_cost);
1519
1520   /* Record the prologue costs, which were delayed until we were
1521      sure that SLP was successful.  Unlike the body costs, we know
1522      the final values now regardless of the loop vectorization factor.  */
1523   void *data = (loop_vinfo ? LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
1524                 : BB_VINFO_TARGET_COST_DATA (bb_vinfo));
1525   FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
1526     {
1527       struct _stmt_vec_info *stmt_info
1528         = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1529       (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1530                             si->misalign, vect_prologue);
1531     }
1532
1533   prologue_cost_vec.release ();
1534 }
1535
1536 /* Analyze an SLP instance starting from a group of grouped stores.  Call
1537    vect_build_slp_tree to build a tree of packed stmts if possible.
1538    Return FALSE if it's impossible to SLP any stmt in the loop.  */
1539
1540 static bool
1541 vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
1542                            gimple stmt, unsigned max_tree_size)
1543 {
1544   slp_instance new_instance;
1545   slp_tree node;
1546   unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
1547   unsigned int unrolling_factor = 1, nunits;
1548   tree vectype, scalar_type = NULL_TREE;
1549   gimple next;
1550   unsigned int vectorization_factor = 0;
1551   int i;
1552   unsigned int max_nunits = 0;
1553   vec<slp_tree> loads;
1554   struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
1555   vec<gimple> scalar_stmts;
1556
1557   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1558     {
1559       if (dr)
1560         {
1561           scalar_type = TREE_TYPE (DR_REF (dr));
1562           vectype = get_vectype_for_scalar_type (scalar_type);
1563         }
1564       else
1565         {
1566           gcc_assert (loop_vinfo);
1567           vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1568         }
1569
1570       group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
1571     }
1572   else
1573     {
1574       gcc_assert (loop_vinfo);
1575       vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1576       group_size = LOOP_VINFO_REDUCTIONS (loop_vinfo).length ();
1577     }
1578
1579   if (!vectype)
1580     {
1581       if (dump_enabled_p ())
1582         {
1583           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1584                            "Build SLP failed: unsupported data-type ");
1585           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
1586           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1587         }
1588
1589       return false;
1590     }
1591
1592   nunits = TYPE_VECTOR_SUBPARTS (vectype);
1593   if (loop_vinfo)
1594     vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1595   else
1596     vectorization_factor = nunits;
1597
1598   /* Calculate the unrolling factor.  */
1599   unrolling_factor = least_common_multiple (nunits, group_size) / group_size;
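
  /* For illustration (values assumed): with NUNITS == 4 and GROUP_SIZE == 3,
     least_common_multiple (4, 3) == 12 and UNROLLING_FACTOR == 12 / 3 == 4,
     so four copies of the group are needed to fill whole vectors.  With
     GROUP_SIZE == 8 the factor is 8 / 8 == 1 and no unrolling is needed.  */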
1600   if (unrolling_factor != 1 && !loop_vinfo)
1601     {
1602       if (dump_enabled_p ())
1603         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1604                          "Build SLP failed: unrolling required in basic"
1605                          " block SLP\n");
1606
1607       return false;
1608     }
1609
1610   /* Create a node (a root of the SLP tree) for the packed grouped stores.  */
1611   scalar_stmts.create (group_size);
1612   next = stmt;
1613   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1614     {
1615       /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
1616       while (next)
1617         {
1618           if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
1619               && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
1620             scalar_stmts.safe_push (
1621                   STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
1622           else
1623             scalar_stmts.safe_push (next);
1624           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
1625         }
1626     }
1627   else
1628     {
1629       /* Collect reduction statements.  */
1630       vec<gimple> reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
1631       for (i = 0; reductions.iterate (i, &next); i++)
1632         scalar_stmts.safe_push (next);
1633     }
1634
1635   node = vect_create_new_slp_node (scalar_stmts);
1636
1637   loads.create (group_size);
1638
1639   /* Build the tree for the SLP instance.  */
1640   if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &node, group_size,
1641                            &max_nunits, &loads,
1642                            vectorization_factor, NULL, NULL, NULL,
1643                            max_tree_size))
1644     {
1645       /* Calculate the unrolling factor based on the smallest type.  */
1646       if (max_nunits > nunits)
1647         unrolling_factor = least_common_multiple (max_nunits, group_size)
1648                            / group_size;
1649
1650       if (unrolling_factor != 1 && !loop_vinfo)
1651         {
1652           if (dump_enabled_p ())
1653             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1654                              "Build SLP failed: unrolling required in basic"
1655                              " block SLP\n");
1656           vect_free_slp_tree (node);
1657           loads.release ();
1658           return false;
1659         }
1660
1661       /* Create a new SLP instance.  */
1662       new_instance = XNEW (struct _slp_instance);
1663       SLP_INSTANCE_TREE (new_instance) = node;
1664       SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
1665       SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
1666       SLP_INSTANCE_BODY_COST_VEC (new_instance) = vNULL;
1667       SLP_INSTANCE_LOADS (new_instance) = loads;
1668       SLP_INSTANCE_FIRST_LOAD_STMT (new_instance) = NULL;
1669
1670       /* Compute the load permutation.  */
1671       slp_tree load_node;
1672       bool loads_permuted = false;
1673       FOR_EACH_VEC_ELT (loads, i, load_node)
1674         {
1675           vec<unsigned> load_permutation;
1676           int j;
1677           gimple load, first_stmt;
1678           bool this_load_permuted = false;
1679           load_permutation.create (group_size);
1680           first_stmt = GROUP_FIRST_ELEMENT
1681               (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
1682           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
1683             {
1684               int load_place
1685                 = vect_get_place_in_interleaving_chain (load, first_stmt);
1686               gcc_assert (load_place != -1);
1687               if (load_place != j)
1688                 this_load_permuted = true;
1689               load_permutation.safe_push (load_place);
1690             }
1691           if (!this_load_permuted)
1692             {
1693               load_permutation.release ();
1694               continue;
1695             }
1696           SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
1697           loads_permuted = true;
1698         }
1699
1700       if (loads_permuted)
1701         {
1702           if (!vect_supported_load_permutation_p (new_instance))
1703             {
1704               if (dump_enabled_p ())
1705                 {
1706                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1707                                    "Build SLP failed: unsupported load "
1708                                    "permutation ");
1709                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
1710                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1711                 }
1712               vect_free_slp_instance (new_instance);
1713               return false;
1714             }
1715
1716           SLP_INSTANCE_FIRST_LOAD_STMT (new_instance)
1717             = vect_find_first_load_in_slp_instance (new_instance);
1718         }
1719
1720       /* Compute the costs of this SLP instance.  */
1721       vect_analyze_slp_cost (loop_vinfo, bb_vinfo,
1722                              new_instance, TYPE_VECTOR_SUBPARTS (vectype));
1723
1724       if (loop_vinfo)
1725         LOOP_VINFO_SLP_INSTANCES (loop_vinfo).safe_push (new_instance);
1726       else
1727         BB_VINFO_SLP_INSTANCES (bb_vinfo).safe_push (new_instance);
1728
1729       if (dump_enabled_p ())
1730         vect_print_slp_tree (MSG_NOTE, node);
1731
1732       return true;
1733     }
1734
1735   /* Failed to SLP.  */
1736   /* Free the allocated memory.  */
1737   vect_free_slp_tree (node);
1738   loads.release ();
1739
1740   return false;
1741 }
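
/* Illustration only, not part of the vectorizer: an assumed input that
   vect_analyze_slp_instance can handle in basic-block SLP, given a
   4-lane vector unit:

     void
     foo (int *restrict a, int *restrict b)
     {
       a[0] = b[0] + 1;
       a[1] = b[1] + 2;
       a[2] = b[2] + 3;
       a[3] = b[3] + 4;
     }

   The four stores form the grouped-store root (GROUP_SIZE == 4, and with
   NUNITS == 4 the unrolling factor is 1), the loads from B become a child
   load node with the identity permutation, and the constants {1, 2, 3, 4}
   are later gathered into a single invariant vector.  */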
1742
1743
1744 /* Check if there are stmts in the loop that can be vectorized using SLP.
1745    Build SLP trees of packed scalar stmts if SLP is possible.  */
1746
1747 bool
1748 vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
1749                   unsigned max_tree_size)
1750 {
1751   unsigned int i;
1752   vec<gimple> grouped_stores;
1753   vec<gimple> reductions = vNULL;
1754   vec<gimple> reduc_chains = vNULL;
1755   gimple first_element;
1756   bool ok = false;
1757
1758   if (dump_enabled_p ())
1759     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===\n");
1760
1761   if (loop_vinfo)
1762     {
1763       grouped_stores = LOOP_VINFO_GROUPED_STORES (loop_vinfo);
1764       reduc_chains = LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo);
1765       reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
1766     }
1767   else
1768     grouped_stores = BB_VINFO_GROUPED_STORES (bb_vinfo);
1769
1770   /* Find SLP sequences starting from groups of grouped stores.  */
1771   FOR_EACH_VEC_ELT (grouped_stores, i, first_element)
1772     if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
1773                                    max_tree_size))
1774       ok = true;
1775
1776   if (bb_vinfo && !ok)
1777     {
1778       if (dump_enabled_p ())
1779         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1780                          "Failed to SLP the basic block.\n");
1781
1782       return false;
1783     }
1784
1785   if (loop_vinfo
1786       && LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).length () > 0)
1787     {
1788       /* Find SLP sequences starting from reduction chains.  */
1789       FOR_EACH_VEC_ELT (reduc_chains, i, first_element)
1790         if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
1791                                        max_tree_size))
1792           ok = true;
1793         else
1794           return false;
1795
1796       /* Don't try to vectorize SLP reductions if a reduction chain was
1797          detected.  */
1798       return ok;
1799     }
1800
1801   /* Find SLP sequences starting from groups of reductions.  */
1802   if (loop_vinfo && LOOP_VINFO_REDUCTIONS (loop_vinfo).length () > 1
1803       && vect_analyze_slp_instance (loop_vinfo, bb_vinfo, reductions[0],
1804                                     max_tree_size))
1805     ok = true;
1806
1807   return true;
1808 }
1809
1810
1811 /* For each possible SLP instance decide whether to SLP it and calculate the
1812    overall unrolling factor needed to SLP the loop.  Return TRUE if we decided
1813    to SLP at least one instance.  */
1814
1815 bool
1816 vect_make_slp_decision (loop_vec_info loop_vinfo)
1817 {
1818   unsigned int i, unrolling_factor = 1;
1819   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1820   slp_instance instance;
1821   int decided_to_slp = 0;
1822
1823   if (dump_enabled_p ())
1824     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ==="
1825                      "\n");
1826
1827   FOR_EACH_VEC_ELT (slp_instances, i, instance)
1828     {
1829       /* FORNOW: SLP if you can.  */
1830       if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance))
1831         unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance);
1832
1833       /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
1834          call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
1835          loop-based vectorization.  Such stmts will be marked as HYBRID.  */
1836       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
1837       decided_to_slp++;
1838     }
1839
1840   LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
1841
1842   if (decided_to_slp && dump_enabled_p ())
1843     dump_printf_loc (MSG_NOTE, vect_location,
1844                      "Decided to SLP %d instances. Unrolling factor %d\n",
1845                      decided_to_slp, unrolling_factor);
1846
1847   return (decided_to_slp > 0);
1848 }
1849
1850
1851 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
1852    can't be SLPed) in the tree rooted at NODE.  Mark such stmts as HYBRID.  */
1853
1854 static void
1855 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
1856 {
1857   gimple stmt = SLP_TREE_SCALAR_STMTS (node)[i];
1858   imm_use_iterator imm_iter;
1859   gimple use_stmt;
1860   stmt_vec_info use_vinfo, stmt_vinfo = vinfo_for_stmt (stmt);
1861   slp_tree child;
1862   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1863   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1864   int j;
1865
1866   /* Propagate hybrid down the SLP tree.  */
1867   if (stype == hybrid)
1868     ;
1869   else if (HYBRID_SLP_STMT (stmt_vinfo))
1870     stype = hybrid;
1871   else
1872     {
1873       /* Check if a pure SLP stmt has uses in non-SLP stmts.  */
1874       gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
1875       if (TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME)
1876         FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0))
1877           if (gimple_bb (use_stmt)
1878               && flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1879               && (use_vinfo = vinfo_for_stmt (use_stmt))
1880               && !STMT_SLP_TYPE (use_vinfo)
1881               && (STMT_VINFO_RELEVANT (use_vinfo)
1882                   || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo))
1883                   || (STMT_VINFO_IN_PATTERN_P (use_vinfo)
1884                       && STMT_VINFO_RELATED_STMT (use_vinfo)
1885                       && !STMT_SLP_TYPE (vinfo_for_stmt
1886                             (STMT_VINFO_RELATED_STMT (use_vinfo)))))
1887               && !(gimple_code (use_stmt) == GIMPLE_PHI
1888                    && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
1889             stype = hybrid;
1890     }
1891
1892   if (stype == hybrid)
1893     STMT_SLP_TYPE (stmt_vinfo) = hybrid;
1894
1895   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
1896     vect_detect_hybrid_slp_stmts (child, i, stype);
1897 }
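
/* Illustration only (assumed loop): in

     for (i = 0; i < n; i += 2)
       {
         x0 = b[i] + 1;
         x1 = b[i + 1] + 1;
         a[i] = x0;
         a[i + 1] = x1;
         sum += x0;
       }

   the stmts computing X0 and X1 are pure SLP via the store group, but X0
   also feeds the non-SLP reduction SUM, so the walk above marks it hybrid
   and propagates hybrid down its SLP subtree.  */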
1898
1899 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses.  */
1900
1901 static tree
1902 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
1903 {
1904   walk_stmt_info *wi = (walk_stmt_info *)data;
1905   struct loop *loopp = (struct loop *)wi->info;
1906
1907   if (wi->is_lhs)
1908     return NULL_TREE;
1909
1910   if (TREE_CODE (*tp) == SSA_NAME
1911       && !SSA_NAME_IS_DEFAULT_DEF (*tp))
1912     {
1913       gimple def_stmt = SSA_NAME_DEF_STMT (*tp);
1914       if (flow_bb_inside_loop_p (loopp, gimple_bb (def_stmt))
1915           && PURE_SLP_STMT (vinfo_for_stmt (def_stmt)))
1916         STMT_SLP_TYPE (vinfo_for_stmt (def_stmt)) = hybrid;
1917     }
1918
1919   return NULL_TREE;
1920 }
1921
1922 static tree
1923 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
1924                           walk_stmt_info *)
1925 {
1926   /* If the stmt is in a SLP instance then this isn't a reason
1927      to mark use definitions in other SLP instances as hybrid.  */
1928   if (STMT_SLP_TYPE (vinfo_for_stmt (gsi_stmt (*gsi))) != loop_vect)
1929     *handled = true;
1930   return NULL_TREE;
1931 }
1932
1933 /* Find stmts that must be both vectorized and SLPed.  */
1934
1935 void
1936 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
1937 {
1938   unsigned int i;
1939   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1940   slp_instance instance;
1941
1942   if (dump_enabled_p ())
1943     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ==="
1944                      "\n");
1945
1946   /* First walk all pattern stmts in the loop and mark defs of uses as
1947      hybrid, because immediate uses in pattern stmts are not recorded.  */
1948   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
1949     {
1950       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
1951       for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
1952            gsi_next (&gsi))
1953         {
1954           gimple stmt = gsi_stmt (gsi);
1955           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1956           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
1957             {
1958               walk_stmt_info wi;
1959               memset (&wi, 0, sizeof (wi));
1960               wi.info = LOOP_VINFO_LOOP (loop_vinfo);
1961               gimple_stmt_iterator gsi2
1962                 = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
1963               walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
1964                                 vect_detect_hybrid_slp_1, &wi);
1965               walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
1966                                vect_detect_hybrid_slp_2,
1967                                vect_detect_hybrid_slp_1, &wi);
1968             }
1969         }
1970     }
1971
1972   /* Then walk the SLP instance trees, marking stmts with uses in
1973      non-SLP stmts as hybrid and propagating hybrid down the
1974      SLP tree, collecting the above info on the fly.  */
1975   FOR_EACH_VEC_ELT (slp_instances, i, instance)
1976     {
1977       for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
1978         vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
1979                                       i, pure_slp);
1980     }
1981 }
1982
1983
1984 /* Create and initialize a new bb_vec_info struct for BB, as well as
1985    stmt_vec_info structs for all the stmts in it.  */
1986
1987 static bb_vec_info
1988 new_bb_vec_info (basic_block bb)
1989 {
1990   bb_vec_info res = NULL;
1991   gimple_stmt_iterator gsi;
1992
1993   res = (bb_vec_info) xcalloc (1, sizeof (struct _bb_vec_info));
1994   BB_VINFO_BB (res) = bb;
1995
1996   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1997     {
1998       gimple stmt = gsi_stmt (gsi);
1999       gimple_set_uid (stmt, 0);
2000       set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, NULL, res));
2001     }
2002
2003   BB_VINFO_GROUPED_STORES (res).create (10);
2004   BB_VINFO_SLP_INSTANCES (res).create (2);
2005   BB_VINFO_TARGET_COST_DATA (res) = init_cost (NULL);
2006
2007   bb->aux = res;
2008   return res;
2009 }
2010
2011
2012 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2013    stmts in the basic block.  */
2014
2015 static void
2016 destroy_bb_vec_info (bb_vec_info bb_vinfo)
2017 {
2018   vec<slp_instance> slp_instances;
2019   slp_instance instance;
2020   basic_block bb;
2021   gimple_stmt_iterator si;
2022   unsigned i;
2023
2024   if (!bb_vinfo)
2025     return;
2026
2027   bb = BB_VINFO_BB (bb_vinfo);
2028
2029   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2030     {
2031       gimple stmt = gsi_stmt (si);
2032       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2033
2034       if (stmt_info)
2035         /* Free stmt_vec_info.  */
2036         free_stmt_vec_info (stmt);
2037     }
2038
2039   vect_destroy_datarefs (NULL, bb_vinfo);
2040   free_dependence_relations (BB_VINFO_DDRS (bb_vinfo));
2041   BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
2042   slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2043   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2044     vect_free_slp_instance (instance);
2045   BB_VINFO_SLP_INSTANCES (bb_vinfo).release ();
2046   destroy_cost_data (BB_VINFO_TARGET_COST_DATA (bb_vinfo));
2047   free (bb_vinfo);
2048   bb->aux = NULL;
2049 }
2050
2051
2052 /* Analyze statements contained in an SLP tree node after recursively
2053    analyzing the subtree.  Return TRUE if the operations are supported.  */
2054
2055 static bool
2056 vect_slp_analyze_node_operations (bb_vec_info bb_vinfo, slp_tree node)
2057 {
2058   bool dummy;
2059   int i;
2060   gimple stmt;
2061   slp_tree child;
2062
2063   if (!node)
2064     return true;
2065
2066   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2067     if (!vect_slp_analyze_node_operations (bb_vinfo, child))
2068       return false;
2069
2070   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2071     {
2072       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2073       gcc_assert (stmt_info);
2074       gcc_assert (PURE_SLP_STMT (stmt_info));
2075
2076       if (!vect_analyze_stmt (stmt, &dummy, node))
2077         return false;
2078     }
2079
2080   return true;
2081 }
2082
2083
2084 /* Analyze statements in SLP instances of the basic block.  Return TRUE if the
2085    operations are supported. */
2086
2087 static bool
2088 vect_slp_analyze_operations (bb_vec_info bb_vinfo)
2089 {
2090   vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2091   slp_instance instance;
2092   int i;
2093
2094   for (i = 0; slp_instances.iterate (i, &instance); )
2095     {
2096       if (!vect_slp_analyze_node_operations (bb_vinfo,
2097                                              SLP_INSTANCE_TREE (instance)))
2098         {
2099           vect_free_slp_instance (instance);
2100           slp_instances.ordered_remove (i);
2101         }
2102       else
2103         i++;
2104     }
2105
2106   if (!slp_instances.length ())
2107     return false;
2108
2109   return true;
2110 }
2111
2112
2113 /* Compute the scalar cost of the SLP node NODE and its children
2114    and return it.  Do not account for defs that are marked in LIFE, and
2115    update LIFE according to uses of NODE.  */
2116
2117 static unsigned
2118 vect_bb_slp_scalar_cost (basic_block bb,
2119                          slp_tree node, vec<bool, va_heap> *life)
2120 {
2121   unsigned scalar_cost = 0;
2122   unsigned i;
2123   gimple stmt;
2124   slp_tree child;
2125
2126   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2127     {
2128       unsigned stmt_cost;
2129       ssa_op_iter op_iter;
2130       def_operand_p def_p;
2131       stmt_vec_info stmt_info;
2132
2133       if ((*life)[i])
2134         continue;
2135
2136       /* If there is a non-vectorized use of the defs then the scalar
2137          stmt is kept live, in which case we do not account it or any
2138          required defs in the SLP children in the scalar cost.  This
2139          way we make the vectorization more costly when compared to
2140          the scalar cost.  */
2141       FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2142         {
2143           imm_use_iterator use_iter;
2144           gimple use_stmt;
2145           FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2146             if (!is_gimple_debug (use_stmt)
2147                 && (gimple_code (use_stmt) == GIMPLE_PHI
2148                     || gimple_bb (use_stmt) != bb
2149                     || !STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (use_stmt))))
2150               {
2151                 (*life)[i] = true;
2152                 BREAK_FROM_IMM_USE_STMT (use_iter);
2153               }
2154         }
2155       if ((*life)[i])
2156         continue;
2157
2158       stmt_info = vinfo_for_stmt (stmt);
2159       if (STMT_VINFO_DATA_REF (stmt_info))
2160         {
2161           if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2162             stmt_cost = vect_get_stmt_cost (scalar_load);
2163           else
2164             stmt_cost = vect_get_stmt_cost (scalar_store);
2165         }
2166       else
2167         stmt_cost = vect_get_stmt_cost (scalar_stmt);
2168
2169       scalar_cost += stmt_cost;
2170     }
2171
2172   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2173     scalar_cost += vect_bb_slp_scalar_cost (bb, child, life);
2174
2175   return scalar_cost;
2176 }
2177
2178 /* Check if vectorization of the basic block is profitable.  */
2179
2180 static bool
2181 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2182 {
2183   vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2184   slp_instance instance;
2185   int i, j;
2186   unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2187   unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2188   void *target_cost_data = BB_VINFO_TARGET_COST_DATA (bb_vinfo);
2189   stmt_vec_info stmt_info = NULL;
2190   stmt_vector_for_cost body_cost_vec;
2191   stmt_info_for_cost *ci;
2192
2193   /* Calculate vector costs.  */
2194   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2195     {
2196       body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
2197
2198       FOR_EACH_VEC_ELT (body_cost_vec, j, ci)
2199         {
2200           stmt_info = ci->stmt ? vinfo_for_stmt (ci->stmt) : NULL;
2201           (void) add_stmt_cost (target_cost_data, ci->count, ci->kind,
2202                                 stmt_info, ci->misalign, vect_body);
2203         }
2204     }
2205
2206   /* Calculate scalar cost.  */
2207   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2208     {
2209       auto_vec<bool, 20> life;
2210       life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
2211       scalar_cost += vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2212                                               SLP_INSTANCE_TREE (instance),
2213                                               &life);
2214     }
2215
2216   /* Complete the target-specific cost calculation.  */
2217   finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2218                &vec_inside_cost, &vec_epilogue_cost);
2219
2220   vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2221
2222   if (dump_enabled_p ())
2223     {
2224       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2225       dump_printf (MSG_NOTE, "  Vector inside of basic block cost: %d\n",
2226                    vec_inside_cost);
2227       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n", vec_prologue_cost);
2228       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n", vec_epilogue_cost);
2229       dump_printf (MSG_NOTE, "  Scalar cost of basic block: %d\n", scalar_cost);
2230     }
2231
2232   /* Vectorization is profitable if its cost is less than the cost of the
2233      scalar version.  */
2234   if (vec_outside_cost + vec_inside_cost >= scalar_cost)
2235     return false;
2236
2237   return true;
2238 }
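
/* A worked example with made-up unit costs: if SCALAR_COST == 12 while the
   target reports VEC_INSIDE_COST == 6, VEC_PROLOGUE_COST == 2 and
   VEC_EPILOGUE_COST == 0, then VEC_OUTSIDE_COST == 2 and 6 + 2 == 8 < 12,
   so the basic block is deemed profitable to vectorize.  */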
2239
2240 /* Check if the basic block can be vectorized.  */
2241
2242 static bb_vec_info
2243 vect_slp_analyze_bb_1 (basic_block bb)
2244 {
2245   bb_vec_info bb_vinfo;
2246   vec<slp_instance> slp_instances;
2247   slp_instance instance;
2248   int i;
2249   int min_vf = 2;
2250   unsigned n_stmts = 0;
2251
2252   bb_vinfo = new_bb_vec_info (bb);
2253   if (!bb_vinfo)
2254     return NULL;
2255
2256   if (!vect_analyze_data_refs (NULL, bb_vinfo, &min_vf, &n_stmts))
2257     {
2258       if (dump_enabled_p ())
2259         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2260                          "not vectorized: unhandled data-ref in basic "
2261                          "block.\n");
2262
2263       destroy_bb_vec_info (bb_vinfo);
2264       return NULL;
2265     }
2266
2267   if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
2268     {
2269       if (dump_enabled_p ())
2270         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2271                          "not vectorized: not enough data-refs in "
2272                          "basic block.\n");
2273
2274       destroy_bb_vec_info (bb_vinfo);
2275       return NULL;
2276     }
2277
2278   if (!vect_analyze_data_ref_accesses (NULL, bb_vinfo))
2279     {
2280       if (dump_enabled_p ())
2281         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2282                          "not vectorized: unhandled data access in "
2283                          "basic block.\n");
2284
2285       destroy_bb_vec_info (bb_vinfo);
2286       return NULL;
2287     }
2288
2289   vect_pattern_recog (NULL, bb_vinfo);
2290
2291   if (!vect_analyze_data_refs_alignment (NULL, bb_vinfo))
2292     {
2293       if (dump_enabled_p ())
2294         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2295                          "not vectorized: bad data alignment in basic "
2296                          "block.\n");
2297
2298       destroy_bb_vec_info (bb_vinfo);
2299       return NULL;
2300     }
2301
2302   /* Check the SLP opportunities in the basic block, analyze and build SLP
2303      trees.  */
2304   if (!vect_analyze_slp (NULL, bb_vinfo, n_stmts))
2305     {
2306       if (dump_enabled_p ())
2307         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2308                          "not vectorized: failed to find SLP opportunities "
2309                          "in basic block.\n");
2310
2311       destroy_bb_vec_info (bb_vinfo);
2312       return NULL;
2313     }
2314
2315   slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2316
2317   /* Mark all the statements that we want to vectorize as pure SLP and
2318      relevant.  */
2319   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2320     {
2321       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2322       vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
2323     }
2324
2325   /* Mark all the statements that we do not want to vectorize.  */
2326   for (gimple_stmt_iterator gsi = gsi_start_bb (BB_VINFO_BB (bb_vinfo));
2327        !gsi_end_p (gsi); gsi_next (&gsi))
2328     {
2329       stmt_vec_info vinfo = vinfo_for_stmt (gsi_stmt (gsi));
2330       if (STMT_SLP_TYPE (vinfo) != pure_slp)
2331         STMT_VINFO_VECTORIZABLE (vinfo) = false;
2332     }
2333
2334   /* Analyze dependences.  At this point all stmts not participating in
2335      vectorization have to be marked.  Dependence analysis assumes
2336      that we either vectorize all SLP instances or none at all.  */
2337   if (!vect_slp_analyze_data_ref_dependences (bb_vinfo))
2338     {
2339       if (dump_enabled_p ())
2340         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2341                          "not vectorized: unhandled data dependence "
2342                          "in basic block.\n");
2343
2344       destroy_bb_vec_info (bb_vinfo);
2345       return NULL;
2346     }
2347
2348   if (!vect_verify_datarefs_alignment (NULL, bb_vinfo))
2349     {
2350       if (dump_enabled_p ())
2351         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2352                          "not vectorized: unsupported alignment in basic "
2353                          "block.\n");
2354       destroy_bb_vec_info (bb_vinfo);
2355       return NULL;
2356     }
2357
2358   if (!vect_slp_analyze_operations (bb_vinfo))
2359     {
2360       if (dump_enabled_p ())
2361         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2362                          "not vectorized: bad operation in basic block.\n");
2363
2364       destroy_bb_vec_info (bb_vinfo);
2365       return NULL;
2366     }
2367
2368   /* Cost model: check if the vectorization is worthwhile.  */
2369   if (!unlimited_cost_model (NULL)
2370       && !vect_bb_vectorization_profitable_p (bb_vinfo))
2371     {
2372       if (dump_enabled_p ())
2373         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2374                          "not vectorized: vectorization is not "
2375                          "profitable.\n");
2376
2377       destroy_bb_vec_info (bb_vinfo);
2378       return NULL;
2379     }
2380
2381   if (dump_enabled_p ())
2382     dump_printf_loc (MSG_NOTE, vect_location,
2383                      "Basic block will be vectorized using SLP\n");
2384
2385   return bb_vinfo;
2386 }
2387
2388
2389 bb_vec_info
2390 vect_slp_analyze_bb (basic_block bb)
2391 {
2392   bb_vec_info bb_vinfo;
2393   int insns = 0;
2394   gimple_stmt_iterator gsi;
2395   unsigned int vector_sizes;
2396
2397   if (dump_enabled_p ())
2398     dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");
2399
2400   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2401     {
2402       gimple stmt = gsi_stmt (gsi);
2403       if (!is_gimple_debug (stmt)
2404           && !gimple_nop_p (stmt)
2405           && gimple_code (stmt) != GIMPLE_LABEL)
2406         insns++;
2407     }
2408
2409   if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2410     {
2411       if (dump_enabled_p ())
2412         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2413                          "not vectorized: too many instructions in "
2414                          "basic block.\n");
2415
2416       return NULL;
2417     }
2418
2419   /* Autodetect the first vector size we try.  */
2420   current_vector_size = 0;
2421   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2422
2423   while (1)
2424     {
2425       bb_vinfo = vect_slp_analyze_bb_1 (bb);
2426       if (bb_vinfo)
2427         return bb_vinfo;
2428
2429       destroy_bb_vec_info (bb_vinfo);
2430
2431       vector_sizes &= ~current_vector_size;
2432       if (vector_sizes == 0
2433           || current_vector_size == 0)
2434         return NULL;
2435
2436       /* Try the next biggest vector size.  */
2437       current_vector_size = 1 << floor_log2 (vector_sizes);
2438       if (dump_enabled_p ())
2439         dump_printf_loc (MSG_NOTE, vect_location,
2440                          "***** Re-trying analysis with "
2441                          "vector size %d\n", current_vector_size);
2442     }
2443 }
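
/* A worked example with assumed target sizes: if the target advertises
   vector sizes of 32, 16 and 8 bytes, VECTOR_SIZES == 0x38.  The first
   attempt runs with current_vector_size == 0 (autodetect).  If analysis
   fails at 32 bytes, that bit is cleared and 1 << floor_log2 (0x18) == 16
   is tried next, then 8, before giving up.  */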
2444
2445
2446 /* SLP costs are calculated according to SLP instance unrolling factor (i.e.,
2447    the number of created vector stmts depends on the unrolling factor).
2448    However, the actual number of vector stmts for every SLP node depends on
2449    VF which is set later in vect_analyze_operations ().  Hence, SLP costs
2450    should be updated.  In this function we assume that the inside costs
2451    calculated in vect_model_xxx_cost are linear in ncopies.  */
2452
2453 void
2454 vect_update_slp_costs_according_to_vf (loop_vec_info loop_vinfo)
2455 {
2456   unsigned int i, j, vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2457   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2458   slp_instance instance;
2459   stmt_vector_for_cost body_cost_vec;
2460   stmt_info_for_cost *si;
2461   void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2462
2463   if (dump_enabled_p ())
2464     dump_printf_loc (MSG_NOTE, vect_location,
2465                      "=== vect_update_slp_costs_according_to_vf ===\n");
2466
2467   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2468     {
2469       /* We assume that costs are linear in ncopies.  */
2470       int ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (instance);
2471
2472       /* Record the instance's instructions in the target cost model.
2473          This was delayed until here because the count of instructions
2474          isn't known beforehand.  */
2475       body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
2476
2477       FOR_EACH_VEC_ELT (body_cost_vec, j, si)
2478         (void) add_stmt_cost (data, si->count * ncopies, si->kind,
2479                               vinfo_for_stmt (si->stmt), si->misalign,
2480                               vect_body);
2481     }
2482 }
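
/* A worked example with assumed numbers: with VF == 8 and an SLP instance
   whose unrolling factor is 2, NCOPIES == 8 / 2 == 4, so every entry of
   the instance's body cost vector is added to the target cost model with
   its count multiplied by 4.  */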
2483
2484
2485 /* For constant and loop invariant defs of SLP_NODE this function returns
2486    (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
2487    OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
2488    scalar stmts.  NUMBER_OF_VECTORS is the number of vector defs to create.
2489    REDUC_INDEX is the index of the reduction operand in the statements, unless
2490    it is -1.  */
2491
2492 static void
2493 vect_get_constant_vectors (tree op, slp_tree slp_node,
2494                            vec<tree> *vec_oprnds,
2495                            unsigned int op_num, unsigned int number_of_vectors,
2496                            int reduc_index)
2497 {
2498   vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2499   gimple stmt = stmts[0];
2500   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2501   unsigned nunits;
2502   tree vec_cst;
2503   tree *elts;
2504   unsigned j, number_of_places_left_in_vector;
2505   tree vector_type;
2506   tree vop;
2507   int group_size = stmts.length ();
2508   unsigned int vec_num, i;
2509   unsigned number_of_copies = 1;
2510   vec<tree> voprnds;
2511   voprnds.create (number_of_vectors);
2512   bool constant_p, is_store;
2513   tree neutral_op = NULL;
2514   enum tree_code code = gimple_expr_code (stmt);
2515   gimple def_stmt;
2516   struct loop *loop;
2517   gimple_seq ctor_seq = NULL;
2518
2519   if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
2520       && reduc_index != -1)
2521     {
2522       op_num = reduc_index - 1;
2523       op = gimple_op (stmt, reduc_index);
2524       /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
2525          we need either neutral operands or the original operands.  See
2526          get_initial_def_for_reduction() for details.  */
2527       switch (code)
2528         {
2529           case WIDEN_SUM_EXPR:
2530           case DOT_PROD_EXPR:
2531           case PLUS_EXPR:
2532           case MINUS_EXPR:
2533           case BIT_IOR_EXPR:
2534           case BIT_XOR_EXPR:
2535              if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2536                neutral_op = build_real (TREE_TYPE (op), dconst0);
2537              else
2538                neutral_op = build_int_cst (TREE_TYPE (op), 0);
2539
2540              break;
2541
2542           case MULT_EXPR:
2543              if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2544                neutral_op = build_real (TREE_TYPE (op), dconst1);
2545              else
2546                neutral_op = build_int_cst (TREE_TYPE (op), 1);
2547
2548              break;
2549
2550           case BIT_AND_EXPR:
2551             neutral_op = build_int_cst (TREE_TYPE (op), -1);
2552             break;
2553
2554           /* For MIN/MAX we don't have an easy neutral operand but
2555              the initial values can be used fine here.  Only for
2556              a reduction chain do we have to force a neutral element.  */
2557           case MAX_EXPR:
2558           case MIN_EXPR:
2559             if (!GROUP_FIRST_ELEMENT (stmt_vinfo))
2560               neutral_op = NULL;
2561             else
2562               {
2563                 def_stmt = SSA_NAME_DEF_STMT (op);
2564                 loop = (gimple_bb (stmt))->loop_father;
2565                 neutral_op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2566                                                     loop_preheader_edge (loop));
2567               }
2568             break;
2569
2570           default:
2571             neutral_op = NULL;
2572         }
2573     }
2574
2575   if (STMT_VINFO_DATA_REF (stmt_vinfo))
2576     {
2577       is_store = true;
2578       op = gimple_assign_rhs1 (stmt);
2579     }
2580   else
2581     is_store = false;
2582
2583   gcc_assert (op);
2584
2585   if (CONSTANT_CLASS_P (op))
2586     constant_p = true;
2587   else
2588     constant_p = false;
2589
2590   vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
2591   gcc_assert (vector_type);
2592   nunits = TYPE_VECTOR_SUBPARTS (vector_type);
2593
2594   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
2595      created vectors. It is greater than 1 if unrolling is performed.
2596
2597      For example, we have two scalar operands, s1 and s2 (e.g., group of
2598      strided accesses of size two), while NUNITS is four (i.e., four scalars
2599      of this type can be packed in a vector).  The output vector will contain
2600      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
2601      will be 2).
2602
2603      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
2604      containing the operands.
2605
2606      For example, NUNITS is four as before, and the group size is 8
2607      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
2608      {s5, s6, s7, s8}.  */
2609
2610   number_of_copies = least_common_multiple (nunits, group_size) / group_size;
2611
2612   number_of_places_left_in_vector = nunits;
2613   elts = XALLOCAVEC (tree, nunits);
2614   for (j = 0; j < number_of_copies; j++)
2615     {
2616       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
2617         {
2618           if (is_store)
2619             op = gimple_assign_rhs1 (stmt);
2620           else
2621             {
2622               switch (code)
2623                 {
2624                   case COND_EXPR:
2625                     if (op_num == 0 || op_num == 1)
2626                       {
2627                         tree cond = gimple_assign_rhs1 (stmt);
2628                         op = TREE_OPERAND (cond, op_num);
2629                       }
2630                     else
2631                       {
2632                         if (op_num == 2)
2633                           op = gimple_assign_rhs2 (stmt);
2634                         else
2635                           op = gimple_assign_rhs3 (stmt);
2636                       }
2637                     break;
2638
2639                   case CALL_EXPR:
2640                     op = gimple_call_arg (stmt, op_num);
2641                     break;
2642
2643                   case LSHIFT_EXPR:
2644                   case RSHIFT_EXPR:
2645                   case LROTATE_EXPR:
2646                   case RROTATE_EXPR:
2647                     op = gimple_op (stmt, op_num + 1);
2648                     /* Unlike the other binary operators, shifts/rotates
2649                        have an int shift count instead of one of the same
2650                        type as the lhs, so make sure the scalar is the
2651                        right type if we are dealing with vectors of
2652                        long long/long/short/char.  */
2653                     if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
2654                       op = fold_convert (TREE_TYPE (vector_type), op);
2655                     break;
2656
2657                   default:
2658                     op = gimple_op (stmt, op_num + 1);
2659                     break;
2660                 }
2661             }
2662
2663           if (reduc_index != -1)
2664             {
2665               loop = (gimple_bb (stmt))->loop_father;
2666               def_stmt = SSA_NAME_DEF_STMT (op);
2667
2668               gcc_assert (loop);
2669
2670               /* Get the def before the loop.  In a reduction chain we have
2671                  only one initial value.  */
2672               if ((j != (number_of_copies - 1)
2673                    || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2674                        && i != 0))
2675                   && neutral_op)
2676                 op = neutral_op;
2677               else
2678                 op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2679                                             loop_preheader_edge (loop));
2680             }
2681
2682           /* Create 'vect_ = {op0,op1,...,opn}'.  */
2683           number_of_places_left_in_vector--;
2684           if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
2685             {
2686               if (CONSTANT_CLASS_P (op))
2687                 {
2688                   op = fold_unary (VIEW_CONVERT_EXPR,
2689                                    TREE_TYPE (vector_type), op);
2690                   gcc_assert (op && CONSTANT_CLASS_P (op));
2691                 }
2692               else
2693                 {
2694                   tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
2695                   gimple init_stmt;
2696                   op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type), op);
2697                   init_stmt
2698                     = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR, op);
2699                   gimple_seq_add_stmt (&ctor_seq, init_stmt);
2700                   op = new_temp;
2701                 }
2702             }
2703           elts[number_of_places_left_in_vector] = op;
2704           if (!CONSTANT_CLASS_P (op))
2705             constant_p = false;
2706
2707           if (number_of_places_left_in_vector == 0)
2708             {
2709               number_of_places_left_in_vector = nunits;
2710
2711               if (constant_p)
2712                 vec_cst = build_vector (vector_type, elts);
2713               else
2714                 {
2715                   vec<constructor_elt, va_gc> *v;
2716                   unsigned k;
2717                   vec_alloc (v, nunits);
2718                   for (k = 0; k < nunits; ++k)
2719                     CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
2720                   vec_cst = build_constructor (vector_type, v);
2721                 }
2722               voprnds.quick_push (vect_init_vector (stmt, vec_cst,
2723                                                     vector_type, NULL));
2724               if (ctor_seq != NULL)
2725                 {
2726                   gimple init_stmt = SSA_NAME_DEF_STMT (voprnds.last ());
2727                   gimple_stmt_iterator gsi = gsi_for_stmt (init_stmt);
2728                   gsi_insert_seq_before_without_update (&gsi, ctor_seq,
2729                                                         GSI_SAME_STMT);
2730                   ctor_seq = NULL;
2731                 }
2732             }
2733         }
2734     }
2735
2736   /* Since the vectors are created in reverse order, we should reverse
2737      them back.  */
2738   vec_num = voprnds.length ();
2739   for (j = vec_num; j != 0; j--)
2740     {
2741       vop = voprnds[j - 1];
2742       vec_oprnds->quick_push (vop);
2743     }
2744
2745   voprnds.release ();
2746
2747   /* If VF is greater than the unrolling factor needed for the SLP
2748      group of stmts, NUMBER_OF_VECTORS to be created is greater than
2749      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
2750      to replicate the vectors.  */
2751   while (number_of_vectors > vec_oprnds->length ())
2752     {
2753       tree neutral_vec = NULL;
2754
2755       if (neutral_op)
2756         {
2757           if (!neutral_vec)
2758             neutral_vec = build_vector_from_val (vector_type, neutral_op);
2759
2760           vec_oprnds->quick_push (neutral_vec);
2761         }
2762       else
2763         {
2764           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
2765             vec_oprnds->quick_push (vop);
2766         }
2767     }
2768 }
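
/* A worked example with assumed values: for a PLUS_EXPR reduction over
   ints, NEUTRAL_OP is 0.  If NUMBER_OF_VECTORS == 4 but only two
   initial-value vectors were built above, the final loop pushes two copies
   of the neutral vector {0, 0, 0, 0}, which cannot change the reduction
   result.  */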
2769
2770
2771 /* Get vectorized definitions from SLP_NODE that contains corresponding
2772    vectorized def-stmts.  */
2773
2774 static void
2775 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
2776 {
2777   tree vec_oprnd;
2778   gimple vec_def_stmt;
2779   unsigned int i;
2780
2781   gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
2782
2783   FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
2784     {
2785       gcc_assert (vec_def_stmt);
2786       vec_oprnd = gimple_get_lhs (vec_def_stmt);
2787       vec_oprnds->quick_push (vec_oprnd);
2788     }
2789 }
2790
2791
2792 /* Get vectorized definitions for SLP_NODE.
2793    If the scalar definitions are loop invariants or constants, collect them and
2794    call vect_get_constant_vectors() to create vector stmts.
2795    Otherwise, the def-stmts must be already vectorized and the vectorized stmts
2796    must be stored in the corresponding child of SLP_NODE, and we call
2797    vect_get_slp_vect_defs () to retrieve them.  */
2798
2799 void
2800 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
2801                    vec<vec<tree> > *vec_oprnds, int reduc_index)
2802 {
2803   gimple first_stmt;
2804   int number_of_vects = 0, i;
2805   unsigned int child_index = 0;
2806   HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
2807   slp_tree child = NULL;
2808   vec<tree> vec_defs;
2809   tree oprnd;
2810   bool vectorized_defs;
2811
2812   first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
2813   FOR_EACH_VEC_ELT (ops, i, oprnd)
2814     {
2815       /* For each operand we check if it has vectorized definitions in a child
2816          node or we need to create them (for invariants and constants).  We
2817          check if the LHS of the first stmt of the next child matches OPRND.
2818          If it does, we found the correct child.  Otherwise, we call
2819          vect_get_constant_vectors () and do not advance CHILD_INDEX, so that
2820          this child node is checked again for the next operand.  */
2821       vectorized_defs = false;
2822       if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
2823         {
2824           child = SLP_TREE_CHILDREN (slp_node)[child_index];
2825
2826           /* We have to check both pattern and original def, if available.  */
2827           gimple first_def = SLP_TREE_SCALAR_STMTS (child)[0];
2828           gimple related = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
2829
2830           if (operand_equal_p (oprnd, gimple_get_lhs (first_def), 0)
2831               || (related
2832                   && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
2833             {
2834               /* The number of vector defs is determined by the number of
2835                  vector statements in the node from which we get those
2836                  statements.  */
2837               number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
2838               vectorized_defs = true;
2839               child_index++;
2840             }
2841         }
2842
2843       if (!vectorized_defs)
2844         {
2845           if (i == 0)
2846             {
2847               number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
2848               /* Number of vector stmts was calculated according to LHS in
2849                  vect_schedule_slp_instance (), fix it by replacing LHS with
2850                  RHS, if necessary.  See vect_get_smallest_scalar_type () for
2851                  details.  */
2852               vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
2853                                              &rhs_size_unit);
2854               if (rhs_size_unit != lhs_size_unit)
2855                 {
2856                   number_of_vects *= rhs_size_unit;
2857                   number_of_vects /= lhs_size_unit;
2858                 }
2859             }
2860         }
2861
2862       /* Allocate memory for vectorized defs.  */
2863       vec_defs = vNULL;
2864       vec_defs.create (number_of_vects);
2865
2866       /* For reduction defs we call vect_get_constant_vectors (), since we are
2867          looking for initial loop invariant values.  */
2868       if (vectorized_defs && reduc_index == -1)
2869         /* The defs are already vectorized.  */
2870         vect_get_slp_vect_defs (child, &vec_defs);
2871       else
2872         /* Build vectors from scalar defs.  */
2873         vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
2874                                    number_of_vects, reduc_index);
2875
2876       vec_oprnds->quick_push (vec_defs);
2877
2878       /* For reductions, we only need initial values.  */
2879       if (reduc_index != -1)
2880         return;
2881     }
2882 }
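
/* A worked example with assumed types: if the first scalar stmt narrows an
   INT operand to a SHORT lhs, LHS_SIZE_UNIT == 2 and RHS_SIZE_UNIT == 4,
   so NUMBER_OF_VECTS is multiplied by 4 / 2: a short vector holds twice as
   many lanes as an int vector, hence twice as many int vectors are needed
   to supply the same number of scalars.  */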
2883
2884
2885 /* Create NCOPIES permutation statements using the vector mask MASK and two
2886    input vectors placed in DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX
2887    for the first copy, shifting by STRIDE elements of DR_CHAIN for every
2888    subsequent copy.
2889    (STRIDE is the number of vectorized stmts for NODE divided by the number
2890    of copies.)
2891    VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE
2892    where the created stmts must be inserted.  */
2893
2894 static inline void
2895 vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
2896                            tree mask, int first_vec_indx, int second_vec_indx,
2897                            gimple_stmt_iterator *gsi, slp_tree node,
2898                            tree vectype, vec<tree> dr_chain,
2899                            int ncopies, int vect_stmts_counter)
2900 {
2901   tree perm_dest;
2902   gimple perm_stmt = NULL;
2903   stmt_vec_info next_stmt_info;
2904   int i, stride;
2905   tree first_vec, second_vec, data_ref;
2906
2907   stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
2908
2909   /* Initialize the vect stmts of NODE to properly insert the generated
2910      stmts later.  */
2911   for (i = SLP_TREE_VEC_STMTS (node).length ();
2912        i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
2913     SLP_TREE_VEC_STMTS (node).quick_push (NULL);
2914
2915   perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
2916   for (i = 0; i < ncopies; i++)
2917     {
2918       first_vec = dr_chain[first_vec_indx];
2919       second_vec = dr_chain[second_vec_indx];
2920
2921       /* Generate the permute statement.  */
2922       perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
2923                                        first_vec, second_vec, mask);
2924       data_ref = make_ssa_name (perm_dest, perm_stmt);
2925       gimple_set_lhs (perm_stmt, data_ref);
2926       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
2927
2928       /* Store the vector statement in NODE.  */
2929       SLP_TREE_VEC_STMTS (node)[stride * i + vect_stmts_counter] = perm_stmt;
2930
2931       first_vec_indx += stride;
2932       second_vec_indx += stride;
2933     }
2934
2935   /* Mark the scalar stmt as vectorized.  */
2936   next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
2937   STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
2938 }
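
/* Illustration of the generated VEC_PERM_EXPR semantics (element values
   assumed): with FIRST_VEC == {a0, a1, a2, a3}, SECOND_VEC ==
   {a4, a5, a6, a7} and MASK == {0, 2, 4, 6}, the permute stmt selects
   from the concatenation of the two inputs and yields {a0, a2, a4, a6}.  */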
2939
2940
2941 /* Given FIRST_MASK_ELEMENT - the mask element in element representation,
2942    return in CURRENT_MASK_ELEMENT its equivalent in target specific
2943    representation.  Check that the mask is valid and return FALSE if not.
2944    Return TRUE in NEED_NEXT_VECTOR if the permutation requires moving to
2945    the next vector, i.e., the current first vector is not needed.  */

static bool
vect_get_mask_element (gimple stmt, int first_mask_element, int m,
                       int mask_nunits, bool only_one_vec, int index,
                       unsigned char *mask, int *current_mask_element,
                       bool *need_next_vector, int *number_of_mask_fixes,
                       bool *mask_fixed, bool *needs_first_vector)
{
  int i;

  /* Convert to target specific representation.  */
  *current_mask_element = first_mask_element + m;
  /* Adjust the value in case it's a mask for second and third vectors.  */
  *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1);

  if (*current_mask_element < mask_nunits)
    *needs_first_vector = true;

  /* We have only one input vector to permute but the mask accesses values in
     the next vector as well.  */
  if (only_one_vec && *current_mask_element >= mask_nunits)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "permutation requires at least two vectors ");
          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      return false;
    }

  /* The mask requires the next vector.  */
  while (*current_mask_element >= mask_nunits * 2)
    {
      if (*needs_first_vector || *mask_fixed)
        {
          /* We either need the first vector too or have already moved to the
             next vector. In both cases, this permutation needs three
             vectors.  */
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "permutation requires at "
                               "least three vectors ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return false;
        }

      /* We move to the next vector, dropping the first one and working with
         the second and the third - we need to adjust the values of the mask
         accordingly.  */
      *current_mask_element -= mask_nunits * *number_of_mask_fixes;

      for (i = 0; i < index; i++)
        mask[i] -= mask_nunits * *number_of_mask_fixes;

      (*number_of_mask_fixes)++;
      *mask_fixed = true;
    }

  *need_next_vector = *mask_fixed;

  /* This was the last element of this mask. Start a new one.  */
  if (index == mask_nunits - 1)
    {
      *number_of_mask_fixes = 1;
      *mask_fixed = false;
      *needs_first_vector = false;
    }

  return true;
}


/* Generate vector permute statements from a list of loads in DR_CHAIN.
   If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
   permute statements for the SLP node NODE of the SLP instance
   SLP_NODE_INSTANCE.  */

bool
vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
                              gimple_stmt_iterator *gsi, int vf,
                              slp_instance slp_node_instance, bool analyze_only)
{
  gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree mask_element_type = NULL_TREE, mask_type;
  int i, j, k, nunits, vec_index = 0, scalar_index;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  gimple next_scalar_stmt;
  int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
  int first_mask_element;
  int index, unroll_factor, current_mask_element, ncopies;
  unsigned char *mask;
  bool only_one_vec = false, need_next_vector = false;
  int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
  int number_of_mask_fixes = 1;
  bool mask_fixed = false;
  bool needs_first_vector = false;
  machine_mode mode;

  mode = TYPE_MODE (vectype);

  if (!can_vec_perm_p (mode, false, NULL))
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "no vect permute for ");
          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      return false;
    }

  /* The generic VEC_PERM_EXPR code always uses an integral type of the
     same size as the vector element being permuted.  */
  mask_element_type = lang_hooks.types.type_for_mode
    (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))), 1);
  mask_type = get_vectype_for_scalar_type (mask_element_type);
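  /* E.g. (an illustrative sketch): for a V4SF vectype the element mode is
     SFmode, int_mode_for_mode yields SImode, so MASK_ELEMENT_TYPE is a
     32-bit integer type and MASK_TYPE the corresponding four-element
     integer vector type.  */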
  nunits = TYPE_VECTOR_SUBPARTS (vectype);
  mask = XALLOCAVEC (unsigned char, nunits);
  unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);

  /* The number of vector stmts to generate based only on the unrolling
     factor of SLP_NODE_INSTANCE.  */
  orig_vec_stmts_num = group_size * unroll_factor / nunits;
  if (orig_vec_stmts_num == 1)
    only_one_vec = true;

  /* The number of copies is determined by the final vectorization factor
     relative to the unrolling factor of SLP_NODE_INSTANCE.  */
  ncopies = vf / unroll_factor;
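  /* For instance (illustrative numbers only): GROUP_SIZE == 2 and
     NUNITS == 4 give an instance unrolling factor of 2, so
     ORIG_VEC_STMTS_NUM == 2 * 2 / 4 == 1 and the whole group fits into a
     single vector (ONLY_ONE_VEC); with VF == 4, NCOPIES == 4 / 2 == 2.  */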

  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
    return false;

  /* Generate permutation masks for every NODE. Number of masks for each NODE
     is equal to GROUP_SIZE.
     E.g., we have a group of three nodes with three loads from the same
     location in each node, and the vector size is 4. I.e., we have an
     a0b0c0a1b1c1... sequence and we need to create the following vectors:
     for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
     for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
     ...

     The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
     The last mask is illegal since we assume two operands for the permute
     operation, and the mask element values can't be outside that range.
     Hence, the last mask must be converted into {2,5,5,5}.
     For the first two permutations we need the first and the second input
     vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
     we need the second and the third vectors: {b1,c1,a2,b2} and
     {c2,a3,b3,c3}.  */

  scalar_index = 0;
  index = 0;
  vect_stmts_counter = 0;
  vec_index = 0;
  first_vec_index = vec_index++;
  if (only_one_vec)
    second_vec_index = first_vec_index;
  else
    second_vec_index = vec_index++;

  for (j = 0; j < unroll_factor; j++)
    {
      for (k = 0; k < group_size; k++)
        {
          i = SLP_TREE_LOAD_PERMUTATION (node)[k];
          first_mask_element = i + j * group_size;
          if (!vect_get_mask_element (stmt, first_mask_element, 0,
                                      nunits, only_one_vec, index,
                                      mask, &current_mask_element,
                                      &need_next_vector,
                                      &number_of_mask_fixes, &mask_fixed,
                                      &needs_first_vector))
            return false;
          gcc_assert (current_mask_element < 2 * nunits);
          mask[index++] = current_mask_element;

          if (index == nunits)
            {
              index = 0;
              if (!can_vec_perm_p (mode, false, mask))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                       vect_location,
                                       "unsupported vect permute { ");
                      for (i = 0; i < nunits; ++i)
                        dump_printf (MSG_MISSED_OPTIMIZATION, "%d ",
                                     mask[i]);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
                    }
                  return false;
                }

              if (!analyze_only)
                {
                  int l;
                  tree mask_vec, *mask_elts;
                  mask_elts = XALLOCAVEC (tree, nunits);
                  for (l = 0; l < nunits; ++l)
                    mask_elts[l] = build_int_cst (mask_element_type,
                                                  mask[l]);
                  mask_vec = build_vector (mask_type, mask_elts);

                  if (need_next_vector)
                    {
                      first_vec_index = second_vec_index;
                      second_vec_index = vec_index;
                    }

                  next_scalar_stmt
                    = SLP_TREE_SCALAR_STMTS (node)[scalar_index++];

                  vect_create_mask_and_perm (stmt, next_scalar_stmt, mask_vec,
                                             first_vec_index, second_vec_index,
                                             gsi, node, vectype, dr_chain,
                                             ncopies, vect_stmts_counter++);
                }
            }
        }
    }

  return true;
}


/* Vectorize SLP instance tree in postorder.  */

static bool
vect_schedule_slp_instance (slp_tree node, slp_instance instance,
                            unsigned int vectorization_factor)
{
  gimple stmt;
  bool grouped_store, is_store;
  gimple_stmt_iterator si;
  stmt_vec_info stmt_info;
  unsigned int vec_stmts_size, nunits, group_size;
  tree vectype;
  int i;
  slp_tree child;

  if (!node)
    return false;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_schedule_slp_instance (child, instance, vectorization_factor);

  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);

  /* VECTYPE is the type of the destination.  */
  vectype = STMT_VINFO_VECTYPE (stmt_info);
  nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
  group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  /* For each SLP instance calculate the number of vector stmts to be created
     for the scalar stmts in each node of the SLP tree.  The number of vector
     elements in one vector iteration is the number of scalar elements in
     one scalar iteration (GROUP_SIZE) multiplied by VF divided by the vector
     size.  */
  vec_stmts_size = (vectorization_factor * group_size) / nunits;
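  /* E.g. (illustrative numbers): GROUP_SIZE == 2, VF == 4 and NUNITS == 4
     yield VEC_STMTS_SIZE == (4 * 2) / 4 == 2 vector stmts for NODE.  */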

  if (!SLP_TREE_VEC_STMTS (node).exists ())
    {
      SLP_TREE_VEC_STMTS (node).create (vec_stmts_size);
      SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "------>vectorizing SLP node starting from: ");
      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
      dump_printf (MSG_NOTE, "\n");
    }

  /* Loads should be inserted before the first load.  */
  if (SLP_INSTANCE_FIRST_LOAD_STMT (instance)
      && STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && !REFERENCE_CLASS_P (gimple_get_lhs (stmt))
      && SLP_TREE_LOAD_PERMUTATION (node).exists ())
    si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance));
  else if (is_pattern_stmt_p (stmt_info))
    si = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
  else
    si = gsi_for_stmt (stmt);

  /* Stores should be inserted just before the last store.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && REFERENCE_CLASS_P (gimple_get_lhs (stmt)))
    {
      gimple last_store = vect_find_last_store_in_slp_instance (instance);
      if (is_pattern_stmt_p (vinfo_for_stmt (last_store)))
        last_store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (last_store));
      si = gsi_for_stmt (last_store);
    }

  /* Mark the first element of the reduction chain as reduction to properly
     transform the node.  In the analysis phase only the last element of the
     chain is marked as reduction.  */
  if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
    {
      STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
      STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
    }

  is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
  return is_store;
}

/* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
   For loop vectorization this is done in vectorizable_call, but for SLP
   it needs to be deferred until the end of vect_schedule_slp, because
   multiple SLP instances may refer to the same scalar stmt.  */
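
/* As a hypothetical GIMPLE sketch: a pure-SLP scalar call such as
   x_3 = __builtin_sqrtf (y_2) would be rewritten here to x_3 = 0.0, which
   keeps the SSA name defined until DCE removes the dead assignment.  */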

static void
vect_remove_slp_scalar_calls (slp_tree node)
{
  gimple stmt, new_stmt;
  gimple_stmt_iterator gsi;
  int i;
  slp_tree child;
  tree lhs;
  stmt_vec_info stmt_info;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_remove_slp_scalar_calls (child);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
        continue;
      stmt_info = vinfo_for_stmt (stmt);
      if (stmt_info == NULL
          || is_pattern_stmt_p (stmt_info)
          || !PURE_SLP_STMT (stmt_info))
        continue;
      lhs = gimple_call_lhs (stmt);
      new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
      set_vinfo_for_stmt (new_stmt, stmt_info);
      set_vinfo_for_stmt (stmt, NULL);
      STMT_VINFO_STMT (stmt_info) = new_stmt;
      gsi = gsi_for_stmt (stmt);
      gsi_replace (&gsi, new_stmt, false);
      SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
    }
}

/* Generate vector code for all SLP instances in the loop/basic block.  */

bool
vect_schedule_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
{
  vec<slp_instance> slp_instances;
  slp_instance instance;
  unsigned int i, vf;
  bool is_store = false;

  if (loop_vinfo)
    {
      slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
      vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
    }
  else
    {
      slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
      vf = 1;
    }

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* Schedule the tree of INSTANCE.  */
      is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
                                             instance, vf);
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "vectorizing stmts using SLP.\n");
    }

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree root = SLP_INSTANCE_TREE (instance);
      gimple store;
      unsigned int j;
      gimple_stmt_iterator gsi;

      /* Remove scalar call stmts.  Do not do this for basic-block
         vectorization as not all uses may be vectorized.
         ???  Why should this be necessary?  DCE should be able to
         remove the stmts itself.
         ???  For BB vectorization we can as well remove scalar
         stmts starting from the SLP tree root if they have no
         uses.  */
      if (loop_vinfo)
        vect_remove_slp_scalar_calls (root);

      for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
                  && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
        {
          if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
            break;

          if (is_pattern_stmt_p (vinfo_for_stmt (store)))
            store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
          /* Free the attached stmt_vec_info and remove the stmt.  */
          gsi = gsi_for_stmt (store);
          unlink_stmt_vdef (store);
          gsi_remove (&gsi, true);
          release_defs (store);
          free_stmt_vec_info (store);
        }
    }

  return is_store;
}


/* Vectorize the basic block.  */

void
vect_slp_transform_bb (basic_block bb)
{
  bb_vec_info bb_vinfo = vec_info_for_bb (bb);
  gimple_stmt_iterator si;

  gcc_assert (bb_vinfo);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB\n");

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      gimple stmt = gsi_stmt (si);
      stmt_vec_info stmt_info;

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "------>SLPing statement: ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
          dump_printf (MSG_NOTE, "\n");
        }

      stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (stmt_info);

      /* Schedule all the SLP instances when the first SLP stmt is reached.  */
      if (STMT_SLP_TYPE (stmt_info))
        {
          vect_schedule_slp (NULL, bb_vinfo);
          break;
        }
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "BASIC BLOCK VECTORIZED\n");

  destroy_bb_vec_info (bb_vinfo);
}