genattrtab.c (write_header): Include hash-set.h...
[platform/upstream/gcc.git] / gcc / tree-vect-slp.c
1 /* SLP - Basic Block Vectorization
2    Copyright (C) 2007-2015 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
4    and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "hash-set.h"
28 #include "machmode.h"
29 #include "vec.h"
30 #include "double-int.h"
31 #include "input.h"
32 #include "alias.h"
33 #include "symtab.h"
34 #include "wide-int.h"
35 #include "inchash.h"
36 #include "tree.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "target.h"
40 #include "predict.h"
41 #include "hard-reg-set.h"
42 #include "input.h"
43 #include "function.h"
44 #include "basic-block.h"
45 #include "gimple-pretty-print.h"
46 #include "tree-ssa-alias.h"
47 #include "internal-fn.h"
48 #include "gimple-expr.h"
49 #include "is-a.h"
50 #include "gimple.h"
51 #include "gimple-iterator.h"
52 #include "gimple-ssa.h"
53 #include "tree-phinodes.h"
54 #include "ssa-iterators.h"
55 #include "stringpool.h"
56 #include "tree-ssanames.h"
57 #include "tree-pass.h"
58 #include "cfgloop.h"
59 #include "expr.h"
60 #include "recog.h"              /* FIXME: for insn_data */
61 #include "insn-codes.h"
62 #include "optabs.h"
63 #include "tree-vectorizer.h"
64 #include "langhooks.h"
65 #include "gimple-walk.h"
66
67 /* Extract the location of the basic block in the source code.
68    Return the basic block location if succeed and NULL if not.  */
69
70 source_location
71 find_bb_location (basic_block bb)
72 {
73   gimple stmt = NULL;
74   gimple_stmt_iterator si;
75
76   if (!bb)
77     return UNKNOWN_LOCATION;
78
79   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
80     {
81       stmt = gsi_stmt (si);
82       if (gimple_location (stmt) != UNKNOWN_LOCATION)
83         return gimple_location (stmt);
84     }
85
86   return UNKNOWN_LOCATION;
87 }
88
89
90 /* Recursively free the memory allocated for the SLP tree rooted at NODE.  */
91
92 static void
93 vect_free_slp_tree (slp_tree node)
94 {
95   int i;
96   slp_tree child;
97
98   if (!node)
99     return;
100
101   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
102     vect_free_slp_tree (child);
103
104   SLP_TREE_CHILDREN (node).release ();
105   SLP_TREE_SCALAR_STMTS (node).release ();
106   SLP_TREE_VEC_STMTS (node).release ();
107   SLP_TREE_LOAD_PERMUTATION (node).release ();
108
109   free (node);
110 }
111
112
113 /* Free the memory allocated for the SLP instance.  */
114
115 void
116 vect_free_slp_instance (slp_instance instance)
117 {
118   vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
119   SLP_INSTANCE_LOADS (instance).release ();
120   SLP_INSTANCE_BODY_COST_VEC (instance).release ();
121   free (instance);
122 }
123
124
125 /* Create an SLP node for SCALAR_STMTS.  */
126
127 static slp_tree
128 vect_create_new_slp_node (vec<gimple> scalar_stmts)
129 {
130   slp_tree node;
131   gimple stmt = scalar_stmts[0];
132   unsigned int nops;
133
134   if (is_gimple_call (stmt))
135     nops = gimple_call_num_args (stmt);
136   else if (is_gimple_assign (stmt))
137     {
138       nops = gimple_num_ops (stmt) - 1;
139       if (gimple_assign_rhs_code (stmt) == COND_EXPR)
140         nops++;
141     }
142   else
143     return NULL;
144
145   node = XNEW (struct _slp_tree);
146   SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
147   SLP_TREE_VEC_STMTS (node).create (0);
148   SLP_TREE_CHILDREN (node).create (nops);
149   SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
150
151   return node;
152 }
153
154
155 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
156    operand.  */
157 static vec<slp_oprnd_info> 
158 vect_create_oprnd_info (int nops, int group_size)
159 {
160   int i;
161   slp_oprnd_info oprnd_info;
162   vec<slp_oprnd_info> oprnds_info;
163
164   oprnds_info.create (nops);
165   for (i = 0; i < nops; i++)
166     {
167       oprnd_info = XNEW (struct _slp_oprnd_info);
168       oprnd_info->def_stmts.create (group_size);
169       oprnd_info->first_dt = vect_uninitialized_def;
170       oprnd_info->first_op_type = NULL_TREE;
171       oprnd_info->first_pattern = false;
172       oprnds_info.quick_push (oprnd_info);
173     }
174
175   return oprnds_info;
176 }
177
178
179 /* Free operands info.  */
180
181 static void
182 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
183 {
184   int i;
185   slp_oprnd_info oprnd_info;
186
187   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
188     {
189       oprnd_info->def_stmts.release ();
190       XDELETE (oprnd_info);
191     }
192
193   oprnds_info.release ();
194 }
195
196
197 /* Find the place of the data-ref in STMT in the interleaving chain that starts
198    from FIRST_STMT.  Return -1 if the data-ref is not a part of the chain.  */
199
200 static int
201 vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
202 {
203   gimple next_stmt = first_stmt;
204   int result = 0;
205
206   if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
207     return -1;
208
209   do
210     {
211       if (next_stmt == stmt)
212         return result;
213       result++;
214       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
215     }
216   while (next_stmt);
217
218   return -1;
219 }
220
221
/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  If there was a fatal error
   return -1, if the error could be corrected by swapping operands of the
   operation return 1, if everything is ok return 0.  */

static int
vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                             gimple stmt, bool first,
                             vec<slp_oprnd_info> *oprnds_info)
{
  tree oprnd;
  unsigned int i, number_of_oprnds;
  tree def;
  gimple def_stmt;
  enum vect_def_type dt = vect_uninitialized_def;
  struct loop *loop = NULL;
  bool pattern = false;
  slp_oprnd_info oprnd_info;
  /* gimple_op index of the first rhs operand.  */
  int first_op_idx = 1;
  bool commutative = false;
  /* True iff STMT is a COND_EXPR assignment, whose operands 0 and 1 live
     inside the embedded comparison rather than in the stmt itself.  */
  bool first_op_cond = false;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  if (is_gimple_call (stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      /* Call arguments are accessed via gimple_op starting at index 3.  */
      first_op_idx = 3;
    }
  else if (is_gimple_assign (stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        {
          /* The embedded comparison contributes an extra operand; its
             two sides can always be swapped (the comparison code is
             adjusted at the end of this function).  */
          first_op_cond = true;
          commutative = true;
          number_of_oprnds++;
        }
      else
        commutative = commutative_tree_code (code);
    }
  else
    /* Neither a call nor an assignment: fatal.  */
    return -1;

  bool swapped = false;
  for (i = 0; i < number_of_oprnds; i++)
    {
again:
      if (first_op_cond)
        {
          /* For a COND_EXPR, operands 0 and 1 are the two sides of the
             comparison; the remaining operands follow in the stmt.  */
          if (i == 0 || i == 1)
            oprnd = TREE_OPERAND (gimple_op (stmt, first_op_idx),
                                  swapped ? !i : i);
          else
            oprnd = gimple_op (stmt, first_op_idx + i - 1);
        }
      else
        /* When SWAPPED, operands 0 and 1 exchange places.  */
        oprnd = gimple_op (stmt, first_op_idx + (swapped ? !i : i));

      oprnd_info = (*oprnds_info)[i];

      if (!vect_is_simple_use (oprnd, NULL, loop_vinfo, bb_vinfo, &def_stmt,
                               &def, &dt)
          || (!def_stmt && dt != vect_constant_def))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: can't find def for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }

      /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
         from the pattern.  Check that all the stmts of the node are in the
         pattern.  */
      if (def_stmt && gimple_bb (def_stmt)
          && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
              || (!loop && gimple_bb (def_stmt) == BB_VINFO_BB (bb_vinfo)
                  && gimple_code (def_stmt) != GIMPLE_PHI))
          && vinfo_for_stmt (def_stmt)
          && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
        {
          pattern = true;
          if (!first && !oprnd_info->first_pattern)
            {
              /* Pattern/non-pattern mismatch against the first stmt:
                 retry once with the operands swapped before failing.  */
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: some of the stmts"
                                   " are in a pattern, and others are not ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }

              return 1;
            }

          /* Replace DEF_STMT/DT with the pattern statement's data.  */
          def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
          dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));

          if (dt == vect_unknown_def_type)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Unsupported pattern.\n");
              return -1;
            }

          /* Re-extract DEF from the pattern's defining statement.  */
          switch (gimple_code (def_stmt))
            {
              case GIMPLE_PHI:
                def = gimple_phi_result (def_stmt);
                break;

              case GIMPLE_ASSIGN:
                def = gimple_assign_lhs (def_stmt);
                break;

              default:
                if (dump_enabled_p ())
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "unsupported defining stmt:\n");
                return -1;
            }
        }

      if (first)
        {
          /* First stmt of the group: record what the remaining stmts
             must match.  */
          oprnd_info->first_dt = dt;
          oprnd_info->first_pattern = pattern;
          oprnd_info->first_op_type = TREE_TYPE (oprnd);
        }
      else
        {
          /* Not first stmt of the group, check that the def-stmt/s match
             the def-stmt/s of the first stmt.  Allow different definition
             types for reduction chains: the first stmt must be a
             vect_reduction_def (a phi node), and the rest
             vect_internal_def.  */
          if (((oprnd_info->first_dt != dt
                && !(oprnd_info->first_dt == vect_reduction_def
                     && dt == vect_internal_def)
                && !((oprnd_info->first_dt == vect_external_def
                      || oprnd_info->first_dt == vect_constant_def)
                     && (dt == vect_external_def
                         || dt == vect_constant_def)))
               || !types_compatible_p (oprnd_info->first_op_type,
                                       TREE_TYPE (oprnd))))
            {
              /* Try swapping operands if we got a mismatch.  */
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types\n");

              return 1;
            }
        }

      /* Check the types of the definitions.  */
      switch (dt)
        {
        case vect_constant_def:
        case vect_external_def:
        case vect_reduction_def:
          break;

        case vect_internal_def:
          /* Internal defs are collected so the SLP tree can be built
             recursively for this operand.  */
          oprnd_info->def_stmts.quick_push (def_stmt);
          break;

        default:
          /* FORNOW: Not supported.  */
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: illegal type of def ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (first_op_cond)
        {
          /* The operands to swap live inside the COND_EXPR's comparison;
             the comparison code must be reversed accordingly.  */
          tree cond = gimple_assign_rhs1 (stmt);
          swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
                             &TREE_OPERAND (cond, 1));
          TREE_SET_CODE (cond, swap_tree_comparison (TREE_CODE (cond)));
        }
      else
        swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
                           gimple_assign_rhs2_ptr (stmt));
    }

  return 0;
}
448
449
450 /* Verify if the scalar stmts STMTS are isomorphic, require data
451    permutation or are of unsupported types of operation.  Return
452    true if they are, otherwise return false and indicate in *MATCHES
453    which stmts are not isomorphic to the first one.  If MATCHES[0]
454    is false then this indicates the comparison could not be
455    carried out or the stmts will never be vectorized by SLP.  */
456
457 static bool
458 vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
459                        vec<gimple> stmts, unsigned int group_size,
460                        unsigned nops, unsigned int *max_nunits,
461                        unsigned int vectorization_factor, bool *matches)
462 {
463   unsigned int i;
464   gimple stmt = stmts[0];
465   enum tree_code first_stmt_code = ERROR_MARK, rhs_code = ERROR_MARK;
466   enum tree_code first_cond_code = ERROR_MARK;
467   tree lhs;
468   bool need_same_oprnds = false;
469   tree vectype, scalar_type, first_op1 = NULL_TREE;
470   optab optab;
471   int icode;
472   machine_mode optab_op2_mode;
473   machine_mode vec_mode;
474   struct data_reference *first_dr;
475   HOST_WIDE_INT dummy;
476   gimple first_load = NULL, prev_first_load = NULL, old_first_load = NULL;
477   tree cond;
478
479   /* For every stmt in NODE find its def stmt/s.  */
480   FOR_EACH_VEC_ELT (stmts, i, stmt)
481     {
482       matches[i] = false;
483
484       if (dump_enabled_p ())
485         {
486           dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
487           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
488           dump_printf (MSG_NOTE, "\n");
489         }
490
491       /* Fail to vectorize statements marked as unvectorizable.  */
492       if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
493         {
494           if (dump_enabled_p ())
495             {
496               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
497                                "Build SLP failed: unvectorizable statement ");
498               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
499               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
500             }
501           /* Fatal mismatch.  */
502           matches[0] = false;
503           return false;
504         }
505
506       lhs = gimple_get_lhs (stmt);
507       if (lhs == NULL_TREE)
508         {
509           if (dump_enabled_p ())
510             {
511               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
512                                "Build SLP failed: not GIMPLE_ASSIGN nor "
513                                "GIMPLE_CALL ");
514               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
515               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
516             }
517           /* Fatal mismatch.  */
518           matches[0] = false;
519           return false;
520         }
521
522        if (is_gimple_assign (stmt)
523            && gimple_assign_rhs_code (stmt) == COND_EXPR
524            && (cond = gimple_assign_rhs1 (stmt))
525            && !COMPARISON_CLASS_P (cond))
526         {
527           if (dump_enabled_p ())
528             {
529               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
530                                "Build SLP failed: condition is not "
531                                "comparison ");
532               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
533               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
534             }
535           /* Fatal mismatch.  */
536           matches[0] = false;
537           return false;
538         }
539
540       scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
541       vectype = get_vectype_for_scalar_type (scalar_type);
542       if (!vectype)
543         {
544           if (dump_enabled_p ())
545             {
546               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
547                                "Build SLP failed: unsupported data-type ");
548               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
549                                  scalar_type);
550               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
551             }
552           /* Fatal mismatch.  */
553           matches[0] = false;
554           return false;
555         }
556
557       /* In case of multiple types we need to detect the smallest type.  */
558       if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
559         {
560           *max_nunits = TYPE_VECTOR_SUBPARTS (vectype);
561           if (bb_vinfo)
562             vectorization_factor = *max_nunits;
563         }
564
565       if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
566         {
567           rhs_code = CALL_EXPR;
568           if (gimple_call_internal_p (call_stmt)
569               || gimple_call_tail_p (call_stmt)
570               || gimple_call_noreturn_p (call_stmt)
571               || !gimple_call_nothrow_p (call_stmt)
572               || gimple_call_chain (call_stmt))
573             {
574               if (dump_enabled_p ())
575                 {
576                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
577                                    "Build SLP failed: unsupported call type ");
578                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
579                                     call_stmt, 0);
580                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
581                 }
582               /* Fatal mismatch.  */
583               matches[0] = false;
584               return false;
585             }
586         }
587       else
588         rhs_code = gimple_assign_rhs_code (stmt);
589
590       /* Check the operation.  */
591       if (i == 0)
592         {
593           first_stmt_code = rhs_code;
594
595           /* Shift arguments should be equal in all the packed stmts for a
596              vector shift with scalar shift operand.  */
597           if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
598               || rhs_code == LROTATE_EXPR
599               || rhs_code == RROTATE_EXPR)
600             {
601               vec_mode = TYPE_MODE (vectype);
602
603               /* First see if we have a vector/vector shift.  */
604               optab = optab_for_tree_code (rhs_code, vectype,
605                                            optab_vector);
606
607               if (!optab
608                   || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
609                 {
610                   /* No vector/vector shift, try for a vector/scalar shift.  */
611                   optab = optab_for_tree_code (rhs_code, vectype,
612                                                optab_scalar);
613
614                   if (!optab)
615                     {
616                       if (dump_enabled_p ())
617                         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
618                                          "Build SLP failed: no optab.\n");
619                       /* Fatal mismatch.  */
620                       matches[0] = false;
621                       return false;
622                     }
623                   icode = (int) optab_handler (optab, vec_mode);
624                   if (icode == CODE_FOR_nothing)
625                     {
626                       if (dump_enabled_p ())
627                         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
628                                          "Build SLP failed: "
629                                          "op not supported by target.\n");
630                       /* Fatal mismatch.  */
631                       matches[0] = false;
632                       return false;
633                     }
634                   optab_op2_mode = insn_data[icode].operand[2].mode;
635                   if (!VECTOR_MODE_P (optab_op2_mode))
636                     {
637                       need_same_oprnds = true;
638                       first_op1 = gimple_assign_rhs2 (stmt);
639                     }
640                 }
641             }
642           else if (rhs_code == WIDEN_LSHIFT_EXPR)
643             {
644               need_same_oprnds = true;
645               first_op1 = gimple_assign_rhs2 (stmt);
646             }
647         }
648       else
649         {
650           if (first_stmt_code != rhs_code
651               && (first_stmt_code != IMAGPART_EXPR
652                   || rhs_code != REALPART_EXPR)
653               && (first_stmt_code != REALPART_EXPR
654                   || rhs_code != IMAGPART_EXPR)
655               && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
656                    && (first_stmt_code == ARRAY_REF
657                        || first_stmt_code == BIT_FIELD_REF
658                        || first_stmt_code == INDIRECT_REF
659                        || first_stmt_code == COMPONENT_REF
660                        || first_stmt_code == MEM_REF)))
661             {
662               if (dump_enabled_p ())
663                 {
664                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
665                                    "Build SLP failed: different operation "
666                                    "in stmt ");
667                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
668                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
669                 }
670               /* Mismatch.  */
671               continue;
672             }
673
674           if (need_same_oprnds
675               && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
676             {
677               if (dump_enabled_p ())
678                 {
679                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
680                                    "Build SLP failed: different shift "
681                                    "arguments in ");
682                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
683                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
684                 }
685               /* Mismatch.  */
686               continue;
687             }
688
689           if (rhs_code == CALL_EXPR)
690             {
691               gimple first_stmt = stmts[0];
692               if (gimple_call_num_args (stmt) != nops
693                   || !operand_equal_p (gimple_call_fn (first_stmt),
694                                        gimple_call_fn (stmt), 0)
695                   || gimple_call_fntype (first_stmt)
696                      != gimple_call_fntype (stmt))
697                 {
698                   if (dump_enabled_p ())
699                     {
700                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
701                                        "Build SLP failed: different calls in ");
702                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
703                                         stmt, 0);
704                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
705                     }
706                   /* Mismatch.  */
707                   continue;
708                 }
709             }
710         }
711
712       /* Grouped store or load.  */
713       if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
714         {
715           if (REFERENCE_CLASS_P (lhs))
716             {
717               /* Store.  */
718               ;
719             }
720           else
721             {
722               /* Load.  */
723               unsigned unrolling_factor
724                 = least_common_multiple
725                     (*max_nunits, group_size) / group_size;
726               /* FORNOW: Check that there is no gap between the loads
727                  and no gap between the groups when we need to load
728                  multiple groups at once.
729                  ???  We should enhance this to only disallow gaps
730                  inside vectors.  */
731               if ((unrolling_factor > 1
732                    && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
733                    && GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
734                   || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt
735                       && GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
736                 {
737                   if (dump_enabled_p ())
738                     {
739                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
740                                        "Build SLP failed: grouped "
741                                        "loads have gaps ");
742                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
743                                         stmt, 0);
744                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
745                     }
746                   /* Fatal mismatch.  */
747                   matches[0] = false;
748                   return false;
749                 }
750
751               /* Check that the size of interleaved loads group is not
752                  greater than the SLP group size.  */
753               unsigned ncopies
754                 = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
755               if (loop_vinfo
756                   && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
757                   && ((GROUP_SIZE (vinfo_for_stmt (stmt))
758                        - GROUP_GAP (vinfo_for_stmt (stmt)))
759                       > ncopies * group_size))
760                 {
761                   if (dump_enabled_p ())
762                     {
763                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
764                                        "Build SLP failed: the number "
765                                        "of interleaved loads is greater than "
766                                        "the SLP group size ");
767                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
768                                         stmt, 0);
769                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
770                     }
771                   /* Fatal mismatch.  */
772                   matches[0] = false;
773                   return false;
774                 }
775
776               old_first_load = first_load;
777               first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
778               if (prev_first_load)
779                 {
780                   /* Check that there are no loads from different interleaving
781                      chains in the same node.  */
782                   if (prev_first_load != first_load)
783                     {
784                       if (dump_enabled_p ())
785                         {
786                           dump_printf_loc (MSG_MISSED_OPTIMIZATION,
787                                            vect_location, 
788                                            "Build SLP failed: different "
789                                            "interleaving chains in one node ");
790                           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
791                                             stmt, 0);
792                           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
793                         }
794                       /* Mismatch.  */
795                       continue;
796                     }
797                 }
798               else
799                 prev_first_load = first_load;
800
801               /* In some cases a group of loads is just the same load
802                  repeated N times.  Only analyze its cost once.  */
803               if (first_load == stmt && old_first_load != first_load)
804                 {
805                   first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
806                   if (vect_supportable_dr_alignment (first_dr, false)
807                       == dr_unaligned_unsupported)
808                     {
809                       if (dump_enabled_p ())
810                         {
811                           dump_printf_loc (MSG_MISSED_OPTIMIZATION,
812                                            vect_location, 
813                                            "Build SLP failed: unsupported "
814                                            "unaligned load ");
815                           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
816                                             stmt, 0);
817                           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
818                         }
819                       /* Fatal mismatch.  */
820                       matches[0] = false;
821                       return false;
822                     }
823                 }
824            }
825         } /* Grouped access.  */
826       else
827         {
828           if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
829             {
830               /* Not grouped load.  */
831               if (dump_enabled_p ())
832                 {
833                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
834                                    "Build SLP failed: not grouped load ");
835                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
836                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
837                 }
838
839               /* FORNOW: Not grouped loads are not supported.  */
840               /* Fatal mismatch.  */
841               matches[0] = false;
842               return false;
843             }
844
845           /* Not memory operation.  */
846           if (TREE_CODE_CLASS (rhs_code) != tcc_binary
847               && TREE_CODE_CLASS (rhs_code) != tcc_unary
848               && rhs_code != COND_EXPR
849               && rhs_code != CALL_EXPR)
850             {
851               if (dump_enabled_p ())
852                 {
853                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
854                                    "Build SLP failed: operation");
855                   dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
856                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
857                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
858                 }
859               /* Fatal mismatch.  */
860               matches[0] = false;
861               return false;
862             }
863
864           if (rhs_code == COND_EXPR)
865             {
866               tree cond_expr = gimple_assign_rhs1 (stmt);
867
868               if (i == 0)
869                 first_cond_code = TREE_CODE (cond_expr);
870               else if (first_cond_code != TREE_CODE (cond_expr))
871                 {
872                   if (dump_enabled_p ())
873                     {
874                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
875                                        "Build SLP failed: different"
876                                        " operation");
877                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
878                                         stmt, 0);
879                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
880                     }
881                   /* Mismatch.  */
882                   continue;
883                 }
884             }
885         }
886
887       matches[i] = true;
888     }
889
890   for (i = 0; i < group_size; ++i)
891     if (!matches[i])
892       return false;
893
894   return true;
895 }
896
897 /* Recursively build an SLP tree starting from NODE.
898    Fail (and return a value not equal to zero) if def-stmts are not
899    isomorphic, require data permutation or are of unsupported types of
900    operation.  Otherwise, return 0.
901    The value returned is the depth in the SLP tree where a mismatch
902    was found.  */
903
904 static bool
905 vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
906                      slp_tree *node, unsigned int group_size,
907                      unsigned int *max_nunits,
908                      vec<slp_tree> *loads,
909                      unsigned int vectorization_factor,
910                      bool *matches, unsigned *npermutes, unsigned *tree_size,
911                      unsigned max_tree_size)
912 {
913   unsigned nops, i, this_npermutes = 0, this_tree_size = 0;
914   gimple stmt;
915
916   if (!matches)
917     matches = XALLOCAVEC (bool, group_size);
918   if (!npermutes)
919     npermutes = &this_npermutes;
920
921   matches[0] = false;
922
923   stmt = SLP_TREE_SCALAR_STMTS (*node)[0];
924   if (is_gimple_call (stmt))
925     nops = gimple_call_num_args (stmt);
926   else if (is_gimple_assign (stmt))
927     {
928       nops = gimple_num_ops (stmt) - 1;
929       if (gimple_assign_rhs_code (stmt) == COND_EXPR)
930         nops++;
931     }
932   else
933     return false;
934
935   if (!vect_build_slp_tree_1 (loop_vinfo, bb_vinfo,
936                               SLP_TREE_SCALAR_STMTS (*node), group_size, nops,
937                               max_nunits, vectorization_factor, matches))
938     return false;
939
940   /* If the SLP node is a load, terminate the recursion.  */
941   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
942       && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
943     {
944       loads->safe_push (*node);
945       return true;
946     }
947
948   /* Get at the operands, verifying they are compatible.  */
949   vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
950   slp_oprnd_info oprnd_info;
951   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (*node), i, stmt)
952     {
953       switch (vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo,
954                                            stmt, (i == 0), &oprnds_info))
955         {
956         case 0:
957           break;
958         case -1:
959           matches[0] = false;
960           vect_free_oprnd_info (oprnds_info);
961           return false;
962         case 1:
963           matches[i] = false;
964           break;
965         }
966     }
967   for (i = 0; i < group_size; ++i)
968     if (!matches[i])
969       {
970         vect_free_oprnd_info (oprnds_info);
971         return false;
972       }
973
974   stmt = SLP_TREE_SCALAR_STMTS (*node)[0];
975
976   /* Create SLP_TREE nodes for the definition node/s.  */
977   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
978     {
979       slp_tree child;
980       unsigned old_nloads = loads->length ();
981       unsigned old_max_nunits = *max_nunits;
982
983       if (oprnd_info->first_dt != vect_internal_def)
984         continue;
985
986       if (++this_tree_size > max_tree_size)
987         {
988           vect_free_oprnd_info (oprnds_info);
989           return false;
990         }
991
992       child = vect_create_new_slp_node (oprnd_info->def_stmts);
993       if (!child)
994         {
995           vect_free_oprnd_info (oprnds_info);
996           return false;
997         }
998
999       bool *matches = XALLOCAVEC (bool, group_size);
1000       if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
1001                                group_size, max_nunits, loads,
1002                                vectorization_factor, matches,
1003                                npermutes, &this_tree_size, max_tree_size))
1004         {
1005           oprnd_info->def_stmts = vNULL;
1006           SLP_TREE_CHILDREN (*node).quick_push (child);
1007           continue;
1008         }
1009
1010       /* If the SLP build for operand zero failed and operand zero
1011          and one can be commutated try that for the scalar stmts
1012          that failed the match.  */
1013       if (i == 0
1014           /* A first scalar stmt mismatch signals a fatal mismatch.  */
1015           && matches[0]
1016           /* ???  For COND_EXPRs we can swap the comparison operands
1017              as well as the arms under some constraints.  */
1018           && nops == 2
1019           && oprnds_info[1]->first_dt == vect_internal_def
1020           && is_gimple_assign (stmt)
1021           && commutative_tree_code (gimple_assign_rhs_code (stmt))
1022           /* Do so only if the number of not successful permutes was nor more
1023              than a cut-ff as re-trying the recursive match on
1024              possibly each level of the tree would expose exponential
1025              behavior.  */
1026           && *npermutes < 4)
1027         {
1028           /* Roll back.  */
1029           *max_nunits = old_max_nunits;
1030           loads->truncate (old_nloads);
1031           /* Swap mismatched definition stmts.  */
1032           dump_printf_loc (MSG_NOTE, vect_location,
1033                            "Re-trying with swapped operands of stmts ");
1034           for (unsigned j = 0; j < group_size; ++j)
1035             if (!matches[j])
1036               {
1037                 gimple tem = oprnds_info[0]->def_stmts[j];
1038                 oprnds_info[0]->def_stmts[j] = oprnds_info[1]->def_stmts[j];
1039                 oprnds_info[1]->def_stmts[j] = tem;
1040                 dump_printf (MSG_NOTE, "%d ", j);
1041               }
1042           dump_printf (MSG_NOTE, "\n");
1043           /* And try again ... */
1044           if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
1045                                    group_size, max_nunits, loads,
1046                                    vectorization_factor,
1047                                    matches, npermutes, &this_tree_size,
1048                                    max_tree_size))
1049             {
1050               oprnd_info->def_stmts = vNULL;
1051               SLP_TREE_CHILDREN (*node).quick_push (child);
1052               continue;
1053             }
1054
1055           ++*npermutes;
1056         }
1057
1058       oprnd_info->def_stmts = vNULL;
1059       vect_free_slp_tree (child);
1060       vect_free_oprnd_info (oprnds_info);
1061       return false;
1062     }
1063
1064   if (tree_size)
1065     *tree_size += this_tree_size;
1066
1067   vect_free_oprnd_info (oprnds_info);
1068   return true;
1069 }
1070
1071 /* Dump a slp tree NODE using flags specified in DUMP_KIND.  */
1072
1073 static void
1074 vect_print_slp_tree (int dump_kind, slp_tree node)
1075 {
1076   int i;
1077   gimple stmt;
1078   slp_tree child;
1079
1080   if (!node)
1081     return;
1082
1083   dump_printf (dump_kind, "node ");
1084   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1085     {
1086       dump_printf (dump_kind, "\n\tstmt %d ", i);
1087       dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
1088     }
1089   dump_printf (dump_kind, "\n");
1090
1091   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1092     vect_print_slp_tree (dump_kind, child);
1093 }
1094
1095
1096 /* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
1097    If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
1098    J).  Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
1099    stmts in NODE are to be marked.  */
1100
1101 static void
1102 vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
1103 {
1104   int i;
1105   gimple stmt;
1106   slp_tree child;
1107
1108   if (!node)
1109     return;
1110
1111   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1112     if (j < 0 || i == j)
1113       STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;
1114
1115   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1116     vect_mark_slp_stmts (child, mark, j);
1117 }
1118
1119
1120 /* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */
1121
1122 static void
1123 vect_mark_slp_stmts_relevant (slp_tree node)
1124 {
1125   int i;
1126   gimple stmt;
1127   stmt_vec_info stmt_info;
1128   slp_tree child;
1129
1130   if (!node)
1131     return;
1132
1133   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1134     {
1135       stmt_info = vinfo_for_stmt (stmt);
1136       gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
1137                   || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
1138       STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
1139     }
1140
1141   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1142     vect_mark_slp_stmts_relevant (child);
1143 }
1144
1145
1146 /* Rearrange the statements of NODE according to PERMUTATION.  */
1147
1148 static void
1149 vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
1150                           vec<unsigned> permutation)
1151 {
1152   gimple stmt;
1153   vec<gimple> tmp_stmts;
1154   unsigned int i;
1155   slp_tree child;
1156
1157   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1158     vect_slp_rearrange_stmts (child, group_size, permutation);
1159
1160   gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
1161   tmp_stmts.create (group_size);
1162   tmp_stmts.quick_grow_cleared (group_size);
1163
1164   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1165     tmp_stmts[permutation[i]] = stmt;
1166
1167   SLP_TREE_SCALAR_STMTS (node).release ();
1168   SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
1169 }
1170
1171
/* Check if the required load permutations in the SLP instance
   SLP_INSTN are supported.  Returns true if they are, releasing the
   per-node load permutations that turn out to need no explicit
   permutation code.  */

static bool
vect_supported_load_permutation_p (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j, k, next;
  sbitmap load_index;
  slp_tree node;
  gimple stmt, load, next_load, first_load;
  struct data_reference *dr;

  /* Dump each load node's permutation; nodes without an explicit
     permutation are shown as the identity 0..group_size-1.  */
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        if (node->load_permutation.exists ())
          FOR_EACH_VEC_ELT (node->load_permutation, j, next)
            dump_printf (MSG_NOTE, "%d ", next);
        else
          for (k = 0; k < group_size; ++k)
            dump_printf (MSG_NOTE, "%d ", k);
      dump_printf (MSG_NOTE, "\n");
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this
     permutation).  */

  /* Check that all the load nodes are of the same size.  */
  /* ???  Can't we assert this? */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
      return false;

  node = SLP_INSTANCE_TREE (slp_instn);
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];

  /* Reduction (there are no data-refs in the root).
     In reduction chain the order of the loads is important.  */
  if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
      && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      slp_tree load;
      unsigned int lidx;

      /* Compare all the permutation sequences to the first one.  We know
         that at least one load is permuted.  */
      node = SLP_INSTANCE_LOADS (slp_instn)[0];
      if (!node->load_permutation.exists ())
        return false;
      for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
        {
          if (!load->load_permutation.exists ())
            return false;
          FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
            if (lidx != node->load_permutation[j])
              return false;
        }

      /* Check that the loads in the first sequence are different and there
         are no gaps between them.  Use a bitmap of seen indices: a repeat
         or an unset bit after the scan means the permutation is not a
         bijection of 0..group_size-1.  */
      load_index = sbitmap_alloc (group_size);
      bitmap_clear (load_index);
      FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
        {
          if (bitmap_bit_p (load_index, lidx))
            {
              sbitmap_free (load_index);
              return false;
            }
          bitmap_set_bit (load_index, lidx);
        }
      for (i = 0; i < group_size; i++)
        if (!bitmap_bit_p (load_index, i))
          {
            sbitmap_free (load_index);
            return false;
          }
      sbitmap_free (load_index);

      /* This permutation is valid for reduction.  Since the order of the
         statements in the nodes is not important unless they are memory
         accesses, we can rearrange the statements in all the nodes
         according to the order of the loads.  */
      vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
                                node->load_permutation);

      /* We are done, no actual permutations need to be generated.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        SLP_TREE_LOAD_PERMUTATION (node).release ();
      return true;
    }

  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment complications.  */
  if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
    {
      /* Check that for every node in the instance the loads
         form a subchain, i.e. follow each other in the interleaving
         chain without gaps.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          next_load = NULL;
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
            {
              if (j != 0 && next_load != load)
                return false;
              next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
            }
        }

      /* Check that the alignment of the first load in every subchain, i.e.,
         the first statement in every load node, is supported.
         ???  This belongs in alignment checking.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          first_load = SLP_TREE_SCALAR_STMTS (node)[0];
          if (first_load != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load)))
            {
              dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load));
              if (vect_supportable_dr_alignment (dr, false)
                  == dr_unaligned_unsupported)
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                       vect_location,
                                       "unsupported unaligned load ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        first_load, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  return false;
                }
            }
        }

      /* We are done, no actual permutations need to be generated.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        SLP_TREE_LOAD_PERMUTATION (node).release ();
      return true;
    }

  /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
     GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
     well (unless it's reduction).  */
  if (SLP_INSTANCE_LOADS (slp_instn).length () != group_size)
    return false;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (!node->load_permutation.exists ())
      return false;

  /* Verify each node loads a single distinct index GROUP_SIZE times and
     that all indices together cover 0..group_size-1 exactly.  */
  load_index = sbitmap_alloc (group_size);
  bitmap_clear (load_index);
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      unsigned int lidx = node->load_permutation[0];
      if (bitmap_bit_p (load_index, lidx))
        {
          sbitmap_free (load_index);
          return false;
        }
      bitmap_set_bit (load_index, lidx);
      FOR_EACH_VEC_ELT (node->load_permutation, j, k)
        if (k != lidx)
          {
            sbitmap_free (load_index);
            return false;
          }
    }
  for (i = 0; i < group_size; i++)
    if (!bitmap_bit_p (load_index, i))
      {
        sbitmap_free (load_index);
        return false;
      }
  sbitmap_free (load_index);

  /* Finally do a dry-run of the permutation code generation to make sure
     the target can actually perform the permutes.  */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (node->load_permutation.exists ()
        && !vect_transform_slp_perm_load
              (node, vNULL, NULL,
               SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true))
      return false;
  return true;
}
1363
1364
1365 /* Find the first load in the loop that belongs to INSTANCE.
1366    When loads are in several SLP nodes, there can be a case in which the first
1367    load does not appear in the first SLP node to be transformed, causing
1368    incorrect order of statements.  Since we generate all the loads together,
1369    they must be inserted before the first load of the SLP instance and not
1370    before the first load of the first node of the instance.  */
1371
1372 static gimple
1373 vect_find_first_load_in_slp_instance (slp_instance instance)
1374 {
1375   int i, j;
1376   slp_tree load_node;
1377   gimple first_load = NULL, load;
1378
1379   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
1380     FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
1381       first_load = get_earlier_stmt (load, first_load);
1382
1383   return first_load;
1384 }
1385
1386
1387 /* Find the last store in SLP INSTANCE.  */
1388
1389 static gimple
1390 vect_find_last_store_in_slp_instance (slp_instance instance)
1391 {
1392   int i;
1393   slp_tree node;
1394   gimple last_store = NULL, store;
1395
1396   node = SLP_INSTANCE_TREE (instance);
1397   for (i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &store); i++)
1398     last_store = get_later_stmt (store, last_store);
1399
1400   return last_store;
1401 }
1402
1403 /* Compute the cost for the SLP node NODE in the SLP instance INSTANCE.  */
1404
1405 static void
1406 vect_analyze_slp_cost_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
1407                          slp_instance instance, slp_tree node,
1408                          stmt_vector_for_cost *prologue_cost_vec,
1409                          unsigned ncopies_for_cost)
1410 {
1411   stmt_vector_for_cost *body_cost_vec = &SLP_INSTANCE_BODY_COST_VEC (instance);
1412
1413   unsigned i;
1414   slp_tree child;
1415   gimple stmt, s;
1416   stmt_vec_info stmt_info;
1417   tree lhs;
1418   unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
1419
1420   /* Recurse down the SLP tree.  */
1421   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1422     vect_analyze_slp_cost_1 (loop_vinfo, bb_vinfo,
1423                              instance, child, prologue_cost_vec,
1424                              ncopies_for_cost);
1425
1426   /* Look at the first scalar stmt to determine the cost.  */
1427   stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1428   stmt_info = vinfo_for_stmt (stmt);
1429   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1430     {
1431       if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
1432         vect_model_store_cost (stmt_info, ncopies_for_cost, false,
1433                                vect_uninitialized_def,
1434                                node, prologue_cost_vec, body_cost_vec);
1435       else
1436         {
1437           int i;
1438           gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
1439           vect_model_load_cost (stmt_info, ncopies_for_cost, false,
1440                                 node, prologue_cost_vec, body_cost_vec);
1441           /* If the load is permuted record the cost for the permutation.
1442              ???  Loads from multiple chains are let through here only
1443              for a single special case involving complex numbers where
1444              in the end no permutation is necessary.  */
1445           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, s)
1446             if ((STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo_for_stmt (s))
1447                  == STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info))
1448                 && vect_get_place_in_interleaving_chain
1449                      (s, STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info)) != i)
1450               {
1451                 record_stmt_cost (body_cost_vec, group_size, vec_perm,
1452                                   stmt_info, 0, vect_body);
1453                 break;
1454               }
1455         }
1456     }
1457   else
1458     record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
1459                       stmt_info, 0, vect_body);
1460
1461   /* Scan operands and account for prologue cost of constants/externals.
1462      ???  This over-estimates cost for multiple uses and should be
1463      re-engineered.  */
1464   lhs = gimple_get_lhs (stmt);
1465   for (i = 0; i < gimple_num_ops (stmt); ++i)
1466     {
1467       tree def, op = gimple_op (stmt, i);
1468       gimple def_stmt;
1469       enum vect_def_type dt;
1470       if (!op || op == lhs)
1471         continue;
1472       if (vect_is_simple_use (op, NULL, loop_vinfo, bb_vinfo,
1473                               &def_stmt, &def, &dt)
1474           && (dt == vect_constant_def || dt == vect_external_def))
1475         record_stmt_cost (prologue_cost_vec, 1, vector_stmt,
1476                           stmt_info, 0, vect_prologue);
1477     }
1478 }
1479
1480 /* Compute the cost for the SLP instance INSTANCE.  */
1481
1482 static void
1483 vect_analyze_slp_cost (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
1484                        slp_instance instance, unsigned nunits)
1485 {
1486   stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
1487   unsigned ncopies_for_cost;
1488   stmt_info_for_cost *si;
1489   unsigned i;
1490
1491   /* Calculate the number of vector stmts to create based on the unrolling
1492      factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
1493      GROUP_SIZE / NUNITS otherwise.  */
1494   unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
1495   ncopies_for_cost = least_common_multiple (nunits, group_size) / nunits;
1496
1497   prologue_cost_vec.create (10);
1498   body_cost_vec.create (10);
1499   SLP_INSTANCE_BODY_COST_VEC (instance) = body_cost_vec;
1500   vect_analyze_slp_cost_1 (loop_vinfo, bb_vinfo,
1501                            instance, SLP_INSTANCE_TREE (instance),
1502                            &prologue_cost_vec, ncopies_for_cost);
1503
1504   /* Record the prologue costs, which were delayed until we were
1505      sure that SLP was successful.  Unlike the body costs, we know
1506      the final values now regardless of the loop vectorization factor.  */
1507   void *data = (loop_vinfo ? LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
1508                 : BB_VINFO_TARGET_COST_DATA (bb_vinfo));
1509   FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
1510     {
1511       struct _stmt_vec_info *stmt_info
1512         = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1513       (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1514                             si->misalign, vect_prologue);
1515     }
1516
1517   prologue_cost_vec.release ();
1518 }
1519
1520 /* Analyze an SLP instance starting from a group of grouped stores.  Call
1521    vect_build_slp_tree to build a tree of packed stmts if possible.
1522    Return FALSE if it's impossible to SLP any stmt in the loop.  */
1523
1524 static bool
1525 vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
1526                            gimple stmt, unsigned max_tree_size)
1527 {
1528   slp_instance new_instance;
1529   slp_tree node;
1530   unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
1531   unsigned int unrolling_factor = 1, nunits;
1532   tree vectype, scalar_type = NULL_TREE;
1533   gimple next;
1534   unsigned int vectorization_factor = 0;
1535   int i;
1536   unsigned int max_nunits = 0;
1537   vec<slp_tree> loads;
1538   struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
1539   vec<gimple> scalar_stmts;
1540
1541   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1542     {
1543       if (dr)
1544         {
1545           scalar_type = TREE_TYPE (DR_REF (dr));
1546           vectype = get_vectype_for_scalar_type (scalar_type);
1547         }
1548       else
1549         {
1550           gcc_assert (loop_vinfo);
1551           vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1552         }
1553
1554       group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
1555     }
1556   else
1557     {
1558       gcc_assert (loop_vinfo);
1559       vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1560       group_size = LOOP_VINFO_REDUCTIONS (loop_vinfo).length ();
1561     }
1562
1563   if (!vectype)
1564     {
1565       if (dump_enabled_p ())
1566         {
1567           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1568                            "Build SLP failed: unsupported data-type ");
1569           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
1570           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1571         }
1572
1573       return false;
1574     }
1575
1576   nunits = TYPE_VECTOR_SUBPARTS (vectype);
1577   if (loop_vinfo)
1578     vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1579   else
1580     vectorization_factor = nunits;
1581
1582   /* Calculate the unrolling factor.  */
1583   unrolling_factor = least_common_multiple (nunits, group_size) / group_size;
1584   if (unrolling_factor != 1 && !loop_vinfo)
1585     {
1586       if (dump_enabled_p ())
1587         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1588                          "Build SLP failed: unrolling required in basic"
1589                          " block SLP\n");
1590
1591       return false;
1592     }
1593
1594   /* Create a node (a root of the SLP tree) for the packed grouped stores.  */
1595   scalar_stmts.create (group_size);
1596   next = stmt;
1597   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1598     {
1599       /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
1600       while (next)
1601         {
1602           if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
1603               && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
1604             scalar_stmts.safe_push (
1605                   STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
1606           else
1607             scalar_stmts.safe_push (next);
1608           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
1609         }
1610     }
1611   else
1612     {
1613       /* Collect reduction statements.  */
1614       vec<gimple> reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
1615       for (i = 0; reductions.iterate (i, &next); i++)
1616         scalar_stmts.safe_push (next);
1617     }
1618
1619   node = vect_create_new_slp_node (scalar_stmts);
1620
1621   loads.create (group_size);
1622
1623   /* Build the tree for the SLP instance.  */
1624   if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &node, group_size,
1625                            &max_nunits, &loads,
1626                            vectorization_factor, NULL, NULL, NULL,
1627                            max_tree_size))
1628     {
1629       /* Calculate the unrolling factor based on the smallest type.  */
1630       if (max_nunits > nunits)
1631         unrolling_factor = least_common_multiple (max_nunits, group_size)
1632                            / group_size;
1633
1634       if (unrolling_factor != 1 && !loop_vinfo)
1635         {
1636           if (dump_enabled_p ())
1637             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1638                              "Build SLP failed: unrolling required in basic"
1639                              " block SLP\n");
1640           vect_free_slp_tree (node);
1641           loads.release ();
1642           return false;
1643         }
1644
1645       /* Create a new SLP instance.  */
1646       new_instance = XNEW (struct _slp_instance);
1647       SLP_INSTANCE_TREE (new_instance) = node;
1648       SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
1649       SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
1650       SLP_INSTANCE_BODY_COST_VEC (new_instance) = vNULL;
1651       SLP_INSTANCE_LOADS (new_instance) = loads;
1652       SLP_INSTANCE_FIRST_LOAD_STMT (new_instance) = NULL;
1653
1654       /* Compute the load permutation.  */
1655       slp_tree load_node;
1656       bool loads_permuted = false;
1657       FOR_EACH_VEC_ELT (loads, i, load_node)
1658         {
1659           vec<unsigned> load_permutation;
1660           int j;
1661           gimple load, first_stmt;
1662           bool this_load_permuted = false;
1663           load_permutation.create (group_size);
1664           first_stmt = GROUP_FIRST_ELEMENT
1665               (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
1666           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
1667             {
1668               int load_place
1669                 = vect_get_place_in_interleaving_chain (load, first_stmt);
1670               gcc_assert (load_place != -1);
1671               if (load_place != j)
1672                 this_load_permuted = true;
1673               load_permutation.safe_push (load_place);
1674             }
1675           if (!this_load_permuted)
1676             {
1677               load_permutation.release ();
1678               continue;
1679             }
1680           SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
1681           loads_permuted = true;
1682         }
1683
1684       if (loads_permuted)
1685         {
1686           if (!vect_supported_load_permutation_p (new_instance))
1687             {
1688               if (dump_enabled_p ())
1689                 {
1690                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1691                                    "Build SLP failed: unsupported load "
1692                                    "permutation ");
1693                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
1694                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1695                 }
1696               vect_free_slp_instance (new_instance);
1697               return false;
1698             }
1699
1700           SLP_INSTANCE_FIRST_LOAD_STMT (new_instance)
1701             = vect_find_first_load_in_slp_instance (new_instance);
1702         }
1703
1704       /* Compute the costs of this SLP instance.  */
1705       vect_analyze_slp_cost (loop_vinfo, bb_vinfo,
1706                              new_instance, TYPE_VECTOR_SUBPARTS (vectype));
1707
1708       if (loop_vinfo)
1709         LOOP_VINFO_SLP_INSTANCES (loop_vinfo).safe_push (new_instance);
1710       else
1711         BB_VINFO_SLP_INSTANCES (bb_vinfo).safe_push (new_instance);
1712
1713       if (dump_enabled_p ())
1714         vect_print_slp_tree (MSG_NOTE, node);
1715
1716       return true;
1717     }
1718
1719   /* Failed to SLP.  */
1720   /* Free the allocated memory.  */
1721   vect_free_slp_tree (node);
1722   loads.release ();
1723
1724   return false;
1725 }
1726
1727
/* Check if there are stmts in the loop that can be vectorized using SLP.
   Build SLP trees of packed scalar stmts if SLP is possible.  */
1730
1731 bool
1732 vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
1733                   unsigned max_tree_size)
1734 {
1735   unsigned int i;
1736   vec<gimple> grouped_stores;
1737   vec<gimple> reductions = vNULL;
1738   vec<gimple> reduc_chains = vNULL;
1739   gimple first_element;
1740   bool ok = false;
1741
1742   if (dump_enabled_p ())
1743     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===\n");
1744
1745   if (loop_vinfo)
1746     {
1747       grouped_stores = LOOP_VINFO_GROUPED_STORES (loop_vinfo);
1748       reduc_chains = LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo);
1749       reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
1750     }
1751   else
1752     grouped_stores = BB_VINFO_GROUPED_STORES (bb_vinfo);
1753
1754   /* Find SLP sequences starting from groups of grouped stores.  */
1755   FOR_EACH_VEC_ELT (grouped_stores, i, first_element)
1756     if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
1757                                    max_tree_size))
1758       ok = true;
1759
1760   if (bb_vinfo && !ok)
1761     {
1762       if (dump_enabled_p ())
1763         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1764                          "Failed to SLP the basic block.\n");
1765
1766       return false;
1767     }
1768
1769   if (loop_vinfo
1770       && LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).length () > 0)
1771     {
1772       /* Find SLP sequences starting from reduction chains.  */
1773       FOR_EACH_VEC_ELT (reduc_chains, i, first_element)
1774         if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
1775                                        max_tree_size))
1776           ok = true;
1777         else
1778           return false;
1779
1780       /* Don't try to vectorize SLP reductions if reduction chain was
1781          detected.  */
1782       return ok;
1783     }
1784
1785   /* Find SLP sequences starting from groups of reductions.  */
1786   if (loop_vinfo && LOOP_VINFO_REDUCTIONS (loop_vinfo).length () > 1
1787       && vect_analyze_slp_instance (loop_vinfo, bb_vinfo, reductions[0],
1788                                     max_tree_size))
1789     ok = true;
1790
1791   return true;
1792 }
1793
1794
1795 /* For each possible SLP instance decide whether to SLP it and calculate overall
1796    unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
1797    least one instance.  */
1798
1799 bool
1800 vect_make_slp_decision (loop_vec_info loop_vinfo)
1801 {
1802   unsigned int i, unrolling_factor = 1;
1803   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1804   slp_instance instance;
1805   int decided_to_slp = 0;
1806
1807   if (dump_enabled_p ())
1808     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ==="
1809                      "\n");
1810
1811   FOR_EACH_VEC_ELT (slp_instances, i, instance)
1812     {
1813       /* FORNOW: SLP if you can.  */
1814       if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance))
1815         unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance);
1816
1817       /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
1818          call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
1819          loop-based vectorization.  Such stmts will be marked as HYBRID.  */
1820       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
1821       decided_to_slp++;
1822     }
1823
1824   LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
1825
1826   if (decided_to_slp && dump_enabled_p ())
1827     dump_printf_loc (MSG_NOTE, vect_location,
1828                      "Decided to SLP %d instances. Unrolling factor %d\n",
1829                      decided_to_slp, unrolling_factor);
1830
1831   return (decided_to_slp > 0);
1832 }
1833
1834
1835 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
1836    can't be SLPed) in the tree rooted at NODE.  Mark such stmts as HYBRID.  */
1837
1838 static void
1839 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
1840 {
1841   gimple stmt = SLP_TREE_SCALAR_STMTS (node)[i];
1842   imm_use_iterator imm_iter;
1843   gimple use_stmt;
1844   stmt_vec_info use_vinfo, stmt_vinfo = vinfo_for_stmt (stmt);
1845   slp_tree child;
1846   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1847   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1848   int j;
1849
1850   /* Propagate hybrid down the SLP tree.  */
1851   if (stype == hybrid)
1852     ;
1853   else if (HYBRID_SLP_STMT (stmt_vinfo))
1854     stype = hybrid;
1855   else
1856     {
1857       /* Check if a pure SLP stmt has uses in non-SLP stmts.  */
1858       gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
1859       if (TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME)
1860         FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0))
1861           if (gimple_bb (use_stmt)
1862               && flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1863               && (use_vinfo = vinfo_for_stmt (use_stmt))
1864               && !STMT_SLP_TYPE (use_vinfo)
1865               && (STMT_VINFO_RELEVANT (use_vinfo)
1866                   || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo))
1867                   || (STMT_VINFO_IN_PATTERN_P (use_vinfo)
1868                       && STMT_VINFO_RELATED_STMT (use_vinfo)
1869                       && !STMT_SLP_TYPE (vinfo_for_stmt
1870                             (STMT_VINFO_RELATED_STMT (use_vinfo)))))
1871               && !(gimple_code (use_stmt) == GIMPLE_PHI
1872                    && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
1873             stype = hybrid;
1874     }
1875
1876   if (stype == hybrid)
1877     STMT_SLP_TYPE (stmt_vinfo) = hybrid;
1878
1879   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
1880     vect_detect_hybrid_slp_stmts (child, i, stype);
1881 }
1882
1883 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses.  */
1884
1885 static tree
1886 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
1887 {
1888   walk_stmt_info *wi = (walk_stmt_info *)data;
1889   struct loop *loopp = (struct loop *)wi->info;
1890
1891   if (wi->is_lhs)
1892     return NULL_TREE;
1893
1894   if (TREE_CODE (*tp) == SSA_NAME
1895       && !SSA_NAME_IS_DEFAULT_DEF (*tp))
1896     {
1897       gimple def_stmt = SSA_NAME_DEF_STMT (*tp);
1898       if (flow_bb_inside_loop_p (loopp, gimple_bb (def_stmt))
1899           && PURE_SLP_STMT (vinfo_for_stmt (def_stmt)))
1900         STMT_SLP_TYPE (vinfo_for_stmt (def_stmt)) = hybrid;
1901     }
1902
1903   return NULL_TREE;
1904 }
1905
1906 static tree
1907 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
1908                           walk_stmt_info *)
1909 {
1910   /* If the stmt is in a SLP instance then this isn't a reason
1911      to mark use definitions in other SLP instances as hybrid.  */
1912   if (STMT_SLP_TYPE (vinfo_for_stmt (gsi_stmt (*gsi))) != loop_vect)
1913     *handled = true;
1914   return NULL_TREE;
1915 }
1916
1917 /* Find stmts that must be both vectorized and SLPed.  */
1918
1919 void
1920 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
1921 {
1922   unsigned int i;
1923   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1924   slp_instance instance;
1925
1926   if (dump_enabled_p ())
1927     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ==="
1928                      "\n");
1929
1930   /* First walk all pattern stmt in the loop and mark defs of uses as
1931      hybrid because immediate uses in them are not recorded.  */
1932   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
1933     {
1934       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
1935       for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
1936            gsi_next (&gsi))
1937         {
1938           gimple stmt = gsi_stmt (gsi);
1939           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1940           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
1941             {
1942               walk_stmt_info wi;
1943               memset (&wi, 0, sizeof (wi));
1944               wi.info = LOOP_VINFO_LOOP (loop_vinfo);
1945               gimple_stmt_iterator gsi2
1946                 = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
1947               walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
1948                                 vect_detect_hybrid_slp_1, &wi);
1949               walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
1950                                vect_detect_hybrid_slp_2,
1951                                vect_detect_hybrid_slp_1, &wi);
1952             }
1953         }
1954     }
1955
1956   /* Then walk the SLP instance trees marking stmts with uses in
1957      non-SLP stmts as hybrid, also propagating hybrid down the
1958      SLP tree, collecting the above info on-the-fly.  */
1959   FOR_EACH_VEC_ELT (slp_instances, i, instance)
1960     {
1961       for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
1962         vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
1963                                       i, pure_slp);
1964     }
1965 }
1966
1967
1968 /* Create and initialize a new bb_vec_info struct for BB, as well as
1969    stmt_vec_info structs for all the stmts in it.  */
1970
1971 static bb_vec_info
1972 new_bb_vec_info (basic_block bb)
1973 {
1974   bb_vec_info res = NULL;
1975   gimple_stmt_iterator gsi;
1976
1977   res = (bb_vec_info) xcalloc (1, sizeof (struct _bb_vec_info));
1978   BB_VINFO_BB (res) = bb;
1979
1980   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1981     {
1982       gimple stmt = gsi_stmt (gsi);
1983       gimple_set_uid (stmt, 0);
1984       set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, NULL, res));
1985     }
1986
1987   BB_VINFO_GROUPED_STORES (res).create (10);
1988   BB_VINFO_SLP_INSTANCES (res).create (2);
1989   BB_VINFO_TARGET_COST_DATA (res) = init_cost (NULL);
1990
1991   bb->aux = res;
1992   return res;
1993 }
1994
1995
1996 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
1997    stmts in the basic block.  */
1998
1999 static void
2000 destroy_bb_vec_info (bb_vec_info bb_vinfo)
2001 {
2002   vec<slp_instance> slp_instances;
2003   slp_instance instance;
2004   basic_block bb;
2005   gimple_stmt_iterator si;
2006   unsigned i;
2007
2008   if (!bb_vinfo)
2009     return;
2010
2011   bb = BB_VINFO_BB (bb_vinfo);
2012
2013   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2014     {
2015       gimple stmt = gsi_stmt (si);
2016       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2017
2018       if (stmt_info)
2019         /* Free stmt_vec_info.  */
2020         free_stmt_vec_info (stmt);
2021     }
2022
2023   vect_destroy_datarefs (NULL, bb_vinfo);
2024   free_dependence_relations (BB_VINFO_DDRS (bb_vinfo));
2025   BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
2026   slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2027   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2028     vect_free_slp_instance (instance);
2029   BB_VINFO_SLP_INSTANCES (bb_vinfo).release ();
2030   destroy_cost_data (BB_VINFO_TARGET_COST_DATA (bb_vinfo));
2031   free (bb_vinfo);
2032   bb->aux = NULL;
2033 }
2034
2035
2036 /* Analyze statements contained in SLP tree node after recursively analyzing
2037    the subtree. Return TRUE if the operations are supported.  */
2038
2039 static bool
2040 vect_slp_analyze_node_operations (bb_vec_info bb_vinfo, slp_tree node)
2041 {
2042   bool dummy;
2043   int i;
2044   gimple stmt;
2045   slp_tree child;
2046
2047   if (!node)
2048     return true;
2049
2050   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2051     if (!vect_slp_analyze_node_operations (bb_vinfo, child))
2052       return false;
2053
2054   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2055     {
2056       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2057       gcc_assert (stmt_info);
2058       gcc_assert (PURE_SLP_STMT (stmt_info));
2059
2060       if (!vect_analyze_stmt (stmt, &dummy, node))
2061         return false;
2062     }
2063
2064   return true;
2065 }
2066
2067
2068 /* Analyze statements in SLP instances of the basic block.  Return TRUE if the
2069    operations are supported. */
2070
2071 static bool
2072 vect_slp_analyze_operations (bb_vec_info bb_vinfo)
2073 {
2074   vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2075   slp_instance instance;
2076   int i;
2077
2078   for (i = 0; slp_instances.iterate (i, &instance); )
2079     {
2080       if (!vect_slp_analyze_node_operations (bb_vinfo,
2081                                              SLP_INSTANCE_TREE (instance)))
2082         {
2083           vect_free_slp_instance (instance);
2084           slp_instances.ordered_remove (i);
2085         }
2086       else
2087         i++;
2088     }
2089
2090   if (!slp_instances.length ())
2091     return false;
2092
2093   return true;
2094 }
2095
2096
2097 /* Compute the scalar cost of the SLP node NODE and its children
2098    and return it.  Do not account defs that are marked in LIFE and
2099    update LIFE according to uses of NODE.  */
2100
2101 static unsigned
2102 vect_bb_slp_scalar_cost (basic_block bb,
2103                          slp_tree node, vec<bool, va_heap> *life)
2104 {
2105   unsigned scalar_cost = 0;
2106   unsigned i;
2107   gimple stmt;
2108   slp_tree child;
2109
2110   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2111     {
2112       unsigned stmt_cost;
2113       ssa_op_iter op_iter;
2114       def_operand_p def_p;
2115       stmt_vec_info stmt_info;
2116
2117       if ((*life)[i])
2118         continue;
2119
2120       /* If there is a non-vectorized use of the defs then the scalar
2121          stmt is kept live in which case we do not account it or any
2122          required defs in the SLP children in the scalar cost.  This
2123          way we make the vectorization more costly when compared to
2124          the scalar cost.  */
2125       FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2126         {
2127           imm_use_iterator use_iter;
2128           gimple use_stmt;
2129           FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2130             if (!is_gimple_debug (use_stmt)
2131                 && (gimple_code (use_stmt) == GIMPLE_PHI
2132                     || gimple_bb (use_stmt) != bb
2133                     || !STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (use_stmt))))
2134               {
2135                 (*life)[i] = true;
2136                 BREAK_FROM_IMM_USE_STMT (use_iter);
2137               }
2138         }
2139       if ((*life)[i])
2140         continue;
2141
2142       stmt_info = vinfo_for_stmt (stmt);
2143       if (STMT_VINFO_DATA_REF (stmt_info))
2144         {
2145           if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2146             stmt_cost = vect_get_stmt_cost (scalar_load);
2147           else
2148             stmt_cost = vect_get_stmt_cost (scalar_store);
2149         }
2150       else
2151         stmt_cost = vect_get_stmt_cost (scalar_stmt);
2152
2153       scalar_cost += stmt_cost;
2154     }
2155
2156   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2157     scalar_cost += vect_bb_slp_scalar_cost (bb, child, life);
2158
2159   return scalar_cost;
2160 }
2161
2162 /* Check if vectorization of the basic block is profitable.  */
2163
2164 static bool
2165 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2166 {
2167   vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2168   slp_instance instance;
2169   int i, j;
2170   unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2171   unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2172   void *target_cost_data = BB_VINFO_TARGET_COST_DATA (bb_vinfo);
2173   stmt_vec_info stmt_info = NULL;
2174   stmt_vector_for_cost body_cost_vec;
2175   stmt_info_for_cost *ci;
2176
2177   /* Calculate vector costs.  */
2178   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2179     {
2180       body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
2181
2182       FOR_EACH_VEC_ELT (body_cost_vec, j, ci)
2183         {
2184           stmt_info = ci->stmt ? vinfo_for_stmt (ci->stmt) : NULL;
2185           (void) add_stmt_cost (target_cost_data, ci->count, ci->kind,
2186                                 stmt_info, ci->misalign, vect_body);
2187         }
2188     }
2189
2190   /* Calculate scalar cost.  */
2191   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2192     {
2193       auto_vec<bool, 20> life;
2194       life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
2195       scalar_cost += vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2196                                               SLP_INSTANCE_TREE (instance),
2197                                               &life);
2198     }
2199
2200   /* Complete the target-specific cost calculation.  */
2201   finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2202                &vec_inside_cost, &vec_epilogue_cost);
2203
2204   vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2205
2206   if (dump_enabled_p ())
2207     {
2208       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2209       dump_printf (MSG_NOTE, "  Vector inside of basic block cost: %d\n",
2210                    vec_inside_cost);
2211       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n", vec_prologue_cost);
2212       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n", vec_epilogue_cost);
2213       dump_printf (MSG_NOTE, "  Scalar cost of basic block: %d\n", scalar_cost);
2214     }
2215
2216   /* Vectorization is profitable if its cost is less than the cost of scalar
2217      version.  */
2218   if (vec_outside_cost + vec_inside_cost >= scalar_cost)
2219     return false;
2220
2221   return true;
2222 }
2223
2224 /* Check if the basic block can be vectorized.  */
2225
2226 static bb_vec_info
2227 vect_slp_analyze_bb_1 (basic_block bb)
2228 {
2229   bb_vec_info bb_vinfo;
2230   vec<slp_instance> slp_instances;
2231   slp_instance instance;
2232   int i;
2233   int min_vf = 2;
2234   unsigned n_stmts = 0;
2235
2236   bb_vinfo = new_bb_vec_info (bb);
2237   if (!bb_vinfo)
2238     return NULL;
2239
2240   if (!vect_analyze_data_refs (NULL, bb_vinfo, &min_vf, &n_stmts))
2241     {
2242       if (dump_enabled_p ())
2243         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2244                          "not vectorized: unhandled data-ref in basic "
2245                          "block.\n");
2246
2247       destroy_bb_vec_info (bb_vinfo);
2248       return NULL;
2249     }
2250
2251   if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
2252     {
2253       if (dump_enabled_p ())
2254         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2255                          "not vectorized: not enough data-refs in "
2256                          "basic block.\n");
2257
2258       destroy_bb_vec_info (bb_vinfo);
2259       return NULL;
2260     }
2261
2262   if (!vect_analyze_data_ref_accesses (NULL, bb_vinfo))
2263     {
2264      if (dump_enabled_p ())
2265        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2266                         "not vectorized: unhandled data access in "
2267                         "basic block.\n");
2268
2269       destroy_bb_vec_info (bb_vinfo);
2270       return NULL;
2271     }
2272
2273   vect_pattern_recog (NULL, bb_vinfo);
2274
2275   if (!vect_analyze_data_refs_alignment (NULL, bb_vinfo))
2276     {
2277       if (dump_enabled_p ())
2278         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2279                          "not vectorized: bad data alignment in basic "
2280                          "block.\n");
2281
2282       destroy_bb_vec_info (bb_vinfo);
2283       return NULL;
2284     }
2285
2286   /* Check the SLP opportunities in the basic block, analyze and build SLP
2287      trees.  */
2288   if (!vect_analyze_slp (NULL, bb_vinfo, n_stmts))
2289     {
2290       if (dump_enabled_p ())
2291         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
2292                          "not vectorized: failed to find SLP opportunities "
2293                          "in basic block.\n");
2294
2295       destroy_bb_vec_info (bb_vinfo);
2296       return NULL;
2297     }
2298
2299   slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2300
2301   /* Mark all the statements that we want to vectorize as pure SLP and
2302      relevant.  */
2303   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2304     {
2305       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2306       vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
2307     }
2308
2309   /* Mark all the statements that we do not want to vectorize.  */
2310   for (gimple_stmt_iterator gsi = gsi_start_bb (BB_VINFO_BB (bb_vinfo));
2311        !gsi_end_p (gsi); gsi_next (&gsi))
2312     {
2313       stmt_vec_info vinfo = vinfo_for_stmt (gsi_stmt (gsi));
2314       if (STMT_SLP_TYPE (vinfo) != pure_slp)
2315         STMT_VINFO_VECTORIZABLE (vinfo) = false;
2316     }
2317
2318   /* Analyze dependences.  At this point all stmts not participating in
2319      vectorization have to be marked.  Dependence analysis assumes
2320      that we either vectorize all SLP instances or none at all.  */
2321   if (!vect_slp_analyze_data_ref_dependences (bb_vinfo))
2322      {
2323        if (dump_enabled_p ())
2324          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2325                           "not vectorized: unhandled data dependence "
2326                           "in basic block.\n");
2327
2328        destroy_bb_vec_info (bb_vinfo);
2329        return NULL;
2330      }
2331
2332   if (!vect_verify_datarefs_alignment (NULL, bb_vinfo))
2333     {
2334       if (dump_enabled_p ())
2335         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2336                          "not vectorized: unsupported alignment in basic "
2337                          "block.\n");
2338       destroy_bb_vec_info (bb_vinfo);
2339       return NULL;
2340     }
2341
2342   if (!vect_slp_analyze_operations (bb_vinfo))
2343     {
2344       if (dump_enabled_p ())
2345         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2346                          "not vectorized: bad operation in basic block.\n");
2347
2348       destroy_bb_vec_info (bb_vinfo);
2349       return NULL;
2350     }
2351
2352   /* Cost model: check if the vectorization is worthwhile.  */
2353   if (!unlimited_cost_model (NULL)
2354       && !vect_bb_vectorization_profitable_p (bb_vinfo))
2355     {
2356       if (dump_enabled_p ())
2357         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2358                          "not vectorized: vectorization is not "
2359                          "profitable.\n");
2360
2361       destroy_bb_vec_info (bb_vinfo);
2362       return NULL;
2363     }
2364
2365   if (dump_enabled_p ())
2366     dump_printf_loc (MSG_NOTE, vect_location,
2367                      "Basic block will be vectorized using SLP\n");
2368
2369   return bb_vinfo;
2370 }
2371
2372
2373 bb_vec_info
2374 vect_slp_analyze_bb (basic_block bb)
2375 {
2376   bb_vec_info bb_vinfo;
2377   int insns = 0;
2378   gimple_stmt_iterator gsi;
2379   unsigned int vector_sizes;
2380
2381   if (dump_enabled_p ())
2382     dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");
2383
2384   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2385     {
2386       gimple stmt = gsi_stmt (gsi);
2387       if (!is_gimple_debug (stmt)
2388           && !gimple_nop_p (stmt)
2389           && gimple_code (stmt) != GIMPLE_LABEL)
2390         insns++;
2391     }
2392
2393   if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2394     {
2395       if (dump_enabled_p ())
2396         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2397                          "not vectorized: too many instructions in "
2398                          "basic block.\n");
2399
2400       return NULL;
2401     }
2402
2403   /* Autodetect first vector size we try.  */
2404   current_vector_size = 0;
2405   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2406
2407   while (1)
2408     {
2409       bb_vinfo = vect_slp_analyze_bb_1 (bb);
2410       if (bb_vinfo)
2411         return bb_vinfo;
2412
2413       destroy_bb_vec_info (bb_vinfo);
2414
2415       vector_sizes &= ~current_vector_size;
2416       if (vector_sizes == 0
2417           || current_vector_size == 0)
2418         return NULL;
2419
2420       /* Try the next biggest vector size.  */
2421       current_vector_size = 1 << floor_log2 (vector_sizes);
2422       if (dump_enabled_p ())
2423         dump_printf_loc (MSG_NOTE, vect_location,
2424                          "***** Re-trying analysis with "
2425                          "vector size %d\n", current_vector_size);
2426     }
2427 }
2428
2429
2430 /* SLP costs are calculated according to SLP instance unrolling factor (i.e.,
2431    the number of created vector stmts depends on the unrolling factor).
2432    However, the actual number of vector stmts for every SLP node depends on
2433    VF which is set later in vect_analyze_operations ().  Hence, SLP costs
2434    should be updated.  In this function we assume that the inside costs
2435    calculated in vect_model_xxx_cost are linear in ncopies.  */
2436
2437 void
2438 vect_update_slp_costs_according_to_vf (loop_vec_info loop_vinfo)
2439 {
2440   unsigned int i, j, vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2441   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2442   slp_instance instance;
2443   stmt_vector_for_cost body_cost_vec;
2444   stmt_info_for_cost *si;
2445   void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2446
2447   if (dump_enabled_p ())
2448     dump_printf_loc (MSG_NOTE, vect_location,
2449                      "=== vect_update_slp_costs_according_to_vf ===\n");
2450
2451   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2452     {
2453       /* We assume that costs are linear in ncopies.  */
2454       int ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (instance);
2455
2456       /* Record the instance's instructions in the target cost model.
2457          This was delayed until here because the count of instructions
2458          isn't known beforehand.  */
2459       body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
2460
2461       FOR_EACH_VEC_ELT (body_cost_vec, j, si)
2462         (void) add_stmt_cost (data, si->count * ncopies, si->kind,
2463                               vinfo_for_stmt (si->stmt), si->misalign,
2464                               vect_body);
2465     }
2466 }
2467
2468
2469 /* For constant and loop invariant defs of SLP_NODE this function returns
2470    (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
2471    OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
2472    scalar stmts.  NUMBER_OF_VECTORS is the number of vector defs to create.
2473    REDUC_INDEX is the index of the reduction operand in the statements, unless
2474    it is -1.  */
2475
2476 static void
2477 vect_get_constant_vectors (tree op, slp_tree slp_node,
2478                            vec<tree> *vec_oprnds,
2479                            unsigned int op_num, unsigned int number_of_vectors,
2480                            int reduc_index)
2481 {
2482   vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2483   gimple stmt = stmts[0];
2484   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2485   unsigned nunits;
2486   tree vec_cst;
2487   tree *elts;
2488   unsigned j, number_of_places_left_in_vector;
2489   tree vector_type;
2490   tree vop;
2491   int group_size = stmts.length ();
2492   unsigned int vec_num, i;
2493   unsigned number_of_copies = 1;
2494   vec<tree> voprnds;
2495   voprnds.create (number_of_vectors);
2496   bool constant_p, is_store;
2497   tree neutral_op = NULL;
2498   enum tree_code code = gimple_expr_code (stmt);
2499   gimple def_stmt;
2500   struct loop *loop;
2501   gimple_seq ctor_seq = NULL;
2502
2503   if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
2504       && reduc_index != -1)
2505     {
2506       op_num = reduc_index - 1;
2507       op = gimple_op (stmt, reduc_index);
2508       /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
2509          we need either neutral operands or the original operands.  See
2510          get_initial_def_for_reduction() for details.  */
2511       switch (code)
2512         {
2513           case WIDEN_SUM_EXPR:
2514           case DOT_PROD_EXPR:
2515           case PLUS_EXPR:
2516           case MINUS_EXPR:
2517           case BIT_IOR_EXPR:
2518           case BIT_XOR_EXPR:
2519              if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2520                neutral_op = build_real (TREE_TYPE (op), dconst0);
2521              else
2522                neutral_op = build_int_cst (TREE_TYPE (op), 0);
2523
2524              break;
2525
2526           case MULT_EXPR:
2527              if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2528                neutral_op = build_real (TREE_TYPE (op), dconst1);
2529              else
2530                neutral_op = build_int_cst (TREE_TYPE (op), 1);
2531
2532              break;
2533
2534           case BIT_AND_EXPR:
2535             neutral_op = build_int_cst (TREE_TYPE (op), -1);
2536             break;
2537
2538           /* For MIN/MAX we don't have an easy neutral operand but
2539              the initial values can be used fine here.  Only for
2540              a reduction chain we have to force a neutral element.  */
2541           case MAX_EXPR:
2542           case MIN_EXPR:
2543             if (!GROUP_FIRST_ELEMENT (stmt_vinfo))
2544               neutral_op = NULL;
2545             else
2546               {
2547                 def_stmt = SSA_NAME_DEF_STMT (op);
2548                 loop = (gimple_bb (stmt))->loop_father;
2549                 neutral_op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2550                                                     loop_preheader_edge (loop));
2551               }
2552             break;
2553
2554           default:
2555             neutral_op = NULL;
2556         }
2557     }
2558
2559   if (STMT_VINFO_DATA_REF (stmt_vinfo))
2560     {
2561       is_store = true;
2562       op = gimple_assign_rhs1 (stmt);
2563     }
2564   else
2565     is_store = false;
2566
2567   gcc_assert (op);
2568
2569   if (CONSTANT_CLASS_P (op))
2570     constant_p = true;
2571   else
2572     constant_p = false;
2573
2574   vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
2575   gcc_assert (vector_type);
2576   nunits = TYPE_VECTOR_SUBPARTS (vector_type);
2577
2578   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
2579      created vectors. It is greater than 1 if unrolling is performed.
2580
2581      For example, we have two scalar operands, s1 and s2 (e.g., group of
2582      strided accesses of size two), while NUNITS is four (i.e., four scalars
2583      of this type can be packed in a vector).  The output vector will contain
2584      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
2585      will be 2).
2586
2587      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
2588      containing the operands.
2589
2590      For example, NUNITS is four as before, and the group size is 8
2591      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
2592      {s5, s6, s7, s8}.  */
2593
2594   number_of_copies = least_common_multiple (nunits, group_size) / group_size;
2595
2596   number_of_places_left_in_vector = nunits;
2597   elts = XALLOCAVEC (tree, nunits);
2598   for (j = 0; j < number_of_copies; j++)
2599     {
2600       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
2601         {
2602           if (is_store)
2603             op = gimple_assign_rhs1 (stmt);
2604           else
2605             {
2606               switch (code)
2607                 {
2608                   case COND_EXPR:
2609                     if (op_num == 0 || op_num == 1)
2610                       {
2611                         tree cond = gimple_assign_rhs1 (stmt);
2612                         op = TREE_OPERAND (cond, op_num);
2613                       }
2614                     else
2615                       {
2616                         if (op_num == 2)
2617                           op = gimple_assign_rhs2 (stmt);
2618                         else
2619                           op = gimple_assign_rhs3 (stmt);
2620                       }
2621                     break;
2622
2623                   case CALL_EXPR:
2624                     op = gimple_call_arg (stmt, op_num);
2625                     break;
2626
2627                   case LSHIFT_EXPR:
2628                   case RSHIFT_EXPR:
2629                   case LROTATE_EXPR:
2630                   case RROTATE_EXPR:
2631                     op = gimple_op (stmt, op_num + 1);
2632                     /* Unlike the other binary operators, shifts/rotates have
2633                        the shift count being int, instead of the same type as
2634                        the lhs, so make sure the scalar is the right type if
2635                        we are dealing with vectors of
2636                        long long/long/short/char.  */
2637                     if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
2638                       op = fold_convert (TREE_TYPE (vector_type), op);
2639                     break;
2640
2641                   default:
2642                     op = gimple_op (stmt, op_num + 1);
2643                     break;
2644                 }
2645             }
2646
2647           if (reduc_index != -1)
2648             {
2649               loop = (gimple_bb (stmt))->loop_father;
2650               def_stmt = SSA_NAME_DEF_STMT (op);
2651
2652               gcc_assert (loop);
2653
2654               /* Get the def before the loop.  In reduction chain we have only
2655                  one initial value.  */
2656               if ((j != (number_of_copies - 1)
2657                    || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2658                        && i != 0))
2659                   && neutral_op)
2660                 op = neutral_op;
2661               else
2662                 op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2663                                             loop_preheader_edge (loop));
2664             }
2665
2666           /* Create 'vect_ = {op0,op1,...,opn}'.  */
2667           number_of_places_left_in_vector--;
2668           if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
2669             {
2670               if (CONSTANT_CLASS_P (op))
2671                 {
2672                   op = fold_unary (VIEW_CONVERT_EXPR,
2673                                    TREE_TYPE (vector_type), op);
2674                   gcc_assert (op && CONSTANT_CLASS_P (op));
2675                 }
2676               else
2677                 {
2678                   tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
2679                   gimple init_stmt;
2680                   op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type), op);
2681                   init_stmt
2682                     = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR, op);
2683                   gimple_seq_add_stmt (&ctor_seq, init_stmt);
2684                   op = new_temp;
2685                 }
2686             }
2687           elts[number_of_places_left_in_vector] = op;
2688           if (!CONSTANT_CLASS_P (op))
2689             constant_p = false;
2690
2691           if (number_of_places_left_in_vector == 0)
2692             {
2693               number_of_places_left_in_vector = nunits;
2694
2695               if (constant_p)
2696                 vec_cst = build_vector (vector_type, elts);
2697               else
2698                 {
2699                   vec<constructor_elt, va_gc> *v;
2700                   unsigned k;
2701                   vec_alloc (v, nunits);
2702                   for (k = 0; k < nunits; ++k)
2703                     CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
2704                   vec_cst = build_constructor (vector_type, v);
2705                 }
2706               voprnds.quick_push (vect_init_vector (stmt, vec_cst,
2707                                                     vector_type, NULL));
2708               if (ctor_seq != NULL)
2709                 {
2710                   gimple init_stmt = SSA_NAME_DEF_STMT (voprnds.last ());
2711                   gimple_stmt_iterator gsi = gsi_for_stmt (init_stmt);
2712                   gsi_insert_seq_before_without_update (&gsi, ctor_seq,
2713                                                         GSI_SAME_STMT);
2714                   ctor_seq = NULL;
2715                 }
2716             }
2717         }
2718     }
2719
2720   /* Since the vectors are created in the reverse order, we should invert
2721      them.  */
2722   vec_num = voprnds.length ();
2723   for (j = vec_num; j != 0; j--)
2724     {
2725       vop = voprnds[j - 1];
2726       vec_oprnds->quick_push (vop);
2727     }
2728
2729   voprnds.release ();
2730
2731   /* In case that VF is greater than the unrolling factor needed for the SLP
2732      group of stmts, NUMBER_OF_VECTORS to be created is greater than
2733      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
2734      to replicate the vectors.  */
2735   while (number_of_vectors > vec_oprnds->length ())
2736     {
2737       tree neutral_vec = NULL;
2738
2739       if (neutral_op)
2740         {
2741           if (!neutral_vec)
2742             neutral_vec = build_vector_from_val (vector_type, neutral_op);
2743
2744           vec_oprnds->quick_push (neutral_vec);
2745         }
2746       else
2747         {
2748           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
2749             vec_oprnds->quick_push (vop);
2750         }
2751     }
2752 }
2753
2754
2755 /* Get vectorized definitions from SLP_NODE that contains corresponding
2756    vectorized def-stmts.  */
2757
2758 static void
2759 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
2760 {
2761   tree vec_oprnd;
2762   gimple vec_def_stmt;
2763   unsigned int i;
2764
2765   gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
2766
2767   FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
2768     {
2769       gcc_assert (vec_def_stmt);
2770       vec_oprnd = gimple_get_lhs (vec_def_stmt);
2771       vec_oprnds->quick_push (vec_oprnd);
2772     }
2773 }
2774
2775
2776 /* Get vectorized definitions for SLP_NODE.
2777    If the scalar definitions are loop invariants or constants, collect them and
2778    call vect_get_constant_vectors() to create vector stmts.
2779    Otherwise, the def-stmts must be already vectorized and the vectorized stmts
2780    must be stored in the corresponding child of SLP_NODE, and we call
2781    vect_get_slp_vect_defs () to retrieve them.  */
2782
void
vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
                   vec<vec<tree> > *vec_oprnds, int reduc_index)
{
  gimple first_stmt;
  int number_of_vects = 0, i;
  unsigned int child_index = 0;
  HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
  slp_tree child = NULL;
  vec<tree> vec_defs;
  tree oprnd;
  bool vectorized_defs;

  first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
  FOR_EACH_VEC_ELT (ops, i, oprnd)
    {
      /* For each operand we check if it has vectorized definitions in a child
         node or we need to create them (for invariants and constants).  We
         check if the LHS of the first stmt of the next child matches OPRND.
         If it does, we found the correct child.  Otherwise, we call
         vect_get_constant_vectors (), and not advance CHILD_INDEX in order
         to check this child node for the next operand.  */
      vectorized_defs = false;
      if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
        {
          child = SLP_TREE_CHILDREN (slp_node)[child_index];

          /* We have to check both pattern and original def, if available.  */
          gimple first_def = SLP_TREE_SCALAR_STMTS (child)[0];
          gimple related = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));

          if (operand_equal_p (oprnd, gimple_get_lhs (first_def), 0)
              || (related
                  && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
            {
              /* The number of vector defs is determined by the number of
                 vector statements in the node from which we get those
                 statements.  */
              number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
              vectorized_defs = true;
              /* Only advance past this child once it has been matched.  */
              child_index++;
            }
        }

      if (!vectorized_defs)
        {
          /* Constants/invariants: the vector count is derived from this
             node itself, but only computed once (for the first operand).  */
          if (i == 0)
            {
              number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
              /* Number of vector stmts was calculated according to LHS in
                 vect_schedule_slp_instance (), fix it by replacing LHS with
                 RHS, if necessary.  See vect_get_smallest_scalar_type () for
                 details.  */
              vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
                                             &rhs_size_unit);
              if (rhs_size_unit != lhs_size_unit)
                {
                  number_of_vects *= rhs_size_unit;
                  number_of_vects /= lhs_size_unit;
                }
            }
        }

      /* Allocate memory for vectorized defs.  */
      vec_defs = vNULL;
      vec_defs.create (number_of_vects);

      /* For reduction defs we call vect_get_constant_vectors (), since we are
         looking for initial loop invariant values.  */
      if (vectorized_defs && reduc_index == -1)
        /* The defs are already vectorized.  */
        vect_get_slp_vect_defs (child, &vec_defs);
      else
        /* Build vectors from scalar defs.  */
        vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
                                   number_of_vects, reduc_index);

      vec_oprnds->quick_push (vec_defs);

      /* For reductions, we only need initial values.  */
      if (reduc_index != -1)
        return;
    }
}
2867
2868
2869 /* Create NCOPIES permutation statements using the mask MASK_BYTES (by
2870    building a vector of type MASK_TYPE from it) and two input vectors placed in
2871    DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy and
2872    shifting by STRIDE elements of DR_CHAIN for every copy.
2873    (STRIDE is the number of vectorized stmts for NODE divided by the number of
2874    copies).
2875    VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE, where
2876    the created stmts must be inserted.  */
2877
static inline void
vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
                           tree mask, int first_vec_indx, int second_vec_indx,
                           gimple_stmt_iterator *gsi, slp_tree node,
                           tree vectype, vec<tree> dr_chain,
                           int ncopies, int vect_stmts_counter)
{
  tree perm_dest;
  gimple perm_stmt = NULL;
  stmt_vec_info next_stmt_info;
  int i, stride;
  tree first_vec, second_vec, data_ref;

  /* STRIDE is how far apart (in DR_CHAIN and in the NODE stmt vector)
     successive copies of the same permutation are placed.  */
  stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;

  /* Initialize the vect stmts of NODE to properly insert the generated
     stmts later.  */
  for (i = SLP_TREE_VEC_STMTS (node).length ();
       i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
    SLP_TREE_VEC_STMTS (node).quick_push (NULL);

  /* One destination variable is shared by all copies; each copy gets a
     fresh SSA name for it below.  */
  perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
  for (i = 0; i < ncopies; i++)
    {
      first_vec = dr_chain[first_vec_indx];
      second_vec = dr_chain[second_vec_indx];

      /* Generate the permute statement.  */
      perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
                                       first_vec, second_vec, mask);
      data_ref = make_ssa_name (perm_dest, perm_stmt);
      gimple_set_lhs (perm_stmt, data_ref);
      vect_finish_stmt_generation (stmt, perm_stmt, gsi);

      /* Store the vector statement in NODE.  */
      SLP_TREE_VEC_STMTS (node)[stride * i + vect_stmts_counter] = perm_stmt;

      /* Advance to the input vectors for the next copy.  */
      first_vec_indx += stride;
      second_vec_indx += stride;
    }

  /* Mark the scalar stmt as vectorized (the last generated permute stands
     in as its vectorized statement).  */
  next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
  STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
}
2923
2924
2925 /* Given FIRST_MASK_ELEMENT - the mask element in element representation,
2926    return in CURRENT_MASK_ELEMENT its equivalent in target specific
2927    representation.  Check that the mask is valid and return FALSE if not.
2928    Return TRUE in NEED_NEXT_VECTOR if the permutation requires to move to
2929    the next vector, i.e., the current first vector is not needed.  */
2930
static bool
vect_get_mask_element (gimple stmt, int first_mask_element, int m,
                       int mask_nunits, bool only_one_vec, int index,
                       unsigned char *mask, int *current_mask_element,
                       bool *need_next_vector, int *number_of_mask_fixes,
                       bool *mask_fixed, bool *needs_first_vector)
{
  int i;

  /* Convert to target specific representation.  */
  *current_mask_element = first_mask_element + m;
  /* Adjust the value in case it's a mask for second and third vectors.  */
  *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1);

  /* An element below MASK_NUNITS selects from the first input vector,
     so that vector is still live for this mask.  */
  if (*current_mask_element < mask_nunits)
    *needs_first_vector = true;

  /* We have only one input vector to permute but the mask accesses values in
     the next vector as well.  */
  if (only_one_vec && *current_mask_element >= mask_nunits)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "permutation requires at least two vectors ");
          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      return false;
    }

  /* The mask requires the next vector.  VEC_PERM_EXPR takes two inputs,
     so indices must fit in [0, 2*MASK_NUNITS); shift the window forward
     by one vector until they do, rewriting already-emitted MASK entries
     to match.  */
  while (*current_mask_element >= mask_nunits * 2)
    {
      if (*needs_first_vector || *mask_fixed)
        {
          /* We either need the first vector too or have already moved to the
             next vector. In both cases, this permutation needs three
             vectors.  */
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "permutation requires at "
                               "least three vectors ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return false;
        }

      /* We move to the next vector, dropping the first one and working with
         the second and the third - we need to adjust the values of the mask
         accordingly.  */
      *current_mask_element -= mask_nunits * *number_of_mask_fixes;

      for (i = 0; i < index; i++)
        mask[i] -= mask_nunits * *number_of_mask_fixes;

      (*number_of_mask_fixes)++;
      *mask_fixed = true;
    }

  /* The caller must switch to the next pair of input vectors iff the mask
     was shifted above.  */
  *need_next_vector = *mask_fixed;

  /* This was the last element of this mask. Start a new one.  */
  if (index == mask_nunits - 1)
    {
      *number_of_mask_fixes = 1;
      *mask_fixed = false;
      *needs_first_vector = false;
    }

  return true;
}
3007
3008
3009 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3010    If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3011    permute statements for the SLP node NODE of the SLP instance
3012    SLP_NODE_INSTANCE.  */
3013
bool
vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
                              gimple_stmt_iterator *gsi, int vf,
                              slp_instance slp_node_instance, bool analyze_only)
{
  gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree mask_element_type = NULL_TREE, mask_type;
  int i, j, k, nunits, vec_index = 0, scalar_index;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  gimple next_scalar_stmt;
  int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
  int first_mask_element;
  int index, unroll_factor, current_mask_element, ncopies;
  unsigned char *mask;
  bool only_one_vec = false, need_next_vector = false;
  int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
  int number_of_mask_fixes = 1;
  bool mask_fixed = false;
  bool needs_first_vector = false;
  machine_mode mode;

  mode = TYPE_MODE (vectype);

  /* Bail out early if the target cannot permute this mode at all.  */
  if (!can_vec_perm_p (mode, false, NULL))
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "no vect permute for ");
          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      return false;
    }

  /* The generic VEC_PERM_EXPR code always uses an integral type of the
     same size as the vector element being permuted.  */
  mask_element_type = lang_hooks.types.type_for_mode
                (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))), 1);
  mask_type = get_vectype_for_scalar_type (mask_element_type);
  nunits = TYPE_VECTOR_SUBPARTS (vectype);
  mask = XALLOCAVEC (unsigned char, nunits);
  unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);

  /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
     unrolling factor.  */
  orig_vec_stmts_num = group_size *
                SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
  if (orig_vec_stmts_num == 1)
    only_one_vec = true;

  /* Number of copies is determined by the final vectorization factor
     relatively to SLP_NODE_INSTANCE unrolling factor.  */
  ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);

  /* Load permutations only make sense for grouped accesses.  */
  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
    return false;

  /* Generate permutation masks for every NODE. Number of masks for each NODE
     is equal to GROUP_SIZE.
     E.g., we have a group of three nodes with three loads from the same
     location in each node, and the vector size is 4. I.e., we have a
     a0b0c0a1b1c1... sequence and we need to create the following vectors:
     for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
     for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
     ...

     The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
     The last mask is illegal since we assume two operands for permute
     operation, and the mask element values can't be outside that range.
     Hence, the last mask must be converted into {2,5,5,5}.
     For the first two permutations we need the first and the second input
     vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
     we need the second and the third vectors: {b1,c1,a2,b2} and
     {c2,a3,b3,c3}.  */

    {
      scalar_index = 0;
      index = 0;
      vect_stmts_counter = 0;
      vec_index = 0;
      first_vec_index = vec_index++;
      if (only_one_vec)
        second_vec_index = first_vec_index;
      else
        second_vec_index =  vec_index++;

      for (j = 0; j < unroll_factor; j++)
        {
          for (k = 0; k < group_size; k++)
            {
              /* Element index this scalar reads from within the group,
                 shifted by the unroll iteration.  */
              i = SLP_TREE_LOAD_PERMUTATION (node)[k];
              first_mask_element = i + j * group_size;
              if (!vect_get_mask_element (stmt, first_mask_element, 0,
                                          nunits, only_one_vec, index,
                                          mask, &current_mask_element,
                                          &need_next_vector,
                                          &number_of_mask_fixes, &mask_fixed,
                                          &needs_first_vector))
                return false;
              gcc_assert (current_mask_element < 2 * nunits);
              mask[index++] = current_mask_element;

              /* A full mask has been accumulated: validate it and, unless
                 we are only analyzing, emit the permute statements.  */
              if (index == nunits)
                {
                  index = 0;
                  if (!can_vec_perm_p (mode, false, mask))
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location, 
                                           "unsupported vect permute { ");
                          for (i = 0; i < nunits; ++i)
                            dump_printf (MSG_MISSED_OPTIMIZATION, "%d ",
                                         mask[i]);
                          dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
                        }
                      return false;
                    }

                  if (!analyze_only)
                    {
                      /* Build the constant mask vector for VEC_PERM_EXPR.  */
                      int l;
                      tree mask_vec, *mask_elts;
                      mask_elts = XALLOCAVEC (tree, nunits);
                      for (l = 0; l < nunits; ++l)
                        mask_elts[l] = build_int_cst (mask_element_type,
                                                      mask[l]);
                      mask_vec = build_vector (mask_type, mask_elts);

                      /* Slide the input-vector window forward when the mask
                         no longer needs the first vector.  */
                      if (need_next_vector)
                        {
                          first_vec_index = second_vec_index;
                          second_vec_index = vec_index;
                        }

                      next_scalar_stmt
                          = SLP_TREE_SCALAR_STMTS (node)[scalar_index++];

                      vect_create_mask_and_perm (stmt, next_scalar_stmt,
                               mask_vec, first_vec_index, second_vec_index,
                               gsi, node, vectype, dr_chain,
                               ncopies, vect_stmts_counter++);
                    }
                }
            }
        }
    }

  return true;
}
3167
3168
3169
3170 /* Vectorize SLP instance tree in postorder.  */
3171
static bool
vect_schedule_slp_instance (slp_tree node, slp_instance instance,
                            unsigned int vectorization_factor)
{
  gimple stmt;
  bool grouped_store, is_store;
  gimple_stmt_iterator si;
  stmt_vec_info stmt_info;
  unsigned int vec_stmts_size, nunits, group_size;
  tree vectype;
  int i;
  slp_tree child;

  if (!node)
    return false;

  /* Postorder: vectorize all children before this node so their defs
     exist when this node's stmts are transformed.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_schedule_slp_instance (child, instance, vectorization_factor);

  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);

  /* VECTYPE is the type of the destination.  */
  vectype = STMT_VINFO_VECTYPE (stmt_info);
  nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
  group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  /* For each SLP instance calculate number of vector stmts to be created
     for the scalar stmts in each node of the SLP tree.  Number of vector
     elements in one vector iteration is the number of scalar elements in
     one scalar iteration (GROUP_SIZE) multiplied by VF divided by vector
     size.  */
  vec_stmts_size = (vectorization_factor * group_size) / nunits;

  if (!SLP_TREE_VEC_STMTS (node).exists ())
    {
      SLP_TREE_VEC_STMTS (node).create (vec_stmts_size);
      SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE,vect_location,
                       "------>vectorizing SLP node starting from: ");
      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
      dump_printf (MSG_NOTE, "\n");
    }

  /* Choose the insertion point SI for the generated vector stmts.
     Loads should be inserted before the first load.  */
  if (SLP_INSTANCE_FIRST_LOAD_STMT (instance)
      && STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && !REFERENCE_CLASS_P (gimple_get_lhs (stmt))
      && SLP_TREE_LOAD_PERMUTATION (node).exists ())
    si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance));
  else if (is_pattern_stmt_p (stmt_info))
    si = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
  else
    si = gsi_for_stmt (stmt);

  /* Stores should be inserted just before the last store.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && REFERENCE_CLASS_P (gimple_get_lhs (stmt)))
    { 
      gimple last_store = vect_find_last_store_in_slp_instance (instance);
      if (is_pattern_stmt_p (vinfo_for_stmt (last_store)))
       last_store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (last_store));
      si = gsi_for_stmt (last_store);
    }

  /* Mark the first element of the reduction chain as reduction to properly
     transform the node.  In the analysis phase only the last element of the
     chain is marked as reduction.  */
  if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
    {
      STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
      STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
    }

  is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
  return is_store;
}
3254
3255 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
3256    For loop vectorization this is done in vectorizable_call, but for SLP
3257    it needs to be deferred until end of vect_schedule_slp, because multiple
3258    SLP instances may refer to the same scalar stmt.  */
3259
3260 static void
3261 vect_remove_slp_scalar_calls (slp_tree node)
3262 {
3263   gimple stmt, new_stmt;
3264   gimple_stmt_iterator gsi;
3265   int i;
3266   slp_tree child;
3267   tree lhs;
3268   stmt_vec_info stmt_info;
3269
3270   if (!node)
3271     return;
3272
3273   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3274     vect_remove_slp_scalar_calls (child);
3275
3276   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
3277     {
3278       if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
3279         continue;
3280       stmt_info = vinfo_for_stmt (stmt);
3281       if (stmt_info == NULL
3282           || is_pattern_stmt_p (stmt_info)
3283           || !PURE_SLP_STMT (stmt_info))
3284         continue;
3285       lhs = gimple_call_lhs (stmt);
3286       new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
3287       set_vinfo_for_stmt (new_stmt, stmt_info);
3288       set_vinfo_for_stmt (stmt, NULL);
3289       STMT_VINFO_STMT (stmt_info) = new_stmt;
3290       gsi = gsi_for_stmt (stmt);
3291       gsi_replace (&gsi, new_stmt, false);
3292       SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
3293     }
3294 }
3295
/* Generate vector code for all SLP instances in the loop/basic block.
   Exactly one of LOOP_VINFO / BB_VINFO is used; for basic-block
   vectorization the vectorization factor is 1.  Returns the IS_STORE
   result of the last instance scheduled.  */

bool
vect_schedule_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
{
  vec<slp_instance> slp_instances;
  slp_instance instance;
  unsigned int i, vf;
  bool is_store = false;

  /* Pick up the SLP instances and the vectorization factor from the
     info structure we were handed.  */
  if (loop_vinfo)
    {
      slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
      vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
    }
  else
    {
      slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
      vf = 1;
    }

  /* First pass: emit vector code for every instance.  */
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* Schedule the tree of INSTANCE.  */
      is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
                                             instance, vf);
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "vectorizing stmts using SLP.\n");
    }

  /* Second pass: clean up scalar stmts that have been replaced by
     vector code.  */
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree root = SLP_INSTANCE_TREE (instance);
      gimple store;
      unsigned int j;
      gimple_stmt_iterator gsi;

      /* Remove scalar call stmts.  Do not do this for basic-block
         vectorization as not all uses may be vectorized.
         ???  Why should this be necessary?  DCE should be able to
         remove the stmts itself.
         ???  For BB vectorization we can as well remove scalar
         stmts starting from the SLP tree root if they have no
         uses.  */
      if (loop_vinfo)
        vect_remove_slp_scalar_calls (root);

      /* Remove the group's scalar stmts; stop at the first stmt that
         has no data reference (i.e. is not a memory access).  */
      for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
                  && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
        {
          if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
            break;

         /* For pattern stmts, remove the related (pattern) stmt
            instead of the original scalar stmt.  */
         if (is_pattern_stmt_p (vinfo_for_stmt (store)))
           store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
          /* Free the attached stmt_vec_info and remove the stmt.  */
          gsi = gsi_for_stmt (store);
          unlink_stmt_vdef (store);
          gsi_remove (&gsi, true);
          release_defs (store);
          free_stmt_vec_info (store);
        }
    }

  return is_store;
}
3363
3364
3365 /* Vectorize the basic block.  */
3366
3367 void
3368 vect_slp_transform_bb (basic_block bb)
3369 {
3370   bb_vec_info bb_vinfo = vec_info_for_bb (bb);
3371   gimple_stmt_iterator si;
3372
3373   gcc_assert (bb_vinfo);
3374
3375   if (dump_enabled_p ())
3376     dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB\n");
3377
3378   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
3379     {
3380       gimple stmt = gsi_stmt (si);
3381       stmt_vec_info stmt_info;
3382
3383       if (dump_enabled_p ())
3384         {
3385           dump_printf_loc (MSG_NOTE, vect_location,
3386                            "------>SLPing statement: ");
3387           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3388           dump_printf (MSG_NOTE, "\n");
3389         }
3390
3391       stmt_info = vinfo_for_stmt (stmt);
3392       gcc_assert (stmt_info);
3393
3394       /* Schedule all the SLP instances when the first SLP stmt is reached.  */
3395       if (STMT_SLP_TYPE (stmt_info))
3396         {
3397           vect_schedule_slp (NULL, bb_vinfo);
3398           break;
3399         }
3400     }
3401
3402   if (dump_enabled_p ())
3403     dump_printf_loc (MSG_NOTE, vect_location,
3404                      "BASIC BLOCK VECTORIZED\n");
3405
3406   destroy_bb_vec_info (bb_vinfo);
3407 }