/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2015 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "dumpfile.h"
#include "tm.h"
#include "hash-set.h"
#include "machmode.h"
#include "vec.h"
#include "double-int.h"
#include "input.h"
#include "alias.h"
#include "symtab.h"
#include "wide-int.h"
#include "inchash.h"
#include "tree.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "target.h"
#include "predict.h"
#include "hard-reg-set.h"
#include "function.h"
#include "basic-block.h"
#include "gimple-pretty-print.h"
#include "tree-ssa-alias.h"
#include "internal-fn.h"
#include "gimple-expr.h"
#include "is-a.h"
#include "gimple.h"
#include "gimple-iterator.h"
#include "gimple-ssa.h"
#include "tree-phinodes.h"
#include "ssa-iterators.h"
#include "stringpool.h"
#include "tree-ssanames.h"
#include "tree-pass.h"
#include "cfgloop.h"
#include "hashtab.h"
#include "rtl.h"
#include "flags.h"
#include "statistics.h"
#include "real.h"
#include "fixed-value.h"
#include "insn-config.h"
#include "expmed.h"
#include "dojump.h"
#include "explow.h"
#include "calls.h"
#include "emit-rtl.h"
#include "varasm.h"
#include "stmt.h"
#include "expr.h"
#include "recog.h"              /* FIXME: for insn_data */
#include "insn-codes.h"
#include "optabs.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"

/* Extract the location of the basic block in the source code.
   Return the basic block location if found and UNKNOWN_LOCATION if not.  */

source_location
find_bb_location (basic_block bb)
{
  gimple stmt = NULL;
  gimple_stmt_iterator si;

  if (!bb)
    return UNKNOWN_LOCATION;

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      stmt = gsi_stmt (si);
      if (gimple_location (stmt) != UNKNOWN_LOCATION)
        return gimple_location (stmt);
    }

  return UNKNOWN_LOCATION;
}


/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

static void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child);

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}


/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  SLP_INSTANCE_BODY_COST_VEC (instance).release ();
  free (instance);
}


/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<gimple> scalar_stmts)
{
  slp_tree node;
  gimple stmt = scalar_stmts[0];
  unsigned int nops;

  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
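      /* The LHS is not an SLP operand, hence the "- 1" below.  A COND_EXPR
         carries an embedded comparison whose two operands are handled
         separately, hence one extra slot.  */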
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;

  return node;
}


/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->first_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}


/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}


/* Find the place of the data-ref in STMT in the interleaving chain that starts
   from FIRST_STMT.  Return -1 if the data-ref is not a part of the chain.  */

static int
vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
{
  gimple next_stmt = first_stmt;
  int result = 0;

  if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    return -1;

  do
    {
      if (next_stmt == stmt)
        return result;
      result++;
      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
    }
  while (next_stmt);

  return -1;
}


/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  Return -1 on a fatal error, 1 if
   the error can be corrected by swapping operands of the operation, and 0
   if everything is OK.  */

static int
vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                             gimple stmt, bool first,
                             vec<slp_oprnd_info> *oprnds_info)
{
  tree oprnd;
  unsigned int i, number_of_oprnds;
  tree def;
  gimple def_stmt;
  enum vect_def_type dt = vect_uninitialized_def;
  struct loop *loop = NULL;
  bool pattern = false;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  bool commutative = false;
  bool first_op_cond = false;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  if (is_gimple_call (stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
    }
  else if (is_gimple_assign (stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        {
          first_op_cond = true;
          commutative = true;
          number_of_oprnds++;
        }
      else
        commutative = commutative_tree_code (code);
    }
  else
    return -1;

  bool swapped = false;
  for (i = 0; i < number_of_oprnds; i++)
    {
again:
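      /* For a COND_EXPR the first two operands come from the embedded
         comparison, the remaining ones are the THEN/ELSE arms of the
         assignment; "swapped ? !i : i" exchanges operands 0 and 1 when
         re-trying with swapped operands.  */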
      if (first_op_cond)
        {
          if (i == 0 || i == 1)
            oprnd = TREE_OPERAND (gimple_op (stmt, first_op_idx),
                                  swapped ? !i : i);
          else
            oprnd = gimple_op (stmt, first_op_idx + i - 1);
        }
      else
        oprnd = gimple_op (stmt, first_op_idx + (swapped ? !i : i));

      oprnd_info = (*oprnds_info)[i];

      if (!vect_is_simple_use (oprnd, NULL, loop_vinfo, bb_vinfo, &def_stmt,
                               &def, &dt)
          || (!def_stmt && dt != vect_constant_def))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: can't find def for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }

      /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
         from the pattern.  Check that all the stmts of the node are in the
         pattern.  */
      if (def_stmt && gimple_bb (def_stmt)
          && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
              || (!loop && gimple_bb (def_stmt) == BB_VINFO_BB (bb_vinfo)
                  && gimple_code (def_stmt) != GIMPLE_PHI))
          && vinfo_for_stmt (def_stmt)
          && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
        {
          pattern = true;
          if (!first && !oprnd_info->first_pattern)
            {
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: some of the stmts"
                                   " are in a pattern, and others are not ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }

              return 1;
            }

          def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
          dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));

          if (dt == vect_unknown_def_type)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Unsupported pattern.\n");
              return -1;
            }

          switch (gimple_code (def_stmt))
            {
              case GIMPLE_PHI:
                def = gimple_phi_result (def_stmt);
                break;

              case GIMPLE_ASSIGN:
                def = gimple_assign_lhs (def_stmt);
                break;

              default:
                if (dump_enabled_p ())
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "unsupported defining stmt:\n");
                return -1;
            }
        }

      if (first)
        {
          oprnd_info->first_dt = dt;
          oprnd_info->first_pattern = pattern;
          oprnd_info->first_op_type = TREE_TYPE (oprnd);
        }
      else
        {
          /* Not first stmt of the group, check that the def-stmt/s match
             the def-stmt/s of the first stmt.  Allow different definition
             types for reduction chains: the first stmt must be a
             vect_reduction_def (a phi node), and the rest
             vect_internal_def.  */
          if (((oprnd_info->first_dt != dt
                && !(oprnd_info->first_dt == vect_reduction_def
                     && dt == vect_internal_def)
                && !((oprnd_info->first_dt == vect_external_def
                      || oprnd_info->first_dt == vect_constant_def)
                     && (dt == vect_external_def
                         || dt == vect_constant_def)))
               || !types_compatible_p (oprnd_info->first_op_type,
                                       TREE_TYPE (oprnd))))
            {
              /* Try swapping operands if we got a mismatch.  */
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types\n");

              return 1;
            }
        }

      /* Check the types of the definitions.  */
      switch (dt)
        {
        case vect_constant_def:
        case vect_external_def:
        case vect_reduction_def:
          break;

        case vect_internal_def:
          oprnd_info->def_stmts.quick_push (def_stmt);
          break;

        default:
          /* FORNOW: Not supported.  */
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: illegal type of def ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (first_op_cond)
        {
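          /* The operands of the embedded comparison were swapped, so the
             comparison code has to be swapped as well.  */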
          tree cond = gimple_assign_rhs1 (stmt);
          swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
                             &TREE_OPERAND (cond, 1));
          TREE_SET_CODE (cond, swap_tree_comparison (TREE_CODE (cond)));
        }
      else
        swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
                           gimple_assign_rhs2_ptr (stmt));
    }

  return 0;
}


/* Verify that the scalar stmts STMTS are isomorphic, do not require any data
   permutation and are of supported types of operation.  Return true if they
   are, otherwise return false and indicate in *MATCHES which stmts are not
   isomorphic to the first one.  If MATCHES[0] is false then this indicates
   that the comparison could not be carried out or that the stmts will never
   be vectorized by SLP.  */

static bool
vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                       vec<gimple> stmts, unsigned int group_size,
                       unsigned nops, unsigned int *max_nunits,
                       unsigned int vectorization_factor, bool *matches)
{
  unsigned int i;
  gimple stmt = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK, rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype, scalar_type, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  struct data_reference *first_dr;
  HOST_WIDE_INT dummy;
  gimple first_load = NULL, prev_first_load = NULL, old_first_load = NULL;
  tree cond;

  /* For every stmt in NODE find its def stmt/s.  */
  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      matches[i] = false;

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
          dump_printf (MSG_NOTE, "\n");
        }

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unvectorizable statement ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: not GIMPLE_ASSIGN nor "
                               "GIMPLE_CALL ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      if (is_gimple_assign (stmt)
          && gimple_assign_rhs_code (stmt) == COND_EXPR
          && (cond = gimple_assign_rhs1 (stmt))
          && !COMPARISON_CLASS_P (cond))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: condition is not "
                               "comparison ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
      vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vectype)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unsupported data-type ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 scalar_type);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      /* In case of multiple types we need to detect the smallest type.  */
      if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
        {
          *max_nunits = TYPE_VECTOR_SUBPARTS (vectype);
          if (bb_vinfo)
            vectorization_factor = *max_nunits;
        }

      if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
        {
          rhs_code = CALL_EXPR;
          if (gimple_call_internal_p (call_stmt)
              || gimple_call_tail_p (call_stmt)
              || gimple_call_noreturn_p (call_stmt)
              || !gimple_call_nothrow_p (call_stmt)
              || gimple_call_chain (call_stmt))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: unsupported call type ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                    call_stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }
        }
      else
        rhs_code = gimple_assign_rhs_code (stmt);

      /* Check the operation.  */
      if (i == 0)
        {
          first_stmt_code = rhs_code;

          /* Shift arguments should be equal in all the packed stmts for a
             vector shift with scalar shift operand.  */
          if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
              || rhs_code == LROTATE_EXPR
              || rhs_code == RROTATE_EXPR)
            {
              vec_mode = TYPE_MODE (vectype);

              /* First see if we have a vector/vector shift.  */
              optab = optab_for_tree_code (rhs_code, vectype,
                                           optab_vector);

              if (!optab
                  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
                {
                  /* No vector/vector shift, try for a vector/scalar shift.  */
                  optab = optab_for_tree_code (rhs_code, vectype,
                                               optab_scalar);

                  if (!optab)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: no optab.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  icode = (int) optab_handler (optab, vec_mode);
                  if (icode == CODE_FOR_nothing)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: "
                                         "op not supported by target.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  optab_op2_mode = insn_data[icode].operand[2].mode;
                  if (!VECTOR_MODE_P (optab_op2_mode))
                    {
                      need_same_oprnds = true;
                      first_op1 = gimple_assign_rhs2 (stmt);
                    }
                }
            }
          else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
        }
      else
        {
          if (first_stmt_code != rhs_code
              && (first_stmt_code != IMAGPART_EXPR
                  || rhs_code != REALPART_EXPR)
              && (first_stmt_code != REALPART_EXPR
                  || rhs_code != IMAGPART_EXPR)
              && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
                   && (first_stmt_code == ARRAY_REF
                       || first_stmt_code == BIT_FIELD_REF
                       || first_stmt_code == INDIRECT_REF
                       || first_stmt_code == COMPONENT_REF
                       || first_stmt_code == MEM_REF)))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different operation "
                                   "in stmt ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Mismatch.  */
              continue;
            }

          if (need_same_oprnds
              && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different shift "
                                   "arguments in ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Mismatch.  */
              continue;
            }

          if (rhs_code == CALL_EXPR)
            {
              gimple first_stmt = stmts[0];
              if (gimple_call_num_args (stmt) != nops
                  || !operand_equal_p (gimple_call_fn (first_stmt),
                                       gimple_call_fn (stmt), 0)
                  || gimple_call_fntype (first_stmt)
                     != gimple_call_fntype (stmt))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different calls in ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
        {
          if (REFERENCE_CLASS_P (lhs))
            {
              /* Store.  */
              ;
            }
          else
            {
              /* Load.  */
              unsigned unrolling_factor
                = least_common_multiple
                    (*max_nunits, group_size) / group_size;
              /* FORNOW: Check that there is no gap between the loads
                 and no gap between the groups when we need to load
                 multiple groups at once.
                 ???  We should enhance this to only disallow gaps
                 inside vectors.  */
              if ((unrolling_factor > 1
                   && ((GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
                        && GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
                       /* If the group is split up then GROUP_GAP
                          isn't correct here, nor is GROUP_FIRST_ELEMENT.  */
                       || GROUP_SIZE (vinfo_for_stmt (stmt)) > group_size))
                  || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt
                      && GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: grouped "
                                       "loads have gaps ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Fatal mismatch.  */
                  matches[0] = false;
                  return false;
                }

              /* Check that the size of interleaved loads group is not
                 greater than the SLP group size.  */
              unsigned ncopies
                = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
              if (loop_vinfo
                  && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
                  && ((GROUP_SIZE (vinfo_for_stmt (stmt))
                       - GROUP_GAP (vinfo_for_stmt (stmt)))
                      > ncopies * group_size))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: the number "
                                       "of interleaved loads is greater than "
                                       "the SLP group size ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Fatal mismatch.  */
                  matches[0] = false;
                  return false;
                }

              old_first_load = first_load;
              first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  */
                  if (prev_first_load != first_load)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "Build SLP failed: different "
                                           "interleaving chains in one node ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            stmt, 0);
                          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                        }
                      /* Mismatch.  */
                      continue;
                    }
                }
              else
                prev_first_load = first_load;

              /* In some cases a group of loads is just the same load
                 repeated N times.  Only analyze its cost once.  */
              if (first_load == stmt && old_first_load != first_load)
                {
                  first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
                  if (vect_supportable_dr_alignment (first_dr, false)
                      == dr_unaligned_unsupported)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "Build SLP failed: unsupported "
                                           "unaligned load ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            stmt, 0);
                          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                        }
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                }
            }
        } /* Grouped access.  */
      else
        {
          if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
            {
              /* Not grouped load.  */
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: not grouped load ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }

              /* FORNOW: Not-grouped loads are not supported.  */
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          /* Not memory operation.  */
          if (TREE_CODE_CLASS (rhs_code) != tcc_binary
              && TREE_CODE_CLASS (rhs_code) != tcc_unary
              && rhs_code != COND_EXPR
              && rhs_code != CALL_EXPR)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: operation");
                  dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);

              if (i == 0)
                first_cond_code = TREE_CODE (cond_expr);
              else if (first_cond_code != TREE_CODE (cond_expr))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different"
                                       " operation");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  return true;
}

/* Recursively build an SLP tree starting from NODE.
   Fail (and return false) if def-stmts are not isomorphic, require data
   permutation or are of unsupported types of operation.  Otherwise, return
   true, setting MATCHES[I] to reflect whether scalar stmt I matched the
   first one.  */

static bool
vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                     slp_tree *node, unsigned int group_size,
                     unsigned int *max_nunits,
                     vec<slp_tree> *loads,
                     unsigned int vectorization_factor,
                     bool *matches, unsigned *npermutes, unsigned *tree_size,
                     unsigned max_tree_size)
{
  unsigned nops, i, this_tree_size = 0;
  gimple stmt;

  matches[0] = false;

  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];
  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else
    return false;

  if (!vect_build_slp_tree_1 (loop_vinfo, bb_vinfo,
                              SLP_TREE_SCALAR_STMTS (*node), group_size, nops,
                              max_nunits, vectorization_factor, matches))
    return false;

  /* If the SLP node is a load, terminate the recursion.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
      && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
    {
      loads->safe_push (*node);
      return true;
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (*node), i, stmt)
    {
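      /* 0 means the defs matched, -1 is a fatal failure, and 1 is a
         mismatch that may be fixed by swapping operands (see below).  */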
      switch (vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo,
                                           stmt, (i == 0), &oprnds_info))
        {
        case 0:
          break;
        case -1:
          matches[0] = false;
          vect_free_oprnd_info (oprnds_info);
          return false;
        case 1:
          matches[i] = false;
          break;
        }
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
        vect_free_oprnd_info (oprnds_info);
        return false;
      }

  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
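      /* Remember state so that a failed build below can be rolled back
         before re-trying with swapped operands.  */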
      unsigned old_nloads = loads->length ();
      unsigned old_max_nunits = *max_nunits;

      if (oprnd_info->first_dt != vect_internal_def)
        continue;

      if (++this_tree_size > max_tree_size)
        {
          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      child = vect_create_new_slp_node (oprnd_info->def_stmts);
      if (!child)
        {
          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
                               group_size, max_nunits, loads,
                               vectorization_factor, matches,
                               npermutes, &this_tree_size, max_tree_size))
        {
          oprnd_info->def_stmts = vNULL;
          SLP_TREE_CHILDREN (*node).quick_push (child);
          continue;
        }

      /* If the SLP build for operand zero failed and operand zero
         and one can be commutated try that for the scalar stmts
         that failed the match.  */
      if (i == 0
          /* A first scalar stmt mismatch signals a fatal mismatch.  */
          && matches[0]
          /* ???  For COND_EXPRs we can swap the comparison operands
             as well as the arms under some constraints.  */
          && nops == 2
          && oprnds_info[1]->first_dt == vect_internal_def
          && is_gimple_assign (stmt)
          && commutative_tree_code (gimple_assign_rhs_code (stmt))
          /* Do so only if the number of unsuccessful permutes was not more
             than a cut-off, as re-trying the recursive match on
             possibly each level of the tree would expose exponential
             behavior.  */
          && *npermutes < 4)
        {
          /* Roll back.  */
          *max_nunits = old_max_nunits;
          loads->truncate (old_nloads);
          /* Swap mismatched definition stmts.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Re-trying with swapped operands of stmts ");
          for (unsigned j = 0; j < group_size; ++j)
            if (!matches[j])
              {
                gimple tem = oprnds_info[0]->def_stmts[j];
                oprnds_info[0]->def_stmts[j] = oprnds_info[1]->def_stmts[j];
                oprnds_info[1]->def_stmts[j] = tem;
                if (dump_enabled_p ())
                  dump_printf (MSG_NOTE, "%d ", j);
              }
          if (dump_enabled_p ())
            dump_printf (MSG_NOTE, "\n");
          /* And try again ... */
          if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
                                   group_size, max_nunits, loads,
                                   vectorization_factor,
                                   matches, npermutes, &this_tree_size,
                                   max_tree_size))
            {
              oprnd_info->def_stmts = vNULL;
              SLP_TREE_CHILDREN (*node).quick_push (child);
              continue;
            }

          ++*npermutes;
        }

      oprnd_info->def_stmts = vNULL;
      vect_free_slp_tree (child);
      vect_free_oprnd_info (oprnds_info);
      return false;
    }

  if (tree_size)
    *tree_size += this_tree_size;

  vect_free_oprnd_info (oprnds_info);
  return true;
}

/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (int dump_kind, slp_tree node)
{
  int i;
  gimple stmt;
  slp_tree child;

  if (!node)
    return;

  dump_printf (dump_kind, "node ");
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      dump_printf (dump_kind, "\n\tstmt %d ", i);
      dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
    }
  dump_printf (dump_kind, "\n");

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, child);
}


/* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
   If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
   J).  Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
   stmts in NODE are to be marked.  */

static void
vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
{
  int i;
  gimple stmt;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    if (j < 0 || i == j)
      STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, mark, j);
}


/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  int i;
  gimple stmt;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
                  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child);
}


/* Rearrange the statements of NODE according to PERMUTATION.  */

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
                          vec<unsigned> permutation)
{
  gimple stmt;
  vec<gimple> tmp_stmts;
  unsigned int i;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation);

  gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
  tmp_stmts.create (group_size);
  tmp_stmts.quick_grow_cleared (group_size);

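  /* Scatter the stmts: the stmt at position I in the old order goes to
     position PERMUTATION[I] in the new order.  */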
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    tmp_stmts[permutation[i]] = stmt;

  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
}


/* Check if the required load permutations in the SLP instance
   SLP_INSTN are supported.  */

static bool
vect_supported_load_permutation_p (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j, k, next;
  sbitmap load_index;
  slp_tree node;
  gimple stmt, load, next_load, first_load;
  struct data_reference *dr;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        if (node->load_permutation.exists ())
          FOR_EACH_VEC_ELT (node->load_permutation, j, next)
            dump_printf (MSG_NOTE, "%d ", next);
        else
          for (k = 0; k < group_size; ++k)
            dump_printf (MSG_NOTE, "%d ", k);
      dump_printf (MSG_NOTE, "\n");
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this
     permutation).  */

  /* Check that all the load nodes are of the same size.  */
  /* ???  Can't we assert this? */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
      return false;

  node = SLP_INSTANCE_TREE (slp_instn);
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];

  /* Reduction (there are no data-refs in the root).
     In reduction chain the order of the loads is important.  */
  if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
      && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      slp_tree load;
      unsigned int lidx;

      /* Compare all the permutation sequences to the first one.  We know
         that at least one load is permuted.  */
      node = SLP_INSTANCE_LOADS (slp_instn)[0];
      if (!node->load_permutation.exists ())
        return false;
      for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
        {
          if (!load->load_permutation.exists ())
            return false;
          FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
            if (lidx != node->load_permutation[j])
              return false;
        }

      /* Check that the loads in the first sequence are different and there
         are no gaps between them.  */
      load_index = sbitmap_alloc (group_size);
      bitmap_clear (load_index);
      FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
        {
          if (bitmap_bit_p (load_index, lidx))
            {
              sbitmap_free (load_index);
              return false;
            }
          bitmap_set_bit (load_index, lidx);
        }
      for (i = 0; i < group_size; i++)
        if (!bitmap_bit_p (load_index, i))
          {
            sbitmap_free (load_index);
            return false;
          }
      sbitmap_free (load_index);

      /* This permutation is valid for reduction.  Since the order of the
         statements in the nodes is not important unless they are memory
         accesses, we can rearrange the statements in all the nodes
         according to the order of the loads.  */
      vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
                                node->load_permutation);

      /* We are done, no actual permutations need to be generated.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        SLP_TREE_LOAD_PERMUTATION (node).release ();
      return true;
    }

  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment complications.  */
  if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
    {
      /* Check that for every node in the instance the loads
         form a subchain.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          next_load = NULL;
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
            {
              if (j != 0 && next_load != load)
                return false;
              next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
            }
        }

      /* Check that the alignment of the first load in every subchain, i.e.,
         the first statement in every load node, is supported.
         ???  This belongs in alignment checking.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          first_load = SLP_TREE_SCALAR_STMTS (node)[0];
          if (first_load != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load)))
            {
              dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load));
              if (vect_supportable_dr_alignment (dr, false)
                  == dr_unaligned_unsupported)
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                       vect_location,
                                       "unsupported unaligned load ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        first_load, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  return false;
                }
            }
        }

      /* We are done, no actual permutations need to be generated.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        SLP_TREE_LOAD_PERMUTATION (node).release ();
      return true;
    }

  /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
     GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
     well (unless it's a reduction).  */
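  /* For example, with GROUP_SIZE 2 the two load nodes may have the
     permutations { 0, 0 } and { 1, 1 }, but not { 0, 1 }.  */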
  if (SLP_INSTANCE_LOADS (slp_instn).length () != group_size)
    return false;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (!node->load_permutation.exists ())
      return false;

  load_index = sbitmap_alloc (group_size);
  bitmap_clear (load_index);
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      unsigned int lidx = node->load_permutation[0];
      if (bitmap_bit_p (load_index, lidx))
        {
          sbitmap_free (load_index);
          return false;
        }
      bitmap_set_bit (load_index, lidx);
      FOR_EACH_VEC_ELT (node->load_permutation, j, k)
        if (k != lidx)
          {
            sbitmap_free (load_index);
            return false;
          }
    }
  for (i = 0; i < group_size; i++)
    if (!bitmap_bit_p (load_index, i))
      {
        sbitmap_free (load_index);
        return false;
      }
  sbitmap_free (load_index);

  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (node->load_permutation.exists ()
        && !vect_transform_slp_perm_load
              (node, vNULL, NULL,
               SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true))
      return false;
  return true;
}


/* Find the first load in the loop that belongs to INSTANCE.
   When loads are in several SLP nodes, there can be a case in which the first
   load does not appear in the first SLP node to be transformed, causing
   incorrect order of statements.  Since we generate all the loads together,
   they must be inserted before the first load of the SLP instance and not
   before the first load of the first node of the instance.  */

static gimple
vect_find_first_load_in_slp_instance (slp_instance instance)
{
  int i, j;
  slp_tree load_node;
  gimple first_load = NULL, load;

  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
      first_load = get_earlier_stmt (load, first_load);

  return first_load;
}


/* Find the last store in SLP INSTANCE.  */

static gimple
vect_find_last_store_in_slp_instance (slp_instance instance)
{
  int i;
  slp_tree node;
  gimple last_store = NULL, store;

  node = SLP_INSTANCE_TREE (instance);
  for (i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &store); i++)
    last_store = get_later_stmt (store, last_store);

  return last_store;
}

/* Compute the cost for the SLP node NODE in the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                         slp_instance instance, slp_tree node,
                         stmt_vector_for_cost *prologue_cost_vec,
                         unsigned ncopies_for_cost)
{
  stmt_vector_for_cost *body_cost_vec = &SLP_INSTANCE_BODY_COST_VEC (instance);

  unsigned i;
  slp_tree child;
  gimple stmt, s;
  stmt_vec_info stmt_info;
  tree lhs;
  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  /* Recurse down the SLP tree.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_analyze_slp_cost_1 (loop_vinfo, bb_vinfo,
                             instance, child, prologue_cost_vec,
                             ncopies_for_cost);

  /* Look at the first scalar stmt to determine the cost.  */
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
        vect_model_store_cost (stmt_info, ncopies_for_cost, false,
                               vect_uninitialized_def,
                               node, prologue_cost_vec, body_cost_vec);
      else
        {
          int i;
          gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
          vect_model_load_cost (stmt_info, ncopies_for_cost, false,
                                node, prologue_cost_vec, body_cost_vec);
          /* If the load is permuted record the cost for the permutation.
             ???  Loads from multiple chains are let through here only
             for a single special case involving complex numbers where
             in the end no permutation is necessary.  */
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, s)
            if ((STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo_for_stmt (s))
                 == STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info))
                && vect_get_place_in_interleaving_chain
                     (s, STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info)) != i)
              {
                record_stmt_cost (body_cost_vec, group_size, vec_perm,
                                  stmt_info, 0, vect_body);
                break;
              }
        }
    }
  else
    record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
                      stmt_info, 0, vect_body);

  /* Scan operands and account for prologue cost of constants/externals.
     ???  This over-estimates cost for multiple uses and should be
     re-engineered.  */
  lhs = gimple_get_lhs (stmt);
  for (i = 0; i < gimple_num_ops (stmt); ++i)
    {
      tree def, op = gimple_op (stmt, i);
      gimple def_stmt;
      enum vect_def_type dt;
      if (!op || op == lhs)
        continue;
      if (vect_is_simple_use (op, NULL, loop_vinfo, bb_vinfo,
                              &def_stmt, &def, &dt)
          && (dt == vect_constant_def || dt == vect_external_def))
        record_stmt_cost (prologue_cost_vec, 1, vector_stmt,
                          stmt_info, 0, vect_prologue);
    }
}

/* Compute the cost for the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                       slp_instance instance, unsigned nunits)
{
  stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
  unsigned ncopies_for_cost;
  stmt_info_for_cost *si;
  unsigned i;

1501   /* Calculate the number of vector stmts to create based on the unrolling
1502      factor: least_common_multiple (NUNITS, GROUP_SIZE) / NUNITS (this is
1503      GROUP_SIZE / NUNITS whenever NUNITS divides GROUP_SIZE).  */
1504   unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
1505   ncopies_for_cost = least_common_multiple (nunits, group_size) / nunits;
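  /* For example (illustrative, hypothetical target with V4SI vectors, so
     NUNITS == 4): a group of 6 stores gives
     least_common_multiple (4, 6) == 12, hence NCOPIES_FOR_COST
     == 12 / 4 == 3 vector stmts are costed.  */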
1506
1507   prologue_cost_vec.create (10);
1508   body_cost_vec.create (10);
1509   SLP_INSTANCE_BODY_COST_VEC (instance) = body_cost_vec;
1510   vect_analyze_slp_cost_1 (loop_vinfo, bb_vinfo,
1511                            instance, SLP_INSTANCE_TREE (instance),
1512                            &prologue_cost_vec, ncopies_for_cost);
1513
1514   /* Record the prologue costs, which were delayed until we were
1515      sure that SLP was successful.  Unlike the body costs, we know
1516      the final values now regardless of the loop vectorization factor.  */
1517   void *data = (loop_vinfo ? LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
1518                 : BB_VINFO_TARGET_COST_DATA (bb_vinfo));
1519   FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
1520     {
1521       struct _stmt_vec_info *stmt_info
1522         = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1523       (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1524                             si->misalign, vect_prologue);
1525     }
1526
1527   prologue_cost_vec.release ();
1528 }
1529
1530 /* Analyze an SLP instance starting from a group of grouped stores.  Call
1531    vect_build_slp_tree to build a tree of packed stmts if possible.
1532    Return FALSE if it's impossible to SLP any stmt in the loop.  */
1533
1534 static bool
1535 vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
1536                            gimple stmt, unsigned max_tree_size)
1537 {
1538   slp_instance new_instance;
1539   slp_tree node;
1540   unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
1541   unsigned int unrolling_factor = 1, nunits;
1542   tree vectype, scalar_type = NULL_TREE;
1543   gimple next;
1544   unsigned int vectorization_factor = 0;
1545   int i;
1546   unsigned int max_nunits = 0;
1547   vec<slp_tree> loads;
1548   struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
1549   vec<gimple> scalar_stmts;
1550
1551   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1552     {
1553       if (dr)
1554         {
1555           scalar_type = TREE_TYPE (DR_REF (dr));
1556           vectype = get_vectype_for_scalar_type (scalar_type);
1557         }
1558       else
1559         {
1560           gcc_assert (loop_vinfo);
1561           vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1562         }
1563
1564       group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
1565     }
1566   else
1567     {
1568       gcc_assert (loop_vinfo);
1569       vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1570       group_size = LOOP_VINFO_REDUCTIONS (loop_vinfo).length ();
1571     }
1572
1573   if (!vectype)
1574     {
1575       if (dump_enabled_p ())
1576         {
1577           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1578                            "Build SLP failed: unsupported data-type ");
1579           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
1580           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1581         }
1582
1583       return false;
1584     }
1585
1586   nunits = TYPE_VECTOR_SUBPARTS (vectype);
1587   if (loop_vinfo)
1588     vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1589   else
1590     vectorization_factor = nunits;
1591
1592   /* Calculate the unrolling factor.  */
1593   unrolling_factor = least_common_multiple (nunits, group_size) / group_size;
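  /* For example (illustrative, hypothetical V4SI target, NUNITS == 4):
     a group of two stmts fills only half a vector, so
     least_common_multiple (4, 2) / 2 == 2, i.e. two copies of the group
     are needed per vector iteration.  */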
1594   if (unrolling_factor != 1 && !loop_vinfo)
1595     {
1596       if (dump_enabled_p ())
1597         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1598                          "Build SLP failed: unrolling required in basic"
1599                          " block SLP\n");
1600
1601       return false;
1602     }
1603
1604   /* Create a node (a root of the SLP tree) for the packed grouped stores.  */
1605   scalar_stmts.create (group_size);
1606   next = stmt;
1607   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1608     {
1609       /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
1610       while (next)
1611         {
1612           if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
1613               && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
1614             scalar_stmts.safe_push (
1615                   STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
1616           else
1617             scalar_stmts.safe_push (next);
1618           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
1619         }
1620     }
1621   else
1622     {
1623       /* Collect reduction statements.  */
1624       vec<gimple> reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
1625       for (i = 0; reductions.iterate (i, &next); i++)
1626         scalar_stmts.safe_push (next);
1627     }
1628
1629   node = vect_create_new_slp_node (scalar_stmts);
1630
1631   loads.create (group_size);
1632
1633   /* Build the tree for the SLP instance.  */
1634   bool *matches = XALLOCAVEC (bool, group_size);
1635   unsigned npermutes = 0;
1636   if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &node, group_size,
1637                            &max_nunits, &loads,
1638                            vectorization_factor, matches, &npermutes, NULL,
1639                            max_tree_size))
1640     {
1641       /* Calculate the unrolling factor based on the smallest type.  */
1642       if (max_nunits > nunits)
1643         unrolling_factor = least_common_multiple (max_nunits, group_size)
1644                            / group_size;
1645
1646       if (unrolling_factor != 1 && !loop_vinfo)
1647         {
1648           if (dump_enabled_p ())
1649             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1650                              "Build SLP failed: unrolling required in basic"
1651                              " block SLP\n");
1652           vect_free_slp_tree (node);
1653           loads.release ();
1654           return false;
1655         }
1656
1657       /* Create a new SLP instance.  */
1658       new_instance = XNEW (struct _slp_instance);
1659       SLP_INSTANCE_TREE (new_instance) = node;
1660       SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
1661       SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
1662       SLP_INSTANCE_BODY_COST_VEC (new_instance) = vNULL;
1663       SLP_INSTANCE_LOADS (new_instance) = loads;
1664       SLP_INSTANCE_FIRST_LOAD_STMT (new_instance) = NULL;
1665
1666       /* Compute the load permutation.  */
1667       slp_tree load_node;
1668       bool loads_permuted = false;
1669       FOR_EACH_VEC_ELT (loads, i, load_node)
1670         {
1671           vec<unsigned> load_permutation;
1672           int j;
1673           gimple load, first_stmt;
1674           bool this_load_permuted = false;
1675           load_permutation.create (group_size);
1676           first_stmt = GROUP_FIRST_ELEMENT
1677               (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
1678           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
1679             {
1680               int load_place
1681                 = vect_get_place_in_interleaving_chain (load, first_stmt);
1682               gcc_assert (load_place != -1);
1683               if (load_place != j)
1684                 this_load_permuted = true;
1685               load_permutation.safe_push (load_place);
1686             }
1687           if (!this_load_permuted)
1688             {
1689               load_permutation.release ();
1690               continue;
1691             }
1692           SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
1693           loads_permuted = true;
1694         }
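      /* For example (illustrative): if the loads appear in the SLP node in
         the order a[1], a[0], a[3], a[2] relative to the first element of
         their interleaving chain, the recorded load permutation is
         {1, 0, 3, 2}.  */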
1695
1696       if (loads_permuted)
1697         {
1698           if (!vect_supported_load_permutation_p (new_instance))
1699             {
1700               if (dump_enabled_p ())
1701                 {
1702                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1703                                    "Build SLP failed: unsupported load "
1704                                    "permutation ");
1705                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
1706                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1707                 }
1708               vect_free_slp_instance (new_instance);
1709               return false;
1710             }
1711
1712           SLP_INSTANCE_FIRST_LOAD_STMT (new_instance)
1713             = vect_find_first_load_in_slp_instance (new_instance);
1714         }
1715
1716       /* Compute the costs of this SLP instance.  */
1717       vect_analyze_slp_cost (loop_vinfo, bb_vinfo,
1718                              new_instance, TYPE_VECTOR_SUBPARTS (vectype));
1719
1720       if (loop_vinfo)
1721         LOOP_VINFO_SLP_INSTANCES (loop_vinfo).safe_push (new_instance);
1722       else
1723         BB_VINFO_SLP_INSTANCES (bb_vinfo).safe_push (new_instance);
1724
1725       if (dump_enabled_p ())
1726         vect_print_slp_tree (MSG_NOTE, node);
1727
1728       return true;
1729     }
1730
1731   /* Failed to SLP.  */
1732   /* Free the allocated memory.  */
1733   vect_free_slp_tree (node);
1734   loads.release ();
1735
1736   return false;
1737 }
1738
1739
1740 /* Check if there are stmts in the loop that can be vectorized using SLP.
1741    Build SLP trees of packed scalar stmts if SLP is possible.  */
1742
1743 bool
1744 vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
1745                   unsigned max_tree_size)
1746 {
1747   unsigned int i;
1748   vec<gimple> grouped_stores;
1749   vec<gimple> reductions = vNULL;
1750   vec<gimple> reduc_chains = vNULL;
1751   gimple first_element;
1752   bool ok = false;
1753
1754   if (dump_enabled_p ())
1755     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===\n");
1756
1757   if (loop_vinfo)
1758     {
1759       grouped_stores = LOOP_VINFO_GROUPED_STORES (loop_vinfo);
1760       reduc_chains = LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo);
1761       reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
1762     }
1763   else
1764     grouped_stores = BB_VINFO_GROUPED_STORES (bb_vinfo);
1765
1766   /* Find SLP sequences starting from groups of grouped stores.  */
1767   FOR_EACH_VEC_ELT (grouped_stores, i, first_element)
1768     if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
1769                                    max_tree_size))
1770       ok = true;
1771
1772   if (bb_vinfo && !ok)
1773     {
1774       if (dump_enabled_p ())
1775         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1776                          "Failed to SLP the basic block.\n");
1777
1778       return false;
1779     }
1780
1781   if (loop_vinfo
1782       && LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).length () > 0)
1783     {
1784       /* Find SLP sequences starting from reduction chains.  */
1785       FOR_EACH_VEC_ELT (reduc_chains, i, first_element)
1786         if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
1787                                        max_tree_size))
1788           ok = true;
1789         else
1790           return false;
1791
1792       /* Don't try to vectorize SLP reductions if a reduction chain
1793          was detected.  */
1794       return ok;
1795     }
1796
1797   /* Find SLP sequences starting from groups of reductions.  */
1798   if (loop_vinfo && LOOP_VINFO_REDUCTIONS (loop_vinfo).length () > 1
1799       && vect_analyze_slp_instance (loop_vinfo, bb_vinfo, reductions[0],
1800                                     max_tree_size))
1801     ok = true;
1802
1803   return true;
1804 }
1805
1806
1807 /* For each possible SLP instance decide whether to SLP it and calculate the
1808    overall unrolling factor needed to SLP the loop.  Return TRUE if we decided
1809    to SLP at least one instance.  */
1810
1811 bool
1812 vect_make_slp_decision (loop_vec_info loop_vinfo)
1813 {
1814   unsigned int i, unrolling_factor = 1;
1815   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1816   slp_instance instance;
1817   int decided_to_slp = 0;
1818
1819   if (dump_enabled_p ())
1820     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ==="
1821                      "\n");
1822
1823   FOR_EACH_VEC_ELT (slp_instances, i, instance)
1824     {
1825       /* FORNOW: SLP if you can.  */
1826       if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance))
1827         unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance);
1828
1829       /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
1830          call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
1831          loop-based vectorization.  Such stmts will be marked as HYBRID.  */
1832       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
1833       decided_to_slp++;
1834     }
1835
1836   LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
1837
1838   if (decided_to_slp && dump_enabled_p ())
1839     dump_printf_loc (MSG_NOTE, vect_location,
1840                      "Decided to SLP %d instances. Unrolling factor %d\n",
1841                      decided_to_slp, unrolling_factor);
1842
1843   return (decided_to_slp > 0);
1844 }
1845
1846
1847 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
1848    can't be SLPed) in the tree rooted at NODE.  Mark such stmts as HYBRID.  */
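/* For example (illustrative): a load that is pure SLP but whose result also
   feeds a non-SLP reduction stmt in the loop must become hybrid, since it
   has to be available both to the SLP instance and to the loop-based
   vectorization of the reduction.  */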
1849
1850 static void
1851 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
1852 {
1853   gimple stmt = SLP_TREE_SCALAR_STMTS (node)[i];
1854   imm_use_iterator imm_iter;
1855   gimple use_stmt;
1856   stmt_vec_info use_vinfo, stmt_vinfo = vinfo_for_stmt (stmt);
1857   slp_tree child;
1858   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1859   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1860   int j;
1861
1862   /* Propagate hybrid down the SLP tree.  */
1863   if (stype == hybrid)
1864     ;
1865   else if (HYBRID_SLP_STMT (stmt_vinfo))
1866     stype = hybrid;
1867   else
1868     {
1869       /* Check if a pure SLP stmt has uses in non-SLP stmts.  */
1870       gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
1871       if (TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME)
1872         FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0))
1873           if (gimple_bb (use_stmt)
1874               && flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1875               && (use_vinfo = vinfo_for_stmt (use_stmt))
1876               && !STMT_SLP_TYPE (use_vinfo)
1877               && (STMT_VINFO_RELEVANT (use_vinfo)
1878                   || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo))
1879                   || (STMT_VINFO_IN_PATTERN_P (use_vinfo)
1880                       && STMT_VINFO_RELATED_STMT (use_vinfo)
1881                       && !STMT_SLP_TYPE (vinfo_for_stmt
1882                             (STMT_VINFO_RELATED_STMT (use_vinfo)))))
1883               && !(gimple_code (use_stmt) == GIMPLE_PHI
1884                    && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
1885             stype = hybrid;
1886     }
1887
1888   if (stype == hybrid)
1889     STMT_SLP_TYPE (stmt_vinfo) = hybrid;
1890
1891   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
1892     vect_detect_hybrid_slp_stmts (child, i, stype);
1893 }
1894
1895 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses.  */
1896
1897 static tree
1898 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
1899 {
1900   walk_stmt_info *wi = (walk_stmt_info *)data;
1901   struct loop *loopp = (struct loop *)wi->info;
1902
1903   if (wi->is_lhs)
1904     return NULL_TREE;
1905
1906   if (TREE_CODE (*tp) == SSA_NAME
1907       && !SSA_NAME_IS_DEFAULT_DEF (*tp))
1908     {
1909       gimple def_stmt = SSA_NAME_DEF_STMT (*tp);
1910       if (flow_bb_inside_loop_p (loopp, gimple_bb (def_stmt))
1911           && PURE_SLP_STMT (vinfo_for_stmt (def_stmt)))
1912         STMT_SLP_TYPE (vinfo_for_stmt (def_stmt)) = hybrid;
1913     }
1914
1915   return NULL_TREE;
1916 }
1917
1918 static tree
1919 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
1920                           walk_stmt_info *)
1921 {
1922   /* If the stmt is in an SLP instance then this isn't a reason
1923      to mark use definitions in other SLP instances as hybrid.  */
1924   if (STMT_SLP_TYPE (vinfo_for_stmt (gsi_stmt (*gsi))) != loop_vect)
1925     *handled = true;
1926   return NULL_TREE;
1927 }
1928
1929 /* Find stmts that must be both vectorized and SLPed.  */
1930
1931 void
1932 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
1933 {
1934   unsigned int i;
1935   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1936   slp_instance instance;
1937
1938   if (dump_enabled_p ())
1939     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ==="
1940                      "\n");
1941
1942   /* First walk all pattern stmts in the loop and mark the defs of their
1943      uses as hybrid, because immediate uses in them are not recorded.  */
1944   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
1945     {
1946       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
1947       for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
1948            gsi_next (&gsi))
1949         {
1950           gimple stmt = gsi_stmt (gsi);
1951           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1952           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
1953             {
1954               walk_stmt_info wi;
1955               memset (&wi, 0, sizeof (wi));
1956               wi.info = LOOP_VINFO_LOOP (loop_vinfo);
1957               gimple_stmt_iterator gsi2
1958                 = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
1959               walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
1960                                 vect_detect_hybrid_slp_1, &wi);
1961               walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
1962                                vect_detect_hybrid_slp_2,
1963                                vect_detect_hybrid_slp_1, &wi);
1964             }
1965         }
1966     }
1967
1968   /* Then walk the SLP instance trees marking stmts with uses in
1969      non-SLP stmts as hybrid, also propagating hybrid down the
1970      SLP tree, collecting the above info on-the-fly.  */
1971   FOR_EACH_VEC_ELT (slp_instances, i, instance)
1972     {
1973       for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
1974         vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
1975                                       i, pure_slp);
1976     }
1977 }
1978
1979
1980 /* Create and initialize a new bb_vec_info struct for BB, as well as
1981    stmt_vec_info structs for all the stmts in it.  */
1982
1983 static bb_vec_info
1984 new_bb_vec_info (basic_block bb)
1985 {
1986   bb_vec_info res = NULL;
1987   gimple_stmt_iterator gsi;
1988
1989   res = (bb_vec_info) xcalloc (1, sizeof (struct _bb_vec_info));
1990   BB_VINFO_BB (res) = bb;
1991
1992   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1993     {
1994       gimple stmt = gsi_stmt (gsi);
1995       gimple_set_uid (stmt, 0);
1996       set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, NULL, res));
1997     }
1998
1999   BB_VINFO_GROUPED_STORES (res).create (10);
2000   BB_VINFO_SLP_INSTANCES (res).create (2);
2001   BB_VINFO_TARGET_COST_DATA (res) = init_cost (NULL);
2002
2003   bb->aux = res;
2004   return res;
2005 }
2006
2007
2008 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2009    stmts in the basic block.  */
2010
2011 static void
2012 destroy_bb_vec_info (bb_vec_info bb_vinfo)
2013 {
2014   vec<slp_instance> slp_instances;
2015   slp_instance instance;
2016   basic_block bb;
2017   gimple_stmt_iterator si;
2018   unsigned i;
2019
2020   if (!bb_vinfo)
2021     return;
2022
2023   bb = BB_VINFO_BB (bb_vinfo);
2024
2025   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2026     {
2027       gimple stmt = gsi_stmt (si);
2028       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2029
2030       if (stmt_info)
2031         /* Free stmt_vec_info.  */
2032         free_stmt_vec_info (stmt);
2033     }
2034
2035   vect_destroy_datarefs (NULL, bb_vinfo);
2036   free_dependence_relations (BB_VINFO_DDRS (bb_vinfo));
2037   BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
2038   slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2039   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2040     vect_free_slp_instance (instance);
2041   BB_VINFO_SLP_INSTANCES (bb_vinfo).release ();
2042   destroy_cost_data (BB_VINFO_TARGET_COST_DATA (bb_vinfo));
2043   free (bb_vinfo);
2044   bb->aux = NULL;
2045 }
2046
2047
2048 /* Analyze statements contained in the SLP tree NODE after recursively
2049    analyzing the subtree.  Return TRUE if the operations are supported.  */
2050
2051 static bool
2052 vect_slp_analyze_node_operations (bb_vec_info bb_vinfo, slp_tree node)
2053 {
2054   bool dummy;
2055   int i;
2056   gimple stmt;
2057   slp_tree child;
2058
2059   if (!node)
2060     return true;
2061
2062   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2063     if (!vect_slp_analyze_node_operations (bb_vinfo, child))
2064       return false;
2065
2066   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2067     {
2068       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2069       gcc_assert (stmt_info);
2070       gcc_assert (PURE_SLP_STMT (stmt_info));
2071
2072       if (!vect_analyze_stmt (stmt, &dummy, node))
2073         return false;
2074     }
2075
2076   return true;
2077 }
2078
2079
2080 /* Analyze statements in the SLP instances of the basic block.  Return TRUE
2081    if the operations are supported.  */
2082
2083 static bool
2084 vect_slp_analyze_operations (bb_vec_info bb_vinfo)
2085 {
2086   vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2087   slp_instance instance;
2088   int i;
2089
2090   for (i = 0; slp_instances.iterate (i, &instance); )
2091     {
2092       if (!vect_slp_analyze_node_operations (bb_vinfo,
2093                                              SLP_INSTANCE_TREE (instance)))
2094         {
2095           vect_free_slp_instance (instance);
2096           slp_instances.ordered_remove (i);
2097         }
2098       else
2099         i++;
2100     }
2101
2102   if (!slp_instances.length ())
2103     return false;
2104
2105   return true;
2106 }
2107
2108
2109 /* Compute the scalar cost of the SLP node NODE and its children
2110    and return it.  Do not account for defs that are marked in LIFE;
2111    update LIFE according to the uses of NODE.  */
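/* For example (illustrative): if a def computed in NODE is also used by a
   stmt outside the vectorized basic block, the defining scalar stmt stays
   live; marking it in LIFE excludes it from the scalar cost, so the
   vectorization gets no credit for removing it.  */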
2112
2113 static unsigned
2114 vect_bb_slp_scalar_cost (basic_block bb,
2115                          slp_tree node, vec<bool, va_heap> *life)
2116 {
2117   unsigned scalar_cost = 0;
2118   unsigned i;
2119   gimple stmt;
2120   slp_tree child;
2121
2122   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2123     {
2124       unsigned stmt_cost;
2125       ssa_op_iter op_iter;
2126       def_operand_p def_p;
2127       stmt_vec_info stmt_info;
2128
2129       if ((*life)[i])
2130         continue;
2131
2132       /* If there is a non-vectorized use of the defs then the scalar
2133          stmt is kept live, in which case we do not account for it (or
2134          for any required defs in the SLP children) in the scalar cost.
2135          This makes the vectorization more costly when compared to
2136          the scalar cost.  */
2137       FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2138         {
2139           imm_use_iterator use_iter;
2140           gimple use_stmt;
2141           FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2142             if (!is_gimple_debug (use_stmt)
2143                 && (gimple_code (use_stmt) == GIMPLE_PHI
2144                     || gimple_bb (use_stmt) != bb
2145                     || !STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (use_stmt))))
2146               {
2147                 (*life)[i] = true;
2148                 BREAK_FROM_IMM_USE_STMT (use_iter);
2149               }
2150         }
2151       if ((*life)[i])
2152         continue;
2153
2154       stmt_info = vinfo_for_stmt (stmt);
2155       if (STMT_VINFO_DATA_REF (stmt_info))
2156         {
2157           if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2158             stmt_cost = vect_get_stmt_cost (scalar_load);
2159           else
2160             stmt_cost = vect_get_stmt_cost (scalar_store);
2161         }
2162       else
2163         stmt_cost = vect_get_stmt_cost (scalar_stmt);
2164
2165       scalar_cost += stmt_cost;
2166     }
2167
2168   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2169     scalar_cost += vect_bb_slp_scalar_cost (bb, child, life);
2170
2171   return scalar_cost;
2172 }
2173
2174 /* Check if vectorization of the basic block is profitable.  */
2175
2176 static bool
2177 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2178 {
2179   vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2180   slp_instance instance;
2181   int i, j;
2182   unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2183   unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2184   void *target_cost_data = BB_VINFO_TARGET_COST_DATA (bb_vinfo);
2185   stmt_vec_info stmt_info = NULL;
2186   stmt_vector_for_cost body_cost_vec;
2187   stmt_info_for_cost *ci;
2188
2189   /* Calculate vector costs.  */
2190   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2191     {
2192       body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
2193
2194       FOR_EACH_VEC_ELT (body_cost_vec, j, ci)
2195         {
2196           stmt_info = ci->stmt ? vinfo_for_stmt (ci->stmt) : NULL;
2197           (void) add_stmt_cost (target_cost_data, ci->count, ci->kind,
2198                                 stmt_info, ci->misalign, vect_body);
2199         }
2200     }
2201
2202   /* Calculate scalar cost.  */
2203   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2204     {
2205       auto_vec<bool, 20> life;
2206       life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
2207       scalar_cost += vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2208                                               SLP_INSTANCE_TREE (instance),
2209                                               &life);
2210     }
2211
2212   /* Complete the target-specific cost calculation.  */
2213   finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2214                &vec_inside_cost, &vec_epilogue_cost);
2215
2216   vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2217
2218   if (dump_enabled_p ())
2219     {
2220       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2221       dump_printf (MSG_NOTE, "  Vector inside of basic block cost: %d\n",
2222                    vec_inside_cost);
2223       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n", vec_prologue_cost);
2224       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n", vec_epilogue_cost);
2225       dump_printf (MSG_NOTE, "  Scalar cost of basic block: %d\n", scalar_cost);
2226     }
2227
2228   /* Vectorization is profitable if its cost is less than the cost of the
2229      scalar version.  */
2230   if (vec_outside_cost + vec_inside_cost >= scalar_cost)
2231     return false;
2232
2233   return true;
2234 }
2235
2236 /* Check if the basic block can be vectorized.  */
2237
2238 static bb_vec_info
2239 vect_slp_analyze_bb_1 (basic_block bb)
2240 {
2241   bb_vec_info bb_vinfo;
2242   vec<slp_instance> slp_instances;
2243   slp_instance instance;
2244   int i;
2245   int min_vf = 2;
2246   unsigned n_stmts = 0;
2247
2248   bb_vinfo = new_bb_vec_info (bb);
2249   if (!bb_vinfo)
2250     return NULL;
2251
2252   if (!vect_analyze_data_refs (NULL, bb_vinfo, &min_vf, &n_stmts))
2253     {
2254       if (dump_enabled_p ())
2255         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2256                          "not vectorized: unhandled data-ref in basic "
2257                          "block.\n");
2258
2259       destroy_bb_vec_info (bb_vinfo);
2260       return NULL;
2261     }
2262
2263   if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
2264     {
2265       if (dump_enabled_p ())
2266         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2267                          "not vectorized: not enough data-refs in "
2268                          "basic block.\n");
2269
2270       destroy_bb_vec_info (bb_vinfo);
2271       return NULL;
2272     }
2273
2274   if (!vect_analyze_data_ref_accesses (NULL, bb_vinfo))
2275     {
2276       if (dump_enabled_p ())
2277         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2278                          "not vectorized: unhandled data access in "
2279                          "basic block.\n");
2280
2281       destroy_bb_vec_info (bb_vinfo);
2282       return NULL;
2283     }
2284
2285   vect_pattern_recog (NULL, bb_vinfo);
2286
2287   if (!vect_analyze_data_refs_alignment (NULL, bb_vinfo))
2288     {
2289       if (dump_enabled_p ())
2290         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2291                          "not vectorized: bad data alignment in basic "
2292                          "block.\n");
2293
2294       destroy_bb_vec_info (bb_vinfo);
2295       return NULL;
2296     }
2297
2298   /* Check the SLP opportunities in the basic block, analyze and build SLP
2299      trees.  */
2300   if (!vect_analyze_slp (NULL, bb_vinfo, n_stmts))
2301     {
2302       if (dump_enabled_p ())
2303         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
2304                          "not vectorized: failed to find SLP opportunities "
2305                          "in basic block.\n");
2306
2307       destroy_bb_vec_info (bb_vinfo);
2308       return NULL;
2309     }
2310
2311   slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2312
2313   /* Mark all the statements that we want to vectorize as pure SLP and
2314      relevant.  */
2315   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2316     {
2317       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2318       vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
2319     }
2320
2321   /* Mark all the statements that we do not want to vectorize.  */
2322   for (gimple_stmt_iterator gsi = gsi_start_bb (BB_VINFO_BB (bb_vinfo));
2323        !gsi_end_p (gsi); gsi_next (&gsi))
2324     {
2325       stmt_vec_info vinfo = vinfo_for_stmt (gsi_stmt (gsi));
2326       if (STMT_SLP_TYPE (vinfo) != pure_slp)
2327         STMT_VINFO_VECTORIZABLE (vinfo) = false;
2328     }
2329
2330   /* Analyze dependences.  At this point all stmts not participating in
2331      vectorization have to be marked.  Dependence analysis assumes
2332      that we either vectorize all SLP instances or none at all.  */
2333   if (!vect_slp_analyze_data_ref_dependences (bb_vinfo))
2334     {
2335       if (dump_enabled_p ())
2336         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2337                          "not vectorized: unhandled data dependence "
2338                          "in basic block.\n");
2339
2340       destroy_bb_vec_info (bb_vinfo);
2341       return NULL;
2342     }
2343
2344   if (!vect_verify_datarefs_alignment (NULL, bb_vinfo))
2345     {
2346       if (dump_enabled_p ())
2347         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2348                          "not vectorized: unsupported alignment in basic "
2349                          "block.\n");
2350       destroy_bb_vec_info (bb_vinfo);
2351       return NULL;
2352     }
2353
2354   if (!vect_slp_analyze_operations (bb_vinfo))
2355     {
2356       if (dump_enabled_p ())
2357         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2358                          "not vectorized: bad operation in basic block.\n");
2359
2360       destroy_bb_vec_info (bb_vinfo);
2361       return NULL;
2362     }
2363
2364   /* Cost model: check if the vectorization is worthwhile.  */
2365   if (!unlimited_cost_model (NULL)
2366       && !vect_bb_vectorization_profitable_p (bb_vinfo))
2367     {
2368       if (dump_enabled_p ())
2369         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2370                          "not vectorized: vectorization is not "
2371                          "profitable.\n");
2372
2373       destroy_bb_vec_info (bb_vinfo);
2374       return NULL;
2375     }
2376
2377   if (dump_enabled_p ())
2378     dump_printf_loc (MSG_NOTE, vect_location,
2379                      "Basic block will be vectorized using SLP\n");
2380
2381   return bb_vinfo;
2382 }
2383
2384
2385 bb_vec_info
2386 vect_slp_analyze_bb (basic_block bb)
2387 {
2388   bb_vec_info bb_vinfo;
2389   int insns = 0;
2390   gimple_stmt_iterator gsi;
2391   unsigned int vector_sizes;
2392
2393   if (dump_enabled_p ())
2394     dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");
2395
2396   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2397     {
2398       gimple stmt = gsi_stmt (gsi);
2399       if (!is_gimple_debug (stmt)
2400           && !gimple_nop_p (stmt)
2401           && gimple_code (stmt) != GIMPLE_LABEL)
2402         insns++;
2403     }
2404
2405   if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2406     {
2407       if (dump_enabled_p ())
2408         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2409                          "not vectorized: too many instructions in "
2410                          "basic block.\n");
2411
2412       return NULL;
2413     }
2414
2415   /* Autodetect the first vector size we try.  */
2416   current_vector_size = 0;
2417   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
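  /* VECTOR_SIZES is a bitmask of the vector sizes (in bytes) the target
     supports; e.g. a hypothetical target with 16- and 32-byte vectors
     yields 32 | 16 == 0x30 (illustrative).  */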
2418
2419   while (1)
2420     {
2421       bb_vinfo = vect_slp_analyze_bb_1 (bb);
2422       if (bb_vinfo)
2423         return bb_vinfo;
2424
2425       destroy_bb_vec_info (bb_vinfo);
2426
2427       vector_sizes &= ~current_vector_size;
2428       if (vector_sizes == 0
2429           || current_vector_size == 0)
2430         return NULL;
2431
2432       /* Try the largest remaining vector size.  */
2433       current_vector_size = 1 << floor_log2 (vector_sizes);
2434       if (dump_enabled_p ())
2435         dump_printf_loc (MSG_NOTE, vect_location,
2436                          "***** Re-trying analysis with "
2437                          "vector size %d\n", current_vector_size);
2438     }
2439 }
2440
2441
2442 /* SLP costs are calculated according to the SLP instance unrolling factor
2443    (i.e., the number of created vector stmts depends on the unrolling factor).
2444    However, the actual number of vector stmts for every SLP node depends on
2445    VF which is set later in vect_analyze_operations ().  Hence, SLP costs
2446    should be updated.  In this function we assume that the inside costs
2447    calculated in vect_model_xxx_cost are linear in ncopies.  */
2448
2449 void
2450 vect_update_slp_costs_according_to_vf (loop_vec_info loop_vinfo)
2451 {
2452   unsigned int i, j, vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2453   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2454   slp_instance instance;
2455   stmt_vector_for_cost body_cost_vec;
2456   stmt_info_for_cost *si;
2457   void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2458
2459   if (dump_enabled_p ())
2460     dump_printf_loc (MSG_NOTE, vect_location,
2461                      "=== vect_update_slp_costs_according_to_vf ===\n");
2462
2463   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2464     {
2465       /* We assume that costs are linear in ncopies.  */
2466       int ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (instance);
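      /* For example (illustrative): with VF == 8 and an instance unrolling
         factor of 2, NCOPIES == 4 and each recorded body cost entry is
         accounted four times below.  */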
2467
2468       /* Record the instance's instructions in the target cost model.
2469          This was delayed until here because the count of instructions
2470          isn't known beforehand.  */
2471       body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
2472
2473       FOR_EACH_VEC_ELT (body_cost_vec, j, si)
2474         (void) add_stmt_cost (data, si->count * ncopies, si->kind,
2475                               vinfo_for_stmt (si->stmt), si->misalign,
2476                               vect_body);
2477     }
2478 }
2479
2480
2481 /* For constant and loop invariant defs of SLP_NODE this function returns
2482    (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
2483    OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
2484    scalar stmts.  NUMBER_OF_VECTORS is the number of vector defs to create.
2485    REDUC_INDEX is the index of the reduction operand in the statements, unless
2486    it is -1.  */
2487
2488 static void
2489 vect_get_constant_vectors (tree op, slp_tree slp_node,
2490                            vec<tree> *vec_oprnds,
2491                            unsigned int op_num, unsigned int number_of_vectors,
2492                            int reduc_index)
2493 {
2494   vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2495   gimple stmt = stmts[0];
2496   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2497   unsigned nunits;
2498   tree vec_cst;
2499   tree *elts;
2500   unsigned j, number_of_places_left_in_vector;
2501   tree vector_type;
2502   tree vop;
2503   int group_size = stmts.length ();
2504   unsigned int vec_num, i;
2505   unsigned number_of_copies = 1;
2506   vec<tree> voprnds;
2507   voprnds.create (number_of_vectors);
2508   bool constant_p, is_store;
2509   tree neutral_op = NULL;
2510   enum tree_code code = gimple_expr_code (stmt);
2511   gimple def_stmt;
2512   struct loop *loop;
2513   gimple_seq ctor_seq = NULL;
2514
2515   if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
2516       && reduc_index != -1)
2517     {
2518       op_num = reduc_index - 1;
2519       op = gimple_op (stmt, reduc_index);
2520       /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
2521          we need either neutral operands or the original operands.  See
2522          get_initial_def_for_reduction() for details.  */
2523       switch (code)
2524         {
2525           case WIDEN_SUM_EXPR:
2526           case DOT_PROD_EXPR:
2527           case PLUS_EXPR:
2528           case MINUS_EXPR:
2529           case BIT_IOR_EXPR:
2530           case BIT_XOR_EXPR:
2531              if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2532                neutral_op = build_real (TREE_TYPE (op), dconst0);
2533              else
2534                neutral_op = build_int_cst (TREE_TYPE (op), 0);
2535
2536              break;
2537
2538           case MULT_EXPR:
2539              if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2540                neutral_op = build_real (TREE_TYPE (op), dconst1);
2541              else
2542                neutral_op = build_int_cst (TREE_TYPE (op), 1);
2543
2544              break;
2545
2546           case BIT_AND_EXPR:
2547             neutral_op = build_int_cst (TREE_TYPE (op), -1);
2548             break;
2549
2550           /* For MIN/MAX we don't have an easy neutral operand but
2551              the initial values can be used fine here.  Only for
2552              a reduction chain do we have to force a neutral element.  */
2553           case MAX_EXPR:
2554           case MIN_EXPR:
2555             if (!GROUP_FIRST_ELEMENT (stmt_vinfo))
2556               neutral_op = NULL;
2557             else
2558               {
2559                 def_stmt = SSA_NAME_DEF_STMT (op);
2560                 loop = (gimple_bb (stmt))->loop_father;
2561                 neutral_op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2562                                                     loop_preheader_edge (loop));
2563               }
2564             break;
2565
2566           default:
2567             neutral_op = NULL;
2568         }
2569     }
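  /* For example (illustrative): a PLUS_EXPR reduction on doubles gets
     NEUTRAL_OP == 0.0 and a MULT_EXPR reduction on ints gets 1; the extra
     unrolled copies of the initial vector are filled with that value.  */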
2570
2571   if (STMT_VINFO_DATA_REF (stmt_vinfo))
2572     {
2573       is_store = true;
2574       op = gimple_assign_rhs1 (stmt);
2575     }
2576   else
2577     is_store = false;
2578
2579   gcc_assert (op);
2580
2581   if (CONSTANT_CLASS_P (op))
2582     constant_p = true;
2583   else
2584     constant_p = false;
2585
2586   vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
2587   gcc_assert (vector_type);
2588   nunits = TYPE_VECTOR_SUBPARTS (vector_type);
2589
2590   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
2591      created vectors. It is greater than 1 if unrolling is performed.
2592
2593      For example, we have two scalar operands, s1 and s2 (e.g., group of
2594      strided accesses of size two), while NUNITS is four (i.e., four scalars
2595      of this type can be packed in a vector).  The output vector will contain
2596      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
2597      will be 2).
2598
2599      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
2600      containing the operands.
2601
2602      For example, NUNITS is four as before, and the group size is 8
2603      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
2604      {s5, s6, s7, s8}.  */
2605
2606   number_of_copies = least_common_multiple (nunits, group_size) / group_size;
2607
2608   number_of_places_left_in_vector = nunits;
2609   elts = XALLOCAVEC (tree, nunits);
2610   for (j = 0; j < number_of_copies; j++)
2611     {
2612       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
2613         {
2614           if (is_store)
2615             op = gimple_assign_rhs1 (stmt);
2616           else
2617             {
2618               switch (code)
2619                 {
2620                   case COND_EXPR:
2621                     if (op_num == 0 || op_num == 1)
2622                       {
2623                         tree cond = gimple_assign_rhs1 (stmt);
2624                         op = TREE_OPERAND (cond, op_num);
2625                       }
2626                     else
2627                       {
2628                         if (op_num == 2)
2629                           op = gimple_assign_rhs2 (stmt);
2630                         else
2631                           op = gimple_assign_rhs3 (stmt);
2632                       }
2633                     break;
2634
2635                   case CALL_EXPR:
2636                     op = gimple_call_arg (stmt, op_num);
2637                     break;
2638
2639                   case LSHIFT_EXPR:
2640                   case RSHIFT_EXPR:
2641                   case LROTATE_EXPR:
2642                   case RROTATE_EXPR:
2643                     op = gimple_op (stmt, op_num + 1);
2644                     /* Unlike the other binary operators, shifts/rotates have
2645                        the shift count being int, instead of the same type as
2646                        the lhs, so make sure the scalar is the right type if
2647                        we are dealing with vectors of
2648                        long long/long/short/char.  */
2649                     if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
2650                       op = fold_convert (TREE_TYPE (vector_type), op);
2651                     break;
2652
2653                   default:
2654                     op = gimple_op (stmt, op_num + 1);
2655                     break;
2656                 }
2657             }
2658
2659           if (reduc_index != -1)
2660             {
2661               loop = (gimple_bb (stmt))->loop_father;
2662               def_stmt = SSA_NAME_DEF_STMT (op);
2663
2664               gcc_assert (loop);
2665
2666               /* Get the def before the loop.  In a reduction chain we
2667                  have only one initial value.  */
2668               if ((j != (number_of_copies - 1)
2669                    || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2670                        && i != 0))
2671                   && neutral_op)
2672                 op = neutral_op;
2673               else
2674                 op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2675                                             loop_preheader_edge (loop));
2676             }
2677
2678           /* Create 'vect_ = {op0,op1,...,opn}'.  */
2679           number_of_places_left_in_vector--;
2680           if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
2681             {
2682               if (CONSTANT_CLASS_P (op))
2683                 {
2684                   op = fold_unary (VIEW_CONVERT_EXPR,
2685                                    TREE_TYPE (vector_type), op);
2686                   gcc_assert (op && CONSTANT_CLASS_P (op));
2687                 }
2688               else
2689                 {
2690                   tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
2691                   gimple init_stmt;
2692                   op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type), op);
2693                   init_stmt
2694                     = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR, op);
2695                   gimple_seq_add_stmt (&ctor_seq, init_stmt);
2696                   op = new_temp;
2697                 }
2698             }
2699           elts[number_of_places_left_in_vector] = op;
2700           if (!CONSTANT_CLASS_P (op))
2701             constant_p = false;
2702
2703           if (number_of_places_left_in_vector == 0)
2704             {
2705               number_of_places_left_in_vector = nunits;
2706
2707               if (constant_p)
2708                 vec_cst = build_vector (vector_type, elts);
2709               else
2710                 {
2711                   vec<constructor_elt, va_gc> *v;
2712                   unsigned k;
2713                   vec_alloc (v, nunits);
2714                   for (k = 0; k < nunits; ++k)
2715                     CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
2716                   vec_cst = build_constructor (vector_type, v);
2717                 }
2718               voprnds.quick_push (vect_init_vector (stmt, vec_cst,
2719                                                     vector_type, NULL));
2720               if (ctor_seq != NULL)
2721                 {
2722                   gimple init_stmt = SSA_NAME_DEF_STMT (voprnds.last ());
2723                   gimple_stmt_iterator gsi = gsi_for_stmt (init_stmt);
2724                   gsi_insert_seq_before_without_update (&gsi, ctor_seq,
2725                                                         GSI_SAME_STMT);
2726                   ctor_seq = NULL;
2727                 }
2728             }
2729         }
2730     }
2731
2732   /* Since the vectors are created in reverse order, reverse them here
2733      to restore the original order.  */
2734   vec_num = voprnds.length ();
2735   for (j = vec_num; j != 0; j--)
2736     {
2737       vop = voprnds[j - 1];
2738       vec_oprnds->quick_push (vop);
2739     }
2740
2741   voprnds.release ();
2742
2743   /* If VF is greater than the unrolling factor needed for the SLP group
2744      of stmts, the NUMBER_OF_VECTORS to be created is greater than
2745      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we
2746      have to replicate the vectors.  */
2747   while (number_of_vectors > vec_oprnds->length ())
2748     {
2749       tree neutral_vec = NULL;
2750
2751       if (neutral_op)
2752         {
2753           if (!neutral_vec)
2754             neutral_vec = build_vector_from_val (vector_type, neutral_op);
2755
2756           vec_oprnds->quick_push (neutral_vec);
2757         }
2758       else
2759         {
2760           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
2761             vec_oprnds->quick_push (vop);
2762         }
2763     }
2764 }
2765
2766
2767 /* Get the vectorized definitions from SLP_NODE, which contains the
2768    corresponding vectorized def-stmts.  */
2769
2770 static void
2771 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
2772 {
2773   tree vec_oprnd;
2774   gimple vec_def_stmt;
2775   unsigned int i;
2776
2777   gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
2778
2779   FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
2780     {
2781       gcc_assert (vec_def_stmt);
2782       vec_oprnd = gimple_get_lhs (vec_def_stmt);
2783       vec_oprnds->quick_push (vec_oprnd);
2784     }
2785 }
2786
2787
2788 /* Get vectorized definitions for SLP_NODE.
2789    If the scalar definitions are loop invariants or constants, collect them and
2790    call vect_get_constant_vectors() to create vector stmts.
2791    Otherwise, the def-stmts must already be vectorized and the vectorized stmts
2792    must be stored in the corresponding child of SLP_NODE, and we call
2793    vect_get_slp_vect_defs () to retrieve them.  */
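/* For example (illustrative): for an SLP node of stmts "c = a + b" where
   the A operands are defined by another SLP node and the B operands are
   loop-invariant, the A vectors are fetched from the child node while the
   B vectors are built by vect_get_constant_vectors ().  */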
2794
2795 void
2796 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
2797                    vec<vec<tree> > *vec_oprnds, int reduc_index)
2798 {
2799   gimple first_stmt;
2800   int number_of_vects = 0, i;
2801   unsigned int child_index = 0;
2802   HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
2803   slp_tree child = NULL;
2804   vec<tree> vec_defs;
2805   tree oprnd;
2806   bool vectorized_defs;
2807
2808   first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
2809   FOR_EACH_VEC_ELT (ops, i, oprnd)
2810     {
2811       /* For each operand we check if it has vectorized definitions in a child
2812          node or we need to create them (for invariants and constants).  We
2813          check if the LHS of the first stmt of the next child matches OPRND.
2814          If it does, we found the correct child.  Otherwise, we call
2815          vect_get_constant_vectors () and do not advance CHILD_INDEX, in
2816          order to check this child node for the next operand.  */
2817       vectorized_defs = false;
2818       if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
2819         {
2820           child = SLP_TREE_CHILDREN (slp_node)[child_index];
2821
2822           /* We have to check both pattern and original def, if available.  */
2823           gimple first_def = SLP_TREE_SCALAR_STMTS (child)[0];
2824           gimple related = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
2825
2826           if (operand_equal_p (oprnd, gimple_get_lhs (first_def), 0)
2827               || (related
2828                   && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
2829             {
2830               /* The number of vector defs is determined by the number of
2831                  vector statements in the node from which we get those
2832                  statements.  */
2833               number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
2834               vectorized_defs = true;
2835               child_index++;
2836             }
2837         }
2838
2839       if (!vectorized_defs)
2840         {
2841           if (i == 0)
2842             {
2843               number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
2844               /* The number of vector stmts was calculated according to the
2845                  LHS in vect_schedule_slp_instance (); fix it by replacing
2846                  the LHS with the RHS, if necessary.  See
2847                  vect_get_smallest_scalar_type () for details.  */
2848               vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
2849                                              &rhs_size_unit);
2850               if (rhs_size_unit != lhs_size_unit)
2851                 {
2852                   number_of_vects *= rhs_size_unit;
2853                   number_of_vects /= lhs_size_unit;
2854                 }
2855             }
2856         }
2857
2858       /* Allocate memory for vectorized defs.  */
2859       vec_defs = vNULL;
2860       vec_defs.create (number_of_vects);
2861
2862       /* For reduction defs we call vect_get_constant_vectors (), since we are
2863          looking for initial loop invariant values.  */
2864       if (vectorized_defs && reduc_index == -1)
2865         /* The defs are already vectorized.  */
2866         vect_get_slp_vect_defs (child, &vec_defs);
2867       else
2868         /* Build vectors from scalar defs.  */
2869         vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
2870                                    number_of_vects, reduc_index);
2871
2872       vec_oprnds->quick_push (vec_defs);
2873
2874       /* For reductions, we only need initial values.  */
2875       if (reduc_index != -1)
2876         return;
2877     }
2878 }
2879
2880
2881 /* Create NCOPIES permutation statements using the vector mask MASK and two
2882    input vectors placed in DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX
2883    for the first copy, shifting by STRIDE elements of DR_CHAIN for every
2884    further copy.
2885    (STRIDE is the number of vectorized stmts for NODE divided by the number
2886    of copies.)
2887    VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE
2888    where the created stmts must be inserted.  */
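/* For example (illustrative): with SLP_TREE_NUMBER_OF_VEC_STMTS == 4 and
   NCOPIES == 2, STRIDE == 2; the first copy permutes
   DR_CHAIN[FIRST_VEC_INDX] with DR_CHAIN[SECOND_VEC_INDX], and the second
   copy uses the vectors two positions further down DR_CHAIN.  */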
2889
2890 static inline void
2891 vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
2892                            tree mask, int first_vec_indx, int second_vec_indx,
2893                            gimple_stmt_iterator *gsi, slp_tree node,
2894                            tree vectype, vec<tree> dr_chain,
2895                            int ncopies, int vect_stmts_counter)
2896 {
2897   tree perm_dest;
2898   gimple perm_stmt = NULL;
2899   stmt_vec_info next_stmt_info;
2900   int i, stride;
2901   tree first_vec, second_vec, data_ref;
2902
2903   stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
2904
2905   /* Initialize the vect stmts of NODE to properly insert the generated
2906      stmts later.  */
2907   for (i = SLP_TREE_VEC_STMTS (node).length ();
2908        i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
2909     SLP_TREE_VEC_STMTS (node).quick_push (NULL);
2910
2911   perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
2912   for (i = 0; i < ncopies; i++)
2913     {
2914       first_vec = dr_chain[first_vec_indx];
2915       second_vec = dr_chain[second_vec_indx];
2916
2917       /* Generate the permute statement.  */
2918       perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
2919                                        first_vec, second_vec, mask);
2920       data_ref = make_ssa_name (perm_dest, perm_stmt);
2921       gimple_set_lhs (perm_stmt, data_ref);
2922       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
2923
2924       /* Store the vector statement in NODE.  */
2925       SLP_TREE_VEC_STMTS (node)[stride * i + vect_stmts_counter] = perm_stmt;
2926
2927       first_vec_indx += stride;
2928       second_vec_indx += stride;
2929     }
2930
2931   /* Mark the scalar stmt as vectorized.  */
2932   next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
2933   STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
2934 }
2935
2936
2937 /* Given FIRST_MASK_ELEMENT - the mask element in element representation,
2938    return in CURRENT_MASK_ELEMENT its equivalent in target-specific
2939    representation.  Check that the mask is valid and return FALSE if not.
2940    Return TRUE in NEED_NEXT_VECTOR if the permutation requires moving to
2941    the next vector, i.e., the current first vector is not needed.  */
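/* For example (illustrative): with MASK_NUNITS == 4 the two input vectors
   provide elements 0..7; FIRST_MASK_ELEMENT == 4 and M == 2 select element
   6, i.e. element 2 of the second vector, so the first vector alone would
   not suffice.  */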
2942
2943 static bool
2944 vect_get_mask_element (gimple stmt, int first_mask_element, int m,
2945                        int mask_nunits, bool only_one_vec, int index,
2946                        unsigned char *mask, int *current_mask_element,
2947                        bool *need_next_vector, int *number_of_mask_fixes,
2948                        bool *mask_fixed, bool *needs_first_vector)
2949 {
2950   int i;
2951
2952   /* Convert to target specific representation.  */
2953   *current_mask_element = first_mask_element + m;
2954   /* Adjust the value in case it's a mask for the second and third vectors.  */
2955   *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1);
2956
2957   if (*current_mask_element < mask_nunits)
2958     *needs_first_vector = true;
2959
2960   /* We have only one input vector to permute but the mask accesses values in
2961      the next vector as well.  */
2962   if (only_one_vec && *current_mask_element >= mask_nunits)
2963     {
2964       if (dump_enabled_p ())
2965         {
2966           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2967                            "permutation requires at least two vectors ");
2968           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2969           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2970         }
2971
2972       return false;
2973     }
2974
2975   /* The mask requires the next vector.  */
2976   while (*current_mask_element >= mask_nunits * 2)
2977     {
2978       if (*needs_first_vector || *mask_fixed)
2979         {
2980           /* We either need the first vector too or have already moved to the
2981              next vector. In both cases, this permutation needs three
2982              vectors.  */
2983           if (dump_enabled_p ())
2984             {
2985               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2986                                "permutation requires at "
2987                                "least three vectors ");
2988               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2989               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2990             }
2991
2992           return false;
2993         }
2994
2995       /* We move to the next vector, dropping the first one and working with
2996          the second and the third, so we need to adjust the values of the
2997          mask accordingly.  */
2998       *current_mask_element -= mask_nunits * *number_of_mask_fixes;
2999
3000       for (i = 0; i < index; i++)
3001         mask[i] -= mask_nunits * *number_of_mask_fixes;
3002
3003       (*number_of_mask_fixes)++;
3004       *mask_fixed = true;
3005     }
3006
3007   *need_next_vector = *mask_fixed;
3008
3009   /* This was the last element of this mask. Start a new one.  */
3010   if (index == mask_nunits - 1)
3011     {
3012       *number_of_mask_fixes = 1;
3013       *mask_fixed = false;
3014       *needs_first_vector = false;
3015     }
3016
3017   return true;
3018 }
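
/* Illustrative sketch (not part of GCC; the standalone form and names
   are hypothetical): the renumbering done by the while loop above.  A
   raw element index >= 2 * NUNITS cannot be encoded in a two-operand
   VEC_PERM_EXPR, so the window of input vectors is shifted and all mask
   elements emitted so far are lowered to match.  */

static int
sketch_shift_mask_window (int current, int nunits, unsigned char *mask,
                          int emitted, int *number_of_mask_fixes)
{
  int i;

  while (current >= 2 * nunits)
    {
      /* Drop the first vector of the current window.  */
      current -= nunits * *number_of_mask_fixes;
      for (i = 0; i < emitted; i++)
        mask[i] -= nunits * *number_of_mask_fixes;
      (*number_of_mask_fixes)++;
    }

  return current;  /* Now representable with two input vectors.  */
}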
3019
3020
3021 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3022    If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3023    permute statements for the SLP node NODE of the SLP instance
3024    SLP_NODE_INSTANCE.  */
3025
3026 bool
3027 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
3028                               gimple_stmt_iterator *gsi, int vf,
3029                               slp_instance slp_node_instance, bool analyze_only)
3030 {
3031   gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3032   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3033   tree mask_element_type = NULL_TREE, mask_type;
3034   int i, j, k, nunits, vec_index = 0, scalar_index;
3035   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3036   gimple next_scalar_stmt;
3037   int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
3038   int first_mask_element;
3039   int index, unroll_factor, current_mask_element, ncopies;
3040   unsigned char *mask;
3041   bool only_one_vec = false, need_next_vector = false;
3042   int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
3043   int number_of_mask_fixes = 1;
3044   bool mask_fixed = false;
3045   bool needs_first_vector = false;
3046   machine_mode mode;
3047
3048   mode = TYPE_MODE (vectype);
3049
3050   if (!can_vec_perm_p (mode, false, NULL))
3051     {
3052       if (dump_enabled_p ())
3053         {
3054           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3055                            "no vect permute for ");
3056           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3057           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3058         }
3059       return false;
3060     }
3061
3062   /* The generic VEC_PERM_EXPR code always uses an integral type of the
3063      same size as the vector element being permuted.  */
3064   mask_element_type = lang_hooks.types.type_for_mode
3065                 (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))), 1);
3066   mask_type = get_vectype_for_scalar_type (mask_element_type);
3067   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3068   mask = XALLOCAVEC (unsigned char, nunits);
3069   unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
3070
3071   /* The number of vector stmts to generate based only on the unrolling
3072      factor of SLP_NODE_INSTANCE.  */
3073   orig_vec_stmts_num = group_size *
3074                 SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
3075   if (orig_vec_stmts_num == 1)
3076     only_one_vec = true;
3077
3078   /* The number of copies is determined by the final vectorization factor
3079      relative to the SLP_NODE_INSTANCE unrolling factor.  */
3080   ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
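
  /* For example (numbers are illustrative): GROUP_SIZE 3, an instance
     unrolling factor of 4 and NUNITS 4 give ORIG_VEC_STMTS_NUM
     = 3 * 4 / 4 = 3; with a final VF of 8, NCOPIES = 8 / 4 = 2.  */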
3081
3082   if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3083     return false;
3084
3085   /* Generate permutation masks for every NODE.  The number of masks for
3086      each NODE is equal to GROUP_SIZE.
3087      E.g., we have a group of three nodes, each containing three loads from
3088      the same location, and the vector size is 4.  I.e., we have an
3089      a0b0c0a1b1c1... sequence and we need to create the following vectors:
3090      for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3091      for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3092      ...
3093
3094      The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3095      The last mask is illegal since the permute operation takes only two
3096      operands, so the mask element values must stay below 2 * NUNITS.
3097      Hence, the last mask must be converted into {2,5,5,5}.
3098      For the first two permutations we need the first and the second input
3099      vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3100      we need the second and the third vectors: {b1,c1,a2,b2} and
3101      {c2,a3,b3,c3}.  */
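
  /* In numbers: element 9 of {6,9,9,9} is >= 2 * NUNITS (= 8), so the
     window moves to the second and third vectors and every element is
     lowered by NUNITS: {6,9,9,9} - 4 = {2,5,5,5}.  */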
3102
3103     {
3104       scalar_index = 0;
3105       index = 0;
3106       vect_stmts_counter = 0;
3107       vec_index = 0;
3108       first_vec_index = vec_index++;
3109       if (only_one_vec)
3110         second_vec_index = first_vec_index;
3111       else
3112         second_vec_index = vec_index++;
3113
3114       for (j = 0; j < unroll_factor; j++)
3115         {
3116           for (k = 0; k < group_size; k++)
3117             {
3118               i = SLP_TREE_LOAD_PERMUTATION (node)[k];
3119               first_mask_element = i + j * group_size;
3120               if (!vect_get_mask_element (stmt, first_mask_element, 0,
3121                                           nunits, only_one_vec, index,
3122                                           mask, &current_mask_element,
3123                                           &need_next_vector,
3124                                           &number_of_mask_fixes, &mask_fixed,
3125                                           &needs_first_vector))
3126                 return false;
3127               gcc_assert (current_mask_element < 2 * nunits);
3128               mask[index++] = current_mask_element;
3129
3130               if (index == nunits)
3131                 {
3132                   index = 0;
3133                   if (!can_vec_perm_p (mode, false, mask))
3134                     {
3135                       if (dump_enabled_p ())
3136                         {
3137                           dump_printf_loc (MSG_MISSED_OPTIMIZATION,
3138                                            vect_location,
3139                                            "unsupported vect permute { ");
3140                           for (i = 0; i < nunits; ++i)
3141                             dump_printf (MSG_MISSED_OPTIMIZATION, "%d ",
3142                                          mask[i]);
3143                           dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
3144                         }
3145                       return false;
3146                     }
3147
3148                   if (!analyze_only)
3149                     {
3150                       int l;
3151                       tree mask_vec, *mask_elts;
3152                       mask_elts = XALLOCAVEC (tree, nunits);
3153                       for (l = 0; l < nunits; ++l)
3154                         mask_elts[l] = build_int_cst (mask_element_type,
3155                                                       mask[l]);
3156                       mask_vec = build_vector (mask_type, mask_elts);
3157
3158                       if (need_next_vector)
3159                         {
3160                           first_vec_index = second_vec_index;
3161                           second_vec_index = vec_index;
3162                         }
3163
3164                       next_scalar_stmt
3165                           = SLP_TREE_SCALAR_STMTS (node)[scalar_index++];
3166
3167                       vect_create_mask_and_perm (stmt, next_scalar_stmt,
3168                                mask_vec, first_vec_index, second_vec_index,
3169                                gsi, node, vectype, dr_chain,
3170                                ncopies, vect_stmts_counter++);
3171                     }
3172                 }
3173             }
3174         }
3175     }
3176
3177   return true;
3178 }
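
/* Illustrative sketch (not part of GCC, all names hypothetical):
   standalone mask generation mirroring the loop above, ignoring the
   window shifting handled by vect_get_mask_element (see the sketch
   after that function).  PERM is the SLP load permutation; the raw
   index of group member K in unroll iteration J is
   PERM[K] + J * GROUP_SIZE.  Assumes NUNITS <= 16.  Returns the number
   of complete masks written to OUT, or -1 when an index would need a
   third vector.  */

static int
sketch_build_perm_masks (const int *perm, int group_size, int unroll_factor,
                         int nunits, unsigned char (*out)[16])
{
  int index = 0, nmasks = 0, j, k;

  for (j = 0; j < unroll_factor; j++)
    for (k = 0; k < group_size; k++)
      {
        int elt = perm[k] + j * group_size;
        if (elt >= 2 * nunits)
          return -1;  /* Needs the window shift shown earlier.  */
        out[nmasks][index++] = (unsigned char) elt;
        if (index == nunits)
          {
            index = 0;
            nmasks++;
          }
      }

  return nmasks;
}

/* E.g., perm = {0,0,0}, group_size = 3, unroll_factor = 4 and nunits = 4
   (the a0a0a0a1 example above) produce {0,0,0,3} and {3,3,6,6} before
   raw index 9 forces the shift that yields {2,5,5,5}.  */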
3179
3180
3181
3182 /* Vectorize the SLP instance tree in postorder.  */
3183
3184 static bool
3185 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
3186                             unsigned int vectorization_factor)
3187 {
3188   gimple stmt;
3189   bool grouped_store, is_store;
3190   gimple_stmt_iterator si;
3191   stmt_vec_info stmt_info;
3192   unsigned int vec_stmts_size, nunits, group_size;
3193   tree vectype;
3194   int i;
3195   slp_tree child;
3196
3197   if (!node)
3198     return false;
3199
3200   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3201     vect_schedule_slp_instance (child, instance, vectorization_factor);
3202
3203   stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3204   stmt_info = vinfo_for_stmt (stmt);
3205
3206   /* VECTYPE is the type of the destination.  */
3207   vectype = STMT_VINFO_VECTYPE (stmt_info);
3208   nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
3209   group_size = SLP_INSTANCE_GROUP_SIZE (instance);
3210
3211   /* For each SLP instance calculate the number of vector stmts to be
3212      created for the scalar stmts in each node of the SLP tree.  The number
3213      of vector elements in one vector iteration is the number of scalar
3214      elements in one scalar iteration (GROUP_SIZE) multiplied by VF and
3215      divided by the vector size.  */
3216   vec_stmts_size = (vectorization_factor * group_size) / nunits;
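
  /* For example (numbers are illustrative): VF 8, GROUP_SIZE 3 and
     NUNITS 4 give (8 * 3) / 4 = 6 vector stmts for this node.  */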
3217
3218   if (!SLP_TREE_VEC_STMTS (node).exists ())
3219     {
3220       SLP_TREE_VEC_STMTS (node).create (vec_stmts_size);
3221       SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
3222     }
3223
3224   if (dump_enabled_p ())
3225     {
3226       dump_printf_loc (MSG_NOTE, vect_location,
3227                        "------>vectorizing SLP node starting from: ");
3228       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3229       dump_printf (MSG_NOTE, "\n");
3230     }
3231
3232   /* Loads should be inserted before the first load.  */
3233   if (SLP_INSTANCE_FIRST_LOAD_STMT (instance)
3234       && STMT_VINFO_GROUPED_ACCESS (stmt_info)
3235       && !REFERENCE_CLASS_P (gimple_get_lhs (stmt))
3236       && SLP_TREE_LOAD_PERMUTATION (node).exists ())
3237     si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance));
3238   else if (is_pattern_stmt_p (stmt_info))
3239     si = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
3240   else
3241     si = gsi_for_stmt (stmt);
3242
3243   /* Stores should be inserted just before the last store.  */
3244   if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
3245       && REFERENCE_CLASS_P (gimple_get_lhs (stmt)))
3246     {
3247       gimple last_store = vect_find_last_store_in_slp_instance (instance);
3248       if (is_pattern_stmt_p (vinfo_for_stmt (last_store)))
3249         last_store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (last_store));
3250       si = gsi_for_stmt (last_store);
3251     }
3252
3253   /* Mark the first element of the reduction chain as a reduction to
3254      properly transform the node.  In the analysis phase only the last
3255      element of the chain is marked as a reduction.  */
3256   if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
3257       && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
3258     {
3259       STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
3260       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3261     }
3262
3263   is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3264   return is_store;
3265 }
3266
3267 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
3268    For loop vectorization this is done in vectorizable_call, but for SLP
3269    it needs to be deferred until the end of vect_schedule_slp, because multiple
3270    SLP instances may refer to the same scalar stmt.  */
3271
3272 static void
3273 vect_remove_slp_scalar_calls (slp_tree node)
3274 {
3275   gimple stmt, new_stmt;
3276   gimple_stmt_iterator gsi;
3277   int i;
3278   slp_tree child;
3279   tree lhs;
3280   stmt_vec_info stmt_info;
3281
3282   if (!node)
3283     return;
3284
3285   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3286     vect_remove_slp_scalar_calls (child);
3287
3288   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
3289     {
3290       if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
3291         continue;
3292       stmt_info = vinfo_for_stmt (stmt);
3293       if (stmt_info == NULL
3294           || is_pattern_stmt_p (stmt_info)
3295           || !PURE_SLP_STMT (stmt_info))
3296         continue;
3297       lhs = gimple_call_lhs (stmt);
3298       new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
3299       set_vinfo_for_stmt (new_stmt, stmt_info);
3300       set_vinfo_for_stmt (stmt, NULL);
3301       STMT_VINFO_STMT (stmt_info) = new_stmt;
3302       gsi = gsi_for_stmt (stmt);
3303       gsi_replace (&gsi, new_stmt, false);
3304       SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
3305     }
3306 }
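
/* For illustration (hypothetical GIMPLE, not taken from a dump): a
   pure-SLP scalar call such as

     x_1 = __builtin_pow (a_2, b_3);

   is rewritten by the function above into

     x_1 = 0.0;

   so x_1 keeps a valid SSA definition until the scalar stmts are
   removed as dead code.  */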
3307
3308 /* Generate vector code for all SLP instances in the loop/basic block.  */
3309
3310 bool
3311 vect_schedule_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
3312 {
3313   vec<slp_instance> slp_instances;
3314   slp_instance instance;
3315   unsigned int i, vf;
3316   bool is_store = false;
3317
3318   if (loop_vinfo)
3319     {
3320       slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
3321       vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3322     }
3323   else
3324     {
3325       slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
3326       vf = 1;
3327     }
3328
3329   FOR_EACH_VEC_ELT (slp_instances, i, instance)
3330     {
3331       /* Schedule the tree of INSTANCE.  */
3332       is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
3333                                              instance, vf);
3334       if (dump_enabled_p ())
3335         dump_printf_loc (MSG_NOTE, vect_location,
3336                          "vectorizing stmts using SLP.\n");
3337     }
3338
3339   FOR_EACH_VEC_ELT (slp_instances, i, instance)
3340     {
3341       slp_tree root = SLP_INSTANCE_TREE (instance);
3342       gimple store;
3343       unsigned int j;
3344       gimple_stmt_iterator gsi;
3345
3346       /* Remove scalar call stmts.  Do not do this for basic-block
3347          vectorization as not all uses may be vectorized.
3348          ???  Why should this be necessary?  DCE should be able to
3349          remove the stmts itself.
3350          ???  For BB vectorization we can as well remove scalar
3351          stmts starting from the SLP tree root if they have no
3352          uses.  */
3353       if (loop_vinfo)
3354         vect_remove_slp_scalar_calls (root);
3355
3356       for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
3357                   && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
3358         {
3359           if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
3360             break;
3361
3362           if (is_pattern_stmt_p (vinfo_for_stmt (store)))
3363             store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
3364           /* Free the attached stmt_vec_info and remove the stmt.  */
3365           gsi = gsi_for_stmt (store);
3366           unlink_stmt_vdef (store);
3367           gsi_remove (&gsi, true);
3368           release_defs (store);
3369           free_stmt_vec_info (store);
3370         }
3371     }
3372
3373   return is_store;
3374 }
3375
3376
3377 /* Vectorize the basic block.  */
3378
3379 void
3380 vect_slp_transform_bb (basic_block bb)
3381 {
3382   bb_vec_info bb_vinfo = vec_info_for_bb (bb);
3383   gimple_stmt_iterator si;
3384
3385   gcc_assert (bb_vinfo);
3386
3387   if (dump_enabled_p ())
3388     dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB\n");
3389
3390   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
3391     {
3392       gimple stmt = gsi_stmt (si);
3393       stmt_vec_info stmt_info;
3394
3395       if (dump_enabled_p ())
3396         {
3397           dump_printf_loc (MSG_NOTE, vect_location,
3398                            "------>SLPing statement: ");
3399           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3400           dump_printf (MSG_NOTE, "\n");
3401         }
3402
3403       stmt_info = vinfo_for_stmt (stmt);
3404       gcc_assert (stmt_info);
3405
3406       /* Schedule all the SLP instances when the first SLP stmt is reached.  */
3407       if (STMT_SLP_TYPE (stmt_info))
3408         {
3409           vect_schedule_slp (NULL, bb_vinfo);
3410           break;
3411         }
3412     }
3413
3414   if (dump_enabled_p ())
3415     dump_printf_loc (MSG_NOTE, vect_location,
3416                      "BASIC BLOCK VECTORIZED\n");
3417
3418   destroy_bb_vec_info (bb_vinfo);
3419 }