/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2015 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
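
/* Illustrative overview: basic-block SLP looks for groups of
   isomorphic scalar statements, e.g.

     a[0] = b[0] + c[0];
     a[1] = b[1] + c[1];
     a[2] = b[2] + c[2];
     a[3] = b[3] + c[3];

   and replaces such a group by single vector statements, here one V4
   addition fed by two vector loads.  The routines below build an SLP
   tree for such a group and check that the target supports the vector
   operations and permutations the tree needs.  */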

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "dumpfile.h"
#include "backend.h"
#include "tree.h"
#include "gimple.h"
#include "params.h"
#include "rtl.h"
#include "ssa.h"
#include "alias.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "target.h"
#include "gimple-pretty-print.h"
#include "internal-fn.h"
#include "gimple-iterator.h"
#include "tree-pass.h"
#include "cfgloop.h"
#include "flags.h"
#include "insn-config.h"
#include "recog.h"              /* FIXME: for insn_data */
#include "insn-codes.h"
#include "optabs-tree.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"

/* Extract the location of the basic block in the source code.
   Return the location of the first statement in BB that has one,
   and UNKNOWN_LOCATION if none is found.  */

source_location
find_bb_location (basic_block bb)
{
  gimple *stmt = NULL;
  gimple_stmt_iterator si;

  if (!bb)
    return UNKNOWN_LOCATION;

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      stmt = gsi_stmt (si);
      if (gimple_location (stmt) != UNKNOWN_LOCATION)
        return gimple_location (stmt);
    }

  return UNKNOWN_LOCATION;
}


/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

static void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child);

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}


/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  free (instance);
}


/* Create an SLP node for SCALAR_STMTS.  */
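/* Note that for a COND_EXPR right-hand side the two operands of the
   embedded comparison are counted as separate operands, so
   x = a < b ? c : d has four operands (a, b, c, d); see also
   vect_get_and_check_slp_defs.  */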

static slp_tree
vect_create_new_slp_node (vec<gimple *> scalar_stmts)
{
  slp_tree node;
  gimple *stmt = scalar_stmts[0];
  unsigned int nops;

  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
  SLP_TREE_TWO_OPERATORS (node) = false;

  return node;
}


/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->first_pattern = false;
      oprnd_info->second_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}


/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}


/* Find the place of the data-ref in STMT in the interleaving chain that starts
   from FIRST_STMT.  Return -1 if the data-ref is not a part of the chain.  */
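/* Note that for a dense chain each GROUP_GAP is 1, so the place of the
   k-th stmt in the chain is simply k; gaps in the chain increase the
   place accordingly.  */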

static int
vect_get_place_in_interleaving_chain (gimple *stmt, gimple *first_stmt)
{
  gimple *next_stmt = first_stmt;
  int result = 0;

  if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    return -1;

  do
    {
      if (next_stmt == stmt)
        return result;
      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
      if (next_stmt)
        result += GROUP_GAP (vinfo_for_stmt (next_stmt));
    }
  while (next_stmt);

  return -1;
}


/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  Return -1 on a fatal error, 1 if
   the error could be corrected by swapping operands of the operation, and 0
   if everything is OK.  */
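
/* For example (illustrative): in the group

     x0 = y + a[0];
     x1 = a[1] + y;

   operand zero of the second stmt (an internal def, the load of a[1])
   does not match operand zero of the first stmt (the external def y);
   since PLUS_EXPR is commutative the operands of the second stmt are
   swapped and the group still matches.  */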

static int
vect_get_and_check_slp_defs (vec_info *vinfo,
                             gimple *stmt, unsigned stmt_num,
                             vec<slp_oprnd_info> *oprnds_info)
{
  tree oprnd;
  unsigned int i, number_of_oprnds;
  tree def;
  gimple *def_stmt;
  enum vect_def_type dt = vect_uninitialized_def;
  struct loop *loop = NULL;
  bool pattern = false;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  bool commutative = false;
  bool first_op_cond = false;
  bool first = stmt_num == 0;
  bool second = stmt_num == 1;

  if (is_a <loop_vec_info> (vinfo))
    loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));

  if (is_gimple_call (stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
    }
  else if (is_gimple_assign (stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        {
          first_op_cond = true;
          commutative = true;
          number_of_oprnds++;
        }
      else
        commutative = commutative_tree_code (code);
    }
  else
    return -1;

  bool swapped = false;
  for (i = 0; i < number_of_oprnds; i++)
    {
again:
      if (first_op_cond)
        {
          if (i == 0 || i == 1)
            oprnd = TREE_OPERAND (gimple_op (stmt, first_op_idx),
                                  swapped ? !i : i);
          else
            oprnd = gimple_op (stmt, first_op_idx + i - 1);
        }
      else
        oprnd = gimple_op (stmt, first_op_idx + (swapped ? !i : i));

      oprnd_info = (*oprnds_info)[i];

      if (!vect_is_simple_use (oprnd, NULL, vinfo, &def_stmt,
                               &def, &dt))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: can't analyze def for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }

      /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
         from the pattern.  Check that all the stmts of the node are in the
         pattern.  */
      if (def_stmt && gimple_bb (def_stmt)
          && ((is_a <loop_vec_info> (vinfo)
               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
              || (is_a <bb_vec_info> (vinfo)
                  && gimple_bb (def_stmt) == as_a <bb_vec_info> (vinfo)->bb
                  && gimple_code (def_stmt) != GIMPLE_PHI))
          && vinfo_for_stmt (def_stmt)
          && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
        {
          pattern = true;
          if (!first && !oprnd_info->first_pattern
              /* Allow different pattern state for the defs of the
                 first stmt in reduction chains.  */
              && (oprnd_info->first_dt != vect_reduction_def
                  || (!second && !oprnd_info->second_pattern)))
            {
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: some of the stmts"
                                   " are in a pattern, and others are not ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }

              return 1;
            }

          def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
          dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));

          if (dt == vect_unknown_def_type)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Unsupported pattern.\n");
              return -1;
            }

          switch (gimple_code (def_stmt))
            {
              case GIMPLE_PHI:
                def = gimple_phi_result (def_stmt);
                break;

              case GIMPLE_ASSIGN:
                def = gimple_assign_lhs (def_stmt);
                break;

              default:
                if (dump_enabled_p ())
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "unsupported defining stmt:\n");
                return -1;
            }
        }

      if (second)
        oprnd_info->second_pattern = pattern;

      if (first)
        {
          oprnd_info->first_dt = dt;
          oprnd_info->first_pattern = pattern;
          oprnd_info->first_op_type = TREE_TYPE (oprnd);
        }
      else
        {
          /* Not first stmt of the group, check that the def-stmt/s match
             the def-stmt/s of the first stmt.  Allow different definition
             types for reduction chains: the first stmt must be a
             vect_reduction_def (a phi node), and the rest
             vect_internal_def.  */
          if (((oprnd_info->first_dt != dt
                && !(oprnd_info->first_dt == vect_reduction_def
                     && dt == vect_internal_def)
                && !((oprnd_info->first_dt == vect_external_def
                      || oprnd_info->first_dt == vect_constant_def)
                     && (dt == vect_external_def
                         || dt == vect_constant_def)))
               || !types_compatible_p (oprnd_info->first_op_type,
                                       TREE_TYPE (oprnd))))
            {
              /* Try swapping operands if we got a mismatch.  */
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types\n");

              return 1;
            }
        }

      /* Check the types of the definitions.  */
      switch (dt)
        {
        case vect_constant_def:
        case vect_external_def:
        case vect_reduction_def:
          break;

        case vect_internal_def:
          oprnd_info->def_stmts.quick_push (def_stmt);
          break;

        default:
          /* FORNOW: Not supported.  */
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: illegal type of def ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (first_op_cond)
        {
          tree cond = gimple_assign_rhs1 (stmt);
          swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
                             &TREE_OPERAND (cond, 1));
          TREE_SET_CODE (cond, swap_tree_comparison (TREE_CODE (cond)));
        }
      else
        swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
                           gimple_assign_rhs2_ptr (stmt));
    }

  return 0;
}


/* Verify that the scalar stmts STMTS are isomorphic, do not require a
   data permutation and are of supported operation types.  Return true
   if so; otherwise return false and indicate in *MATCHES which stmts
   are not isomorphic to the first one.  If MATCHES[0] is false then
   this indicates that the comparison could not be carried out or the
   stmts will never be vectorized by SLP.  */

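/* For example (illustrative), the group

     x[0] = a[0] + b[0];
     x[1] = a[1] - b[1];

   is accepted as a "two operators" node: both a vector addition and a
   vector subtraction are emitted and their lanes are merged with a
   permutation (see the alt_stmt_code handling below and
   SLP_TREE_TWO_OPERATORS).  */
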
static bool
vect_build_slp_tree_1 (vec_info *vinfo,
                       vec<gimple *> stmts, unsigned int group_size,
                       unsigned nops, unsigned int *max_nunits,
                       unsigned int vectorization_factor, bool *matches,
                       bool *two_operators)
{
  unsigned int i;
  gimple *first_stmt = stmts[0], *stmt = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, scalar_type, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  HOST_WIDE_INT dummy;
  gimple *first_load = NULL, *prev_first_load = NULL;
  tree cond;

  /* For every stmt in NODE find its def stmt/s.  */
  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      matches[i] = false;

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
          dump_printf (MSG_NOTE, "\n");
        }

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unvectorizable statement ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: not GIMPLE_ASSIGN nor "
                               "GIMPLE_CALL ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      if (is_gimple_assign (stmt)
          && gimple_assign_rhs_code (stmt) == COND_EXPR
          && (cond = gimple_assign_rhs1 (stmt))
          && !COMPARISON_CLASS_P (cond))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: condition is not "
                               "comparison ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
      vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vectype)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unsupported data-type ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 scalar_type);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      /* If populating the vector type requires unrolling then fail
         before adjusting *max_nunits for basic-block vectorization.  */
      if (is_a <bb_vec_info> (vinfo)
          && TYPE_VECTOR_SUBPARTS (vectype) > group_size)
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Build SLP failed: unrolling required "
                           "in basic block SLP\n");
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      /* In case of multiple types we need to detect the smallest type.  */
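      /* e.g. with 128-bit vectors a group mixing int and short
         operations ends up with *max_nunits == 8, taken from the V8HI
         type of the short operations (illustrative).  */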
      if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
        {
          *max_nunits = TYPE_VECTOR_SUBPARTS (vectype);
          if (is_a <bb_vec_info> (vinfo))
            vectorization_factor = *max_nunits;
        }

      if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
        {
          rhs_code = CALL_EXPR;
          if (gimple_call_internal_p (call_stmt)
              || gimple_call_tail_p (call_stmt)
              || gimple_call_noreturn_p (call_stmt)
              || !gimple_call_nothrow_p (call_stmt)
              || gimple_call_chain (call_stmt))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: unsupported call type ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                    call_stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }
        }
      else
        rhs_code = gimple_assign_rhs_code (stmt);

      /* Check the operation.  */
      if (i == 0)
        {
          first_stmt_code = rhs_code;

          /* Shift arguments should be equal in all the packed stmts for a
             vector shift with scalar shift operand.  */
          if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
              || rhs_code == LROTATE_EXPR
              || rhs_code == RROTATE_EXPR)
            {
              vec_mode = TYPE_MODE (vectype);

              /* First see if we have a vector/vector shift.  */
              optab = optab_for_tree_code (rhs_code, vectype,
                                           optab_vector);

              if (!optab
                  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
                {
                  /* No vector/vector shift, try for a vector/scalar shift.  */
                  optab = optab_for_tree_code (rhs_code, vectype,
                                               optab_scalar);

                  if (!optab)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: no optab.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  icode = (int) optab_handler (optab, vec_mode);
                  if (icode == CODE_FOR_nothing)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: "
                                         "op not supported by target.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  optab_op2_mode = insn_data[icode].operand[2].mode;
                  if (!VECTOR_MODE_P (optab_op2_mode))
                    {
                      need_same_oprnds = true;
                      first_op1 = gimple_assign_rhs2 (stmt);
                    }
                }
            }
          else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
        }
      else
        {
          if (first_stmt_code != rhs_code
              && alt_stmt_code == ERROR_MARK)
            alt_stmt_code = rhs_code;
          if (first_stmt_code != rhs_code
              && (first_stmt_code != IMAGPART_EXPR
                  || rhs_code != REALPART_EXPR)
              && (first_stmt_code != REALPART_EXPR
                  || rhs_code != IMAGPART_EXPR)
              /* Handle mismatches in plus/minus by computing both
                 and merging the results.  */
              && !((first_stmt_code == PLUS_EXPR
                    || first_stmt_code == MINUS_EXPR)
                   && (alt_stmt_code == PLUS_EXPR
                       || alt_stmt_code == MINUS_EXPR)
                   && rhs_code == alt_stmt_code)
              && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
                   && (first_stmt_code == ARRAY_REF
                       || first_stmt_code == BIT_FIELD_REF
                       || first_stmt_code == INDIRECT_REF
                       || first_stmt_code == COMPONENT_REF
                       || first_stmt_code == MEM_REF)))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different operation "
                                   "in stmt ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "original stmt ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                    first_stmt, 0);
                }
              /* Mismatch.  */
              continue;
            }

          if (need_same_oprnds
              && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different shift "
                                   "arguments in ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Mismatch.  */
              continue;
            }

          if (rhs_code == CALL_EXPR)
            {
              gimple *first_stmt = stmts[0];
              if (gimple_call_num_args (stmt) != nops
                  || !operand_equal_p (gimple_call_fn (first_stmt),
                                       gimple_call_fn (stmt), 0)
                  || gimple_call_fntype (first_stmt)
                     != gimple_call_fntype (stmt))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different calls in ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
        {
          if (REFERENCE_CLASS_P (lhs))
            {
              /* Store.  */
              ;
            }
          else
            {
              /* Load.  */
              /* Check that the size of the interleaved load group is not
                 greater than the SLP group size.  */
              unsigned ncopies
                = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
              if (is_a <loop_vec_info> (vinfo)
                  && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
                  && ((GROUP_SIZE (vinfo_for_stmt (stmt))
                       - GROUP_GAP (vinfo_for_stmt (stmt)))
                      > ncopies * group_size))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: the number "
                                       "of interleaved loads is greater than "
                                       "the SLP group size ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Fatal mismatch.  */
                  matches[0] = false;
                  return false;
                }

              first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  */
                  if (prev_first_load != first_load)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "Build SLP failed: different "
                                           "interleaving chains in one node ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            stmt, 0);
                          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                        }
                      /* Mismatch.  */
                      continue;
                    }
                }
              else
                prev_first_load = first_load;
            }
        } /* Grouped access.  */
      else
        {
          if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
            {
              /* Non-grouped load.  */
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: not grouped load ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }

              /* FORNOW: Non-grouped loads are not supported.  */
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          /* Not a memory operation.  */
          if (TREE_CODE_CLASS (rhs_code) != tcc_binary
              && TREE_CODE_CLASS (rhs_code) != tcc_unary
              && TREE_CODE_CLASS (rhs_code) != tcc_expression
              && rhs_code != CALL_EXPR)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: operation");
                  dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);

              if (i == 0)
                first_cond_code = TREE_CODE (cond_expr);
              else if (first_cond_code != TREE_CODE (cond_expr))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different"
                                       " operation");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
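  /* e.g. for a plus/minus group of two stmts and a V4 vectype, SEL
     below becomes { 0, 5, 2, 7 }: even lanes taken from the first
     (plus) vector, odd lanes from the second (minus) vector.  */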
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    {
      unsigned char *sel
        = XALLOCAVEC (unsigned char, TYPE_VECTOR_SUBPARTS (vectype));
      for (i = 0; i < TYPE_VECTOR_SUBPARTS (vectype); ++i)
        {
          sel[i] = i;
          if (gimple_assign_rhs_code (stmts[i % group_size]) == alt_stmt_code)
            sel[i] += TYPE_VECTOR_SUBPARTS (vectype);
        }
      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
        {
          for (i = 0; i < group_size; ++i)
            if (gimple_assign_rhs_code (stmts[i]) == alt_stmt_code)
              {
                matches[i] = false;
                if (dump_enabled_p ())
                  {
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different operation "
                                     "in stmt ");
                    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                      stmts[i], 0);
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "original stmt ");
                    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                      first_stmt, 0);
                  }
              }
          return false;
        }
      *two_operators = true;
    }

  return true;
}

/* Recursively build an SLP tree starting from NODE.
   Fail (and return false) if def-stmts are not isomorphic, require data
   permutation or are of unsupported types of operation; in that case
   MATCHES indicates which scalar stmts mismatched (MATCHES[0] being
   false signals a fatal mismatch).  Otherwise return true.  */

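/* For example (illustrative), for the grouped stores

     a[0] = b[0] + c[0];
     a[1] = b[1] + c[1];

   the instance root holds the two stores, its single child holds the
   two additions, and that node's children are two load nodes, one for
   b[] and one for c[].  Loads terminate the recursion.  */
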
static bool
vect_build_slp_tree (vec_info *vinfo,
                     slp_tree *node, unsigned int group_size,
                     unsigned int *max_nunits,
                     vec<slp_tree> *loads,
                     unsigned int vectorization_factor,
                     bool *matches, unsigned *npermutes, unsigned *tree_size,
                     unsigned max_tree_size)
{
  unsigned nops, i, this_tree_size = 0;
  gimple *stmt;

  matches[0] = false;

  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];
  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else
    return false;

  bool two_operators = false;
  if (!vect_build_slp_tree_1 (vinfo,
                              SLP_TREE_SCALAR_STMTS (*node), group_size, nops,
                              max_nunits, vectorization_factor, matches,
                              &two_operators))
    return false;
  SLP_TREE_TWO_OPERATORS (*node) = two_operators;

  /* If the SLP node is a load, terminate the recursion.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
      && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
    {
      loads->safe_push (*node);
      return true;
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (*node), i, stmt)
    {
      switch (vect_get_and_check_slp_defs (vinfo, stmt, i, &oprnds_info))
        {
        case 0:
          break;
        case -1:
          matches[0] = false;
          vect_free_oprnd_info (oprnds_info);
          return false;
        case 1:
          matches[i] = false;
          break;
        }
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
        vect_free_oprnd_info (oprnds_info);
        return false;
      }

  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_nloads = loads->length ();
      unsigned old_max_nunits = *max_nunits;

      if (oprnd_info->first_dt != vect_internal_def)
        continue;

      if (++this_tree_size > max_tree_size)
        {
          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      child = vect_create_new_slp_node (oprnd_info->def_stmts);
      if (!child)
        {
          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      if (vect_build_slp_tree (vinfo, &child,
                               group_size, max_nunits, loads,
                               vectorization_factor, matches,
                               npermutes, &this_tree_size, max_tree_size))
        {
          /* If all children of this child were built up from scalars then
             just throw that away and build this node up from scalars.  */
          if (!SLP_TREE_CHILDREN (child).is_empty ())
            {
              unsigned int j;
              slp_tree grandchild;

              FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                if (grandchild != NULL)
                  break;
              if (!grandchild)
                {
                  /* Roll back.  */
                  *max_nunits = old_max_nunits;
                  loads->truncate (old_nloads);
                  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                    vect_free_slp_tree (grandchild);
                  SLP_TREE_CHILDREN (child).truncate (0);

                  dump_printf_loc (MSG_NOTE, vect_location,
                                   "Building parent vector operands from "
                                   "scalars instead\n");
                  oprnd_info->def_stmts = vNULL;
                  vect_free_slp_tree (child);
                  SLP_TREE_CHILDREN (*node).quick_push (NULL);
                  continue;
                }
            }

          oprnd_info->def_stmts = vNULL;
          SLP_TREE_CHILDREN (*node).quick_push (child);
          continue;
        }

      /* If the SLP build failed fatally and we analyze a basic-block
         simply treat nodes we fail to build as externally defined
         (and thus build vectors from the scalar defs).
         The cost model will reject outright expensive cases.
         ???  This doesn't treat cases where permutation ultimately
         fails (or we don't try permutation below).  Ideally we'd
         even compute a permutation that will end up with the maximum
         SLP tree size...  */
      if (is_a <bb_vec_info> (vinfo)
          && !matches[0]
          /* ???  Rejecting patterns this way doesn't work.  We'd have to
             do extra work to cancel the pattern so the uses see the
             scalar version.  */
          && !is_pattern_stmt_p (vinfo_for_stmt (stmt)))
        {
          unsigned int j;
          slp_tree grandchild;

          /* Roll back.  */
          *max_nunits = old_max_nunits;
          loads->truncate (old_nloads);
          FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
            vect_free_slp_tree (grandchild);
          SLP_TREE_CHILDREN (child).truncate (0);

          dump_printf_loc (MSG_NOTE, vect_location,
                           "Building vector operands from scalars\n");
          oprnd_info->def_stmts = vNULL;
          vect_free_slp_tree (child);
          SLP_TREE_CHILDREN (*node).quick_push (NULL);
          continue;
        }

      /* If the SLP build for operand zero failed and operand zero
         and one can be commuted try that for the scalar stmts
         that failed the match.  */
      if (i == 0
          /* A first scalar stmt mismatch signals a fatal mismatch.  */
          && matches[0]
          /* ???  For COND_EXPRs we can swap the comparison operands
             as well as the arms under some constraints.  */
          && nops == 2
          && oprnds_info[1]->first_dt == vect_internal_def
          && is_gimple_assign (stmt)
          && commutative_tree_code (gimple_assign_rhs_code (stmt))
          && !SLP_TREE_TWO_OPERATORS (*node)
          /* Do so only if the number of unsuccessful permutes was no more
             than a cut-off as re-trying the recursive match on
             possibly each level of the tree would expose exponential
             behavior.  */
          && *npermutes < 4)
        {
          unsigned int j;
          slp_tree grandchild;

          /* Roll back.  */
          *max_nunits = old_max_nunits;
          loads->truncate (old_nloads);
          FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
            vect_free_slp_tree (grandchild);
          SLP_TREE_CHILDREN (child).truncate (0);

          /* Swap mismatched definition stmts.  */
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Re-trying with swapped operands of stmts ");
          for (j = 0; j < group_size; ++j)
            if (!matches[j])
              {
                std::swap (oprnds_info[0]->def_stmts[j],
                           oprnds_info[1]->def_stmts[j]);
                dump_printf (MSG_NOTE, "%d ", j);
              }
          dump_printf (MSG_NOTE, "\n");
          /* And try again with scratch 'matches' ... */
          bool *tem = XALLOCAVEC (bool, group_size);
          if (vect_build_slp_tree (vinfo, &child,
                                   group_size, max_nunits, loads,
                                   vectorization_factor,
                                   tem, npermutes, &this_tree_size,
                                   max_tree_size))
            {
              /* ... so if successful we can apply the operand swapping
                 to the GIMPLE IL.  This is necessary because for example
                 vect_get_slp_defs uses operand indexes and thus expects
                 canonical operand order.  */
              for (j = 0; j < group_size; ++j)
                if (!matches[j])
                  {
                    gimple *stmt = SLP_TREE_SCALAR_STMTS (*node)[j];
                    swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
                                       gimple_assign_rhs2_ptr (stmt));
                  }
              oprnd_info->def_stmts = vNULL;
              SLP_TREE_CHILDREN (*node).quick_push (child);
              continue;
            }

          ++*npermutes;
        }

      oprnd_info->def_stmts = vNULL;
      vect_free_slp_tree (child);
      vect_free_oprnd_info (oprnds_info);
      return false;
    }

  if (tree_size)
    *tree_size += this_tree_size;

  vect_free_oprnd_info (oprnds_info);
  return true;
}

/* Dump the SLP tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (int dump_kind, slp_tree node)
{
  int i;
  gimple *stmt;
  slp_tree child;

  if (!node)
    return;

  dump_printf (dump_kind, "node ");
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      dump_printf (dump_kind, "\n\tstmt %d ", i);
      dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
    }
  dump_printf (dump_kind, "\n");

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, child);
}


/* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
   If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
   J).  Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
   stmts in NODE are to be marked.  */

static void
vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
{
  int i;
  gimple *stmt;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    if (j < 0 || i == j)
      STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, mark, j);
}


/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  int i;
  gimple *stmt;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
                  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child);
}


/* Rearrange the statements of NODE according to PERMUTATION.  */

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
                          vec<unsigned> permutation)
{
  gimple *stmt;
  vec<gimple *> tmp_stmts;
  unsigned int i;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation);

  gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
  tmp_stmts.create (group_size);
  tmp_stmts.quick_grow_cleared (group_size);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    tmp_stmts[permutation[i]] = stmt;

  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
}


/* Attempt to reorder stmts in a reduction chain so that we don't
   require any load permutation.  Return true if that was possible,
   otherwise return false.  */
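
/* For example (illustrative): if the load node of a reduction was
   built in the order a[2], a[3], a[0], a[1], the scalar stmts of all
   nodes in the instance can simply be rearranged to match that order,
   because the order in which reduction results are combined does not
   matter.  */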

static bool
vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j;
  sbitmap load_index;
  unsigned int lidx;
  slp_tree node, load;

  /* Compare all the permutation sequences to the first one.  We know
     that at least one load is permuted.  */
  node = SLP_INSTANCE_LOADS (slp_instn)[0];
  if (!node->load_permutation.exists ())
    return false;
  for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
    {
      if (!load->load_permutation.exists ())
        return false;
      FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
        if (lidx != node->load_permutation[j])
          return false;
    }

  /* Check that the loads in the first sequence are different and there
     are no gaps between them.  */
  load_index = sbitmap_alloc (group_size);
  bitmap_clear (load_index);
  FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
    {
      if (bitmap_bit_p (load_index, lidx))
        {
          sbitmap_free (load_index);
          return false;
        }
      bitmap_set_bit (load_index, lidx);
    }
  for (i = 0; i < group_size; i++)
    if (!bitmap_bit_p (load_index, i))
      {
        sbitmap_free (load_index);
        return false;
      }
  sbitmap_free (load_index);

  /* This permutation is valid for reduction.  Since the order of the
     statements in the nodes is not important unless they are memory
     accesses, we can rearrange the statements in all the nodes
     according to the order of the loads.  */
  vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
                            node->load_permutation);

  /* We are done, no actual permutations need to be generated.  */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    SLP_TREE_LOAD_PERMUTATION (node).release ();
  return true;
}

/* Check if the required load permutations in the SLP instance
   SLP_INSTN are supported.  */

static bool
vect_supported_load_permutation_p (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j, k, next;
  slp_tree node;
  gimple *stmt, *load, *next_load, *first_load;
  struct data_reference *dr;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        if (node->load_permutation.exists ())
          FOR_EACH_VEC_ELT (node->load_permutation, j, next)
            dump_printf (MSG_NOTE, "%d ", next);
        else
          for (k = 0; k < group_size; ++k)
            dump_printf (MSG_NOTE, "%d ", k);
      dump_printf (MSG_NOTE, "\n");
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this
     permutation).  */

  /* Check that all the load nodes are of the same size.  */
  /* ???  Can't we assert this? */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
      return false;

  node = SLP_INSTANCE_TREE (slp_instn);
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];

  /* Reduction (there are no data-refs in the root).
     In a reduction chain the order of the loads is not important.  */
  if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
      && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      if (vect_attempt_slp_rearrange_stmts (slp_instn))
        return true;

      /* Fallthru to general load permutation handling.  */
    }

  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment complications.  */
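  /* For example (illustrative), loads of a[1] and a[2] out of an
     interleaving chain a[0] ... a[3] form a subchain and need no
     permutation.  */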
  if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
    {
      /* Check whether the loads in an instance form a subchain and thus
         no permutation is necessary.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
            continue;
          bool subchain_p = true;
          next_load = NULL;
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
            {
              if (j != 0
                  && (next_load != load
                      || GROUP_GAP (vinfo_for_stmt (load)) != 1))
                {
                  subchain_p = false;
                  break;
                }
              next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
            }
          if (subchain_p)
            SLP_TREE_LOAD_PERMUTATION (node).release ();
          else
            {
              /* Verify the permutation can be generated.  */
              vec<tree> tem;
              if (!vect_transform_slp_perm_load (node, tem, NULL,
                                                 1, slp_instn, true))
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                   vect_location,
                                   "unsupported load permutation\n");
                  return false;
                }
            }
        }

      /* Check that the alignment of the first load in every subchain, i.e.,
         the first statement in every load node, is supported.
         ???  This belongs in alignment checking.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          first_load = SLP_TREE_SCALAR_STMTS (node)[0];
          if (first_load != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load)))
            {
              dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load));
              if (vect_supportable_dr_alignment (dr, false)
                  == dr_unaligned_unsupported)
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                       vect_location,
                                       "unsupported unaligned load ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        first_load, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  return false;
                }
            }
        }

      return true;
    }

  /* For loop vectorization verify we can generate the permutation.  */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (node->load_permutation.exists ()
        && !vect_transform_slp_perm_load
              (node, vNULL, NULL,
               SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true))
      return false;

  return true;
}


/* Find the last scalar stmt in the SLP node NODE.  */
1452
1453 static gimple *
1454 vect_find_last_scalar_stmt_in_slp (slp_tree node)
1455 {
1456   gimple *last = NULL, *stmt;
1457
1458   for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt); i++)
1459     {
1460       stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1461       if (is_pattern_stmt_p (stmt_vinfo))
1462         last = get_later_stmt (STMT_VINFO_RELATED_STMT (stmt_vinfo), last);
1463       else
1464         last = get_later_stmt (stmt, last);
1465     }
1466
1467   return last;
1468 }
1469
1470 /* Compute the cost for the SLP node NODE in the SLP instance INSTANCE.  */
1471
1472 static void
1473 vect_analyze_slp_cost_1 (slp_instance instance, slp_tree node,
1474                          stmt_vector_for_cost *prologue_cost_vec,
1475                          stmt_vector_for_cost *body_cost_vec,
1476                          unsigned ncopies_for_cost)
1477 {
1478   unsigned i;
1479   slp_tree child;
1480   gimple *stmt, *s;
1481   stmt_vec_info stmt_info;
1482   tree lhs;
1483   unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
1484
1485   /* Recurse down the SLP tree.  */
1486   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1487     if (child)
1488       vect_analyze_slp_cost_1 (instance, child, prologue_cost_vec,
1489                                body_cost_vec, ncopies_for_cost);
1490
1491   /* Look at the first scalar stmt to determine the cost.  */
1492   stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1493   stmt_info = vinfo_for_stmt (stmt);
1494   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1495     {
1496       if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
1497         vect_model_store_cost (stmt_info, ncopies_for_cost, false,
1498                                vect_uninitialized_def,
1499                                node, prologue_cost_vec, body_cost_vec);
1500       else
1501         {
1502           int i;
1503           gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
1504           vect_model_load_cost (stmt_info, ncopies_for_cost, false,
1505                                 node, prologue_cost_vec, body_cost_vec);
1506           /* If the load is permuted record the cost for the permutation.
1507              ???  Loads from multiple chains are let through here only
1508              for a single special case involving complex numbers where
1509              in the end no permutation is necessary.  */
1510           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, s)
1511             if ((STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo_for_stmt (s))
1512                  == STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info))
1513                 && vect_get_place_in_interleaving_chain
1514                      (s, STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info)) != i)
1515               {
1516                 record_stmt_cost (body_cost_vec, group_size, vec_perm,
1517                                   stmt_info, 0, vect_body);
1518                 break;
1519               }
1520         }
1521     }
1522   else
1523     {
1524       record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
1525                         stmt_info, 0, vect_body);
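           /* Nodes mixing two operators, e.g. alternating plus and minus as
              in an addsub pattern, are implemented as both vector stmts plus
              a permutation blending the two results, accounted for below.  */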
1526       if (SLP_TREE_TWO_OPERATORS (node))
1527         {
1528           record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
1529                             stmt_info, 0, vect_body);
1530           record_stmt_cost (body_cost_vec, ncopies_for_cost, vec_perm,
1531                             stmt_info, 0, vect_body);
1532         }
1533     }
1534
1535   /* Scan operands and account for prologue cost of constants/externals.
1536      ???  This over-estimates cost for multiple uses and should be
1537      re-engineered.  */
1538   lhs = gimple_get_lhs (stmt);
1539   for (i = 0; i < gimple_num_ops (stmt); ++i)
1540     {
1541       tree def, op = gimple_op (stmt, i);
1542       gimple *def_stmt;
1543       enum vect_def_type dt;
1544       if (!op || op == lhs)
1545         continue;
1546       if (vect_is_simple_use (op, NULL, stmt_info->vinfo, &def_stmt, &def, &dt))
1547         {
1548           /* Without looking at the actual initializer a vector of
1549              constants can be implemented as a load from the constant pool.
1550              ???  We need to pass down stmt_info for a vector type
1551              even if it points to the wrong stmt.  */
1552           if (dt == vect_constant_def)
1553             record_stmt_cost (prologue_cost_vec, 1, vector_load,
1554                               stmt_info, 0, vect_prologue);
1555           else if (dt == vect_external_def)
1556             record_stmt_cost (prologue_cost_vec, 1, vec_construct,
1557                               stmt_info, 0, vect_prologue);
1558         }
1559     }
1560 }
1561
1562 /* Compute the cost for the SLP instance INSTANCE.  */
1563
1564 static void
1565 vect_analyze_slp_cost (slp_instance instance, void *data)
1566 {
1567   stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
1568   unsigned ncopies_for_cost;
1569   stmt_info_for_cost *si;
1570   unsigned i;
1571
1572   /* Calculate the number of vector stmts to create based on the unrolling
1573      factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
1574      GROUP_SIZE / NUNITS otherwise).  */
1575   unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
1576   slp_tree node = SLP_INSTANCE_TREE (instance);
1577   stmt_vec_info stmt_info = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
1578   /* Adjust the group_size by the vectorization factor which is always one
1579      for basic-block vectorization.  */
1580   if (STMT_VINFO_LOOP_VINFO (stmt_info))
1581     group_size *= LOOP_VINFO_VECT_FACTOR (STMT_VINFO_LOOP_VINFO (stmt_info));
1582   unsigned nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1583   /* For reductions look at a reduction operand in case the reduction
1584      operation is widening like DOT_PROD or SAD.  */
1585   if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1586     {
1587       gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1588       switch (gimple_assign_rhs_code (stmt))
1589         {
1590         case DOT_PROD_EXPR:
1591         case SAD_EXPR:
1592           nunits = TYPE_VECTOR_SUBPARTS (get_vectype_for_scalar_type
1593                                 (TREE_TYPE (gimple_assign_rhs1 (stmt))));
1594           break;
1595         default:;
1596         }
1597     }
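       /* E.g. for a group of 8 ints and V4SI vectors (NUNITS == 4) this
          computes lcm (4, 8) / 4 == 2: two vector stmts cover the group.  */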
1598   ncopies_for_cost = least_common_multiple (nunits, group_size) / nunits;
1599
1600   prologue_cost_vec.create (10);
1601   body_cost_vec.create (10);
1602   vect_analyze_slp_cost_1 (instance, SLP_INSTANCE_TREE (instance),
1603                            &prologue_cost_vec, &body_cost_vec,
1604                            ncopies_for_cost);
1605
1606   /* Record the prologue costs, which were delayed until we were
1607      sure that SLP was successful.  */
1608   FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
1609     {
1610       struct _stmt_vec_info *stmt_info
1611         = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1612       (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1613                             si->misalign, vect_prologue);
1614     }
1615
1616   /* Record the instance's instructions in the target cost model.  */
1617   FOR_EACH_VEC_ELT (body_cost_vec, i, si)
1618     {
1619       struct _stmt_vec_info *stmt_info
1620         = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1621       (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1622                             si->misalign, vect_body);
1623     }
1624
1625   prologue_cost_vec.release ();
1626   body_cost_vec.release ();
1627 }
1628
1629 /* Analyze an SLP instance starting from a group of grouped stores.  Call
1630    vect_build_slp_tree to build a tree of packed stmts if possible.
1631    Return FALSE if it's impossible to SLP any stmt in the loop.  */
1632
1633 static bool
1634 vect_analyze_slp_instance (vec_info *vinfo,
1635                            gimple *stmt, unsigned max_tree_size)
1636 {
1637   slp_instance new_instance;
1638   slp_tree node;
1639   unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
1640   unsigned int unrolling_factor = 1, nunits;
1641   tree vectype, scalar_type = NULL_TREE;
1642   gimple *next;
1643   unsigned int vectorization_factor = 0;
1644   int i;
1645   unsigned int max_nunits = 0;
1646   vec<slp_tree> loads;
1647   struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
1648   vec<gimple *> scalar_stmts;
1649
1650   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1651     {
1652       if (dr)
1653         {
1654           scalar_type = TREE_TYPE (DR_REF (dr));
1655           vectype = get_vectype_for_scalar_type (scalar_type);
1656         }
1657       else
1658         {
1659           gcc_assert (is_a <loop_vec_info> (vinfo));
1660           vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1661         }
1662
1663       group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
1664     }
1665   else
1666     {
1667       gcc_assert (is_a <loop_vec_info> (vinfo));
1668       vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1669       group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
1670     }
1671
1672   if (!vectype)
1673     {
1674       if (dump_enabled_p ())
1675         {
1676           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1677                            "Build SLP failed: unsupported data-type ");
1678           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
1679           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1680         }
1681
1682       return false;
1683     }
1684
1685   nunits = TYPE_VECTOR_SUBPARTS (vectype);
1686   if (is_a <loop_vec_info> (vinfo))
1687     vectorization_factor = as_a <loop_vec_info> (vinfo)->vectorization_factor;
1688   else
1689     vectorization_factor = nunits;
1690
1691   /* Calculate the unrolling factor.  */
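       /* E.g. a group of 3 ints with V4SI vectors (NUNITS == 4) gets an
          unrolling factor of lcm (4, 3) / 3 == 4, the smallest number of
          group copies that fills whole vectors.  */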
1692   unrolling_factor = least_common_multiple (nunits, group_size) / group_size;
1693   if (unrolling_factor != 1 && is_a <bb_vec_info> (vinfo))
1694     {
1695       if (dump_enabled_p ())
1696         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1697                          "Build SLP failed: unrolling required in basic"
1698                          " block SLP\n");
1699
1700       return false;
1701     }
1702
1703   /* Create a node (a root of the SLP tree) for the packed grouped stores.  */
1704   scalar_stmts.create (group_size);
1705   next = stmt;
1706   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1707     {
1708       /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
1709       while (next)
1710         {
1711           if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
1712               && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
1713             scalar_stmts.safe_push (
1714                   STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
1715           else
1716             scalar_stmts.safe_push (next);
1717           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
1718         }
1719       /* Mark the first element of the reduction chain as reduction to properly
1720          transform the node.  In the reduction analysis phase only the last
1721          element of the chain is marked as reduction.  */
1722       if (!STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
1723         STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_reduction_def;
1724     }
1725   else
1726     {
1727       /* Collect reduction statements.  */
1728       vec<gimple *> reductions = as_a <loop_vec_info> (vinfo)->reductions;
1729       for (i = 0; reductions.iterate (i, &next); i++)
1730         scalar_stmts.safe_push (next);
1731     }
1732
1733   node = vect_create_new_slp_node (scalar_stmts);
1734
1735   loads.create (group_size);
1736
1737   /* Build the tree for the SLP instance.  */
1738   bool *matches = XALLOCAVEC (bool, group_size);
1739   unsigned npermutes = 0;
1740   if (vect_build_slp_tree (vinfo, &node, group_size,
1741                            &max_nunits, &loads,
1742                            vectorization_factor, matches, &npermutes, NULL,
1743                            max_tree_size))
1744     {
1745       /* Calculate the unrolling factor based on the smallest type.  */
1746       if (max_nunits > nunits)
1747         unrolling_factor = least_common_multiple (max_nunits, group_size)
1748                            / group_size;
1749
1750       if (unrolling_factor != 1 && is_a <bb_vec_info> (vinfo))
1751         {
1752           if (dump_enabled_p ())
1753             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1754                              "Build SLP failed: unrolling required in basic"
1755                              " block SLP\n");
1756           vect_free_slp_tree (node);
1757           loads.release ();
1758           return false;
1759         }
1760
1761       /* Create a new SLP instance.  */
1762       new_instance = XNEW (struct _slp_instance);
1763       SLP_INSTANCE_TREE (new_instance) = node;
1764       SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
1765       SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
1766       SLP_INSTANCE_LOADS (new_instance) = loads;
1767
1768       /* Compute the load permutation.  */
1769       slp_tree load_node;
1770       bool loads_permuted = false;
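           /* E.g. a load node reading b[2*i+1] and then b[2*i] from a group
              starting at b[2*i] gets load_permutation {1, 0} below.  */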
1771       FOR_EACH_VEC_ELT (loads, i, load_node)
1772         {
1773           vec<unsigned> load_permutation;
1774           int j;
1775           gimple *load, *first_stmt;
1776           bool this_load_permuted = false;
1777           load_permutation.create (group_size);
1778           first_stmt = GROUP_FIRST_ELEMENT
1779               (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
1780           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
1781             {
1782               int load_place
1783                 = vect_get_place_in_interleaving_chain (load, first_stmt);
1784               gcc_assert (load_place != -1);
1785               if (load_place != j)
1786                 this_load_permuted = true;
1787               load_permutation.safe_push (load_place);
1788             }
1789           if (!this_load_permuted
1790               /* The load requires permutation when unrolling exposes
1791                  a gap either because the group is larger than the SLP
1792                  group-size or because there is a gap between the groups.  */
1793               && (unrolling_factor == 1
1794                   || (group_size == GROUP_SIZE (vinfo_for_stmt (first_stmt))
1795                       && GROUP_GAP (vinfo_for_stmt (first_stmt)) == 0)))
1796             {
1797               load_permutation.release ();
1798               continue;
1799             }
1800           SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
1801           loads_permuted = true;
1802         }
1803
1804       if (loads_permuted)
1805         {
1806           if (!vect_supported_load_permutation_p (new_instance))
1807             {
1808               if (dump_enabled_p ())
1809                 {
1810                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1811                                    "Build SLP failed: unsupported load "
1812                                    "permutation ");
1813                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
1814                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1815                 }
1816               vect_free_slp_instance (new_instance);
1817               return false;
1818             }
1819         }
1820
1821       vinfo->slp_instances.safe_push (new_instance);
1822
1823       if (dump_enabled_p ())
1824         vect_print_slp_tree (MSG_NOTE, node);
1825
1826       return true;
1827     }
1828
1829   /* Failed to SLP.  */
1830   /* Free the allocated memory.  */
1831   vect_free_slp_tree (node);
1832   loads.release ();
1833
1834   return false;
1835 }
1836
1837
1838 /* Check if there are stmts in the loop that can be vectorized using SLP.
1839    Build SLP trees of packed scalar stmts if SLP is possible.  */
1840
1841 bool
1842 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
1843 {
1844   unsigned int i;
1845   gimple *first_element;
1846   bool ok = false;
1847
1848   if (dump_enabled_p ())
1849     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===\n");
1850
1851   /* Find SLP sequences starting from groups of grouped stores.  */
1852   FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
1853     if (vect_analyze_slp_instance (vinfo, first_element, max_tree_size))
1854       ok = true;
1855
1856   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1857     {
1858       if (loop_vinfo->reduction_chains.length () > 0)
1859         {
1860           /* Find SLP sequences starting from reduction chains.  */
1861           FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
1862               if (vect_analyze_slp_instance (vinfo, first_element,
1863                                              max_tree_size))
1864                 ok = true;
1865               else
1866                 return false;
1867
1868           /* Don't try to vectorize SLP reductions if reduction chain was
1869              detected.  */
1870           return ok;
1871         }
1872
1873       /* Find SLP sequences starting from groups of reductions.  */
1874       if (loop_vinfo->reductions.length () > 1
1875           && vect_analyze_slp_instance (vinfo, loop_vinfo->reductions[0],
1876                                         max_tree_size))
1877         ok = true;
1878     }
1879
1880   return true;
1881 }
1882
1883
1884 /* For each possible SLP instance decide whether to SLP it and calculate the
1885    overall unrolling factor needed to SLP the loop.  Return TRUE if we decided
1886    to SLP at least one instance.  */
1887
1888 bool
1889 vect_make_slp_decision (loop_vec_info loop_vinfo)
1890 {
1891   unsigned int i, unrolling_factor = 1;
1892   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1893   slp_instance instance;
1894   int decided_to_slp = 0;
1895
1896   if (dump_enabled_p ())
1897     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ==="
1898                      "\n");
1899
1900   FOR_EACH_VEC_ELT (slp_instances, i, instance)
1901     {
1902       /* FORNOW: SLP if you can.  */
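           /* The overall factor has to cover every instance, so keep the
              maximum; e.g. instance factors 2 and 4 force an overall
              factor of 4.  */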
1903       if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance))
1904         unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance);
1905
1906       /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
1907          call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
1908          loop-based vectorization.  Such stmts will be marked as HYBRID.  */
1909       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
1910       decided_to_slp++;
1911     }
1912
1913   LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
1914
1915   if (decided_to_slp && dump_enabled_p ())
1916     dump_printf_loc (MSG_NOTE, vect_location,
1917                      "Decided to SLP %d instances. Unrolling factor %d\n",
1918                      decided_to_slp, unrolling_factor);
1919
1920   return (decided_to_slp > 0);
1921 }
1922
1923
1924 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
1925    can't be SLPed) in the tree rooted at NODE.  Mark such stmts as HYBRID.  */
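     /* For instance, an SLP def that also feeds a non-SLP reduction stmt
        must additionally be generated by loop-based vectorization and is
        therefore marked hybrid.  */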
1926
1927 static void
1928 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
1929 {
1930   gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[i];
1931   imm_use_iterator imm_iter;
1932   gimple *use_stmt;
1933   stmt_vec_info use_vinfo, stmt_vinfo = vinfo_for_stmt (stmt);
1934   slp_tree child;
1935   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1936   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1937   int j;
1938
1939   /* Propagate hybrid down the SLP tree.  */
1940   if (stype == hybrid)
1941     ;
1942   else if (HYBRID_SLP_STMT (stmt_vinfo))
1943     stype = hybrid;
1944   else
1945     {
1946       /* Check if a pure SLP stmt has uses in non-SLP stmts.  */
1947       gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
1948       /* We always get the pattern stmt here, but for immediate
1949          uses we have to use the LHS of the original stmt.  */
1950       gcc_checking_assert (!STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
1951       if (STMT_VINFO_RELATED_STMT (stmt_vinfo))
1952         stmt = STMT_VINFO_RELATED_STMT (stmt_vinfo);
1953       if (TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME)
1954         FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0))
1955           {
1956             if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1957               continue;
1958             use_vinfo = vinfo_for_stmt (use_stmt);
1959             if (STMT_VINFO_IN_PATTERN_P (use_vinfo)
1960                 && STMT_VINFO_RELATED_STMT (use_vinfo))
1961               use_vinfo = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (use_vinfo));
1962             if (!STMT_SLP_TYPE (use_vinfo)
1963                 && (STMT_VINFO_RELEVANT (use_vinfo)
1964                     || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
1965                 && !(gimple_code (use_stmt) == GIMPLE_PHI
1966                      && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
1967               {
1968                 if (dump_enabled_p ())
1969                   {
1970                     dump_printf_loc (MSG_NOTE, vect_location, "use of SLP "
1971                                      "def in non-SLP stmt: ");
1972                     dump_gimple_stmt (MSG_NOTE, TDF_SLIM, use_stmt, 0);
1973                   }
1974                 stype = hybrid;
1975               }
1976           }
1977     }
1978
1979   if (stype == hybrid
1980       && !HYBRID_SLP_STMT (stmt_vinfo))
1981     {
1982       if (dump_enabled_p ())
1983         {
1984           dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
1985           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
1986         }
1987       STMT_SLP_TYPE (stmt_vinfo) = hybrid;
1988     }
1989
1990   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
1991     if (child)
1992       vect_detect_hybrid_slp_stmts (child, i, stype);
1993 }
1994
1995 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses.  */
1996
1997 static tree
1998 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
1999 {
2000   walk_stmt_info *wi = (walk_stmt_info *)data;
2001   struct loop *loopp = (struct loop *)wi->info;
2002
2003   if (wi->is_lhs)
2004     return NULL_TREE;
2005
2006   if (TREE_CODE (*tp) == SSA_NAME
2007       && !SSA_NAME_IS_DEFAULT_DEF (*tp))
2008     {
2009       gimple *def_stmt = SSA_NAME_DEF_STMT (*tp);
2010       if (flow_bb_inside_loop_p (loopp, gimple_bb (def_stmt))
2011           && PURE_SLP_STMT (vinfo_for_stmt (def_stmt)))
2012         {
2013           if (dump_enabled_p ())
2014             {
2015               dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2016               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2017             }
2018           STMT_SLP_TYPE (vinfo_for_stmt (def_stmt)) = hybrid;
2019         }
2020     }
2021
2022   return NULL_TREE;
2023 }
2024
2025 static tree
2026 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
2027                           walk_stmt_info *)
2028 {
2029   /* If the stmt is in an SLP instance then this isn't a reason
2030      to mark the defs of its uses in other SLP instances as hybrid.  */
2031   if (STMT_SLP_TYPE (vinfo_for_stmt (gsi_stmt (*gsi))) != loop_vect)
2032     *handled = true;
2033   return NULL_TREE;
2034 }
2035
2036 /* Find stmts that must be both vectorized and SLPed.  */
2037
2038 void
2039 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2040 {
2041   unsigned int i;
2042   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2043   slp_instance instance;
2044
2045   if (dump_enabled_p ())
2046     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ==="
2047                      "\n");
2048
2049   /* First walk all pattern stmts in the loop and mark the defs of their
2050      uses as hybrid, since immediate uses in pattern stmts are not recorded.  */
2051   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2052     {
2053       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2054       for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2055            gsi_next (&gsi))
2056         {
2057           gimple *stmt = gsi_stmt (gsi);
2058           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2059           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2060             {
2061               walk_stmt_info wi;
2062               memset (&wi, 0, sizeof (wi));
2063               wi.info = LOOP_VINFO_LOOP (loop_vinfo);
2064               gimple_stmt_iterator gsi2
2065                 = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2066               walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
2067                                 vect_detect_hybrid_slp_1, &wi);
2068               walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
2069                                vect_detect_hybrid_slp_2,
2070                                vect_detect_hybrid_slp_1, &wi);
2071             }
2072         }
2073     }
2074
2075   /* Then walk the SLP instance trees marking stmts with uses in
2076      non-SLP stmts as hybrid, also propagating hybrid down the
2077      SLP tree, collecting the above info on-the-fly.  */
2078   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2079     {
2080       for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
2081         vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
2082                                       i, pure_slp);
2083     }
2084 }
2085
2086
2087 /* Create and initialize a new bb_vec_info struct for BB, as well as
2088    stmt_vec_info structs for all the stmts in it.  */
2089
2090 static bb_vec_info
2091 new_bb_vec_info (basic_block bb)
2092 {
2093   bb_vec_info res = NULL;
2094   gimple_stmt_iterator gsi;
2095
2096   res = (bb_vec_info) xcalloc (1, sizeof (struct _bb_vec_info));
2097   res->kind = vec_info::bb;
2098   BB_VINFO_BB (res) = bb;
2099
2100   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2101     {
2102       gimple *stmt = gsi_stmt (gsi);
2103       gimple_set_uid (stmt, 0);
2104       set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res));
2105     }
2106
2107   BB_VINFO_GROUPED_STORES (res).create (10);
2108   BB_VINFO_SLP_INSTANCES (res).create (2);
2109   BB_VINFO_TARGET_COST_DATA (res) = init_cost (NULL);
2110
2111   bb->aux = res;
2112   return res;
2113 }
2114
2115
2116 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2117    stmts in the basic block.  */
2118
2119 static void
2120 destroy_bb_vec_info (bb_vec_info bb_vinfo)
2121 {
2122   vec<slp_instance> slp_instances;
2123   slp_instance instance;
2124   basic_block bb;
2125   gimple_stmt_iterator si;
2126   unsigned i;
2127
2128   if (!bb_vinfo)
2129     return;
2130
2131   bb = BB_VINFO_BB (bb_vinfo);
2132
2133   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2134     {
2135       gimple *stmt = gsi_stmt (si);
2136       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2137
2138       if (stmt_info)
2139         /* Free stmt_vec_info.  */
2140         free_stmt_vec_info (stmt);
2141     }
2142
2143   vect_destroy_datarefs (bb_vinfo);
2144   free_dependence_relations (BB_VINFO_DDRS (bb_vinfo));
2145   BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
2146   slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2147   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2148     vect_free_slp_instance (instance);
2149   BB_VINFO_SLP_INSTANCES (bb_vinfo).release ();
2150   destroy_cost_data (BB_VINFO_TARGET_COST_DATA (bb_vinfo));
2151   free (bb_vinfo);
2152   bb->aux = NULL;
2153 }
2154
2155
2156 /* Analyze statements contained in the SLP tree NODE after recursively
2157    analyzing the subtree.  Return TRUE if the operations are supported.  */
2158
2159 static bool
2160 vect_slp_analyze_node_operations (slp_tree node)
2161 {
2162   bool dummy;
2163   int i;
2164   gimple *stmt;
2165   slp_tree child;
2166
2167   if (!node)
2168     return true;
2169
2170   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2171     if (!vect_slp_analyze_node_operations (child))
2172       return false;
2173
2174   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2175     {
2176       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2177       gcc_assert (stmt_info);
2178       gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2179
2180       if (!vect_analyze_stmt (stmt, &dummy, node))
2181         return false;
2182     }
2183
2184   return true;
2185 }
2186
2187
2188 /* Analyze statements in SLP instances of the basic block.  Return TRUE if the
2189    operations are supported.  */
2190
2191 bool
2192 vect_slp_analyze_operations (vec<slp_instance> slp_instances, void *data)
2193 {
2194   slp_instance instance;
2195   int i;
2196
2197   if (dump_enabled_p ())
2198     dump_printf_loc (MSG_NOTE, vect_location,
2199                      "=== vect_slp_analyze_operations ===\n");
2200
2201   for (i = 0; slp_instances.iterate (i, &instance); )
2202     {
2203       if (!vect_slp_analyze_node_operations (SLP_INSTANCE_TREE (instance)))
2204         {
2205           dump_printf_loc (MSG_NOTE, vect_location,
2206                            "removing SLP instance operations starting from: ");
2207           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
2208                             SLP_TREE_SCALAR_STMTS
2209                               (SLP_INSTANCE_TREE (instance))[0], 0);
2210           vect_free_slp_instance (instance);
2211           slp_instances.ordered_remove (i);
2212         }
2213       else
2214         {
2215           /* Compute the costs of the SLP instance.  */
2216           vect_analyze_slp_cost (instance, data);
2217           i++;
2218         }
2219     }
2220
2221   if (!slp_instances.length ())
2222     return false;
2223
2224   return true;
2225 }
2226
2227
2228 /* Compute the scalar cost of the SLP node NODE and its children
2229    and return it.  Do not account for defs that are marked in LIFE,
2230    and update LIFE according to the uses of NODE.  */
2231
2232 static unsigned
2233 vect_bb_slp_scalar_cost (basic_block bb,
2234                          slp_tree node, vec<bool, va_heap> *life)
2235 {
2236   unsigned scalar_cost = 0;
2237   unsigned i;
2238   gimple *stmt;
2239   slp_tree child;
2240
2241   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2242     {
2243       unsigned stmt_cost;
2244       ssa_op_iter op_iter;
2245       def_operand_p def_p;
2246       stmt_vec_info stmt_info;
2247
2248       if ((*life)[i])
2249         continue;
2250
2251       /* If there is a non-vectorized use of the defs then the scalar
2252          stmt is kept live in which case we do not account it or any
2253          required defs in the SLP children in the scalar cost.  This
2254          way we make the vectorization more costly when compared to
2255          the scalar cost.  */
2256       FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2257         {
2258           imm_use_iterator use_iter;
2259           gimple *use_stmt;
2260           FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2261             if (!is_gimple_debug (use_stmt)
2262                 && (gimple_code (use_stmt) == GIMPLE_PHI
2263                     || gimple_bb (use_stmt) != bb
2264                     || !STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (use_stmt))))
2265               {
2266                 (*life)[i] = true;
2267                 BREAK_FROM_IMM_USE_STMT (use_iter);
2268               }
2269         }
2270       if ((*life)[i])
2271         continue;
2272
2273       stmt_info = vinfo_for_stmt (stmt);
2274       if (STMT_VINFO_DATA_REF (stmt_info))
2275         {
2276           if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2277             stmt_cost = vect_get_stmt_cost (scalar_load);
2278           else
2279             stmt_cost = vect_get_stmt_cost (scalar_store);
2280         }
2281       else
2282         stmt_cost = vect_get_stmt_cost (scalar_stmt);
2283
2284       scalar_cost += stmt_cost;
2285     }
2286
2287   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2288     if (child)
2289       scalar_cost += vect_bb_slp_scalar_cost (bb, child, life);
2290
2291   return scalar_cost;
2292 }
2293
2294 /* Check if vectorization of the basic block is profitable.  */
2295
2296 static bool
2297 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2298 {
2299   vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2300   slp_instance instance;
2301   int i;
2302   unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2303   unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2304
2305   /* Calculate scalar cost.  */
2306   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2307     {
2308       auto_vec<bool, 20> life;
2309       life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
2310       scalar_cost += vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2311                                               SLP_INSTANCE_TREE (instance),
2312                                               &life);
2313     }
2314
2315   /* Complete the target-specific cost calculation.  */
2316   finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2317                &vec_inside_cost, &vec_epilogue_cost);
2318
2319   vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2320
2321   if (dump_enabled_p ())
2322     {
2323       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2324       dump_printf (MSG_NOTE, "  Vector inside of basic block cost: %d\n",
2325                    vec_inside_cost);
2326       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n", vec_prologue_cost);
2327       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n", vec_epilogue_cost);
2328       dump_printf (MSG_NOTE, "  Scalar cost of basic block: %d\n", scalar_cost);
2329     }
2330
2331   /* Vectorization is profitable if its cost is less than the cost of the
2332      scalar version.  */
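       /* E.g. a scalar cost of 8 against vector inside/prologue/epilogue
          costs of 2/1/0 gives 3 < 8, so the block is deemed profitable.  */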
2333   if (vec_outside_cost + vec_inside_cost >= scalar_cost)
2334     return false;
2335
2336   return true;
2337 }
2338
2339 /* Check if the basic block can be vectorized.  */
2340
2341 static bb_vec_info
2342 vect_slp_analyze_bb_1 (basic_block bb)
2343 {
2344   bb_vec_info bb_vinfo;
2345   vec<slp_instance> slp_instances;
2346   slp_instance instance;
2347   int i;
2348   int min_vf = 2;
2349   unsigned n_stmts = 0;
2350
2351   bb_vinfo = new_bb_vec_info (bb);
2352   if (!bb_vinfo)
2353     return NULL;
2354
2355   if (!vect_analyze_data_refs (bb_vinfo, &min_vf, &n_stmts))
2356     {
2357       if (dump_enabled_p ())
2358         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2359                          "not vectorized: unhandled data-ref in basic "
2360                          "block.\n");
2361
2362       destroy_bb_vec_info (bb_vinfo);
2363       return NULL;
2364     }
2365
2366   if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
2367     {
2368       if (dump_enabled_p ())
2369         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2370                          "not vectorized: not enough data-refs in "
2371                          "basic block.\n");
2372
2373       destroy_bb_vec_info (bb_vinfo);
2374       return NULL;
2375     }
2376
2377   if (!vect_analyze_data_ref_accesses (bb_vinfo))
2378     {
2379       if (dump_enabled_p ())
2380         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2381                          "not vectorized: unhandled data access in "
2382                          "basic block.\n");
2383
2384       destroy_bb_vec_info (bb_vinfo);
2385       return NULL;
2386     }
2387
2388   vect_pattern_recog (bb_vinfo);
2389
2390   if (!vect_analyze_data_refs_alignment (bb_vinfo))
2391     {
2392       if (dump_enabled_p ())
2393         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2394                          "not vectorized: bad data alignment in basic "
2395                          "block.\n");
2396
2397       destroy_bb_vec_info (bb_vinfo);
2398       return NULL;
2399     }
2400
2401   /* Check the SLP opportunities in the basic block, analyze and build SLP
2402      trees.  */
2403   if (!vect_analyze_slp (bb_vinfo, n_stmts))
2404     {
2405       if (dump_enabled_p ())
2406         {
2407           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2408                            "Failed to SLP the basic block.\n");
2409           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
2410                            "not vectorized: failed to find SLP opportunities "
2411                            "in basic block.\n");
2412         }
2413
2414       destroy_bb_vec_info (bb_vinfo);
2415       return NULL;
2416     }
2417
2418   slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2419
2420   /* Mark all the statements that we want to vectorize as pure SLP and
2421      relevant.  */
2422   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2423     {
2424       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2425       vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
2426     }
2427
2428   /* Mark all the statements that we do not want to vectorize.  */
2429   for (gimple_stmt_iterator gsi = gsi_start_bb (BB_VINFO_BB (bb_vinfo));
2430        !gsi_end_p (gsi); gsi_next (&gsi))
2431     {
2432       stmt_vec_info vinfo = vinfo_for_stmt (gsi_stmt (gsi));
2433       if (STMT_SLP_TYPE (vinfo) != pure_slp)
2434         STMT_VINFO_VECTORIZABLE (vinfo) = false;
2435     }
2436
2437   /* Analyze dependences.  At this point all stmts not participating in
2438      vectorization have to be marked.  Dependence analysis assumes
2439      that we either vectorize all SLP instances or none at all.  */
2440   if (!vect_slp_analyze_data_ref_dependences (bb_vinfo))
2441     {
2442       if (dump_enabled_p ())
2443         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2444                          "not vectorized: unhandled data dependence "
2445                          "in basic block.\n");
2446
2447       destroy_bb_vec_info (bb_vinfo);
2448       return NULL;
2449     }
2450
2451   if (!vect_verify_datarefs_alignment (bb_vinfo))
2452     {
2453       if (dump_enabled_p ())
2454         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2455                          "not vectorized: unsupported alignment in basic "
2456                          "block.\n");
2457       destroy_bb_vec_info (bb_vinfo);
2458       return NULL;
2459     }
2460
2461   if (!vect_slp_analyze_operations (BB_VINFO_SLP_INSTANCES (bb_vinfo),
2462                                     BB_VINFO_TARGET_COST_DATA (bb_vinfo)))
2463     {
2464       if (dump_enabled_p ())
2465         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2466                          "not vectorized: bad operation in basic block.\n");
2467
2468       destroy_bb_vec_info (bb_vinfo);
2469       return NULL;
2470     }
2471
2472   /* Cost model: check if the vectorization is worthwhile.  */
2473   if (!unlimited_cost_model (NULL)
2474       && !vect_bb_vectorization_profitable_p (bb_vinfo))
2475     {
2476       if (dump_enabled_p ())
2477         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2478                          "not vectorized: vectorization is not "
2479                          "profitable.\n");
2480
2481       destroy_bb_vec_info (bb_vinfo);
2482       return NULL;
2483     }
2484
2485   if (dump_enabled_p ())
2486     dump_printf_loc (MSG_NOTE, vect_location,
2487                      "Basic block will be vectorized using SLP\n");
2488
2489   return bb_vinfo;
2490 }
2491
2492
2493 bb_vec_info
2494 vect_slp_analyze_bb (basic_block bb)
2495 {
2496   bb_vec_info bb_vinfo;
2497   int insns = 0;
2498   gimple_stmt_iterator gsi;
2499   unsigned int vector_sizes;
2500
2501   if (dump_enabled_p ())
2502     dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");
2503
2504   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2505     {
2506       gimple *stmt = gsi_stmt (gsi);
2507       if (!is_gimple_debug (stmt)
2508           && !gimple_nop_p (stmt)
2509           && gimple_code (stmt) != GIMPLE_LABEL)
2510         insns++;
2511     }
2512
2513   if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2514     {
2515       if (dump_enabled_p ())
2516         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2517                          "not vectorized: too many instructions in "
2518                          "basic block.\n");
2519
2520       return NULL;
2521     }
2522
2523   /* Autodetect the first vector size we try.  */
2524   current_vector_size = 0;
2525   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
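       /* vector_sizes is a bitmask of the sizes the target supports, e.g.
          32 | 16 on an AVX target; each failed attempt below clears the
          size just tried and retries with the next biggest one.  */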
2526
2527   while (1)
2528     {
2529       bb_vinfo = vect_slp_analyze_bb_1 (bb);
2530       if (bb_vinfo)
2531         return bb_vinfo;
2532
2533       destroy_bb_vec_info (bb_vinfo);
2534
2535       vector_sizes &= ~current_vector_size;
2536       if (vector_sizes == 0
2537           || current_vector_size == 0)
2538         return NULL;
2539
2540       /* Try the next biggest vector size.  */
2541       current_vector_size = 1 << floor_log2 (vector_sizes);
2542       if (dump_enabled_p ())
2543         dump_printf_loc (MSG_NOTE, vect_location,
2544                          "***** Re-trying analysis with "
2545                          "vector size %d\n", current_vector_size);
2546     }
2547 }
2548
2549
2550 /* For constant and loop invariant defs of SLP_NODE this function returns
2551    (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
2552    OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
2553    scalar stmts.  NUMBER_OF_VECTORS is the number of vector defs to create.
2554    REDUC_INDEX is the index of the reduction operand in the statements,
2555    or -1 if there is no reduction operand.  */
2556
2557 static void
2558 vect_get_constant_vectors (tree op, slp_tree slp_node,
2559                            vec<tree> *vec_oprnds,
2560                            unsigned int op_num, unsigned int number_of_vectors,
2561                            int reduc_index)
2562 {
2563   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2564   gimple *stmt = stmts[0];
2565   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2566   unsigned nunits;
2567   tree vec_cst;
2568   tree *elts;
2569   unsigned j, number_of_places_left_in_vector;
2570   tree vector_type;
2571   tree vop;
2572   int group_size = stmts.length ();
2573   unsigned int vec_num, i;
2574   unsigned number_of_copies = 1;
2575   vec<tree> voprnds;
2576   voprnds.create (number_of_vectors);
2577   bool constant_p, is_store;
2578   tree neutral_op = NULL;
2579   enum tree_code code = gimple_expr_code (stmt);
2580   gimple *def_stmt;
2581   struct loop *loop;
2582   gimple_seq ctor_seq = NULL;
2583
2584   vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
2585   nunits = TYPE_VECTOR_SUBPARTS (vector_type);
2586
2587   if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
2588       && reduc_index != -1)
2589     {
2590       op_num = reduc_index;
2591       op = gimple_op (stmt, op_num + 1);
2592       /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
2593          we need either neutral operands or the original operands.  See
2594          get_initial_def_for_reduction() for details.  */
2595       switch (code)
2596         {
2597           case WIDEN_SUM_EXPR:
2598           case DOT_PROD_EXPR:
2599           case SAD_EXPR:
2600           case PLUS_EXPR:
2601           case MINUS_EXPR:
2602           case BIT_IOR_EXPR:
2603           case BIT_XOR_EXPR:
2604              if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2605                neutral_op = build_real (TREE_TYPE (op), dconst0);
2606              else
2607                neutral_op = build_int_cst (TREE_TYPE (op), 0);
2608
2609              break;
2610
2611           case MULT_EXPR:
2612              if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2613                neutral_op = build_real (TREE_TYPE (op), dconst1);
2614              else
2615                neutral_op = build_int_cst (TREE_TYPE (op), 1);
2616
2617              break;
2618
2619           case BIT_AND_EXPR:
2620             neutral_op = build_int_cst (TREE_TYPE (op), -1);
2621             break;
2622
2623           /* For MIN/MAX we don't have an easy neutral operand but
2624              the initial values can be used fine here.  Only for
2625              a reduction chain do we have to force a neutral element.  */
2626           case MAX_EXPR:
2627           case MIN_EXPR:
2628             if (!GROUP_FIRST_ELEMENT (stmt_vinfo))
2629               neutral_op = NULL;
2630             else
2631               {
2632                 def_stmt = SSA_NAME_DEF_STMT (op);
2633                 loop = (gimple_bb (stmt))->loop_father;
2634                 neutral_op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2635                                                     loop_preheader_edge (loop));
2636               }
2637             break;
2638
2639           default:
2640             gcc_assert (!GROUP_FIRST_ELEMENT (stmt_vinfo));
2641             neutral_op = NULL;
2642         }
2643     }
2644
2645   if (STMT_VINFO_DATA_REF (stmt_vinfo))
2646     {
2647       is_store = true;
2648       op = gimple_assign_rhs1 (stmt);
2649     }
2650   else
2651     is_store = false;
2652
2653   gcc_assert (op);
2654
2655   if (CONSTANT_CLASS_P (op))
2656     constant_p = true;
2657   else
2658     constant_p = false;
2659
2660   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
2661      the created vectors.  It is greater than 1 if unrolling is performed.
2662
2663      For example, we have two scalar operands, s1 and s2 (e.g., group of
2664      strided accesses of size two), while NUNITS is four (i.e., four scalars
2665      of this type can be packed in a vector).  The output vector will contain
2666      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
2667      will be 2).
2668
2669      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
2670      containing the operands.
2671
2672      For example, NUNITS is four as before, and the group size is 8
2673      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
2674      {s5, s6, s7, s8}.  */
2675
2676   number_of_copies = nunits * number_of_vectors / group_size;
2677
2678   number_of_places_left_in_vector = nunits;
2679   elts = XALLOCAVEC (tree, nunits);
2680   bool place_after_defs = false;
2681   for (j = 0; j < number_of_copies; j++)
2682     {
2683       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
2684         {
2685           if (is_store)
2686             op = gimple_assign_rhs1 (stmt);
2687           else
2688             {
2689               switch (code)
2690                 {
2691                   case COND_EXPR:
2692                     if (op_num == 0 || op_num == 1)
2693                       {
2694                         tree cond = gimple_assign_rhs1 (stmt);
2695                         op = TREE_OPERAND (cond, op_num);
2696                       }
2697                     else
2698                       {
2699                         if (op_num == 2)
2700                           op = gimple_assign_rhs2 (stmt);
2701                         else
2702                           op = gimple_assign_rhs3 (stmt);
2703                       }
2704                     break;
2705
2706                   case CALL_EXPR:
2707                     op = gimple_call_arg (stmt, op_num);
2708                     break;
2709
2710                   case LSHIFT_EXPR:
2711                   case RSHIFT_EXPR:
2712                   case LROTATE_EXPR:
2713                   case RROTATE_EXPR:
2714                     op = gimple_op (stmt, op_num + 1);
2715                     /* Unlike the other binary operators, shifts/rotates take
2716                        a shift count of type int rather than of the same type
2717                        as the lhs, so make sure the scalar has the right type
2718                        when we are dealing with vectors of
2719                        long long/long/short/char.  */
2720                     if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
2721                       op = fold_convert (TREE_TYPE (vector_type), op);
2722                     break;
2723
2724                   default:
2725                     op = gimple_op (stmt, op_num + 1);
2726                     break;
2727                 }
2728             }
2729
2730           if (reduc_index != -1)
2731             {
2732               loop = (gimple_bb (stmt))->loop_father;
2733               def_stmt = SSA_NAME_DEF_STMT (op);
2734
2735               gcc_assert (loop);
2736
2737               /* Get the def before the loop.  In a reduction chain we have
2738                  only one initial value.  */
2739               if ((j != (number_of_copies - 1)
2740                    || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2741                        && i != 0))
2742                   && neutral_op)
2743                 op = neutral_op;
2744               else
2745                 op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2746                                             loop_preheader_edge (loop));
2747             }
2748
2749           /* Create 'vect_ = {op0,op1,...,opn}'.  */
2750           number_of_places_left_in_vector--;
2751           tree orig_op = op;
2752           if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
2753             {
2754               if (CONSTANT_CLASS_P (op))
2755                 {
2756                   op = fold_unary (VIEW_CONVERT_EXPR,
2757                                    TREE_TYPE (vector_type), op);
2758                   gcc_assert (op && CONSTANT_CLASS_P (op));
2759                 }
2760               else
2761                 {
2762                   tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
2763                   gimple *init_stmt;
2764                   op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type), op);
2765                   init_stmt
2766                     = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR, op);
2767                   gimple_seq_add_stmt (&ctor_seq, init_stmt);
2768                   op = new_temp;
2769                 }
2770             }
2771           elts[number_of_places_left_in_vector] = op;
2772           if (!CONSTANT_CLASS_P (op))
2773             constant_p = false;
2774           if (TREE_CODE (orig_op) == SSA_NAME
2775               && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
2776               && STMT_VINFO_BB_VINFO (stmt_vinfo)
2777               && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
2778                   == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
2779             place_after_defs = true;
2780
2781           if (number_of_places_left_in_vector == 0)
2782             {
2783               number_of_places_left_in_vector = nunits;
2784
2785               if (constant_p)
2786                 vec_cst = build_vector (vector_type, elts);
2787               else
2788                 {
2789                   vec<constructor_elt, va_gc> *v;
2790                   unsigned k;
2791                   vec_alloc (v, nunits);
2792                   for (k = 0; k < nunits; ++k)
2793                     CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
2794                   vec_cst = build_constructor (vector_type, v);
2795                 }
2796               tree init;
2797               gimple_stmt_iterator gsi;
2798               if (place_after_defs)
2799                 {
2800                   gsi = gsi_for_stmt
2801                           (vect_find_last_scalar_stmt_in_slp (slp_node));
2802                   init = vect_init_vector (stmt, vec_cst, vector_type, &gsi);
2803                 }
2804               else
2805                 init = vect_init_vector (stmt, vec_cst, vector_type, NULL);
2806               if (ctor_seq != NULL)
2807                 {
2808                   gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
2809                   gsi_insert_seq_before_without_update (&gsi, ctor_seq,
2810                                                         GSI_SAME_STMT);
2811                   ctor_seq = NULL;
2812                 }
2813               voprnds.quick_push (init);
2814               place_after_defs = false;
2815             }
2816         }
2817     }
2818
2819   /* Since the vectors are created in the reverse order, we should invert
2820      them.  */
2821   vec_num = voprnds.length ();
2822   for (j = vec_num; j != 0; j--)
2823     {
2824       vop = voprnds[j - 1];
2825       vec_oprnds->quick_push (vop);
2826     }
2827
2828   voprnds.release ();
2829
2830   /* If the VF is greater than the unrolling factor needed for the SLP
2831      group of stmts, the NUMBER_OF_VECTORS to be created is greater than
2832      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
2833      to replicate the vectors to make up the difference.  */
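       /* E.g. a PLUS reduction that needs 4 vectors but whose group yields
          only 2 pads the remaining slots with the neutral {0, ..., 0}
          vector.  */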
2834   while (number_of_vectors > vec_oprnds->length ())
2835     {
2836       tree neutral_vec = NULL;
2837
2838       if (neutral_op)
2839         {
2840           if (!neutral_vec)
2841             neutral_vec = build_vector_from_val (vector_type, neutral_op);
2842
2843           vec_oprnds->quick_push (neutral_vec);
2844         }
2845       else
2846         {
2847           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
2848             vec_oprnds->quick_push (vop);
2849         }
2850     }
2851 }
2852
2853
2854 /* Get vectorized definitions from SLP_NODE, which contains the
2855    corresponding vectorized def-stmts.  */
2856
2857 static void
2858 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
2859 {
2860   tree vec_oprnd;
2861   gimple *vec_def_stmt;
2862   unsigned int i;
2863
2864   gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
2865
2866   FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
2867     {
2868       gcc_assert (vec_def_stmt);
2869       vec_oprnd = gimple_get_lhs (vec_def_stmt);
2870       vec_oprnds->quick_push (vec_oprnd);
2871     }
2872 }
2873
2874
2875 /* Get vectorized definitions for SLP_NODE.
2876    If the scalar definitions are loop invariants or constants, collect them and
2877    call vect_get_constant_vectors() to create vector stmts.
2878    Otherwise, the def-stmts must already be vectorized and the vectorized stmts
2879    must be stored in the corresponding child of SLP_NODE, and we call
2880    vect_get_slp_vect_defs () to retrieve them.  */
2881
2882 void
2883 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
2884                    vec<vec<tree> > *vec_oprnds, int reduc_index)
2885 {
2886   gimple *first_stmt;
2887   int number_of_vects = 0, i;
2888   unsigned int child_index = 0;
2889   HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
2890   slp_tree child = NULL;
2891   vec<tree> vec_defs;
2892   tree oprnd;
2893   bool vectorized_defs;
2894
2895   first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
2896   FOR_EACH_VEC_ELT (ops, i, oprnd)
2897     {
2898       /* For each operand we check if it has vectorized definitions in a child
2899          node or we need to create them (for invariants and constants).  We
2900          check if the LHS of the first stmt of the next child matches OPRND.
2901          If it does, we found the correct child.  Otherwise, we call
2902          vect_get_constant_vectors () and do not advance CHILD_INDEX, so
2903          that this child node is checked again for the next operand.  */
2904       vectorized_defs = false;
2905       if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
2906         {
2907           child = SLP_TREE_CHILDREN (slp_node)[child_index];
2908
2909           /* We have to check both pattern and original def, if available.  */
2910           if (child)
2911             {
2912               gimple *first_def = SLP_TREE_SCALAR_STMTS (child)[0];
2913               gimple *related
2914                 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
2915
2916               if (operand_equal_p (oprnd, gimple_get_lhs (first_def), 0)
2917                   || (related
2918                       && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
2919                 {
2920                   /* The number of vector defs is determined by the number of
2921                      vector statements in the node from which we get those
2922                      statements.  */
2923                   number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
2924                   vectorized_defs = true;
2925                   child_index++;
2926                 }
2927             }
2928           else
2929             child_index++;
2930         }
2931
2932       if (!vectorized_defs)
2933         {
2934           if (i == 0)
2935             {
2936               number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
2937               /* The number of vector stmts was calculated according to LHS
2938                  in vect_schedule_slp_instance (); fix it by replacing LHS
2939                  with RHS, if necessary.  See vect_get_smallest_scalar_type ()
2940                  for details.  */
2941               vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
2942                                              &rhs_size_unit);
2943               if (rhs_size_unit != lhs_size_unit)
2944                 {
2945                   number_of_vects *= rhs_size_unit;
2946                   number_of_vects /= lhs_size_unit;
2947                 }
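              /* Editor's note, illustrative numbers: for a char-to-int
                 conversion (RHS unit 1, LHS unit 4) four LHS vectors
                 correspond to one RHS vector, so NUMBER_OF_VECTS shrinks
                 by a factor of four here.  */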
2948             }
2949         }
2950
2951       /* Allocate memory for vectorized defs.  */
2952       vec_defs = vNULL;
2953       vec_defs.create (number_of_vects);
2954
2955       /* For reduction defs we call vect_get_constant_vectors (), since we are
2956          looking for initial loop invariant values.  */
2957       if (vectorized_defs && reduc_index == -1)
2958         /* The defs are already vectorized.  */
2959         vect_get_slp_vect_defs (child, &vec_defs);
2960       else
2961         /* Build vectors from scalar defs.  */
2962         vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
2963                                    number_of_vects, reduc_index);
2964
2965       vec_oprnds->quick_push (vec_defs);
2966
2967       /* For reductions, we only need initial values.  */
2968       if (reduc_index != -1)
2969         return;
2970     }
2971 }
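
/* Editor's sketch (not from this file; guarded out so it cannot affect the
   build): how a caller along the lines of vect_get_vec_defs in
   tree-vect-stmts.c typically consumes vect_get_slp_defs for a one- or
   two-operand stmt.  The helper name and exact shape are illustrative
   assumptions, not established GCC API.  */
#if 0
static void
example_get_slp_operands (tree op0, tree op1, slp_tree slp_node,
                          vec<tree> *vec_oprnds0, vec<tree> *vec_oprnds1)
{
  int nops = (op1 == NULL_TREE) ? 1 : 2;
  auto_vec<tree> ops (nops);
  auto_vec<vec<tree> > vec_defs (nops);

  ops.quick_push (op0);
  if (op1 != NULL_TREE)
    ops.quick_push (op1);

  /* One inner vector of vectorized defs comes back per operand;
     a REDUC_INDEX of -1 says this is not a reduction.  */
  vect_get_slp_defs (ops, slp_node, &vec_defs, -1);

  *vec_oprnds0 = vec_defs[0];
  if (op1 != NULL_TREE)
    *vec_oprnds1 = vec_defs[1];
}
#endif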
2972
2973
2974 /* Create NCOPIES permutation statements using the vector mask MASK and two
2975    input vectors placed in DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX
2976    for the first copy, shifting both indices by STRIDE elements of DR_CHAIN
2977    for every following copy.
2978    (STRIDE is the number of vectorized stmts for NODE divided by the number
2979    of copies.)
2980    VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE
2981    at which the created stmts must be stored.  */
2982
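/* Editor's note, a worked instance of the scheme above (numbers are
   illustrative): with SLP_TREE_NUMBER_OF_VEC_STMTS == 4 and NCOPIES == 2,
   STRIDE is 2; copy 0 permutes DR_CHAIN[FIRST_VEC_INDX] with
   DR_CHAIN[SECOND_VEC_INDX] and is stored at slot VECT_STMTS_COUNTER,
   while copy 1 uses DR_CHAIN[FIRST_VEC_INDX + 2] and
   DR_CHAIN[SECOND_VEC_INDX + 2] and is stored at slot
   VECT_STMTS_COUNTER + 2.  */
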
2983 static inline void
2984 vect_create_mask_and_perm (gimple *stmt,
2985                            tree mask, int first_vec_indx, int second_vec_indx,
2986                            gimple_stmt_iterator *gsi, slp_tree node,
2987                            tree vectype, vec<tree> dr_chain,
2988                            int ncopies, int vect_stmts_counter)
2989 {
2990   tree perm_dest;
2991   gimple *perm_stmt = NULL;
2992   int i, stride;
2993   tree first_vec, second_vec, data_ref;
2994
2995   stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
2996
2997   /* Pre-fill the vect stmts of NODE with NULLs so that the generated
2998      stmts can be stored at their proper positions later.  */
2999   for (i = SLP_TREE_VEC_STMTS (node).length ();
3000        i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
3001     SLP_TREE_VEC_STMTS (node).quick_push (NULL);
3002
3003   perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
3004   for (i = 0; i < ncopies; i++)
3005     {
3006       first_vec = dr_chain[first_vec_indx];
3007       second_vec = dr_chain[second_vec_indx];
3008
3009       /* Generate the permute statement.  */
3010       perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
3011                                        first_vec, second_vec, mask);
3012       data_ref = make_ssa_name (perm_dest, perm_stmt);
3013       gimple_set_lhs (perm_stmt, data_ref);
3014       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
3015
3016       /* Store the vector statement in NODE.  */
3017       SLP_TREE_VEC_STMTS (node)[stride * i + vect_stmts_counter] = perm_stmt;
3018
3019       first_vec_indx += stride;
3020       second_vec_indx += stride;
3021     }
3022 }
3023
3024
3025 /* Given FIRST_MASK_ELEMENT, the mask element in element representation,
3026    return in CURRENT_MASK_ELEMENT its equivalent in target-specific
3027    representation.  Check that the mask is valid, and return FALSE if not.
3028    Set NEED_NEXT_VECTOR to TRUE if the permutation requires moving to the
3029    next vector, i.e., the current first vector is no longer needed.  */
3030
3031 static bool
3032 vect_get_mask_element (gimple *stmt, int first_mask_element, int m,
3033                        int mask_nunits, bool only_one_vec, int index,
3034                        unsigned char *mask, int *current_mask_element,
3035                        bool *need_next_vector, int *number_of_mask_fixes,
3036                        bool *mask_fixed, bool *needs_first_vector)
3037 {
3038   int i;
3039
3040   /* Convert to target specific representation.  */
3041   *current_mask_element = first_mask_element + m;
3042   /* Adjust the value in case it's a mask for the second and third vectors.  */
3043   *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1);
3044
3045   if (*current_mask_element < 0)
3046     {
3047       if (dump_enabled_p ())
3048         {
3049           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3050                            "permutation requires past vector ");
3051           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3052           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3053         }
3054       return false;
3055     }
3056
3057   if (*current_mask_element < mask_nunits)
3058     *needs_first_vector = true;
3059
3060   /* We have only one input vector to permute but the mask accesses values in
3061      the next vector as well.  */
3062   if (only_one_vec && *current_mask_element >= mask_nunits)
3063     {
3064       if (dump_enabled_p ())
3065         {
3066           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3067                            "permutation requires at least two vectors ");
3068           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3069           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3070         }
3071
3072       return false;
3073     }
3074
3075   /* The mask requires the next vector.  */
3076   while (*current_mask_element >= mask_nunits * 2)
3077     {
3078       if (*needs_first_vector || *mask_fixed)
3079         {
3080           /* We either need the first vector too or have already moved to the
3081              next vector. In both cases, this permutation needs three
3082              vectors.  */
3083           if (dump_enabled_p ())
3084             {
3085               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3086                                "permutation requires at "
3087                                "least three vectors ");
3088               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3089               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3090             }
3091
3092           return false;
3093         }
3094
3095       /* We move to the next vector, dropping the first one and working with
3096          the second and the third - we need to adjust the values of the mask
3097          accordingly.  */
3098       *current_mask_element -= mask_nunits * *number_of_mask_fixes;
3099
3100       for (i = 0; i < index; i++)
3101         mask[i] -= mask_nunits * *number_of_mask_fixes;
3102
3103       (*number_of_mask_fixes)++;
3104       *mask_fixed = true;
3105     }
3106
3107   *need_next_vector = *mask_fixed;
3108
3109   /* This was the last element of this mask. Start a new one.  */
3110   if (index == mask_nunits - 1)
3111     {
3112       *number_of_mask_fixes = 1;
3113       *mask_fixed = false;
3114       *needs_first_vector = false;
3115     }
3116
3117   return true;
3118 }
3119
3120
3121 /* Generate vector permute statements from a list of loads in DR_CHAIN for
3122    the SLP node NODE of the SLP instance SLP_NODE_INSTANCE.  If ANALYZE_ONLY
3123    is TRUE, only check that it is possible to create valid permute
3124    statements.  */
3125
3126 bool
3127 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
3128                               gimple_stmt_iterator *gsi, int vf,
3129                               slp_instance slp_node_instance, bool analyze_only)
3130 {
3131   gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3132   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3133   tree mask_element_type = NULL_TREE, mask_type;
3134   int i, j, k, nunits, vec_index = 0;
3135   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3136   int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
3137   int first_mask_element;
3138   int index, unroll_factor, current_mask_element, ncopies;
3139   unsigned char *mask;
3140   bool only_one_vec = false, need_next_vector = false;
3141   int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
3142   int number_of_mask_fixes = 1;
3143   bool mask_fixed = false;
3144   bool needs_first_vector = false;
3145   machine_mode mode;
3146
3147   if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3148     return false;
3149
3150   stmt_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info));
3151
3152   mode = TYPE_MODE (vectype);
3153
3154   if (!can_vec_perm_p (mode, false, NULL))
3155     {
3156       if (dump_enabled_p ())
3157         {
3158           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3159                            "no vect permute for ");
3160           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3161           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3162         }
3163       return false;
3164     }
3165
3166   /* The generic VEC_PERM_EXPR code always uses an integral type of the
3167      same size as the vector element being permuted.  */
3168   mask_element_type = lang_hooks.types.type_for_mode
3169                 (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))), 1);
3170   mask_type = get_vectype_for_scalar_type (mask_element_type);
3171   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3172   mask = XALLOCAVEC (unsigned char, nunits);
3173   unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
3174
3175   /* The number of vector stmts to generate based only on the unrolling
3176      factor of SLP_NODE_INSTANCE (i.e., before taking VF into account).  */
3177   orig_vec_stmts_num
3178     = (STMT_VINFO_GROUP_SIZE (stmt_info)
3179        * SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance)
3180        + nunits - 1) / nunits;
3181   if (orig_vec_stmts_num == 1)
3182     only_one_vec = true;
3183
3184   /* The number of copies is determined by the final vectorization factor
3185      relative to the SLP_NODE_INSTANCE unrolling factor.  */
3186   ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
3187
3188   /* Generate permutation masks for every NODE.  The number of masks for
3189      each NODE is equal to GROUP_SIZE.
3190      E.g., we have a group of three nodes with three loads from the same
3191      location in each node, and the vector size is 4.  I.e., we have an
3192      a0b0c0a1b1c1... sequence and we need to create the following vectors:
3193      for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3194      for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3195      ...
3196
3197      The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3198      The last mask is illegal since we assume two operands for the permute
3199      operation, so its element values must be less than 2 * NUNITS; hence,
3200      the last mask must be converted into {2,5,5,5}.
3201      For the first two permutations we need the first and the second input
3202      vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3203      we need the second and the third vectors: {b1,c1,a2,b2} and
3204      {c2,a3,b3,c3}.  */
3205
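  /* Editor's sketch (not part of GCC; guarded out): a self-contained C
     program that rederives the masks from the example above for the "a"
     elements (scalar group size 3, NUNITS 4, load permutation {0,0,0}),
     printing {0,0,0,3} {3,3,6,6} {2,5,5,5}.  The fix-up logic mirrors
     vect_get_mask_element; all variable names are illustrative.  */
#if 0
#include <stdio.h>

int
main (void)
{
  const int nunits = 4, group_size = 3, unroll_factor = 4;
  int mask[4];
  int index = 0, number_of_mask_fixes = 1;
  int mask_fixed = 0, needs_first_vector = 0;

  for (int j = 0; j < unroll_factor; j++)
    for (int k = 0; k < group_size; k++)
      {
        /* The "a" node loads element 0 of every scalar group.  */
        int elt = 0 + j * group_size;

        /* Rebase in case earlier elements already dropped a vector.  */
        elt -= nunits * (number_of_mask_fixes - 1);

        if (elt < nunits)
          needs_first_vector = 1;

        /* The element would come from a third vector: drop the first
           one and rebase the mask onto the remaining two.  */
        while (elt >= nunits * 2)
          {
            if (needs_first_vector || mask_fixed)
              return 1;  /* Would need three vectors: unsupported.  */
            elt -= nunits * number_of_mask_fixes;
            for (int i = 0; i < index; i++)
              mask[i] -= nunits * number_of_mask_fixes;
            number_of_mask_fixes++;
            mask_fixed = 1;
          }

        mask[index++] = elt;
        if (index == nunits)
          {
            printf ("{%d,%d,%d,%d}%s\n", mask[0], mask[1], mask[2], mask[3],
                    mask_fixed ? "  (advance to the next input vectors)" : "");
            index = 0;
            number_of_mask_fixes = 1;
            mask_fixed = 0;
            needs_first_vector = 0;
          }
      }
  return 0;
}
#endif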
3206   {
3207       index = 0;
3208       vect_stmts_counter = 0;
3209       vec_index = 0;
3210       first_vec_index = vec_index++;
3211       if (only_one_vec)
3212         second_vec_index = first_vec_index;
3213       else
3214         second_vec_index = vec_index++;
3215
3216       for (j = 0; j < unroll_factor; j++)
3217         {
3218           for (k = 0; k < group_size; k++)
3219             {
3220               i = SLP_TREE_LOAD_PERMUTATION (node)[k];
3221               first_mask_element = i + j * STMT_VINFO_GROUP_SIZE (stmt_info);
3222               if (!vect_get_mask_element (stmt, first_mask_element, 0,
3223                                           nunits, only_one_vec, index,
3224                                           mask, &current_mask_element,
3225                                           &need_next_vector,
3226                                           &number_of_mask_fixes, &mask_fixed,
3227                                           &needs_first_vector))
3228                 return false;
3229               gcc_assert (current_mask_element >= 0
3230                           && current_mask_element < 2 * nunits);
3231               mask[index++] = current_mask_element;
3232
3233               if (index == nunits)
3234                 {
3235                   index = 0;
3236                   if (!can_vec_perm_p (mode, false, mask))
3237                     {
3238                       if (dump_enabled_p ())
3239                         {
3240                           dump_printf_loc (MSG_MISSED_OPTIMIZATION,
3241                                            vect_location,
3242                                            "unsupported vect permute { ");
3243                           for (i = 0; i < nunits; ++i)
3244                             dump_printf (MSG_MISSED_OPTIMIZATION, "%d ",
3245                                          mask[i]);
3246                           dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
3247                         }
3248                       return false;
3249                     }
3250
3251                   if (!analyze_only)
3252                     {
3253                       int l;
3254                       tree mask_vec, *mask_elts;
3255                       mask_elts = XALLOCAVEC (tree, nunits);
3256                       for (l = 0; l < nunits; ++l)
3257                         mask_elts[l] = build_int_cst (mask_element_type,
3258                                                       mask[l]);
3259                       mask_vec = build_vector (mask_type, mask_elts);
3260
3261                       if (need_next_vector)
3262                         {
3263                           first_vec_index = second_vec_index;
3264                           second_vec_index = vec_index;
3265                         }
3266
3267                       vect_create_mask_and_perm (stmt,
3268                                mask_vec, first_vec_index, second_vec_index,
3269                                gsi, node, vectype, dr_chain,
3270                                ncopies, vect_stmts_counter++);
3271                     }
3272                 }
3273             }
3274         }
3275     }
3276
3277   return true;
3278 }
3279
3280
3281
3282 /* Vectorize SLP instance tree in postorder.  */
3283
3284 static bool
3285 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
3286                             unsigned int vectorization_factor)
3287 {
3288   gimple *stmt;
3289   bool grouped_store, is_store;
3290   gimple_stmt_iterator si;
3291   stmt_vec_info stmt_info;
3292   unsigned int vec_stmts_size, nunits, group_size;
3293   tree vectype;
3294   int i;
3295   slp_tree child;
3296
3297   if (!node)
3298     return false;
3299
3300   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3301     vect_schedule_slp_instance (child, instance, vectorization_factor);
3302
3303   stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3304   stmt_info = vinfo_for_stmt (stmt);
3305
3306   /* VECTYPE is the type of the destination.  */
3307   vectype = STMT_VINFO_VECTYPE (stmt_info);
3308   nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
3309   group_size = SLP_INSTANCE_GROUP_SIZE (instance);
3310
3311   /* For each SLP instance calculate the number of vector stmts to be
3312      created for the scalar stmts in each node of the SLP tree.  The
3313      number of vector stmts is the number of scalar elements in one
3314      scalar iteration (GROUP_SIZE) multiplied by VF and divided by the
3315      number of elements in a vector (NUNITS).
3316      Unless this is an SLP reduction, in which case the number of vector
3317      stmts is equal to the number of vector stmts of the children.  */
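  /* Editor's note, illustrative numbers: with GROUP_SIZE == 8, VF == 4
     and NUNITS == 8 elements per vector, (4 * 8) / 8 == 4 vector stmts
     are created for this node.  */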
3318   if (GROUP_FIRST_ELEMENT (stmt_info)
3319       && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
3320     vec_stmts_size = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[0]);
3321   else
3322     vec_stmts_size = (vectorization_factor * group_size) / nunits;
3323
3324   if (!SLP_TREE_VEC_STMTS (node).exists ())
3325     {
3326       SLP_TREE_VEC_STMTS (node).create (vec_stmts_size);
3327       SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
3328     }
3329
3330   if (dump_enabled_p ())
3331     {
3332       dump_printf_loc (MSG_NOTE, vect_location,
3333                        "------>vectorizing SLP node starting from: ");
3334       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3335       dump_printf (MSG_NOTE, "\n");
3336     }
3337
3338   /* Vectorized stmts go before the last scalar stmt, which is where
3339      all uses are ready.  */
3340   si = gsi_for_stmt (vect_find_last_scalar_stmt_in_slp (node));
3341
3342   /* Mark the first element of the reduction chain as reduction to properly
3343      transform the node.  In the analysis phase only the last element of the
3344      chain is marked as reduction.  */
3345   if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
3346       && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
3347     {
3348       STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
3349       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3350     }
3351
3352   /* Handle two-operation SLP nodes by vectorizing the group with
3353      both operations and then performing a merge.  */
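  /* Editor's note, a worked instance (NUNITS illustrative): for the scalar
     group { a0 + b0, a1 - b1, a2 + b2, a3 - b3 } the code below records
     mask {0,1,0,1}, vectorizes the node once as all-PLUS (v0) and once as
     all-MINUS (v1), and merges them with a VEC_PERM_EXPR whose mask
     {0, NUNITS + 1, 2, NUNITS + 3} selects { v0[0], v1[1], v0[2], v1[3] }.  */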
3354   if (SLP_TREE_TWO_OPERATORS (node))
3355     {
3356       enum tree_code code0 = gimple_assign_rhs_code (stmt);
3357       enum tree_code ocode;
3358       gimple *ostmt;
3359       unsigned char *mask = XALLOCAVEC (unsigned char, group_size);
3360       bool allsame = true;
3361       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt)
3362         if (gimple_assign_rhs_code (ostmt) != code0)
3363           {
3364             mask[i] = 1;
3365             allsame = false;
3366             ocode = gimple_assign_rhs_code (ostmt);
3367           }
3368         else
3369           mask[i] = 0;
3370       if (!allsame)
3371         {
3372           vec<gimple *> v0;
3373           vec<gimple *> v1;
3374           unsigned j;
3375           tree tmask = NULL_TREE;
3376           vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3377           v0 = SLP_TREE_VEC_STMTS (node).copy ();
3378           SLP_TREE_VEC_STMTS (node).truncate (0);
3379           gimple_assign_set_rhs_code (stmt, ocode);
3380           vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3381           gimple_assign_set_rhs_code (stmt, code0);
3382           v1 = SLP_TREE_VEC_STMTS (node).copy ();
3383           SLP_TREE_VEC_STMTS (node).truncate (0);
3384           tree meltype = build_nonstandard_integer_type
3385               (GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (vectype))), 1);
3386           tree mvectype = get_same_sized_vectype (meltype, vectype);
3387           unsigned k = 0, l;
3388           for (j = 0; j < v0.length (); ++j)
3389             {
3390               tree *melts = XALLOCAVEC (tree, TYPE_VECTOR_SUBPARTS (vectype));
3391               for (l = 0; l < TYPE_VECTOR_SUBPARTS (vectype); ++l)
3392                 {
3393                   if (k >= group_size)
3394                     k = 0;
3395                   melts[l] = build_int_cst
3396                       (meltype, mask[k++] * TYPE_VECTOR_SUBPARTS (vectype) + l);
3397                 }
3398               tmask = build_vector (mvectype, melts);
3399
3400               /* ???  Not all targets support a VEC_PERM_EXPR with a
3401                  constant mask that would translate to a vec_merge RTX
3402                  (with their vec_perm_const_ok).  We can either not
3403                  vectorize in that case or let veclower do its job.
3404                  Unfortunately that isn't too great and at least for
3405                  plus/minus we'd eventually like to match targets
3406                  vector addsub instructions.  */
3407               gimple *vstmt;
3408               vstmt = gimple_build_assign (make_ssa_name (vectype),
3409                                            VEC_PERM_EXPR,
3410                                            gimple_assign_lhs (v0[j]),
3411                                            gimple_assign_lhs (v1[j]), tmask);
3412               vect_finish_stmt_generation (stmt, vstmt, &si);
3413               SLP_TREE_VEC_STMTS (node).quick_push (vstmt);
3414             }
3415           v0.release ();
3416           v1.release ();
3417           return false;
3418         }
3419     }
3420   is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3421   return is_store;
3422 }
3423
3424 /* Replace the scalar calls in SLP node NODE with assignments setting their
3425    lhs to zero.  For loop vectorization this is done in vectorizable_call,
3426    but for SLP it needs to be deferred until the end of vect_schedule_slp,
3427    because multiple SLP instances may refer to the same scalar stmt.  */
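/* Editor's illustration (made-up SSA names): a fully-vectorized scalar call
   "x_1 = foo (a_2, b_3);" is rewritten below as "x_1 = 0;", with the call's
   stmt_vec_info re-attached to the new assignment.  */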
3428
3429 static void
3430 vect_remove_slp_scalar_calls (slp_tree node)
3431 {
3432   gimple *stmt, *new_stmt;
3433   gimple_stmt_iterator gsi;
3434   int i;
3435   slp_tree child;
3436   tree lhs;
3437   stmt_vec_info stmt_info;
3438
3439   if (!node)
3440     return;
3441
3442   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3443     vect_remove_slp_scalar_calls (child);
3444
3445   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
3446     {
3447       if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
3448         continue;
3449       stmt_info = vinfo_for_stmt (stmt);
3450       if (stmt_info == NULL
3451           || is_pattern_stmt_p (stmt_info)
3452           || !PURE_SLP_STMT (stmt_info))
3453         continue;
3454       lhs = gimple_call_lhs (stmt);
3455       new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
3456       set_vinfo_for_stmt (new_stmt, stmt_info);
3457       set_vinfo_for_stmt (stmt, NULL);
3458       STMT_VINFO_STMT (stmt_info) = new_stmt;
3459       gsi = gsi_for_stmt (stmt);
3460       gsi_replace (&gsi, new_stmt, false);
3461       SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
3462     }
3463 }
3464
3465 /* Generate vector code for all SLP instances in the loop/basic block.  */
3466
3467 bool
3468 vect_schedule_slp (vec_info *vinfo)
3469 {
3470   vec<slp_instance> slp_instances;
3471   slp_instance instance;
3472   unsigned int i, vf;
3473   bool is_store = false;
3474
3475   slp_instances = vinfo->slp_instances;
3476   if (is_a <loop_vec_info> (vinfo))
3477     vf = as_a <loop_vec_info> (vinfo)->vectorization_factor;
3478   else
3479     vf = 1;
3480
3481   FOR_EACH_VEC_ELT (slp_instances, i, instance)
3482     {
3483       /* Schedule the tree of INSTANCE.  */
3484       is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
3485                                              instance, vf);
3486       if (dump_enabled_p ())
3487         dump_printf_loc (MSG_NOTE, vect_location,
3488                          "vectorizing stmts using SLP.\n");
3489     }
3490
3491   FOR_EACH_VEC_ELT (slp_instances, i, instance)
3492     {
3493       slp_tree root = SLP_INSTANCE_TREE (instance);
3494       gimple *store;
3495       unsigned int j;
3496       gimple_stmt_iterator gsi;
3497
3498       /* Remove scalar call stmts.  Do not do this for basic-block
3499          vectorization as not all uses may be vectorized.
3500          ???  Why should this be necessary?  DCE should be able to
3501          remove the stmts itself.
3502          ???  For BB vectorization we can as well remove scalar
3503          stmts starting from the SLP tree root if they have no
3504          uses.  */
3505       if (is_a <loop_vec_info> (vinfo))
3506         vect_remove_slp_scalar_calls (root);
3507
3508       for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
3509                   && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
3510         {
3511           if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
3512             break;
3513
3514           if (is_pattern_stmt_p (vinfo_for_stmt (store)))
3515             store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
3516           /* Free the attached stmt_vec_info and remove the stmt.  */
3517           gsi = gsi_for_stmt (store);
3518           unlink_stmt_vdef (store);
3519           gsi_remove (&gsi, true);
3520           release_defs (store);
3521           free_stmt_vec_info (store);
3522         }
3523     }
3524
3525   return is_store;
3526 }
3527
3528
3529 /* Vectorize the basic block.  */
3530
3531 void
3532 vect_slp_transform_bb (basic_block bb)
3533 {
3534   bb_vec_info bb_vinfo = vec_info_for_bb (bb);
3535   gimple_stmt_iterator si;
3536
3537   gcc_assert (bb_vinfo);
3538
3539   if (dump_enabled_p ())
3540     dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB\n");
3541
3542   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
3543     {
3544       gimple *stmt = gsi_stmt (si);
3545       stmt_vec_info stmt_info;
3546
3547       if (dump_enabled_p ())
3548         {
3549           dump_printf_loc (MSG_NOTE, vect_location,
3550                            "------>SLPing statement: ");
3551           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3552           dump_printf (MSG_NOTE, "\n");
3553         }
3554
3555       stmt_info = vinfo_for_stmt (stmt);
3556       gcc_assert (stmt_info);
3557
3558       /* Schedule all the SLP instances when the first SLP stmt is reached.  */
3559       if (STMT_SLP_TYPE (stmt_info))
3560         {
3561           vect_schedule_slp (bb_vinfo);
3562           break;
3563         }
3564     }
3565
3566   if (dump_enabled_p ())
3567     dump_printf_loc (MSG_NOTE, vect_location,
3568                      "BASIC BLOCK VECTORIZED\n");
3569
3570   destroy_bb_vec_info (bb_vinfo);
3571 }