1 /* SLP - Basic Block Vectorization
2    Copyright (C) 2007-2015 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
4    and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "backend.h"
27 #include "tree.h"
28 #include "gimple.h"
29 #include "params.h"
30 #include "rtl.h"
31 #include "ssa.h"
32 #include "alias.h"
33 #include "fold-const.h"
34 #include "stor-layout.h"
35 #include "target.h"
36 #include "gimple-pretty-print.h"
37 #include "internal-fn.h"
38 #include "gimple-iterator.h"
39 #include "tree-pass.h"
40 #include "cfgloop.h"
41 #include "flags.h"
42 #include "insn-config.h"
43 #include "recog.h"              /* FIXME: for insn_data */
44 #include "insn-codes.h"
45 #include "optabs-tree.h"
46 #include "tree-vectorizer.h"
47 #include "langhooks.h"
48 #include "gimple-walk.h"
49
50 /* Extract the location of the basic block in the source code.
51    Return the basic block location if found and UNKNOWN_LOCATION if not.  */
52
53 source_location
54 find_bb_location (basic_block bb)
55 {
56   gimple *stmt = NULL;
57   gimple_stmt_iterator si;
58
59   if (!bb)
60     return UNKNOWN_LOCATION;
61
62   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
63     {
64       stmt = gsi_stmt (si);
65       if (gimple_location (stmt) != UNKNOWN_LOCATION)
66         return gimple_location (stmt);
67     }
68
69   return UNKNOWN_LOCATION;
70 }
71
72
73 /* Recursively free the memory allocated for the SLP tree rooted at NODE.  */
74
75 static void
76 vect_free_slp_tree (slp_tree node)
77 {
78   int i;
79   slp_tree child;
80
81   if (!node)
82     return;
83
84   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
85     vect_free_slp_tree (child);
86
87   SLP_TREE_CHILDREN (node).release ();
88   SLP_TREE_SCALAR_STMTS (node).release ();
89   SLP_TREE_VEC_STMTS (node).release ();
90   SLP_TREE_LOAD_PERMUTATION (node).release ();
91
92   free (node);
93 }
94
95
96 /* Free the memory allocated for the SLP instance.  */
97
98 void
99 vect_free_slp_instance (slp_instance instance)
100 {
101   vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
102   SLP_INSTANCE_LOADS (instance).release ();
103   free (instance);
104 }
105
106
107 /* Create an SLP node for SCALAR_STMTS.  */
108
109 static slp_tree
110 vect_create_new_slp_node (vec<gimple *> scalar_stmts)
111 {
112   slp_tree node;
113   gimple *stmt = scalar_stmts[0];
114   unsigned int nops;
115
116   if (is_gimple_call (stmt))
117     nops = gimple_call_num_args (stmt);
118   else if (is_gimple_assign (stmt))
119     {
120       nops = gimple_num_ops (stmt) - 1;
121       if (gimple_assign_rhs_code (stmt) == COND_EXPR)
122         nops++;
123     }
124   else
125     return NULL;
126
127   node = XNEW (struct _slp_tree);
128   SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
129   SLP_TREE_VEC_STMTS (node).create (0);
130   SLP_TREE_CHILDREN (node).create (nops);
131   SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
132   SLP_TREE_TWO_OPERATORS (node) = false;
133
134   return node;
135 }
136
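/* Illustrative note: for a call  x = f (a, b)  NOPS is the number of call
   arguments (two); for an assignment  x = a + b  it is
   gimple_num_ops (stmt) - 1 (also two).  A COND_EXPR assignment like
   x = a < b ? c : d  gets one extra slot because the embedded comparison
   contributes two operands (a, b) besides the two arms (c, d).  */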
137
138 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
139    operand.  */
140 static vec<slp_oprnd_info> 
141 vect_create_oprnd_info (int nops, int group_size)
142 {
143   int i;
144   slp_oprnd_info oprnd_info;
145   vec<slp_oprnd_info> oprnds_info;
146
147   oprnds_info.create (nops);
148   for (i = 0; i < nops; i++)
149     {
150       oprnd_info = XNEW (struct _slp_oprnd_info);
151       oprnd_info->def_stmts.create (group_size);
152       oprnd_info->first_dt = vect_uninitialized_def;
153       oprnd_info->first_op_type = NULL_TREE;
154       oprnd_info->first_pattern = false;
155       oprnd_info->second_pattern = false;
156       oprnds_info.quick_push (oprnd_info);
157     }
158
159   return oprnds_info;
160 }
161
162
163 /* Free operands info.  */
164
165 static void
166 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
167 {
168   int i;
169   slp_oprnd_info oprnd_info;
170
171   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
172     {
173       oprnd_info->def_stmts.release ();
174       XDELETE (oprnd_info);
175     }
176
177   oprnds_info.release ();
178 }
179
180
181 /* Find the place of the data-ref in STMT in the interleaving chain that starts
182    from FIRST_STMT.  Return -1 if the data-ref is not a part of the chain.  */
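/* Illustrative example: for an interleaving chain of loads from a[0], a[1]
   and a[3] (where GROUP_GAP of the a[3] element is 2), the places returned
   for the three stmts are 0, 1 and 3 respectively.  */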
183
184 static int
185 vect_get_place_in_interleaving_chain (gimple *stmt, gimple *first_stmt)
186 {
187   gimple *next_stmt = first_stmt;
188   int result = 0;
189
190   if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
191     return -1;
192
193   do
194     {
195       if (next_stmt == stmt)
196         return result;
197       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
198       if (next_stmt)
199         result += GROUP_GAP (vinfo_for_stmt (next_stmt));
200     }
201   while (next_stmt);
202
203   return -1;
204 }
205
206
207 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
208    they are of a valid type and that they match the defs of the first stmt of
209    the SLP group (stored in OPRNDS_INFO).  If there was a fatal error,
210    return -1; if the error could be corrected by swapping operands of the
211    operation, return 1; if everything is OK, return 0.  */
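/* For example, given the hypothetical group
     a[0] = x_1 + 5;
     a[1] = 5 + y_2;
   the first operand of the second stmt is a constant where the first stmt
   has an internal def.  Since PLUS_EXPR is commutative the analysis is
   retried with the operands of the second stmt swapped; on success the
   swap is applied to the IL and 0 is returned.  */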
212
213 static int 
214 vect_get_and_check_slp_defs (vec_info *vinfo,
215                              gimple *stmt, unsigned stmt_num,
216                              vec<slp_oprnd_info> *oprnds_info)
217 {
218   tree oprnd;
219   unsigned int i, number_of_oprnds;
220   tree def;
221   gimple *def_stmt;
222   enum vect_def_type dt = vect_uninitialized_def;
223   struct loop *loop = NULL;
224   bool pattern = false;
225   slp_oprnd_info oprnd_info;
226   int first_op_idx = 1;
227   bool commutative = false;
228   bool first_op_cond = false;
229   bool first = stmt_num == 0;
230   bool second = stmt_num == 1;
231
232   if (is_a <loop_vec_info> (vinfo))
233     loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
234
235   if (is_gimple_call (stmt))
236     {
237       number_of_oprnds = gimple_call_num_args (stmt);
238       first_op_idx = 3;
239     }
240   else if (is_gimple_assign (stmt))
241     {
242       enum tree_code code = gimple_assign_rhs_code (stmt);
243       number_of_oprnds = gimple_num_ops (stmt) - 1;
244       if (gimple_assign_rhs_code (stmt) == COND_EXPR)
245         {
246           first_op_cond = true;
247           commutative = true;
248           number_of_oprnds++;
249         }
250       else
251         commutative = commutative_tree_code (code);
252     }
253   else
254     return -1;
255
256   bool swapped = false;
257   for (i = 0; i < number_of_oprnds; i++)
258     {
259 again:
260       if (first_op_cond)
261         {
262           if (i == 0 || i == 1)
263             oprnd = TREE_OPERAND (gimple_op (stmt, first_op_idx),
264                                   swapped ? !i : i);
265           else
266             oprnd = gimple_op (stmt, first_op_idx + i - 1);
267         }
268       else
269         oprnd = gimple_op (stmt, first_op_idx + (swapped ? !i : i));
270
271       oprnd_info = (*oprnds_info)[i];
272
273       if (!vect_is_simple_use (oprnd, NULL, vinfo, &def_stmt,
274                                &def, &dt))
275         {
276           if (dump_enabled_p ())
277             {
278               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
279                                "Build SLP failed: can't analyze def for ");
280               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
281               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
282             }
283
284           return -1;
285         }
286
287       /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
288          from the pattern.  Check that all the stmts of the node are in the
289          pattern.  */
290       if (def_stmt && gimple_bb (def_stmt)
291           && ((is_a <loop_vec_info> (vinfo)
292                && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
293               || (is_a <bb_vec_info> (vinfo)
294                   && gimple_bb (def_stmt) == as_a <bb_vec_info> (vinfo)->bb
295                   && gimple_code (def_stmt) != GIMPLE_PHI))
296           && vinfo_for_stmt (def_stmt)
297           && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
298           && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
299           && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
300         {
301           pattern = true;
302           if (!first && !oprnd_info->first_pattern
303               /* Allow different pattern state for the defs of the
304                  first stmt in reduction chains.  */
305               && (oprnd_info->first_dt != vect_reduction_def
306                   || (!second && !oprnd_info->second_pattern)))
307             {
308               if (i == 0
309                   && !swapped
310                   && commutative)
311                 {
312                   swapped = true;
313                   goto again;
314                 }
315
316               if (dump_enabled_p ())
317                 {
318                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
319                                    "Build SLP failed: some of the stmts"
320                                    " are in a pattern, and others are not ");
321                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
322                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
323                 }
324
325               return 1;
326             }
327
328           def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
329           dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));
330
331           if (dt == vect_unknown_def_type)
332             {
333               if (dump_enabled_p ())
334                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
335                                  "Unsupported pattern.\n");
336               return -1;
337             }
338
339           switch (gimple_code (def_stmt))
340             {
341               case GIMPLE_PHI:
342                 def = gimple_phi_result (def_stmt);
343                 break;
344
345               case GIMPLE_ASSIGN:
346                 def = gimple_assign_lhs (def_stmt);
347                 break;
348
349               default:
350                 if (dump_enabled_p ())
351                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
352                                    "unsupported defining stmt:\n");
353                 return -1;
354             }
355         }
356
357       if (second)
358         oprnd_info->second_pattern = pattern;
359
360       if (first)
361         {
362           oprnd_info->first_dt = dt;
363           oprnd_info->first_pattern = pattern;
364           oprnd_info->first_op_type = TREE_TYPE (oprnd);
365         }
366       else
367         {
368           /* Not first stmt of the group, check that the def-stmt/s match
369              the def-stmt/s of the first stmt.  Allow different definition
370              types for reduction chains: the first stmt must be a
371              vect_reduction_def (a phi node), and the rest
372              vect_internal_def.  */
373           if (((oprnd_info->first_dt != dt
374                 && !(oprnd_info->first_dt == vect_reduction_def
375                      && dt == vect_internal_def)
376                 && !((oprnd_info->first_dt == vect_external_def
377                       || oprnd_info->first_dt == vect_constant_def)
378                      && (dt == vect_external_def
379                          || dt == vect_constant_def)))
380                || !types_compatible_p (oprnd_info->first_op_type,
381                                        TREE_TYPE (oprnd))))
382             {
383               /* Try swapping operands if we got a mismatch.  */
384               if (i == 0
385                   && !swapped
386                   && commutative)
387                 {
388                   swapped = true;
389                   goto again;
390                 }
391
392               if (dump_enabled_p ())
393                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
394                                  "Build SLP failed: different types\n");
395
396               return 1;
397             }
398         }
399
400       /* Check the types of the definitions.  */
401       switch (dt)
402         {
403         case vect_constant_def:
404         case vect_external_def:
405         case vect_reduction_def:
406           break;
407
408         case vect_internal_def:
409           oprnd_info->def_stmts.quick_push (def_stmt);
410           break;
411
412         default:
413           /* FORNOW: Not supported.  */
414           if (dump_enabled_p ())
415             {
416               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
417                                "Build SLP failed: illegal type of def ");
418               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def);
419               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
420             }
421
422           return -1;
423         }
424     }
425
426   /* Swap operands.  */
427   if (swapped)
428     {
429       if (first_op_cond)
430         {
431           tree cond = gimple_assign_rhs1 (stmt);
432           swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
433                              &TREE_OPERAND (cond, 1));
434           TREE_SET_CODE (cond, swap_tree_comparison (TREE_CODE (cond)));
435         }
436       else
437         swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
438                            gimple_assign_rhs2_ptr (stmt));
439     }
440
441   return 0;
442 }
443
444
445 /* Verify that the scalar stmts STMTS are isomorphic, do not require a
446    data permutation and are not of unsupported operation types.  Return
447    true if so; otherwise return false and indicate in *MATCHES which
448    stmts are not isomorphic to the first one.  If MATCHES[0] is false,
449    the comparison could not be carried out or the stmts will never be
450    vectorized by SLP.  */
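/* For example, the hypothetical stmts
     a[0] = b[0] + c[0];
     a[1] = b[1] + c[1];
   are isomorphic.  A group mixing PLUS_EXPR and MINUS_EXPR such as
     a[0] = b[0] + c[0];
     a[1] = b[1] - c[1];
   is still accepted: *TWO_OPERATORS is set and both operations are
   computed and blended with a permute later on.  */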
451
452 static bool
453 vect_build_slp_tree_1 (vec_info *vinfo,
454                        vec<gimple *> stmts, unsigned int group_size,
455                        unsigned nops, unsigned int *max_nunits,
456                        unsigned int vectorization_factor, bool *matches,
457                        bool *two_operators)
458 {
459   unsigned int i;
460   gimple *first_stmt = stmts[0], *stmt = stmts[0];
461   enum tree_code first_stmt_code = ERROR_MARK;
462   enum tree_code alt_stmt_code = ERROR_MARK;
463   enum tree_code rhs_code = ERROR_MARK;
464   enum tree_code first_cond_code = ERROR_MARK;
465   tree lhs;
466   bool need_same_oprnds = false;
467   tree vectype = NULL_TREE, scalar_type, first_op1 = NULL_TREE;
468   optab optab;
469   int icode;
470   machine_mode optab_op2_mode;
471   machine_mode vec_mode;
472   HOST_WIDE_INT dummy;
473   gimple *first_load = NULL, *prev_first_load = NULL;
474   tree cond;
475
476   /* For every stmt in NODE find its def stmt/s.  */
477   FOR_EACH_VEC_ELT (stmts, i, stmt)
478     {
479       matches[i] = false;
480
481       if (dump_enabled_p ())
482         {
483           dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
484           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
485           dump_printf (MSG_NOTE, "\n");
486         }
487
488       /* Fail to vectorize statements marked as unvectorizable.  */
489       if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
490         {
491           if (dump_enabled_p ())
492             {
493               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
494                                "Build SLP failed: unvectorizable statement ");
495               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
496               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
497             }
498           /* Fatal mismatch.  */
499           matches[0] = false;
500           return false;
501         }
502
503       lhs = gimple_get_lhs (stmt);
504       if (lhs == NULL_TREE)
505         {
506           if (dump_enabled_p ())
507             {
508               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
509                                "Build SLP failed: not GIMPLE_ASSIGN nor "
510                                "GIMPLE_CALL ");
511               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
512               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
513             }
514           /* Fatal mismatch.  */
515           matches[0] = false;
516           return false;
517         }
518
519        if (is_gimple_assign (stmt)
520            && gimple_assign_rhs_code (stmt) == COND_EXPR
521            && (cond = gimple_assign_rhs1 (stmt))
522            && !COMPARISON_CLASS_P (cond))
523         {
524           if (dump_enabled_p ())
525             {
526               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
527                                "Build SLP failed: condition is not "
528                                "comparison ");
529               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
530               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
531             }
532           /* Fatal mismatch.  */
533           matches[0] = false;
534           return false;
535         }
536
537       scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
538       vectype = get_vectype_for_scalar_type (scalar_type);
539       if (!vectype)
540         {
541           if (dump_enabled_p ())
542             {
543               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
544                                "Build SLP failed: unsupported data-type ");
545               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
546                                  scalar_type);
547               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
548             }
549           /* Fatal mismatch.  */
550           matches[0] = false;
551           return false;
552         }
553
554       /* If populating the vector type requires unrolling then fail
555          before adjusting *max_nunits for basic-block vectorization.  */
556       if (is_a <bb_vec_info> (vinfo)
557           && TYPE_VECTOR_SUBPARTS (vectype) > group_size)
558         {
559           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
560                            "Build SLP failed: unrolling required "
561                            "in basic block SLP\n");
562           /* Fatal mismatch.  */
563           matches[0] = false;
564           return false;
565         }
566
567       /* In case of multiple types we need to detect the smallest type.  */
568       if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
569         {
570           *max_nunits = TYPE_VECTOR_SUBPARTS (vectype);
571           if (is_a <bb_vec_info> (vinfo))
572             vectorization_factor = *max_nunits;
573         }
574
575       if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
576         {
577           rhs_code = CALL_EXPR;
578           if (gimple_call_internal_p (call_stmt)
579               || gimple_call_tail_p (call_stmt)
580               || gimple_call_noreturn_p (call_stmt)
581               || !gimple_call_nothrow_p (call_stmt)
582               || gimple_call_chain (call_stmt))
583             {
584               if (dump_enabled_p ())
585                 {
586                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
587                                    "Build SLP failed: unsupported call type ");
588                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
589                                     call_stmt, 0);
590                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
591                 }
592               /* Fatal mismatch.  */
593               matches[0] = false;
594               return false;
595             }
596         }
597       else
598         rhs_code = gimple_assign_rhs_code (stmt);
599
600       /* Check the operation.  */
601       if (i == 0)
602         {
603           first_stmt_code = rhs_code;
604
605           /* Shift arguments should be equal in all the packed stmts for a
606              vector shift with scalar shift operand.  */
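          /* E.g. for the hypothetical stmts
               a[0] = b[0] << s_1;
               a[1] = b[1] << s_1;
             a vector/scalar shift needs the same shift amount in all
             stmts; NEED_SAME_OPRNDS and FIRST_OP1 enforce this below.  */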
607           if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
608               || rhs_code == LROTATE_EXPR
609               || rhs_code == RROTATE_EXPR)
610             {
611               vec_mode = TYPE_MODE (vectype);
612
613               /* First see if we have a vector/vector shift.  */
614               optab = optab_for_tree_code (rhs_code, vectype,
615                                            optab_vector);
616
617               if (!optab
618                   || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
619                 {
620                   /* No vector/vector shift, try for a vector/scalar shift.  */
621                   optab = optab_for_tree_code (rhs_code, vectype,
622                                                optab_scalar);
623
624                   if (!optab)
625                     {
626                       if (dump_enabled_p ())
627                         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
628                                          "Build SLP failed: no optab.\n");
629                       /* Fatal mismatch.  */
630                       matches[0] = false;
631                       return false;
632                     }
633                   icode = (int) optab_handler (optab, vec_mode);
634                   if (icode == CODE_FOR_nothing)
635                     {
636                       if (dump_enabled_p ())
637                         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
638                                          "Build SLP failed: "
639                                          "op not supported by target.\n");
640                       /* Fatal mismatch.  */
641                       matches[0] = false;
642                       return false;
643                     }
644                   optab_op2_mode = insn_data[icode].operand[2].mode;
645                   if (!VECTOR_MODE_P (optab_op2_mode))
646                     {
647                       need_same_oprnds = true;
648                       first_op1 = gimple_assign_rhs2 (stmt);
649                     }
650                 }
651             }
652           else if (rhs_code == WIDEN_LSHIFT_EXPR)
653             {
654               need_same_oprnds = true;
655               first_op1 = gimple_assign_rhs2 (stmt);
656             }
657         }
658       else
659         {
660           if (first_stmt_code != rhs_code
661               && alt_stmt_code == ERROR_MARK)
662             alt_stmt_code = rhs_code;
663           if (first_stmt_code != rhs_code
664               && (first_stmt_code != IMAGPART_EXPR
665                   || rhs_code != REALPART_EXPR)
666               && (first_stmt_code != REALPART_EXPR
667                   || rhs_code != IMAGPART_EXPR)
668               /* Handle mismatches in plus/minus by computing both
669                  and merging the results.  */
670               && !((first_stmt_code == PLUS_EXPR
671                     || first_stmt_code == MINUS_EXPR)
672                    && (alt_stmt_code == PLUS_EXPR
673                        || alt_stmt_code == MINUS_EXPR)
674                    && rhs_code == alt_stmt_code)
675               && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
676                    && (first_stmt_code == ARRAY_REF
677                        || first_stmt_code == BIT_FIELD_REF
678                        || first_stmt_code == INDIRECT_REF
679                        || first_stmt_code == COMPONENT_REF
680                        || first_stmt_code == MEM_REF)))
681             {
682               if (dump_enabled_p ())
683                 {
684                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
685                                    "Build SLP failed: different operation "
686                                    "in stmt ");
687                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
688                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
689                                    "original stmt ");
690                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
691                                     first_stmt, 0);
692                 }
693               /* Mismatch.  */
694               continue;
695             }
696
697           if (need_same_oprnds
698               && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
699             {
700               if (dump_enabled_p ())
701                 {
702                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
703                                    "Build SLP failed: different shift "
704                                    "arguments in ");
705                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
706                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
707                 }
708               /* Mismatch.  */
709               continue;
710             }
711
712           if (rhs_code == CALL_EXPR)
713             {
714               gimple *first_stmt = stmts[0];
715               if (gimple_call_num_args (stmt) != nops
716                   || !operand_equal_p (gimple_call_fn (first_stmt),
717                                        gimple_call_fn (stmt), 0)
718                   || gimple_call_fntype (first_stmt)
719                      != gimple_call_fntype (stmt))
720                 {
721                   if (dump_enabled_p ())
722                     {
723                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
724                                        "Build SLP failed: different calls in ");
725                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
726                                         stmt, 0);
727                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
728                     }
729                   /* Mismatch.  */
730                   continue;
731                 }
732             }
733         }
734
735       /* Grouped store or load.  */
736       if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
737         {
738           if (REFERENCE_CLASS_P (lhs))
739             {
740               /* Store.  */
741               ;
742             }
743           else
744             {
745               /* Load.  */
746               /* Check that the size of the interleaved load group is not
747                  greater than the SLP group size.  */
748               unsigned ncopies
749                 = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
750               if (is_a <loop_vec_info> (vinfo)
751                   && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
752                   && ((GROUP_SIZE (vinfo_for_stmt (stmt))
753                        - GROUP_GAP (vinfo_for_stmt (stmt)))
754                       > ncopies * group_size))
755                 {
756                   if (dump_enabled_p ())
757                     {
758                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
759                                        "Build SLP failed: the number "
760                                        "of interleaved loads is greater than "
761                                        "the SLP group size ");
762                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
763                                         stmt, 0);
764                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
765                     }
766                   /* Fatal mismatch.  */
767                   matches[0] = false;
768                   return false;
769                 }
770
771               first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
772               if (prev_first_load)
773                 {
774                   /* Check that there are no loads from different interleaving
775                      chains in the same node.  */
776                   if (prev_first_load != first_load)
777                     {
778                       if (dump_enabled_p ())
779                         {
780                           dump_printf_loc (MSG_MISSED_OPTIMIZATION,
781                                            vect_location, 
782                                            "Build SLP failed: different "
783                                            "interleaving chains in one node ");
784                           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
785                                             stmt, 0);
786                           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
787                         }
788                       /* Mismatch.  */
789                       continue;
790                     }
791                 }
792               else
793                 prev_first_load = first_load;
794            }
795         } /* Grouped access.  */
796       else
797         {
798           if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
799             {
800               /* Non-grouped load.  */
801               if (dump_enabled_p ())
802                 {
803                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
804                                    "Build SLP failed: not grouped load ");
805                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
806                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
807                 }
808
809               /* FORNOW: Non-grouped loads are not supported.  */
810               /* Fatal mismatch.  */
811               matches[0] = false;
812               return false;
813             }
814
815           /* Not a memory operation.  */
816           if (TREE_CODE_CLASS (rhs_code) != tcc_binary
817               && TREE_CODE_CLASS (rhs_code) != tcc_unary
818               && TREE_CODE_CLASS (rhs_code) != tcc_expression
819               && rhs_code != CALL_EXPR)
820             {
821               if (dump_enabled_p ())
822                 {
823                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
824                                    "Build SLP failed: operation");
825                   dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
826                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
827                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
828                 }
829               /* Fatal mismatch.  */
830               matches[0] = false;
831               return false;
832             }
833
834           if (rhs_code == COND_EXPR)
835             {
836               tree cond_expr = gimple_assign_rhs1 (stmt);
837
838               if (i == 0)
839                 first_cond_code = TREE_CODE (cond_expr);
840               else if (first_cond_code != TREE_CODE (cond_expr))
841                 {
842                   if (dump_enabled_p ())
843                     {
844                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
845                                        "Build SLP failed: different"
846                                        " operation");
847                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
848                                         stmt, 0);
849                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
850                     }
851                   /* Mismatch.  */
852                   continue;
853                 }
854             }
855         }
856
857       matches[i] = true;
858     }
859
860   for (i = 0; i < group_size; ++i)
861     if (!matches[i])
862       return false;
863
864   /* If we allowed a two-operation SLP node verify the target can cope
865      with the permute we are going to use.  */
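  /* E.g. for a {+, -} group and a V4SI vector this builds the blend mask
     {0, 5, 2, 7}: even elements taken from the PLUS result, odd elements
     from the MINUS result.  */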
866   if (alt_stmt_code != ERROR_MARK
867       && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
868     {
869       unsigned char *sel
870         = XALLOCAVEC (unsigned char, TYPE_VECTOR_SUBPARTS (vectype));
871       for (i = 0; i < TYPE_VECTOR_SUBPARTS (vectype); ++i)
872         {
873           sel[i] = i;
874           if (gimple_assign_rhs_code (stmts[i % group_size]) == alt_stmt_code)
875             sel[i] += TYPE_VECTOR_SUBPARTS (vectype);
876         }
877       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
878         {
879           for (i = 0; i < group_size; ++i)
880             if (gimple_assign_rhs_code (stmts[i]) == alt_stmt_code)
881               {
882                 matches[i] = false;
883                 if (dump_enabled_p ())
884                   {
885                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
886                                      "Build SLP failed: different operation "
887                                      "in stmt ");
888                     dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
889                                       stmts[i], 0);
890                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
891                                      "original stmt ");
892                     dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
893                                       first_stmt, 0);
894                   }
895               }
896           return false;
897         }
898       *two_operators = true;
899     }
900
901   return true;
902 }
903
904 /* Recursively build an SLP tree starting from NODE.
905    Fail (and return false) if def-stmts are not isomorphic, require
906    data permutation or are of unsupported types of operation.  Otherwise
907    return true, setting MATCHES[I] to indicate whether stmt I of the
908    group matched the first stmt; MATCHES[0] == false signals a fatal
909    mismatch.  */
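/* For example, for the hypothetical source
     a[0] = b[0] + c[0];
     a[1] = b[1] + c[1];
   the root node holds the two stores, its single child holds the two
   PLUS_EXPR stmts, and that child in turn has two leaf children holding
   the grouped loads from b[] and c[].  */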
910
911 static bool
912 vect_build_slp_tree (vec_info *vinfo,
913                      slp_tree *node, unsigned int group_size,
914                      unsigned int *max_nunits,
915                      vec<slp_tree> *loads,
916                      unsigned int vectorization_factor,
917                      bool *matches, unsigned *npermutes, unsigned *tree_size,
918                      unsigned max_tree_size)
919 {
920   unsigned nops, i, this_tree_size = 0;
921   gimple *stmt;
922
923   matches[0] = false;
924
925   stmt = SLP_TREE_SCALAR_STMTS (*node)[0];
926   if (is_gimple_call (stmt))
927     nops = gimple_call_num_args (stmt);
928   else if (is_gimple_assign (stmt))
929     {
930       nops = gimple_num_ops (stmt) - 1;
931       if (gimple_assign_rhs_code (stmt) == COND_EXPR)
932         nops++;
933     }
934   else
935     return false;
936
937   bool two_operators = false;
938   if (!vect_build_slp_tree_1 (vinfo,
939                               SLP_TREE_SCALAR_STMTS (*node), group_size, nops,
940                               max_nunits, vectorization_factor, matches,
941                               &two_operators))
942     return false;
943   SLP_TREE_TWO_OPERATORS (*node) = two_operators;
944
945   /* If the SLP node is a load, terminate the recursion.  */
946   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
947       && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
948     {
949       loads->safe_push (*node);
950       return true;
951     }
952
953   /* Get at the operands, verifying they are compatible.  */
954   vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
955   slp_oprnd_info oprnd_info;
956   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (*node), i, stmt)
957     {
958       switch (vect_get_and_check_slp_defs (vinfo, stmt, i, &oprnds_info))
959         {
960         case 0:
961           break;
962         case -1:
963           matches[0] = false;
964           vect_free_oprnd_info (oprnds_info);
965           return false;
966         case 1:
967           matches[i] = false;
968           break;
969         }
970     }
971   for (i = 0; i < group_size; ++i)
972     if (!matches[i])
973       {
974         vect_free_oprnd_info (oprnds_info);
975         return false;
976       }
977
978   stmt = SLP_TREE_SCALAR_STMTS (*node)[0];
979
980   /* Create SLP_TREE nodes for the definition node/s.  */
981   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
982     {
983       slp_tree child;
984       unsigned old_nloads = loads->length ();
985       unsigned old_max_nunits = *max_nunits;
986
987       if (oprnd_info->first_dt != vect_internal_def)
988         continue;
989
990       if (++this_tree_size > max_tree_size)
991         {
992           vect_free_oprnd_info (oprnds_info);
993           return false;
994         }
995
996       child = vect_create_new_slp_node (oprnd_info->def_stmts);
997       if (!child)
998         {
999           vect_free_oprnd_info (oprnds_info);
1000           return false;
1001         }
1002
1003       if (vect_build_slp_tree (vinfo, &child,
1004                                group_size, max_nunits, loads,
1005                                vectorization_factor, matches,
1006                                npermutes, &this_tree_size, max_tree_size))
1007         {
1008           /* If all the children of CHILD are built up from scalars then just
1009              throw CHILD away and build this operand from scalars instead.  */
1010           if (!SLP_TREE_CHILDREN (child).is_empty ())
1011             {
1012               unsigned int j;
1013               slp_tree grandchild;
1014
1015               FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1016                 if (grandchild != NULL)
1017                   break;
1018               if (!grandchild)
1019                 {
1020                   /* Roll back.  */
1021                   *max_nunits = old_max_nunits;
1022                   loads->truncate (old_nloads);
1023                   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1024                       vect_free_slp_tree (grandchild);
1025                   SLP_TREE_CHILDREN (child).truncate (0);
1026
1027                   dump_printf_loc (MSG_NOTE, vect_location,
1028                                    "Building parent vector operands from "
1029                                    "scalars instead\n");
1030                   oprnd_info->def_stmts = vNULL;
1031                   vect_free_slp_tree (child);
1032                   SLP_TREE_CHILDREN (*node).quick_push (NULL);
1033                   continue;
1034                 }
1035             }
1036
1037           oprnd_info->def_stmts = vNULL;
1038           SLP_TREE_CHILDREN (*node).quick_push (child);
1039           continue;
1040         }
1041
1042       /* If the SLP build failed fatally and we are analyzing a basic block,
1043          simply treat nodes we fail to build as externally defined
1044          (and thus build vectors from the scalar defs).
1045          The cost model will reject outright expensive cases.
1046          ???  This doesn't handle cases where permutation ultimately
1047          fails (or we don't try permutation below).  Ideally we'd
1048          even compute a permutation that will end up with the maximum
1049          SLP tree size...  */
1050       if (is_a <bb_vec_info> (vinfo)
1051           && !matches[0]
1052           /* ???  Rejecting patterns this way doesn't work.  We'd have to
1053              do extra work to cancel the pattern so the uses see the
1054              scalar version.  */
1055           && !is_pattern_stmt_p (vinfo_for_stmt (stmt)))
1056         {
1057           unsigned int j;
1058           slp_tree grandchild;
1059
1060           /* Roll back.  */
1061           *max_nunits = old_max_nunits;
1062           loads->truncate (old_nloads);
1063           FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1064             vect_free_slp_tree (grandchild);
1065           SLP_TREE_CHILDREN (child).truncate (0);
1066
1067           dump_printf_loc (MSG_NOTE, vect_location,
1068                            "Building vector operands from scalars\n");
1069           oprnd_info->def_stmts = vNULL;
1070           vect_free_slp_tree (child);
1071           SLP_TREE_CHILDREN (*node).quick_push (NULL);
1072           continue;
1073         }
1074
1075       /* If the SLP build for operand zero failed and operands zero
1076          and one can be commuted, try that for the scalar stmts
1077          that failed the match.  */
1078       if (i == 0
1079           /* A first scalar stmt mismatch signals a fatal mismatch.  */
1080           && matches[0]
1081           /* ???  For COND_EXPRs we can swap the comparison operands
1082              as well as the arms under some constraints.  */
1083           && nops == 2
1084           && oprnds_info[1]->first_dt == vect_internal_def
1085           && is_gimple_assign (stmt)
1086           && commutative_tree_code (gimple_assign_rhs_code (stmt))
1087           && !SLP_TREE_TWO_OPERATORS (*node)
1088           /* Do so only if the number of unsuccessful permutes was no more
1089              than a cut-off, as re-trying the recursive match on
1090              possibly each level of the tree would expose exponential
1091              behavior.  */
1092           && *npermutes < 4)
1093         {
1094           unsigned int j;
1095           slp_tree grandchild;
1096
1097           /* Roll back.  */
1098           *max_nunits = old_max_nunits;
1099           loads->truncate (old_nloads);
1100           FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1101             vect_free_slp_tree (grandchild);
1102           SLP_TREE_CHILDREN (child).truncate (0);
1103
1104           /* Swap mismatched definition stmts.  */
1105           dump_printf_loc (MSG_NOTE, vect_location,
1106                            "Re-trying with swapped operands of stmts ");
1107           for (j = 0; j < group_size; ++j)
1108             if (!matches[j])
1109               {
1110                 std::swap (oprnds_info[0]->def_stmts[j],
1111                            oprnds_info[1]->def_stmts[j]);
1112                 dump_printf (MSG_NOTE, "%d ", j);
1113               }
1114           dump_printf (MSG_NOTE, "\n");
1115           /* And try again with scratch 'matches' ... */
1116           bool *tem = XALLOCAVEC (bool, group_size);
1117           if (vect_build_slp_tree (vinfo, &child,
1118                                    group_size, max_nunits, loads,
1119                                    vectorization_factor,
1120                                    tem, npermutes, &this_tree_size,
1121                                    max_tree_size))
1122             {
1123               /* ... so if successful we can apply the operand swapping
1124                  to the GIMPLE IL.  This is necessary because for example
1125                  vect_get_slp_defs uses operand indexes and thus expects
1126                  canonical operand order.  */
1127               for (j = 0; j < group_size; ++j)
1128                 if (!matches[j])
1129                   {
1130                     gimple *stmt = SLP_TREE_SCALAR_STMTS (*node)[j];
1131                     swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
1132                                        gimple_assign_rhs2_ptr (stmt));
1133                   }
1134               oprnd_info->def_stmts = vNULL;
1135               SLP_TREE_CHILDREN (*node).quick_push (child);
1136               continue;
1137             }
1138
1139           ++*npermutes;
1140         }
1141
1142       oprnd_info->def_stmts = vNULL;
1143       vect_free_slp_tree (child);
1144       vect_free_oprnd_info (oprnds_info);
1145       return false;
1146     }
1147
1148   if (tree_size)
1149     *tree_size += this_tree_size;
1150
1151   vect_free_oprnd_info (oprnds_info);
1152   return true;
1153 }
1154
1155 /* Dump an SLP tree NODE using flags specified in DUMP_KIND.  */
1156
1157 static void
1158 vect_print_slp_tree (int dump_kind, slp_tree node)
1159 {
1160   int i;
1161   gimple *stmt;
1162   slp_tree child;
1163
1164   if (!node)
1165     return;
1166
1167   dump_printf (dump_kind, "node ");
1168   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1169     {
1170       dump_printf (dump_kind, "\n\tstmt %d ", i);
1171       dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
1172     }
1173   dump_printf (dump_kind, "\n");
1174
1175   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1176     vect_print_slp_tree (dump_kind, child);
1177 }
1178
1179
1180 /* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
1181    If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
1182    J).  Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
1183    stmts in NODE are to be marked.  */
1184
1185 static void
1186 vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
1187 {
1188   int i;
1189   gimple *stmt;
1190   slp_tree child;
1191
1192   if (!node)
1193     return;
1194
1195   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1196     if (j < 0 || i == j)
1197       STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;
1198
1199   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1200     vect_mark_slp_stmts (child, mark, j);
1201 }
1202
1203
1204 /* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */
1205
1206 static void
1207 vect_mark_slp_stmts_relevant (slp_tree node)
1208 {
1209   int i;
1210   gimple *stmt;
1211   stmt_vec_info stmt_info;
1212   slp_tree child;
1213
1214   if (!node)
1215     return;
1216
1217   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1218     {
1219       stmt_info = vinfo_for_stmt (stmt);
1220       gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
1221                   || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
1222       STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
1223     }
1224
1225   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1226     vect_mark_slp_stmts_relevant (child);
1227 }
1228
1229
1230 /* Rearrange the statements of NODE according to PERMUTATION.  */
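/* For example, with PERMUTATION {2, 0, 1} the stmt at index 0 moves to
   index 2, index 1 moves to index 0 and index 2 moves to index 1 in every
   node of the tree.  */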
1231
1232 static void
1233 vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
1234                           vec<unsigned> permutation)
1235 {
1236   gimple *stmt;
1237   vec<gimple *> tmp_stmts;
1238   unsigned int i;
1239   slp_tree child;
1240
1241   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1242     vect_slp_rearrange_stmts (child, group_size, permutation);
1243
1244   gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
1245   tmp_stmts.create (group_size);
1246   tmp_stmts.quick_grow_cleared (group_size);
1247
1248   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1249     tmp_stmts[permutation[i]] = stmt;
1250
1251   SLP_TREE_SCALAR_STMTS (node).release ();
1252   SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
1253 }
1254
1255
1256 /* Attempt to reorder stmts in a reduction chain so that we don't
1257    require any load permutation.  Return true if that was possible,
1258    otherwise return false.  */
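/* For example, in a reduction chain whose load nodes all carry the
   permutation {1, 0, 3, 2}, the scalar stmts of every node in the
   instance can be reordered to match the memory order of the loads,
   after which no runtime permutation is needed.  */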
1259
1260 static bool
1261 vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
1262 {
1263   unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
1264   unsigned int i, j;
1265   sbitmap load_index;
1266   unsigned int lidx;
1267   slp_tree node, load;
1268
1269   /* Compare all the permutation sequences to the first one.  We know
1270      that at least one load is permuted.  */
1271   node = SLP_INSTANCE_LOADS (slp_instn)[0];
1272   if (!node->load_permutation.exists ())
1273     return false;
1274   for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
1275     {
1276       if (!load->load_permutation.exists ())
1277         return false;
1278       FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
1279         if (lidx != node->load_permutation[j])
1280           return false;
1281     }
1282
1283   /* Check that the loads in the first sequence are different and there
1284      are no gaps between them.  */
1285   load_index = sbitmap_alloc (group_size);
1286   bitmap_clear (load_index);
1287   FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
1288     {
1289       if (bitmap_bit_p (load_index, lidx))
1290         {
1291           sbitmap_free (load_index);
1292           return false;
1293         }
1294       bitmap_set_bit (load_index, lidx);
1295     }
1296   for (i = 0; i < group_size; i++)
1297     if (!bitmap_bit_p (load_index, i))
1298       {
1299         sbitmap_free (load_index);
1300         return false;
1301       }
1302   sbitmap_free (load_index);
1303
1304   /* This permutation is valid for reduction.  Since the order of the
1305      statements in the nodes is not important unless they are memory
1306      accesses, we can rearrange the statements in all the nodes
1307      according to the order of the loads.  */
1308   vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
1309                             node->load_permutation);
1310
1311   /* We are done, no actual permutations need to be generated.  */
1312   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1313     SLP_TREE_LOAD_PERMUTATION (node).release ();
1314   return true;
1315 }
1316
1317 /* Check if the required load permutations in the SLP instance
1318    SLP_INSTN are supported.  */
1319
1320 static bool
1321 vect_supported_load_permutation_p (slp_instance slp_instn)
1322 {
1323   unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
1324   unsigned int i, j, k, next;
1325   slp_tree node;
1326   gimple *stmt, *load, *next_load, *first_load;
1327   struct data_reference *dr;
1328
1329   if (dump_enabled_p ())
1330     {
1331       dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
1332       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1333         if (node->load_permutation.exists ())
1334           FOR_EACH_VEC_ELT (node->load_permutation, j, next)
1335             dump_printf (MSG_NOTE, "%d ", next);
1336         else
1337           for (k = 0; k < group_size; ++k)
1338             dump_printf (MSG_NOTE, "%d ", k);
1339       dump_printf (MSG_NOTE, "\n");
1340     }
1341
1342   /* In case of reduction every load permutation is allowed, since the order
1343      of the reduction statements is not important (as opposed to the case of
1344      grouped stores).  The only condition we need to check is that all the
1345      load nodes are of the same size and have the same permutation (and then
1346      rearrange all the nodes of the SLP instance according to this 
1347      permutation).  */
1348
1349   /* Check that all the load nodes are of the same size.  */
1350   /* ???  Can't we assert this? */
1351   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1352     if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
1353       return false;
1354
1355   node = SLP_INSTANCE_TREE (slp_instn);
1356   stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1357
1358   /* Reduction (there are no data-refs in the root).
1359      In a reduction chain the order of the loads is not important.  */
1360   if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
1361       && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1362     {
1363       if (vect_attempt_slp_rearrange_stmts (slp_instn))
1364         return true;
1365
1366       /* Fallthru to general load permutation handling.  */
1367     }
1368
1369   /* In basic block vectorization we allow any subchain of an interleaving
1370      chain.
1371      FORNOW: not supported in loop SLP because of realignment complications.  */
1372   if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
1373     {
1374       /* Check whether the loads in an instance form a subchain and thus
1375          no permutation is necessary.  */
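      /* E.g. for an interleaving chain of loads from a[0] ... a[3], a node
         loading a[1] and a[2] in order is a subchain and needs no
         permutation.  */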
1376       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1377         {
1378           if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
1379             continue;
1380           bool subchain_p = true;
1381           next_load = NULL;
1382           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
1383             {
1384               if (j != 0
1385                   && (next_load != load
1386                       || GROUP_GAP (vinfo_for_stmt (load)) != 1))
1387                 {
1388                   subchain_p = false;
1389                   break;
1390                 }
1391               next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
1392             }
1393           if (subchain_p)
1394             SLP_TREE_LOAD_PERMUTATION (node).release ();
1395           else
1396             {
1397               /* Verify the permutation can be generated.  */
1398               vec<tree> tem;
1399               if (!vect_transform_slp_perm_load (node, tem, NULL,
1400                                                  1, slp_instn, true))
1401                 {
1402                   dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1403                                    vect_location,
1404                                    "unsupported load permutation\n");
1405                   return false;
1406                 }
1407             }
1408         }
1409
1410       /* Check that the alignment of the first load in every subchain, i.e.,
1411          the first statement in every load node, is supported.
1412          ???  This belongs in alignment checking.  */
1413       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1414         {
1415           first_load = SLP_TREE_SCALAR_STMTS (node)[0];
1416           if (first_load != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load)))
1417             {
1418               dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load));
1419               if (vect_supportable_dr_alignment (dr, false)
1420                   == dr_unaligned_unsupported)
1421                 {
1422                   if (dump_enabled_p ())
1423                     {
1424                       dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1425                                        vect_location,
1426                                        "unsupported unaligned load ");
1427                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1428                                         first_load, 0);
1429                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1430                     }
1431                   return false;
1432                 }
1433             }
1434         }
1435
1436       return true;
1437     }
1438
1439   /* For loop vectorization verify we can generate the permutation.  */
1440   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1441     if (node->load_permutation.exists ()
1442         && !vect_transform_slp_perm_load
1443               (node, vNULL, NULL,
1444                SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true))
1445       return false;
1446
1447   return true;
1448 }
1449
1450
1451 /* Find the last scalar stmt of the SLP group in NODE.  */
1452
1453 static gimple *
1454 vect_find_last_scalar_stmt_in_slp (slp_tree node)
1455 {
1456   gimple *last = NULL, *stmt;
1457
1458   for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt); i++)
1459     {
1460       stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1461       if (is_pattern_stmt_p (stmt_vinfo))
1462         last = get_later_stmt (STMT_VINFO_RELATED_STMT (stmt_vinfo), last);
1463       else
1464         last = get_later_stmt (stmt, last);
1465     }
1466
1467   return last;
1468 }
1469
1470 /* Compute the cost for the SLP node NODE in the SLP instance INSTANCE.  */
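/* Illustrative note: a two-operator node (SLP_TREE_TWO_OPERATORS) is
   costed as two vector stmts plus one permute, and a permuted load node
   gets one extra vec_perm cost for the group; see below.  */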
1471
1472 static void
1473 vect_analyze_slp_cost_1 (slp_instance instance, slp_tree node,
1474                          stmt_vector_for_cost *prologue_cost_vec,
1475                          stmt_vector_for_cost *body_cost_vec,
1476                          unsigned ncopies_for_cost)
1477 {
1478   unsigned i;
1479   slp_tree child;
1480   gimple *stmt, *s;
1481   stmt_vec_info stmt_info;
1482   tree lhs;
1483   unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
1484
1485   /* Recurse down the SLP tree.  */
1486   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1487     if (child)
1488       vect_analyze_slp_cost_1 (instance, child, prologue_cost_vec,
1489                                body_cost_vec, ncopies_for_cost);
1490
1491   /* Look at the first scalar stmt to determine the cost.  */
1492   stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1493   stmt_info = vinfo_for_stmt (stmt);
1494   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1495     {
1496       if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
1497         vect_model_store_cost (stmt_info, ncopies_for_cost, false,
1498                                vect_uninitialized_def,
1499                                node, prologue_cost_vec, body_cost_vec);
1500       else
1501         {
1502           int i;
1503           gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
1504           vect_model_load_cost (stmt_info, ncopies_for_cost, false,
1505                                 node, prologue_cost_vec, body_cost_vec);
1506           /* If the load is permuted record the cost for the permutation.
1507              ???  Loads from multiple chains are let through here only
1508              for a single special case involving complex numbers where
1509              in the end no permutation is necessary.  */
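               /* E.g. a load node whose stmts read chain elements out of
                  order, say {1, 0}, gets a vec_perm cost recorded here.  */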
1510           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, s)
1511             if ((STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo_for_stmt (s))
1512                  == STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info))
1513                 && vect_get_place_in_interleaving_chain
1514                      (s, STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info)) != i)
1515               {
1516                 record_stmt_cost (body_cost_vec, group_size, vec_perm,
1517                                   stmt_info, 0, vect_body);
1518                 break;
1519               }
1520         }
1521     }
1522   else
1523     {
1524       record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
1525                         stmt_info, 0, vect_body);
1526       if (SLP_TREE_TWO_OPERATORS (node))
1527         {
1528           record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
1529                             stmt_info, 0, vect_body);
1530           record_stmt_cost (body_cost_vec, ncopies_for_cost, vec_perm,
1531                             stmt_info, 0, vect_body);
1532         }
1533     }
1534
1535   /* Scan operands and account for prologue cost of constants/externals.
1536      ???  This over-estimates cost for multiple uses and should be
1537      re-engineered.  */
1538   lhs = gimple_get_lhs (stmt);
1539   for (i = 0; i < gimple_num_ops (stmt); ++i)
1540     {
1541       tree def, op = gimple_op (stmt, i);
1542       gimple *def_stmt;
1543       enum vect_def_type dt;
1544       if (!op || op == lhs)
1545         continue;
1546       if (vect_is_simple_use (op, NULL, stmt_info->vinfo, &def_stmt, &def, &dt))
1547         {
1548           /* Without looking at the actual initializer a vector of
1549              constants can be implemented as a load from the constant pool.
1550              ???  We need to pass down stmt_info for a vector type
1551              even if it points to the wrong stmt.  */
1552           if (dt == vect_constant_def)
1553             record_stmt_cost (prologue_cost_vec, 1, vector_load,
1554                               stmt_info, 0, vect_prologue);
1555           else if (dt == vect_external_def)
1556             record_stmt_cost (prologue_cost_vec, 1, vec_construct,
1557                               stmt_info, 0, vect_prologue);
1558         }
1559     }
1560 }
1561
1562 /* Compute the cost for the SLP instance INSTANCE.  */
1563
1564 static void
1565 vect_analyze_slp_cost (slp_instance instance, void *data)
1566 {
1567   stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
1568   unsigned ncopies_for_cost;
1569   stmt_info_for_cost *si;
1570   unsigned i;
1571
1572   if (dump_enabled_p ())
1573     dump_printf_loc (MSG_NOTE, vect_location,
1574                      "=== vect_analyze_slp_cost ===\n");
1575
1576   /* Calculate the number of vector stmts to create based on the unrolling
1577      factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
1578      GROUP_SIZE / NUNITS otherwise).  */
1579   unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
1580   slp_tree node = SLP_INSTANCE_TREE (instance);
1581   stmt_vec_info stmt_info = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
1582   /* Adjust the group_size by the vectorization factor which is always one
1583      for basic-block vectorization.  */
1584   if (STMT_VINFO_LOOP_VINFO (stmt_info))
1585     group_size *= LOOP_VINFO_VECT_FACTOR (STMT_VINFO_LOOP_VINFO (stmt_info));
1586   unsigned nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1587   /* For reductions look at a reduction operand in case the reduction
1588      operation is widening like DOT_PROD or SAD.  */
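       /* E.g. a DOT_PROD_EXPR of chars accumulating into ints has four
          times as many operand elements as result elements (16 for V16QI
          vs. 4 for V4SI, assuming 128-bit vectors), so take NUNITS from
          the operand's vector type.  */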
1589   if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1590     {
1591       gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1592       switch (gimple_assign_rhs_code (stmt))
1593         {
1594         case DOT_PROD_EXPR:
1595         case SAD_EXPR:
1596           nunits = TYPE_VECTOR_SUBPARTS (get_vectype_for_scalar_type
1597                                 (TREE_TYPE (gimple_assign_rhs1 (stmt))));
1598           break;
1599         default:;
1600         }
1601     }
1602   ncopies_for_cost = least_common_multiple (nunits, group_size) / nunits;
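       /* E.g. with NUNITS 4 and a VF-scaled GROUP_SIZE of 6 this is
          lcm (4, 6) / 4 == 3 vector stmt copies to cost for the node.  */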
1603
1604   prologue_cost_vec.create (10);
1605   body_cost_vec.create (10);
1606   vect_analyze_slp_cost_1 (instance, SLP_INSTANCE_TREE (instance),
1607                            &prologue_cost_vec, &body_cost_vec,
1608                            ncopies_for_cost);
1609
1610   /* Record the prologue costs, which were delayed until we were
1611      sure that SLP was successful.  */
1612   FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
1613     {
1614       struct _stmt_vec_info *stmt_info
1615         = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1616       (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1617                             si->misalign, vect_prologue);
1618     }
1619
1620   /* Record the instance's instructions in the target cost model.  */
1621   FOR_EACH_VEC_ELT (body_cost_vec, i, si)
1622     {
1623       struct _stmt_vec_info *stmt_info
1624         = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1625       (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1626                             si->misalign, vect_body);
1627     }
1628
1629   prologue_cost_vec.release ();
1630   body_cost_vec.release ();
1631 }
1632
1633 /* Analyze an SLP instance starting from a group of grouped stores.  Call
1634    vect_build_slp_tree to build a tree of packed stmts if possible.
1635    Return FALSE if it's impossible to SLP any stmt in the loop.  */
1636
1637 static bool
1638 vect_analyze_slp_instance (vec_info *vinfo,
1639                            gimple *stmt, unsigned max_tree_size)
1640 {
1641   slp_instance new_instance;
1642   slp_tree node;
1643   unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
1644   unsigned int unrolling_factor = 1, nunits;
1645   tree vectype, scalar_type = NULL_TREE;
1646   gimple *next;
1647   unsigned int vectorization_factor = 0;
1648   int i;
1649   unsigned int max_nunits = 0;
1650   vec<slp_tree> loads;
1651   struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
1652   vec<gimple *> scalar_stmts;
1653
1654   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1655     {
1656       if (dr)
1657         {
1658           scalar_type = TREE_TYPE (DR_REF (dr));
1659           vectype = get_vectype_for_scalar_type (scalar_type);
1660         }
1661       else
1662         {
1663           gcc_assert (is_a <loop_vec_info> (vinfo));
1664           vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1665         }
1666
1667       group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
1668     }
1669   else
1670     {
1671       gcc_assert (is_a <loop_vec_info> (vinfo));
1672       vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1673       group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
1674     }
1675
1676   if (!vectype)
1677     {
1678       if (dump_enabled_p ())
1679         {
1680           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1681                            "Build SLP failed: unsupported data-type ");
1682           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
1683           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1684         }
1685
1686       return false;
1687     }
1688
1689   nunits = TYPE_VECTOR_SUBPARTS (vectype);
1690   if (is_a <loop_vec_info> (vinfo))
1691     vectorization_factor = as_a <loop_vec_info> (vinfo)->vectorization_factor;
1692   else
1693     vectorization_factor = nunits;
1694
1695   /* Calculate the unrolling factor.  */
1696   unrolling_factor = least_common_multiple (nunits, group_size) / group_size;
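       /* E.g. GROUP_SIZE 3 with NUNITS 4 gives lcm (4, 3) / 3 == 4: the
          group is repeated four times so twelve scalars fill three
          vectors exactly.  */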
1697   if (unrolling_factor != 1 && is_a <bb_vec_info> (vinfo))
1698     {
1699       if (dump_enabled_p ())
1700         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1701                          "Build SLP failed: unrolling required in basic"
1702                          " block SLP\n");
1703
1704       return false;
1705     }
1706
1707   /* Create a node (a root of the SLP tree) for the packed grouped stores.  */
1708   scalar_stmts.create (group_size);
1709   next = stmt;
1710   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1711     {
1712       /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
1713       while (next)
1714         {
1715           if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
1716               && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
1717             scalar_stmts.safe_push (
1718                   STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
1719           else
1720             scalar_stmts.safe_push (next);
1721           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
1722         }
1723       /* Mark the first element of the reduction chain as reduction to properly
1724          transform the node.  In the reduction analysis phase only the last
1725          element of the chain is marked as reduction.  */
1726       if (!STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
1727         STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_reduction_def;
1728     }
1729   else
1730     {
1731       /* Collect reduction statements.  */
1732       vec<gimple *> reductions = as_a <loop_vec_info> (vinfo)->reductions;
1733       for (i = 0; reductions.iterate (i, &next); i++)
1734         scalar_stmts.safe_push (next);
1735     }
1736
1737   node = vect_create_new_slp_node (scalar_stmts);
1738
1739   loads.create (group_size);
1740
1741   /* Build the tree for the SLP instance.  */
1742   bool *matches = XALLOCAVEC (bool, group_size);
1743   unsigned npermutes = 0;
1744   if (vect_build_slp_tree (vinfo, &node, group_size,
1745                            &max_nunits, &loads,
1746                            vectorization_factor, matches, &npermutes, NULL,
1747                            max_tree_size))
1748     {
1749       /* Calculate the unrolling factor based on the smallest type.  */
1750       if (max_nunits > nunits)
1751         unrolling_factor = least_common_multiple (max_nunits, group_size)
1752                            / group_size;
1753
1754       if (unrolling_factor != 1 && is_a <bb_vec_info> (vinfo))
1755         {
1756           if (dump_enabled_p ())
1757             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1758                              "Build SLP failed: unrolling required in basic"
1759                              " block SLP\n");
1760           vect_free_slp_tree (node);
1761           loads.release ();
1762           return false;
1763         }
1764
1765       /* Create a new SLP instance.  */
1766       new_instance = XNEW (struct _slp_instance);
1767       SLP_INSTANCE_TREE (new_instance) = node;
1768       SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
1769       SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
1770       SLP_INSTANCE_LOADS (new_instance) = loads;
1771
1772       /* Compute the load permutation.  */
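           /* E.g. a load node whose scalar stmts read elements 1 and 0 of
              the chain headed by FIRST_STMT, in that order, records the
              permutation {1, 0} below.  */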
1773       slp_tree load_node;
1774       bool loads_permuted = false;
1775       FOR_EACH_VEC_ELT (loads, i, load_node)
1776         {
1777           vec<unsigned> load_permutation;
1778           int j;
1779           gimple *load, *first_stmt;
1780           bool this_load_permuted = false;
1781           load_permutation.create (group_size);
1782           first_stmt = GROUP_FIRST_ELEMENT
1783               (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
1784           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
1785             {
1786               int load_place
1787                 = vect_get_place_in_interleaving_chain (load, first_stmt);
1788               gcc_assert (load_place != -1);
1789               if (load_place != j)
1790                 this_load_permuted = true;
1791               load_permutation.safe_push (load_place);
1792             }
1793           if (!this_load_permuted
1794               /* The load requires permutation when unrolling exposes
1795                  a gap either because the group is larger than the SLP
1796                  group-size or because there is a gap between the groups.  */
1797               && (unrolling_factor == 1
1798                   || (group_size == GROUP_SIZE (vinfo_for_stmt (first_stmt))
1799                       && GROUP_GAP (vinfo_for_stmt (first_stmt)) == 0)))
1800             {
1801               load_permutation.release ();
1802               continue;
1803             }
1804           SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
1805           loads_permuted = true;
1806         }
1807
1808       if (loads_permuted)
1809         {
1810           if (!vect_supported_load_permutation_p (new_instance))
1811             {
1812               if (dump_enabled_p ())
1813                 {
1814                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1815                                    "Build SLP failed: unsupported load "
1816                                    "permutation ");
1817                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
1818                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1819                 }
1820               vect_free_slp_instance (new_instance);
1821               return false;
1822             }
1823         }
1824
1825       vinfo->slp_instances.safe_push (new_instance);
1826
1827       if (dump_enabled_p ())
1828         vect_print_slp_tree (MSG_NOTE, node);
1829
1830       return true;
1831     }
1832
1833   /* Failed to SLP.  */
1834   /* Free the allocated memory.  */
1835   vect_free_slp_tree (node);
1836   loads.release ();
1837
1838   return false;
1839 }
1840
1841
1842 /* Check if there are stmts in the loop that can be vectorized using SLP.
1843    Build SLP trees of packed scalar stmts if SLP is possible.  */
1844
1845 bool
1846 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
1847 {
1848   unsigned int i;
1849   gimple *first_element;
1850   bool ok = false;
1851
1852   if (dump_enabled_p ())
1853     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===\n");
1854
1855   /* Find SLP sequences starting from groups of grouped stores.  */
1856   FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
1857     if (vect_analyze_slp_instance (vinfo, first_element, max_tree_size))
1858       ok = true;
1859
1860   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1861     {
1862       if (loop_vinfo->reduction_chains.length () > 0)
1863         {
1864           /* Find SLP sequences starting from reduction chains.  */
1865           FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
1866             if (vect_analyze_slp_instance (vinfo, first_element,
1867                                            max_tree_size))
1868               ok = true;
1869             else
1870               return false;
1871
1872           /* Don't try to vectorize SLP reductions if reduction chain was
1873              detected.  */
1874           return ok;
1875         }
1876
1877       /* Find SLP sequences starting from groups of reductions.  */
1878       if (loop_vinfo->reductions.length () > 1
1879           && vect_analyze_slp_instance (vinfo, loop_vinfo->reductions[0],
1880                                         max_tree_size))
1881         ok = true;
1882     }
1883
1884   return true;
1885 }
1886
1887
1888 /* For each possible SLP instance decide whether to SLP it and calculate the
1889    overall unrolling factor needed to SLP the loop.  Return TRUE if we decided
1890    to SLP at least one instance.  */
1891
1892 bool
1893 vect_make_slp_decision (loop_vec_info loop_vinfo)
1894 {
1895   unsigned int i, unrolling_factor = 1;
1896   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1897   slp_instance instance;
1898   int decided_to_slp = 0;
1899
1900   if (dump_enabled_p ())
1901     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ==="
1902                      "\n");
1903
1904   FOR_EACH_VEC_ELT (slp_instances, i, instance)
1905     {
1906       /* FORNOW: SLP if you can.  */
1907       if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance))
1908         unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance);
1909
1910       /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
1911          call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
1912          loop-based vectorization.  Such stmts will be marked as HYBRID.  */
1913       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
1914       decided_to_slp++;
1915     }
1916
1917   LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
1918
1919   if (decided_to_slp && dump_enabled_p ())
1920     dump_printf_loc (MSG_NOTE, vect_location,
1921                      "Decided to SLP %d instances. Unrolling factor %d\n",
1922                      decided_to_slp, unrolling_factor);
1923
1924   return (decided_to_slp > 0);
1925 }
1926
1927
1928 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
1929    can't be SLPed) in the tree rooted at NODE.  Mark such stmts as HYBRID.  */
1930
1931 static void
1932 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
1933 {
1934   gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[i];
1935   imm_use_iterator imm_iter;
1936   gimple *use_stmt;
1937   stmt_vec_info use_vinfo, stmt_vinfo = vinfo_for_stmt (stmt);
1938   slp_tree child;
1939   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1940   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1941   int j;
1942
1943   /* Propagate hybrid down the SLP tree.  */
1944   if (stype == hybrid)
1945     ;
1946   else if (HYBRID_SLP_STMT (stmt_vinfo))
1947     stype = hybrid;
1948   else
1949     {
1950       /* Check if a pure SLP stmt has uses in non-SLP stmts.  */
1951       gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
1952       /* We always get the pattern stmt here, but for immediate
1953          uses we have to use the LHS of the original stmt.  */
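           /* Pattern stmts are not inserted into the IL, so their defs
              have no recorded immediate uses.  */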
1954       gcc_checking_assert (!STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
1955       if (STMT_VINFO_RELATED_STMT (stmt_vinfo))
1956         stmt = STMT_VINFO_RELATED_STMT (stmt_vinfo);
1957       if (TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME)
1958         FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0))
1959           {
1960             if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1961               continue;
1962             use_vinfo = vinfo_for_stmt (use_stmt);
1963             if (STMT_VINFO_IN_PATTERN_P (use_vinfo)
1964                 && STMT_VINFO_RELATED_STMT (use_vinfo))
1965               use_vinfo = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (use_vinfo));
1966             if (!STMT_SLP_TYPE (use_vinfo)
1967                 && (STMT_VINFO_RELEVANT (use_vinfo)
1968                     || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
1969                 && !(gimple_code (use_stmt) == GIMPLE_PHI
1970                      && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
1971               {
1972                 if (dump_enabled_p ())
1973                   {
1974                     dump_printf_loc (MSG_NOTE, vect_location, "use of SLP "
1975                                      "def in non-SLP stmt: ");
1976                     dump_gimple_stmt (MSG_NOTE, TDF_SLIM, use_stmt, 0);
1977                   }
1978                 stype = hybrid;
1979               }
1980           }
1981     }
1982
1983   if (stype == hybrid
1984       && !HYBRID_SLP_STMT (stmt_vinfo))
1985     {
1986       if (dump_enabled_p ())
1987         {
1988           dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
1989           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
1990         }
1991       STMT_SLP_TYPE (stmt_vinfo) = hybrid;
1992     }
1993
1994   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
1995     if (child)
1996       vect_detect_hybrid_slp_stmts (child, i, stype);
1997 }
1998
1999 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses.  */
2000
2001 static tree
2002 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
2003 {
2004   walk_stmt_info *wi = (walk_stmt_info *)data;
2005   struct loop *loopp = (struct loop *)wi->info;
2006
2007   if (wi->is_lhs)
2008     return NULL_TREE;
2009
2010   if (TREE_CODE (*tp) == SSA_NAME
2011       && !SSA_NAME_IS_DEFAULT_DEF (*tp))
2012     {
2013       gimple *def_stmt = SSA_NAME_DEF_STMT (*tp);
2014       if (flow_bb_inside_loop_p (loopp, gimple_bb (def_stmt))
2015           && PURE_SLP_STMT (vinfo_for_stmt (def_stmt)))
2016         {
2017           if (dump_enabled_p ())
2018             {
2019               dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2020               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2021             }
2022           STMT_SLP_TYPE (vinfo_for_stmt (def_stmt)) = hybrid;
2023         }
2024     }
2025
2026   return NULL_TREE;
2027 }
2028
2029 static tree
2030 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
2031                           walk_stmt_info *)
2032 {
2033   /* If the stmt is in a SLP instance then this isn't a reason
2034      to mark use definitions in other SLP instances as hybrid.  */
2035   if (STMT_SLP_TYPE (vinfo_for_stmt (gsi_stmt (*gsi))) != loop_vect)
2036     *handled = true;
2037   return NULL_TREE;
2038 }
2039
2040 /* Find stmts that must be both vectorized and SLPed.  */
2041
2042 void
2043 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2044 {
2045   unsigned int i;
2046   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2047   slp_instance instance;
2048
2049   if (dump_enabled_p ())
2050     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ==="
2051                      "\n");
2052
2053   /* First walk all pattern stmts in the loop and mark defs of uses as
2054      hybrid because immediate uses in them are not recorded.  */
2055   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2056     {
2057       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2058       for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2059            gsi_next (&gsi))
2060         {
2061           gimple *stmt = gsi_stmt (gsi);
2062           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2063           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2064             {
2065               walk_stmt_info wi;
2066               memset (&wi, 0, sizeof (wi));
2067               wi.info = LOOP_VINFO_LOOP (loop_vinfo);
2068               gimple_stmt_iterator gsi2
2069                 = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2070               walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
2071                                 vect_detect_hybrid_slp_1, &wi);
2072               walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
2073                                vect_detect_hybrid_slp_2,
2074                                vect_detect_hybrid_slp_1, &wi);
2075             }
2076         }
2077     }
2078
2079   /* Then walk the SLP instance trees marking stmts with uses in
2080      non-SLP stmts as hybrid, also propagating hybrid down the
2081      SLP tree, collecting the above info on-the-fly.  */
2082   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2083     {
2084       for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
2085         vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
2086                                       i, pure_slp);
2087     }
2088 }
2089
2090
2091 /* Create and initialize a new bb_vec_info struct for BB, as well as
2092    stmt_vec_info structs for all the stmts in it.  */
2093
2094 static bb_vec_info
2095 new_bb_vec_info (basic_block bb)
2096 {
2097   bb_vec_info res = NULL;
2098   gimple_stmt_iterator gsi;
2099
2100   res = (bb_vec_info) xcalloc (1, sizeof (struct _bb_vec_info));
2101   res->kind = vec_info::bb;
2102   BB_VINFO_BB (res) = bb;
2103
2104   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2105     {
2106       gimple *stmt = gsi_stmt (gsi);
2107       gimple_set_uid (stmt, 0);
2108       set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res));
2109     }
2110
2111   BB_VINFO_GROUPED_STORES (res).create (10);
2112   BB_VINFO_SLP_INSTANCES (res).create (2);
2113   BB_VINFO_TARGET_COST_DATA (res) = init_cost (NULL);
2114
2115   bb->aux = res;
2116   return res;
2117 }
2118
2119
2120 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2121    stmts in the basic block.  */
2122
2123 static void
2124 destroy_bb_vec_info (bb_vec_info bb_vinfo)
2125 {
2126   vec<slp_instance> slp_instances;
2127   slp_instance instance;
2128   basic_block bb;
2129   gimple_stmt_iterator si;
2130   unsigned i;
2131
2132   if (!bb_vinfo)
2133     return;
2134
2135   bb = BB_VINFO_BB (bb_vinfo);
2136
2137   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2138     {
2139       gimple *stmt = gsi_stmt (si);
2140       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2141
2142       if (stmt_info)
2143         /* Free stmt_vec_info.  */
2144         free_stmt_vec_info (stmt);
2145     }
2146
2147   vect_destroy_datarefs (bb_vinfo);
2148   free_dependence_relations (BB_VINFO_DDRS (bb_vinfo));
2149   BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
2150   slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2151   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2152     vect_free_slp_instance (instance);
2153   BB_VINFO_SLP_INSTANCES (bb_vinfo).release ();
2154   destroy_cost_data (BB_VINFO_TARGET_COST_DATA (bb_vinfo));
2155   free (bb_vinfo);
2156   bb->aux = NULL;
2157 }
2158
2159
2160 /* Analyze statements contained in SLP tree node after recursively analyzing
2161    the subtree. Return TRUE if the operations are supported.  */
2162
2163 static bool
2164 vect_slp_analyze_node_operations (slp_tree node)
2165 {
2166   bool dummy;
2167   int i;
2168   gimple *stmt;
2169   slp_tree child;
2170
2171   if (!node)
2172     return true;
2173
2174   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2175     if (!vect_slp_analyze_node_operations (child))
2176       return false;
2177
2178   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2179     {
2180       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2181       gcc_assert (stmt_info);
2182       gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2183
2184       if (!vect_analyze_stmt (stmt, &dummy, node))
2185         return false;
2186     }
2187
2188   return true;
2189 }
2190
2191
2192 /* Analyze statements in SLP instances of the basic block.  Return TRUE if the
2193    operations are supported. */
2194
2195 bool
2196 vect_slp_analyze_operations (vec<slp_instance> slp_instances, void *data)
2197 {
2198   slp_instance instance;
2199   int i;
2200
2201   if (dump_enabled_p ())
2202     dump_printf_loc (MSG_NOTE, vect_location,
2203                      "=== vect_slp_analyze_operations ===\n");
2204
2205   for (i = 0; slp_instances.iterate (i, &instance); )
2206     {
2207       if (!vect_slp_analyze_node_operations (SLP_INSTANCE_TREE (instance)))
2208         {
2209           dump_printf_loc (MSG_NOTE, vect_location,
2210                            "removing SLP instance operations starting from: ");
2211           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
2212                             SLP_TREE_SCALAR_STMTS
2213                               (SLP_INSTANCE_TREE (instance))[0], 0);
2214           vect_free_slp_instance (instance);
2215           slp_instances.ordered_remove (i);
2216         }
2217       else
2218         {
2219           /* Compute the costs of the SLP instance.  */
2220           vect_analyze_slp_cost (instance, data);
2221           i++;
2222         }
2223     }
2224
2225   if (!slp_instances.length ())
2226     return false;
2227
2228   return true;
2229 }
2230
2231
2232 /* Compute the scalar cost of the SLP node NODE and its children
2233    and return it.  Do not account defs that are marked in LIFE and
2234    update LIFE according to uses of NODE.  */
2235
2236 static unsigned
2237 vect_bb_slp_scalar_cost (basic_block bb,
2238                          slp_tree node, vec<bool, va_heap> *life)
2239 {
2240   unsigned scalar_cost = 0;
2241   unsigned i;
2242   gimple *stmt;
2243   slp_tree child;
2244
2245   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2246     {
2247       unsigned stmt_cost;
2248       ssa_op_iter op_iter;
2249       def_operand_p def_p;
2250       stmt_vec_info stmt_info;
2251
2252       if ((*life)[i])
2253         continue;
2254
2255       /* If there is a non-vectorized use of the defs then the scalar
2256          stmt is kept live in which case we do not account it or any
2257          required defs in the SLP children in the scalar cost.  This
2258          way we make the vectorization more costly when compared to
2259          the scalar cost.  */
2260       FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2261         {
2262           imm_use_iterator use_iter;
2263           gimple *use_stmt;
2264           FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2265             if (!is_gimple_debug (use_stmt)
2266                 && (gimple_code (use_stmt) == GIMPLE_PHI
2267                     || gimple_bb (use_stmt) != bb
2268                     || !STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (use_stmt))))
2269               {
2270                 (*life)[i] = true;
2271                 BREAK_FROM_IMM_USE_STMT (use_iter);
2272               }
2273         }
2274       if ((*life)[i])
2275         continue;
2276
2277       stmt_info = vinfo_for_stmt (stmt);
2278       if (STMT_VINFO_DATA_REF (stmt_info))
2279         {
2280           if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2281             stmt_cost = vect_get_stmt_cost (scalar_load);
2282           else
2283             stmt_cost = vect_get_stmt_cost (scalar_store);
2284         }
2285       else
2286         stmt_cost = vect_get_stmt_cost (scalar_stmt);
2287
2288       scalar_cost += stmt_cost;
2289     }
2290
2291   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2292     if (child)
2293       scalar_cost += vect_bb_slp_scalar_cost (bb, child, life);
2294
2295   return scalar_cost;
2296 }
2297
2298 /* Check if vectorization of the basic block is profitable.  */
2299
2300 static bool
2301 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2302 {
2303   vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2304   slp_instance instance;
2305   int i;
2306   unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2307   unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2308
2309   /* Calculate scalar cost.  */
2310   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2311     {
2312       auto_vec<bool, 20> life;
2313       life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
2314       scalar_cost += vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2315                                               SLP_INSTANCE_TREE (instance),
2316                                               &life);
2317     }
2318
2319   /* Complete the target-specific cost calculation.  */
2320   finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2321                &vec_inside_cost, &vec_epilogue_cost);
2322
2323   vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2324
2325   if (dump_enabled_p ())
2326     {
2327       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2328       dump_printf (MSG_NOTE, "  Vector inside of basic block cost: %d\n",
2329                    vec_inside_cost);
2330       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n", vec_prologue_cost);
2331       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n", vec_epilogue_cost);
2332       dump_printf (MSG_NOTE, "  Scalar cost of basic block: %d\n", scalar_cost);
2333     }
2334
2335   /* Vectorization is profitable if its cost is less than the cost of the
2336      scalar version.  */
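       /* E.g. an inside cost of 6 plus an outside cost of 2 beats a
          scalar cost of 12, but not one of 8.  */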
2337   if (vec_outside_cost + vec_inside_cost >= scalar_cost)
2338     return false;
2339
2340   return true;
2341 }
2342
2343 /* Check if the basic block can be vectorized.  */
2344
2345 static bb_vec_info
2346 vect_slp_analyze_bb_1 (basic_block bb)
2347 {
2348   bb_vec_info bb_vinfo;
2349   vec<slp_instance> slp_instances;
2350   slp_instance instance;
2351   int i;
2352   int min_vf = 2;
2353   unsigned n_stmts = 0;
2354
2355   bb_vinfo = new_bb_vec_info (bb);
2356   if (!bb_vinfo)
2357     return NULL;
2358
2359   if (!vect_analyze_data_refs (bb_vinfo, &min_vf, &n_stmts))
2360     {
2361       if (dump_enabled_p ())
2362         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2363                          "not vectorized: unhandled data-ref in basic "
2364                          "block.\n");
2365
2366       destroy_bb_vec_info (bb_vinfo);
2367       return NULL;
2368     }
2369
2370   if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
2371     {
2372       if (dump_enabled_p ())
2373         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2374                          "not vectorized: not enough data-refs in "
2375                          "basic block.\n");
2376
2377       destroy_bb_vec_info (bb_vinfo);
2378       return NULL;
2379     }
2380
2381   if (!vect_analyze_data_ref_accesses (bb_vinfo))
2382     {
2383      if (dump_enabled_p ())
2384        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2385                         "not vectorized: unhandled data access in "
2386                         "basic block.\n");
2387
2388       destroy_bb_vec_info (bb_vinfo);
2389       return NULL;
2390     }
2391
2392   vect_pattern_recog (bb_vinfo);
2393
2394   if (!vect_analyze_data_refs_alignment (bb_vinfo))
2395     {
2396       if (dump_enabled_p ())
2397         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2398                          "not vectorized: bad data alignment in basic "
2399                          "block.\n");
2400
2401       destroy_bb_vec_info (bb_vinfo);
2402       return NULL;
2403     }
2404
2405   /* Check the SLP opportunities in the basic block, analyze and build SLP
2406      trees.  */
2407   if (!vect_analyze_slp (bb_vinfo, n_stmts))
2408     {
2409       if (dump_enabled_p ())
2410         {
2411           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2412                            "Failed to SLP the basic block.\n");
2413           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
2414                            "not vectorized: failed to find SLP opportunities "
2415                            "in basic block.\n");
2416         }
2417
2418       destroy_bb_vec_info (bb_vinfo);
2419       return NULL;
2420     }
2421
2422   slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2423
2424   /* Mark all the statements that we want to vectorize as pure SLP and
2425      relevant.  */
2426   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2427     {
2428       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2429       vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
2430     }
2431
2432   /* Mark all the statements that we do not want to vectorize.  */
2433   for (gimple_stmt_iterator gsi = gsi_start_bb (BB_VINFO_BB (bb_vinfo));
2434        !gsi_end_p (gsi); gsi_next (&gsi))
2435     {
2436       stmt_vec_info vinfo = vinfo_for_stmt (gsi_stmt (gsi));
2437       if (STMT_SLP_TYPE (vinfo) != pure_slp)
2438         STMT_VINFO_VECTORIZABLE (vinfo) = false;
2439     }
2440
2441   /* Analyze dependences.  At this point all stmts not participating in
2442      vectorization have to be marked.  Dependence analysis assumes
2443      that we either vectorize all SLP instances or none at all.  */
2444   if (!vect_slp_analyze_data_ref_dependences (bb_vinfo))
2445      {
2446        if (dump_enabled_p ())
2447          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2448                           "not vectorized: unhandled data dependence "
2449                           "in basic block.\n");
2450
2451        destroy_bb_vec_info (bb_vinfo);
2452        return NULL;
2453      }
2454
2455   if (!vect_verify_datarefs_alignment (bb_vinfo))
2456     {
2457       if (dump_enabled_p ())
2458         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2459                          "not vectorized: unsupported alignment in basic "
2460                          "block.\n");
2461       destroy_bb_vec_info (bb_vinfo);
2462       return NULL;
2463     }
2464
2465   if (!vect_slp_analyze_operations (BB_VINFO_SLP_INSTANCES (bb_vinfo),
2466                                     BB_VINFO_TARGET_COST_DATA (bb_vinfo)))
2467     {
2468       if (dump_enabled_p ())
2469         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2470                          "not vectorized: bad operation in basic block.\n");
2471
2472       destroy_bb_vec_info (bb_vinfo);
2473       return NULL;
2474     }
2475
2476   /* Cost model: check if the vectorization is worthwhile.  */
2477   if (!unlimited_cost_model (NULL)
2478       && !vect_bb_vectorization_profitable_p (bb_vinfo))
2479     {
2480       if (dump_enabled_p ())
2481         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2482                          "not vectorized: vectorization is not "
2483                          "profitable.\n");
2484
2485       destroy_bb_vec_info (bb_vinfo);
2486       return NULL;
2487     }
2488
2489   if (dump_enabled_p ())
2490     dump_printf_loc (MSG_NOTE, vect_location,
2491                      "Basic block will be vectorized using SLP\n");
2492
2493   return bb_vinfo;
2494 }
2495
2496
2497 bb_vec_info
2498 vect_slp_analyze_bb (basic_block bb)
2499 {
2500   bb_vec_info bb_vinfo;
2501   int insns = 0;
2502   gimple_stmt_iterator gsi;
2503   unsigned int vector_sizes;
2504
2505   if (dump_enabled_p ())
2506     dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");
2507
2508   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2509     {
2510       gimple *stmt = gsi_stmt (gsi);
2511       if (!is_gimple_debug (stmt)
2512           && !gimple_nop_p (stmt)
2513           && gimple_code (stmt) != GIMPLE_LABEL)
2514         insns++;
2515     }
2516
2517   if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2518     {
2519       if (dump_enabled_p ())
2520         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2521                          "not vectorized: too many instructions in "
2522                          "basic block.\n");
2523
2524       return NULL;
2525     }
2526
2527   /* Autodetect the first vector size we try.  */
2528   current_vector_size = 0;
2529   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2530
2531   while (1)
2532     {
2533       bb_vinfo = vect_slp_analyze_bb_1 (bb);
2534       if (bb_vinfo)
2535         return bb_vinfo;
2536
2537       destroy_bb_vec_info (bb_vinfo);
2538
2539       vector_sizes &= ~current_vector_size;
2540       if (vector_sizes == 0
2541           || current_vector_size == 0)
2542         return NULL;
2543
2544       /* Try the next biggest vector size.  */
2545       current_vector_size = 1 << floor_log2 (vector_sizes);
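           /* E.g. if 32-, 16- and 8-byte vectors are supported and the
              32-byte attempt failed, the remaining mask 16|8 selects 16
              as the next size to try.  */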
2546       if (dump_enabled_p ())
2547         dump_printf_loc (MSG_NOTE, vect_location,
2548                          "***** Re-trying analysis with "
2549                          "vector size %d\n", current_vector_size);
2550     }
2551 }
2552
2553
2554 /* For constant and loop invariant defs of SLP_NODE this function returns
2555    (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
2556    OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
2557    scalar stmts.  NUMBER_OF_VECTORS is the number of vector defs to create.
2558    REDUC_INDEX is the index of the reduction operand in the statements, unless
2559    it is -1.  */
2560
2561 static void
2562 vect_get_constant_vectors (tree op, slp_tree slp_node,
2563                            vec<tree> *vec_oprnds,
2564                            unsigned int op_num, unsigned int number_of_vectors,
2565                            int reduc_index)
2566 {
2567   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2568   gimple *stmt = stmts[0];
2569   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2570   unsigned nunits;
2571   tree vec_cst;
2572   tree *elts;
2573   unsigned j, number_of_places_left_in_vector;
2574   tree vector_type;
2575   tree vop;
2576   int group_size = stmts.length ();
2577   unsigned int vec_num, i;
2578   unsigned number_of_copies = 1;
2579   vec<tree> voprnds;
2580   voprnds.create (number_of_vectors);
2581   bool constant_p, is_store;
2582   tree neutral_op = NULL;
2583   enum tree_code code = gimple_expr_code (stmt);
2584   gimple *def_stmt;
2585   struct loop *loop;
2586   gimple_seq ctor_seq = NULL;
2587
2588   vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
2589   nunits = TYPE_VECTOR_SUBPARTS (vector_type);
2590
2591   if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
2592       && reduc_index != -1)
2593     {
2594       op_num = reduc_index;
2595       op = gimple_op (stmt, op_num + 1);
2596       /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
2597          we need either neutral operands or the original operands.  See
2598          get_initial_def_for_reduction() for details.  */
2599       switch (code)
2600         {
2601           case WIDEN_SUM_EXPR:
2602           case DOT_PROD_EXPR:
2603           case SAD_EXPR:
2604           case PLUS_EXPR:
2605           case MINUS_EXPR:
2606           case BIT_IOR_EXPR:
2607           case BIT_XOR_EXPR:
2608              if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2609                neutral_op = build_real (TREE_TYPE (op), dconst0);
2610              else
2611                neutral_op = build_int_cst (TREE_TYPE (op), 0);
2612
2613              break;
2614
2615           case MULT_EXPR:
2616              if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2617                neutral_op = build_real (TREE_TYPE (op), dconst1);
2618              else
2619                neutral_op = build_int_cst (TREE_TYPE (op), 1);
2620
2621              break;
2622
2623           case BIT_AND_EXPR:
2624             neutral_op = build_int_cst (TREE_TYPE (op), -1);
2625             break;
2626
2627           /* For MIN/MAX we don't have an easy neutral operand but
2628              the initial values can be used fine here.  Only for
2629              a reduction chain we have to force a neutral element.  */
2630           case MAX_EXPR:
2631           case MIN_EXPR:
2632             if (!GROUP_FIRST_ELEMENT (stmt_vinfo))
2633               neutral_op = NULL;
2634             else
2635               {
2636                 def_stmt = SSA_NAME_DEF_STMT (op);
2637                 loop = (gimple_bb (stmt))->loop_father;
2638                 neutral_op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2639                                                     loop_preheader_edge (loop));
2640               }
2641             break;
2642
2643           default:
2644             gcc_assert (!GROUP_FIRST_ELEMENT (stmt_vinfo));
2645             neutral_op = NULL;
2646         }
2647     }
2648
2649   if (STMT_VINFO_DATA_REF (stmt_vinfo))
2650     {
2651       is_store = true;
2652       op = gimple_assign_rhs1 (stmt);
2653     }
2654   else
2655     is_store = false;
2656
2657   gcc_assert (op);
2658
2659   if (CONSTANT_CLASS_P (op))
2660     constant_p = true;
2661   else
2662     constant_p = false;
2663
2664   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
2665      created vectors. It is greater than 1 if unrolling is performed.
2666
2667      For example, we have two scalar operands, s1 and s2 (e.g., group of
2668      strided accesses of size two), while NUNITS is four (i.e., four scalars
2669      of this type can be packed in a vector).  The output vector will contain
2670      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
2671      will be 2).
2672
2673      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
2674      containing the operands.
2675
2676      For example, NUNITS is four as before, and the group size is 8
2677      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
2678      {s5, s6, s7, s8}.  */
2679
2680   number_of_copies = nunits * number_of_vectors / group_size;
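       /* For the first example above this is 4 * 1 / 2 == 2 copies; for
          the second, 4 * 2 / 8 == 1.  */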
2681
2682   number_of_places_left_in_vector = nunits;
2683   elts = XALLOCAVEC (tree, nunits);
2684   bool place_after_defs = false;
2685   for (j = 0; j < number_of_copies; j++)
2686     {
2687       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
2688         {
2689           if (is_store)
2690             op = gimple_assign_rhs1 (stmt);
2691           else
2692             {
2693               switch (code)
2694                 {
2695                   case COND_EXPR:
2696                     if (op_num == 0 || op_num == 1)
2697                       {
2698                         tree cond = gimple_assign_rhs1 (stmt);
2699                         op = TREE_OPERAND (cond, op_num);
2700                       }
2701                     else
2702                       {
2703                         if (op_num == 2)
2704                           op = gimple_assign_rhs2 (stmt);
2705                         else
2706                           op = gimple_assign_rhs3 (stmt);
2707                       }
2708                     break;
2709
2710                   case CALL_EXPR:
2711                     op = gimple_call_arg (stmt, op_num);
2712                     break;
2713
2714                   case LSHIFT_EXPR:
2715                   case RSHIFT_EXPR:
2716                   case LROTATE_EXPR:
2717                   case RROTATE_EXPR:
2718                     op = gimple_op (stmt, op_num + 1);
2719                     /* Unlike the other binary operators, shifts/rotates have
2720                        the shift count being int, instead of the same type as
2721                        the lhs, so make sure the scalar is the right type if
2722                        we are dealing with vectors of
2723                        long long/long/short/char.  */
2724                     if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
2725                       op = fold_convert (TREE_TYPE (vector_type), op);
2726                     break;
2727
2728                   default:
2729                     op = gimple_op (stmt, op_num + 1);
2730                     break;
2731                 }
2732             }
2733
2734           if (reduc_index != -1)
2735             {
2736               loop = (gimple_bb (stmt))->loop_father;
2737               def_stmt = SSA_NAME_DEF_STMT (op);
2738
2739               gcc_assert (loop);
2740
2741               /* Get the def before the loop.  In reduction chain we have only
2742                  one initial value.  */
2743               if ((j != (number_of_copies - 1)
2744                    || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2745                        && i != 0))
2746                   && neutral_op)
2747                 op = neutral_op;
2748               else
2749                 op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2750                                             loop_preheader_edge (loop));
2751             }
2752
2753           /* Create 'vect_ = {op0,op1,...,opn}'.  */
2754           number_of_places_left_in_vector--;
2755           tree orig_op = op;
2756           if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
2757             {
2758               if (CONSTANT_CLASS_P (op))
2759                 {
2760                   op = fold_unary (VIEW_CONVERT_EXPR,
2761                                    TREE_TYPE (vector_type), op);
2762                   gcc_assert (op && CONSTANT_CLASS_P (op));
2763                 }
2764               else
2765                 {
2766                   tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
2767                   gimple *init_stmt;
2768                   op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type), op);
2769                   init_stmt
2770                     = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR, op);
2771                   gimple_seq_add_stmt (&ctor_seq, init_stmt);
2772                   op = new_temp;
2773                 }
2774             }
2775           elts[number_of_places_left_in_vector] = op;
2776           if (!CONSTANT_CLASS_P (op))
2777             constant_p = false;
2778           if (TREE_CODE (orig_op) == SSA_NAME
2779               && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
2780               && STMT_VINFO_BB_VINFO (stmt_vinfo)
2781               && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
2782                   == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
2783             place_after_defs = true;
2784
2785           if (number_of_places_left_in_vector == 0)
2786             {
2787               number_of_places_left_in_vector = nunits;
2788
2789               if (constant_p)
2790                 vec_cst = build_vector (vector_type, elts);
2791               else
2792                 {
2793                   vec<constructor_elt, va_gc> *v;
2794                   unsigned k;
2795                   vec_alloc (v, nunits);
2796                   for (k = 0; k < nunits; ++k)
2797                     CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
2798                   vec_cst = build_constructor (vector_type, v);
2799                 }
2800               tree init;
2801               gimple_stmt_iterator gsi;
2802               if (place_after_defs)
2803                 {
2804                   gsi = gsi_for_stmt
2805                           (vect_find_last_scalar_stmt_in_slp (slp_node));
2806                   init = vect_init_vector (stmt, vec_cst, vector_type, &gsi);
2807                 }
2808               else
2809                 init = vect_init_vector (stmt, vec_cst, vector_type, NULL);
2810               if (ctor_seq != NULL)
2811                 {
2812                   gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
2813                   gsi_insert_seq_before_without_update (&gsi, ctor_seq,
2814                                                         GSI_SAME_STMT);
2815                   ctor_seq = NULL;
2816                 }
2817               voprnds.quick_push (init);
2818               place_after_defs = false;
2819             }
2820         }
2821     }
2822
2823   /* Since the vectors are created in reverse order, we have to reverse
2824      them here.  */
2825   vec_num = voprnds.length ();
2826   for (j = vec_num; j != 0; j--)
2827     {
2828       vop = voprnds[j - 1];
2829       vec_oprnds->quick_push (vop);
2830     }
2831
2832   voprnds.release ();
2833
2834   /* If the VF is greater than the unrolling factor needed for the SLP
2835      group of stmts, the NUMBER_OF_VECTORS to create is greater than
2836      NUMBER_OF_SCALARS / NUNITS or NUNITS / NUMBER_OF_SCALARS, and hence
2837      we have to replicate the vectors.  */
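       /* E.g. if two vectors were created but NUMBER_OF_VECTORS is four,
          push the neutral vector twice, or re-push the two originals when
          there is no neutral element.  */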
2838   while (number_of_vectors > vec_oprnds->length ())
2839     {
2840       tree neutral_vec = NULL;
2841
2842       if (neutral_op)
2843         {
2844           if (!neutral_vec)
2845             neutral_vec = build_vector_from_val (vector_type, neutral_op);
2846
2847           vec_oprnds->quick_push (neutral_vec);
2848         }
2849       else
2850         {
2851           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
2852             vec_oprnds->quick_push (vop);
2853         }
2854     }
2855 }
2856
2857
2858 /* Get vectorized definitions from SLP_NODE that contains corresponding
2859    vectorized def-stmts.  */
2860
2861 static void
2862 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
2863 {
2864   tree vec_oprnd;
2865   gimple *vec_def_stmt;
2866   unsigned int i;
2867
2868   gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
2869
2870   FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
2871     {
2872       gcc_assert (vec_def_stmt);
2873       vec_oprnd = gimple_get_lhs (vec_def_stmt);
2874       vec_oprnds->quick_push (vec_oprnd);
2875     }
2876 }
2877
2878
2879 /* Get vectorized definitions for SLP_NODE.
2880    If the scalar definitions are loop invariants or constants, collect them and
2881    call vect_get_constant_vectors() to create vector stmts.
2882    Otherwise, the def-stmts must be already vectorized and the vectorized stmts
2883    must be stored in the corresponding child of SLP_NODE, and we call
2884    vect_get_slp_vect_defs () to retrieve them.  */
2885
2886 void
2887 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
2888                    vec<vec<tree> > *vec_oprnds, int reduc_index)
2889 {
2890   gimple *first_stmt;
2891   int number_of_vects = 0, i;
2892   unsigned int child_index = 0;
2893   HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
2894   slp_tree child = NULL;
2895   vec<tree> vec_defs;
2896   tree oprnd;
2897   bool vectorized_defs;
2898
2899   first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
2900   FOR_EACH_VEC_ELT (ops, i, oprnd)
2901     {
2902       /* For each operand we check if it has vectorized definitions in a child
2903          node or we need to create them (for invariants and constants).  We
2904          check if the LHS of the first stmt of the next child matches OPRND.
2905          If it does, we found the correct child.  Otherwise, we call
2906          vect_get_constant_vectors () and do not advance CHILD_INDEX so that
2907          this child node is checked again for the next operand.  */
2908       vectorized_defs = false;
2909       if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
2910         {
2911           child = SLP_TREE_CHILDREN (slp_node)[child_index];
2912
2913           /* We have to check both pattern and original def, if available.  */
2914           if (child)
2915             {
2916               gimple *first_def = SLP_TREE_SCALAR_STMTS (child)[0];
2917               gimple *related
2918                 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
2919
2920               if (operand_equal_p (oprnd, gimple_get_lhs (first_def), 0)
2921                   || (related
2922                       && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
2923                 {
2924                   /* The number of vector defs is determined by the number of
2925                      vector statements in the node from which we get those
2926                      statements.  */
2927                   number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
2928                   vectorized_defs = true;
2929                   child_index++;
2930                 }
2931             }
2932           else
2933             child_index++;
2934         }
2935
2936       if (!vectorized_defs)
2937         {
2938           if (i == 0)
2939             {
2940               number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
2941               /* The number of vector stmts was calculated according to the
2942                  LHS in vect_schedule_slp_instance (); fix it by rescaling for
2943                  the RHS element size, if necessary.  See
2944                  vect_get_smallest_scalar_type () for details.  */
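              /* E.g. (assumed stmt): for a truncation short = (short) i,
                 LHS_SIZE_UNIT == 2 and RHS_SIZE_UNIT == 4, so twice as
                 many input vectors are needed for the int operand.  */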
2945               vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
2946                                              &rhs_size_unit);
2947               if (rhs_size_unit != lhs_size_unit)
2948                 {
2949                   number_of_vects *= rhs_size_unit;
2950                   number_of_vects /= lhs_size_unit;
2951                 }
2952             }
2953         }
2954
2955       /* Allocate memory for vectorized defs.  */
2956       vec_defs = vNULL;
2957       vec_defs.create (number_of_vects);
2958
2959       /* For reduction defs we call vect_get_constant_vectors (), since we are
2960          looking for initial loop invariant values.  */
2961       if (vectorized_defs && reduc_index == -1)
2962         /* The defs are already vectorized.  */
2963         vect_get_slp_vect_defs (child, &vec_defs);
2964       else
2965         /* Build vectors from scalar defs.  */
2966         vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
2967                                    number_of_vects, reduc_index);
2968
2969       vec_oprnds->quick_push (vec_defs);
2970
2971       /* For reductions, we only need initial values.  */
2972       if (reduc_index != -1)
2973         return;
2974     }
2975 }
2976
2977
2978 /* Create NCOPIES permutation statements using the vector mask MASK and two
2979    input vectors placed in DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX
2980    for the first copy, shifting by STRIDE elements of DR_CHAIN for every
2981    subsequent copy.
2982    (STRIDE is the number of vectorized stmts for NODE divided by the number of
2983    copies.)
2984    VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE where
2985    the created stmts must be inserted.  */
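/* A worked example with assumed values: if NODE has 8 vector stmts and
   NCOPIES is 2, STRIDE is 4; copy number I reads DR_CHAIN slots
   FIRST_VEC_INDX + 4 * I and SECOND_VEC_INDX + 4 * I and stores the result
   at index 4 * I + VECT_STMTS_COUNTER of the node's vector stmts.  */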
2986
2987 static inline void
2988 vect_create_mask_and_perm (gimple *stmt,
2989                            tree mask, int first_vec_indx, int second_vec_indx,
2990                            gimple_stmt_iterator *gsi, slp_tree node,
2991                            tree vectype, vec<tree> dr_chain,
2992                            int ncopies, int vect_stmts_counter)
2993 {
2994   tree perm_dest;
2995   gimple *perm_stmt = NULL;
2996   int i, stride;
2997   tree first_vec, second_vec, data_ref;
2998
2999   stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
3000
3001   /* Pad the vector stmts array of NODE with NULLs so that the generated
3002      stmts can be stored at the proper indexes later.  */
3003   for (i = SLP_TREE_VEC_STMTS (node).length ();
3004        i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
3005     SLP_TREE_VEC_STMTS (node).quick_push (NULL);
3006
3007   perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
3008   for (i = 0; i < ncopies; i++)
3009     {
3010       first_vec = dr_chain[first_vec_indx];
3011       second_vec = dr_chain[second_vec_indx];
3012
3013       /* Generate the permute statement.  */
3014       perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
3015                                        first_vec, second_vec, mask);
3016       data_ref = make_ssa_name (perm_dest, perm_stmt);
3017       gimple_set_lhs (perm_stmt, data_ref);
3018       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
3019
3020       /* Store the vector statement in NODE.  */
3021       SLP_TREE_VEC_STMTS (node)[stride * i + vect_stmts_counter] = perm_stmt;
3022
3023       first_vec_indx += stride;
3024       second_vec_indx += stride;
3025     }
3026 }
3027
3028
3029 /* Given FIRST_MASK_ELEMENT - the mask element in element representation,
3030    return in CURRENT_MASK_ELEMENT its equivalent in target specific
3031    representation.  Check that the mask is valid and return FALSE if not.
3032    Return TRUE in NEED_NEXT_VECTOR if the permutation requires moving to
3033    the next vector, i.e., the current first vector is not needed.  */
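/* Numeric illustration with assumed values: for MASK_NUNITS == 4 and
   NUMBER_OF_MASK_FIXES == 2 (the first input vector was dropped once), an
   element-space value of 9 becomes 9 - 4 * (2 - 1) == 5 in target
   representation; cf. the {6,9,9,9} -> {2,5,5,5} example further below.  */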
3034
3035 static bool
3036 vect_get_mask_element (gimple *stmt, int first_mask_element, int m,
3037                        int mask_nunits, bool only_one_vec, int index,
3038                        unsigned char *mask, int *current_mask_element,
3039                        bool *need_next_vector, int *number_of_mask_fixes,
3040                        bool *mask_fixed, bool *needs_first_vector)
3041 {
3042   int i;
3043
3044   /* Convert to target specific representation.  */
3045   *current_mask_element = first_mask_element + m;
3046   /* Adjust the value in case it's a mask for the second and third vectors.  */
3047   *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1);
3048
3049   if (*current_mask_element < 0)
3050     {
3051       if (dump_enabled_p ())
3052         {
3053           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3054                            "permutation requires past vector ");
3055           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3056           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3057         }
3058       return false;
3059     }
3060
3061   if (*current_mask_element < mask_nunits)
3062     *needs_first_vector = true;
3063
3064   /* We have only one input vector to permute but the mask accesses values in
3065      the next vector as well.  */
3066   if (only_one_vec && *current_mask_element >= mask_nunits)
3067     {
3068       if (dump_enabled_p ())
3069         {
3070           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3071                            "permutation requires at least two vectors ");
3072           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3073           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3074         }
3075
3076       return false;
3077     }
3078
3079   /* The mask requires the next vector.  */
3080   while (*current_mask_element >= mask_nunits * 2)
3081     {
3082       if (*needs_first_vector || *mask_fixed)
3083         {
3084           /* We either need the first vector too or have already moved to the
3085              next vector. In both cases, this permutation needs three
3086              vectors.  */
3087           if (dump_enabled_p ())
3088             {
3089               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3090                                "permutation requires at "
3091                                "least three vectors ");
3092               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3093               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3094             }
3095
3096           return false;
3097         }
3098
3099       /* We move to the next vector, dropping the first one and working with
3100          the second and the third; we need to adjust the values of the mask
3101          accordingly.  */
3102       *current_mask_element -= mask_nunits * *number_of_mask_fixes;
3103
3104       for (i = 0; i < index; i++)
3105         mask[i] -= mask_nunits * *number_of_mask_fixes;
3106
3107       (*number_of_mask_fixes)++;
3108       *mask_fixed = true;
3109     }
3110
3111   *need_next_vector = *mask_fixed;
3112
3113   /* This was the last element of this mask. Start a new one.  */
3114   if (index == mask_nunits - 1)
3115     {
3116       *number_of_mask_fixes = 1;
3117       *mask_fixed = false;
3118       *needs_first_vector = false;
3119     }
3120
3121   return true;
3122 }
3123
3124
3125 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3126    If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3127    permute statements for the SLP node NODE of the SLP instance
3128    SLP_NODE_INSTANCE.  */
3129
3130 bool
3131 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
3132                               gimple_stmt_iterator *gsi, int vf,
3133                               slp_instance slp_node_instance, bool analyze_only)
3134 {
3135   gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3136   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3137   tree mask_element_type = NULL_TREE, mask_type;
3138   int i, j, k, nunits, vec_index = 0;
3139   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3140   int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
3141   int first_mask_element;
3142   int index, unroll_factor, current_mask_element, ncopies;
3143   unsigned char *mask;
3144   bool only_one_vec = false, need_next_vector = false;
3145   int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
3146   int number_of_mask_fixes = 1;
3147   bool mask_fixed = false;
3148   bool needs_first_vector = false;
3149   machine_mode mode;
3150
3151   if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3152     return false;
3153
3154   stmt_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info));
3155
3156   mode = TYPE_MODE (vectype);
3157
3158   if (!can_vec_perm_p (mode, false, NULL))
3159     {
3160       if (dump_enabled_p ())
3161         {
3162           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3163                            "no vect permute for ");
3164           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3165           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3166         }
3167       return false;
3168     }
3169
3170   /* The generic VEC_PERM_EXPR code always uses an integral type of the
3171      same size as the vector element being permuted.  */
3172   mask_element_type = lang_hooks.types.type_for_mode
3173                 (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))), 1);
3174   mask_type = get_vectype_for_scalar_type (mask_element_type);
3175   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3176   mask = XALLOCAVEC (unsigned char, nunits);
3177   unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
3178
3179   /* The number of vector stmts to generate based only on the
3180      SLP_NODE_INSTANCE unrolling factor.  */
3181   orig_vec_stmts_num
3182     = (STMT_VINFO_GROUP_SIZE (stmt_info)
3183        * SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance)
3184        + nunits - 1) / nunits;
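  /* E.g., with the values of the example below (interleaving group size 3,
     unrolling factor 4, nunits 4) this is (3 * 4 + 3) / 4 == 3 vectors.  */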
3185   if (orig_vec_stmts_num == 1)
3186     only_one_vec = true;
3187
3188   /* The number of copies is determined by the final vectorization factor
3189      relative to the SLP_NODE_INSTANCE unrolling factor.  */
3190   ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
3191
3192   /* Generate permutation masks for every NODE.  The number of masks for
3193      each NODE is equal to GROUP_SIZE.
3194      E.g., we have a group of three nodes with three loads from the same
3195      location in each node, and the vector size is 4.  I.e., we have an
3196      a0b0c0a1b1c1... sequence and we need to create the following vectors:
3197      for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3198      for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3199      ...
3200
3201      The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3202      The last mask is illegal since a permute operation takes only two input
3203      operands, so the mask element values cannot exceed 2 * nunits - 1.
3204      Hence, the last mask must be converted into {2,5,5,5}.
3205      For the first two permutations we need the first and the second input
3206      vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3207      we need the second and the third vectors: {b1,c1,a2,b2} and
3208      {c2,a3,b3,c3}.  */
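  /* Spelled out for the a's above: FIRST_MASK_ELEMENT is I + J * GROUP_SIZE,
     i.e. 0, 3, 6 and 9 for J = 0..3, each repeated for the three identical
     loads of the node, giving 0,0,0,3,3,3,6,6,6,9,9,9, which is then
     chopped into the nunits-sized masks {0,0,0,3} {3,3,6,6} {6,9,9,9}.  */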
3209
3210   {
3211       index = 0;
3212       vect_stmts_counter = 0;
3213       vec_index = 0;
3214       first_vec_index = vec_index++;
3215       if (only_one_vec)
3216         second_vec_index = first_vec_index;
3217       else
3218         second_vec_index = vec_index++;
3219
3220       for (j = 0; j < unroll_factor; j++)
3221         {
3222           for (k = 0; k < group_size; k++)
3223             {
3224               i = SLP_TREE_LOAD_PERMUTATION (node)[k];
3225               first_mask_element = i + j * STMT_VINFO_GROUP_SIZE (stmt_info);
3226               if (!vect_get_mask_element (stmt, first_mask_element, 0,
3227                                           nunits, only_one_vec, index,
3228                                           mask, &current_mask_element,
3229                                           &need_next_vector,
3230                                           &number_of_mask_fixes, &mask_fixed,
3231                                           &needs_first_vector))
3232                 return false;
3233               gcc_assert (current_mask_element >= 0
3234                           && current_mask_element < 2 * nunits);
3235               mask[index++] = current_mask_element;
3236
3237               if (index == nunits)
3238                 {
3239                   index = 0;
3240                   if (!can_vec_perm_p (mode, false, mask))
3241                     {
3242                       if (dump_enabled_p ())
3243                         {
3244                           dump_printf_loc (MSG_MISSED_OPTIMIZATION,
3245                                            vect_location,
3246                                            "unsupported vect permute { ");
3247                           for (i = 0; i < nunits; ++i)
3248                             dump_printf (MSG_MISSED_OPTIMIZATION, "%d ",
3249                                          mask[i]);
3250                           dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
3251                         }
3252                       return false;
3253                     }
3254
3255                   if (!analyze_only)
3256                     {
3257                       int l;
3258                       tree mask_vec, *mask_elts;
3259                       mask_elts = XALLOCAVEC (tree, nunits);
3260                       for (l = 0; l < nunits; ++l)
3261                         mask_elts[l] = build_int_cst (mask_element_type,
3262                                                       mask[l]);
3263                       mask_vec = build_vector (mask_type, mask_elts);
3264
3265                       if (need_next_vector)
3266                         {
3267                           first_vec_index = second_vec_index;
3268                           second_vec_index = vec_index;
3269                         }
3270
3271                       vect_create_mask_and_perm (stmt,
3272                                mask_vec, first_vec_index, second_vec_index,
3273                                gsi, node, vectype, dr_chain,
3274                                ncopies, vect_stmts_counter++);
3275                     }
3276                 }
3277             }
3278         }
3279     }
3280
3281   return true;
3282 }
3283
3284
3285
3286 /* Vectorize SLP instance tree in postorder.  */
3287
3288 static bool
3289 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
3290                             unsigned int vectorization_factor)
3291 {
3292   gimple *stmt;
3293   bool grouped_store, is_store;
3294   gimple_stmt_iterator si;
3295   stmt_vec_info stmt_info;
3296   unsigned int vec_stmts_size, nunits, group_size;
3297   tree vectype;
3298   int i;
3299   slp_tree child;
3300
3301   if (!node)
3302     return false;
3303
3304   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3305     vect_schedule_slp_instance (child, instance, vectorization_factor);
3306
3307   stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3308   stmt_info = vinfo_for_stmt (stmt);
3309
3310   /* VECTYPE is the type of the destination.  */
3311   vectype = STMT_VINFO_VECTYPE (stmt_info);
3312   nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
3313   group_size = SLP_INSTANCE_GROUP_SIZE (instance);
3314
3315   /* For each SLP instance calculate the number of vector stmts to be created
3316      for the scalar stmts in each node of the SLP tree.  The number of vector
3317      stmts is the number of scalar elements in one scalar iteration
3318      (GROUP_SIZE) multiplied by VF and divided by the number of elements in
3319      a vector (NUNITS).
3320      Unless this is an SLP reduction, in which case the number of vector
3321      stmts is equal to the number of vector stmts of the children.  */
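  /* E.g. (assumed values): with VF 4, GROUP_SIZE 4 and V8HI vectors
     (NUNITS 8), each node gets (4 * 4) / 8 == 2 vector stmts.  */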
3322   if (GROUP_FIRST_ELEMENT (stmt_info)
3323       && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
3324     vec_stmts_size = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[0]);
3325   else
3326     vec_stmts_size = (vectorization_factor * group_size) / nunits;
3327
3328   if (!SLP_TREE_VEC_STMTS (node).exists ())
3329     {
3330       SLP_TREE_VEC_STMTS (node).create (vec_stmts_size);
3331       SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
3332     }
3333
3334   if (dump_enabled_p ())
3335     {
3336       dump_printf_loc (MSG_NOTE, vect_location,
3337                        "------>vectorizing SLP node starting from: ");
3338       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3339       dump_printf (MSG_NOTE, "\n");
3340     }
3341
3342   /* Vectorized stmts go before the last scalar stmt, which is where
3343      all uses are ready.  */
3344   si = gsi_for_stmt (vect_find_last_scalar_stmt_in_slp (node));
3345
3346   /* Mark the first element of the reduction chain as reduction to properly
3347      transform the node.  In the analysis phase only the last element of the
3348      chain is marked as reduction.  */
3349   if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
3350       && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
3351     {
3352       STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
3353       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3354     }
3355
3356   /* Handle two-operation SLP nodes by vectorizing the group with
3357      both operations and then performing a merge.  */
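  /* For illustration (a hypothetical group): for the lane operations
     {+,-,+,-} on V4SI, code0 == PLUS_EXPR, ocode == MINUS_EXPR, and the
     merge mask built below is {0,5,2,7}, selecting even lanes from the
     PLUS result (operand 0 of the VEC_PERM_EXPR) and odd lanes from the
     MINUS result.  */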
3358   if (SLP_TREE_TWO_OPERATORS (node))
3359     {
3360       enum tree_code code0 = gimple_assign_rhs_code (stmt);
3361       enum tree_code ocode;
3362       gimple *ostmt;
3363       unsigned char *mask = XALLOCAVEC (unsigned char, group_size);
3364       bool allsame = true;
3365       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt)
3366         if (gimple_assign_rhs_code (ostmt) != code0)
3367           {
3368             mask[i] = 1;
3369             allsame = false;
3370             ocode = gimple_assign_rhs_code (ostmt);
3371           }
3372         else
3373           mask[i] = 0;
3374       if (!allsame)
3375         {
3376           vec<gimple *> v0;
3377           vec<gimple *> v1;
3378           unsigned j;
3379           tree tmask = NULL_TREE;
3380           vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3381           v0 = SLP_TREE_VEC_STMTS (node).copy ();
3382           SLP_TREE_VEC_STMTS (node).truncate (0);
3383           gimple_assign_set_rhs_code (stmt, ocode);
3384           vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3385           gimple_assign_set_rhs_code (stmt, code0);
3386           v1 = SLP_TREE_VEC_STMTS (node).copy ();
3387           SLP_TREE_VEC_STMTS (node).truncate (0);
3388           tree meltype = build_nonstandard_integer_type
3389               (GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (vectype))), 1);
3390           tree mvectype = get_same_sized_vectype (meltype, vectype);
3391           unsigned k = 0, l;
3392           for (j = 0; j < v0.length (); ++j)
3393             {
3394               tree *melts = XALLOCAVEC (tree, TYPE_VECTOR_SUBPARTS (vectype));
3395               for (l = 0; l < TYPE_VECTOR_SUBPARTS (vectype); ++l)
3396                 {
3397                   if (k >= group_size)
3398                     k = 0;
3399                   melts[l] = build_int_cst
3400                       (meltype, mask[k++] * TYPE_VECTOR_SUBPARTS (vectype) + l);
3401                 }
3402               tmask = build_vector (mvectype, melts);
3403
3404               /* ???  Not all targets support a VEC_PERM_EXPR with a
3405                  constant mask that would translate to a vec_merge RTX
3406                  (with their vec_perm_const_ok).  We can either not
3407                  vectorize in that case or let veclower do its job.
3408                  Unfortunately that isn't too great and at least for
3409                  plus/minus we'd eventually like to match targets
3410                  vector addsub instructions.  */
3411               gimple *vstmt;
3412               vstmt = gimple_build_assign (make_ssa_name (vectype),
3413                                            VEC_PERM_EXPR,
3414                                            gimple_assign_lhs (v0[j]),
3415                                            gimple_assign_lhs (v1[j]), tmask);
3416               vect_finish_stmt_generation (stmt, vstmt, &si);
3417               SLP_TREE_VEC_STMTS (node).quick_push (vstmt);
3418             }
3419           v0.release ();
3420           v1.release ();
3421           return false;
3422         }
3423     }
3424   is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3425   return is_store;
3426 }
3427
3428 /* Replace the scalar calls in SLP node NODE by setting their lhs to zero.
3429    For loop vectorization this is done in vectorizable_call, but for SLP
3430    it needs to be deferred until end of vect_schedule_slp, because multiple
3431    SLP instances may refer to the same scalar stmt.  */
3432
3433 static void
3434 vect_remove_slp_scalar_calls (slp_tree node)
3435 {
3436   gimple *stmt, *new_stmt;
3437   gimple_stmt_iterator gsi;
3438   int i;
3439   slp_tree child;
3440   tree lhs;
3441   stmt_vec_info stmt_info;
3442
3443   if (!node)
3444     return;
3445
3446   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3447     vect_remove_slp_scalar_calls (child);
3448
3449   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
3450     {
3451       if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
3452         continue;
3453       stmt_info = vinfo_for_stmt (stmt);
3454       if (stmt_info == NULL
3455           || is_pattern_stmt_p (stmt_info)
3456           || !PURE_SLP_STMT (stmt_info))
3457         continue;
3458       lhs = gimple_call_lhs (stmt);
3459       new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
3460       set_vinfo_for_stmt (new_stmt, stmt_info);
3461       set_vinfo_for_stmt (stmt, NULL);
3462       STMT_VINFO_STMT (stmt_info) = new_stmt;
3463       gsi = gsi_for_stmt (stmt);
3464       gsi_replace (&gsi, new_stmt, false);
3465       SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
3466     }
3467 }
3468
3469 /* Generate vector code for all SLP instances in the loop/basic block.  */
3470
3471 bool
3472 vect_schedule_slp (vec_info *vinfo)
3473 {
3474   vec<slp_instance> slp_instances;
3475   slp_instance instance;
3476   unsigned int i, vf;
3477   bool is_store = false;
3478
3479   slp_instances = vinfo->slp_instances;
3480   if (is_a <loop_vec_info> (vinfo))
3481     vf = as_a <loop_vec_info> (vinfo)->vectorization_factor;
3482   else
3483     vf = 1;
3484
3485   FOR_EACH_VEC_ELT (slp_instances, i, instance)
3486     {
3487       /* Schedule the tree of INSTANCE.  */
3488       is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
3489                                              instance, vf);
3490       if (dump_enabled_p ())
3491         dump_printf_loc (MSG_NOTE, vect_location,
3492                          "vectorizing stmts using SLP.\n");
3493     }
3494
3495   FOR_EACH_VEC_ELT (slp_instances, i, instance)
3496     {
3497       slp_tree root = SLP_INSTANCE_TREE (instance);
3498       gimple *store;
3499       unsigned int j;
3500       gimple_stmt_iterator gsi;
3501
3502       /* Remove scalar call stmts.  Do not do this for basic-block
3503          vectorization as not all uses may be vectorized.
3504          ???  Why should this be necessary?  DCE should be able to
3505          remove the stmts itself.
3506          ???  For BB vectorization we can as well remove scalar
3507          stmts starting from the SLP tree root if they have no
3508          uses.  */
3509       if (is_a <loop_vec_info> (vinfo))
3510         vect_remove_slp_scalar_calls (root);
3511
3512       for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
3513                   && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
3514         {
3515           if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
3516             break;
3517
3518           if (is_pattern_stmt_p (vinfo_for_stmt (store)))
3519             store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
3520           /* Free the attached stmt_vec_info and remove the stmt.  */
3521           gsi = gsi_for_stmt (store);
3522           unlink_stmt_vdef (store);
3523           gsi_remove (&gsi, true);
3524           release_defs (store);
3525           free_stmt_vec_info (store);
3526         }
3527     }
3528
3529   return is_store;
3530 }
3531
3532
3533 /* Vectorize the basic block.  */
3534
3535 void
3536 vect_slp_transform_bb (basic_block bb)
3537 {
3538   bb_vec_info bb_vinfo = vec_info_for_bb (bb);
3539   gimple_stmt_iterator si;
3540
3541   gcc_assert (bb_vinfo);
3542
3543   if (dump_enabled_p ())
3544     dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB\n");
3545
3546   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
3547     {
3548       gimple *stmt = gsi_stmt (si);
3549       stmt_vec_info stmt_info;
3550
3551       if (dump_enabled_p ())
3552         {
3553           dump_printf_loc (MSG_NOTE, vect_location,
3554                            "------>SLPing statement: ");
3555           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3556           dump_printf (MSG_NOTE, "\n");
3557         }
3558
3559       stmt_info = vinfo_for_stmt (stmt);
3560       gcc_assert (stmt_info);
3561
3562       /* Schedule all the SLP instances when the first SLP stmt is reached.  */
3563       if (STMT_SLP_TYPE (stmt_info))
3564         {
3565           vect_schedule_slp (bb_vinfo);
3566           break;
3567         }
3568     }
3569
3570   if (dump_enabled_p ())
3571     dump_printf_loc (MSG_NOTE, vect_location,
3572                      "BASIC BLOCK VECTORIZED\n");
3573
3574   destroy_bb_vec_info (bb_vinfo);
3575 }