/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2014 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "dumpfile.h"
#include "tm.h"
#include "tree.h"
#include "stor-layout.h"
#include "target.h"
#include "predict.h"
#include "vec.h"
#include "hashtab.h"
#include "hash-set.h"
#include "machmode.h"
#include "hard-reg-set.h"
#include "input.h"
#include "function.h"
#include "basic-block.h"
#include "gimple-pretty-print.h"
#include "tree-ssa-alias.h"
#include "internal-fn.h"
#include "gimple-expr.h"
#include "is-a.h"
#include "gimple.h"
#include "gimple-iterator.h"
#include "gimple-ssa.h"
#include "tree-phinodes.h"
#include "ssa-iterators.h"
#include "stringpool.h"
#include "tree-ssanames.h"
#include "tree-pass.h"
#include "cfgloop.h"
#include "expr.h"
#include "recog.h"              /* FIXME: for insn_data */
#include "insn-codes.h"
#include "optabs.h"
#include "tree-vectorizer.h"
#include "langhooks.h"

/* Extract the location of the basic block in the source code.
   Return the basic block location if found and UNKNOWN_LOCATION if not.  */

source_location
find_bb_location (basic_block bb)
{
  gimple stmt = NULL;
  gimple_stmt_iterator si;

  if (!bb)
    return UNKNOWN_LOCATION;

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      stmt = gsi_stmt (si);
      if (gimple_location (stmt) != UNKNOWN_LOCATION)
        return gimple_location (stmt);
    }

  return UNKNOWN_LOCATION;
}


/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

static void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child);

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}


/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  SLP_INSTANCE_BODY_COST_VEC (instance).release ();
  free (instance);
}


/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<gimple> scalar_stmts)
{
  slp_tree node;
  gimple stmt = scalar_stmts[0];
  unsigned int nops;

  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
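      /* The embedded comparison of a COND_EXPR is analyzed as two
         separate SLP operands, hence one extra operand is counted
         below.  */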
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;

  return node;
}


/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->first_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}


/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}


/* Find the place of the data-ref in STMT in the interleaving chain that starts
   from FIRST_STMT.  Return -1 if the data-ref is not a part of the chain.  */
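/* For example, for the interleaving chain S0 -> S1 -> S2 linked through
   GROUP_NEXT_ELEMENT, the place of S2 is 2; any STMT whose
   GROUP_FIRST_ELEMENT is not S0 yields -1.  */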

static int
vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
{
  gimple next_stmt = first_stmt;
  int result = 0;

  if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    return -1;

  do
    {
      if (next_stmt == stmt)
        return result;
      result++;
      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
    }
  while (next_stmt);

  return -1;
}


/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  If there was a fatal error
   return -1, if the error could be corrected by swapping operands of the
   operation return 1, if everything is ok return 0.  */

static int
vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                             gimple stmt, bool first,
                             vec<slp_oprnd_info> *oprnds_info)
{
  tree oprnd;
  unsigned int i, number_of_oprnds;
  tree def;
  gimple def_stmt;
  enum vect_def_type dt = vect_uninitialized_def;
  struct loop *loop = NULL;
  bool pattern = false;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  bool commutative = false;
  bool first_op_cond = false;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  if (is_gimple_call (stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
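      /* Operands 0, 1 and 2 of a GIMPLE_CALL are the LHS, the called
         function and the static chain; the actual arguments start at
         operand 3.  */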
      first_op_idx = 3;
    }
  else if (is_gimple_assign (stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        {
          first_op_cond = true;
          commutative = true;
          number_of_oprnds++;
        }
      else
        commutative = commutative_tree_code (code);
    }
  else
    return -1;

  bool swapped = false;
  for (i = 0; i < number_of_oprnds; i++)
    {
again:
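      /* If a swap was recorded, operands 0 and 1 trade places; !i
         inverts the index for the first two operands only.  */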
      if (first_op_cond)
        {
          if (i == 0 || i == 1)
            oprnd = TREE_OPERAND (gimple_op (stmt, first_op_idx),
                                  swapped ? !i : i);
          else
            oprnd = gimple_op (stmt, first_op_idx + i - 1);
        }
      else
        oprnd = gimple_op (stmt, first_op_idx + (swapped ? !i : i));

      oprnd_info = (*oprnds_info)[i];

      if (!vect_is_simple_use (oprnd, NULL, loop_vinfo, bb_vinfo, &def_stmt,
                               &def, &dt)
          || (!def_stmt && dt != vect_constant_def))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: can't find def for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }

      /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
         from the pattern.  Check that all the stmts of the node are in the
         pattern.  */
      if (def_stmt && gimple_bb (def_stmt)
          && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
              || (!loop && gimple_bb (def_stmt) == BB_VINFO_BB (bb_vinfo)
                  && gimple_code (def_stmt) != GIMPLE_PHI))
          && vinfo_for_stmt (def_stmt)
          && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
        {
          pattern = true;
          if (!first && !oprnd_info->first_pattern)
            {
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: some of the stmts"
                                   " are in a pattern, and others are not ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }

              return 1;
            }

          def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
          dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));

          if (dt == vect_unknown_def_type)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Unsupported pattern.\n");
              return -1;
            }

          switch (gimple_code (def_stmt))
            {
              case GIMPLE_PHI:
                def = gimple_phi_result (def_stmt);
                break;

              case GIMPLE_ASSIGN:
                def = gimple_assign_lhs (def_stmt);
                break;

              default:
                if (dump_enabled_p ())
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "unsupported defining stmt:\n");
                return -1;
            }
        }

      if (first)
        {
          oprnd_info->first_dt = dt;
          oprnd_info->first_pattern = pattern;
          oprnd_info->first_op_type = TREE_TYPE (oprnd);
        }
      else
        {
          /* Not first stmt of the group, check that the def-stmt/s match
             the def-stmt/s of the first stmt.  Allow different definition
             types for reduction chains: the first stmt must be a
             vect_reduction_def (a phi node), and the rest
             vect_internal_def.  */
          if (((oprnd_info->first_dt != dt
                && !(oprnd_info->first_dt == vect_reduction_def
                     && dt == vect_internal_def)
                && !((oprnd_info->first_dt == vect_external_def
                      || oprnd_info->first_dt == vect_constant_def)
                     && (dt == vect_external_def
                         || dt == vect_constant_def)))
               || !types_compatible_p (oprnd_info->first_op_type,
                                       TREE_TYPE (oprnd))))
            {
              /* Try swapping operands if we got a mismatch.  */
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types\n");

              return 1;
            }
        }

      /* Check the types of the definitions.  */
      switch (dt)
        {
        case vect_constant_def:
        case vect_external_def:
        case vect_reduction_def:
          break;

        case vect_internal_def:
          oprnd_info->def_stmts.quick_push (def_stmt);
          break;

        default:
          /* FORNOW: Not supported.  */
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: illegal type of def ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (first_op_cond)
        {
          tree cond = gimple_assign_rhs1 (stmt);
          swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
                             &TREE_OPERAND (cond, 1));
          TREE_SET_CODE (cond, swap_tree_comparison (TREE_CODE (cond)));
        }
      else
        swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
                           gimple_assign_rhs2_ptr (stmt));
    }

  return 0;
}

/* Verify that the scalar stmts STMTS are isomorphic, do not require
   data permutation and are not of unsupported operation types.
   Return true if so; otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0]
   is false then this indicates that the comparison could not be
   carried out or the stmts will never be vectorized by SLP.  */

static bool
vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                       vec<gimple> stmts, unsigned int group_size,
                       unsigned nops, unsigned int *max_nunits,
                       unsigned int vectorization_factor, bool *matches)
{
  unsigned int i;
  gimple stmt = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK, rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype, scalar_type, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  struct data_reference *first_dr;
  HOST_WIDE_INT dummy;
  gimple first_load = NULL, prev_first_load = NULL, old_first_load = NULL;
  tree cond;

  /* For every stmt in NODE find its def stmt/s.  */
  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      matches[i] = false;

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
          dump_printf (MSG_NOTE, "\n");
        }

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unvectorizable statement ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: not GIMPLE_ASSIGN nor "
                               "GIMPLE_CALL ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      if (is_gimple_assign (stmt)
          && gimple_assign_rhs_code (stmt) == COND_EXPR
          && (cond = gimple_assign_rhs1 (stmt))
          && !COMPARISON_CLASS_P (cond))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: condition is not "
                               "comparison ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
      vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vectype)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unsupported data-type ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 scalar_type);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      /* In case of multiple types we need to detect the smallest type.  */
      if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
        {
          *max_nunits = TYPE_VECTOR_SUBPARTS (vectype);
          if (bb_vinfo)
            vectorization_factor = *max_nunits;
        }

      if (is_gimple_call (stmt))
        {
          rhs_code = CALL_EXPR;
          if (gimple_call_internal_p (stmt)
              || gimple_call_tail_p (stmt)
              || gimple_call_noreturn_p (stmt)
              || !gimple_call_nothrow_p (stmt)
              || gimple_call_chain (stmt))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: unsupported call type ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }
        }
      else
        rhs_code = gimple_assign_rhs_code (stmt);

      /* Check the operation.  */
      if (i == 0)
        {
          first_stmt_code = rhs_code;

          /* Shift arguments should be equal in all the packed stmts for a
             vector shift with scalar shift operand.  */
          if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
              || rhs_code == LROTATE_EXPR
              || rhs_code == RROTATE_EXPR)
            {
              vec_mode = TYPE_MODE (vectype);

              /* First see if we have a vector/vector shift.  */
              optab = optab_for_tree_code (rhs_code, vectype,
                                           optab_vector);

              if (!optab
                  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
                {
                  /* No vector/vector shift, try for a vector/scalar shift.  */
                  optab = optab_for_tree_code (rhs_code, vectype,
                                               optab_scalar);

                  if (!optab)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: no optab.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  icode = (int) optab_handler (optab, vec_mode);
                  if (icode == CODE_FOR_nothing)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: "
                                         "op not supported by target.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  optab_op2_mode = insn_data[icode].operand[2].mode;
                  if (!VECTOR_MODE_P (optab_op2_mode))
                    {
                      need_same_oprnds = true;
                      first_op1 = gimple_assign_rhs2 (stmt);
                    }
                }
            }
          else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
        }
      else
        {
          if (first_stmt_code != rhs_code
              && (first_stmt_code != IMAGPART_EXPR
                  || rhs_code != REALPART_EXPR)
              && (first_stmt_code != REALPART_EXPR
                  || rhs_code != IMAGPART_EXPR)
              && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
                   && (first_stmt_code == ARRAY_REF
                       || first_stmt_code == BIT_FIELD_REF
                       || first_stmt_code == INDIRECT_REF
                       || first_stmt_code == COMPONENT_REF
                       || first_stmt_code == MEM_REF)))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different operation "
                                   "in stmt ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Mismatch.  */
              continue;
            }

          if (need_same_oprnds
              && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different shift "
                                   "arguments in ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Mismatch.  */
              continue;
            }

          if (rhs_code == CALL_EXPR)
            {
              gimple first_stmt = stmts[0];
              if (gimple_call_num_args (stmt) != nops
                  || !operand_equal_p (gimple_call_fn (first_stmt),
                                       gimple_call_fn (stmt), 0)
                  || gimple_call_fntype (first_stmt)
                     != gimple_call_fntype (stmt))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different calls in ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
        {
          if (REFERENCE_CLASS_P (lhs))
            {
              /* Store.  */
              ;
            }
          else
            {
              /* Load.  */
              unsigned unrolling_factor
                = least_common_multiple
                    (*max_nunits, group_size) / group_size;
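              /* E.g. a group of three loads with a vector of four lanes
                 gives least_common_multiple 12, i.e. an unrolling
                 factor of four.  */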
              /* FORNOW: Check that there is no gap between the loads
                 and no gap between the groups when we need to load
                 multiple groups at once.
                 ???  We should enhance this to only disallow gaps
                 inside vectors.  */
              if ((unrolling_factor > 1
                   && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
                   && GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
                  || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt
                      && GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: grouped "
                                       "loads have gaps ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Fatal mismatch.  */
                  matches[0] = false;
                  return false;
                }

              /* Check that the size of the interleaved load group is not
                 greater than the SLP group size.  */
              unsigned ncopies
                = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
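              /* E.g. a vectorization factor of eight with four lanes
                 per vector gives NCOPIES two.  */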
              if (loop_vinfo
                  && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
                  && ((GROUP_SIZE (vinfo_for_stmt (stmt))
                       - GROUP_GAP (vinfo_for_stmt (stmt)))
                      > ncopies * group_size))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: the number "
                                       "of interleaved loads is greater than "
                                       "the SLP group size ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Fatal mismatch.  */
                  matches[0] = false;
                  return false;
                }

              old_first_load = first_load;
              first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  */
                  if (prev_first_load != first_load)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "Build SLP failed: different "
                                           "interleaving chains in one node ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            stmt, 0);
                          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                        }
                      /* Mismatch.  */
                      continue;
                    }
                }
              else
                prev_first_load = first_load;

              /* In some cases a group of loads is just the same load
                 repeated N times.  Only analyze its cost once.  */
              if (first_load == stmt && old_first_load != first_load)
                {
                  first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
                  if (vect_supportable_dr_alignment (first_dr, false)
                      == dr_unaligned_unsupported)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "Build SLP failed: unsupported "
                                           "unaligned load ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            stmt, 0);
                          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                        }
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                }
            }
        } /* Grouped access.  */
      else
        {
          if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
            {
              /* Not grouped load.  */
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: not grouped load ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }

              /* FORNOW: Not grouped loads are not supported.  */
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          /* Not memory operation.  */
          if (TREE_CODE_CLASS (rhs_code) != tcc_binary
              && TREE_CODE_CLASS (rhs_code) != tcc_unary
              && rhs_code != COND_EXPR
              && rhs_code != CALL_EXPR)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: operation");
                  dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);

              if (i == 0)
                first_cond_code = TREE_CODE (cond_expr);
              else if (first_cond_code != TREE_CODE (cond_expr))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different"
                                       " operation");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  return true;
}

/* Recursively build an SLP tree starting from NODE.
   Fail (and return FALSE) if def-stmts are not isomorphic, require
   data permutation or are of unsupported types of operation.
   Otherwise, return TRUE.  */

static bool
vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                     slp_tree *node, unsigned int group_size,
                     unsigned int *max_nunits,
                     vec<slp_tree> *loads,
                     unsigned int vectorization_factor,
                     bool *matches, unsigned *npermutes, unsigned *tree_size,
                     unsigned max_tree_size)
{
  unsigned nops, i, this_npermutes = 0, this_tree_size = 0;
  gimple stmt;

  if (!matches)
    matches = XALLOCAVEC (bool, group_size);
  if (!npermutes)
    npermutes = &this_npermutes;

  matches[0] = false;

  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];
  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else
    return false;

  if (!vect_build_slp_tree_1 (loop_vinfo, bb_vinfo,
                              SLP_TREE_SCALAR_STMTS (*node), group_size, nops,
                              max_nunits, vectorization_factor, matches))
    return false;

  /* If the SLP node is a load, terminate the recursion.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
      && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
    {
      loads->safe_push (*node);
      return true;
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (*node), i, stmt)
    {
      switch (vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo,
                                           stmt, (i == 0), &oprnds_info))
        {
        case 0:
          break;
        case -1:
          matches[0] = false;
          vect_free_oprnd_info (oprnds_info);
          return false;
        case 1:
          matches[i] = false;
          break;
        }
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
        vect_free_oprnd_info (oprnds_info);
        return false;
      }

  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_nloads = loads->length ();
      unsigned old_max_nunits = *max_nunits;

      if (oprnd_info->first_dt != vect_internal_def)
        continue;

      if (++this_tree_size > max_tree_size)
        {
          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      child = vect_create_new_slp_node (oprnd_info->def_stmts);
      if (!child)
        {
          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      bool *matches = XALLOCAVEC (bool, group_size);
      if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
                               group_size, max_nunits, loads,
                               vectorization_factor, matches,
                               npermutes, &this_tree_size, max_tree_size))
        {
          oprnd_info->def_stmts = vNULL;
          SLP_TREE_CHILDREN (*node).quick_push (child);
          continue;
        }

      /* If the SLP build for operand zero failed and operand zero
         and one can be commuted try that for the scalar stmts
         that failed the match.  */
      if (i == 0
          /* A first scalar stmt mismatch signals a fatal mismatch.  */
          && matches[0]
          /* ???  For COND_EXPRs we can swap the comparison operands
             as well as the arms under some constraints.  */
          && nops == 2
          && oprnds_info[1]->first_dt == vect_internal_def
          && is_gimple_assign (stmt)
          && commutative_tree_code (gimple_assign_rhs_code (stmt))
          /* Do so only if the number of unsuccessful permutes was not
             more than a cut-off as re-trying the recursive match on
             possibly each level of the tree would expose exponential
             behavior.  */
          && *npermutes < 4)
        {
          /* Roll back.  */
          *max_nunits = old_max_nunits;
          loads->truncate (old_nloads);
          /* Swap mismatched definition stmts.  */
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Re-trying with swapped operands of stmts ");
          for (unsigned j = 0; j < group_size; ++j)
            if (!matches[j])
              {
                gimple tem = oprnds_info[0]->def_stmts[j];
                oprnds_info[0]->def_stmts[j] = oprnds_info[1]->def_stmts[j];
                oprnds_info[1]->def_stmts[j] = tem;
                dump_printf (MSG_NOTE, "%d ", j);
              }
          dump_printf (MSG_NOTE, "\n");
          /* And try again ... */
          if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
                                   group_size, max_nunits, loads,
                                   vectorization_factor,
                                   matches, npermutes, &this_tree_size,
                                   max_tree_size))
            {
              oprnd_info->def_stmts = vNULL;
              SLP_TREE_CHILDREN (*node).quick_push (child);
              continue;
            }

          ++*npermutes;
        }

      oprnd_info->def_stmts = vNULL;
      vect_free_slp_tree (child);
      vect_free_oprnd_info (oprnds_info);
      return false;
    }

  if (tree_size)
    *tree_size += this_tree_size;

  vect_free_oprnd_info (oprnds_info);
  return true;
}

/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (int dump_kind, slp_tree node)
{
  int i;
  gimple stmt;
  slp_tree child;

  if (!node)
    return;

  dump_printf (dump_kind, "node ");
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      dump_printf (dump_kind, "\n\tstmt %d ", i);
      dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
    }
  dump_printf (dump_kind, "\n");

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, child);
}


/* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
   If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
   J).  Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
   stmts in NODE are to be marked.  */

static void
vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
{
  int i;
  gimple stmt;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    if (j < 0 || i == j)
      STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, mark, j);
}


/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  int i;
  gimple stmt;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
                  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child);
}


/* Rearrange the statements of NODE according to PERMUTATION.  */
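/* Statement I is moved to slot PERMUTATION[I]; e.g. the permutation
   { 1, 2, 0 } turns { a, b, c } into { c, a, b }.  */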

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
                          vec<unsigned> permutation)
{
  gimple stmt;
  vec<gimple> tmp_stmts;
  unsigned int i;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation);

  gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
  tmp_stmts.create (group_size);
  tmp_stmts.quick_grow_cleared (group_size);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    tmp_stmts[permutation[i]] = stmt;

  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
}


/* Check if the required load permutations in the SLP instance
   SLP_INSTN are supported.  */

static bool
vect_supported_load_permutation_p (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j, k, next;
  sbitmap load_index;
  slp_tree node;
  gimple stmt, load, next_load, first_load;
  struct data_reference *dr;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        if (node->load_permutation.exists ())
          FOR_EACH_VEC_ELT (node->load_permutation, j, next)
            dump_printf (MSG_NOTE, "%d ", next);
        else
          for (k = 0; k < group_size; ++k)
            dump_printf (MSG_NOTE, "%d ", k);
      dump_printf (MSG_NOTE, "\n");
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this
     permutation).  */

  /* Check that all the load nodes are of the same size.  */
  /* ???  Can't we assert this? */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
      return false;

  node = SLP_INSTANCE_TREE (slp_instn);
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];

  /* Reduction (there are no data-refs in the root).
     In reduction chain the order of the loads is important.  */
  if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
      && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      slp_tree load;
      unsigned int lidx;

      /* Compare all the permutation sequences to the first one.  We know
         that at least one load is permuted.  */
      node = SLP_INSTANCE_LOADS (slp_instn)[0];
      if (!node->load_permutation.exists ())
        return false;
      for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
        {
          if (!load->load_permutation.exists ())
            return false;
          FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
            if (lidx != node->load_permutation[j])
              return false;
        }

      /* Check that the loads in the first sequence are different and there
         are no gaps between them.  */
      load_index = sbitmap_alloc (group_size);
      bitmap_clear (load_index);
      FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
        {
          if (bitmap_bit_p (load_index, lidx))
            {
              sbitmap_free (load_index);
              return false;
            }
          bitmap_set_bit (load_index, lidx);
        }
      for (i = 0; i < group_size; i++)
        if (!bitmap_bit_p (load_index, i))
          {
            sbitmap_free (load_index);
            return false;
          }
      sbitmap_free (load_index);

      /* This permutation is valid for reduction.  Since the order of the
         statements in the nodes is not important unless they are memory
         accesses, we can rearrange the statements in all the nodes
         according to the order of the loads.  */
      vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
                                node->load_permutation);

      /* We are done, no actual permutations need to be generated.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        SLP_TREE_LOAD_PERMUTATION (node).release ();
      return true;
    }

  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment complications.  */
  if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
    {
      /* Check that for every node in the instance the loads
         form a subchain.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          next_load = NULL;
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
            {
              if (j != 0 && next_load != load)
                return false;
              next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
            }
        }

      /* Check that the alignment of the first load in every subchain, i.e.,
         the first statement in every load node, is supported.
         ???  This belongs in alignment checking.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          first_load = SLP_TREE_SCALAR_STMTS (node)[0];
          if (first_load != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load)))
            {
              dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load));
              if (vect_supportable_dr_alignment (dr, false)
                  == dr_unaligned_unsupported)
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                       vect_location,
                                       "unsupported unaligned load ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        first_load, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  return false;
                }
            }
        }

      /* We are done, no actual permutations need to be generated.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        SLP_TREE_LOAD_PERMUTATION (node).release ();
      return true;
    }

  /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
     GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
     well (unless it's reduction).  */
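  /* E.g. for GROUP_SIZE two only the permutations { 0, 0 } and { 1, 1 }
     are accepted here: each load node replicates a single element of
     the interleaving chain.  */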
  if (SLP_INSTANCE_LOADS (slp_instn).length () != group_size)
    return false;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (!node->load_permutation.exists ())
      return false;

  load_index = sbitmap_alloc (group_size);
  bitmap_clear (load_index);
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      unsigned int lidx = node->load_permutation[0];
      if (bitmap_bit_p (load_index, lidx))
        {
          sbitmap_free (load_index);
          return false;
        }
      bitmap_set_bit (load_index, lidx);
      FOR_EACH_VEC_ELT (node->load_permutation, j, k)
        if (k != lidx)
          {
            sbitmap_free (load_index);
            return false;
          }
    }
  for (i = 0; i < group_size; i++)
    if (!bitmap_bit_p (load_index, i))
      {
        sbitmap_free (load_index);
        return false;
      }
  sbitmap_free (load_index);

  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (node->load_permutation.exists ()
        && !vect_transform_slp_perm_load
              (node, vNULL, NULL,
               SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true))
      return false;
  return true;
}


/* Find the first load in the loop that belongs to INSTANCE.
   When loads are in several SLP nodes, there can be a case in which the first
   load does not appear in the first SLP node to be transformed, causing
   incorrect order of statements.  Since we generate all the loads together,
   they must be inserted before the first load of the SLP instance and not
   before the first load of the first node of the instance.  */

static gimple
vect_find_first_load_in_slp_instance (slp_instance instance)
{
  int i, j;
  slp_tree load_node;
  gimple first_load = NULL, load;

  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
      first_load = get_earlier_stmt (load, first_load);

  return first_load;
}


/* Find the last store in SLP INSTANCE.  */

static gimple
vect_find_last_store_in_slp_instance (slp_instance instance)
{
  int i;
  slp_tree node;
  gimple last_store = NULL, store;

  node = SLP_INSTANCE_TREE (instance);
  for (i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &store); i++)
    last_store = get_later_stmt (store, last_store);

  return last_store;
}

/* Compute the cost for the SLP node NODE in the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                         slp_instance instance, slp_tree node,
                         stmt_vector_for_cost *prologue_cost_vec,
                         unsigned ncopies_for_cost)
{
  stmt_vector_for_cost *body_cost_vec = &SLP_INSTANCE_BODY_COST_VEC (instance);

  unsigned i;
  slp_tree child;
  gimple stmt, s;
  stmt_vec_info stmt_info;
  tree lhs;
  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  /* Recurse down the SLP tree.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_analyze_slp_cost_1 (loop_vinfo, bb_vinfo,
                             instance, child, prologue_cost_vec,
                             ncopies_for_cost);

  /* Look at the first scalar stmt to determine the cost.  */
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
        vect_model_store_cost (stmt_info, ncopies_for_cost, false,
                               vect_uninitialized_def,
                               node, prologue_cost_vec, body_cost_vec);
      else
        {
          int i;
          gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
          vect_model_load_cost (stmt_info, ncopies_for_cost, false,
                                node, prologue_cost_vec, body_cost_vec);
          /* If the load is permuted record the cost for the permutation.
             ???  Loads from multiple chains are let through here only
             for a single special case involving complex numbers where
             in the end no permutation is necessary.  */
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, s)
            if ((STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo_for_stmt (s))
                 == STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info))
                && vect_get_place_in_interleaving_chain
                     (s, STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info)) != i)
              {
                record_stmt_cost (body_cost_vec, group_size, vec_perm,
                                  stmt_info, 0, vect_body);
                break;
              }
        }
    }
  else
    record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
                      stmt_info, 0, vect_body);

  /* Scan operands and account for prologue cost of constants/externals.
     ???  This over-estimates cost for multiple uses and should be
     re-engineered.  */
  lhs = gimple_get_lhs (stmt);
  for (i = 0; i < gimple_num_ops (stmt); ++i)
    {
      tree def, op = gimple_op (stmt, i);
      gimple def_stmt;
      enum vect_def_type dt;
      if (!op || op == lhs)
        continue;
      if (vect_is_simple_use (op, NULL, loop_vinfo, bb_vinfo,
                              &def_stmt, &def, &dt)
          && (dt == vect_constant_def || dt == vect_external_def))
        record_stmt_cost (prologue_cost_vec, 1, vector_stmt,
                          stmt_info, 0, vect_prologue);
    }
}

/* Compute the cost for the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                       slp_instance instance, unsigned nunits)
{
  stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
  unsigned ncopies_for_cost;
  stmt_info_for_cost *si;
  unsigned i;

1483   /* Calculate the number of vector stmts to create based on the unrolling
1484      factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
1485      GROUP_SIZE / NUNITS otherwise).  */
1486   unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
1487   ncopies_for_cost = least_common_multiple (nunits, group_size) / nunits;
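  /* Worked example (editor's illustration, values invented): with
     NUNITS == 4 and GROUP_SIZE == 6, least_common_multiple (4, 6) == 12
     and NCOPIES_FOR_COST == 12 / 4 == 3, i.e. three vector stmts are
     costed to cover two complete groups of six scalars.  */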
1488
1489   prologue_cost_vec.create (10);
1490   body_cost_vec.create (10);
1491   SLP_INSTANCE_BODY_COST_VEC (instance) = body_cost_vec;
1492   vect_analyze_slp_cost_1 (loop_vinfo, bb_vinfo,
1493                            instance, SLP_INSTANCE_TREE (instance),
1494                            &prologue_cost_vec, ncopies_for_cost);
1495
1496   /* Record the prologue costs, which were delayed until we were
1497      sure that SLP was successful.  Unlike the body costs, we know
1498      the final values now regardless of the loop vectorization factor.  */
1499   void *data = (loop_vinfo ? LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
1500                 : BB_VINFO_TARGET_COST_DATA (bb_vinfo));
1501   FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
1502     {
1503       struct _stmt_vec_info *stmt_info
1504         = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1505       (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1506                             si->misalign, vect_prologue);
1507     }
1508
1509   prologue_cost_vec.release ();
1510 }
1511
1512 /* Analyze an SLP instance starting from a group of grouped stores.  Call
1513    vect_build_slp_tree to build a tree of packed stmts if possible.
1514    Return FALSE if it's impossible to SLP any stmt in the loop.  */
1515
1516 static bool
1517 vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
1518                            gimple stmt, unsigned max_tree_size)
1519 {
1520   slp_instance new_instance;
1521   slp_tree node;
1522   unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
1523   unsigned int unrolling_factor = 1, nunits;
1524   tree vectype, scalar_type = NULL_TREE;
1525   gimple next;
1526   unsigned int vectorization_factor = 0;
1527   int i;
1528   unsigned int max_nunits = 0;
1529   vec<slp_tree> loads;
1530   struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
1531   vec<gimple> scalar_stmts;
1532
1533   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1534     {
1535       if (dr)
1536         {
1537           scalar_type = TREE_TYPE (DR_REF (dr));
1538           vectype = get_vectype_for_scalar_type (scalar_type);
1539         }
1540       else
1541         {
1542           gcc_assert (loop_vinfo);
1543           vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1544         }
1545
1546       group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
1547     }
1548   else
1549     {
1550       gcc_assert (loop_vinfo);
1551       vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1552       group_size = LOOP_VINFO_REDUCTIONS (loop_vinfo).length ();
1553     }
1554
1555   if (!vectype)
1556     {
1557       if (dump_enabled_p ())
1558         {
1559           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1560                            "Build SLP failed: unsupported data-type ");
1561           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
1562           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1563         }
1564
1565       return false;
1566     }
1567
1568   nunits = TYPE_VECTOR_SUBPARTS (vectype);
1569   if (loop_vinfo)
1570     vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1571   else
1572     vectorization_factor = nunits;
1573
1574   /* Calculate the unrolling factor.  */
1575   unrolling_factor = least_common_multiple (nunits, group_size) / group_size;
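  /* Editor's illustration: with NUNITS == 4 and GROUP_SIZE == 6 this is
     least_common_multiple (4, 6) / 6 == 2, so two copies of the group are
     needed to fill whole vectors (three V4 vectors); with GROUP_SIZE == 8
     the factor is 1 and no unrolling is required.  */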
1576   if (unrolling_factor != 1 && !loop_vinfo)
1577     {
1578       if (dump_enabled_p ())
1579         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1580                          "Build SLP failed: unrolling required in basic"
1581                          " block SLP\n");
1582
1583       return false;
1584     }
1585
1586   /* Create a node (a root of the SLP tree) for the packed grouped stores.  */
1587   scalar_stmts.create (group_size);
1588   next = stmt;
1589   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1590     {
1591       /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
1592       while (next)
1593         {
1594           if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
1595               && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
1596             scalar_stmts.safe_push (
1597                   STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
1598           else
1599             scalar_stmts.safe_push (next);
1600           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
1601         }
1602     }
1603   else
1604     {
1605       /* Collect reduction statements.  */
1606       vec<gimple> reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
1607       for (i = 0; reductions.iterate (i, &next); i++)
1608         scalar_stmts.safe_push (next);
1609     }
1610
1611   node = vect_create_new_slp_node (scalar_stmts);
1612
1613   loads.create (group_size);
1614
1615   /* Build the tree for the SLP instance.  */
1616   if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &node, group_size,
1617                            &max_nunits, &loads,
1618                            vectorization_factor, NULL, NULL, NULL,
1619                            max_tree_size))
1620     {
1621       /* Calculate the unrolling factor based on the smallest type.  */
1622       if (max_nunits > nunits)
1623         unrolling_factor = least_common_multiple (max_nunits, group_size)
1624                            / group_size;
1625
1626       if (unrolling_factor != 1 && !loop_vinfo)
1627         {
1628           if (dump_enabled_p ())
1629             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1630                              "Build SLP failed: unrolling required in basic"
1631                              " block SLP\n");
1632           vect_free_slp_tree (node);
1633           loads.release ();
1634           return false;
1635         }
1636
1637       /* Create a new SLP instance.  */
1638       new_instance = XNEW (struct _slp_instance);
1639       SLP_INSTANCE_TREE (new_instance) = node;
1640       SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
1641       SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
1642       SLP_INSTANCE_BODY_COST_VEC (new_instance) = vNULL;
1643       SLP_INSTANCE_LOADS (new_instance) = loads;
1644       SLP_INSTANCE_FIRST_LOAD_STMT (new_instance) = NULL;
1645
1646       /* Compute the load permutation.  */
1647       slp_tree load_node;
1648       bool loads_permuted = false;
1649       FOR_EACH_VEC_ELT (loads, i, load_node)
1650         {
1651           vec<unsigned> load_permutation;
1652           int j;
1653           gimple load, first_stmt;
1654           bool this_load_permuted = false;
1655           load_permutation.create (group_size);
1656           first_stmt = GROUP_FIRST_ELEMENT
1657               (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
1658           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
1659             {
1660               int load_place
1661                 = vect_get_place_in_interleaving_chain (load, first_stmt);
1662               gcc_assert (load_place != -1);
1663               if (load_place != j)
1664                 this_load_permuted = true;
1665               load_permutation.safe_push (load_place);
1666             }
1667           if (!this_load_permuted)
1668             {
1669               load_permutation.release ();
1670               continue;
1671             }
1672           SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
1673           loads_permuted = true;
1674         }
1675
1676       if (loads_permuted)
1677         {
1678           if (!vect_supported_load_permutation_p (new_instance))
1679             {
1680               if (dump_enabled_p ())
1681                 {
1682                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1683                                    "Build SLP failed: unsupported load "
1684                                    "permutation ");
1685                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
1686                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1687                 }
1688               vect_free_slp_instance (new_instance);
1689               return false;
1690             }
1691
1692           SLP_INSTANCE_FIRST_LOAD_STMT (new_instance)
1693             = vect_find_first_load_in_slp_instance (new_instance);
1694         }
1695
1696       /* Compute the costs of this SLP instance.  */
1697       vect_analyze_slp_cost (loop_vinfo, bb_vinfo,
1698                              new_instance, TYPE_VECTOR_SUBPARTS (vectype));
1699
1700       if (loop_vinfo)
1701         LOOP_VINFO_SLP_INSTANCES (loop_vinfo).safe_push (new_instance);
1702       else
1703         BB_VINFO_SLP_INSTANCES (bb_vinfo).safe_push (new_instance);
1704
1705       if (dump_enabled_p ())
1706         vect_print_slp_tree (MSG_NOTE, node);
1707
1708       return true;
1709     }
1710
1711   /* Failed to SLP.  */
1712   /* Free the allocated memory.  */
1713   vect_free_slp_tree (node);
1714   loads.release ();
1715
1716   return false;
1717 }
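
/* Editor's sketch (not part of the GCC sources): the kind of input this
   function is meant to accept is a straight-line group of stores, e.g.

     void
     f (int *a, int *b)
     {
       a[0] = b[0] + 1;
       a[1] = b[1] + 1;
       a[2] = b[2] + 1;
       a[3] = b[3] + 1;
     }

   The four stores form one grouped access with GROUP_SIZE == 4; with
   V4SI vectors (NUNITS == 4) the unrolling factor is 1, so a single SLP
   instance rooted at the store group covers the whole group.  */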
1718
1719
1720 /* Check if there are stmts in the loop that can be vectorized using SLP.
1721    Build SLP trees of packed scalar stmts if SLP is possible.  */
1722
1723 bool
1724 vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
1725                   unsigned max_tree_size)
1726 {
1727   unsigned int i;
1728   vec<gimple> grouped_stores;
1729   vec<gimple> reductions = vNULL;
1730   vec<gimple> reduc_chains = vNULL;
1731   gimple first_element;
1732   bool ok = false;
1733
1734   if (dump_enabled_p ())
1735     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===\n");
1736
1737   if (loop_vinfo)
1738     {
1739       grouped_stores = LOOP_VINFO_GROUPED_STORES (loop_vinfo);
1740       reduc_chains = LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo);
1741       reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
1742     }
1743   else
1744     grouped_stores = BB_VINFO_GROUPED_STORES (bb_vinfo);
1745
1746   /* Find SLP sequences starting from groups of grouped stores.  */
1747   FOR_EACH_VEC_ELT (grouped_stores, i, first_element)
1748     if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
1749                                    max_tree_size))
1750       ok = true;
1751
1752   if (bb_vinfo && !ok)
1753     {
1754       if (dump_enabled_p ())
1755         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1756                          "Failed to SLP the basic block.\n");
1757
1758       return false;
1759     }
1760
1761   if (loop_vinfo
1762       && LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).length () > 0)
1763     {
1764       /* Find SLP sequences starting from reduction chains.  */
1765       FOR_EACH_VEC_ELT (reduc_chains, i, first_element)
1766         if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
1767                                        max_tree_size))
1768           ok = true;
1769         else
1770           return false;
1771
1772       /* Don't try to vectorize SLP reductions if reduction chain was
1773          detected.  */
1774       return ok;
1775     }
1776
1777   /* Find SLP sequences starting from groups of reductions.  */
1778   if (loop_vinfo && LOOP_VINFO_REDUCTIONS (loop_vinfo).length () > 1
1779       && vect_analyze_slp_instance (loop_vinfo, bb_vinfo, reductions[0],
1780                                     max_tree_size))
1781     ok = true;
1782
1783   return true;
1784 }
1785
1786
1787 /* For each possible SLP instance decide whether to SLP it and calculate overall
1788    unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
1789    least one instance.  */
1790
1791 bool
1792 vect_make_slp_decision (loop_vec_info loop_vinfo)
1793 {
1794   unsigned int i, unrolling_factor = 1;
1795   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1796   slp_instance instance;
1797   int decided_to_slp = 0;
1798
1799   if (dump_enabled_p ())
1800     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ==="
1801                      "\n");
1802
1803   FOR_EACH_VEC_ELT (slp_instances, i, instance)
1804     {
1805       /* FORNOW: SLP if you can.  */
1806       if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance))
1807         unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance);
1808
1809       /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
1810          call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
1811          loop-based vectorization.  Such stmts will be marked as HYBRID.  */
1812       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
1813       decided_to_slp++;
1814     }
1815
1816   LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
1817
1818   if (decided_to_slp && dump_enabled_p ())
1819     dump_printf_loc (MSG_NOTE, vect_location,
1820                      "Decided to SLP %d instances. Unrolling factor %d\n",
1821                      decided_to_slp, unrolling_factor);
1822
1823   return (decided_to_slp > 0);
1824 }
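
/* Editor's illustration of the decision above: if two instances were
   built with unrolling factors 2 and 4, the recorded loop-wide SLP
   unrolling factor is their maximum, 4.  */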
1825
1826
1827 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
1828    can't be SLPed) in the tree rooted at NODE.  Mark such stmts as HYBRID.  */
1829
1830 static void
1831 vect_detect_hybrid_slp_stmts (slp_tree node)
1832 {
1833   int i;
1834   vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (node);
1835   gimple stmt = stmts[0];
1836   imm_use_iterator imm_iter;
1837   gimple use_stmt;
1838   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1839   slp_tree child;
1840   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1841   struct loop *loop = NULL;
1842   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_vinfo);
1843   basic_block bb = NULL;
1844
1845   if (!node)
1846     return;
1847
1848   if (loop_vinfo)
1849     loop = LOOP_VINFO_LOOP (loop_vinfo);
1850   else
1851     bb = BB_VINFO_BB (bb_vinfo);
1852
1853   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1854     if (PURE_SLP_STMT (vinfo_for_stmt (stmt))
1855         && TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME)
1856       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0))
1857         if (gimple_bb (use_stmt)
1858             && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1859                  || bb == gimple_bb (use_stmt))
1860             && (stmt_vinfo = vinfo_for_stmt (use_stmt))
1861             && !STMT_SLP_TYPE (stmt_vinfo)
1862             && (STMT_VINFO_RELEVANT (stmt_vinfo)
1863                 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo))
1864                 || (STMT_VINFO_IN_PATTERN_P (stmt_vinfo)
1865                     && STMT_VINFO_RELATED_STMT (stmt_vinfo)
1866                     && !STMT_SLP_TYPE (vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_vinfo)))))
1867             && !(gimple_code (use_stmt) == GIMPLE_PHI
1868                  && STMT_VINFO_DEF_TYPE (stmt_vinfo)
1869                   == vect_reduction_def))
1870           vect_mark_slp_stmts (node, hybrid, i);
1871
1872   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1873     vect_detect_hybrid_slp_stmts (child);
1874 }
1875
1876
1877 /* Find stmts that must be both vectorized and SLPed.  */
1878
1879 void
1880 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
1881 {
1882   unsigned int i;
1883   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1884   slp_instance instance;
1885
1886   if (dump_enabled_p ())
1887     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ==="
1888                      "\n");
1889
1890   FOR_EACH_VEC_ELT (slp_instances, i, instance)
1891     vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance));
1892 }
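
/* Editor's note: a stmt becomes HYBRID when its pure-SLP definition is
   also consumed by a relevant stmt outside the SLP trees, e.g. an SLP-ed
   addition whose result is additionally used by a stmt that only
   loop-based vectorization will handle; such a stmt has to be handled
   by both schemes.  */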
1893
1894
1895 /* Create and initialize a new bb_vec_info struct for BB, as well as
1896    stmt_vec_info structs for all the stmts in it.  */
1897
1898 static bb_vec_info
1899 new_bb_vec_info (basic_block bb)
1900 {
1901   bb_vec_info res = NULL;
1902   gimple_stmt_iterator gsi;
1903
1904   res = (bb_vec_info) xcalloc (1, sizeof (struct _bb_vec_info));
1905   BB_VINFO_BB (res) = bb;
1906
1907   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1908     {
1909       gimple stmt = gsi_stmt (gsi);
1910       gimple_set_uid (stmt, 0);
1911       set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, NULL, res));
1912     }
1913
1914   BB_VINFO_GROUPED_STORES (res).create (10);
1915   BB_VINFO_SLP_INSTANCES (res).create (2);
1916   BB_VINFO_TARGET_COST_DATA (res) = init_cost (NULL);
1917
1918   bb->aux = res;
1919   return res;
1920 }
1921
1922
1923 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
1924    stmts in the basic block.  */
1925
1926 static void
1927 destroy_bb_vec_info (bb_vec_info bb_vinfo)
1928 {
1929   vec<slp_instance> slp_instances;
1930   slp_instance instance;
1931   basic_block bb;
1932   gimple_stmt_iterator si;
1933   unsigned i;
1934
1935   if (!bb_vinfo)
1936     return;
1937
1938   bb = BB_VINFO_BB (bb_vinfo);
1939
1940   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1941     {
1942       gimple stmt = gsi_stmt (si);
1943       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1944
1945       if (stmt_info)
1946         /* Free stmt_vec_info.  */
1947         free_stmt_vec_info (stmt);
1948     }
1949
1950   vect_destroy_datarefs (NULL, bb_vinfo);
1951   free_dependence_relations (BB_VINFO_DDRS (bb_vinfo));
1952   BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
1953   slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
1954   FOR_EACH_VEC_ELT (slp_instances, i, instance)
1955     vect_free_slp_instance (instance);
1956   BB_VINFO_SLP_INSTANCES (bb_vinfo).release ();
1957   destroy_cost_data (BB_VINFO_TARGET_COST_DATA (bb_vinfo));
1958   free (bb_vinfo);
1959   bb->aux = NULL;
1960 }
1961
1962
1963 /* Analyze statements contained in SLP tree node after recursively analyzing
1964    the subtree. Return TRUE if the operations are supported.  */
1965
1966 static bool
1967 vect_slp_analyze_node_operations (bb_vec_info bb_vinfo, slp_tree node)
1968 {
1969   bool dummy;
1970   int i;
1971   gimple stmt;
1972   slp_tree child;
1973
1974   if (!node)
1975     return true;
1976
1977   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1978     if (!vect_slp_analyze_node_operations (bb_vinfo, child))
1979       return false;
1980
1981   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1982     {
1983       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1984       gcc_assert (stmt_info);
1985       gcc_assert (PURE_SLP_STMT (stmt_info));
1986
1987       if (!vect_analyze_stmt (stmt, &dummy, node))
1988         return false;
1989     }
1990
1991   return true;
1992 }
1993
1994
1995 /* Analyze statements in SLP instances of the basic block.  Return TRUE if the
1996    operations are supported. */
1997
1998 static bool
1999 vect_slp_analyze_operations (bb_vec_info bb_vinfo)
2000 {
2001   vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2002   slp_instance instance;
2003   int i;
2004
2005   for (i = 0; slp_instances.iterate (i, &instance); )
2006     {
2007       if (!vect_slp_analyze_node_operations (bb_vinfo,
2008                                              SLP_INSTANCE_TREE (instance)))
2009         {
2010           vect_free_slp_instance (instance);
2011           slp_instances.ordered_remove (i);
2012         }
2013       else
2014         i++;
2015     }
2016
2017   if (!slp_instances.length ())
2018     return false;
2019
2020   return true;
2021 }
2022
2023
2024 /* Compute the scalar cost of the SLP node NODE and its children
2025    and return it.  Do not account for defs that are marked in LIFE, and
2026    update LIFE according to the uses of NODE.  */
2027
2028 static unsigned
2029 vect_bb_slp_scalar_cost (basic_block bb,
2030                          slp_tree node, vec<bool, va_heap> *life)
2031 {
2032   unsigned scalar_cost = 0;
2033   unsigned i;
2034   gimple stmt;
2035   slp_tree child;
2036
2037   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2038     {
2039       unsigned stmt_cost;
2040       ssa_op_iter op_iter;
2041       def_operand_p def_p;
2042       stmt_vec_info stmt_info;
2043
2044       if ((*life)[i])
2045         continue;
2046
2047       /* If there is a non-vectorized use of the defs then the scalar
2048          stmt is kept live in which case we do not account it or any
2049          required defs in the SLP children in the scalar cost.  This
2050          way we make the vectorization more costly when compared to
2051          the scalar cost.  */
2052       FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2053         {
2054           imm_use_iterator use_iter;
2055           gimple use_stmt;
2056           FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2057             if (!is_gimple_debug (use_stmt)
2058                 && (gimple_code (use_stmt) == GIMPLE_PHI
2059                     || gimple_bb (use_stmt) != bb
2060                     || !STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (use_stmt))))
2061               {
2062                 (*life)[i] = true;
2063                 BREAK_FROM_IMM_USE_STMT (use_iter);
2064               }
2065         }
2066       if ((*life)[i])
2067         continue;
2068
2069       stmt_info = vinfo_for_stmt (stmt);
2070       if (STMT_VINFO_DATA_REF (stmt_info))
2071         {
2072           if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2073             stmt_cost = vect_get_stmt_cost (scalar_load);
2074           else
2075             stmt_cost = vect_get_stmt_cost (scalar_store);
2076         }
2077       else
2078         stmt_cost = vect_get_stmt_cost (scalar_stmt);
2079
2080       scalar_cost += stmt_cost;
2081     }
2082
2083   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2084     scalar_cost += vect_bb_slp_scalar_cost (bb, child, life);
2085
2086   return scalar_cost;
2087 }
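
/* Editor's illustration: for a node whose scalar stmts are

     x_1 = a[0] + 1;
     y_2 = a[1] + 1;

   where x_1 is also used by a stmt outside the SLP region, x_1's stmt is
   kept live, (*LIFE)[0] is set, and only y_2's stmt contributes to the
   scalar cost, which makes vectorization look comparatively costlier.  */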
2088
2089 /* Check if vectorization of the basic block is profitable.  */
2090
2091 static bool
2092 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2093 {
2094   vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2095   slp_instance instance;
2096   int i, j;
2097   unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2098   unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2099   void *target_cost_data = BB_VINFO_TARGET_COST_DATA (bb_vinfo);
2100   stmt_vec_info stmt_info = NULL;
2101   stmt_vector_for_cost body_cost_vec;
2102   stmt_info_for_cost *ci;
2103
2104   /* Calculate vector costs.  */
2105   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2106     {
2107       body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
2108
2109       FOR_EACH_VEC_ELT (body_cost_vec, j, ci)
2110         {
2111           stmt_info = ci->stmt ? vinfo_for_stmt (ci->stmt) : NULL;
2112           (void) add_stmt_cost (target_cost_data, ci->count, ci->kind,
2113                                 stmt_info, ci->misalign, vect_body);
2114         }
2115     }
2116
2117   /* Calculate scalar cost.  */
2118   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2119     {
2120       auto_vec<bool, 20> life;
2121       life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
2122       scalar_cost += vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2123                                               SLP_INSTANCE_TREE (instance),
2124                                               &life);
2125     }
2126
2127   /* Complete the target-specific cost calculation.  */
2128   finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2129                &vec_inside_cost, &vec_epilogue_cost);
2130
2131   vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2132
2133   if (dump_enabled_p ())
2134     {
2135       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2136       dump_printf (MSG_NOTE, "  Vector inside of basic block cost: %d\n",
2137                    vec_inside_cost);
2138       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n", vec_prologue_cost);
2139       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n", vec_epilogue_cost);
2140       dump_printf (MSG_NOTE, "  Scalar cost of basic block: %d\n", scalar_cost);
2141     }
2142
2143   /* Vectorization is profitable if its cost is less than the cost of scalar
2144      version.  */
2145   if (vec_outside_cost + vec_inside_cost >= scalar_cost)
2146     return false;
2147
2148   return true;
2149 }
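
/* Worked example (editor's numbers, invented): with vec_prologue_cost
   == 1, vec_epilogue_cost == 0, vec_inside_cost == 3 and scalar_cost
   == 8, the test above compares 1 + 0 + 3 == 4 against 8 and accepts;
   with scalar_cost == 4 the >= comparison would reject the block.  */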
2150
2151 /* Check if the basic block can be vectorized.  */
2152
2153 static bb_vec_info
2154 vect_slp_analyze_bb_1 (basic_block bb)
2155 {
2156   bb_vec_info bb_vinfo;
2157   vec<slp_instance> slp_instances;
2158   slp_instance instance;
2159   int i;
2160   int min_vf = 2;
2161   unsigned n_stmts = 0;
2162
2163   bb_vinfo = new_bb_vec_info (bb);
2164   if (!bb_vinfo)
2165     return NULL;
2166
2167   if (!vect_analyze_data_refs (NULL, bb_vinfo, &min_vf, &n_stmts))
2168     {
2169       if (dump_enabled_p ())
2170         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2171                          "not vectorized: unhandled data-ref in basic "
2172                          "block.\n");
2173
2174       destroy_bb_vec_info (bb_vinfo);
2175       return NULL;
2176     }
2177
2178   if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
2179     {
2180       if (dump_enabled_p ())
2181         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2182                          "not vectorized: not enough data-refs in "
2183                          "basic block.\n");
2184
2185       destroy_bb_vec_info (bb_vinfo);
2186       return NULL;
2187     }
2188
2189   if (!vect_analyze_data_ref_accesses (NULL, bb_vinfo))
2190     {
2191       if (dump_enabled_p ())
2192         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2193                          "not vectorized: unhandled data access in "
2194                          "basic block.\n");
2195
2196       destroy_bb_vec_info (bb_vinfo);
2197       return NULL;
2198     }
2199
2200   vect_pattern_recog (NULL, bb_vinfo);
2201
2202   if (!vect_analyze_data_refs_alignment (NULL, bb_vinfo))
2203     {
2204       if (dump_enabled_p ())
2205         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2206                          "not vectorized: bad data alignment in basic "
2207                          "block.\n");
2208
2209       destroy_bb_vec_info (bb_vinfo);
2210       return NULL;
2211     }
2212
2213   /* Check the SLP opportunities in the basic block, analyze and build SLP
2214      trees.  */
2215   if (!vect_analyze_slp (NULL, bb_vinfo, n_stmts))
2216     {
2217       if (dump_enabled_p ())
2218         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
2219                          "not vectorized: failed to find SLP opportunities "
2220                          "in basic block.\n");
2221
2222       destroy_bb_vec_info (bb_vinfo);
2223       return NULL;
2224     }
2225
2226   slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2227
2228   /* Mark all the statements that we want to vectorize as pure SLP and
2229      relevant.  */
2230   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2231     {
2232       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2233       vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
2234     }
2235
2236   /* Mark all the statements that we do not want to vectorize.  */
2237   for (gimple_stmt_iterator gsi = gsi_start_bb (BB_VINFO_BB (bb_vinfo));
2238        !gsi_end_p (gsi); gsi_next (&gsi))
2239     {
2240       stmt_vec_info vinfo = vinfo_for_stmt (gsi_stmt (gsi));
2241       if (STMT_SLP_TYPE (vinfo) != pure_slp)
2242         STMT_VINFO_VECTORIZABLE (vinfo) = false;
2243     }
2244
2245   /* Analyze dependences.  At this point all stmts not participating in
2246      vectorization have to be marked.  Dependence analysis assumes
2247      that we either vectorize all SLP instances or none at all.  */
2248   if (!vect_slp_analyze_data_ref_dependences (bb_vinfo))
2249     {
2250       if (dump_enabled_p ())
2251         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2252                          "not vectorized: unhandled data dependence "
2253                          "in basic block.\n");
2254
2255       destroy_bb_vec_info (bb_vinfo);
2256       return NULL;
2257     }
2258
2259   if (!vect_verify_datarefs_alignment (NULL, bb_vinfo))
2260     {
2261       if (dump_enabled_p ())
2262         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2263                          "not vectorized: unsupported alignment in basic "
2264                          "block.\n");
2265       destroy_bb_vec_info (bb_vinfo);
2266       return NULL;
2267     }
2268
2269   if (!vect_slp_analyze_operations (bb_vinfo))
2270     {
2271       if (dump_enabled_p ())
2272         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2273                          "not vectorized: bad operation in basic block.\n");
2274
2275       destroy_bb_vec_info (bb_vinfo);
2276       return NULL;
2277     }
2278
2279   /* Cost model: check if the vectorization is worthwhile.  */
2280   if (!unlimited_cost_model (NULL)
2281       && !vect_bb_vectorization_profitable_p (bb_vinfo))
2282     {
2283       if (dump_enabled_p ())
2284         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2285                          "not vectorized: vectorization is not "
2286                          "profitable.\n");
2287
2288       destroy_bb_vec_info (bb_vinfo);
2289       return NULL;
2290     }
2291
2292   if (dump_enabled_p ())
2293     dump_printf_loc (MSG_NOTE, vect_location,
2294                      "Basic block will be vectorized using SLP\n");
2295
2296   return bb_vinfo;
2297 }
2298
2299
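/* Main entry for basic-block SLP analysis: check the instruction count,
   then analyze BB, retrying with successively smaller vector sizes until
   the analysis succeeds or no candidate sizes remain.  Return the
   bb_vec_info on success and NULL otherwise.  */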
2300 bb_vec_info
2301 vect_slp_analyze_bb (basic_block bb)
2302 {
2303   bb_vec_info bb_vinfo;
2304   int insns = 0;
2305   gimple_stmt_iterator gsi;
2306   unsigned int vector_sizes;
2307
2308   if (dump_enabled_p ())
2309     dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");
2310
2311   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2312     {
2313       gimple stmt = gsi_stmt (gsi);
2314       if (!is_gimple_debug (stmt)
2315           && !gimple_nop_p (stmt)
2316           && gimple_code (stmt) != GIMPLE_LABEL)
2317         insns++;
2318     }
2319
2320   if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2321     {
2322       if (dump_enabled_p ())
2323         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2324                          "not vectorized: too many instructions in "
2325                          "basic block.\n");
2326
2327       return NULL;
2328     }
2329
2330   /* Autodetect first vector size we try.  */
2331   current_vector_size = 0;
2332   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2333
2334   while (1)
2335     {
2336       bb_vinfo = vect_slp_analyze_bb_1 (bb);
2337       if (bb_vinfo)
2338         return bb_vinfo;
2339
2340       destroy_bb_vec_info (bb_vinfo);
2341
2342       vector_sizes &= ~current_vector_size;
2343       if (vector_sizes == 0
2344           || current_vector_size == 0)
2345         return NULL;
2346
2347       /* Try the next biggest vector size.  */
2348       current_vector_size = 1 << floor_log2 (vector_sizes);
2349       if (dump_enabled_p ())
2350         dump_printf_loc (MSG_NOTE, vect_location,
2351                          "***** Re-trying analysis with "
2352                          "vector size %d\n", current_vector_size);
2353     }
2354 }
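
/* Editor's illustration of the retry loop above: if the target
   advertises vector sizes 16 and 8 (bitmask 0x18) and analysis with the
   autodetected 16-byte size fails, the mask shrinks to 0x8 and the loop
   retries with current_vector_size == 8 before giving up.  */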
2355
2356
2357 /* SLP costs are calculated according to SLP instance unrolling factor (i.e.,
2358    the number of created vector stmts depends on the unrolling factor).
2359    However, the actual number of vector stmts for every SLP node depends on
2360    VF which is set later in vect_analyze_loop_operations ().  Hence, SLP costs
2361    should be updated.  In this function we assume that the inside costs
2362    calculated in vect_model_xxx_cost are linear in ncopies.  */
2363
2364 void
2365 vect_update_slp_costs_according_to_vf (loop_vec_info loop_vinfo)
2366 {
2367   unsigned int i, j, vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2368   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2369   slp_instance instance;
2370   stmt_vector_for_cost body_cost_vec;
2371   stmt_info_for_cost *si;
2372   void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2373
2374   if (dump_enabled_p ())
2375     dump_printf_loc (MSG_NOTE, vect_location,
2376                      "=== vect_update_slp_costs_according_to_vf ===\n");
2377
2378   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2379     {
2380       /* We assume that costs are linear in ncopies.  */
2381       int ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (instance);
2382
2383       /* Record the instance's instructions in the target cost model.
2384          This was delayed until here because the count of instructions
2385          isn't known beforehand.  */
2386       body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
2387
2388       FOR_EACH_VEC_ELT (body_cost_vec, j, si)
2389         (void) add_stmt_cost (data, si->count * ncopies, si->kind,
2390                               vinfo_for_stmt (si->stmt), si->misalign,
2391                               vect_body);
2392     }
2393 }
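
/* Editor's illustration: with VF == 8 and an instance unrolling factor
   of 2, ncopies == 4 above, so each cost recorded in the instance's
   body cost vector is accounted with four times its original count.  */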
2394
2395
2396 /* For constant and loop invariant defs of SLP_NODE this function returns
2397    (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
2398    OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
2399    scalar stmts.  NUMBER_OF_VECTORS is the number of vector defs to create.
2400    REDUC_INDEX is the index of the reduction operand in the statements, unless
2401    REDUC_INDEX is the index of the reduction operand in the statements, or
2402    -1 if there is no reduction operand.  */
2403 static void
2404 vect_get_constant_vectors (tree op, slp_tree slp_node,
2405                            vec<tree> *vec_oprnds,
2406                            unsigned int op_num, unsigned int number_of_vectors,
2407                            int reduc_index)
2408 {
2409   vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2410   gimple stmt = stmts[0];
2411   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2412   unsigned nunits;
2413   tree vec_cst;
2414   tree *elts;
2415   unsigned j, number_of_places_left_in_vector;
2416   tree vector_type;
2417   tree vop;
2418   int group_size = stmts.length ();
2419   unsigned int vec_num, i;
2420   unsigned number_of_copies = 1;
2421   vec<tree> voprnds;
2422   voprnds.create (number_of_vectors);
2423   bool constant_p, is_store;
2424   tree neutral_op = NULL;
2425   enum tree_code code = gimple_expr_code (stmt);
2426   gimple def_stmt;
2427   struct loop *loop;
2428   gimple_seq ctor_seq = NULL;
2429
2430   if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
2431       && reduc_index != -1)
2432     {
2433       op_num = reduc_index - 1;
2434       op = gimple_op (stmt, reduc_index);
2435       /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
2436          we need either neutral operands or the original operands.  See
2437          get_initial_def_for_reduction() for details.  */
2438       switch (code)
2439         {
2440           case WIDEN_SUM_EXPR:
2441           case DOT_PROD_EXPR:
2442           case PLUS_EXPR:
2443           case MINUS_EXPR:
2444           case BIT_IOR_EXPR:
2445           case BIT_XOR_EXPR:
2446              if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2447                neutral_op = build_real (TREE_TYPE (op), dconst0);
2448              else
2449                neutral_op = build_int_cst (TREE_TYPE (op), 0);
2450
2451              break;
2452
2453           case MULT_EXPR:
2454              if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2455                neutral_op = build_real (TREE_TYPE (op), dconst1);
2456              else
2457                neutral_op = build_int_cst (TREE_TYPE (op), 1);
2458
2459              break;
2460
2461           case BIT_AND_EXPR:
2462             neutral_op = build_int_cst (TREE_TYPE (op), -1);
2463             break;
2464
2465           /* For MIN/MAX we don't have an easy neutral operand but
2466              the initial values can be used fine here.  Only for
2467              a reduction chain we have to force a neutral element.  */
2468           case MAX_EXPR:
2469           case MIN_EXPR:
2470             if (!GROUP_FIRST_ELEMENT (stmt_vinfo))
2471               neutral_op = NULL;
2472             else
2473               {
2474                 def_stmt = SSA_NAME_DEF_STMT (op);
2475                 loop = (gimple_bb (stmt))->loop_father;
2476                 neutral_op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2477                                                     loop_preheader_edge (loop));
2478               }
2479             break;
2480
2481           default:
2482             neutral_op = NULL;
2483         }
2484     }
2485
2486   if (STMT_VINFO_DATA_REF (stmt_vinfo))
2487     {
2488       is_store = true;
2489       op = gimple_assign_rhs1 (stmt);
2490     }
2491   else
2492     is_store = false;
2493
2494   gcc_assert (op);
2495
2496   if (CONSTANT_CLASS_P (op))
2497     constant_p = true;
2498   else
2499     constant_p = false;
2500
2501   vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
2502   gcc_assert (vector_type);
2503   nunits = TYPE_VECTOR_SUBPARTS (vector_type);
2504
2505   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
2506      created vectors. It is greater than 1 if unrolling is performed.
2507
2508      For example, we have two scalar operands, s1 and s2 (e.g., group of
2509      strided accesses of size two), while NUNITS is four (i.e., four scalars
2510      of this type can be packed in a vector).  The output vector will contain
2511      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
2512      will be 2).
2513
2514      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
2515      containing the operands.
2516
2517      For example, NUNITS is four as before, and the group size is 8
2518      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
2519      {s5, s6, s7, s8}.  */
2520
2521   number_of_copies = least_common_multiple (nunits, group_size) / group_size;
2522
2523   number_of_places_left_in_vector = nunits;
2524   elts = XALLOCAVEC (tree, nunits);
2525   for (j = 0; j < number_of_copies; j++)
2526     {
2527       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
2528         {
2529           if (is_store)
2530             op = gimple_assign_rhs1 (stmt);
2531           else
2532             {
2533               switch (code)
2534                 {
2535                   case COND_EXPR:
2536                     if (op_num == 0 || op_num == 1)
2537                       {
2538                         tree cond = gimple_assign_rhs1 (stmt);
2539                         op = TREE_OPERAND (cond, op_num);
2540                       }
2541                     else
2542                       {
2543                         if (op_num == 2)
2544                           op = gimple_assign_rhs2 (stmt);
2545                         else
2546                           op = gimple_assign_rhs3 (stmt);
2547                       }
2548                     break;
2549
2550                   case CALL_EXPR:
2551                     op = gimple_call_arg (stmt, op_num);
2552                     break;
2553
2554                   case LSHIFT_EXPR:
2555                   case RSHIFT_EXPR:
2556                   case LROTATE_EXPR:
2557                   case RROTATE_EXPR:
2558                     op = gimple_op (stmt, op_num + 1);
2559                     /* Unlike the other binary operators, shifts/rotates have
2560                        the shift count being int, instead of the same type as
2561                        the lhs, so make sure the scalar is the right type if
2562                        we are dealing with vectors of
2563                        long long/long/short/char.  */
2564                     if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
2565                       op = fold_convert (TREE_TYPE (vector_type), op);
2566                     break;
2567
2568                   default:
2569                     op = gimple_op (stmt, op_num + 1);
2570                     break;
2571                 }
2572             }
2573
2574           if (reduc_index != -1)
2575             {
2576               loop = (gimple_bb (stmt))->loop_father;
2577               def_stmt = SSA_NAME_DEF_STMT (op);
2578
2579               gcc_assert (loop);
2580
2581               /* Get the def before the loop.  In reduction chain we have only
2582                  one initial value.  */
2583               if ((j != (number_of_copies - 1)
2584                    || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2585                        && i != 0))
2586                   && neutral_op)
2587                 op = neutral_op;
2588               else
2589                 op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2590                                             loop_preheader_edge (loop));
2591             }
2592
2593           /* Create 'vect_ = {op0,op1,...,opn}'.  */
2594           number_of_places_left_in_vector--;
2595           if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
2596             {
2597               if (CONSTANT_CLASS_P (op))
2598                 {
2599                   op = fold_unary (VIEW_CONVERT_EXPR,
2600                                    TREE_TYPE (vector_type), op);
2601                   gcc_assert (op && CONSTANT_CLASS_P (op));
2602                 }
2603               else
2604                 {
2605                   tree new_temp
2606                     = make_ssa_name (TREE_TYPE (vector_type), NULL);
2607                   gimple init_stmt;
2608                   op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
2609                                op);               
2610                   init_stmt
2611                     = gimple_build_assign_with_ops (VIEW_CONVERT_EXPR,
2612                                                     new_temp, op, NULL_TREE);
2613                   gimple_seq_add_stmt (&ctor_seq, init_stmt);
2614                   op = new_temp;
2615                 }
2616             }
2617           elts[number_of_places_left_in_vector] = op;
2618           if (!CONSTANT_CLASS_P (op))
2619             constant_p = false;
2620
2621           if (number_of_places_left_in_vector == 0)
2622             {
2623               number_of_places_left_in_vector = nunits;
2624
2625               if (constant_p)
2626                 vec_cst = build_vector (vector_type, elts);
2627               else
2628                 {
2629                   vec<constructor_elt, va_gc> *v;
2630                   unsigned k;
2631                   vec_alloc (v, nunits);
2632                   for (k = 0; k < nunits; ++k)
2633                     CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
2634                   vec_cst = build_constructor (vector_type, v);
2635                 }
2636               voprnds.quick_push (vect_init_vector (stmt, vec_cst,
2637                                                     vector_type, NULL));
2638               if (ctor_seq != NULL)
2639                 {
2640                   gimple init_stmt = SSA_NAME_DEF_STMT (voprnds.last ());
2641                   gimple_stmt_iterator gsi = gsi_for_stmt (init_stmt);
2642                   gsi_insert_seq_before_without_update (&gsi, ctor_seq,
2643                                                         GSI_SAME_STMT);
2644                   ctor_seq = NULL;
2645                 }
2646             }
2647         }
2648     }
2649
2650   /* Since the vectors are created in the reverse order, we should invert
2651      them.  */
2652   vec_num = voprnds.length ();
2653   for (j = vec_num; j != 0; j--)
2654     {
2655       vop = voprnds[j - 1];
2656       vec_oprnds->quick_push (vop);
2657     }
2658
2659   voprnds.release ();
2660
2661   /* In case that VF is greater than the unrolling factor needed for the SLP
2662      group of stmts, NUMBER_OF_VECTORS to be created is greater than
2663      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
2664      to replicate the vectors.  */
2665   while (number_of_vectors > vec_oprnds->length ())
2666     {
2667       tree neutral_vec = NULL;
2668
2669       if (neutral_op)
2670         {
2671           if (!neutral_vec)
2672             neutral_vec = build_vector_from_val (vector_type, neutral_op);
2673
2674           vec_oprnds->quick_push (neutral_vec);
2675         }
2676       else
2677         {
2678           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
2679             vec_oprnds->quick_push (vop);
2680         }
2681     }
2682 }
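
/* Editor's illustration of the replication tail above: if four vector
   defs are required but only two, say V1 and V2, could be built from the
   scalars, the result is {V1, V2, V1, V2}; for a PLUS_EXPR reduction the
   neutral vector of zeros is pushed instead.  */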
2683
2684
2685 /* Get vectorized definitions from SLP_NODE that contains corresponding
2686    vectorized def-stmts.  */
2687
2688 static void
2689 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
2690 {
2691   tree vec_oprnd;
2692   gimple vec_def_stmt;
2693   unsigned int i;
2694
2695   gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
2696
2697   FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
2698     {
2699       gcc_assert (vec_def_stmt);
2700       vec_oprnd = gimple_get_lhs (vec_def_stmt);
2701       vec_oprnds->quick_push (vec_oprnd);
2702     }
2703 }
2704
2705
2706 /* Get vectorized definitions for SLP_NODE.
2707    If the scalar definitions are loop invariants or constants, collect them and
2708    call vect_get_constant_vectors() to create vector stmts.
2709    Otherwise, the def-stmts must be already vectorized and the vectorized stmts
2710    must be stored in the corresponding child of SLP_NODE, and we call
2711    vect_get_slp_vect_defs () to retrieve them.  */
2712
2713 void
2714 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
2715                    vec<vec<tree> > *vec_oprnds, int reduc_index)
2716 {
2717   gimple first_stmt;
2718   int number_of_vects = 0, i;
2719   unsigned int child_index = 0;
2720   HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
2721   slp_tree child = NULL;
2722   vec<tree> vec_defs;
2723   tree oprnd;
2724   bool vectorized_defs;
2725
2726   first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
2727   FOR_EACH_VEC_ELT (ops, i, oprnd)
2728     {
2729       /* For each operand we check if it has vectorized definitions in a child
2730          node or we need to create them (for invariants and constants).  We
2731          check if the LHS of the first stmt of the next child matches OPRND.
2732          If it does, we found the correct child.  Otherwise, we call
2733          vect_get_constant_vectors (), and do not advance CHILD_INDEX in order
2734          to check this child node for the next operand.  */
2735       vectorized_defs = false;
2736       if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
2737         {
2738           child = SLP_TREE_CHILDREN (slp_node)[child_index];
2739
2740           /* We have to check both pattern and original def, if available.  */
2741           gimple first_def = SLP_TREE_SCALAR_STMTS (child)[0];
2742           gimple related = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
2743
2744           if (operand_equal_p (oprnd, gimple_get_lhs (first_def), 0)
2745               || (related
2746                   && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
2747             {
2748               /* The number of vector defs is determined by the number of
2749                  vector statements in the node from which we get those
2750                  statements.  */
2751               number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
2752               vectorized_defs = true;
2753               child_index++;
2754             }
2755         }
2756
2757       if (!vectorized_defs)
2758         {
2759           if (i == 0)
2760             {
2761               number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
2762               /* Number of vector stmts was calculated according to LHS in
2763                  vect_schedule_slp_instance (), fix it by replacing LHS with
2764                  RHS, if necessary.  See vect_get_smallest_scalar_type () for
2765                  details.  */
2766               vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
2767                                              &rhs_size_unit);
2768               if (rhs_size_unit != lhs_size_unit)
2769                 {
2770                   number_of_vects *= rhs_size_unit;
2771                   number_of_vects /= lhs_size_unit;
2772                 }
2773             }
2774         }
2775
2776       /* Allocate memory for vectorized defs.  */
2777       vec_defs = vNULL;
2778       vec_defs.create (number_of_vects);
2779
2780       /* For reduction defs we call vect_get_constant_vectors (), since we are
2781          looking for initial loop invariant values.  */
2782       if (vectorized_defs && reduc_index == -1)
2783         /* The defs are already vectorized.  */
2784         vect_get_slp_vect_defs (child, &vec_defs);
2785       else
2786         /* Build vectors from scalar defs.  */
2787         vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
2788                                    number_of_vects, reduc_index);
2789
2790       vec_oprnds->quick_push (vec_defs);
2791
2792       /* For reductions, we only need initial values.  */
2793       if (reduc_index != -1)
2794         return;
2795     }
2796 }
2797
2798
2799 /* Create NCOPIES permutation statements using the vector mask MASK and two
2800    input vectors placed in DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX
2801    for the first copy, shifting by STRIDE elements of DR_CHAIN for every
2802    subsequent copy.
2803    (STRIDE is the number of vectorized stmts for NODE divided by the number of
2804    copies).
2805    VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE
2806    where the created stmts must be inserted.  */
2807
2808 static inline void
2809 vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
2810                            tree mask, int first_vec_indx, int second_vec_indx,
2811                            gimple_stmt_iterator *gsi, slp_tree node,
2812                            tree vectype, vec<tree> dr_chain,
2813                            int ncopies, int vect_stmts_counter)
2814 {
2815   tree perm_dest;
2816   gimple perm_stmt = NULL;
2817   stmt_vec_info next_stmt_info;
2818   int i, stride;
2819   tree first_vec, second_vec, data_ref;
2820
2821   stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
2822
2823   /* Initialize the vect stmts of NODE to properly insert the generated
2824      stmts later.  */
2825   for (i = SLP_TREE_VEC_STMTS (node).length ();
2826        i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
2827     SLP_TREE_VEC_STMTS (node).quick_push (NULL);
2828
2829   perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
2830   for (i = 0; i < ncopies; i++)
2831     {
2832       first_vec = dr_chain[first_vec_indx];
2833       second_vec = dr_chain[second_vec_indx];
2834
2835       /* Generate the permute statement.  */
2836       perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, perm_dest,
2837                                                 first_vec, second_vec, mask);
2838       data_ref = make_ssa_name (perm_dest, perm_stmt);
2839       gimple_set_lhs (perm_stmt, data_ref);
2840       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
2841
2842       /* Store the vector statement in NODE.  */
2843       SLP_TREE_VEC_STMTS (node)[stride * i + vect_stmts_counter] = perm_stmt;
2844
2845       first_vec_indx += stride;
2846       second_vec_indx += stride;
2847     }
2848
2849   /* Mark the scalar stmt as vectorized.  */
2850   next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
2851   STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
2852 }
2853
2854
2855 /* Given FIRST_MASK_ELEMENT, the mask element in element representation,
2856    return in CURRENT_MASK_ELEMENT its equivalent in target specific
2857    representation.  Check that the mask is valid and return FALSE if not.
2858    Set NEED_NEXT_VECTOR to TRUE if the permutation requires moving to
2859    the next vector, i.e., the current first vector is not needed.  */
2860
2861 static bool
2862 vect_get_mask_element (gimple stmt, int first_mask_element, int m,
2863                        int mask_nunits, bool only_one_vec, int index,
2864                        unsigned char *mask, int *current_mask_element,
2865                        bool *need_next_vector, int *number_of_mask_fixes,
2866                        bool *mask_fixed, bool *needs_first_vector)
2867 {
2868   int i;
2869
2870   /* Convert to target specific representation.  */
2871   *current_mask_element = first_mask_element + m;
2872   /* Adjust the value in case it's a mask for second and third vectors.  */
2873   *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1);
2874
2875   if (*current_mask_element < mask_nunits)
2876     *needs_first_vector = true;
2877
2878   /* We have only one input vector to permute but the mask accesses values in
2879      the next vector as well.  */
2880   if (only_one_vec && *current_mask_element >= mask_nunits)
2881     {
2882       if (dump_enabled_p ())
2883         {
2884           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2885                            "permutation requires at least two vectors ");
2886           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2887           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2888         }
2889
2890       return false;
2891     }
2892
2893   /* The mask requires the next vector.  */
2894   while (*current_mask_element >= mask_nunits * 2)
2895     {
2896       if (*needs_first_vector || *mask_fixed)
2897         {
2898           /* We either need the first vector too or have already moved to the
2899              next vector. In both cases, this permutation needs three
2900              vectors.  */
2901           if (dump_enabled_p ())
2902             {
2903               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2904                                "permutation requires at "
2905                                "least three vectors ");
2906               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2907               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2908             }
2909
2910           return false;
2911         }
2912
2913       /* We move to the next vector, dropping the first one and working with
2914          the second and the third - we need to adjust the values of the mask
2915          accordingly.  */
2916       *current_mask_element -= mask_nunits * *number_of_mask_fixes;
2917
2918       for (i = 0; i < index; i++)
2919         mask[i] -= mask_nunits * *number_of_mask_fixes;
2920
2921       (*number_of_mask_fixes)++;
2922       *mask_fixed = true;
2923     }
2924
2925   *need_next_vector = *mask_fixed;
2926
2927   /* This was the last element of this mask. Start a new one.  */
2928   if (index == mask_nunits - 1)
2929     {
2930       *number_of_mask_fixes = 1;
2931       *mask_fixed = false;
2932       *needs_first_vector = false;
2933     }
2934
2935   return true;
2936 }
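
/* Editor's illustration: with MASK_NUNITS == 4, FIRST_MASK_ELEMENT == 6
   and M == 0, *CURRENT_MASK_ELEMENT starts out as 6, which selects lane
   2 of the second input vector of the two-operand VEC_PERM_EXPR.  */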
2937
2938
2939 /* Generate vector permute statements from a list of loads in DR_CHAIN.
2940    If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
2941    permute statements for the SLP node NODE of the SLP instance
2942    SLP_NODE_INSTANCE.  */
2943
2944 bool
2945 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
2946                               gimple_stmt_iterator *gsi, int vf,
2947                               slp_instance slp_node_instance, bool analyze_only)
2948 {
2949   gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2950   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2951   tree mask_element_type = NULL_TREE, mask_type;
2952   int i, j, k, nunits, vec_index = 0, scalar_index;
2953   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2954   gimple next_scalar_stmt;
2955   int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
2956   int first_mask_element;
2957   int index, unroll_factor, current_mask_element, ncopies;
2958   unsigned char *mask;
2959   bool only_one_vec = false, need_next_vector = false;
2960   int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
2961   int number_of_mask_fixes = 1;
2962   bool mask_fixed = false;
2963   bool needs_first_vector = false;
2964   machine_mode mode;
2965
2966   mode = TYPE_MODE (vectype);
2967
2968   if (!can_vec_perm_p (mode, false, NULL))
2969     {
2970       if (dump_enabled_p ())
2971         {
2972           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2973                            "no vect permute for ");
2974           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2975           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2976         }
2977       return false;
2978     }
2979
2980   /* The generic VEC_PERM_EXPR code always uses an integral type of the
2981      same size as the vector element being permuted.  */
2982   mask_element_type = lang_hooks.types.type_for_mode
2983                 (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))), 1);
2984   mask_type = get_vectype_for_scalar_type (mask_element_type);
2985   nunits = TYPE_VECTOR_SUBPARTS (vectype);
2986   mask = XALLOCAVEC (unsigned char, nunits);
2987   unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
2988
2989   /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
2990      unrolling factor.  */
2991   orig_vec_stmts_num = group_size *
2992                 SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
2993   if (orig_vec_stmts_num == 1)
2994     only_one_vec = true;
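  /* E.g., a group of two loads with unrolling factor 2 and four elements
     per vector fits in a single input vector (2 * 2 / 4 == 1), in which
     case only one vector needs to be permuted.  */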

  /* The number of copies is determined by the final vectorization factor
     relative to the SLP_NODE_INSTANCE unrolling factor.  */
  ncopies = vf / unroll_factor;
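  /* E.g., VF == 8 with an instance unrolling factor of 4 gives
     NCOPIES == 2.  */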

  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
    return false;

  /* Generate permutation masks for every NODE. The number of masks for each
     NODE is equal to GROUP_SIZE.
     E.g., we have a group of three nodes with three loads from the same
     location in each node, and the vector size is 4. I.e., we have an
     a0b0c0a1b1c1... sequence and we need to create the following vectors:
     for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
     for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
     ...

     The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
     The last mask is illegal since we assume two operands for the permute
     operation, and the mask element values can't be outside that range.
     Hence, the last mask must be converted into {2,5,5,5}.
     For the first two permutations we need the first and the second input
     vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
     we need the second and the third vectors: {b1,c1,a2,b2} and
     {c2,a3,b3,c3}.  */

    {
      scalar_index = 0;
      index = 0;
      vect_stmts_counter = 0;
      vec_index = 0;
      first_vec_index = vec_index++;
      if (only_one_vec)
        second_vec_index = first_vec_index;
      else
        second_vec_index = vec_index++;

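      /* For each unrolled copy J and each position K in the group, look up
         the element's position in the flattened load sequence and
         accumulate it into the mask being built.  */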
      for (j = 0; j < unroll_factor; j++)
        {
          for (k = 0; k < group_size; k++)
            {
              i = SLP_TREE_LOAD_PERMUTATION (node)[k];
              first_mask_element = i + j * group_size;
              if (!vect_get_mask_element (stmt, first_mask_element, 0,
                                          nunits, only_one_vec, index,
                                          mask, &current_mask_element,
                                          &need_next_vector,
                                          &number_of_mask_fixes, &mask_fixed,
                                          &needs_first_vector))
                return false;
              gcc_assert (current_mask_element < 2 * nunits);
              mask[index++] = current_mask_element;

              if (index == nunits)
                {
                  index = 0;
                  if (!can_vec_perm_p (mode, false, mask))
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "unsupported vect permute { ");
                          for (i = 0; i < nunits; ++i)
                            dump_printf (MSG_MISSED_OPTIMIZATION, "%d ",
                                         mask[i]);
                          dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
                        }
                      return false;
                    }

                  if (!analyze_only)
                    {
                      int l;
                      tree mask_vec, *mask_elts;
                      mask_elts = XALLOCAVEC (tree, nunits);
                      for (l = 0; l < nunits; ++l)
                        mask_elts[l] = build_int_cst (mask_element_type,
                                                      mask[l]);
                      mask_vec = build_vector (mask_type, mask_elts);

                      if (need_next_vector)
                        {
                          first_vec_index = second_vec_index;
                          second_vec_index = vec_index;
                        }

                      next_scalar_stmt
                          = SLP_TREE_SCALAR_STMTS (node)[scalar_index++];

                      vect_create_mask_and_perm (stmt, next_scalar_stmt,
                               mask_vec, first_vec_index, second_vec_index,
                               gsi, node, vectype, dr_chain,
                               ncopies, vect_stmts_counter++);
                    }
                }
            }
        }
    }

  return true;
}


/* Vectorize the SLP instance tree in postorder.  */

static bool
vect_schedule_slp_instance (slp_tree node, slp_instance instance,
                            unsigned int vectorization_factor)
{
  gimple stmt;
  bool grouped_store, is_store;
  gimple_stmt_iterator si;
  stmt_vec_info stmt_info;
  unsigned int vec_stmts_size, nunits, group_size;
  tree vectype;
  int i;
  slp_tree child;

  if (!node)
    return false;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_schedule_slp_instance (child, instance, vectorization_factor);

  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);

  /* VECTYPE is the type of the destination.  */
  vectype = STMT_VINFO_VECTYPE (stmt_info);
  nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
  group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  /* For each SLP instance calculate the number of vector stmts to be created
     for the scalar stmts in each node of the SLP tree.  The number of vector
     elements in one vector iteration is the number of scalar elements in
     one scalar iteration (GROUP_SIZE) multiplied by VF, divided by the
     vector size.  */
  vec_stmts_size = (vectorization_factor * group_size) / nunits;
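  /* E.g., with GROUP_SIZE == 4, VF == 4 and 8 elements per vector this
     yields (4 * 4) / 8 == 2 vector stmts for the node.  */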

  if (!SLP_TREE_VEC_STMTS (node).exists ())
    {
      SLP_TREE_VEC_STMTS (node).create (vec_stmts_size);
      SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "------>vectorizing SLP node starting from: ");
      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
      dump_printf (MSG_NOTE, "\n");
    }

  /* Loads should be inserted before the first load.  */
  if (SLP_INSTANCE_FIRST_LOAD_STMT (instance)
      && STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && !REFERENCE_CLASS_P (gimple_get_lhs (stmt))
      && SLP_TREE_LOAD_PERMUTATION (node).exists ())
    si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance));
  else if (is_pattern_stmt_p (stmt_info))
    si = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
  else
    si = gsi_for_stmt (stmt);

  /* Stores should be inserted just before the last store.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && REFERENCE_CLASS_P (gimple_get_lhs (stmt)))
    {
      gimple last_store = vect_find_last_store_in_slp_instance (instance);
      if (is_pattern_stmt_p (vinfo_for_stmt (last_store)))
        last_store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (last_store));
      si = gsi_for_stmt (last_store);
    }

  /* Mark the first element of the reduction chain as a reduction to properly
     transform the node.  In the analysis phase only the last element of the
     chain is marked as a reduction.  */
  if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
    {
      STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
      STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
    }

  is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
  return is_store;
}

/* Replace the scalar calls in SLP node NODE by assignments setting their lhs
   to zero.  For loop vectorization this is done in vectorizable_call, but
   for SLP it needs to be deferred until the end of vect_schedule_slp,
   because multiple SLP instances may refer to the same scalar stmt.  */
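/* For instance, a scalar call 'x_1 = foo (a_2);' is replaced below by the
   assignment 'x_1 = 0;', mirroring what vectorizable_call does in the loop
   case.  */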

static void
vect_remove_slp_scalar_calls (slp_tree node)
{
  gimple stmt, new_stmt;
  gimple_stmt_iterator gsi;
  int i;
  slp_tree child;
  tree lhs;
  stmt_vec_info stmt_info;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_remove_slp_scalar_calls (child);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
        continue;
      stmt_info = vinfo_for_stmt (stmt);
      if (stmt_info == NULL
          || is_pattern_stmt_p (stmt_info)
          || !PURE_SLP_STMT (stmt_info))
        continue;
      lhs = gimple_call_lhs (stmt);
      new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
      set_vinfo_for_stmt (new_stmt, stmt_info);
      set_vinfo_for_stmt (stmt, NULL);
      STMT_VINFO_STMT (stmt_info) = new_stmt;
      gsi = gsi_for_stmt (stmt);
      gsi_replace (&gsi, new_stmt, false);
      SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
    }
}

/* Generate vector code for all SLP instances in the loop/basic block.  */

bool
vect_schedule_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
{
  vec<slp_instance> slp_instances;
  slp_instance instance;
  unsigned int i, vf;
  bool is_store = false;

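  /* In the loop case the vectorization factor comes from the loop info;
     for basic-block vectorization there is no unrolling, so VF is 1.  */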
  if (loop_vinfo)
    {
      slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
      vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
    }
  else
    {
      slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
      vf = 1;
    }

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* Schedule the tree of INSTANCE.  */
      is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
                                             instance, vf);
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "vectorizing stmts using SLP.\n");
    }

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree root = SLP_INSTANCE_TREE (instance);
      gimple store;
      unsigned int j;
      gimple_stmt_iterator gsi;

      /* Remove scalar call stmts.  Do not do this for basic-block
         vectorization as not all uses may be vectorized.
         ???  Why should this be necessary?  DCE should be able to
         remove the stmts itself.
         ???  For BB vectorization we can as well remove scalar
         stmts starting from the SLP tree root if they have no
         uses.  */
      if (loop_vinfo)
        vect_remove_slp_scalar_calls (root);

      for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
                  && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
        {
          if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
            break;

          if (is_pattern_stmt_p (vinfo_for_stmt (store)))
            store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
          /* Free the attached stmt_vec_info and remove the stmt.  */
          gsi = gsi_for_stmt (store);
          unlink_stmt_vdef (store);
          gsi_remove (&gsi, true);
          release_defs (store);
          free_stmt_vec_info (store);
        }
    }

  return is_store;
}


/* Vectorize the basic block.  */

void
vect_slp_transform_bb (basic_block bb)
{
  bb_vec_info bb_vinfo = vec_info_for_bb (bb);
  gimple_stmt_iterator si;

  gcc_assert (bb_vinfo);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB\n");

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      gimple stmt = gsi_stmt (si);
      stmt_vec_info stmt_info;

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "------>SLPing statement: ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
          dump_printf (MSG_NOTE, "\n");
        }

      stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (stmt_info);

      /* Schedule all the SLP instances when the first SLP stmt is reached.  */
      if (STMT_SLP_TYPE (stmt_info))
        {
          vect_schedule_slp (NULL, bb_vinfo);
          break;
        }
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "BASIC BLOCK VECTORIZED\n");

  destroy_bb_vec_info (bb_vinfo);
}