/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2014 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "dumpfile.h"
#include "tm.h"
#include "tree.h"
#include "stor-layout.h"
#include "target.h"
#include "basic-block.h"
#include "gimple-pretty-print.h"
#include "tree-ssa-alias.h"
#include "internal-fn.h"
#include "gimple-expr.h"
#include "is-a.h"
#include "gimple.h"
#include "gimple-iterator.h"
#include "gimple-ssa.h"
#include "tree-phinodes.h"
#include "ssa-iterators.h"
#include "stringpool.h"
#include "tree-ssanames.h"
#include "tree-pass.h"
#include "cfgloop.h"
#include "expr.h"
#include "recog.h"              /* FIXME: for insn_data */
#include "optabs.h"
#include "tree-vectorizer.h"
#include "langhooks.h"

/* Extract the location of the basic block in the source code.
   Return the basic block location if it succeeds, and UNKNOWN_LOCATION
   if not.  */

source_location
find_bb_location (basic_block bb)
{
  gimple stmt = NULL;
  gimple_stmt_iterator si;

  if (!bb)
    return UNKNOWN_LOCATION;

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      stmt = gsi_stmt (si);
      if (gimple_location (stmt) != UNKNOWN_LOCATION)
        return gimple_location (stmt);
    }

  return UNKNOWN_LOCATION;
}


/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

static void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child);

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}


/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  SLP_INSTANCE_BODY_COST_VEC (instance).release ();
  free (instance);
}


/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<gimple> scalar_stmts)
{
  slp_tree node;
  gimple stmt = scalar_stmts[0];
  unsigned int nops;

  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
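        /* The comparison in the COND_EXPR condition contributes two
           operands of its own (see the first_op_cond handling in
           vect_get_and_check_slp_defs below).  */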
        nops++;
    }
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;

  return node;
}


/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->first_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}


/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}


/* Find the place of the data-ref in STMT in the interleaving chain that starts
   from FIRST_STMT.  Return -1 if the data-ref is not a part of the chain.  */
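/* E.g., in the chain of loads a[0], a[1], a[2] whose first element is
   the load of a[0], the load of a[2] is at place 2 (an illustrative
   example, not upstream text).  */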

static int
vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
{
  gimple next_stmt = first_stmt;
  int result = 0;

  if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    return -1;

  do
    {
      if (next_stmt == stmt)
        return result;
      result++;
      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
    }
  while (next_stmt);

  return -1;
}


/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  Return -1 on a fatal error, 1 on
   a mismatch that could be corrected by swapping operands of the operation,
   and 0 if everything is OK.  */
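/* E.g., in the commutative group

     S1: x1 = a1 + inv
     S2: x2 = inv + a2

   where INV is loop-invariant, operand 0 of S2 has a different def
   type than operand 0 of S1; since PLUS_EXPR is commutative the
   operands of S2 are swapped and the group still matches (an
   illustrative example, not upstream text).  */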

static int
vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                             gimple stmt, bool first,
                             vec<slp_oprnd_info> *oprnds_info)
{
  tree oprnd;
  unsigned int i, number_of_oprnds;
  tree def;
  gimple def_stmt;
  enum vect_def_type dt = vect_uninitialized_def;
  struct loop *loop = NULL;
  bool pattern = false;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  bool commutative = false;
  bool first_op_cond = false;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  if (is_gimple_call (stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
    }
  else if (is_gimple_assign (stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        {
          first_op_cond = true;
          commutative = true;
          number_of_oprnds++;
        }
      else
        commutative = commutative_tree_code (code);
    }
  else
    return -1;

  bool swapped = false;
  for (i = 0; i < number_of_oprnds; i++)
    {
again:
      if (first_op_cond)
        {
          if (i == 0 || i == 1)
            oprnd = TREE_OPERAND (gimple_op (stmt, first_op_idx),
                                  swapped ? !i : i);
          else
            oprnd = gimple_op (stmt, first_op_idx + i - 1);
        }
      else
        oprnd = gimple_op (stmt, first_op_idx + (swapped ? !i : i));

      oprnd_info = (*oprnds_info)[i];

      if (!vect_is_simple_use (oprnd, NULL, loop_vinfo, bb_vinfo, &def_stmt,
                               &def, &dt)
          || (!def_stmt && dt != vect_constant_def))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: can't find def for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }

      /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
         from the pattern.  Check that all the stmts of the node are in the
         pattern.  */
      if (def_stmt && gimple_bb (def_stmt)
          && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
              || (!loop && gimple_bb (def_stmt) == BB_VINFO_BB (bb_vinfo)
                  && gimple_code (def_stmt) != GIMPLE_PHI))
          && vinfo_for_stmt (def_stmt)
          && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
        {
          pattern = true;
          if (!first && !oprnd_info->first_pattern)
            {
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: some of the stmts"
                                   " are in a pattern, and others are not ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }

              return 1;
            }

          def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
          dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));

          if (dt == vect_unknown_def_type)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Unsupported pattern.\n");
              return -1;
            }

          switch (gimple_code (def_stmt))
            {
              case GIMPLE_PHI:
                def = gimple_phi_result (def_stmt);
                break;

              case GIMPLE_ASSIGN:
                def = gimple_assign_lhs (def_stmt);
                break;

              default:
                if (dump_enabled_p ())
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "unsupported defining stmt:\n");
                return -1;
            }
        }

      if (first)
        {
          oprnd_info->first_dt = dt;
          oprnd_info->first_pattern = pattern;
          oprnd_info->first_op_type = TREE_TYPE (oprnd);
        }
      else
        {
          /* Not first stmt of the group, check that the def-stmt/s match
             the def-stmt/s of the first stmt.  Allow different definition
             types for reduction chains: the first stmt must be a
             vect_reduction_def (a phi node), and the rest
             vect_internal_def.  */
          if (((oprnd_info->first_dt != dt
                && !(oprnd_info->first_dt == vect_reduction_def
                     && dt == vect_internal_def)
                && !((oprnd_info->first_dt == vect_external_def
                      || oprnd_info->first_dt == vect_constant_def)
                     && (dt == vect_external_def
                         || dt == vect_constant_def)))
               || !types_compatible_p (oprnd_info->first_op_type,
                                       TREE_TYPE (oprnd))))
            {
              /* Try swapping operands if we got a mismatch.  */
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types\n");

              return 1;
            }
        }

      /* Check the types of the definitions.  */
      switch (dt)
        {
        case vect_constant_def:
        case vect_external_def:
        case vect_reduction_def:
          break;

        case vect_internal_def:
          oprnd_info->def_stmts.quick_push (def_stmt);
          break;

        default:
          /* FORNOW: Not supported.  */
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: illegal type of def ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (first_op_cond)
        {
          tree cond = gimple_assign_rhs1 (stmt);
          swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
                             &TREE_OPERAND (cond, 1));
          TREE_SET_CODE (cond, swap_tree_comparison (TREE_CODE (cond)));
        }
      else
        swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
                           gimple_assign_rhs2_ptr (stmt));
    }

  return 0;
}


/* Verify that the scalar stmts STMTS are isomorphic, do not require any
   data permutation and are not of unsupported types of operation.  Return
   true if they are, otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0]
   is false then this indicates the comparison could not be
   carried out or the stmts will never be vectorized by SLP.  */
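/* E.g., the group

     S1: a[0] = b[0] + c[0];
     S2: a[1] = b[1] + c[1];

   is isomorphic (same operation on compatible types), while changing
   S2 to use MINUS_EXPR would clear MATCHES[1] (an illustrative
   example, not upstream text).  */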

static bool
vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                       vec<gimple> stmts, unsigned int group_size,
                       unsigned nops, unsigned int *max_nunits,
                       unsigned int vectorization_factor, bool *matches)
{
  unsigned int i;
  gimple stmt = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK, rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype, scalar_type, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  enum machine_mode optab_op2_mode;
  enum machine_mode vec_mode;
  struct data_reference *first_dr;
  HOST_WIDE_INT dummy;
  gimple first_load = NULL, prev_first_load = NULL, old_first_load = NULL;
  tree cond;

  /* For every stmt in NODE find its def stmt/s.  */
  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      matches[i] = false;

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
          dump_printf (MSG_NOTE, "\n");
        }

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unvectorizable statement ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: not GIMPLE_ASSIGN nor "
                               "GIMPLE_CALL ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      if (is_gimple_assign (stmt)
          && gimple_assign_rhs_code (stmt) == COND_EXPR
          && (cond = gimple_assign_rhs1 (stmt))
          && !COMPARISON_CLASS_P (cond))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: condition is not "
                               "comparison ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
      vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vectype)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unsupported data-type ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 scalar_type);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      /* In case of multiple types we need to detect the smallest type.  */
      if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
        {
          *max_nunits = TYPE_VECTOR_SUBPARTS (vectype);
          if (bb_vinfo)
            vectorization_factor = *max_nunits;
        }

      if (is_gimple_call (stmt))
        {
          rhs_code = CALL_EXPR;
          if (gimple_call_internal_p (stmt)
              || gimple_call_tail_p (stmt)
              || gimple_call_noreturn_p (stmt)
              || !gimple_call_nothrow_p (stmt)
              || gimple_call_chain (stmt))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: unsupported call type ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }
        }
      else
        rhs_code = gimple_assign_rhs_code (stmt);

      /* Check the operation.  */
      if (i == 0)
        {
          first_stmt_code = rhs_code;

          /* Shift arguments should be equal in all the packed stmts for a
             vector shift with scalar shift operand.  */
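          /* E.g., if the target only provides a vector/scalar shift,
             { a[0] << n, a[1] << n } is fine while { a[0] << n,
             a[1] << m } fails the NEED_SAME_OPRNDS check below (an
             illustrative example, not upstream text).  */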
          if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
              || rhs_code == LROTATE_EXPR
              || rhs_code == RROTATE_EXPR)
            {
              vec_mode = TYPE_MODE (vectype);

              /* First see if we have a vector/vector shift.  */
              optab = optab_for_tree_code (rhs_code, vectype,
                                           optab_vector);

              if (!optab
                  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
                {
                  /* No vector/vector shift, try for a vector/scalar shift.  */
                  optab = optab_for_tree_code (rhs_code, vectype,
                                               optab_scalar);

                  if (!optab)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: no optab.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  icode = (int) optab_handler (optab, vec_mode);
                  if (icode == CODE_FOR_nothing)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: "
                                         "op not supported by target.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  optab_op2_mode = insn_data[icode].operand[2].mode;
                  if (!VECTOR_MODE_P (optab_op2_mode))
                    {
                      need_same_oprnds = true;
                      first_op1 = gimple_assign_rhs2 (stmt);
                    }
                }
            }
          else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
        }
      else
        {
          if (first_stmt_code != rhs_code
              && (first_stmt_code != IMAGPART_EXPR
                  || rhs_code != REALPART_EXPR)
              && (first_stmt_code != REALPART_EXPR
                  || rhs_code != IMAGPART_EXPR)
              && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
                   && (first_stmt_code == ARRAY_REF
                       || first_stmt_code == BIT_FIELD_REF
                       || first_stmt_code == INDIRECT_REF
                       || first_stmt_code == COMPONENT_REF
                       || first_stmt_code == MEM_REF)))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different operation "
                                   "in stmt ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Mismatch.  */
              continue;
            }

          if (need_same_oprnds
              && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different shift "
                                   "arguments in ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Mismatch.  */
              continue;
            }

          if (rhs_code == CALL_EXPR)
            {
              gimple first_stmt = stmts[0];
              if (gimple_call_num_args (stmt) != nops
                  || !operand_equal_p (gimple_call_fn (first_stmt),
                                       gimple_call_fn (stmt), 0)
                  || gimple_call_fntype (first_stmt)
                     != gimple_call_fntype (stmt))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different calls in ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
        {
          if (REFERENCE_CLASS_P (lhs))
            {
              /* Store.  */
              ;
            }
          else
            {
              /* Load.  */
              unsigned unrolling_factor
                = least_common_multiple
                    (*max_nunits, group_size) / group_size;
              /* FORNOW: Check that there is no gap between the loads
                 and no gap between the groups when we need to load
                 multiple groups at once.
                 ???  We should enhance this to only disallow gaps
                 inside vectors.  */
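              /* E.g., in the chain a[0], a[1], a[2] a node whose loads
                 are a[0] and a[2] leaves a gap at a[1] and is rejected
                 (an illustrative example, not upstream text).  */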
              if ((unrolling_factor > 1
                   && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
                   && GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
                  || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt
                      && GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: grouped "
                                       "loads have gaps ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Fatal mismatch.  */
                  matches[0] = false;
                  return false;
                }

              /* Check that the size of interleaved loads group is not
                 greater than the SLP group size.  */
              unsigned ncopies
                = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
              if (loop_vinfo
                  && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
                  && ((GROUP_SIZE (vinfo_for_stmt (stmt))
                       - GROUP_GAP (vinfo_for_stmt (stmt)))
                      > ncopies * group_size))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: the number "
                                       "of interleaved loads is greater than "
                                       "the SLP group size ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Fatal mismatch.  */
                  matches[0] = false;
                  return false;
                }

              old_first_load = first_load;
              first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  */
                  if (prev_first_load != first_load)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "Build SLP failed: different "
                                           "interleaving chains in one node ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            stmt, 0);
                          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                        }
                      /* Mismatch.  */
                      continue;
                    }
                }
              else
                prev_first_load = first_load;

              /* In some cases a group of loads is just the same load
                 repeated N times.  Only analyze its cost once.  */
              if (first_load == stmt && old_first_load != first_load)
                {
                  first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
                  if (vect_supportable_dr_alignment (first_dr, false)
                      == dr_unaligned_unsupported)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "Build SLP failed: unsupported "
                                           "unaligned load ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            stmt, 0);
                          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                        }
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                }
            }
        } /* Grouped access.  */
      else
        {
          if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
            {
              /* Not a grouped load.  */
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: not grouped load ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }

              /* FORNOW: Non-grouped loads are not supported.  */
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          /* Not a memory operation.  */
          if (TREE_CODE_CLASS (rhs_code) != tcc_binary
              && TREE_CODE_CLASS (rhs_code) != tcc_unary
              && rhs_code != COND_EXPR
              && rhs_code != CALL_EXPR)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: operation");
                  dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);

              if (i == 0)
                first_cond_code = TREE_CODE (cond_expr);
              else if (first_cond_code != TREE_CODE (cond_expr))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different"
                                       " operation");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  return true;
}

/* Recursively build an SLP tree starting from NODE.
   Fail (and return FALSE) if def-stmts are not isomorphic, require data
   permutation or are of unsupported types of operation.  Otherwise,
   return TRUE.  */

static bool
vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                     slp_tree *node, unsigned int group_size,
                     unsigned int *max_nunits,
                     vec<slp_tree> *loads,
                     unsigned int vectorization_factor,
                     bool *matches, unsigned *npermutes, unsigned *tree_size,
                     unsigned max_tree_size)
{
  unsigned nops, i, this_npermutes = 0, this_tree_size = 0;
  gimple stmt;

  if (!matches)
    matches = XALLOCAVEC (bool, group_size);
  if (!npermutes)
    npermutes = &this_npermutes;

  matches[0] = false;

  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];
  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else
    return false;

  if (!vect_build_slp_tree_1 (loop_vinfo, bb_vinfo,
                              SLP_TREE_SCALAR_STMTS (*node), group_size, nops,
                              max_nunits, vectorization_factor, matches))
    return false;

  /* If the SLP node is a load, terminate the recursion.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
      && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
    {
      loads->safe_push (*node);
      return true;
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (*node), i, stmt)
    {
      switch (vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo,
                                           stmt, (i == 0), &oprnds_info))
        {
        case 0:
          break;
        case -1:
          matches[0] = false;
          vect_free_oprnd_info (oprnds_info);
          return false;
        case 1:
          matches[i] = false;
          break;
        }
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
        vect_free_oprnd_info (oprnds_info);
        return false;
      }

  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_nloads = loads->length ();
      unsigned old_max_nunits = *max_nunits;

      if (oprnd_info->first_dt != vect_internal_def)
        continue;

      if (++this_tree_size > max_tree_size)
        {
          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      child = vect_create_new_slp_node (oprnd_info->def_stmts);
      if (!child)
        {
          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      bool *matches = XALLOCAVEC (bool, group_size);
      if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
                               group_size, max_nunits, loads,
                               vectorization_factor, matches,
                               npermutes, &this_tree_size, max_tree_size))
        {
          oprnd_info->def_stmts = vNULL;
          SLP_TREE_CHILDREN (*node).quick_push (child);
          continue;
        }

      /* If the SLP build for operand zero failed and operand zero
         and one can be commuted, try that for the scalar stmts
         that failed the match.  */
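      /* E.g., for the group { x = (a * b) + c, y = c + (a * b) } both
         operands have internal defs, so the mismatch only shows up when
         the child for operand zero ({ a * b, c }) fails to build;
         swapping the two defs for the mismatched stmt and re-trying can
         still succeed (an illustrative example, not upstream text).  */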
      if (i == 0
          /* A first scalar stmt mismatch signals a fatal mismatch.  */
          && matches[0]
          /* ???  For COND_EXPRs we can swap the comparison operands
             as well as the arms under some constraints.  */
          && nops == 2
          && oprnds_info[1]->first_dt == vect_internal_def
          && is_gimple_assign (stmt)
          && commutative_tree_code (gimple_assign_rhs_code (stmt))
          /* Do so only if the number of not successful permutes was
             not more than a cut-off, as re-trying the recursive match
             on possibly each level of the tree would expose exponential
             behavior.  */
          && *npermutes < 4)
        {
          /* Roll back.  */
          *max_nunits = old_max_nunits;
          loads->truncate (old_nloads);
          /* Swap mismatched definition stmts.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Re-trying with swapped operands of stmts ");
          for (unsigned j = 0; j < group_size; ++j)
            if (!matches[j])
              {
                gimple tem = oprnds_info[0]->def_stmts[j];
                oprnds_info[0]->def_stmts[j] = oprnds_info[1]->def_stmts[j];
                oprnds_info[1]->def_stmts[j] = tem;
                if (dump_enabled_p ())
                  dump_printf (MSG_NOTE, "%d ", j);
              }
          if (dump_enabled_p ())
            dump_printf (MSG_NOTE, "\n");
          /* And try again ... */
          if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
                                   group_size, max_nunits, loads,
                                   vectorization_factor,
                                   matches, npermutes, &this_tree_size,
                                   max_tree_size))
            {
              oprnd_info->def_stmts = vNULL;
              SLP_TREE_CHILDREN (*node).quick_push (child);
              continue;
            }

          ++*npermutes;
        }

      oprnd_info->def_stmts = vNULL;
      vect_free_slp_tree (child);
      vect_free_oprnd_info (oprnds_info);
      return false;
    }

  if (tree_size)
    *tree_size += this_tree_size;

  vect_free_oprnd_info (oprnds_info);
  return true;
}

/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (int dump_kind, slp_tree node)
{
  int i;
  gimple stmt;
  slp_tree child;

  if (!node)
    return;

  dump_printf (dump_kind, "node ");
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      dump_printf (dump_kind, "\n\tstmt %d ", i);
      dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
    }
  dump_printf (dump_kind, "\n");

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, child);
}


/* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
   If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
   J).  Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
   stmts in NODE are to be marked.  */

static void
vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
{
  int i;
  gimple stmt;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    if (j < 0 || i == j)
      STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, mark, j);
}


/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  int i;
  gimple stmt;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
                  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child);
}


/* Rearrange the statements of NODE according to PERMUTATION.  */

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
                          vec<unsigned> permutation)
{
  gimple stmt;
  vec<gimple> tmp_stmts;
  unsigned int i;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation);

  gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
  tmp_stmts.create (group_size);
  tmp_stmts.quick_grow_cleared (group_size);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    tmp_stmts[permutation[i]] = stmt;

  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
}


/* Check if the required load permutations in the SLP instance
   SLP_INSTN are supported.  */

static bool
vect_supported_load_permutation_p (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j, k, next;
  sbitmap load_index;
  slp_tree node;
  gimple stmt, load, next_load, first_load;
  struct data_reference *dr;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        if (node->load_permutation.exists ())
          FOR_EACH_VEC_ELT (node->load_permutation, j, next)
            dump_printf (MSG_NOTE, "%d ", next);
        else
          for (k = 0; k < group_size; ++k)
            dump_printf (MSG_NOTE, "%d ", k);
      dump_printf (MSG_NOTE, "\n");
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this
     permutation).  */
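  /* E.g., if every load node of the instance carries the permutation
     { 1, 0 }, the scalar stmts of all nodes are rearranged below to
     that order and no runtime permutation remains (an illustrative
     example, not upstream text).  */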

  /* Check that all the load nodes are of the same size.  */
  /* ???  Can't we assert this? */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
      return false;

  node = SLP_INSTANCE_TREE (slp_instn);
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];

  /* Reduction (there are no data-refs in the root).
     In reduction chain the order of the loads is important.  */
  if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
      && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      slp_tree load;
      unsigned int lidx;

      /* Compare all the permutation sequences to the first one.  We know
         that at least one load is permuted.  */
      node = SLP_INSTANCE_LOADS (slp_instn)[0];
      if (!node->load_permutation.exists ())
        return false;
      for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
        {
          if (!load->load_permutation.exists ())
            return false;
          FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
            if (lidx != node->load_permutation[j])
              return false;
        }

      /* Check that the loads in the first sequence are different and there
         are no gaps between them.  */
      load_index = sbitmap_alloc (group_size);
      bitmap_clear (load_index);
      FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
        {
          if (bitmap_bit_p (load_index, lidx))
            {
              sbitmap_free (load_index);
              return false;
            }
          bitmap_set_bit (load_index, lidx);
        }
      for (i = 0; i < group_size; i++)
        if (!bitmap_bit_p (load_index, i))
          {
            sbitmap_free (load_index);
            return false;
          }
      sbitmap_free (load_index);

      /* This permutation is valid for reduction.  Since the order of the
         statements in the nodes is not important unless they are memory
         accesses, we can rearrange the statements in all the nodes
         according to the order of the loads.  */
      vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
                                node->load_permutation);

      /* We are done, no actual permutations need to be generated.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        SLP_TREE_LOAD_PERMUTATION (node).release ();
      return true;
    }

  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment complications.  */
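  /* E.g., for the interleaving chain a[0], a[1], a[2], a[3] a node
     loading { a[2], a[3] } forms a valid subchain, whereas
     { a[0], a[2] } does not (an illustrative example, not upstream
     text).  */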
  if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
    {
      /* Check that for every node in the instance the loads
         form a subchain.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          next_load = NULL;
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
            {
              if (j != 0 && next_load != load)
                return false;
              next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
            }
        }

      /* Check that the alignment of the first load in every subchain, i.e.,
         the first statement in every load node, is supported.
         ???  This belongs in alignment checking.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          first_load = SLP_TREE_SCALAR_STMTS (node)[0];
          if (first_load != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load)))
            {
              dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load));
              if (vect_supportable_dr_alignment (dr, false)
                  == dr_unaligned_unsupported)
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                       vect_location,
                                       "unsupported unaligned load ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        first_load, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  return false;
                }
            }
        }

      /* We are done, no actual permutations need to be generated.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        SLP_TREE_LOAD_PERMUTATION (node).release ();
      return true;
    }

  /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
     GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
     well (unless it's a reduction).  */
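  /* E.g., with GROUP_SIZE == 2, one load node with permutation { 0, 0 }
     and another with { 1, 1 } are accepted, while a node with { 0, 1 }
     is rejected by the checks below (an illustrative example, not
     upstream text).  */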
  if (SLP_INSTANCE_LOADS (slp_instn).length () != group_size)
    return false;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (!node->load_permutation.exists ())
      return false;

  load_index = sbitmap_alloc (group_size);
  bitmap_clear (load_index);
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      unsigned int lidx = node->load_permutation[0];
      if (bitmap_bit_p (load_index, lidx))
        {
          sbitmap_free (load_index);
          return false;
        }
      bitmap_set_bit (load_index, lidx);
      FOR_EACH_VEC_ELT (node->load_permutation, j, k)
        if (k != lidx)
          {
            sbitmap_free (load_index);
            return false;
          }
    }
  for (i = 0; i < group_size; i++)
    if (!bitmap_bit_p (load_index, i))
      {
        sbitmap_free (load_index);
        return false;
      }
  sbitmap_free (load_index);

  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (node->load_permutation.exists ()
        && !vect_transform_slp_perm_load
              (node, vNULL, NULL,
               SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true))
      return false;
  return true;
}


/* Find the first load in the loop that belongs to INSTANCE.
   When loads are in several SLP nodes, there can be a case in which the first
   load does not appear in the first SLP node to be transformed, causing
   incorrect order of statements.  Since we generate all the loads together,
   they must be inserted before the first load of the SLP instance and not
   before the first load of the first node of the instance.  */

static gimple
vect_find_first_load_in_slp_instance (slp_instance instance)
{
  int i, j;
  slp_tree load_node;
  gimple first_load = NULL, load;

  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
      first_load = get_earlier_stmt (load, first_load);

  return first_load;
}


/* Find the last store in SLP INSTANCE.  */

static gimple
vect_find_last_store_in_slp_instance (slp_instance instance)
{
  int i;
  slp_tree node;
  gimple last_store = NULL, store;

  node = SLP_INSTANCE_TREE (instance);
  for (i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &store); i++)
    last_store = get_later_stmt (store, last_store);

  return last_store;
}

/* Compute the cost for the SLP node NODE in the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                         slp_instance instance, slp_tree node,
                         stmt_vector_for_cost *prologue_cost_vec,
                         unsigned ncopies_for_cost)
{
  stmt_vector_for_cost *body_cost_vec = &SLP_INSTANCE_BODY_COST_VEC (instance);

  unsigned i;
  slp_tree child;
  gimple stmt, s;
  stmt_vec_info stmt_info;
  tree lhs;
  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  /* Recurse down the SLP tree.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_analyze_slp_cost_1 (loop_vinfo, bb_vinfo,
                             instance, child, prologue_cost_vec,
                             ncopies_for_cost);

  /* Look at the first scalar stmt to determine the cost.  */
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
        vect_model_store_cost (stmt_info, ncopies_for_cost, false,
                               vect_uninitialized_def,
                               node, prologue_cost_vec, body_cost_vec);
      else
        {
          int i;
          gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
          vect_model_load_cost (stmt_info, ncopies_for_cost, false,
                                node, prologue_cost_vec, body_cost_vec);
          /* If the load is permuted record the cost for the permutation.
             ???  Loads from multiple chains are let through here only
             for a single special case involving complex numbers where
             in the end no permutation is necessary.  */
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, s)
            if ((STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo_for_stmt (s))
                 == STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info))
                && vect_get_place_in_interleaving_chain
                     (s, STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info)) != i)
              {
                record_stmt_cost (body_cost_vec, group_size, vec_perm,
                                  stmt_info, 0, vect_body);
                break;
              }
        }
    }
  else
    record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
                      stmt_info, 0, vect_body);

  /* Scan operands and account for prologue cost of constants/externals.
     ???  This over-estimates cost for multiple uses and should be
     re-engineered.  */
  lhs = gimple_get_lhs (stmt);
  for (i = 0; i < gimple_num_ops (stmt); ++i)
    {
      tree def, op = gimple_op (stmt, i);
      gimple def_stmt;
      enum vect_def_type dt;
      if (!op || op == lhs)
        continue;
      if (vect_is_simple_use (op, NULL, loop_vinfo, bb_vinfo,
                              &def_stmt, &def, &dt)
          && (dt == vect_constant_def || dt == vect_external_def))
        record_stmt_cost (prologue_cost_vec, 1, vector_stmt,
                          stmt_info, 0, vect_prologue);
    }
}

/* Compute the cost for the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                       slp_instance instance, unsigned nunits)
{
  stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
  unsigned ncopies_for_cost;
  stmt_info_for_cost *si;
  unsigned i;

  /* Calculate the number of vector stmts to create based on the unrolling
     factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
     GROUP_SIZE / NUNITS otherwise).  */
1477   unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
1478   ncopies_for_cost = least_common_multiple (nunits, group_size) / nunits;
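  /* For example (numbers for illustration only): GROUP_SIZE 6 and NUNITS 4
     give lcm (4, 6) / 4 = 3 vector stmts, since the group must be unrolled
     twice to fill whole vectors; GROUP_SIZE 8 and NUNITS 4 simply give
     8 / 4 = 2.  */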
1479
1480   prologue_cost_vec.create (10);
1481   body_cost_vec.create (10);
1482   SLP_INSTANCE_BODY_COST_VEC (instance) = body_cost_vec;
1483   vect_analyze_slp_cost_1 (loop_vinfo, bb_vinfo,
1484                            instance, SLP_INSTANCE_TREE (instance),
1485                            &prologue_cost_vec, ncopies_for_cost);
1486
1487   /* Record the prologue costs, which were delayed until we were
1488      sure that SLP was successful.  Unlike the body costs, we know
1489      the final values now regardless of the loop vectorization factor.  */
1490   void *data = (loop_vinfo ? LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
1491                 : BB_VINFO_TARGET_COST_DATA (bb_vinfo));
1492   FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
1493     {
1494       struct _stmt_vec_info *stmt_info
1495         = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1496       (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1497                             si->misalign, vect_prologue);
1498     }
1499
1500   prologue_cost_vec.release ();
1501 }
1502
1503 /* Analyze an SLP instance starting from a group of grouped stores.  Call
1504    vect_build_slp_tree to build a tree of packed stmts if possible.
1505    Return FALSE if it's impossible to SLP any stmt in the loop.  */
1506
1507 static bool
1508 vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
1509                            gimple stmt, unsigned max_tree_size)
1510 {
1511   slp_instance new_instance;
1512   slp_tree node;
1513   unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
1514   unsigned int unrolling_factor = 1, nunits;
1515   tree vectype, scalar_type = NULL_TREE;
1516   gimple next;
1517   unsigned int vectorization_factor = 0;
1518   int i;
1519   unsigned int max_nunits = 0;
1520   vec<slp_tree> loads;
1521   struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
1522   vec<gimple> scalar_stmts;
1523
1524   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1525     {
1526       if (dr)
1527         {
1528           scalar_type = TREE_TYPE (DR_REF (dr));
1529           vectype = get_vectype_for_scalar_type (scalar_type);
1530         }
1531       else
1532         {
1533           gcc_assert (loop_vinfo);
1534           vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1535         }
1536
1537       group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
1538     }
1539   else
1540     {
1541       gcc_assert (loop_vinfo);
1542       vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1543       group_size = LOOP_VINFO_REDUCTIONS (loop_vinfo).length ();
1544     }
1545
1546   if (!vectype)
1547     {
1548       if (dump_enabled_p ())
1549         {
1550           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1551                            "Build SLP failed: unsupported data-type ");
1552           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
1553           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1554         }
1555
1556       return false;
1557     }
1558
1559   nunits = TYPE_VECTOR_SUBPARTS (vectype);
1560   if (loop_vinfo)
1561     vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1562   else
1563     vectorization_factor = nunits;
1564
1565   /* Calculate the unrolling factor.  */
1566   unrolling_factor = least_common_multiple (nunits, group_size) / group_size;
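  /* E.g. (illustration only): GROUP_SIZE 2 and NUNITS 4 give
     lcm (4, 2) / 2 = 2, i.e. two instances of the group are needed to fill
     a vector, which is why basic-block SLP is rejected just below.  */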
1567   if (unrolling_factor != 1 && !loop_vinfo)
1568     {
1569       if (dump_enabled_p ())
1570         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1571                          "Build SLP failed: unrolling required in basic"
1572                          " block SLP\n");
1573
1574       return false;
1575     }
1576
1577   /* Create a node (a root of the SLP tree) for the packed grouped stores.  */
1578   scalar_stmts.create (group_size);
1579   next = stmt;
1580   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1581     {
1582       /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
1583       while (next)
1584         {
1585           if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
1586               && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
1587             scalar_stmts.safe_push (
1588                   STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
1589           else
1590             scalar_stmts.safe_push (next);
1591           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
1592         }
1593     }
1594   else
1595     {
1596       /* Collect reduction statements.  */
1597       vec<gimple> reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
1598       for (i = 0; reductions.iterate (i, &next); i++)
1599         scalar_stmts.safe_push (next);
1600     }
1601
1602   node = vect_create_new_slp_node (scalar_stmts);
1603
1604   loads.create (group_size);
1605
1606   /* Build the tree for the SLP instance.  */
1607   if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &node, group_size,
1608                            &max_nunits, &loads,
1609                            vectorization_factor, NULL, NULL, NULL,
1610                            max_tree_size))
1611     {
1612       /* Calculate the unrolling factor based on the smallest type.  */
1613       if (max_nunits > nunits)
1614         unrolling_factor = least_common_multiple (max_nunits, group_size)
1615                            / group_size;
1616
1617       if (unrolling_factor != 1 && !loop_vinfo)
1618         {
1619           if (dump_enabled_p ())
1620             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1621                              "Build SLP failed: unrolling required in basic"
1622                              " block SLP\n");
1623           vect_free_slp_tree (node);
1624           loads.release ();
1625           return false;
1626         }
1627
1628       /* Create a new SLP instance.  */
1629       new_instance = XNEW (struct _slp_instance);
1630       SLP_INSTANCE_TREE (new_instance) = node;
1631       SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
1632       SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
1633       SLP_INSTANCE_BODY_COST_VEC (new_instance) = vNULL;
1634       SLP_INSTANCE_LOADS (new_instance) = loads;
1635       SLP_INSTANCE_FIRST_LOAD_STMT (new_instance) = NULL;
1636
1637       /* Compute the load permutation.  */
1638       slp_tree load_node;
1639       bool loads_permuted = false;
1640       FOR_EACH_VEC_ELT (loads, i, load_node)
1641         {
1642           vec<unsigned> load_permutation;
1643           int j;
1644           gimple load, first_stmt;
1645           bool this_load_permuted = false;
1646           load_permutation.create (group_size);
1647           first_stmt = GROUP_FIRST_ELEMENT
1648               (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
1649           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
1650             {
1651               int load_place
1652                 = vect_get_place_in_interleaving_chain (load, first_stmt);
1653               gcc_assert (load_place != -1);
1654               if (load_place != j)
1655                 this_load_permuted = true;
1656               load_permutation.safe_push (load_place);
1657             }
1658           if (!this_load_permuted)
1659             {
1660               load_permutation.release ();
1661               continue;
1662             }
1663           SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
1664           loads_permuted = true;
1665         }
1666
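      /* As an illustration (not from a real testcase): if the group is
         a[0], a[1] but this node loads them in the order a[1], a[0], the
         computed load_permutation is { 1, 0 } and LOADS_PERMUTED is set.  */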
1667       if (loads_permuted)
1668         {
1669           if (!vect_supported_load_permutation_p (new_instance))
1670             {
1671               if (dump_enabled_p ())
1672                 {
1673                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1674                                    "Build SLP failed: unsupported load "
1675                                    "permutation ");
1676                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
1677                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1678                 }
1679               vect_free_slp_instance (new_instance);
1680               return false;
1681             }
1682
1683           SLP_INSTANCE_FIRST_LOAD_STMT (new_instance)
1684             = vect_find_first_load_in_slp_instance (new_instance);
1685         }
1686
1687       /* Compute the costs of this SLP instance.  */
1688       vect_analyze_slp_cost (loop_vinfo, bb_vinfo,
1689                              new_instance, TYPE_VECTOR_SUBPARTS (vectype));
1690
1691       if (loop_vinfo)
1692         LOOP_VINFO_SLP_INSTANCES (loop_vinfo).safe_push (new_instance);
1693       else
1694         BB_VINFO_SLP_INSTANCES (bb_vinfo).safe_push (new_instance);
1695
1696       if (dump_enabled_p ())
1697         vect_print_slp_tree (MSG_NOTE, node);
1698
1699       return true;
1700     }
1701
1702   /* Failed to SLP.  */
1703   /* Free the allocated memory.  */
1704   vect_free_slp_tree (node);
1705   loads.release ();
1706
1707   return false;
1708 }
1709
1710
1711 /* Check if there are stmts in the loop that can be vectorized using SLP.
1712    Build SLP trees of packed scalar stmts if SLP is possible.  */
1713
1714 bool
1715 vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
1716                   unsigned max_tree_size)
1717 {
1718   unsigned int i;
1719   vec<gimple> grouped_stores;
1720   vec<gimple> reductions = vNULL;
1721   vec<gimple> reduc_chains = vNULL;
1722   gimple first_element;
1723   bool ok = false;
1724
1725   if (dump_enabled_p ())
1726     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===\n");
1727
1728   if (loop_vinfo)
1729     {
1730       grouped_stores = LOOP_VINFO_GROUPED_STORES (loop_vinfo);
1731       reduc_chains = LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo);
1732       reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
1733     }
1734   else
1735     grouped_stores = BB_VINFO_GROUPED_STORES (bb_vinfo);
1736
1737   /* Find SLP sequences starting from groups of grouped stores.  */
1738   FOR_EACH_VEC_ELT (grouped_stores, i, first_element)
1739     if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
1740                                    max_tree_size))
1741       ok = true;
1742
1743   if (bb_vinfo && !ok)
1744     {
1745       if (dump_enabled_p ())
1746         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1747                          "Failed to SLP the basic block.\n");
1748
1749       return false;
1750     }
1751
1752   if (loop_vinfo
1753       && LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).length () > 0)
1754     {
1755       /* Find SLP sequences starting from reduction chains.  */
1756       FOR_EACH_VEC_ELT (reduc_chains, i, first_element)
1757         if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
1758                                        max_tree_size))
1759           ok = true;
1760         else
1761           return false;
1762
1763       /* Don't try to vectorize SLP reductions if reduction chain was
1764          detected.  */
1765       return ok;
1766     }
1767
1768   /* Find SLP sequences starting from groups of reductions.  */
1769   if (loop_vinfo && LOOP_VINFO_REDUCTIONS (loop_vinfo).length () > 1
1770       && vect_analyze_slp_instance (loop_vinfo, bb_vinfo, reductions[0],
1771                                     max_tree_size))
1772     ok = true;
1773
1774   return true;
1775 }
1776
1777
1778 /* For each possible SLP instance decide whether to SLP it and calculate the
1779    overall unrolling factor needed to SLP the loop.  Return TRUE if we decided
1780    to SLP at least one instance.  */
1781
1782 bool
1783 vect_make_slp_decision (loop_vec_info loop_vinfo)
1784 {
1785   unsigned int i, unrolling_factor = 1;
1786   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1787   slp_instance instance;
1788   int decided_to_slp = 0;
1789
1790   if (dump_enabled_p ())
1791     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ==="
1792                      "\n");
1793
1794   FOR_EACH_VEC_ELT (slp_instances, i, instance)
1795     {
1796       /* FORNOW: SLP if you can.  */
1797       if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance))
1798         unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance);
1799
1800       /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
1801          call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
1802          loop-based vectorization.  Such stmts will be marked as HYBRID.  */
1803       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
1804       decided_to_slp++;
1805     }
1806
1807   LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
1808
1809   if (decided_to_slp && dump_enabled_p ())
1810     dump_printf_loc (MSG_NOTE, vect_location,
1811                      "Decided to SLP %d instances. Unrolling factor %d\n",
1812                      decided_to_slp, unrolling_factor);
1813
1814   return (decided_to_slp > 0);
1815 }
1816
1817
1818 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
1819    can't be SLPed) in the tree rooted at NODE.  Mark such stmts as HYBRID.  */
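
/* For example (a sketch, not from a real testcase): if a pure-SLP stmt
   computes t_1 = a[i] + b[i] and t_1 is also used by a non-SLP reduction
   stmt sum_2 = sum_1 + t_1 in the same loop, the stmts computing t_1 must
   be vectorized both ways and are therefore marked HYBRID.  */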
1820
1821 static void
1822 vect_detect_hybrid_slp_stmts (slp_tree node)
1823 {
1824   int i;
1825   vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (node);
1826   gimple stmt = stmts[0];
1827   imm_use_iterator imm_iter;
1828   gimple use_stmt;
1829   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1830   slp_tree child;
1831   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1832   struct loop *loop = NULL;
1833   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_vinfo);
1834   basic_block bb = NULL;
1835
1836   if (!node)
1837     return;
1838
1839   if (loop_vinfo)
1840     loop = LOOP_VINFO_LOOP (loop_vinfo);
1841   else
1842     bb = BB_VINFO_BB (bb_vinfo);
1843
1844   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1845     if (PURE_SLP_STMT (vinfo_for_stmt (stmt))
1846         && TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME)
1847       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0))
1848         if (gimple_bb (use_stmt)
1849             && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1850                  || bb == gimple_bb (use_stmt))
1851             && (stmt_vinfo = vinfo_for_stmt (use_stmt))
1852             && !STMT_SLP_TYPE (stmt_vinfo)
1853             && (STMT_VINFO_RELEVANT (stmt_vinfo)
1854                 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo))
1855                 || (STMT_VINFO_IN_PATTERN_P (stmt_vinfo)
1856                     && STMT_VINFO_RELATED_STMT (stmt_vinfo)
1857                     && !STMT_SLP_TYPE (vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_vinfo)))))
1858             && !(gimple_code (use_stmt) == GIMPLE_PHI
1859                  && STMT_VINFO_DEF_TYPE (stmt_vinfo)
1860                   == vect_reduction_def))
1861           vect_mark_slp_stmts (node, hybrid, i);
1862
1863   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1864     vect_detect_hybrid_slp_stmts (child);
1865 }
1866
1867
1868 /* Find stmts that must be both vectorized and SLPed.  */
1869
1870 void
1871 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
1872 {
1873   unsigned int i;
1874   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1875   slp_instance instance;
1876
1877   if (dump_enabled_p ())
1878     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ==="
1879                      "\n");
1880
1881   FOR_EACH_VEC_ELT (slp_instances, i, instance)
1882     vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance));
1883 }
1884
1885
1886 /* Create and initialize a new bb_vec_info struct for BB, as well as
1887    stmt_vec_info structs for all the stmts in it.  */
1888
1889 static bb_vec_info
1890 new_bb_vec_info (basic_block bb)
1891 {
1892   bb_vec_info res = NULL;
1893   gimple_stmt_iterator gsi;
1894
1895   res = (bb_vec_info) xcalloc (1, sizeof (struct _bb_vec_info));
1896   BB_VINFO_BB (res) = bb;
1897
1898   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1899     {
1900       gimple stmt = gsi_stmt (gsi);
1901       gimple_set_uid (stmt, 0);
1902       set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, NULL, res));
1903     }
1904
1905   BB_VINFO_GROUPED_STORES (res).create (10);
1906   BB_VINFO_SLP_INSTANCES (res).create (2);
1907   BB_VINFO_TARGET_COST_DATA (res) = init_cost (NULL);
1908
1909   bb->aux = res;
1910   return res;
1911 }
1912
1913
1914 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
1915    stmts in the basic block.  */
1916
1917 static void
1918 destroy_bb_vec_info (bb_vec_info bb_vinfo)
1919 {
1920   vec<slp_instance> slp_instances;
1921   slp_instance instance;
1922   basic_block bb;
1923   gimple_stmt_iterator si;
1924   unsigned i;
1925
1926   if (!bb_vinfo)
1927     return;
1928
1929   bb = BB_VINFO_BB (bb_vinfo);
1930
1931   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1932     {
1933       gimple stmt = gsi_stmt (si);
1934       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1935
1936       if (stmt_info)
1937         /* Free stmt_vec_info.  */
1938         free_stmt_vec_info (stmt);
1939     }
1940
1941   vect_destroy_datarefs (NULL, bb_vinfo);
1942   free_dependence_relations (BB_VINFO_DDRS (bb_vinfo));
1943   BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
1944   slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
1945   FOR_EACH_VEC_ELT (slp_instances, i, instance)
1946     vect_free_slp_instance (instance);
1947   BB_VINFO_SLP_INSTANCES (bb_vinfo).release ();
1948   destroy_cost_data (BB_VINFO_TARGET_COST_DATA (bb_vinfo));
1949   free (bb_vinfo);
1950   bb->aux = NULL;
1951 }
1952
1953
1954 /* Analyze statements contained in the SLP tree NODE after recursively
1955    analyzing the subtree.  Return TRUE if the operations are supported.  */
1956
1957 static bool
1958 vect_slp_analyze_node_operations (bb_vec_info bb_vinfo, slp_tree node)
1959 {
1960   bool dummy;
1961   int i;
1962   gimple stmt;
1963   slp_tree child;
1964
1965   if (!node)
1966     return true;
1967
1968   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1969     if (!vect_slp_analyze_node_operations (bb_vinfo, child))
1970       return false;
1971
1972   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1973     {
1974       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1975       gcc_assert (stmt_info);
1976       gcc_assert (PURE_SLP_STMT (stmt_info));
1977
1978       if (!vect_analyze_stmt (stmt, &dummy, node))
1979         return false;
1980     }
1981
1982   return true;
1983 }
1984
1985
1986 /* Analyze statements in SLP instances of the basic block.  Return TRUE if the
1987    operations are supported.  */
1988
1989 static bool
1990 vect_slp_analyze_operations (bb_vec_info bb_vinfo)
1991 {
1992   vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
1993   slp_instance instance;
1994   int i;
1995
1996   for (i = 0; slp_instances.iterate (i, &instance); )
1997     {
1998       if (!vect_slp_analyze_node_operations (bb_vinfo,
1999                                              SLP_INSTANCE_TREE (instance)))
2000         {
2001           vect_free_slp_instance (instance);
2002           slp_instances.ordered_remove (i);
2003         }
2004       else
2005         i++;
2006     }
2007
2008   if (!slp_instances.length ())
2009     return false;
2010
2011   return true;
2012 }
2013
2014
2015 /* Compute the scalar cost of the SLP node NODE and its children
2016    and return it.  Do not account for defs that are marked in LIFE, and
2017    update LIFE according to the uses of NODE.  */
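
/* For instance (illustrative only): if a def computed inside the SLP node is
   also used by a stmt outside the basic block or by a PHI, the scalar stmt
   must be kept live; it is then recorded in LIFE and its cost is not
   counted, which makes vectorization look relatively more expensive.  */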
2018
2019 static unsigned
2020 vect_bb_slp_scalar_cost (basic_block bb,
2021                          slp_tree node, vec<bool, va_heap> *life)
2022 {
2023   unsigned scalar_cost = 0;
2024   unsigned i;
2025   gimple stmt;
2026   slp_tree child;
2027
2028   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2029     {
2030       unsigned stmt_cost;
2031       ssa_op_iter op_iter;
2032       def_operand_p def_p;
2033       stmt_vec_info stmt_info;
2034
2035       if ((*life)[i])
2036         continue;
2037
2038       /* If there is a non-vectorized use of the defs then the scalar
2039          stmt is kept live in which case we do not account it or any
2040          required defs in the SLP children in the scalar cost.  This
2041          way we make the vectorization more costly when compared to
2042          the scalar cost.  */
2043       FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2044         {
2045           imm_use_iterator use_iter;
2046           gimple use_stmt;
2047           FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2048             if (!is_gimple_debug (use_stmt)
2049                 && (gimple_code (use_stmt) == GIMPLE_PHI
2050                     || gimple_bb (use_stmt) != bb
2051                     || !STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (use_stmt))))
2052               {
2053                 (*life)[i] = true;
2054                 BREAK_FROM_IMM_USE_STMT (use_iter);
2055               }
2056         }
2057       if ((*life)[i])
2058         continue;
2059
2060       stmt_info = vinfo_for_stmt (stmt);
2061       if (STMT_VINFO_DATA_REF (stmt_info))
2062         {
2063           if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2064             stmt_cost = vect_get_stmt_cost (scalar_load);
2065           else
2066             stmt_cost = vect_get_stmt_cost (scalar_store);
2067         }
2068       else
2069         stmt_cost = vect_get_stmt_cost (scalar_stmt);
2070
2071       scalar_cost += stmt_cost;
2072     }
2073
2074   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2075     scalar_cost += vect_bb_slp_scalar_cost (bb, child, life);
2076
2077   return scalar_cost;
2078 }
2079
2080 /* Check if vectorization of the basic block is profitable.  */
2081
2082 static bool
2083 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2084 {
2085   vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2086   slp_instance instance;
2087   int i, j;
2088   unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2089   unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2090   void *target_cost_data = BB_VINFO_TARGET_COST_DATA (bb_vinfo);
2091   stmt_vec_info stmt_info = NULL;
2092   stmt_vector_for_cost body_cost_vec;
2093   stmt_info_for_cost *ci;
2094
2095   /* Calculate vector costs.  */
2096   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2097     {
2098       body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
2099
2100       FOR_EACH_VEC_ELT (body_cost_vec, j, ci)
2101         {
2102           stmt_info = ci->stmt ? vinfo_for_stmt (ci->stmt) : NULL;
2103           (void) add_stmt_cost (target_cost_data, ci->count, ci->kind,
2104                                 stmt_info, ci->misalign, vect_body);
2105         }
2106     }
2107
2108   /* Calculate scalar cost.  */
2109   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2110     {
2111       auto_vec<bool, 20> life;
2112       life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
2113       scalar_cost += vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2114                                               SLP_INSTANCE_TREE (instance),
2115                                               &life);
2116     }
2117
2118   /* Complete the target-specific cost calculation.  */
2119   finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2120                &vec_inside_cost, &vec_epilogue_cost);
2121
2122   vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2123
2124   if (dump_enabled_p ())
2125     {
2126       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2127       dump_printf (MSG_NOTE, "  Vector inside of basic block cost: %d\n",
2128                    vec_inside_cost);
2129       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n", vec_prologue_cost);
2130       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n", vec_epilogue_cost);
2131       dump_printf (MSG_NOTE, "  Scalar cost of basic block: %d\n", scalar_cost);
2132     }
2133
2134   /* Vectorization is profitable if its cost is less than the cost of the
2135      scalar version.  */
2136   if (vec_outside_cost + vec_inside_cost >= scalar_cost)
2137     return false;
2138
2139   return true;
2140 }
2141
2142 /* Check if the basic block can be vectorized.  */
2143
2144 static bb_vec_info
2145 vect_slp_analyze_bb_1 (basic_block bb)
2146 {
2147   bb_vec_info bb_vinfo;
2148   vec<slp_instance> slp_instances;
2149   slp_instance instance;
2150   int i;
2151   int min_vf = 2;
2152   unsigned n_stmts = 0;
2153
2154   bb_vinfo = new_bb_vec_info (bb);
2155   if (!bb_vinfo)
2156     return NULL;
2157
2158   if (!vect_analyze_data_refs (NULL, bb_vinfo, &min_vf, &n_stmts))
2159     {
2160       if (dump_enabled_p ())
2161         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2162                          "not vectorized: unhandled data-ref in basic "
2163                          "block.\n");
2164
2165       destroy_bb_vec_info (bb_vinfo);
2166       return NULL;
2167     }
2168
2169   if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
2170     {
2171       if (dump_enabled_p ())
2172         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2173                          "not vectorized: not enough data-refs in "
2174                          "basic block.\n");
2175
2176       destroy_bb_vec_info (bb_vinfo);
2177       return NULL;
2178     }
2179
2180   if (!vect_analyze_data_ref_accesses (NULL, bb_vinfo))
2181     {
2182       if (dump_enabled_p ())
2183         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2184                          "not vectorized: unhandled data access in "
2185                          "basic block.\n");
2186
2187       destroy_bb_vec_info (bb_vinfo);
2188       return NULL;
2189     }
2190
2191   vect_pattern_recog (NULL, bb_vinfo);
2192
2193   if (!vect_analyze_data_refs_alignment (NULL, bb_vinfo))
2194     {
2195       if (dump_enabled_p ())
2196         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2197                          "not vectorized: bad data alignment in basic "
2198                          "block.\n");
2199
2200       destroy_bb_vec_info (bb_vinfo);
2201       return NULL;
2202     }
2203
2204   /* Check the SLP opportunities in the basic block, analyze and build SLP
2205      trees.  */
2206   if (!vect_analyze_slp (NULL, bb_vinfo, n_stmts))
2207     {
2208       if (dump_enabled_p ())
2209         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
2210                          "not vectorized: failed to find SLP opportunities "
2211                          "in basic block.\n");
2212
2213       destroy_bb_vec_info (bb_vinfo);
2214       return NULL;
2215     }
2216
2217   slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2218
2219   /* Mark all the statements that we want to vectorize as pure SLP and
2220      relevant.  */
2221   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2222     {
2223       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2224       vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
2225     }
2226
2227   /* Mark all the statements that we do not want to vectorize.  */
2228   for (gimple_stmt_iterator gsi = gsi_start_bb (BB_VINFO_BB (bb_vinfo));
2229        !gsi_end_p (gsi); gsi_next (&gsi))
2230     {
2231       stmt_vec_info vinfo = vinfo_for_stmt (gsi_stmt (gsi));
2232       if (STMT_SLP_TYPE (vinfo) != pure_slp)
2233         STMT_VINFO_VECTORIZABLE (vinfo) = false;
2234     }
2235
2236   /* Analyze dependences.  At this point all stmts not participating in
2237      vectorization have to be marked.  Dependence analysis assumes
2238      that we either vectorize all SLP instances or none at all.  */
2239   if (!vect_slp_analyze_data_ref_dependences (bb_vinfo))
2240     {
2241       if (dump_enabled_p ())
2242         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2243                          "not vectorized: unhandled data dependence "
2244                          "in basic block.\n");
2245
2246       destroy_bb_vec_info (bb_vinfo);
2247       return NULL;
2248     }
2249
2250   if (!vect_verify_datarefs_alignment (NULL, bb_vinfo))
2251     {
2252       if (dump_enabled_p ())
2253         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2254                          "not vectorized: unsupported alignment in basic "
2255                          "block.\n");
2256       destroy_bb_vec_info (bb_vinfo);
2257       return NULL;
2258     }
2259
2260   if (!vect_slp_analyze_operations (bb_vinfo))
2261     {
2262       if (dump_enabled_p ())
2263         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2264                          "not vectorized: bad operation in basic block.\n");
2265
2266       destroy_bb_vec_info (bb_vinfo);
2267       return NULL;
2268     }
2269
2270   /* Cost model: check if the vectorization is worthwhile.  */
2271   if (!unlimited_cost_model (NULL)
2272       && !vect_bb_vectorization_profitable_p (bb_vinfo))
2273     {
2274       if (dump_enabled_p ())
2275         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2276                          "not vectorized: vectorization is not "
2277                          "profitable.\n");
2278
2279       destroy_bb_vec_info (bb_vinfo);
2280       return NULL;
2281     }
2282
2283   if (dump_enabled_p ())
2284     dump_printf_loc (MSG_NOTE, vect_location,
2285                      "Basic block will be vectorized using SLP\n");
2286
2287   return bb_vinfo;
2288 }
2289
2290
2291 bb_vec_info
2292 vect_slp_analyze_bb (basic_block bb)
2293 {
2294   bb_vec_info bb_vinfo;
2295   int insns = 0;
2296   gimple_stmt_iterator gsi;
2297   unsigned int vector_sizes;
2298
2299   if (dump_enabled_p ())
2300     dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");
2301
2302   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2303     {
2304       gimple stmt = gsi_stmt (gsi);
2305       if (!is_gimple_debug (stmt)
2306           && !gimple_nop_p (stmt)
2307           && gimple_code (stmt) != GIMPLE_LABEL)
2308         insns++;
2309     }
2310
2311   if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2312     {
2313       if (dump_enabled_p ())
2314         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2315                          "not vectorized: too many instructions in "
2316                          "basic block.\n");
2317
2318       return NULL;
2319     }
2320
2321   /* Autodetect first vector size we try.  */
2322   current_vector_size = 0;
2323   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
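  /* E.g. (illustration only): a target advertising 32- and 16-byte vectors
     would return the bitmask 32 | 16 here; after each failed attempt the
     current size is masked out and 1 << floor_log2 () below selects the
     next smaller power-of-two size to retry with.  */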
2324
2325   while (1)
2326     {
2327       bb_vinfo = vect_slp_analyze_bb_1 (bb);
2328       if (bb_vinfo)
2329         return bb_vinfo;
2330
2331       destroy_bb_vec_info (bb_vinfo);
2332
2333       vector_sizes &= ~current_vector_size;
2334       if (vector_sizes == 0
2335           || current_vector_size == 0)
2336         return NULL;
2337
2338       /* Try the next biggest vector size.  */
2339       current_vector_size = 1 << floor_log2 (vector_sizes);
2340       if (dump_enabled_p ())
2341         dump_printf_loc (MSG_NOTE, vect_location,
2342                          "***** Re-trying analysis with "
2343                          "vector size %d\n", current_vector_size);
2344     }
2345 }
2346
2347
2348 /* SLP costs are calculated according to SLP instance unrolling factor (i.e.,
2349    the number of created vector stmts depends on the unrolling factor).
2350    However, the actual number of vector stmts for every SLP node depends on
2351    VF which is set later in vect_analyze_operations ().  Hence, SLP costs
2352    should be updated.  In this function we assume that the inside costs
2353    calculated in vect_model_xxx_cost are linear in ncopies.  */
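
/* As a numeric illustration (not from a real testcase): an instance with
   unrolling factor 2 in a loop whose final VF is 8 needs 8 / 2 = 4 copies,
   so every recorded body cost for that instance is scaled by ncopies = 4
   below.  */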
2354
2355 void
2356 vect_update_slp_costs_according_to_vf (loop_vec_info loop_vinfo)
2357 {
2358   unsigned int i, j, vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2359   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2360   slp_instance instance;
2361   stmt_vector_for_cost body_cost_vec;
2362   stmt_info_for_cost *si;
2363   void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2364
2365   if (dump_enabled_p ())
2366     dump_printf_loc (MSG_NOTE, vect_location,
2367                      "=== vect_update_slp_costs_according_to_vf ===\n");
2368
2369   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2370     {
2371       /* We assume that costs are linear in ncopies.  */
2372       int ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (instance);
2373
2374       /* Record the instance's instructions in the target cost model.
2375          This was delayed until here because the count of instructions
2376          isn't known beforehand.  */
2377       body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
2378
2379       FOR_EACH_VEC_ELT (body_cost_vec, j, si)
2380         (void) add_stmt_cost (data, si->count * ncopies, si->kind,
2381                               vinfo_for_stmt (si->stmt), si->misalign,
2382                               vect_body);
2383     }
2384 }
2385
2386
2387 /* For constant and loop invariant defs of SLP_NODE this function returns
2388    (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
2389    OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
2390    scalar stmts.  NUMBER_OF_VECTORS is the number of vector defs to create.
2391    REDUC_INDEX is the index of the reduction operand in the statements, unless
2392    it is -1.  */
2393
2394 static void
2395 vect_get_constant_vectors (tree op, slp_tree slp_node,
2396                            vec<tree> *vec_oprnds,
2397                            unsigned int op_num, unsigned int number_of_vectors,
2398                            int reduc_index)
2399 {
2400   vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2401   gimple stmt = stmts[0];
2402   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2403   unsigned nunits;
2404   tree vec_cst;
2405   tree *elts;
2406   unsigned j, number_of_places_left_in_vector;
2407   tree vector_type;
2408   tree vop;
2409   int group_size = stmts.length ();
2410   unsigned int vec_num, i;
2411   unsigned number_of_copies = 1;
2412   vec<tree> voprnds;
2413   voprnds.create (number_of_vectors);
2414   bool constant_p, is_store;
2415   tree neutral_op = NULL;
2416   enum tree_code code = gimple_expr_code (stmt);
2417   gimple def_stmt;
2418   struct loop *loop;
2419   gimple_seq ctor_seq = NULL;
2420
2421   if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
2422       && reduc_index != -1)
2423     {
2424       op_num = reduc_index - 1;
2425       op = gimple_op (stmt, reduc_index);
2426       /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
2427          we need either neutral operands or the original operands.  See
2428          get_initial_def_for_reduction() for details.  */
2429       switch (code)
2430         {
2431           case WIDEN_SUM_EXPR:
2432           case DOT_PROD_EXPR:
2433           case PLUS_EXPR:
2434           case MINUS_EXPR:
2435           case BIT_IOR_EXPR:
2436           case BIT_XOR_EXPR:
2437              if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2438                neutral_op = build_real (TREE_TYPE (op), dconst0);
2439              else
2440                neutral_op = build_int_cst (TREE_TYPE (op), 0);
2441
2442              break;
2443
2444           case MULT_EXPR:
2445              if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2446                neutral_op = build_real (TREE_TYPE (op), dconst1);
2447              else
2448                neutral_op = build_int_cst (TREE_TYPE (op), 1);
2449
2450              break;
2451
2452           case BIT_AND_EXPR:
2453             neutral_op = build_int_cst (TREE_TYPE (op), -1);
2454             break;
2455
2456           /* For MIN/MAX we don't have an easy neutral operand but
2457              the initial values can be used fine here.  Only for
2458              a reduction chain do we have to force a neutral element.  */
2459           case MAX_EXPR:
2460           case MIN_EXPR:
2461             if (!GROUP_FIRST_ELEMENT (stmt_vinfo))
2462               neutral_op = NULL;
2463             else
2464               {
2465                 def_stmt = SSA_NAME_DEF_STMT (op);
2466                 loop = (gimple_bb (stmt))->loop_father;
2467                 neutral_op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2468                                                     loop_preheader_edge (loop));
2469               }
2470             break;
2471
2472           default:
2473             neutral_op = NULL;
2474         }
2475     }
2476
2477   if (STMT_VINFO_DATA_REF (stmt_vinfo))
2478     {
2479       is_store = true;
2480       op = gimple_assign_rhs1 (stmt);
2481     }
2482   else
2483     is_store = false;
2484
2485   gcc_assert (op);
2486
2487   if (CONSTANT_CLASS_P (op))
2488     constant_p = true;
2489   else
2490     constant_p = false;
2491
2492   vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
2493   gcc_assert (vector_type);
2494   nunits = TYPE_VECTOR_SUBPARTS (vector_type);
2495
2496   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
2497      created vectors. It is greater than 1 if unrolling is performed.
2498
2499      For example, we have two scalar operands, s1 and s2 (e.g., group of
2500      strided accesses of size two), while NUNITS is four (i.e., four scalars
2501      of this type can be packed in a vector).  The output vector will contain
2502      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
2503      will be 2).
2504
2505      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
2506      containing the operands.
2507
2508      For example, NUNITS is four as before, and the group size is 8
2509      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
2510      {s5, s6, s7, s8}.  */
2511
2512   number_of_copies = least_common_multiple (nunits, group_size) / group_size;
2513
2514   number_of_places_left_in_vector = nunits;
2515   elts = XALLOCAVEC (tree, nunits);
2516   for (j = 0; j < number_of_copies; j++)
2517     {
2518       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
2519         {
2520           if (is_store)
2521             op = gimple_assign_rhs1 (stmt);
2522           else
2523             {
2524               switch (code)
2525                 {
2526                   case COND_EXPR:
2527                     if (op_num == 0 || op_num == 1)
2528                       {
2529                         tree cond = gimple_assign_rhs1 (stmt);
2530                         op = TREE_OPERAND (cond, op_num);
2531                       }
2532                     else
2533                       {
2534                         if (op_num == 2)
2535                           op = gimple_assign_rhs2 (stmt);
2536                         else
2537                           op = gimple_assign_rhs3 (stmt);
2538                       }
2539                     break;
2540
2541                   case CALL_EXPR:
2542                     op = gimple_call_arg (stmt, op_num);
2543                     break;
2544
2545                   case LSHIFT_EXPR:
2546                   case RSHIFT_EXPR:
2547                   case LROTATE_EXPR:
2548                   case RROTATE_EXPR:
2549                     op = gimple_op (stmt, op_num + 1);
2550                     /* Unlike the other binary operators, shifts/rotates have
2551                        the shift count being int, instead of the same type as
2552                        the lhs, so make sure the scalar is the right type if
2553                        we are dealing with vectors of
2554                        long long/long/short/char.  */
2555                     if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
2556                       op = fold_convert (TREE_TYPE (vector_type), op);
2557                     break;
2558
2559                   default:
2560                     op = gimple_op (stmt, op_num + 1);
2561                     break;
2562                 }
2563             }
2564
2565           if (reduc_index != -1)
2566             {
2567               loop = (gimple_bb (stmt))->loop_father;
2568               def_stmt = SSA_NAME_DEF_STMT (op);
2569
2570               gcc_assert (loop);
2571
2572               /* Get the def before the loop.  In reduction chain we have only
2573                  one initial value.  */
2574               if ((j != (number_of_copies - 1)
2575                    || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2576                        && i != 0))
2577                   && neutral_op)
2578                 op = neutral_op;
2579               else
2580                 op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2581                                             loop_preheader_edge (loop));
2582             }
2583
2584           /* Create 'vect_ = {op0,op1,...,opn}'.  */
2585           number_of_places_left_in_vector--;
2586           if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
2587             {
2588               if (CONSTANT_CLASS_P (op))
2589                 {
2590                   op = fold_unary (VIEW_CONVERT_EXPR,
2591                                    TREE_TYPE (vector_type), op);
2592                   gcc_assert (op && CONSTANT_CLASS_P (op));
2593                 }
2594               else
2595                 {
2596                   tree new_temp
2597                     = make_ssa_name (TREE_TYPE (vector_type), NULL);
2598                   gimple init_stmt;
2599                   op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
2600                                op);
2601                   init_stmt
2602                     = gimple_build_assign_with_ops (VIEW_CONVERT_EXPR,
2603                                                     new_temp, op, NULL_TREE);
2604                   gimple_seq_add_stmt (&ctor_seq, init_stmt);
2605                   op = new_temp;
2606                 }
2607             }
2608           elts[number_of_places_left_in_vector] = op;
2609           if (!CONSTANT_CLASS_P (op))
2610             constant_p = false;
2611
2612           if (number_of_places_left_in_vector == 0)
2613             {
2614               number_of_places_left_in_vector = nunits;
2615
2616               if (constant_p)
2617                 vec_cst = build_vector (vector_type, elts);
2618               else
2619                 {
2620                   vec<constructor_elt, va_gc> *v;
2621                   unsigned k;
2622                   vec_alloc (v, nunits);
2623                   for (k = 0; k < nunits; ++k)
2624                     CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
2625                   vec_cst = build_constructor (vector_type, v);
2626                 }
2627               voprnds.quick_push (vect_init_vector (stmt, vec_cst,
2628                                                     vector_type, NULL));
2629               if (ctor_seq != NULL)
2630                 {
2631                   gimple init_stmt = SSA_NAME_DEF_STMT (voprnds.last ());
2632                   gimple_stmt_iterator gsi = gsi_for_stmt (init_stmt);
2633                   gsi_insert_seq_before_without_update (&gsi, ctor_seq,
2634                                                         GSI_SAME_STMT);
2635                   ctor_seq = NULL;
2636                 }
2637             }
2638         }
2639     }
2640
2641   /* Since the vectors are created in the reverse order, we should invert
2642      them.  */
2643   vec_num = voprnds.length ();
2644   for (j = vec_num; j != 0; j--)
2645     {
2646       vop = voprnds[j - 1];
2647       vec_oprnds->quick_push (vop);
2648     }
2649
2650   voprnds.release ();
2651
2652   /* If VF is greater than the unrolling factor needed for the SLP group of
2653      stmts, the NUMBER_OF_VECTORS to be created is greater than
2654      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
2655      to replicate the vectors.  */
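  /* E.g. (an illustrative sketch): for a PLUS_EXPR reduction the neutral
     value is 0, so the { 0, 0, ... } copies pushed below leave the final
     reduction result unchanged; without a neutral value the vectors created
     above are simply repeated.  */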
2656   while (number_of_vectors > vec_oprnds->length ())
2657     {
2658       tree neutral_vec = NULL;
2659
2660       if (neutral_op)
2661         {
2662           if (!neutral_vec)
2663             neutral_vec = build_vector_from_val (vector_type, neutral_op);
2664
2665           vec_oprnds->quick_push (neutral_vec);
2666         }
2667       else
2668         {
2669           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
2670             vec_oprnds->quick_push (vop);
2671         }
2672     }
2673 }
2674
2675
2676 /* Get vectorized definitions from SLP_NODE that contains corresponding
2677    vectorized def-stmts.  */
2678
2679 static void
2680 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
2681 {
2682   tree vec_oprnd;
2683   gimple vec_def_stmt;
2684   unsigned int i;
2685
2686   gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
2687
2688   FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
2689     {
2690       gcc_assert (vec_def_stmt);
2691       vec_oprnd = gimple_get_lhs (vec_def_stmt);
2692       vec_oprnds->quick_push (vec_oprnd);
2693     }
2694 }
2695
2696
2697 /* Get vectorized definitions for SLP_NODE.
2698    If the scalar definitions are loop invariants or constants, collect them and
2699    call vect_get_constant_vectors() to create vector stmts.
2700    Otherwise, the def-stmts must be already vectorized and the vectorized stmts
2701    must be stored in the corresponding child of SLP_NODE, and we call
2702    vect_get_slp_vect_defs () to retrieve them.  */
2703
2704 void
2705 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
2706                    vec<vec<tree> > *vec_oprnds, int reduc_index)
2707 {
2708   gimple first_stmt;
2709   int number_of_vects = 0, i;
2710   unsigned int child_index = 0;
2711   HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
2712   slp_tree child = NULL;
2713   vec<tree> vec_defs;
2714   tree oprnd;
2715   bool vectorized_defs;
2716
2717   first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
2718   FOR_EACH_VEC_ELT (ops, i, oprnd)
2719     {
2720       /* For each operand we check if it has vectorized definitions in a child
2721          node or we need to create them (for invariants and constants).  We
2722          check if the LHS of the first stmt of the next child matches OPRND.
2723          If it does, we found the correct child.  Otherwise, we call
2724          vect_get_constant_vectors () and do not advance CHILD_INDEX, in
2725          order to check this child node again for the next operand.  */
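      /* For instance (hypothetical stmts, for illustration): given
         x_3 = y_1 + 5, the operand y_1 matches the LHS of the first scalar
         stmt of a child node and so uses that child's vectorized defs, while
         the invariant 5 falls through to vect_get_constant_vectors ().  */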
2726       vectorized_defs = false;
2727       if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
2728         {
2729           child = SLP_TREE_CHILDREN (slp_node)[child_index];
2730
2731           /* We have to check both pattern and original def, if available.  */
2732           gimple first_def = SLP_TREE_SCALAR_STMTS (child)[0];
2733           gimple related = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
2734
2735           if (operand_equal_p (oprnd, gimple_get_lhs (first_def), 0)
2736               || (related
2737                   && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
2738             {
2739               /* The number of vector defs is determined by the number of
2740                  vector statements in the node from which we get those
2741                  statements.  */
2742               number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
2743               vectorized_defs = true;
2744               child_index++;
2745             }
2746         }
2747
2748       if (!vectorized_defs)
2749         {
2750           if (i == 0)
2751             {
2752               number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
2753               /* Number of vector stmts was calculated according to LHS in
2754                  vect_schedule_slp_instance (), fix it by replacing LHS with
2755                  RHS, if necessary.  See vect_get_smallest_scalar_type () for
2756                  details.  */
2757               vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
2758                                              &rhs_size_unit);
2759               if (rhs_size_unit != lhs_size_unit)
2760                 {
2761                   number_of_vects *= rhs_size_unit;
2762                   number_of_vects /= lhs_size_unit;
2763                 }
2764             }
2765         }
2766
2767       /* Allocate memory for vectorized defs.  */
2768       vec_defs = vNULL;
2769       vec_defs.create (number_of_vects);
2770
2771       /* For reduction defs we call vect_get_constant_vectors (), since we are
2772          looking for initial loop invariant values.  */
2773       if (vectorized_defs && reduc_index == -1)
2774         /* The defs are already vectorized.  */
2775         vect_get_slp_vect_defs (child, &vec_defs);
2776       else
2777         /* Build vectors from scalar defs.  */
2778         vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
2779                                    number_of_vects, reduc_index);
2780
2781       vec_oprnds->quick_push (vec_defs);
2782
2783       /* For reductions, we only need initial values.  */
2784       if (reduc_index != -1)
2785         return;
2786     }
2787 }
2788
2789
2790 /* Create NCOPIES permutation statements using the vector permutation mask
2791    MASK and two input vectors placed in
2792    DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy and
2793    shifting by STRIDE elements of DR_CHAIN for every copy.
2794    (STRIDE is the number of vectorized stmts for NODE divided by the number of
2795    copies).
2796    VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE, where
2797    the created stmts must be inserted.  */
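
/* As a numeric sketch (illustration only): with 4 vector stmts for NODE and
   NCOPIES 2 the stride is 4 / 2 = 2, so copy 0 permutes DR_CHAIN[first] with
   DR_CHAIN[second] and copy 1 permutes DR_CHAIN[first + 2] with
   DR_CHAIN[second + 2].  */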
2798
2799 static inline void
2800 vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
2801                            tree mask, int first_vec_indx, int second_vec_indx,
2802                            gimple_stmt_iterator *gsi, slp_tree node,
2803                            tree vectype, vec<tree> dr_chain,
2804                            int ncopies, int vect_stmts_counter)
2805 {
2806   tree perm_dest;
2807   gimple perm_stmt = NULL;
2808   stmt_vec_info next_stmt_info;
2809   int i, stride;
2810   tree first_vec, second_vec, data_ref;
2811
2812   stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
2813
2814   /* Initialize the vect stmts of NODE to properly insert the generated
2815      stmts later.  */
2816   for (i = SLP_TREE_VEC_STMTS (node).length ();
2817        i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
2818     SLP_TREE_VEC_STMTS (node).quick_push (NULL);
2819
2820   perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
2821   for (i = 0; i < ncopies; i++)
2822     {
2823       first_vec = dr_chain[first_vec_indx];
2824       second_vec = dr_chain[second_vec_indx];
2825
2826       /* Generate the permute statement.  */
2827       perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, perm_dest,
2828                                                 first_vec, second_vec, mask);
2829       data_ref = make_ssa_name (perm_dest, perm_stmt);
2830       gimple_set_lhs (perm_stmt, data_ref);
2831       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
2832
2833       /* Store the vector statement in NODE.  */
2834       SLP_TREE_VEC_STMTS (node)[stride * i + vect_stmts_counter] = perm_stmt;
2835
2836       first_vec_indx += stride;
2837       second_vec_indx += stride;
2838     }
2839
2840   /* Mark the scalar stmt as vectorized.  */
2841   next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
2842   STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
2843 }
2844
2845
2846 /* Given FIRST_MASK_ELEMENT - the mask element in element representation,
2847    return in CURRENT_MASK_ELEMENT its equivalent in target specific
2848    representation.  Check that the mask is valid and return FALSE if not.
2849    Set *NEED_NEXT_VECTOR to TRUE if the permutation requires moving to
2850    the next vector, i.e., the current first vector is not needed.  */
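
/* For example (illustration only): with MASK_NUNITS 4 a requested element 9
   is >= 2 * 4, so the first vector is dropped and the mask elements already
   emitted are rewritten relative to the next pair of vectors; this is the
   {6,9,9,9} -> {2,5,5,5} conversion described in
   vect_transform_slp_perm_load below.  */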
2851
2852 static bool
2853 vect_get_mask_element (gimple stmt, int first_mask_element, int m,
2854                        int mask_nunits, bool only_one_vec, int index,
2855                        unsigned char *mask, int *current_mask_element,
2856                        bool *need_next_vector, int *number_of_mask_fixes,
2857                        bool *mask_fixed, bool *needs_first_vector)
2858 {
2859   int i;
2860
2861   /* Convert to target specific representation.  */
2862   *current_mask_element = first_mask_element + m;
2863   /* Adjust the value in case it's a mask for second and third vectors.  */
2864   *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1);
2865
2866   if (*current_mask_element < mask_nunits)
2867     *needs_first_vector = true;
2868
2869   /* We have only one input vector to permute but the mask accesses values in
2870      the next vector as well.  */
2871   if (only_one_vec && *current_mask_element >= mask_nunits)
2872     {
2873       if (dump_enabled_p ())
2874         {
2875           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2876                            "permutation requires at least two vectors ");
2877           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2878           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2879         }
2880
2881       return false;
2882     }
2883
2884   /* The mask requires the next vector.  */
2885   if (*current_mask_element >= mask_nunits * 2)
2886     {
2887       if (*needs_first_vector || *mask_fixed)
2888         {
2889           /* We either need the first vector too or have already moved to the
2890              next vector. In both cases, this permutation needs three
2891              vectors.  */
2892           if (dump_enabled_p ())
2893             {
2894               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2895                                "permutation requires at "
2896                                "least three vectors ");
2897               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2898               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2899             }
2900
2901           return false;
2902         }
2903
2904       /* We move to the next vector, dropping the first one and working with
2905          the second and the third - we need to adjust the values of the mask
2906          accordingly.  */
      *current_mask_element -= mask_nunits * *number_of_mask_fixes;

      for (i = 0; i < index; i++)
        mask[i] -= mask_nunits * *number_of_mask_fixes;

      (*number_of_mask_fixes)++;
      *mask_fixed = true;
    }

  *need_next_vector = *mask_fixed;

  /* This was the last element of this mask.  Start a new one.  */
  if (index == mask_nunits - 1)
    {
      *number_of_mask_fixes = 1;
      *mask_fixed = false;
      *needs_first_vector = false;
    }

  return true;
}


/* Generate vector permute statements from a list of loads in DR_CHAIN.
   If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
   permute statements for the SLP node NODE of the SLP instance
   SLP_NODE_INSTANCE.  */

bool
vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
                              gimple_stmt_iterator *gsi, int vf,
                              slp_instance slp_node_instance, bool analyze_only)
{
  gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree mask_element_type = NULL_TREE, mask_type;
  int i, j, k, nunits, vec_index = 0, scalar_index;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  gimple next_scalar_stmt;
  int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
  int first_mask_element;
  int index, unroll_factor, current_mask_element, ncopies;
  unsigned char *mask;
  bool only_one_vec = false, need_next_vector = false;
  int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
  int number_of_mask_fixes = 1;
  bool mask_fixed = false;
  bool needs_first_vector = false;
  enum machine_mode mode;

  mode = TYPE_MODE (vectype);

  if (!can_vec_perm_p (mode, false, NULL))
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "no vect permute for ");
          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      return false;
    }

  /* The generic VEC_PERM_EXPR code always uses an integral type of the
     same size as the vector element being permuted.  */
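  /* E.g., if VECTYPE is V4SF, the mask elements are 32-bit integers and
     MASK_TYPE is the corresponding V4SI vector type.  */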
  mask_element_type = lang_hooks.types.type_for_mode
                (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))), 1);
  mask_type = get_vectype_for_scalar_type (mask_element_type);
  nunits = TYPE_VECTOR_SUBPARTS (vectype);
  mask = XALLOCAVEC (unsigned char, nunits);
  unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);

  /* The number of vector stmts to generate based only on the
     SLP_NODE_INSTANCE unrolling factor.  */
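  /* E.g., for the three-load group in the example below (GROUP_SIZE == 3,
     NUNITS == 4, unrolling factor 4), 3 * 4 / 4 == 3 vector stmts are
     needed, so ONLY_ONE_VEC remains false.  */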
  orig_vec_stmts_num = group_size *
                SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
  if (orig_vec_stmts_num == 1)
    only_one_vec = true;

  /* The number of copies is determined by the final vectorization factor
     relative to the SLP_NODE_INSTANCE unrolling factor.  */
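  /* E.g., if the loop is vectorized with VF == 8 but the instance only
     requires an unrolling factor of 4, each permutation is generated
     NCOPIES == 2 times.  */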
  ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);

  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
    return false;

  /* Generate permutation masks for every NODE.  The number of masks for each
     NODE is equal to GROUP_SIZE.
     E.g., we have a group of three nodes with three loads from the same
     location in each node, and the vector size is 4.  I.e., we have an
     a0b0c0a1b1c1... sequence and we need to create the following vectors:
     for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
     for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
     ...

     The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
     The last mask is illegal since we assume two input operands for a
     permute operation, so the mask element values must be smaller than
     2 * NUNITS (8 here).
     Hence, the last mask must be converted into {2,5,5,5}.
     For the first two permutations we need the first and the second input
     vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
     we need the second and the third vectors: {b1,c1,a2,b2} and
     {c2,a3,b3,c3}.  */

    {
      scalar_index = 0;
      index = 0;
      vect_stmts_counter = 0;
      vec_index = 0;
      first_vec_index = vec_index++;
      if (only_one_vec)
        second_vec_index = first_vec_index;
      else
        second_vec_index = vec_index++;

      for (j = 0; j < unroll_factor; j++)
        {
          for (k = 0; k < group_size; k++)
            {
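              /* The load permutation gives this load's position within a
                 single scalar iteration; J * GROUP_SIZE steps over the
                 preceding unrolled iterations.  */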
              i = SLP_TREE_LOAD_PERMUTATION (node)[k];
              first_mask_element = i + j * group_size;
              if (!vect_get_mask_element (stmt, first_mask_element, 0,
                                          nunits, only_one_vec, index,
                                          mask, &current_mask_element,
                                          &need_next_vector,
                                          &number_of_mask_fixes, &mask_fixed,
                                          &needs_first_vector))
                return false;
              mask[index++] = current_mask_element;

              if (index == nunits)
                {
                  index = 0;
                  if (!can_vec_perm_p (mode, false, mask))
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "unsupported vect permute { ");
                          for (i = 0; i < nunits; ++i)
                            dump_printf (MSG_MISSED_OPTIMIZATION, "%d ",
                                         mask[i]);
                          dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
                        }
                      return false;
                    }

                  if (!analyze_only)
                    {
                      int l;
                      tree mask_vec, *mask_elts;
                      mask_elts = XALLOCAVEC (tree, nunits);
                      for (l = 0; l < nunits; ++l)
                        mask_elts[l] = build_int_cst (mask_element_type,
                                                      mask[l]);
                      mask_vec = build_vector (mask_type, mask_elts);

                      if (need_next_vector)
                        {
                          first_vec_index = second_vec_index;
                          second_vec_index = vec_index;
                        }

                      next_scalar_stmt
                          = SLP_TREE_SCALAR_STMTS (node)[scalar_index++];

                      vect_create_mask_and_perm (stmt, next_scalar_stmt,
                               mask_vec, first_vec_index, second_vec_index,
                               gsi, node, vectype, dr_chain,
                               ncopies, vect_stmts_counter++);
                    }
                }
            }
        }
    }

  return true;
}


/* Vectorize SLP instance tree in postorder.  */

static bool
vect_schedule_slp_instance (slp_tree node, slp_instance instance,
                            unsigned int vectorization_factor)
{
  gimple stmt;
  bool grouped_store, is_store;
  gimple_stmt_iterator si;
  stmt_vec_info stmt_info;
  unsigned int vec_stmts_size, nunits, group_size;
  tree vectype;
  int i;
  slp_tree child;

  if (!node)
    return false;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_schedule_slp_instance (child, instance, vectorization_factor);

  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);

  /* VECTYPE is the type of the destination.  */
  vectype = STMT_VINFO_VECTYPE (stmt_info);
  nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
  group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  /* For each SLP instance calculate the number of vector stmts to be created
     for the scalar stmts in each node of the SLP tree.  It is the number of
     scalar elements in one scalar iteration (GROUP_SIZE) multiplied by VF,
     divided by the number of elements in a vector (NUNITS).  */
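  /* E.g., with VF == 4, GROUP_SIZE == 2 and NUNITS == 4 each node gets
     (4 * 2) / 4 == 2 vector stmts.  */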
  vec_stmts_size = (vectorization_factor * group_size) / nunits;

  if (!SLP_TREE_VEC_STMTS (node).exists ())
    {
      SLP_TREE_VEC_STMTS (node).create (vec_stmts_size);
      SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "------>vectorizing SLP node starting from: ");
      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
      dump_printf (MSG_NOTE, "\n");
    }

  /* Loads should be inserted before the first load.  */
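  /* I.e., all vector loads of the group are emitted at the position of the
     first scalar load of the chain, ahead of any statement that uses
     them.  */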
  if (SLP_INSTANCE_FIRST_LOAD_STMT (instance)
      && STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && !REFERENCE_CLASS_P (gimple_get_lhs (stmt))
      && SLP_TREE_LOAD_PERMUTATION (node).exists ())
    si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance));
  else if (is_pattern_stmt_p (stmt_info))
    si = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
  else
    si = gsi_for_stmt (stmt);

  /* Stores should be inserted just before the last store.  */
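  /* By the last scalar store all values being stored by the group have been
     defined, so this is the first point where the vector stores can be
     emitted safely.  */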
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && REFERENCE_CLASS_P (gimple_get_lhs (stmt)))
    {
      gimple last_store = vect_find_last_store_in_slp_instance (instance);
      if (is_pattern_stmt_p (vinfo_for_stmt (last_store)))
        last_store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (last_store));
      si = gsi_for_stmt (last_store);
    }

  /* Mark the first element of the reduction chain as reduction to properly
     transform the node.  In the analysis phase only the last element of the
     chain is marked as reduction.  */
  if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
    {
      STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
      STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
    }

  is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
  return is_store;
}

/* Replace scalar calls from SLP node NODE with assignments setting their
   lhs to zero.  For loop vectorization this is done in vectorizable_call,
   but for SLP it needs to be deferred until the end of vect_schedule_slp,
   because multiple SLP instances may refer to the same scalar stmt.  */
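/* E.g., a call "x_1 = foo (y_2);" that has been fully vectorized is
   replaced by the dead assignment "x_1 = 0;", which keeps the definition
   of x_1 valid until the scalar code is cleaned up.  */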

static void
vect_remove_slp_scalar_calls (slp_tree node)
{
  gimple stmt, new_stmt;
  gimple_stmt_iterator gsi;
  int i;
  slp_tree child;
  tree lhs;
  stmt_vec_info stmt_info;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_remove_slp_scalar_calls (child);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
        continue;
      stmt_info = vinfo_for_stmt (stmt);
      if (stmt_info == NULL
          || is_pattern_stmt_p (stmt_info)
          || !PURE_SLP_STMT (stmt_info))
        continue;
      lhs = gimple_call_lhs (stmt);
      new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
      set_vinfo_for_stmt (new_stmt, stmt_info);
      set_vinfo_for_stmt (stmt, NULL);
      STMT_VINFO_STMT (stmt_info) = new_stmt;
      gsi = gsi_for_stmt (stmt);
      gsi_replace (&gsi, new_stmt, false);
      SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
    }
}

/* Generate vector code for all SLP instances in the loop/basic block.  */

bool
vect_schedule_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
{
  vec<slp_instance> slp_instances;
  slp_instance instance;
  unsigned int i, vf;
  bool is_store = false;

  if (loop_vinfo)
    {
      slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
      vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
    }
  else
    {
      slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
      vf = 1;
    }

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* Schedule the tree of INSTANCE.  */
      is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
                                             instance, vf);
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "vectorizing stmts using SLP.\n");
    }

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree root = SLP_INSTANCE_TREE (instance);
      gimple store;
      unsigned int j;
      gimple_stmt_iterator gsi;

      /* Remove scalar call stmts.  Do not do this for basic-block
         vectorization as not all uses may be vectorized.
         ???  Why should this be necessary?  DCE should be able to
         remove the stmts itself.
         ???  For BB vectorization we can as well remove scalar
         stmts starting from the SLP tree root if they have no
         uses.  */
      if (loop_vinfo)
        vect_remove_slp_scalar_calls (root);

      for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
                  && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
        {
          if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
            break;

          if (is_pattern_stmt_p (vinfo_for_stmt (store)))
            store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
          /* Free the attached stmt_vec_info and remove the stmt.  */
          gsi = gsi_for_stmt (store);
          unlink_stmt_vdef (store);
          gsi_remove (&gsi, true);
          release_defs (store);
          free_stmt_vec_info (store);
        }
    }

  return is_store;
}


/* Vectorize the basic block.  */

void
vect_slp_transform_bb (basic_block bb)
{
  bb_vec_info bb_vinfo = vec_info_for_bb (bb);
  gimple_stmt_iterator si;

  gcc_assert (bb_vinfo);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB\n");

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      gimple stmt = gsi_stmt (si);
      stmt_vec_info stmt_info;

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "------>SLPing statement: ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
          dump_printf (MSG_NOTE, "\n");
        }

      stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (stmt_info);

      /* Schedule all the SLP instances when the first SLP stmt is reached.  */
      if (STMT_SLP_TYPE (stmt_info))
        {
          vect_schedule_slp (NULL, bb_vinfo);
          break;
        }
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "BASIC BLOCK VECTORIZED\n");

  destroy_bb_vec_info (bb_vinfo);
}