1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2013 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
30 #include "basic-block.h"
31 #include "gimple-pretty-print.h"
32 #include "tree-flow.h"
33 #include "tree-pass.h"
36 #include "recog.h" /* FIXME: for insn_data */
38 #include "tree-vectorizer.h"
39 #include "langhooks.h"
41 /* Extract the location of the basic block in the source code.
42 Return the basic block location if succeed and NULL if not. */
45 find_bb_location (basic_block bb)
48 gimple_stmt_iterator si;
53 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
56 if (gimple_location (stmt) != UNKNOWN_LOC)
57 return gimple_location (stmt);
64 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
67 vect_free_slp_tree (slp_tree node)
75 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
76 vect_free_slp_tree (child);
78 SLP_TREE_CHILDREN (node).release ();
79 SLP_TREE_SCALAR_STMTS (node).release ();
80 SLP_TREE_VEC_STMTS (node).release ();
86 /* Free the memory allocated for the SLP instance. */
89 vect_free_slp_instance (slp_instance instance)
91 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
92 SLP_INSTANCE_LOAD_PERMUTATION (instance).release ();
93 SLP_INSTANCE_LOADS (instance).release ();
94 SLP_INSTANCE_BODY_COST_VEC (instance).release ();
99 /* Create an SLP node for SCALAR_STMTS. */
102 vect_create_new_slp_node (vec<gimple> scalar_stmts)
105 gimple stmt = scalar_stmts[0];
108 if (is_gimple_call (stmt))
109 nops = gimple_call_num_args (stmt);
110 else if (is_gimple_assign (stmt))
112 nops = gimple_num_ops (stmt) - 1;
113 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
119 node = XNEW (struct _slp_tree);
120 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
121 SLP_TREE_VEC_STMTS (node).create (0);
122 SLP_TREE_CHILDREN (node).create (nops);
128 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
130 static vec<slp_oprnd_info>
131 vect_create_oprnd_info (int nops, int group_size)
134 slp_oprnd_info oprnd_info;
135 vec<slp_oprnd_info> oprnds_info;
137 oprnds_info.create (nops);
138 for (i = 0; i < nops; i++)
140 oprnd_info = XNEW (struct _slp_oprnd_info);
141 oprnd_info->def_stmts.create (group_size);
142 oprnd_info->first_dt = vect_uninitialized_def;
143 oprnd_info->first_def_type = NULL_TREE;
144 oprnd_info->first_const_oprnd = NULL_TREE;
145 oprnd_info->first_pattern = false;
146 oprnds_info.quick_push (oprnd_info);
153 /* Free operands info. */
156 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
159 slp_oprnd_info oprnd_info;
161 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
163 oprnd_info->def_stmts.release ();
164 XDELETE (oprnd_info);
167 oprnds_info.release ();
171 /* Find the place of the data-ref in STMT in the interleaving chain that starts
172 from FIRST_STMT. Return -1 if the data-ref is not a part of the chain. */
175 vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
177 gimple next_stmt = first_stmt;
180 if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
185 if (next_stmt == stmt)
188 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
196 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
197 they are of a valid type and that they match the defs of the first stmt of
198 the SLP group (stored in OPRNDS_INFO). */
/* NOTE(review): this listing is elided -- the embedded original line numbers
   (196, 201, 209, ...) jump, which means braces, declarations, early returns
   and some statements are missing here.  Comments below annotate only the
   visible logic; consult the full upstream source before editing.  */
201 vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
202 slp_tree slp_node, gimple stmt,
203 int ncopies_for_cost, bool first,
204 vec<slp_oprnd_info> *oprnds_info,
205 stmt_vector_for_cost *prologue_cost_vec,
206 stmt_vector_for_cost *body_cost_vec)
209 unsigned int i, number_of_oprnds;
210 tree def, def_op0 = NULL_TREE;
212 enum vect_def_type dt = vect_uninitialized_def;
213 enum vect_def_type dt_op0 = vect_uninitialized_def;
214 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
215 tree lhs = gimple_get_lhs (stmt);
216 struct loop *loop = NULL;
217 enum tree_code rhs_code;
218 bool different_types = false;
219 bool pattern = false;
220 slp_oprnd_info oprnd_info, oprnd0_info, oprnd1_info;
222 tree compare_rhs = NULL_TREE;
225 loop = LOOP_VINFO_LOOP (loop_vinfo);
/* Determine how many operands STMT has; calls and assignments differ.  */
227 if (is_gimple_call (stmt))
229 number_of_oprnds = gimple_call_num_args (stmt);
232 else if (is_gimple_assign (stmt))
234 number_of_oprnds = gimple_num_ops (stmt) - 1;
235 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
/* Main loop: fetch each operand, find its def, and compare against the
   info recorded for the first stmt of the group.  */
241 for (i = 0; i < number_of_oprnds; i++)
246 compare_rhs = NULL_TREE;
249 oprnd = gimple_op (stmt, op_idx++);
251 oprnd_info = (*oprnds_info)[i];
/* A COND_EXPR condition is a comparison tree: split it into its two
   operands and process them separately.  */
253 if (COMPARISON_CLASS_P (oprnd))
255 compare_rhs = TREE_OPERAND (oprnd, 1);
256 oprnd = TREE_OPERAND (oprnd, 0);
259 if (!vect_is_simple_use (oprnd, NULL, loop_vinfo, bb_vinfo, &def_stmt,
261 || (!def_stmt && dt != vect_constant_def))
263 if (dump_enabled_p ())
265 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
266 "Build SLP failed: can't find def for ");
267 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
273 /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
274 from the pattern. Check that all the stmts of the node are in the
276 if (def_stmt && gimple_bb (def_stmt)
277 && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
278 || (!loop && gimple_bb (def_stmt) == BB_VINFO_BB (bb_vinfo)
279 && gimple_code (def_stmt) != GIMPLE_PHI))
280 && vinfo_for_stmt (def_stmt)
281 && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
282 && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
283 && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
/* Reject mixes where some stmts of the node are in a pattern and
   others are not.  */
286 if (!first && !oprnd_info->first_pattern)
288 if (dump_enabled_p ())
290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
291 "Build SLP failed: some of the stmts"
292 " are in a pattern, and others are not ");
293 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
/* Follow the pattern: use the related stmt as the real def.  */
299 def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
300 dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));
302 if (dt == vect_unknown_def_type)
304 if (dump_enabled_p ())
305 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
306 "Unsupported pattern.");
310 switch (gimple_code (def_stmt))
313 def = gimple_phi_result (def_stmt);
317 def = gimple_assign_lhs (def_stmt);
321 if (dump_enabled_p ())
322 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
323 "unsupported defining stmt: ");
/* First stmt of the group: record the reference def-type info that
   later stmts must match.  */
330 oprnd_info->first_dt = dt;
331 oprnd_info->first_pattern = pattern;
334 oprnd_info->first_def_type = TREE_TYPE (def);
335 oprnd_info->first_const_oprnd = NULL_TREE;
339 oprnd_info->first_def_type = NULL_TREE;
340 oprnd_info->first_const_oprnd = oprnd;
347 /* Analyze costs (for the first stmt of the group only). */
348 if (REFERENCE_CLASS_P (lhs))
350 vect_model_store_cost (stmt_info, ncopies_for_cost, false,
351 dt, slp_node, prologue_cost_vec,
355 enum vect_def_type dts[2];
357 dts[1] = vect_uninitialized_def;
358 /* Not memory operation (we don't call this function for
360 vect_model_simple_cost (stmt_info, ncopies_for_cost, dts,
361 prologue_cost_vec, body_cost_vec);
367 /* Not first stmt of the group, check that the def-stmt/s match
368 the def-stmt/s of the first stmt. Allow different definition
369 types for reduction chains: the first stmt must be a
370 vect_reduction_def (a phi node), and the rest
371 vect_internal_def. */
372 if (((oprnd_info->first_dt != dt
373 && !(oprnd_info->first_dt == vect_reduction_def
374 && dt == vect_internal_def))
375 || (oprnd_info->first_def_type != NULL_TREE
377 && !types_compatible_p (oprnd_info->first_def_type,
380 && !types_compatible_p (TREE_TYPE (oprnd_info->first_const_oprnd),
384 if (number_of_oprnds != 2)
386 if (dump_enabled_p ())
387 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
388 "Build SLP failed: different types ")
393 /* Try to swap operands in case of binary operation. */
395 different_types = true;
/* Swapping is legal only for a commutative binary op whose swapped
   operand def-types/types still match the recorded first-stmt info.  */
398 oprnd0_info = (*oprnds_info)[0];
399 if (is_gimple_assign (stmt)
400 && (rhs_code = gimple_assign_rhs_code (stmt))
401 && TREE_CODE_CLASS (rhs_code) == tcc_binary
402 && commutative_tree_code (rhs_code)
403 && oprnd0_info->first_dt == dt
404 && oprnd_info->first_dt == dt_op0
406 && !(oprnd0_info->first_def_type
407 && !types_compatible_p (oprnd0_info->first_def_type,
409 && !(oprnd_info->first_def_type
410 && !types_compatible_p (oprnd_info->first_def_type,
411 TREE_TYPE (def_op0))))
413 if (dump_enabled_p ())
415 dump_printf_loc (MSG_NOTE, vect_location,
416 "Swapping operands of ");
417 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
420 swap_tree_operands (stmt, gimple_assign_rhs1_ptr (stmt),
421 gimple_assign_rhs2_ptr (stmt));
425 if (dump_enabled_p ())
426 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
427 "Build SLP failed: different types ");
435 /* Check the types of the definitions. */
438 case vect_constant_def:
439 case vect_external_def:
440 case vect_reduction_def:
443 case vect_internal_def:
446 oprnd0_info = (*oprnds_info)[0];
/* NOTE(review): oprnd1_info indexes element [0] here, same as
   oprnd0_info; upstream GCC uses (*oprnds_info)[1].  Verify against
   the original source -- this looks like a transcription error.  */
447 oprnd1_info = (*oprnds_info)[0];
449 oprnd1_info->def_stmts.quick_push (def_stmt);
451 oprnd0_info->def_stmts.quick_push (def_stmt);
454 oprnd_info->def_stmts.quick_push (def_stmt);
459 /* FORNOW: Not supported. */
460 if (dump_enabled_p ())
462 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
463 "Build SLP failed: illegal type of def ");
464 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def);
475 /* Recursively build an SLP tree starting from NODE.
476 Fail (and return FALSE) if def-stmts are not isomorphic, require data
477 permutation or are of unsupported types of operation. Otherwise, return
/* NOTE(review): elided listing -- the embedded original line numbers jump,
   so braces, declarations (i, nops, lhs, cond, dummy, optab, icode, ncopies,
   child, ...), early `return false`s and some statements are missing.
   Comments below annotate visible logic only.  */
481 vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
482 slp_tree *node, unsigned int group_size, int *outside_cost,
483 int ncopies_for_cost, unsigned int *max_nunits,
484 vec<int> *load_permutation,
485 vec<slp_tree> *loads,
486 unsigned int vectorization_factor, bool *loads_permuted,
487 stmt_vector_for_cost *prologue_cost_vec,
488 stmt_vector_for_cost *body_cost_vec)
491 vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (*node);
492 gimple stmt = stmts[0];
493 enum tree_code first_stmt_code = ERROR_MARK, rhs_code = ERROR_MARK;
494 enum tree_code first_cond_code = ERROR_MARK;
496 bool stop_recursion = false, need_same_oprnds = false;
497 tree vectype, scalar_type, first_op1 = NULL_TREE;
500 enum machine_mode optab_op2_mode;
501 enum machine_mode vec_mode;
502 struct data_reference *first_dr;
504 bool permutation = false;
505 unsigned int load_place;
506 gimple first_load = NULL, prev_first_load = NULL, old_first_load = NULL;
507 vec<slp_oprnd_info> oprnds_info;
509 slp_oprnd_info oprnd_info;
/* Operand count of the first stmt sets the expected shape for the group.  */
512 if (is_gimple_call (stmt))
513 nops = gimple_call_num_args (stmt);
514 else if (is_gimple_assign (stmt))
516 nops = gimple_num_ops (stmt) - 1;
517 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
523 oprnds_info = vect_create_oprnd_info (nops, group_size);
525 /* For every stmt in NODE find its def stmt/s. */
526 FOR_EACH_VEC_ELT (stmts, i, stmt)
528 if (dump_enabled_p ())
530 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
531 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
534 /* Fail to vectorize statements marked as unvectorizable. */
535 if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
537 if (dump_enabled_p ())
539 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
540 "Build SLP failed: unvectorizable statement ");
541 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
544 vect_free_oprnd_info (oprnds_info);
548 lhs = gimple_get_lhs (stmt);
549 if (lhs == NULL_TREE)
551 if (dump_enabled_p ())
553 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
554 "Build SLP failed: not GIMPLE_ASSIGN nor "
556 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
559 vect_free_oprnd_info (oprnds_info);
/* COND_EXPR conditions must be a comparison tree to be handled.  */
563 if (is_gimple_assign (stmt)
564 && gimple_assign_rhs_code (stmt) == COND_EXPR
565 && (cond = gimple_assign_rhs1 (stmt))
566 && !COMPARISON_CLASS_P (cond))
568 if (dump_enabled_p ())
570 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
571 "Build SLP failed: condition is not "
573 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
576 vect_free_oprnd_info (oprnds_info);
580 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
581 vectype = get_vectype_for_scalar_type (scalar_type);
584 if (dump_enabled_p ())
586 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
587 "Build SLP failed: unsupported data-type ");
588 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
592 vect_free_oprnd_info (oprnds_info);
596 /* In case of multiple types we need to detect the smallest type. */
597 if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
599 *max_nunits = TYPE_VECTOR_SUBPARTS (vectype);
601 vectorization_factor = *max_nunits;
/* Calls are allowed only if they are plain, nothrow, chainless.  */
604 if (is_gimple_call (stmt))
606 rhs_code = CALL_EXPR;
607 if (gimple_call_internal_p (stmt)
608 || gimple_call_tail_p (stmt)
609 || gimple_call_noreturn_p (stmt)
610 || !gimple_call_nothrow_p (stmt)
611 || gimple_call_chain (stmt))
613 if (dump_enabled_p ())
615 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
616 "Build SLP failed: unsupported call type ");
617 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
620 vect_free_oprnd_info (oprnds_info);
625 rhs_code = gimple_assign_rhs_code (stmt);
627 /* Check the operation. */
630 first_stmt_code = rhs_code;
632 /* Shift arguments should be equal in all the packed stmts for a
633 vector shift with scalar shift operand. */
634 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
635 || rhs_code == LROTATE_EXPR
636 || rhs_code == RROTATE_EXPR)
638 vec_mode = TYPE_MODE (vectype);
640 /* First see if we have a vector/vector shift. */
641 optab = optab_for_tree_code (rhs_code, vectype,
645 || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
647 /* No vector/vector shift, try for a vector/scalar shift. */
648 optab = optab_for_tree_code (rhs_code, vectype,
653 if (dump_enabled_p ())
654 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
655 "Build SLP failed: no optab.");
656 vect_free_oprnd_info (oprnds_info);
659 icode = (int) optab_handler (optab, vec_mode);
660 if (icode == CODE_FOR_nothing)
662 if (dump_enabled_p ())
663 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
665 "op not supported by target.");
666 vect_free_oprnd_info (oprnds_info);
669 optab_op2_mode = insn_data[icode].operand[2].mode;
/* Scalar shift operand: all stmts in the group must share it.  */
670 if (!VECTOR_MODE_P (optab_op2_mode))
672 need_same_oprnds = true;
673 first_op1 = gimple_assign_rhs2 (stmt);
677 else if (rhs_code == WIDEN_LSHIFT_EXPR)
679 need_same_oprnds = true;
680 first_op1 = gimple_assign_rhs2 (stmt);
/* Subsequent stmts: operation must match the first stmt's, modulo
   real/imag pairing and mixed memory-reference codes for grouped
   accesses.  */
685 if (first_stmt_code != rhs_code
686 && (first_stmt_code != IMAGPART_EXPR
687 || rhs_code != REALPART_EXPR)
688 && (first_stmt_code != REALPART_EXPR
689 || rhs_code != IMAGPART_EXPR)
690 && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
691 && (first_stmt_code == ARRAY_REF
692 || first_stmt_code == BIT_FIELD_REF
693 || first_stmt_code == INDIRECT_REF
694 || first_stmt_code == COMPONENT_REF
695 || first_stmt_code == MEM_REF)))
697 if (dump_enabled_p ())
699 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
700 "Build SLP failed: different operation "
702 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
705 vect_free_oprnd_info (oprnds_info);
710 && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
712 if (dump_enabled_p ())
714 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
715 "Build SLP failed: different shift "
717 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
720 vect_free_oprnd_info (oprnds_info);
/* Calls must agree in callee, arg count and fntype across the group.  */
724 if (rhs_code == CALL_EXPR)
726 gimple first_stmt = stmts[0];
727 if (gimple_call_num_args (stmt) != nops
728 || !operand_equal_p (gimple_call_fn (first_stmt),
729 gimple_call_fn (stmt), 0)
730 || gimple_call_fntype (first_stmt)
731 != gimple_call_fntype (stmt))
733 if (dump_enabled_p ())
735 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
736 "Build SLP failed: different calls in ");
737 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
741 vect_free_oprnd_info (oprnds_info);
747 /* Grouped store or load. */
748 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
750 if (REFERENCE_CLASS_P (lhs))
753 if (!vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo, *node,
754 stmt, ncopies_for_cost,
755 (i == 0), &oprnds_info,
759 vect_free_oprnd_info (oprnds_info);
766 unsigned unrolling_factor
767 = least_common_multiple
768 (*max_nunits, group_size) / group_size;
769 /* FORNOW: Check that there is no gap between the loads
770 and no gap between the groups when we need to load
771 multiple groups at once.
772 ??? We should enhance this to only disallow gaps
774 if ((unrolling_factor > 1
775 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
776 && GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
777 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt
778 && GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
780 if (dump_enabled_p ())
782 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
783 "Build SLP failed: grouped "
785 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
789 vect_free_oprnd_info (oprnds_info);
793 /* Check that the size of interleaved loads group is not
794 greater than the SLP group size. */
796 = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
798 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
799 && ((GROUP_SIZE (vinfo_for_stmt (stmt))
800 - GROUP_GAP (vinfo_for_stmt (stmt)))
801 > ncopies * group_size))
803 if (dump_enabled_p ())
805 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
806 "Build SLP failed: the number "
807 "of interleaved loads is greater than "
808 "the SLP group size ");
809 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
813 vect_free_oprnd_info (oprnds_info);
817 old_first_load = first_load;
818 first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
821 /* Check that there are no loads from different interleaving
822 chains in the same node. The only exception is complex
824 if (prev_first_load != first_load
825 && rhs_code != REALPART_EXPR
826 && rhs_code != IMAGPART_EXPR)
828 if (dump_enabled_p ())
830 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
832 "Build SLP failed: different "
833 "interleaving chains in one node ");
834 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
838 vect_free_oprnd_info (oprnds_info);
843 prev_first_load = first_load;
845 /* In some cases a group of loads is just the same load
846 repeated N times. Only analyze its cost once. */
847 if (first_load == stmt && old_first_load != first_load)
849 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
850 if (vect_supportable_dr_alignment (first_dr, false)
851 == dr_unaligned_unsupported)
853 if (dump_enabled_p ())
855 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
857 "Build SLP failed: unsupported "
859 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
863 vect_free_oprnd_info (oprnds_info);
867 /* Analyze costs (for the first stmt in the group). */
868 vect_model_load_cost (vinfo_for_stmt (stmt),
869 ncopies_for_cost, false, *node,
870 prologue_cost_vec, body_cost_vec);
873 /* Store the place of this load in the interleaving chain. In
874 case that permutation is needed we later decide if a specific
875 permutation is supported. */
876 load_place = vect_get_place_in_interleaving_chain (stmt,
881 load_permutation->safe_push (load_place);
883 /* We stop the tree when we reach a group of loads. */
884 stop_recursion = true;
887 } /* Grouped access. */
890 if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
892 /* Not grouped load. */
893 if (dump_enabled_p ())
895 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
896 "Build SLP failed: not grouped load ");
897 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
900 /* FORNOW: Not grouped loads are not supported. */
901 vect_free_oprnd_info (oprnds_info);
905 /* Not memory operation. */
906 if (TREE_CODE_CLASS (rhs_code) != tcc_binary
907 && TREE_CODE_CLASS (rhs_code) != tcc_unary
908 && rhs_code != COND_EXPR
909 && rhs_code != CALL_EXPR)
911 if (dump_enabled_p ())
913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
914 "Build SLP failed: operation");
915 dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
916 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
919 vect_free_oprnd_info (oprnds_info);
/* All COND_EXPRs in the group must use the same comparison code.  */
923 if (rhs_code == COND_EXPR)
925 tree cond_expr = gimple_assign_rhs1 (stmt);
928 first_cond_code = TREE_CODE (cond_expr);
929 else if (first_cond_code != TREE_CODE (cond_expr))
931 if (dump_enabled_p ())
933 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
934 "Build SLP failed: different"
936 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
940 vect_free_oprnd_info (oprnds_info);
945 /* Find the def-stmts. */
946 if (!vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo, *node, stmt,
947 ncopies_for_cost, (i == 0),
948 &oprnds_info, prologue_cost_vec,
951 vect_free_oprnd_info (oprnds_info);
957 /* Grouped loads were reached - stop the recursion. */
960 loads->safe_push (*node);
963 gimple first_stmt = stmts[0];
964 *loads_permuted = true;
965 (void) record_stmt_cost (body_cost_vec, group_size, vec_perm,
966 vinfo_for_stmt (first_stmt), 0, vect_body);
970 /* We don't check here complex numbers chains, so we set
971 LOADS_PERMUTED for further check in
972 vect_supported_load_permutation_p. */
973 if (rhs_code == REALPART_EXPR || rhs_code == IMAGPART_EXPR)
974 *loads_permuted = true;
977 vect_free_oprnd_info (oprnds_info);
981 /* Create SLP_TREE nodes for the definition node/s. */
982 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
986 if (oprnd_info->first_dt != vect_internal_def)
989 child = vect_create_new_slp_node (oprnd_info->def_stmts);
991 || !vect_build_slp_tree (loop_vinfo, bb_vinfo, &child, group_size,
992 outside_cost, ncopies_for_cost,
993 max_nunits, load_permutation, loads,
994 vectorization_factor, loads_permuted,
995 prologue_cost_vec, body_cost_vec))
/* Child creation/build failed: ownership of def_stmts moved into
   CHILD, so clear it here before freeing to avoid a double release.  */
998 oprnd_info->def_stmts = vNULL;
999 vect_free_slp_tree (child);
1000 vect_free_oprnd_info (oprnds_info);
1004 oprnd_info->def_stmts.create (0);
1005 SLP_TREE_CHILDREN (*node).quick_push (child);
1008 vect_free_oprnd_info (oprnds_info);
1012 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
1015 vect_print_slp_tree (int dump_kind, slp_tree node)
1024 dump_printf (dump_kind, "node ");
1025 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1027 dump_printf (dump_kind, "\n\tstmt %d ", i);
1028 dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
1030 dump_printf (dump_kind, "\n");
1032 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1033 vect_print_slp_tree (dump_kind, child);
1037 /* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
1038 If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
1039 J). Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
1040 stmts in NODE are to be marked. */
1043 vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
1052 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1053 if (j < 0 || i == j)
1054 STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;
1056 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1057 vect_mark_slp_stmts (child, mark, j);
1061 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
1064 vect_mark_slp_stmts_relevant (slp_tree node)
1068 stmt_vec_info stmt_info;
1074 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1076 stmt_info = vinfo_for_stmt (stmt);
1077 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
1078 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
1079 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
1082 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1083 vect_mark_slp_stmts_relevant (child);
1087 /* Check if the permutation required by the SLP INSTANCE is supported.
1088 Reorganize the SLP nodes stored in SLP_INSTANCE_LOADS if needed. */
/* NOTE(review): elided listing -- return type, braces, some declarations
   (index, load) and the `return false`/`return true` paths are missing
   (the embedded original line numbers jump).  Annotations cover visible
   logic only.  */
1091 vect_supported_slp_permutation_p (slp_instance instance)
1093 slp_tree node = SLP_INSTANCE_LOADS (instance)[0];
1094 gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1095 gimple first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
1096 vec<slp_tree> sorted_loads = vNULL;
1098 slp_tree *tmp_loads = NULL;
1099 int group_size = SLP_INSTANCE_GROUP_SIZE (instance), i, j;
1102 /* FORNOW: The only supported loads permutation is loads from the same
1103 location in all the loads in the node, when the data-refs in
1104 nodes of LOADS constitute an interleaving chain.
1105 Sort the nodes according to the order of accesses in the chain. */
1106 tmp_loads = (slp_tree *) xmalloc (sizeof (slp_tree) * group_size);
/* Walk the permutation in strides of GROUP_SIZE, pairing each stride's
   first index with the corresponding load node.  */
1108 SLP_INSTANCE_LOAD_PERMUTATION (instance).iterate (i, &index)
1109 && SLP_INSTANCE_LOADS (instance).iterate (j, &load);
1110 i += group_size, j++)
1112 gimple scalar_stmt = SLP_TREE_SCALAR_STMTS (load)[0];
1113 /* Check that the loads are all in the same interleaving chain. */
1114 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (scalar_stmt)) != first_load)
1116 if (dump_enabled_p ())
1118 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1119 "Build SLP failed: unsupported data "
1121 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
/* Place the node at its position within the chain.  */
1129 tmp_loads[index] = load;
/* Replace the instance's load list with the chain-ordered one.  */
1132 sorted_loads.create (group_size);
1133 for (i = 0; i < group_size; i++)
1134 sorted_loads.safe_push (tmp_loads[i]);
1136 SLP_INSTANCE_LOADS (instance).release ();
1137 SLP_INSTANCE_LOADS (instance) = sorted_loads;
1140 if (!vect_transform_slp_perm_load (stmt, vNULL, NULL,
1141 SLP_INSTANCE_UNROLLING_FACTOR (instance),
1149 /* Rearrange the statements of NODE according to PERMUTATION. */
1152 vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
1153 vec<int> permutation)
1156 vec<gimple> tmp_stmts;
1160 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1161 vect_slp_rearrange_stmts (child, group_size, permutation);
1163 gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
1164 tmp_stmts.create (group_size);
1165 tmp_stmts.quick_grow_cleared (group_size);
1167 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1168 tmp_stmts[permutation[i]] = stmt;
1170 SLP_TREE_SCALAR_STMTS (node).release ();
1171 SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
1175 /* Check if the required load permutation is supported.
1176 LOAD_PERMUTATION contains a list of indices of the loads.
1177 In SLP this permutation is relative to the order of grouped stores that are
1178 the base of the SLP instance. */
/* NOTE(review): elided listing -- return type, braces, decls (load_index),
   early returns and loop increments are missing (the embedded original
   line numbers jump).  Annotations describe visible logic only.  */
1181 vect_supported_load_permutation_p (slp_instance slp_instn, int group_size,
1182 vec<int> load_permutation)
1184 int i = 0, j, prev = -1, next, k, number_of_groups;
1185 bool supported, bad_permutation = false;
1187 slp_tree node, other_complex_node;
1188 gimple stmt, first = NULL, other_node_first, load, next_load, first_load;
1189 unsigned complex_numbers = 0;
1190 struct data_reference *dr;
1191 bb_vec_info bb_vinfo;
1193 /* FORNOW: permutations are only supported in SLP. */
1197 if (dump_enabled_p ())
1199 dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
1200 FOR_EACH_VEC_ELT (load_permutation, i, next)
1201 dump_printf (MSG_NOTE, "%d ", next);
1204 /* In case of reduction every load permutation is allowed, since the order
1205 of the reduction statements is not important (as opposed to the case of
1206 grouped stores). The only condition we need to check is that all the
1207 load nodes are of the same size and have the same permutation (and then
1208 rearrange all the nodes of the SLP instance according to this
1211 /* Check that all the load nodes are of the same size. */
1212 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1214 if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
1217 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
/* Count nodes whose first stmt is a real/imag part extraction.  */
1218 if (is_gimple_assign (stmt)
1219 && (gimple_assign_rhs_code (stmt) == REALPART_EXPR
1220 || gimple_assign_rhs_code (stmt) == IMAGPART_EXPR))
1224 /* Complex operands can be swapped as following:
1225 real_c = real_b + real_a;
1226 imag_c = imag_a + imag_b;
1227 i.e., we have {real_b, imag_a} and {real_a, imag_b} instead of
1228 {real_a, imag_a} and {real_b, imag_b}. We check here that if interleaving
1229 chains are mixed, they match the above pattern. */
1230 if (complex_numbers)
1232 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1234 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, stmt)
1240 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != first)
1242 if (complex_numbers != 2)
1250 other_complex_node = SLP_INSTANCE_LOADS (slp_instn)[k];
1252 SLP_TREE_SCALAR_STMTS (other_complex_node)[0];
/* The stray stmt must come from the other node's chain.  */
1254 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
1255 != other_node_first)
1263 /* We checked that this case is ok, so there is no need to proceed with
1264 permutation tests. */
1265 if (complex_numbers == 2
1266 && SLP_INSTANCE_LOADS (slp_instn).length () == 2)
1268 SLP_INSTANCE_LOADS (slp_instn).release ();
1269 SLP_INSTANCE_LOAD_PERMUTATION (slp_instn).release ();
1273 node = SLP_INSTANCE_TREE (slp_instn);
1274 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1275 /* LOAD_PERMUTATION is a list of indices of all the loads of the SLP
1276 instance, not all the loads belong to the same node or interleaving
1277 group. Hence, we need to divide them into groups according to
1279 number_of_groups = load_permutation.length () / group_size;
1281 /* Reduction (there are no data-refs in the root).
1282 In reduction chain the order of the loads is important. */
1283 if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
1284 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1286 int first_group_load_index;
1288 /* Compare all the permutation sequences to the first one. */
1289 for (i = 1; i < number_of_groups; i++)
1292 for (j = i * group_size; j < i * group_size + group_size; j++)
1294 next = load_permutation[j];
1295 first_group_load_index = load_permutation[k];
1297 if (next != first_group_load_index)
1299 bad_permutation = true;
1306 if (bad_permutation)
1310 if (!bad_permutation)
1312 /* Check that the loads in the first sequence are different and there
1313 are no gaps between them. */
1314 load_index = sbitmap_alloc (group_size);
1315 bitmap_clear (load_index);
1316 for (k = 0; k < group_size; k++)
1318 first_group_load_index = load_permutation[k];
1319 if (bitmap_bit_p (load_index, first_group_load_index))
1321 bad_permutation = true;
1325 bitmap_set_bit (load_index, first_group_load_index);
/* Every index in [0, group_size) must appear exactly once.  */
1328 if (!bad_permutation)
1329 for (k = 0; k < group_size; k++)
1330 if (!bitmap_bit_p (load_index, k))
1332 bad_permutation = true;
1336 sbitmap_free (load_index);
1339 if (!bad_permutation)
1341 /* This permutation is valid for reduction. Since the order of the
1342 statements in the nodes is not important unless they are memory
1343 accesses, we can rearrange the statements in all the nodes
1344 according to the order of the loads. */
1345 vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
1347 SLP_INSTANCE_LOAD_PERMUTATION (slp_instn).release ();
1352 /* In basic block vectorization we allow any subchain of an interleaving
1354 FORNOW: not supported in loop SLP because of realignment complications. */
1355 bb_vinfo = STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt));
1356 bad_permutation = false;
1357 /* Check that for every node in the instance the loads form a subchain. */
1360 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1364 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
1367 first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (load));
1369 != GROUP_FIRST_ELEMENT (vinfo_for_stmt (load)))
1371 bad_permutation = true;
/* Consecutive stmts of the node must be consecutive chain links.  */
1375 if (j != 0 && next_load != load)
1377 bad_permutation = true;
1381 next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
1384 if (bad_permutation)
1388 /* Check that the alignment of the first load in every subchain, i.e.,
1389 the first statement in every load node, is supported. */
1390 if (!bad_permutation)
1392 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1394 first_load = SLP_TREE_SCALAR_STMTS (node)[0];
1396 != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load)))
1398 dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load));
1399 if (vect_supportable_dr_alignment (dr, false)
1400 == dr_unaligned_unsupported)
1402 if (dump_enabled_p ())
1404 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1406 "unsupported unaligned load ");
1407 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1410 bad_permutation = true;
1416 if (!bad_permutation)
1418 SLP_INSTANCE_LOAD_PERMUTATION (slp_instn).release ();
1424 /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
1425 GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
1426 well (unless it's reduction). */
1427 if (load_permutation.length ()
1428 != (unsigned int) (group_size * group_size))
1432 load_index = sbitmap_alloc (group_size);
1433 bitmap_clear (load_index);
1434 for (j = 0; j < group_size; j++)
1436 for (i = j * group_size, k = 0;
1437 load_permutation.iterate (i, &next) && k < group_size;
/* Each stride must repeat a single index GROUP_SIZE times.  */
1440 if (i != j * group_size && next != prev)
1449 if (bitmap_bit_p (load_index, prev))
1455 bitmap_set_bit (load_index, prev);
/* Verify the whole index range [0, group_size) is covered.  */
1458 for (j = 0; j < group_size; j++)
1459 if (!bitmap_bit_p (load_index, j))
1461 sbitmap_free (load_index);
1465 sbitmap_free (load_index);
1467 if (supported && i == group_size * group_size
1468 && vect_supported_slp_permutation_p (slp_instn))
1475 /* Find the first load in the loop that belongs to INSTANCE.
1476 When loads are in several SLP nodes, there can be a case in which the first
1477 load does not appear in the first SLP node to be transformed, causing
1478 incorrect order of statements. Since we generate all the loads together,
1479 they must be inserted before the first load of the SLP instance and not
1480 before the first load of the first node of the instance. */
/* NOTE(review): elided excerpt -- the return type, braces, the declarations
   of the loop indices and LOAD_NODE, and the final return of FIRST_LOAD are
   not visible here.  */
1483 vect_find_first_load_in_slp_instance (slp_instance instance)
1487 gimple first_load = NULL, load;
/* Scan every scalar load of every load node of the instance, keeping the
   statement that occurs earliest in the IL.  */
1489 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
1490 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
1491 first_load = get_earlier_stmt (load, first_load);
1497 /* Find the last store in SLP INSTANCE. */
/* NOTE(review): elided excerpt -- the return type, braces, and the
   declarations of NODE and I are not visible; presumably LAST_STORE is
   returned at the end.  */
1500 vect_find_last_store_in_slp_instance (slp_instance instance)
1504 gimple last_store = NULL, store;
/* The grouped stores live in the root node of the SLP tree.  */
1506 node = SLP_INSTANCE_TREE (instance);
1507 for (i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &store); i++)
1508 last_store = get_later_stmt (store, last_store);
1514 /* Analyze an SLP instance starting from a group of grouped stores. Call
1515 vect_build_slp_tree to build a tree of packed stmts if possible.
1516 Return FALSE if it's impossible to SLP any stmt in the loop. */
/* NOTE(review): elided excerpt -- the return type, the STMT parameter on the
   signature line, most braces, else branches and return statements are not
   visible in this listing.  Code kept byte-identical; comments only.  */
1519 vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
1522 slp_instance new_instance;
1524 unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
1525 unsigned int unrolling_factor = 1, nunits;
1526 tree vectype, scalar_type = NULL_TREE;
1528 unsigned int vectorization_factor = 0;
1529 int outside_cost = 0, ncopies_for_cost, i;
1530 unsigned int max_nunits = 0;
1531 vec<int> load_permutation;
1532 vec<slp_tree> loads;
1533 struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
1534 bool loads_permuted = false;
1535 vec<gimple> scalar_stmts;
1536 stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
1537 stmt_info_for_cost *si;
/* Determine the vector type and group size.  Three cases are visible:
   grouped store/load (take the scalar type from the data reference),
   reduction chain (vectype from the stmt_vinfo), and plain reductions
   (group size is the number of reductions in the loop).  */
1539 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1543 scalar_type = TREE_TYPE (DR_REF (dr));
1544 vectype = get_vectype_for_scalar_type (scalar_type);
1548 gcc_assert (loop_vinfo);
1549 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1552 group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
1556 gcc_assert (loop_vinfo);
1557 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1558 group_size = LOOP_VINFO_REDUCTIONS (loop_vinfo).length ();
/* Failure path when no vector type could be determined.  */
1563 if (dump_enabled_p ())
1565 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1566 "Build SLP failed: unsupported data-type ");
1567 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
1573 nunits = TYPE_VECTOR_SUBPARTS (vectype);
1575 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1577 vectorization_factor = nunits;
1579 /* Calculate the unrolling factor. */
1580 unrolling_factor = least_common_multiple (nunits, group_size) / group_size;
/* Basic-block SLP cannot unroll, so any unrolling requirement is fatal.  */
1581 if (unrolling_factor != 1 && !loop_vinfo)
1583 if (dump_enabled_p ())
1584 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1585 "Build SLP failed: unrolling required in basic"
1591 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
1592 scalar_stmts.create (group_size)
1594 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1596 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
/* Prefer the pattern stmt over the original when one exists.  */
1599 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
1600 && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
1601 scalar_stmts.safe_push (
1602 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
1604 scalar_stmts.safe_push (next);
1605 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
1610 /* Collect reduction statements. */
1611 vec<gimple> reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
1612 for (i = 0; reductions.iterate (i, &next); i++)
1613 scalar_stmts.safe_push (next);
1616 node = vect_create_new_slp_node (scalar_stmts);
1618 /* Calculate the number of vector stmts to create based on the unrolling
1619 factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
1620 GROUP_SIZE / NUNITS otherwise. */
1621 ncopies_for_cost = unrolling_factor * group_size / nunits;
1623 load_permutation.create (group_size * group_size);
1624 loads.create (group_size);
1625 prologue_cost_vec.create (10);
1626 body_cost_vec.create (10);
1628 /* Build the tree for the SLP instance. */
1629 if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &node, group_size,
1630 &outside_cost, ncopies_for_cost,
1631 &max_nunits, &load_permutation, &loads,
1632 vectorization_factor, &loads_permuted,
1633 &prologue_cost_vec, &body_cost_vec))
1635 void *data = (loop_vinfo ? LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
1636 : BB_VINFO_TARGET_COST_DATA (bb_vinfo));
1638 /* Calculate the unrolling factor based on the smallest type. */
1639 if (max_nunits > nunits)
1640 unrolling_factor = least_common_multiple (max_nunits, group_size)
/* Re-check the no-unrolling constraint now that MAX_NUNITS is known.  */
1643 if (unrolling_factor != 1 && !loop_vinfo)
1645 if (dump_enabled_p ())
1646 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1647 "Build SLP failed: unrolling required in basic"
1649 vect_free_slp_tree (node);
1650 body_cost_vec.release ();
1651 prologue_cost_vec.release ();
1652 load_permutation.release ();
1657 /* Create a new SLP instance. */
1658 new_instance = XNEW (struct _slp_instance);
1659 SLP_INSTANCE_TREE (new_instance) = node;
1660 SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
1661 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
1662 SLP_INSTANCE_BODY_COST_VEC (new_instance) = body_cost_vec;
1663 SLP_INSTANCE_LOADS (new_instance) = loads;
1664 SLP_INSTANCE_FIRST_LOAD_STMT (new_instance) = NULL;
1665 SLP_INSTANCE_LOAD_PERMUTATION (new_instance) = load_permutation;
/* If the loads were permuted the permutation must be supportable,
   otherwise the whole instance is discarded.  */
1669 if (!vect_supported_load_permutation_p (new_instance, group_size,
1672 if (dump_enabled_p ())
1674 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1675 "Build SLP failed: unsupported load "
1677 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
1680 vect_free_slp_instance (new_instance);
1681 prologue_cost_vec.release ();
1685 SLP_INSTANCE_FIRST_LOAD_STMT (new_instance)
1686 = vect_find_first_load_in_slp_instance (new_instance);
1689 SLP_INSTANCE_LOAD_PERMUTATION (new_instance).release ();
1691 /* Record the prologue costs, which were delayed until we were
1692 sure that SLP was successful. Unlike the body costs, we know
1693 the final values now regardless of the loop vectorization factor. */
1694 FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
1696 struct _stmt_vec_info *stmt_info
1697 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1698 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1699 si->misalign, vect_prologue);
1702 prologue_cost_vec.release ();
/* Register the instance with the loop or basic-block info.  */
1705 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).safe_push (new_instance);
1707 BB_VINFO_SLP_INSTANCES (bb_vinfo).safe_push (new_instance);
1709 if (dump_enabled_p ())
1710 vect_print_slp_tree (MSG_NOTE, node);
1716 body_cost_vec.release ();
1717 prologue_cost_vec.release ();
1720 /* Failed to SLP. */
1721 /* Free the allocated memory. */
1722 vect_free_slp_tree (node);
1723 load_permutation.release ();
1730 /* Check if there are stmts in the loop can be vectorized using SLP. Build SLP
1731 trees of packed scalar stmts if SLP is possible. */
/* NOTE(review): elided excerpt -- return type, braces, the declarations of I
   and OK, and the final return are not visible here.  */
1734 vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
1737 vec<gimple> grouped_stores;
1738 vec<gimple> reductions = vNULL;
1739 vec<gimple> reduc_chains = vNULL;
1740 gimple first_element;
1743 if (dump_enabled_p ())
1744 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===");
/* Loop SLP pulls the candidate groups from the loop info; BB SLP only has
   grouped stores.  */
1748 grouped_stores = LOOP_VINFO_GROUPED_STORES (loop_vinfo);
1749 reduc_chains = LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo);
1750 reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
1753 grouped_stores = BB_VINFO_GROUPED_STORES (bb_vinfo);
1755 /* Find SLP sequences starting from groups of grouped stores. */
1756 FOR_EACH_VEC_ELT (grouped_stores, i, first_element)
1757 if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element))
/* For basic-block vectorization, failing to build any instance from the
   stores means the whole block fails.  */
1760 if (bb_vinfo && !ok)
1762 if (dump_enabled_p ())
1763 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1764 "Failed to SLP the basic block.");
1770 && LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).length () > 0)
1772 /* Find SLP sequences starting from reduction chains. */
1773 FOR_EACH_VEC_ELT (reduc_chains, i, first_element)
1774 if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element))
1779 /* Don't try to vectorize SLP reductions if reduction chain was
1784 /* Find SLP sequences starting from groups of reductions. */
1785 if (loop_vinfo && LOOP_VINFO_REDUCTIONS (loop_vinfo).length () > 1
1786 && vect_analyze_slp_instance (loop_vinfo, bb_vinfo, reductions[0]))
1793 /* For each possible SLP instance decide whether to SLP it and calculate overall
1794 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
1795 least one instance. */
/* NOTE(review): elided excerpt -- return type, braces, and the increment of
   DECIDED_TO_SLP are not visible; the final return is at the bottom.  */
1798 vect_make_slp_decision (loop_vec_info loop_vinfo)
1800 unsigned int i, unrolling_factor = 1;
1801 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1802 slp_instance instance;
1803 int decided_to_slp = 0;
1805 if (dump_enabled_p ())
1806 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ===");
1808 FOR_EACH_VEC_ELT (slp_instances, i, instance)
1810 /* FORNOW: SLP if you can. */
/* The loop-wide SLP unrolling factor is the max over all instances.  */
1811 if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance))
1812 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance);
1814 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
1815 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
1816 loop-based vectorization. Such stmts will be marked as HYBRID. */
1817 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
1821 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
1823 if (decided_to_slp && dump_enabled_p ())
1824 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
1825 "Decided to SLP %d instances. Unrolling factor %d",
1826 decided_to_slp, unrolling_factor);
1828 return (decided_to_slp > 0);
1832 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
1833 can't be SLPed) in the tree rooted at NODE. Mark such stmts as HYBRID. */
/* NOTE(review): elided excerpt -- return type, braces, and the declarations
   of I, USE_STMT and CHILD are not visible here.  */
1836 vect_detect_hybrid_slp_stmts (slp_tree node)
1839 vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (node);
1840 gimple stmt = stmts[0];
1841 imm_use_iterator imm_iter;
1843 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1845 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1846 struct loop *loop = NULL;
1847 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_vinfo);
1848 basic_block bb = NULL;
1854 loop = LOOP_VINFO_LOOP (loop_vinfo);
1856 bb = BB_VINFO_BB (bb_vinfo);
/* A pure-SLP stmt becomes HYBRID if some use of its SSA result, inside the
   same loop/BB, is relevant but not itself SLPed -- excluding reduction
   PHI uses.  */
1858 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1859 if (PURE_SLP_STMT (vinfo_for_stmt (stmt))
1860 && TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME)
1861 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0))
1862 if (gimple_bb (use_stmt)
1863 && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1864 || bb == gimple_bb (use_stmt))
1865 && (stmt_vinfo = vinfo_for_stmt (use_stmt))
1866 && !STMT_SLP_TYPE (stmt_vinfo)
1867 && (STMT_VINFO_RELEVANT (stmt_vinfo)
1868 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo)))
1869 && !(gimple_code (use_stmt) == GIMPLE_PHI
1870 && STMT_VINFO_DEF_TYPE (stmt_vinfo)
1871 == vect_reduction_def))
1872 vect_mark_slp_stmts (node, hybrid, i);
/* Recurse into the SLP children.  */
1874 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1875 vect_detect_hybrid_slp_stmts (child);
1879 /* Find stmts that must be both vectorized and SLPed. */
/* NOTE(review): elided excerpt -- return type, braces, and the declaration
   of I are not visible here.  Walks every SLP instance of the loop and
   marks hybrid stmts in its tree.  */
1882 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
1885 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1886 slp_instance instance;
1888 if (dump_enabled_p ())
1889 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ===");
1891 FOR_EACH_VEC_ELT (slp_instances, i, instance)
1892 vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance));
1896 /* Create and initialize a new bb_vec_info struct for BB, as well as
1897 stmt_vec_info structs for all the stmts in it. */
/* NOTE(review): elided excerpt -- return type, braces, and the final
   `return res;` are not visible here.  Caller owns the returned struct
   (freed by destroy_bb_vec_info).  */
1900 new_bb_vec_info (basic_block bb)
1902 bb_vec_info res = NULL;
1903 gimple_stmt_iterator gsi;
1905 res = (bb_vec_info) xcalloc (1, sizeof (struct _bb_vec_info));
1906 BB_VINFO_BB (res) = bb;
/* Give every stmt in the block a fresh stmt_vec_info and reset its uid.  */
1908 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1910 gimple stmt = gsi_stmt (gsi);
1911 gimple_set_uid (stmt, 0);
1912 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, NULL, res));
1915 BB_VINFO_GROUPED_STORES (res).create (10);
1916 BB_VINFO_SLP_INSTANCES (res).create (2);
1917 BB_VINFO_TARGET_COST_DATA (res) = init_cost (NULL);
1924 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
1925 stmts in the basic block. */
/* NOTE(review): elided excerpt -- return type, braces, declarations of BB
   and I, and the final free of BB_VINFO itself are not visible here.
   Releases everything new_bb_vec_info and the analysis allocated.  */
1928 destroy_bb_vec_info (bb_vec_info bb_vinfo)
1930 vec<slp_instance> slp_instances;
1931 slp_instance instance;
1933 gimple_stmt_iterator si;
1939 bb = BB_VINFO_BB (bb_vinfo);
1941 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1943 gimple stmt = gsi_stmt (si);
1944 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1947 /* Free stmt_vec_info. */
1948 free_stmt_vec_info (stmt);
1951 free_data_refs (BB_VINFO_DATAREFS (bb_vinfo));
1952 free_dependence_relations (BB_VINFO_DDRS (bb_vinfo));
1953 BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
/* Free each SLP instance before releasing the containing vector.  */
1954 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
1955 FOR_EACH_VEC_ELT (slp_instances, i, instance)
1956 vect_free_slp_instance (instance);
1957 BB_VINFO_SLP_INSTANCES (bb_vinfo).release ();
1958 destroy_cost_data (BB_VINFO_TARGET_COST_DATA (bb_vinfo));
1964 /* Analyze statements contained in SLP tree node after recursively analyzing
1965 the subtree. Return TRUE if the operations are supported. */
/* NOTE(review): elided excerpt -- return type, braces, declarations of I,
   STMT, DUMMY and CHILD, and the return statements are not visible here.  */
1968 vect_slp_analyze_node_operations (bb_vec_info bb_vinfo, slp_tree node)
/* Children first: a node is only supportable if its whole subtree is.  */
1978 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1979 if (!vect_slp_analyze_node_operations (bb_vinfo, child))
1982 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1984 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1985 gcc_assert (stmt_info);
1986 gcc_assert (PURE_SLP_STMT (stmt_info));
1988 if (!vect_analyze_stmt (stmt, &dummy, node))
1996 /* Analyze statements in SLP instances of the basic block. Return TRUE if the
1997 operations are supported. */
/* NOTE(review): elided excerpt -- return type, braces, the declaration of I,
   the increment of I on the success path, and the returns are not visible.  */
2000 vect_slp_analyze_operations (bb_vec_info bb_vinfo)
2002 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2003 slp_instance instance;
/* No increment in the for-header: on removal the next instance shifts into
   slot I, so I is only advanced when the instance is kept.  */
2006 for (i = 0; slp_instances.iterate (i, &instance); )
2008 if (!vect_slp_analyze_node_operations (bb_vinfo,
2009 SLP_INSTANCE_TREE (instance)))
2011 vect_free_slp_instance (instance);
2012 slp_instances.ordered_remove (i);
/* Fail if every instance was rejected.  */
2018 if (!slp_instances.length ())
2024 /* Check if vectorization of the basic block is profitable. */
/* NOTE(review): elided excerpt -- return type, braces, declarations of I, J
   and STMT, and the return statements are not visible here.  */
2027 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2029 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2030 slp_instance instance;
2032 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2033 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2034 unsigned int stmt_cost;
2036 gimple_stmt_iterator si;
2037 basic_block bb = BB_VINFO_BB (bb_vinfo);
2038 void *target_cost_data = BB_VINFO_TARGET_COST_DATA (bb_vinfo);
2039 stmt_vec_info stmt_info = NULL;
2040 stmt_vector_for_cost body_cost_vec;
2041 stmt_info_for_cost *ci;
2043 /* Calculate vector costs. */
/* Feed every recorded body-cost entry of every instance into the target
   cost model.  */
2044 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2046 body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
2048 FOR_EACH_VEC_ELT (body_cost_vec, j, ci)
2050 stmt_info = ci->stmt ? vinfo_for_stmt (ci->stmt) : NULL;
2051 (void) add_stmt_cost (target_cost_data, ci->count, ci->kind,
2052 stmt_info, ci->misalign, vect_body);
2056 /* Calculate scalar cost. */
/* Only vectorizable pure-SLP stmts count toward the scalar baseline.  */
2057 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2059 stmt = gsi_stmt (si);
2060 stmt_info = vinfo_for_stmt (stmt);
2062 if (!stmt_info || !STMT_VINFO_VECTORIZABLE (stmt_info)
2063 || !PURE_SLP_STMT (stmt_info))
2066 if (STMT_VINFO_DATA_REF (stmt_info))
2068 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2069 stmt_cost = vect_get_stmt_cost (scalar_load);
2071 stmt_cost = vect_get_stmt_cost (scalar_store);
2074 stmt_cost = vect_get_stmt_cost (scalar_stmt);
2076 scalar_cost += stmt_cost;
2079 /* Complete the target-specific cost calculation. */
2080 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2081 &vec_inside_cost, &vec_epilogue_cost);
2083 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2085 if (dump_enabled_p ())
2087 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2088 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
2090 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
2091 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
2092 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d", scalar_cost);
2095 /* Vectorization is profitable if its cost is less than the cost of scalar
2097 if (vec_outside_cost + vec_inside_cost >= scalar_cost)
2103 /* Check if the basic block can be vectorized. */
/* NOTE(review): elided excerpt -- return type, braces, declarations of I and
   MIN_VF, the NULL returns on each failure path, and the final return of
   BB_VINFO are not visible.  Each failure path destroys BB_VINFO first, so
   no leak on the visible paths.  */
2106 vect_slp_analyze_bb_1 (basic_block bb)
2108 bb_vec_info bb_vinfo;
2109 vec<slp_instance> slp_instances;
2110 slp_instance instance;
2114 bb_vinfo = new_bb_vec_info (bb);
/* Analysis pipeline: data-refs -> accesses -> pattern recog -> dependences
   -> alignment -> SLP discovery -> alignment verify -> operations -> cost.  */
2118 if (!vect_analyze_data_refs (NULL, bb_vinfo, &min_vf))
2120 if (dump_enabled_p ())
2121 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2122 "not vectorized: unhandled data-ref in basic "
2125 destroy_bb_vec_info (bb_vinfo);
/* SLP needs at least two data refs to pack together.  */
2129 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
2131 if (dump_enabled_p ())
2132 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2133 "not vectorized: not enough data-refs in "
2136 destroy_bb_vec_info (bb_vinfo);
2140 if (!vect_analyze_data_ref_accesses (NULL, bb_vinfo))
2142 if (dump_enabled_p ())
2143 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2144 "not vectorized: unhandled data access in "
2147 destroy_bb_vec_info (bb_vinfo);
2151 vect_pattern_recog (NULL, bb_vinfo);
2153 if (!vect_slp_analyze_data_ref_dependences (bb_vinfo))
2155 if (dump_enabled_p ())
2156 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2157 "not vectorized: unhandled data dependence "
2158 "in basic block.\n");
2160 destroy_bb_vec_info (bb_vinfo);
2164 if (!vect_analyze_data_refs_alignment (NULL, bb_vinfo))
2166 if (dump_enabled_p ())
2167 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2168 "not vectorized: bad data alignment in basic "
2171 destroy_bb_vec_info (bb_vinfo);
2175 /* Check the SLP opportunities in the basic block, analyze and build SLP
2177 if (!vect_analyze_slp (NULL, bb_vinfo))
2179 if (dump_enabled_p ())
2180 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2181 "not vectorized: failed to find SLP opportunities "
2182 "in basic block.\n");
2184 destroy_bb_vec_info (bb_vinfo);
2188 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2190 /* Mark all the statements that we want to vectorize as pure SLP and
2192 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2194 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2195 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
2198 if (!vect_verify_datarefs_alignment (NULL, bb_vinfo))
2200 if (dump_enabled_p ())
2201 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2202 "not vectorized: unsupported alignment in basic "
2204 destroy_bb_vec_info (bb_vinfo);
2208 if (!vect_slp_analyze_operations (bb_vinfo))
2210 if (dump_enabled_p ())
2211 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2212 "not vectorized: bad operation in basic block.\n");
2214 destroy_bb_vec_info (bb_vinfo);
2218 /* Cost model: check if the vectorization is worthwhile. */
2219 if (flag_vect_cost_model
2220 && !vect_bb_vectorization_profitable_p (bb_vinfo))
2222 if (dump_enabled_p ())
2223 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2224 "not vectorized: vectorization is not "
2227 destroy_bb_vec_info (bb_vinfo);
2231 if (dump_enabled_p ())
2232 dump_printf_loc (MSG_NOTE, vect_location,
2233 "Basic block will be vectorized using SLP\n");
/* Driver: try to analyze BB for SLP vectorization, retrying with
   successively smaller target vector sizes until one succeeds or the
   candidate sizes are exhausted.  */
/* NOTE(review): elided excerpt -- return type, braces, the INSNS counter
   declaration/increment, the surrounding retry loop construct, and the
   return statements are not visible here.  */
2240 vect_slp_analyze_bb (basic_block bb)
2242 bb_vec_info bb_vinfo;
2244 gimple_stmt_iterator gsi;
2245 unsigned int vector_sizes;
2247 if (dump_enabled_p ())
2248 dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");
/* Count real instructions, ignoring debug stmts, nops and labels, to bound
   analysis cost.  */
2250 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2252 gimple stmt = gsi_stmt (gsi);
2253 if (!is_gimple_debug (stmt)
2254 && !gimple_nop_p (stmt)
2255 && gimple_code (stmt) != GIMPLE_LABEL)
2259 if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2261 if (dump_enabled_p ())
2262 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2263 "not vectorized: too many instructions in "
2269 /* Autodetect first vector size we try. */
2270 current_vector_size = 0;
2271 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2275 bb_vinfo = vect_slp_analyze_bb_1 (bb);
2279 destroy_bb_vec_info (bb_vinfo);
/* Remove the size just tried from the candidate mask and stop when no
   sizes remain.  */
2281 vector_sizes &= ~current_vector_size;
2282 if (vector_sizes == 0
2283 || current_vector_size == 0)
2286 /* Try the next biggest vector size. */
2287 current_vector_size = 1 << floor_log2 (vector_sizes);
2288 if (dump_enabled_p ())
2289 dump_printf_loc (MSG_NOTE, vect_location,
2290 "***** Re-trying analysis with "
2291 "vector size %d\n", current_vector_size);
2296 /* SLP costs are calculated according to SLP instance unrolling factor (i.e.,
2297 the number of created vector stmts depends on the unrolling factor).
2298 However, the actual number of vector stmts for every SLP node depends on
2299 VF which is set later in vect_analyze_operations (). Hence, SLP costs
2300 should be updated. In this function we assume that the inside costs
2301 calculated in vect_model_xxx_cost are linear in ncopies. */
/* NOTE(review): elided excerpt -- return type, braces, and the trailing
   argument of the add_stmt_cost call (presumably vect_body) are not
   visible here.  */
2304 vect_update_slp_costs_according_to_vf (loop_vec_info loop_vinfo)
2306 unsigned int i, j, vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2307 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2308 slp_instance instance;
2309 stmt_vector_for_cost body_cost_vec;
2310 stmt_info_for_cost *si;
2311 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2313 if (dump_enabled_p ())
2314 dump_printf_loc (MSG_NOTE, vect_location,
2315 "=== vect_update_slp_costs_according_to_vf ===");
2317 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2319 /* We assume that costs are linear in ncopies. */
2320 int ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (instance);
2322 /* Record the instance's instructions in the target cost model.
2323 This was delayed until here because the count of instructions
2324 isn't known beforehand. */
2325 body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
2327 FOR_EACH_VEC_ELT (body_cost_vec, j, si)
2328 (void) add_stmt_cost (data, si->count * ncopies, si->kind,
2329 vinfo_for_stmt (si->stmt), si->misalign,
2335 /* For constant and loop invariant defs of SLP_NODE this function returns
2336 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
2337 OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
2338 scalar stmts. NUMBER_OF_VECTORS is the number of vector defs to create.
2339 REDUC_INDEX is the index of the reduction operand in the statements, unless
/* NOTE(review): elided excerpt -- return type, braces, the REDUC_INDEX
   parameter on the signature, the switch statements over CODE, several
   else branches and declarations (voprnds, elts, nunits, vector_type,
   def_stmt, loop, vop, vec_cst, new_temp, init_stmt, k) are not visible
   in this listing.  Code kept byte-identical; comments only.  */
2343 vect_get_constant_vectors (tree op, slp_tree slp_node,
2344 vec<tree> *vec_oprnds,
2345 unsigned int op_num, unsigned int number_of_vectors,
2348 vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2349 gimple stmt = stmts[0];
2350 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2354 unsigned j, number_of_places_left_in_vector;
2357 int group_size = stmts.length ();
2358 unsigned int vec_num, i;
2359 unsigned number_of_copies = 1;
2361 voprnds.create (number_of_vectors);
2362 bool constant_p, is_store;
2363 tree neutral_op = NULL;
2364 enum tree_code code = gimple_expr_code (stmt);
2367 gimple_seq ctor_seq = NULL;
/* For a reduction, replace OP by the actual reduction operand and pick a
   neutral element (0 for sums, 1 for products, -1 for AND, the preheader
   PHI value otherwise) used to pad extra vector copies.  */
2369 if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
2370 && reduc_index != -1)
2372 op_num = reduc_index - 1;
2373 op = gimple_op (stmt, reduc_index);
2374 /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
2375 we need either neutral operands or the original operands. See
2376 get_initial_def_for_reduction() for details. */
2379 case WIDEN_SUM_EXPR:
2385 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2386 neutral_op = build_real (TREE_TYPE (op), dconst0);
2388 neutral_op = build_int_cst (TREE_TYPE (op), 0);
2393 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2394 neutral_op = build_real (TREE_TYPE (op), dconst1);
2396 neutral_op = build_int_cst (TREE_TYPE (op), 1);
2401 neutral_op = build_int_cst (TREE_TYPE (op), -1);
2406 def_stmt = SSA_NAME_DEF_STMT (op);
2407 loop = (gimple_bb (stmt))->loop_father;
2408 neutral_op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2409 loop_preheader_edge (loop));
2417 if (STMT_VINFO_DATA_REF (stmt_vinfo))
2420 op = gimple_assign_rhs1 (stmt);
2427 if (CONSTANT_CLASS_P (op))
2432 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
2433 gcc_assert (vector_type);
2434 nunits = TYPE_VECTOR_SUBPARTS (vector_type);
2436 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
2437 created vectors. It is greater than 1 if unrolling is performed.
2439 For example, we have two scalar operands, s1 and s2 (e.g., group of
2440 strided accesses of size two), while NUNITS is four (i.e., four scalars
2441 of this type can be packed in a vector). The output vector will contain
2442 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
2445 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
2446 containing the operands.
2448 For example, NUNITS is four as before, and the group size is 8
2449 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
2450 {s5, s6, s7, s8}. */
2452 number_of_copies = least_common_multiple (nunits, group_size) / group_size;
2454 number_of_places_left_in_vector = nunits;
2455 elts = XALLOCAVEC (tree, nunits);
/* Fill ELTS back-to-front: the group is walked in reverse and vectors are
   emitted in reverse, then inverted below.  */
2456 for (j = 0; j < number_of_copies; j++)
2458 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
2461 op = gimple_assign_rhs1 (stmt);
/* Pick the scalar operand of STMT according to OP_NUM; the exact
   branch depends on the stmt shape (cond, binary/ternary assign,
   call, shift).  */
2467 if (op_num == 0 || op_num == 1)
2469 tree cond = gimple_assign_rhs1 (stmt);
2470 op = TREE_OPERAND (cond, op_num);
2475 op = gimple_assign_rhs2 (stmt);
2477 op = gimple_assign_rhs3 (stmt);
2482 op = gimple_call_arg (stmt, op_num);
2489 op = gimple_op (stmt, op_num + 1);
2490 /* Unlike the other binary operators, shifts/rotates have
2491 the shift count being int, instead of the same type as
2492 the lhs, so make sure the scalar is the right type if
2493 we are dealing with vectors of
2494 long long/long/short/char. */
2495 if (op_num == 1 && constant_p)
2496 op = fold_convert (TREE_TYPE (vector_type), op);
2500 op = gimple_op (stmt, op_num + 1);
2505 if (reduc_index != -1)
2507 loop = (gimple_bb (stmt))->loop_father;
2508 def_stmt = SSA_NAME_DEF_STMT (op);
2512 /* Get the def before the loop. In reduction chain we have only
2513 one initial value. */
2514 if ((j != (number_of_copies - 1)
2515 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2520 op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2521 loop_preheader_edge (loop));
2524 /* Create 'vect_ = {op0,op1,...,opn}'. */
2525 number_of_places_left_in_vector--;
/* If the scalar type differs from the vector element type, view-convert:
   fold directly for constants, emit a VIEW_CONVERT_EXPR assignment into
   CTOR_SEQ for SSA names.  */
2526 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
2530 op = fold_unary (VIEW_CONVERT_EXPR,
2531 TREE_TYPE (vector_type), op);
2532 gcc_assert (op && CONSTANT_CLASS_P (op));
2537 = make_ssa_name (TREE_TYPE (vector_type), NULL);
2539 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
2542 = gimple_build_assign_with_ops (VIEW_CONVERT_EXPR,
2543 new_temp, op, NULL_TREE);
2544 gimple_seq_add_stmt (&ctor_seq, init_stmt);
2548 elts[number_of_places_left_in_vector] = op;
/* When a full vector has been gathered, materialize it: VECTOR_CST for
   all-constant elements, a CONSTRUCTOR otherwise.  */
2550 if (number_of_places_left_in_vector == 0)
2552 number_of_places_left_in_vector = nunits;
2555 vec_cst = build_vector (vector_type, elts);
2558 vec<constructor_elt, va_gc> *v;
2560 vec_alloc (v, nunits);
2561 for (k = 0; k < nunits; ++k)
2562 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
2563 vec_cst = build_constructor (vector_type, v);
2565 voprnds.quick_push (vect_init_vector (stmt, vec_cst,
2566 vector_type, NULL));
/* Any pending view-convert stmts must precede the vector init.  */
2567 if (ctor_seq != NULL)
2569 gimple init_stmt = SSA_NAME_DEF_STMT (voprnds.last ());
2570 gimple_stmt_iterator gsi = gsi_for_stmt (init_stmt);
2571 gsi_insert_seq_before_without_update (&gsi, ctor_seq,
2579 /* Since the vectors are created in the reverse order, we should invert
2581 vec_num = voprnds.length ();
2582 for (j = vec_num; j != 0; j--)
2584 vop = voprnds[j - 1];
2585 vec_oprnds->quick_push (vop);
2590 /* In case that VF is greater than the unrolling factor needed for the SLP
2591 group of stmts, NUMBER_OF_VECTORS to be created is greater than
2592 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
2593 to replicate the vectors. */
2594 while (number_of_vectors > vec_oprnds->length ())
2596 tree neutral_vec = NULL;
/* Reductions are padded with the neutral element; otherwise the already
   created vectors are recycled.  */
2601 neutral_vec = build_vector_from_val (vector_type, neutral_op);
2603 vec_oprnds->quick_push (neutral_vec);
2607 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
2608 vec_oprnds->quick_push (vop);
2614 /* Get vectorized definitions from SLP_NODE that contains corresponding
2615 vectorized def-stmts. */
/* NOTE(review): elided excerpt -- return type, braces, and the declarations
   of I and VEC_OPRND are not visible here.  Pushes the LHS of every vector
   def-stmt of SLP_NODE onto VEC_OPRNDS.  */
2618 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
2621 gimple vec_def_stmt;
2624 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
2626 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
2628 gcc_assert (vec_def_stmt);
2629 vec_oprnd = gimple_get_lhs (vec_def_stmt);
2630 vec_oprnds->quick_push (vec_oprnd);
2635 /* Get vectorized definitions for SLP_NODE.
2636 If the scalar definitions are loop invariants or constants, collect them and
2637 call vect_get_constant_vectors() to create vector stmts.
2638 Otherwise, the def-stmts must be already vectorized and the vectorized stmts
2639 must be stored in the corresponding child of SLP_NODE, and we call
2640 vect_get_slp_vect_defs () to retrieve them. */
2643 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
2644 vec<vec<tree> > *vec_oprnds, int reduc_index)
2647 int number_of_vects = 0, i;
2648 unsigned int child_index = 0;
2649 HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
2650 slp_tree child = NULL;
2653 bool vectorized_defs;
2655 first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
2656 FOR_EACH_VEC_ELT (ops, i, oprnd)
2658 /* For each operand we check if it has vectorized definitions in a child
2659 node or we need to create them (for invariants and constants). We
2660 check if the LHS of the first stmt of the next child matches OPRND.
2661 If it does, we found the correct child. Otherwise, we call
2662 vect_get_constant_vectors (), and not advance CHILD_INDEX in order
2663 to check this child node for the next operand. */
2664 vectorized_defs = false;
2665 if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
2667 child = (slp_tree) SLP_TREE_CHILDREN (slp_node)[child_index];
2669 /* We have to check both pattern and original def, if available. */
2670 gimple first_def = SLP_TREE_SCALAR_STMTS (child)[0];
2671 gimple related = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
/* Match OPRND against the LHS of the child's first scalar stmt (or of
   its related pattern stmt) to decide whether this child supplies the
   vectorized defs for OPRND.  */
2673 if (operand_equal_p (oprnd, gimple_get_lhs (first_def), 0)
2675 && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
2677 /* The number of vector defs is determined by the number of
2678 vector statements in the node from which we get those
2680 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
2681 vectorized_defs = true;
2686 if (!vectorized_defs)
/* No matching child: the defs come from constants/invariants, so size
   the result from this node's own vector-stmt count.  */
2690 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
2691 /* Number of vector stmts was calculated according to LHS in
2692 vect_schedule_slp_instance (), fix it by replacing LHS with
2693 RHS, if necessary. See vect_get_smallest_scalar_type () for
2695 vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
2697 if (rhs_size_unit != lhs_size_unit)
/* Rescale by the RHS/LHS element-size ratio.  */
2699 number_of_vects *= rhs_size_unit;
2700 number_of_vects /= lhs_size_unit;
2705 /* Allocate memory for vectorized defs. */
2707 vec_defs.create (number_of_vects);
2709 /* For reduction defs we call vect_get_constant_vectors (), since we are
2710 looking for initial loop invariant values. */
2711 if (vectorized_defs && reduc_index == -1)
2712 /* The defs are already vectorized. */
2713 vect_get_slp_vect_defs (child, &vec_defs);
2715 /* Build vectors from scalar defs. */
2716 vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
2717 number_of_vects, reduc_index);
/* Hand the per-operand vector of defs back to the caller.  Ownership of
   VEC_DEFS transfers into *VEC_OPRNDS.  */
2719 vec_oprnds->quick_push (vec_defs)
2721 /* For reductions, we only need initial values. */
2722 if (reduc_index != -1)
2728 /* Create NCOPIES permutation statements using the mask MASK_BYTES (by
2729 building a vector of type MASK_TYPE from it) and two input vectors placed in
2730 DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy and
2731 shifting by STRIDE elements of DR_CHAIN for every copy.
2732 (STRIDE is the number of vectorized stmts for NODE divided by the number of
2734 VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE, where
2735 the created stmts must be inserted. */
2738 vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
2739 tree mask, int first_vec_indx, int second_vec_indx,
2740 gimple_stmt_iterator *gsi, slp_tree node,
2741 tree vectype, vec<tree> dr_chain,
2742 int ncopies, int vect_stmts_counter)
2745 gimple perm_stmt = NULL;
2746 stmt_vec_info next_stmt_info;
2748 tree first_vec, second_vec, data_ref;
/* Distance in DR_CHAIN between corresponding vectors of successive copies.  */
2750 stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
2752 /* Initialize the vect stmts of NODE to properly insert the generated
2754 for (i = SLP_TREE_VEC_STMTS (node).length ();
2755 i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
2756 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
2758 perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
2759 for (i = 0; i < ncopies; i++)
2761 first_vec = dr_chain[first_vec_indx];
2762 second_vec = dr_chain[second_vec_indx];
2764 /* Generate the permute statement. */
2765 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, perm_dest,
2766 first_vec, second_vec, mask);
2767 data_ref = make_ssa_name (perm_dest, perm_stmt);
2768 gimple_set_lhs (perm_stmt, data_ref);
2769 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
2771 /* Store the vector statement in NODE. */
2772 SLP_TREE_VEC_STMTS (node)[stride * i + vect_stmts_counter] = perm_stmt;
/* Advance to the input vectors of the next copy.  */
2774 first_vec_indx += stride;
2775 second_vec_indx += stride;
2778 /* Mark the scalar stmt as vectorized. */
/* PERM_STMT here is the last permute generated above.  */
2779 next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
2780 STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
2784 /* Given FIRST_MASK_ELEMENT - the mask element in element representation,
2785 return in CURRENT_MASK_ELEMENT its equivalent in target specific
2786 representation. Check that the mask is valid and return FALSE if not.
2787 Return TRUE in NEED_NEXT_VECTOR if the permutation requires to move to
2788 the next vector, i.e., the current first vector is not needed. */
2791 vect_get_mask_element (gimple stmt, int first_mask_element, int m,
2792 int mask_nunits, bool only_one_vec, int index,
2793 unsigned char *mask, int *current_mask_element,
2794 bool *need_next_vector, int *number_of_mask_fixes,
2795 bool *mask_fixed, bool *needs_first_vector)
2799 /* Convert to target specific representation. */
2800 *current_mask_element = first_mask_element + m;
2801 /* Adjust the value in case it's a mask for second and third vectors. */
2802 *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1);
/* An element below MASK_NUNITS selects from the first input vector.  */
2804 if (*current_mask_element < mask_nunits)
2805 *needs_first_vector = true;
2807 /* We have only one input vector to permute but the mask accesses values in
2808 the next vector as well. */
2809 if (only_one_vec && *current_mask_element >= mask_nunits)
2811 if (dump_enabled_p ())
2813 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2814 "permutation requires at least two vectors ")
2815 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2821 /* The mask requires the next vector. */
2822 if (*current_mask_element >= mask_nunits * 2)
2824 if (*needs_first_vector || *mask_fixed)
2826 /* We either need the first vector too or have already moved to the
2827 next vector. In both cases, this permutation needs three
2829 if (dump_enabled_p ())
2831 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2832 "permutation requires at "
2833 "least three vectors ");
2834 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2840 /* We move to the next vector, dropping the first one and working with
2841 the second and the third - we need to adjust the values of the mask
2843 *current_mask_element -= mask_nunits * *number_of_mask_fixes;
/* Also rebase the mask elements already emitted for this mask.  */
2845 for (i = 0; i < index; i++)
2846 mask[i] -= mask_nunits * *number_of_mask_fixes;
2848 (*number_of_mask_fixes)++;
2852 *need_next_vector = *mask_fixed;
2854 /* This was the last element of this mask. Start a new one. */
2855 if (index == mask_nunits - 1)
/* Reset the per-mask fixup state for the next mask.  */
2857 *number_of_mask_fixes = 1;
2858 *mask_fixed = false;
2859 *needs_first_vector = false;
2866 /* Generate vector permute statements from a list of loads in DR_CHAIN.
2867 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
2868 permute statements for SLP_NODE_INSTANCE. */
2870 vect_transform_slp_perm_load (gimple stmt, vec<tree> dr_chain,
2871 gimple_stmt_iterator *gsi, int vf,
2872 slp_instance slp_node_instance, bool analyze_only)
2874 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2875 tree mask_element_type = NULL_TREE, mask_type;
2876 int i, j, k, nunits, vec_index = 0, scalar_index;
2878 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2879 gimple next_scalar_stmt;
2880 int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
2881 int first_mask_element;
2882 int index, unroll_factor, current_mask_element, ncopies;
2883 unsigned char *mask;
2884 bool only_one_vec = false, need_next_vector = false;
2885 int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
2886 int number_of_mask_fixes = 1;
2887 bool mask_fixed = false;
2888 bool needs_first_vector = false;
2889 enum machine_mode mode;
2891 mode = TYPE_MODE (vectype);
2893 if (!can_vec_perm_p (mode, false, NULL))
2895 if (dump_enabled_p ())
2897 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2898 "no vect permute for ");
2899 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2904 /* The generic VEC_PERM_EXPR code always uses an integral type of the
2905 same size as the vector element being permuted. */
2906 mask_element_type = lang_hooks.types.type_for_mode
2907 (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))), 1);
2908 mask_type = get_vectype_for_scalar_type (mask_element_type);
2909 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2910 mask = XALLOCAVEC (unsigned char, nunits);
2911 unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
2913 /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
2914 unrolling factor. */
2915 orig_vec_stmts_num = group_size *
2916 SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
2917 if (orig_vec_stmts_num == 1)
2918 only_one_vec = true;
2920 /* Number of copies is determined by the final vectorization factor
2921 relatively to SLP_NODE_INSTANCE unrolling factor. */
2922 ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
2924 /* Generate permutation masks for every NODE. Number of masks for each NODE
2925 is equal to GROUP_SIZE.
2926 E.g., we have a group of three nodes with three loads from the same
2927 location in each node, and the vector size is 4. I.e., we have a
2928 a0b0c0a1b1c1... sequence and we need to create the following vectors:
2929 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
2930 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
2933 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
2934 The last mask is illegal since we assume two operands for permute
2935 operation, and the mask element values can't be outside that range.
2936 Hence, the last mask must be converted into {2,5,5,5}.
2937 For the first two permutations we need the first and the second input
2938 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
2939 we need the second and the third vectors: {b1,c1,a2,b2} and
2942 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_node_instance), i, node)
2946 vect_stmts_counter = 0;
2948 first_vec_index = vec_index++;
2950 second_vec_index = first_vec_index;
2952 second_vec_index = vec_index++;
2954 for (j = 0; j < unroll_factor; j++)
2956 for (k = 0; k < group_size; k++)
2958 first_mask_element = i + j * group_size;
2959 if (!vect_get_mask_element (stmt, first_mask_element, 0,
2960 nunits, only_one_vec, index,
2961 mask, ¤t_mask_element,
2963 &number_of_mask_fixes, &mask_fixed,
2964 &needs_first_vector))
2966 mask[index++] = current_mask_element;
2968 if (index == nunits)
2970 tree mask_vec, *mask_elts;
2973 if (!can_vec_perm_p (mode, false, mask))
2975 if (dump_enabled_p ())
2977 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
2979 "unsupported vect permute { ");
2980 for (i = 0; i < nunits; ++i)
2981 dump_printf (MSG_MISSED_OPTIMIZATION, "%d ",
2983 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
2988 mask_elts = XALLOCAVEC (tree, nunits);
2989 for (l = 0; l < nunits; ++l)
2990 mask_elts[l] = build_int_cst (mask_element_type, mask[l]);
2991 mask_vec = build_vector (mask_type, mask_elts);
2996 if (need_next_vector)
2998 first_vec_index = second_vec_index;
2999 second_vec_index = vec_index;
3003 = SLP_TREE_SCALAR_STMTS (node)[scalar_index++];
3005 vect_create_mask_and_perm (stmt, next_scalar_stmt,
3006 mask_vec, first_vec_index, second_vec_index,
3007 gsi, node, vectype, dr_chain,
3008 ncopies, vect_stmts_counter++);
3020 /* Vectorize SLP instance tree in postorder. */
3023 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
3024 unsigned int vectorization_factor)
3027 bool grouped_store, is_store;
3028 gimple_stmt_iterator si;
3029 stmt_vec_info stmt_info;
3030 unsigned int vec_stmts_size, nunits, group_size;
3033 slp_tree loads_node;
/* Postorder: vectorize all children before this node.  */
3039 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3040 vect_schedule_slp_instance (child, instance, vectorization_factor);
3042 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3043 stmt_info = vinfo_for_stmt (stmt);
3045 /* VECTYPE is the type of the destination. */
3046 vectype = STMT_VINFO_VECTYPE (stmt_info);
3047 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
3048 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
3050 /* For each SLP instance calculate number of vector stmts to be created
3051 for the scalar stmts in each node of the SLP tree. Number of vector
3052 elements in one vector iteration is the number of scalar elements in
3053 one scalar iteration (GROUP_SIZE) multiplied by VF divided by vector
3055 vec_stmts_size = (vectorization_factor * group_size) / nunits;
3057 /* In case of load permutation we have to allocate vectorized statements for
3058 all the nodes that participate in that permutation. */
3059 if (SLP_INSTANCE_LOAD_PERMUTATION (instance).exists ())
3061 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, loads_node)
3063 if (!SLP_TREE_VEC_STMTS (loads_node).exists ())
3065 SLP_TREE_VEC_STMTS (loads_node).create (vec_stmts_size);
3066 SLP_TREE_NUMBER_OF_VEC_STMTS (loads_node) = vec_stmts_size;
/* Allocate this node's vector-stmt slots (may already exist if the node
   was reached as a load of the permutation above).  */
3071 if (!SLP_TREE_VEC_STMTS (node).exists ())
3073 SLP_TREE_VEC_STMTS (node).create (vec_stmts_size);
3074 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
3077 if (dump_enabled_p ())
3079 dump_printf_loc (MSG_NOTE,vect_location,
3080 "------>vectorizing SLP node starting from: ");
3081 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3084 /* Loads should be inserted before the first load. */
3085 if (SLP_INSTANCE_FIRST_LOAD_STMT (instance)
3086 && STMT_VINFO_GROUPED_ACCESS (stmt_info)
3087 && !REFERENCE_CLASS_P (gimple_get_lhs (stmt))
3088 && SLP_INSTANCE_LOAD_PERMUTATION (instance).exists ())
3089 si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance));
3090 else if (is_pattern_stmt_p (stmt_info))
/* For pattern stmts insert at the original (related) stmt's location.  */
3091 si = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
3093 si = gsi_for_stmt (stmt);
3095 /* Stores should be inserted just before the last store. */
3096 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
3097 && REFERENCE_CLASS_P (gimple_get_lhs (stmt)))
3099 gimple last_store = vect_find_last_store_in_slp_instance (instance);
3100 if (is_pattern_stmt_p (vinfo_for_stmt (last_store)))
3101 last_store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (last_store));
3102 si = gsi_for_stmt (last_store);
3105 /* Mark the first element of the reduction chain as reduction to properly
3106 transform the node. In the analysis phase only the last element of the
3107 chain is marked as reduction. */
3108 if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
3109 && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
3111 STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
3112 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3115 is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3119 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
3120 For loop vectorization this is done in vectorizable_call, but for SLP
3121 it needs to be deferred until end of vect_schedule_slp, because multiple
3122 SLP instances may refer to the same scalar stmt. */
3125 vect_remove_slp_scalar_calls (slp_tree node)
3127 gimple stmt, new_stmt;
3128 gimple_stmt_iterator gsi;
3132 stmt_vec_info stmt_info;
/* Process children first (postorder over the SLP tree).  */
3137 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3138 vect_remove_slp_scalar_calls (child);
3140 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
/* Skip non-calls and stmts already removed from the IL.  */
3142 if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
3144 stmt_info = vinfo_for_stmt (stmt);
/* Only pure-SLP, non-pattern calls with a stmt_info are replaced.  */
3145 if (stmt_info == NULL
3146 || is_pattern_stmt_p (stmt_info)
3147 || !PURE_SLP_STMT (stmt_info))
/* Replace the call with "lhs = 0" and transfer the stmt_info to the
   replacement so later bookkeeping still finds it.  */
3149 lhs = gimple_call_lhs (stmt);
3150 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
3151 set_vinfo_for_stmt (new_stmt, stmt_info);
3152 set_vinfo_for_stmt (stmt, NULL);
3153 STMT_VINFO_STMT (stmt_info) = new_stmt;
3154 gsi = gsi_for_stmt (stmt);
3155 gsi_replace (&gsi, new_stmt, false);
3156 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
3160 /* Generate vector code for all SLP instances in the loop/basic block. */
3163 vect_schedule_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
3165 vec<slp_instance> slp_instances;
3166 slp_instance instance;
3167 slp_tree loads_node;
3168 unsigned int i, j, vf;
3169 bool is_store = false;
/* Exactly one of LOOP_VINFO / BB_VINFO selects the instance list and VF.  */
3173 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
3174 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3178 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
3182 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3184 /* Schedule the tree of INSTANCE. */
3185 is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
3188 /* Clear STMT_VINFO_VEC_STMT of all loads. With shared loads
3189 between SLP instances we fail to properly initialize the
3190 vectorized SLP stmts and confuse different load permutations. */
3191 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, loads_node)
3193 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (loads_node)[0])) = NULL;
3195 if (dump_enabled_p ())
3196 dump_printf_loc (MSG_NOTE, vect_location,
3197 "vectorizing stmts using SLP.");
/* Second pass: remove the now-dead scalar stmts of each instance.  */
3200 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3202 slp_tree root = SLP_INSTANCE_TREE (instance);
3205 gimple_stmt_iterator gsi;
3207 /* Remove scalar call stmts. Do not do this for basic-block
3208 vectorization as not all uses may be vectorized.
3209 ??? Why should this be necessary? DCE should be able to
3210 remove the stmts itself.
3211 ??? For BB vectorization we can as well remove scalar
3212 stmts starting from the SLP tree root if they have no
3215 vect_remove_slp_scalar_calls (root);
3217 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
3218 && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
/* Only data-ref stmts (stores) are deleted here.  */
3220 if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
3223 if (is_pattern_stmt_p (vinfo_for_stmt (store)))
3224 store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
3225 /* Free the attached stmt_vec_info and remove the stmt. */
3226 gsi = gsi_for_stmt (store);
3227 unlink_stmt_vdef (store);
3228 gsi_remove (&gsi, true);
3229 release_defs (store);
3230 free_stmt_vec_info (store);
3238 /* Vectorize the basic block. */
3241 vect_slp_transform_bb (basic_block bb)
3243 bb_vec_info bb_vinfo = vec_info_for_bb (bb);
3244 gimple_stmt_iterator si;
3246 gcc_assert (bb_vinfo);
3248 if (dump_enabled_p ())
3249 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB\n");
3251 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
3253 gimple stmt = gsi_stmt (si);
3254 stmt_vec_info stmt_info;
3256 if (dump_enabled_p ())
3258 dump_printf_loc (MSG_NOTE, vect_location,
3259 "------>SLPing statement: ");
3260 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3263 stmt_info = vinfo_for_stmt (stmt);
3264 gcc_assert (stmt_info);
3266 /* Schedule all the SLP instances when the first SLP stmt is reached. */
3267 if (STMT_SLP_TYPE (stmt_info))
3269 vect_schedule_slp (NULL, bb_vinfo);
3274 if (dump_enabled_p ())
3275 dump_printf (MSG_OPTIMIZED_LOCATIONS, "BASIC BLOCK VECTORIZED\n");
3277 destroy_bb_vec_info (bb_vinfo);