1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
31 #include "fold-const.h"
32 #include "stor-layout.h"
35 #include "hard-reg-set.h"
37 #include "basic-block.h"
38 #include "gimple-pretty-print.h"
39 #include "tree-ssa-alias.h"
40 #include "internal-fn.h"
41 #include "gimple-expr.h"
44 #include "gimple-iterator.h"
45 #include "gimple-ssa.h"
46 #include "tree-phinodes.h"
47 #include "ssa-iterators.h"
48 #include "stringpool.h"
49 #include "tree-ssanames.h"
50 #include "tree-pass.h"
54 #include "insn-config.h"
63 #include "recog.h" /* FIXME: for insn_data */
64 #include "insn-codes.h"
66 #include "tree-vectorizer.h"
67 #include "langhooks.h"
68 #include "gimple-walk.h"
70 /* Extract the location of the basic block in the source code.
71 Return the location of the first statement in BB that carries one,
   and UNKNOWN_LOCATION otherwise (not NULL -- both failure paths below
   return UNKNOWN_LOCATION).  */
74 find_bb_location (basic_block bb)
77 gimple_stmt_iterator si;
   /* Guard elided in this view -- presumably a NULL-BB check; TODO confirm.  */
80 return UNKNOWN_LOCATION;
   /* Scan the statements of BB for the first one with a known location.  */
82 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
85 if (gimple_location (stmt) != UNKNOWN_LOCATION)
86 return gimple_location (stmt);
   /* No statement in BB had a usable location.  */
89 return UNKNOWN_LOCATION;
93 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
96 vect_free_slp_tree (slp_tree node)
   /* Post-order traversal: free all child subtrees first.  */
104 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
105 vect_free_slp_tree (child);
   /* Then release the vectors owned by NODE itself (the node object is
      presumably freed by elided code after these; TODO confirm).  */
107 SLP_TREE_CHILDREN (node).release ();
108 SLP_TREE_SCALAR_STMTS (node).release ();
109 SLP_TREE_VEC_STMTS (node).release ();
110 SLP_TREE_LOAD_PERMUTATION (node).release ();
116 /* Free the memory allocated for the SLP instance. */
119 vect_free_slp_instance (slp_instance instance)
   /* Free the whole SLP tree, then the instance's list of load nodes.  */
121 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
122 SLP_INSTANCE_LOADS (instance).release ();
127 /* Create an SLP node for SCALAR_STMTS.  The node takes ownership of the
   SCALAR_STMTS vector.  */
130 vect_create_new_slp_node (vec<gimple> scalar_stmts)
   /* The first scalar stmt is representative of the whole group and
      determines the number of operand slots (NOPS).  */
133 gimple stmt = scalar_stmts[0];
136 if (is_gimple_call (stmt))
137 nops = gimple_call_num_args (stmt);
138 else if (is_gimple_assign (stmt))
140 nops = gimple_num_ops (stmt) - 1;
   /* COND_EXPR adjustment elided in this view -- presumably bumps NOPS for
      the embedded comparison; TODO confirm.  */
141 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
   /* Allocate and initialize the node's vectors and flags.  */
147 node = XNEW (struct _slp_tree);
148 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
149 SLP_TREE_VEC_STMTS (node).create (0);
150 SLP_TREE_CHILDREN (node).create (nops);
151 SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
152 SLP_TREE_TWO_OPERATORS (node) = false;
158 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
160 static vec<slp_oprnd_info>
161 vect_create_oprnd_info (int nops, int group_size)
164 slp_oprnd_info oprnd_info;
165 vec<slp_oprnd_info> oprnds_info;
167 oprnds_info.create (nops);
   /* One info record per operand slot, each pre-sized to hold GROUP_SIZE
      def stmts and starting in the "uninitialized" state.  */
168 for (i = 0; i < nops; i++)
170 oprnd_info = XNEW (struct _slp_oprnd_info);
171 oprnd_info->def_stmts.create (group_size);
172 oprnd_info->first_dt = vect_uninitialized_def;
173 oprnd_info->first_op_type = NULL_TREE;
174 oprnd_info->first_pattern = false;
175 oprnd_info->second_pattern = false;
176 oprnds_info.quick_push (oprnd_info);
183 /* Free operands info.  Releases each record's def-stmt vector, the record
   itself, and finally the containing vector.  */
186 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
189 slp_oprnd_info oprnd_info;
191 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
193 oprnd_info->def_stmts.release ();
194 XDELETE (oprnd_info);
197 oprnds_info.release ();
201 /* Find the place of the data-ref in STMT in the interleaving chain that starts
202 from FIRST_STMT. Return -1 if the data-ref is not a part of the chain. */
205 vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
207 gimple next_stmt = first_stmt;
   /* STMT must belong to the chain headed by FIRST_STMT.  */
210 if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
   /* Walk the chain element by element (loop structure elided in this
      view) until STMT is found, accumulating its position.  */
215 if (next_stmt == stmt)
217 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
   /* Account for the gap recorded on the chain member.  */
219 result += GROUP_GAP (vinfo_for_stmt (next_stmt));
227 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
228 they are of a valid type and that they match the defs of the first stmt of
229 the SLP group (stored in OPRNDS_INFO). If there was a fatal error
230 return -1, if the error could be corrected by swapping operands of the
231 operation return 1, if everything is ok return 0. */
234 vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
235 gimple stmt, unsigned stmt_num,
236 vec<slp_oprnd_info> *oprnds_info)
239 unsigned int i, number_of_oprnds;
242 enum vect_def_type dt = vect_uninitialized_def;
243 struct loop *loop = NULL;
244 bool pattern = false;
245 slp_oprnd_info oprnd_info;
246 int first_op_idx = 1;
247 bool commutative = false;
248 bool first_op_cond = false;
/* STMT_NUM is the position of STMT in the SLP group: the first member
   establishes the reference def types, the second is tracked separately
   for the reduction-chain pattern exception below.  */
249 bool first = stmt_num == 0;
250 bool second = stmt_num == 1;
253 loop = LOOP_VINFO_LOOP (loop_vinfo);
/* Determine how many RHS operands STMT contributes.  */
255 if (is_gimple_call (stmt))
257 number_of_oprnds = gimple_call_num_args (stmt);
260 else if (is_gimple_assign (stmt))
262 enum tree_code code = gimple_assign_rhs_code (stmt);
263 number_of_oprnds = gimple_num_ops (stmt) - 1;
264 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
266 first_op_cond = true;
271 commutative = commutative_tree_code (code);
/* SWAPPED records whether the mismatch-recovery path below reversed the
   operand order for this stmt.  */
276 bool swapped = false;
277 for (i = 0; i < number_of_oprnds; i++)
/* Fetch operand I.  For a COND_EXPR the first two operands live inside
   the embedded comparison tree, hence the TREE_OPERAND access; the
   enclosing first_op_cond test is elided in this view -- TODO confirm.  */
282 if (i == 0 || i == 1)
283 oprnd = TREE_OPERAND (gimple_op (stmt, first_op_idx),
286 oprnd = gimple_op (stmt, first_op_idx + i - 1);
289 oprnd = gimple_op (stmt, first_op_idx + (swapped ? !i : i));
291 oprnd_info = (*oprnds_info)[i];
/* Classify the operand's definition; bail out if unanalyzable.  */
293 if (!vect_is_simple_use (oprnd, NULL, loop_vinfo, bb_vinfo, &def_stmt,
296 if (dump_enabled_p ())
298 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
299 "Build SLP failed: can't analyze def for ");
300 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
301 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
307 /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
308 from the pattern. Check that all the stmts of the node are in the
310 if (def_stmt && gimple_bb (def_stmt)
311 && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
312 || (!loop && gimple_bb (def_stmt) == BB_VINFO_BB (bb_vinfo)
313 && gimple_code (def_stmt) != GIMPLE_PHI))
314 && vinfo_for_stmt (def_stmt)
315 && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
316 && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
317 && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
/* All group members must agree on pattern-ness with the first stmt.  */
320 if (!first && !oprnd_info->first_pattern
321 /* Allow different pattern state for the defs of the
322 first stmt in reduction chains. */
323 && (oprnd_info->first_dt != vect_reduction_def
324 || (!second && !oprnd_info->second_pattern)))
334 if (dump_enabled_p ())
336 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
337 "Build SLP failed: some of the stmts"
338 " are in a pattern, and others are not ");
339 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
340 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
/* Redirect to the pattern stmt and re-read its definition type.  */
346 def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
347 dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));
349 if (dt == vect_unknown_def_type)
351 if (dump_enabled_p ())
352 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
353 "Unsupported pattern.\n");
/* Extract the SSA name defined by DEF_STMT; only PHIs and assigns are
   supported here.  */
357 switch (gimple_code (def_stmt))
360 def = gimple_phi_result (def_stmt);
364 def = gimple_assign_lhs (def_stmt);
368 if (dump_enabled_p ())
369 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
370 "unsupported defining stmt:\n");
/* Remember the pattern state of the second group member.  */
376 oprnd_info->second_pattern = pattern;
/* First group member: record def type, pattern state and operand type
   as the reference the remaining members are checked against.  */
380 oprnd_info->first_dt = dt;
381 oprnd_info->first_pattern = pattern;
382 oprnd_info->first_op_type = TREE_TYPE (oprnd);
386 /* Not first stmt of the group, check that the def-stmt/s match
387 the def-stmt/s of the first stmt. Allow different definition
388 types for reduction chains: the first stmt must be a
389 vect_reduction_def (a phi node), and the rest
390 vect_internal_def. */
391 if (((oprnd_info->first_dt != dt
392 && !(oprnd_info->first_dt == vect_reduction_def
393 && dt == vect_internal_def)
394 && !((oprnd_info->first_dt == vect_external_def
395 || oprnd_info->first_dt == vect_constant_def)
396 && (dt == vect_external_def
397 || dt == vect_constant_def)))
398 || !types_compatible_p (oprnd_info->first_op_type,
401 /* Try swapping operands if we got a mismatch. */
410 if (dump_enabled_p ())
411 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
412 "Build SLP failed: different types\n");
418 /* Check the types of the definitions. */
421 case vect_constant_def:
422 case vect_external_def:
423 case vect_reduction_def:
/* Internal defs are queued so that child SLP nodes can be built from
   them later.  */
426 case vect_internal_def:
427 oprnd_info->def_stmts.quick_push (def_stmt);
431 /* FORNOW: Not supported. */
432 if (dump_enabled_p ())
434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
435 "Build SLP failed: illegal type of def ");
436 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def);
437 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
/* Commit the swap to the GIMPLE IL.  For a COND_EXPR, swap inside the
   embedded comparison and swap its comparison code accordingly;
   otherwise swap rhs1/rhs2 of the assignment.  */
449 tree cond = gimple_assign_rhs1 (stmt);
450 swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
451 &TREE_OPERAND (cond, 1));
452 TREE_SET_CODE (cond, swap_tree_comparison (TREE_CODE (cond)));
455 swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
456 gimple_assign_rhs2_ptr (stmt));
463 /* Verify if the scalar stmts STMTS are isomorphic, require data
464 permutation or are of unsupported types of operation. Return
465 true if they are, otherwise return false and indicate in *MATCHES
466 which stmts are not isomorphic to the first one. If MATCHES[0]
467 is false then this indicates the comparison could not be
468 carried out or the stmts will never be vectorized by SLP. */
471 vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
472 vec<gimple> stmts, unsigned int group_size,
473 unsigned nops, unsigned int *max_nunits,
474 unsigned int vectorization_factor, bool *matches,
478 gimple first_stmt = stmts[0], stmt = stmts[0];
479 enum tree_code first_stmt_code = ERROR_MARK;
/* ALT_STMT_CODE records a second operation code seen in the group, used
   for the mixed plus/minus ("two operators") support below.  */
480 enum tree_code alt_stmt_code = ERROR_MARK;
481 enum tree_code rhs_code = ERROR_MARK;
482 enum tree_code first_cond_code = ERROR_MARK;
484 bool need_same_oprnds = false;
485 tree vectype, scalar_type, first_op1 = NULL_TREE;
488 machine_mode optab_op2_mode;
489 machine_mode vec_mode;
490 struct data_reference *first_dr;
492 gimple first_load = NULL, prev_first_load = NULL, old_first_load = NULL;
495 /* For every stmt in NODE find its def stmt/s. */
496 FOR_EACH_VEC_ELT (stmts, i, stmt)
500 if (dump_enabled_p ())
502 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
503 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
504 dump_printf (MSG_NOTE, "\n");
507 /* Fail to vectorize statements marked as unvectorizable. */
508 if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
510 if (dump_enabled_p ())
512 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
513 "Build SLP failed: unvectorizable statement ");
514 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
515 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
517 /* Fatal mismatch. */
/* Only stmts that produce a value (assigns/calls with an lhs) can be
   part of an SLP group.  */
522 lhs = gimple_get_lhs (stmt);
523 if (lhs == NULL_TREE)
525 if (dump_enabled_p ())
527 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
528 "Build SLP failed: not GIMPLE_ASSIGN nor "
530 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
531 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
533 /* Fatal mismatch. */
/* A COND_EXPR must carry an embedded comparison as its condition.  */
538 if (is_gimple_assign (stmt)
539 && gimple_assign_rhs_code (stmt) == COND_EXPR
540 && (cond = gimple_assign_rhs1 (stmt))
541 && !COMPARISON_CLASS_P (cond))
543 if (dump_enabled_p ())
545 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
546 "Build SLP failed: condition is not "
548 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
549 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
551 /* Fatal mismatch. */
/* Derive the vector type from the stmt's smallest scalar type; fail if
   the target has no vector type for it.  */
556 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
557 vectype = get_vectype_for_scalar_type (scalar_type);
560 if (dump_enabled_p ())
562 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
563 "Build SLP failed: unsupported data-type ");
564 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
566 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
568 /* Fatal mismatch. */
573 /* If populating the vector type requires unrolling then fail
574 before adjusting *max_nunits for basic-block vectorization. */
576 && TYPE_VECTOR_SUBPARTS (vectype) > group_size)
578 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
579 "Build SLP failed: unrolling required "
580 "in basic block SLP\n");
581 /* Fatal mismatch. */
586 /* In case of multiple types we need to detect the smallest type. */
587 if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
589 *max_nunits = TYPE_VECTOR_SUBPARTS (vectype);
/* Guard elided in this view -- presumably only for BB SLP the local
   vectorization factor tracks *max_nunits; TODO confirm.  */
591 vectorization_factor = *max_nunits;
/* Calls are treated as CALL_EXPR operations but only plain, nothrow,
   non-internal calls without a static chain are supported.  */
594 if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
596 rhs_code = CALL_EXPR;
597 if (gimple_call_internal_p (call_stmt)
598 || gimple_call_tail_p (call_stmt)
599 || gimple_call_noreturn_p (call_stmt)
600 || !gimple_call_nothrow_p (call_stmt)
601 || gimple_call_chain (call_stmt))
603 if (dump_enabled_p ())
605 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
606 "Build SLP failed: unsupported call type ");
607 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
609 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
611 /* Fatal mismatch. */
617 rhs_code = gimple_assign_rhs_code (stmt);
619 /* Check the operation. */
/* First group member: record the reference operation code; the branch
   structure here is elided in this view.  */
622 first_stmt_code = rhs_code;
624 /* Shift arguments should be equal in all the packed stmts for a
625 vector shift with scalar shift operand. */
626 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
627 || rhs_code == LROTATE_EXPR
628 || rhs_code == RROTATE_EXPR)
630 vec_mode = TYPE_MODE (vectype);
632 /* First see if we have a vector/vector shift. */
633 optab = optab_for_tree_code (rhs_code, vectype,
637 || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
639 /* No vector/vector shift, try for a vector/scalar shift. */
640 optab = optab_for_tree_code (rhs_code, vectype,
645 if (dump_enabled_p ())
646 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
647 "Build SLP failed: no optab.\n");
648 /* Fatal mismatch. */
652 icode = (int) optab_handler (optab, vec_mode);
653 if (icode == CODE_FOR_nothing)
655 if (dump_enabled_p ())
656 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
658 "op not supported by target.\n");
659 /* Fatal mismatch. */
/* Vector/scalar shift: all group members must use the identical scalar
   shift operand.  */
663 optab_op2_mode = insn_data[icode].operand[2].mode;
664 if (!VECTOR_MODE_P (optab_op2_mode))
666 need_same_oprnds = true;
667 first_op1 = gimple_assign_rhs2 (stmt);
671 else if (rhs_code == WIDEN_LSHIFT_EXPR)
673 need_same_oprnds = true;
674 first_op1 = gimple_assign_rhs2 (stmt);
/* Subsequent group members: remember the first differing operation
   code as the "alternate" code ...  */
679 if (first_stmt_code != rhs_code
680 && alt_stmt_code == ERROR_MARK)
681 alt_stmt_code = rhs_code;
/* ... and reject mismatches, except: real/imag part pairing, mixed
   plus/minus (handled by computing both and blending), and the various
   reference codes of a grouped access.  */
682 if (first_stmt_code != rhs_code
683 && (first_stmt_code != IMAGPART_EXPR
684 || rhs_code != REALPART_EXPR)
685 && (first_stmt_code != REALPART_EXPR
686 || rhs_code != IMAGPART_EXPR)
687 /* Handle mismatches in plus/minus by computing both
688 and merging the results. */
689 && !((first_stmt_code == PLUS_EXPR
690 || first_stmt_code == MINUS_EXPR)
691 && (alt_stmt_code == PLUS_EXPR
692 || alt_stmt_code == MINUS_EXPR)
693 && rhs_code == alt_stmt_code)
694 && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
695 && (first_stmt_code == ARRAY_REF
696 || first_stmt_code == BIT_FIELD_REF
697 || first_stmt_code == INDIRECT_REF
698 || first_stmt_code == COMPONENT_REF
699 || first_stmt_code == MEM_REF)))
701 if (dump_enabled_p ())
703 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
704 "Build SLP failed: different operation "
706 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
707 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
709 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
/* Enforce the identical-scalar-shift-operand requirement recorded
   above (guard partially elided in this view).  */
717 && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
719 if (dump_enabled_p ())
721 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
722 "Build SLP failed: different shift "
724 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
725 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
/* All calls in the group must target the same function with the same
   type and argument count.  */
731 if (rhs_code == CALL_EXPR)
733 gimple first_stmt = stmts[0];
734 if (gimple_call_num_args (stmt) != nops
735 || !operand_equal_p (gimple_call_fn (first_stmt),
736 gimple_call_fn (stmt), 0)
737 || gimple_call_fntype (first_stmt)
738 != gimple_call_fntype (stmt))
740 if (dump_enabled_p ())
742 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
743 "Build SLP failed: different calls in ");
744 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
746 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
754 /* Grouped store or load. */
755 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
/* A reference on the lhs means a grouped store (load path follows).  */
757 if (REFERENCE_CLASS_P (lhs))
765 unsigned unrolling_factor
766 = least_common_multiple
767 (*max_nunits, group_size) / group_size;
768 /* FORNOW: Check that there is no gap between the loads
769 and no gap between the groups when we need to load
770 multiple groups at once. */
771 if (unrolling_factor > 1
772 && ((GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
773 && GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
774 /* If the group is split up then GROUP_GAP
775 isn't correct here, nor is GROUP_FIRST_ELEMENT. */
776 || GROUP_SIZE (vinfo_for_stmt (stmt)) > group_size))
778 if (dump_enabled_p ())
780 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
781 "Build SLP failed: grouped "
783 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
785 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
787 /* Fatal mismatch. */
792 /* Check that the size of interleaved loads group is not
793 greater than the SLP group size. */
795 = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
797 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
798 && ((GROUP_SIZE (vinfo_for_stmt (stmt))
799 - GROUP_GAP (vinfo_for_stmt (stmt)))
800 > ncopies * group_size))
802 if (dump_enabled_p ())
804 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
805 "Build SLP failed: the number "
806 "of interleaved loads is greater than "
807 "the SLP group size ");
808 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
810 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
812 /* Fatal mismatch. */
/* Track this load's chain head; OLD_FIRST_LOAD lets us detect the
   "same load repeated" case below.  */
817 old_first_load = first_load;
818 first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
821 /* Check that there are no loads from different interleaving
822 chains in the same node. */
823 if (prev_first_load != first_load)
825 if (dump_enabled_p ())
827 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
829 "Build SLP failed: different "
830 "interleaving chains in one node ");
831 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
833 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
840 prev_first_load = first_load;
842 /* In some cases a group of loads is just the same load
843 repeated N times. Only analyze its cost once. */
844 if (first_load == stmt && old_first_load != first_load)
846 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
847 if (vect_supportable_dr_alignment (first_dr, false)
848 == dr_unaligned_unsupported)
850 if (dump_enabled_p ())
852 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
854 "Build SLP failed: unsupported "
856 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
858 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
860 /* Fatal mismatch. */
866 } /* Grouped access. */
/* Non-grouped references (plain loads) are not supported.  */
869 if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
871 /* Not grouped load. */
872 if (dump_enabled_p ())
874 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
875 "Build SLP failed: not grouped load ");
876 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
877 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
880 /* FORNOW: Not grouped loads are not supported. */
881 /* Fatal mismatch. */
886 /* Not memory operation. */
887 if (TREE_CODE_CLASS (rhs_code) != tcc_binary
888 && TREE_CODE_CLASS (rhs_code) != tcc_unary
889 && TREE_CODE_CLASS (rhs_code) != tcc_expression
890 && rhs_code != CALL_EXPR)
892 if (dump_enabled_p ())
894 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
895 "Build SLP failed: operation");
896 dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
897 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
898 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
900 /* Fatal mismatch. */
/* All COND_EXPRs in the group must use the same comparison code.  */
905 if (rhs_code == COND_EXPR)
907 tree cond_expr = gimple_assign_rhs1 (stmt);
910 first_cond_code = TREE_CODE (cond_expr);
911 else if (first_cond_code != TREE_CODE (cond_expr))
913 if (dump_enabled_p ())
915 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
916 "Build SLP failed: different"
918 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
920 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
/* All stmts matched -- presumably MATCHES is set to all-true here
   before the two-operator validation; TODO confirm elided context.  */
931 for (i = 0; i < group_size; ++i)
935 /* If we allowed a two-operation SLP node verify the target can cope
936 with the permute we are going to use. */
937 if (alt_stmt_code != ERROR_MARK
938 && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
/* Build the blend mask: lanes computed by the alternate operation pick
   elements from the second vector.  */
941 = XALLOCAVEC (unsigned char, TYPE_VECTOR_SUBPARTS (vectype));
942 for (i = 0; i < TYPE_VECTOR_SUBPARTS (vectype); ++i)
945 if (gimple_assign_rhs_code (stmts[i % group_size]) == alt_stmt_code)
946 sel[i] += TYPE_VECTOR_SUBPARTS (vectype);
948 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
/* Target cannot do the blend: mark the alternate-code stmts as
   mismatching so callers can split the group.  */
950 for (i = 0; i < group_size; ++i)
951 if (gimple_assign_rhs_code (stmts[i]) == alt_stmt_code)
954 if (dump_enabled_p ())
956 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
957 "Build SLP failed: different operation "
959 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
961 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
963 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
/* Tell the caller this node computes two operations blended together.  */
969 *two_operators = true;
975 /* Recursively build an SLP tree starting from NODE.
976 Fail (and return a value not equal to zero) if def-stmts are not
977 isomorphic, require data permutation or are of unsupported types of
978 operation. Otherwise, return 0.
979 The value returned is the depth in the SLP tree where a mismatch
983 vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
984 slp_tree *node, unsigned int group_size,
985 unsigned int *max_nunits,
986 vec<slp_tree> *loads,
987 unsigned int vectorization_factor,
988 bool *matches, unsigned *npermutes, unsigned *tree_size,
989 unsigned max_tree_size)
991 unsigned nops, i, this_tree_size = 0;
/* The first scalar stmt is representative; derive the operand count the
   same way vect_create_new_slp_node does.  */
996 stmt = SLP_TREE_SCALAR_STMTS (*node)[0];
997 if (is_gimple_call (stmt))
998 nops = gimple_call_num_args (stmt);
999 else if (is_gimple_assign (stmt))
1001 nops = gimple_num_ops (stmt) - 1;
1002 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
/* Verify the group is isomorphic before recursing.  */
1008 bool two_operators = false;
1009 if (!vect_build_slp_tree_1 (loop_vinfo, bb_vinfo,
1010 SLP_TREE_SCALAR_STMTS (*node), group_size, nops,
1011 max_nunits, vectorization_factor, matches,
1014 SLP_TREE_TWO_OPERATORS (*node) = two_operators;
1016 /* If the SLP node is a load, terminate the recursion. */
1017 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
1018 && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
1020 loads->safe_push (*node);
1024 /* Get at the operands, verifying they are compatible. */
1025 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
1026 slp_oprnd_info oprnd_info;
1027 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (*node), i, stmt)
/* vect_get_and_check_slp_defs returns -1 (fatal), 1 (fixable by
   swapping) or 0 (ok); case labels elided in this view.  */
1029 switch (vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo,
1030 stmt, i, &oprnds_info))
1036 vect_free_oprnd_info (oprnds_info);
1043 for (i = 0; i < group_size; ++i)
1046 vect_free_oprnd_info (oprnds_info);
1050 stmt = SLP_TREE_SCALAR_STMTS (*node)[0];
1052 /* Create SLP_TREE nodes for the definition node/s. */
1053 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
/* Snapshot state so failed child builds can be rolled back.  */
1056 unsigned old_nloads = loads->length ();
1057 unsigned old_max_nunits = *max_nunits;
/* Only internal defs need a child node; constants/externals are built
   as vectors of scalars later.  */
1059 if (oprnd_info->first_dt != vect_internal_def)
/* Bound the overall tree size to keep compile time in check.  */
1062 if (++this_tree_size > max_tree_size)
1064 vect_free_oprnd_info (oprnds_info);
1068 child = vect_create_new_slp_node (oprnd_info->def_stmts);
1071 vect_free_oprnd_info (oprnds_info);
/* Recurse for this operand's def stmts.  */
1075 if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
1076 group_size, max_nunits, loads,
1077 vectorization_factor, matches,
1078 npermutes, &this_tree_size, max_tree_size))
1080 /* If we have all children of child built up from scalars then just
1081 throw that away and build it up this node from scalars. */
1082 if (!SLP_TREE_CHILDREN (child).is_empty ())
1085 slp_tree grandchild;
/* Guard elided in this view -- presumably checks that every grandchild
   is NULL (i.e. built from scalars); TODO confirm.  */
1087 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1088 if (grandchild != NULL)
/* Roll back the child's effects and record the operand as externally
   built (NULL child).  */
1093 *max_nunits = old_max_nunits;
1094 loads->truncate (old_nloads);
1095 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1096 vect_free_slp_tree (grandchild);
1097 SLP_TREE_CHILDREN (child).truncate (0);
1099 dump_printf_loc (MSG_NOTE, vect_location,
1100 "Building parent vector operands from "
1101 "scalars instead\n");
/* Ownership of def_stmts moved into CHILD; clear our reference before
   freeing it.  */
1102 oprnd_info->def_stmts = vNULL;
1103 vect_free_slp_tree (child);
1104 SLP_TREE_CHILDREN (*node).quick_push (NULL);
/* Child built successfully: hand over def_stmts and attach it.  */
1109 oprnd_info->def_stmts = vNULL;
1110 SLP_TREE_CHILDREN (*node).quick_push (child);
1114 /* If the SLP build failed fatally and we analyze a basic-block
1115 simply treat nodes we fail to build as externally defined
1116 (and thus build vectors from the scalar defs).
1117 The cost model will reject outright expensive cases.
1118 ??? This doesn't treat cases where permutation ultimatively
1119 fails (or we don't try permutation below). Ideally we'd
1120 even compute a permutation that will end up with the maximum
1124 /* ??? Rejecting patterns this way doesn't work. We'd have to
1125 do extra work to cancel the pattern so the uses see the
1127 && !is_pattern_stmt_p (vinfo_for_stmt (stmt)))
1130 slp_tree grandchild;
/* Roll back and treat this operand as externally defined.  */
1133 *max_nunits = old_max_nunits;
1134 loads->truncate (old_nloads);
1135 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1136 vect_free_slp_tree (grandchild);
1137 SLP_TREE_CHILDREN (child).truncate (0);
1139 dump_printf_loc (MSG_NOTE, vect_location,
1140 "Building vector operands from scalars\n");
1141 oprnd_info->def_stmts = vNULL;
1142 vect_free_slp_tree (child);
1143 SLP_TREE_CHILDREN (*node).quick_push (NULL);
1147 /* If the SLP build for operand zero failed and operand zero
1148 and one can be commutated try that for the scalar stmts
1149 that failed the match. */
1151 /* A first scalar stmt mismatch signals a fatal mismatch. */
1153 /* ??? For COND_EXPRs we can swap the comparison operands
1154 as well as the arms under some constraints. */
1156 && oprnds_info[1]->first_dt == vect_internal_def
1157 && is_gimple_assign (stmt)
1158 && commutative_tree_code (gimple_assign_rhs_code (stmt))
1159 && !SLP_TREE_TWO_OPERATORS (*node)
1160 /* Do so only if the number of not successful permutes was nor more
1161 than a cut-ff as re-trying the recursive match on
1162 possibly each level of the tree would expose exponential
1167 slp_tree grandchild;
/* Roll back the failed build before retrying with swapped operands.  */
1170 *max_nunits = old_max_nunits;
1171 loads->truncate (old_nloads);
1172 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1173 vect_free_slp_tree (grandchild);
1174 SLP_TREE_CHILDREN (child).truncate (0);
1176 /* Swap mismatched definition stmts. */
1177 dump_printf_loc (MSG_NOTE, vect_location,
1178 "Re-trying with swapped operands of stmts ");
/* Swap def stmts of operands 0 and 1 for the mismatching group
   members (the mismatch guard is elided in this view).  */
1179 for (j = 0; j < group_size; ++j)
1182 gimple tem = oprnds_info[0]->def_stmts[j];
1183 oprnds_info[0]->def_stmts[j] = oprnds_info[1]->def_stmts[j];
1184 oprnds_info[1]->def_stmts[j] = tem;
1185 dump_printf (MSG_NOTE, "%d ", j);
1187 dump_printf (MSG_NOTE, "\n");
1188 /* And try again with scratch 'matches' ... */
1189 bool *tem = XALLOCAVEC (bool, group_size);
1190 if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
1191 group_size, max_nunits, loads,
1192 vectorization_factor,
1193 tem, npermutes, &this_tree_size,
1196 /* ... so if successful we can apply the operand swapping
1197 to the GIMPLE IL. This is necessary because for example
1198 vect_get_slp_defs uses operand indexes and thus expects
1199 canonical operand order. */
1200 for (j = 0; j < group_size; ++j)
1203 gimple stmt = SLP_TREE_SCALAR_STMTS (*node)[j];
1204 swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
1205 gimple_assign_rhs2_ptr (stmt));
1207 oprnd_info->def_stmts = vNULL;
1208 SLP_TREE_CHILDREN (*node).quick_push (child);
/* All recovery attempts failed: free the child and give up.  */
1215 oprnd_info->def_stmts = vNULL;
1216 vect_free_slp_tree (child);
1217 vect_free_oprnd_info (oprnds_info);
/* Success: propagate the accumulated tree size to the caller.  */
1222 *tree_size += this_tree_size;
1224 vect_free_oprnd_info (oprnds_info);
1228 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
1231 vect_print_slp_tree (int dump_kind, slp_tree node)
1240 dump_printf (dump_kind, "node ");
   /* Print every scalar stmt of NODE with its index within the group.  */
1241 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1243 dump_printf (dump_kind, "\n\tstmt %d ", i);
1244 dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
1246 dump_printf (dump_kind, "\n");
   /* Recurse into the children, depth-first.  */
1248 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1249 vect_print_slp_tree (dump_kind, child);
1253 /* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
1254 If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
1255 J). Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
1256 stmts in NODE are to be marked. */
1259 vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
   /* J < 0 means "mark everything"; otherwise only the stmt at index J.  */
1268 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1269 if (j < 0 || i == j)
1270 STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;
   /* Propagate the same marking down the tree.  */
1272 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1273 vect_mark_slp_stmts (child, mark, j);
1277 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
1280 vect_mark_slp_stmts_relevant (slp_tree node)
1284 stmt_vec_info stmt_info;
   /* Each stmt must be either not yet relevant or already used-in-scope;
      then force it to vect_used_in_scope.  */
1290 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1292 stmt_info = vinfo_for_stmt (stmt);
1293 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
1294 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
1295 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
   /* Recurse into the children.  */
1298 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1299 vect_mark_slp_stmts_relevant (child);
1303 /* Rearrange the statements of NODE according to PERMUTATION.  Applies the
   same PERMUTATION recursively to the whole subtree.  */
1306 vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
1307 vec<unsigned> permutation)
1310 vec<gimple> tmp_stmts;
   /* Permute the children first (bottom-up).  */
1314 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1315 vect_slp_rearrange_stmts (child, group_size, permutation)
1317 gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
   /* Build the permuted stmt vector: stmt I moves to PERMUTATION[I].  */
1318 tmp_stmts.create (group_size);
1319 tmp_stmts.quick_grow_cleared (group_size);
1321 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1322 tmp_stmts[permutation[i]] = stmt;
   /* Replace the old stmt vector with the permuted one.  */
1324 SLP_TREE_SCALAR_STMTS (node).release ();
1325 SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
1329 /* Check if the required load permutations in the SLP instance
1330 SLP_INSTN are supported. */
/* NOTE(review): returns a truth value (presumably bool — return-type line is
   not visible here); as a side effect it may rearrange stmts and release the
   per-node load permutations when no runtime permutation is needed.  */
1333 vect_supported_load_permutation_p (slp_instance slp_instn)
1335 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
1336 unsigned int i, j, k, next;
1339 gimple stmt, load, next_load, first_load;
1340 struct data_reference *dr;
/* Dump all load permutations of the instance for diagnostics.  */
1342 if (dump_enabled_p ())
1344 dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
1345 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1346 if (node->load_permutation.exists ())
1347 FOR_EACH_VEC_ELT (node->load_permutation, j, next)
1348 dump_printf (MSG_NOTE, "%d ", next);
/* Identity permutation 0..group_size-1 is printed for nodes without one.  */
1350 for (k = 0; k < group_size; ++k)
1351 dump_printf (MSG_NOTE, "%d ", k);
1352 dump_printf (MSG_NOTE, "\n");
1355 /* In case of reduction every load permutation is allowed, since the order
1356 of the reduction statements is not important (as opposed to the case of
1357 grouped stores). The only condition we need to check is that all the
1358 load nodes are of the same size and have the same permutation (and then
1359 rearrange all the nodes of the SLP instance according to this
1362 /* Check that all the load nodes are of the same size. */
1363 /* ??? Can't we assert this? */
1364 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1365 if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
1368 node = SLP_INSTANCE_TREE (slp_instn);
1369 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1371 /* Reduction (there are no data-refs in the root).
1372 In reduction chain the order of the loads is important. */
1373 if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
1374 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1379 /* Compare all the permutation sequences to the first one. We know
1380 that at least one load is permuted. */
1381 node = SLP_INSTANCE_LOADS (slp_instn)[0];
1382 if (!node->load_permutation.exists ())
1384 for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
1386 if (!load->load_permutation.exists ())
1388 FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
1389 if (lidx != node->load_permutation[j])
1393 /* Check that the loads in the first sequence are different and there
1394 are no gaps between them. */
/* The sbitmap tracks which group positions are covered exactly once.  */
1395 load_index = sbitmap_alloc (group_size);
1396 bitmap_clear (load_index);
1397 FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
/* Duplicate position => not a permutation; bail out (free the bitmap).  */
1399 if (bitmap_bit_p (load_index, lidx))
1401 sbitmap_free (load_index);
1404 bitmap_set_bit (load_index, lidx);
/* Any unset bit means a gap in the permutation.  */
1406 for (i = 0; i < group_size; i++)
1407 if (!bitmap_bit_p (load_index, i))
1409 sbitmap_free (load_index);
1412 sbitmap_free (load_index);
1414 /* This permutation is valid for reduction. Since the order of the
1415 statements in the nodes is not important unless they are memory
1416 accesses, we can rearrange the statements in all the nodes
1417 according to the order of the loads. */
1418 vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
1419 node->load_permutation);
1421 /* We are done, no actual permutations need to be generated. */
1422 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1423 SLP_TREE_LOAD_PERMUTATION (node).release ();
1427 /* In basic block vectorization we allow any subchain of an interleaving
1429 FORNOW: not supported in loop SLP because of realignment compications. */
1430 if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
1432 /* Check whether the loads in an instance form a subchain and thus
1433 no permutation is necessary. */
1434 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1436 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
1438 bool subchain_p = true;
/* A subchain means the node's loads appear in GROUP_NEXT_ELEMENT order.  */
1440 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
1442 if (j != 0 && next_load != load)
1447 next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
1450 SLP_TREE_LOAD_PERMUTATION (node).release ();
1453 /* Verify the permutation can be generated. */
1455 if (!vect_transform_slp_perm_load (node, tem, NULL,
1456 1, slp_instn, true))
1458 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1460 "unsupported load permutation\n");
1466 /* Check that the alignment of the first load in every subchain, i.e.,
1467 the first statement in every load node, is supported.
1468 ??? This belongs in alignment checking. */
1469 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1471 first_load = SLP_TREE_SCALAR_STMTS (node)[0];
/* Only mid-chain first loads need the extra alignment check here.  */
1472 if (first_load != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load)))
1474 dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load));
1475 if (vect_supportable_dr_alignment (dr, false)
1476 == dr_unaligned_unsupported)
1478 if (dump_enabled_p ())
1480 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1482 "unsupported unaligned load ");
1483 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1485 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1495 /* For loop vectorization verify we can generate the permutation. */
1496 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1497 if (node->load_permutation.exists ()
1498 && !vect_transform_slp_perm_load
1500 SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true))
1507 /* Find the last store in SLP INSTANCE. */
1510 vect_find_last_scalar_stmt_in_slp (slp_tree node)
1512 gimple last = NULL, stmt;
1514 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt); i++)
1516 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1517 if (is_pattern_stmt_p (stmt_vinfo))
1518 last = get_later_stmt (STMT_VINFO_RELATED_STMT (stmt_vinfo), last);
1520 last = get_later_stmt (stmt, last);
1526 /* Compute the cost for the SLP node NODE in the SLP instance INSTANCE. */
/* NOTE(review): costs are appended to PROLOGUE_COST_VEC / BODY_COST_VEC
   rather than committed directly, so callers can discard them if SLP
   analysis later fails.  NCOPIES_FOR_COST is the number of vector stmts
   that will be generated per scalar group.  */
1529 vect_analyze_slp_cost_1 (slp_instance instance, slp_tree node,
1530 stmt_vector_for_cost *prologue_cost_vec,
1531 stmt_vector_for_cost *body_cost_vec,
1532 unsigned ncopies_for_cost)
1537 stmt_vec_info stmt_info;
1539 unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
1541 /* Recurse down the SLP tree. */
1542 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1544 vect_analyze_slp_cost_1 (instance, child, prologue_cost_vec,
1545 body_cost_vec, ncopies_for_cost);
1547 /* Look at the first scalar stmt to determine the cost. */
1548 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1549 stmt_info = vinfo_for_stmt (stmt);
1550 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1552 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
1553 vect_model_store_cost (stmt_info, ncopies_for_cost, false,
1554 vect_uninitialized_def,
1555 node, prologue_cost_vec, body_cost_vec);
/* Grouped access that is not a write must be a read.  */
1559 gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
1560 vect_model_load_cost (stmt_info, ncopies_for_cost, false,
1561 node, prologue_cost_vec, body_cost_vec);
1562 /* If the load is permuted record the cost for the permutation.
1563 ??? Loads from multiple chains are let through here only
1564 for a single special case involving complex numbers where
1565 in the end no permutation is necessary. */
1566 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, s)
1567 if ((STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo_for_stmt (s))
1568 == STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info))
1569 && vect_get_place_in_interleaving_chain
1570 (s, STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info)) != i)
1572 record_stmt_cost (body_cost_vec, group_size, vec_perm,
1573 stmt_info, 0, vect_body);
/* Non-grouped access: plain vector stmt cost per copy.  */
1580 record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
1581 stmt_info, 0, vect_body);
/* A two-operator node needs a second stmt plus a blend permutation.  */
1582 if (SLP_TREE_TWO_OPERATORS (node))
1584 record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
1585 stmt_info, 0, vect_body);
1586 record_stmt_cost (body_cost_vec, ncopies_for_cost, vec_perm,
1587 stmt_info, 0, vect_body);
1591 /* Scan operands and account for prologue cost of constants/externals.
1592 ??? This over-estimates cost for multiple uses and should be
1594 lhs = gimple_get_lhs (stmt);
1595 for (i = 0; i < gimple_num_ops (stmt); ++i)
1597 tree def, op = gimple_op (stmt, i);
1599 enum vect_def_type dt;
/* Skip absent operands and the def itself.  */
1600 if (!op || op == lhs)
1602 if (vect_is_simple_use (op, NULL, STMT_VINFO_LOOP_VINFO (stmt_info),
1603 STMT_VINFO_BB_VINFO (stmt_info),
1604 &def_stmt, &def, &dt))
1606 /* Without looking at the actual initializer a vector of
1607 constants can be implemented as load from the constant pool.
1608 ??? We need to pass down stmt_info for a vector type
1609 even if it points to the wrong stmt. */
1610 if (dt == vect_constant_def)
1611 record_stmt_cost (prologue_cost_vec, 1, vector_load,
1612 stmt_info, 0, vect_prologue);
1613 else if (dt == vect_external_def)
1614 record_stmt_cost (prologue_cost_vec, 1, vec_construct,
1615 stmt_info, 0, vect_prologue);
1620 /* Compute the cost for the SLP instance INSTANCE. */
/* NOTE(review): DATA is the opaque target cost-model handle passed through
   to add_stmt_cost; confirm against callers.  */
1623 vect_analyze_slp_cost (slp_instance instance, void *data)
1625 stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
1626 unsigned ncopies_for_cost;
1627 stmt_info_for_cost *si;
1630 /* Calculate the number of vector stmts to create based on the unrolling
1631 factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
1632 GROUP_SIZE / NUNITS otherwise. */
1633 unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
1634 slp_tree node = SLP_INSTANCE_TREE (instance);
1635 stmt_vec_info stmt_info = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
1636 /* Adjust the group_size by the vectorization factor which is always one
1637 for basic-block vectorization. */
1638 if (STMT_VINFO_LOOP_VINFO (stmt_info))
1639 group_size *= LOOP_VINFO_VECT_FACTOR (STMT_VINFO_LOOP_VINFO (stmt_info));
1640 unsigned nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1641 /* For reductions look at a reduction operand in case the reduction
1642 operation is widening like DOT_PROD or SAD. */
1643 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1645 gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
/* The interesting cases are the widening codes (e.g. DOT_PROD_EXPR /
   SAD_EXPR); the case labels are not visible in this excerpt.  */
1646 switch (gimple_assign_rhs_code (stmt))
1650 nunits = TYPE_VECTOR_SUBPARTS (get_vectype_for_scalar_type
1651 (TREE_TYPE (gimple_assign_rhs1 (stmt))));
1656 ncopies_for_cost = least_common_multiple (nunits, group_size) / nunits;
1658 prologue_cost_vec.create (10);
1659 body_cost_vec.create (10);
1660 vect_analyze_slp_cost_1 (instance, SLP_INSTANCE_TREE (instance),
1661 &prologue_cost_vec, &body_cost_vec,
1664 /* Record the prologue costs, which were delayed until we were
1665 sure that SLP was successful. */
1666 FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
1668 struct _stmt_vec_info *stmt_info
1669 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1670 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1671 si->misalign, vect_prologue);
1674 /* Record the instance's instructions in the target cost model. */
1675 FOR_EACH_VEC_ELT (body_cost_vec, i, si)
1677 struct _stmt_vec_info *stmt_info
1678 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1679 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1680 si->misalign, vect_body);
/* The temporary cost vectors are no longer needed once committed.  */
1683 prologue_cost_vec.release ();
1684 body_cost_vec.release ();
1687 /* Analyze an SLP instance starting from a group of grouped stores. Call
1688 vect_build_slp_tree to build a tree of packed stmts if possible.
1689 Return FALSE if it's impossible to SLP any stmt in the loop. */
/* NOTE(review): STMT is the seed — either the first element of a store
   group / reduction chain, or a representative reduction stmt.  */
1692 vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
1693 gimple stmt, unsigned max_tree_size)
1695 slp_instance new_instance;
1697 unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
1698 unsigned int unrolling_factor = 1, nunits;
1699 tree vectype, scalar_type = NULL_TREE;
1701 unsigned int vectorization_factor = 0;
1703 unsigned int max_nunits = 0;
1704 vec<slp_tree> loads;
1705 struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
1706 vec<gimple> scalar_stmts;
/* Determine the vector type and group size from the seed stmt: a grouped
   access derives it from the data-ref, the other arms handle reduction
   chains and plain reductions (loop vectorization only).  */
1708 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1712 scalar_type = TREE_TYPE (DR_REF (dr));
1713 vectype = get_vectype_for_scalar_type (scalar_type);
1717 gcc_assert (loop_vinfo);
1718 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1721 group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
1725 gcc_assert (loop_vinfo);
1726 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1727 group_size = LOOP_VINFO_REDUCTIONS (loop_vinfo).length ();
/* No usable vector type => cannot SLP this seed.  */
1732 if (dump_enabled_p ())
1734 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1735 "Build SLP failed: unsupported data-type ");
1736 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
1737 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1743 nunits = TYPE_VECTOR_SUBPARTS (vectype);
1745 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1747 vectorization_factor = nunits;
1749 /* Calculate the unrolling factor. */
1750 unrolling_factor = least_common_multiple (nunits, group_size) / group_size;
/* BB SLP cannot unroll — there is no loop to unroll.  */
1751 if (unrolling_factor != 1 && !loop_vinfo)
1753 if (dump_enabled_p ())
1754 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1755 "Build SLP failed: unrolling required in basic"
1761 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
1762 scalar_stmts.create (group_size);
1764 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1766 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
/* Pattern stmts replace their original stmt in the SLP node.  */
1769 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
1770 && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
1771 scalar_stmts.safe_push (
1772 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
1774 scalar_stmts.safe_push (next);
1775 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
1777 /* Mark the first element of the reduction chain as reduction to properly
1778 transform the node. In the reduction analysis phase only the last
1779 element of the chain is marked as reduction. */
1780 if (!STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
1781 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_reduction_def;
1785 /* Collect reduction statements. */
1786 vec<gimple> reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
1787 for (i = 0; reductions.iterate (i, &next); i++)
1788 scalar_stmts.safe_push (next);
1791 node = vect_create_new_slp_node (scalar_stmts);
1793 loads.create (group_size);
1795 /* Build the tree for the SLP instance. */
1796 bool *matches = XALLOCAVEC (bool, group_size);
1797 unsigned npermutes = 0;
1798 if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &node, group_size,
1799 &max_nunits, &loads,
1800 vectorization_factor, matches, &npermutes, NULL,
1803 /* Calculate the unrolling factor based on the smallest type. */
1804 if (max_nunits > nunits)
1805 unrolling_factor = least_common_multiple (max_nunits, group_size)
/* Re-check, since the smallest type may have raised the factor.  */
1808 if (unrolling_factor != 1 && !loop_vinfo)
1810 if (dump_enabled_p ())
1811 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1812 "Build SLP failed: unrolling required in basic"
1814 vect_free_slp_tree (node);
1819 /* Create a new SLP instance. */
1820 new_instance = XNEW (struct _slp_instance);
1821 SLP_INSTANCE_TREE (new_instance) = node;
1822 SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
1823 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
1824 SLP_INSTANCE_LOADS (new_instance) = loads;
1826 /* Compute the load permutation. */
1828 bool loads_permuted = false;
1829 FOR_EACH_VEC_ELT (loads, i, load_node)
1831 vec<unsigned> load_permutation;
1833 gimple load, first_stmt;
1834 bool this_load_permuted = false;
1835 load_permutation.create (group_size);
1836 first_stmt = GROUP_FIRST_ELEMENT
1837 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
1838 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
1841 = vect_get_place_in_interleaving_chain (load, first_stmt);
1842 gcc_assert (load_place != -1);
/* Any load out of chain order marks the node as permuted.  */
1843 if (load_place != j)
1844 this_load_permuted = true;
1845 load_permutation.safe_push (load_place);
/* Identity permutations are dropped to keep the common case cheap.  */
1847 if (!this_load_permuted)
1849 load_permutation.release ();
1852 SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
1853 loads_permuted = true;
1858 if (!vect_supported_load_permutation_p (new_instance))
1860 if (dump_enabled_p ())
1862 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1863 "Build SLP failed: unsupported load "
1865 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
1866 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1868 vect_free_slp_instance (new_instance);
/* Success: register the instance with the loop or BB info.  */
1875 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).safe_push (new_instance);
1877 BB_VINFO_SLP_INSTANCES (bb_vinfo).safe_push (new_instance);
1879 if (dump_enabled_p ())
1880 vect_print_slp_tree (MSG_NOTE, node);
1885 /* Failed to SLP. */
1886 /* Free the allocated memory. */
1887 vect_free_slp_tree (node);
1894 /* Check if there are stmts in the loop can be vectorized using SLP. Build SLP
1895 trees of packed scalar stmts if SLP is possible. */
/* NOTE(review): exactly one of LOOP_VINFO / BB_VINFO is non-NULL and selects
   where the candidate groups come from.  */
1898 vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
1899 unsigned max_tree_size)
1902 vec<gimple> grouped_stores;
1903 vec<gimple> reductions = vNULL;
1904 vec<gimple> reduc_chains = vNULL;
1905 gimple first_element;
1908 if (dump_enabled_p ())
1909 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===\n");
1913 grouped_stores = LOOP_VINFO_GROUPED_STORES (loop_vinfo);
1914 reduc_chains = LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo);
1915 reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
/* For BB vectorization only grouped stores are SLP seeds.  */
1918 grouped_stores = BB_VINFO_GROUPED_STORES (bb_vinfo);
1920 /* Find SLP sequences starting from groups of grouped stores. */
1921 FOR_EACH_VEC_ELT (grouped_stores, i, first_element)
1922 if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
1926 if (reduc_chains.length () > 0)
1928 /* Find SLP sequences starting from reduction chains. */
1929 FOR_EACH_VEC_ELT (reduc_chains, i, first_element)
1930 if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
1936 /* Don't try to vectorize SLP reductions if reduction chain was
1941 /* Find SLP sequences starting from groups of reductions. */
1942 if (reductions.length () > 1
1943 && vect_analyze_slp_instance (loop_vinfo, bb_vinfo, reductions[0],
1951 /* For each possible SLP instance decide whether to SLP it and calculate overall
1952 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
1953 least one instance. */
1956 vect_make_slp_decision (loop_vec_info loop_vinfo)
1958 unsigned int i, unrolling_factor = 1;
1959 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1960 slp_instance instance;
1961 int decided_to_slp = 0;
1963 if (dump_enabled_p ())
1964 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ==="
1967 FOR_EACH_VEC_ELT (slp_instances, i, instance)
1969 /* FORNOW: SLP if you can. */
1970 if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance))
1971 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance);
1973 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
1974 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
1975 loop-based vectorization. Such stmts will be marked as HYBRID. */
1976 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
1980 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
1982 if (decided_to_slp && dump_enabled_p ())
1983 dump_printf_loc (MSG_NOTE, vect_location,
1984 "Decided to SLP %d instances. Unrolling factor %d\n",
1985 decided_to_slp, unrolling_factor);
1987 return (decided_to_slp > 0);
1991 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
1992 can't be SLPed) in the tree rooted at NODE. Mark such stmts as HYBRID. */
/* NOTE(review): I selects which scalar lane of each node to inspect; STYPE
   carries hybrid-ness down the tree once established.  */
1995 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
1997 gimple stmt = SLP_TREE_SCALAR_STMTS (node)[i];
1998 imm_use_iterator imm_iter;
2000 stmt_vec_info use_vinfo, stmt_vinfo = vinfo_for_stmt (stmt);
2002 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2003 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2006 /* Propagate hybrid down the SLP tree. */
2007 if (stype == hybrid)
2009 else if (HYBRID_SLP_STMT (stmt_vinfo))
2013 /* Check if a pure SLP stmt has uses in non-SLP stmts. */
2014 gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
2015 /* We always get the pattern stmt here, but for immediate
2016 uses we have to use the LHS of the original stmt. */
2017 gcc_checking_assert (!STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
2018 if (STMT_VINFO_RELATED_STMT (stmt_vinfo))
2019 stmt = STMT_VINFO_RELATED_STMT (stmt_vinfo);
/* Walk all immediate uses of the stmt's SSA def inside the loop.  */
2020 if (TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME)
2021 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0))
2023 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2025 use_vinfo = vinfo_for_stmt (use_stmt);
/* Look through pattern stmts to the pattern's stmt-info.  */
2026 if (STMT_VINFO_IN_PATTERN_P (use_vinfo)
2027 && STMT_VINFO_RELATED_STMT (use_vinfo))
2028 use_vinfo = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (use_vinfo));
/* A relevant non-SLP use forces the def to be kept in loop-based
   vectorization as well, i.e. hybrid.  */
2029 if (!STMT_SLP_TYPE (use_vinfo)
2030 && (STMT_VINFO_RELEVANT (use_vinfo)
2031 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2032 && !(gimple_code (use_stmt) == GIMPLE_PHI
2033 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2038 if (stype == hybrid)
2040 if (dump_enabled_p ())
2042 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2043 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2045 STMT_SLP_TYPE (stmt_vinfo) = hybrid;
/* Recurse into children, propagating the (possibly new) type.  */
2048 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2050 vect_detect_hybrid_slp_stmts (child, i, stype);
2053 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses. */
/* Tree-walk callback: if *TP is a non-default-def SSA name whose defining
   stmt is a pure-SLP stmt inside the loop, mark that def as hybrid.  */
2056 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
2058 walk_stmt_info *wi = (walk_stmt_info *)data;
2059 struct loop *loopp = (struct loop *)wi->info;
2064 if (TREE_CODE (*tp) == SSA_NAME
2065 && !SSA_NAME_IS_DEFAULT_DEF (*tp))
2067 gimple def_stmt = SSA_NAME_DEF_STMT (*tp);
2068 if (flow_bb_inside_loop_p (loopp, gimple_bb (def_stmt))
2069 && PURE_SLP_STMT (vinfo_for_stmt (def_stmt)))
2071 if (dump_enabled_p ())
2073 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2074 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2076 STMT_SLP_TYPE (vinfo_for_stmt (def_stmt)) = hybrid;
/* Stmt-walk callback: skip (do not descend into operands of) stmts that
   already participate in an SLP instance.  */
2084 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
2087 /* If the stmt is in a SLP instance then this isn't a reason
2088 to mark use definitions in other SLP instances as hybrid. */
2089 if (STMT_SLP_TYPE (vinfo_for_stmt (gsi_stmt (*gsi))) != loop_vect)
2094 /* Find stmts that must be both vectorized and SLPed. */
2097 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2100 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2101 slp_instance instance;
2103 if (dump_enabled_p ())
2104 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ==="
2107 /* First walk all pattern stmt in the loop and mark defs of uses as
2108 hybrid because immediate uses in them are not recorded. */
2109 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2111 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2112 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2115 gimple stmt = gsi_stmt (gsi);
2116 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2117 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
/* Walk both the pattern stmt and its def sequence with the
   hybrid-marking callbacks defined above.  */
2120 memset (&wi, 0, sizeof (wi));
2121 wi.info = LOOP_VINFO_LOOP (loop_vinfo);
2122 gimple_stmt_iterator gsi2
2123 = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2124 walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
2125 vect_detect_hybrid_slp_1, &wi);
2126 walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
2127 vect_detect_hybrid_slp_2,
2128 vect_detect_hybrid_slp_1, &wi);
2133 /* Then walk the SLP instance trees marking stmts with uses in
2134 non-SLP stmts as hybrid, also propagating hybrid down the
2135 SLP tree, collecting the above info on-the-fly. */
2136 FOR_EACH_VEC_ELT (slp_instances, i, instance)
/* One pass per scalar lane of the instance.  */
2138 for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
2139 vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
2145 /* Create and initialize a new bb_vec_info struct for BB, as well as
2146 stmt_vec_info structs for all the stmts in it. */
2149 new_bb_vec_info (basic_block bb)
2151 bb_vec_info res = NULL;
2152 gimple_stmt_iterator gsi;
2154 res = (bb_vec_info) xcalloc (1, sizeof (struct _bb_vec_info));
2155 BB_VINFO_BB (res) = bb;
2157 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2159 gimple stmt = gsi_stmt (gsi);
2160 gimple_set_uid (stmt, 0);
2161 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, NULL, res));
2164 BB_VINFO_GROUPED_STORES (res).create (10);
2165 BB_VINFO_SLP_INSTANCES (res).create (2);
2166 BB_VINFO_TARGET_COST_DATA (res) = init_cost (NULL);
2173 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2174 stmts in the basic block. */
2177 destroy_bb_vec_info (bb_vec_info bb_vinfo)
2179 vec<slp_instance> slp_instances;
2180 slp_instance instance;
2182 gimple_stmt_iterator si;
2188 bb = BB_VINFO_BB (bb_vinfo);
/* Release the per-stmt vectorizer info first.  */
2190 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2192 gimple stmt = gsi_stmt (si);
2193 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2196 /* Free stmt_vec_info. */
2197 free_stmt_vec_info (stmt);
/* Then the data-ref, dependence, store-group, SLP-instance and cost-model
   state owned by the bb_vinfo itself.  */
2200 vect_destroy_datarefs (NULL, bb_vinfo);
2201 free_dependence_relations (BB_VINFO_DDRS (bb_vinfo));
2202 BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
2203 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2204 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2205 vect_free_slp_instance (instance);
2206 BB_VINFO_SLP_INSTANCES (bb_vinfo).release ();
2207 destroy_cost_data (BB_VINFO_TARGET_COST_DATA (bb_vinfo));
2213 /* Analyze statements contained in SLP tree node after recursively analyzing
2214 the subtree. Return TRUE if the operations are supported. */
2217 vect_slp_analyze_node_operations (slp_tree node)
/* Children are analyzed first; a failure anywhere fails the whole node.  */
2227 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2228 if (!vect_slp_analyze_node_operations (child))
2231 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2233 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2234 gcc_assert (stmt_info);
/* Pure loop-vect stmts must not appear inside an SLP node.  */
2235 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2237 if (!vect_analyze_stmt (stmt, &dummy, node))
2245 /* Analyze statements in SLP instances of the basic block. Return TRUE if the
2246 operations are supported. */
/* NOTE(review): DATA is the target cost-model handle forwarded to
   vect_analyze_slp_cost.  Failing instances are removed from the vector,
   so the loop index only advances on success.  */
2249 vect_slp_analyze_operations (vec<slp_instance> slp_instances, void *data)
2251 slp_instance instance;
2254 if (dump_enabled_p ())
2255 dump_printf_loc (MSG_NOTE, vect_location,
2256 "=== vect_slp_analyze_operations ===\n");
2258 for (i = 0; slp_instances.iterate (i, &instance); )
2260 if (!vect_slp_analyze_node_operations (SLP_INSTANCE_TREE (instance)))
2262 dump_printf_loc (MSG_NOTE, vect_location,
2263 "removing SLP instance operations starting from: ");
2264 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
2265 SLP_TREE_SCALAR_STMTS
2266 (SLP_INSTANCE_TREE (instance))[0], 0);
2267 vect_free_slp_instance (instance);
2268 slp_instances.ordered_remove (i);
2272 /* Compute the costs of the SLP instance. */
2273 vect_analyze_slp_cost (instance, data);
/* Nothing survived => no SLP vectorization for this BB.  */
2278 if (!slp_instances.length ())
2285 /* Compute the scalar cost of the SLP node NODE and its children
2286 and return it. Do not account defs that are marked in LIFE and
2287 update LIFE according to uses of NODE. */
2290 vect_bb_slp_scalar_cost (basic_block bb,
2291 slp_tree node, vec<bool, va_heap> *life)
2293 unsigned scalar_cost = 0;
2298 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2301 ssa_op_iter op_iter;
2302 def_operand_p def_p;
2303 stmt_vec_info stmt_info;
2308 /* If there is a non-vectorized use of the defs then the scalar
2309 stmt is kept live in which case we do not account it or any
2310 required defs in the SLP children in the scalar cost. This
2311 way we make the vectorization more costly when compared to
2313 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2315 imm_use_iterator use_iter;
/* A use in a PHI, outside BB, or in a non-vectorizable stmt keeps
   the scalar def live.  */
2317 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2318 if (!is_gimple_debug (use_stmt)
2319 && (gimple_code (use_stmt) == GIMPLE_PHI
2320 || gimple_bb (use_stmt) != bb
2321 || !STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (use_stmt))))
2324 BREAK_FROM_IMM_USE_STMT (use_iter);
/* Classify the stmt as load, store or plain stmt for costing.  */
2330 stmt_info = vinfo_for_stmt (stmt);
2331 if (STMT_VINFO_DATA_REF (stmt_info))
2333 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2334 stmt_cost = vect_get_stmt_cost (scalar_load);
2336 stmt_cost = vect_get_stmt_cost (scalar_store);
2339 stmt_cost = vect_get_stmt_cost (scalar_stmt);
2341 scalar_cost += stmt_cost;
/* Accumulate the children's scalar cost as well.  */
2344 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2346 scalar_cost += vect_bb_slp_scalar_cost (bb, child, life);
2351 /* Check if vectorization of the basic block is profitable. */
2354 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2356 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2357 slp_instance instance;
2359 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2360 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2362 /* Calculate scalar cost. */
2363 FOR_EACH_VEC_ELT (slp_instances, i, instance)
/* LIFE tracks per-lane whether the scalar def stays live (and is then
   not counted); starts all-false for each instance.  */
2365 auto_vec<bool, 20> life;
2366 life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
2367 scalar_cost += vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2368 SLP_INSTANCE_TREE (instance),
2372 /* Complete the target-specific cost calculation. */
2373 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2374 &vec_inside_cost, &vec_epilogue_cost);
2376 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2378 if (dump_enabled_p ())
2380 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2381 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
2383 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
2384 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
2385 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
2388 /* Vectorization is profitable if its cost is less than the cost of scalar
/* Note: ties go to the scalar version (>=, not >).  */
2390 if (vec_outside_cost + vec_inside_cost >= scalar_cost)
2396 /* Check if the basic block can be vectorized. */
/* NOTE(review): returns the constructed bb_vec_info on success and (after
   destroy_bb_vec_info) presumably NULL on each failure path — the return
   stmts are not visible in this excerpt; confirm against callers.  */
2399 vect_slp_analyze_bb_1 (basic_block bb)
2401 bb_vec_info bb_vinfo;
2402 vec<slp_instance> slp_instances;
2403 slp_instance instance;
2406 unsigned n_stmts = 0;
2408 bb_vinfo = new_bb_vec_info (bb);
/* Stage 1: data references must all be analyzable.  */
2412 if (!vect_analyze_data_refs (NULL, bb_vinfo, &min_vf, &n_stmts))
2414 if (dump_enabled_p ())
2415 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2416 "not vectorized: unhandled data-ref in basic "
2419 destroy_bb_vec_info (bb_vinfo);
/* Need at least two data-refs for BB SLP to be worthwhile.  */
2423 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
2425 if (dump_enabled_p ())
2426 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2427 "not vectorized: not enough data-refs in "
2430 destroy_bb_vec_info (bb_vinfo);
2434 if (!vect_analyze_data_ref_accesses (NULL, bb_vinfo))
2436 if (dump_enabled_p ())
2437 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2438 "not vectorized: unhandled data access in "
2441 destroy_bb_vec_info (bb_vinfo);
2445 vect_pattern_recog (NULL, bb_vinfo);
2447 if (!vect_analyze_data_refs_alignment (NULL, bb_vinfo))
2449 if (dump_enabled_p ())
2450 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2451 "not vectorized: bad data alignment in basic "
2454 destroy_bb_vec_info (bb_vinfo);
2458 /* Check the SLP opportunities in the basic block, analyze and build SLP
2460 if (!vect_analyze_slp (NULL, bb_vinfo, n_stmts))
2462 if (dump_enabled_p ())
2464 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2465 "Failed to SLP the basic block.\n");
2466 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2467 "not vectorized: failed to find SLP opportunities "
2468 "in basic block.\n");
2471 destroy_bb_vec_info (bb_vinfo);
2475 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2477 /* Mark all the statements that we want to vectorize as pure SLP and
2479 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2481 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2482 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
2485 /* Mark all the statements that we do not want to vectorize. */
2486 for (gimple_stmt_iterator gsi = gsi_start_bb (BB_VINFO_BB (bb_vinfo));
2487 !gsi_end_p (gsi); gsi_next (&gsi))
2489 stmt_vec_info vinfo = vinfo_for_stmt (gsi_stmt (gsi));
2490 if (STMT_SLP_TYPE (vinfo) != pure_slp)
2491 STMT_VINFO_VECTORIZABLE (vinfo) = false;
2494 /* Analyze dependences. At this point all stmts not participating in
2495 vectorization have to be marked. Dependence analysis assumes
2496 that we either vectorize all SLP instances or none at all. */
2497 if (!vect_slp_analyze_data_ref_dependences (bb_vinfo))
2499 if (dump_enabled_p ())
2500 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2501 "not vectorized: unhandled data dependence "
2502 "in basic block.\n");
2504 destroy_bb_vec_info (bb_vinfo);
2508 if (!vect_verify_datarefs_alignment (NULL, bb_vinfo))
2510 if (dump_enabled_p ())
2511 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2512 "not vectorized: unsupported alignment in basic "
2514 destroy_bb_vec_info (bb_vinfo);
2518 if (!vect_slp_analyze_operations (BB_VINFO_SLP_INSTANCES (bb_vinfo),
2519 BB_VINFO_TARGET_COST_DATA (bb_vinfo)))
2521 if (dump_enabled_p ())
2522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2523 "not vectorized: bad operation in basic block.\n");
2525 destroy_bb_vec_info (bb_vinfo);
2529 /* Cost model: check if the vectorization is worthwhile. */
2530 if (!unlimited_cost_model (NULL)
2531 && !vect_bb_vectorization_profitable_p (bb_vinfo))
2533 if (dump_enabled_p ())
2534 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2535 "not vectorized: vectorization is not "
2538 destroy_bb_vec_info (bb_vinfo);
2542 if (dump_enabled_p ())
2543 dump_printf_loc (MSG_NOTE, vect_location,
2544 "Basic block will be vectorized using SLP\n");
/* Driver for basic-block SLP analysis.  Counts the statements in BB that
   matter (skipping debug stmts, nops and labels), bails out when the count
   exceeds PARAM_SLP_MAX_INSNS_IN_BB, then calls vect_slp_analyze_bb_1,
   retrying with successively smaller vector sizes taken from
   targetm.vectorize.autovectorize_vector_sizes () until analysis succeeds
   or no candidate size remains.
   NOTE(review): this listing is a sampled extract -- several original
   lines (braces, returns, the insn counter increment) are not shown.  */
2551 vect_slp_analyze_bb (basic_block bb)
2553 bb_vec_info bb_vinfo;
2555 gimple_stmt_iterator gsi;
2556 unsigned int vector_sizes;
2558 if (dump_enabled_p ())
2559 dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");
/* Count only "real" statements toward the SLP insn limit.  */
2561 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2563 gimple stmt = gsi_stmt (gsi);
2564 if (!is_gimple_debug (stmt)
2565 && !gimple_nop_p (stmt)
2566 && gimple_code (stmt) != GIMPLE_LABEL)
2570 if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2572 if (dump_enabled_p ())
2573 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2574 "not vectorized: too many instructions in "
2580 /* Autodetect first vector size we try. */
2581 current_vector_size = 0;
2582 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2586 bb_vinfo = vect_slp_analyze_bb_1 (bb);
2590 destroy_bb_vec_info (bb_vinfo);
/* Drop the size just tried from the candidate bitmask; stop when no
   sizes are left or autodetection yielded none.  */
2592 vector_sizes &= ~current_vector_size;
2593 if (vector_sizes == 0
2594 || current_vector_size == 0)
2597 /* Try the next biggest vector size. */
2598 current_vector_size = 1 << floor_log2 (vector_sizes);
2599 if (dump_enabled_p ())
2600 dump_printf_loc (MSG_NOTE, vect_location,
2601 "***** Re-trying analysis with "
2602 "vector size %d\n", current_vector_size);
2607 /* For constant and loop invariant defs of SLP_NODE this function returns
2608 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
2609 OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
2610 scalar stmts. NUMBER_OF_VECTORS is the number of vector defs to create.
2611 REDUC_INDEX is the index of the reduction operand in the statements, unless
2615 vect_get_constant_vectors (tree op, slp_tree slp_node,
2616 vec<tree> *vec_oprnds,
2617 unsigned int op_num, unsigned int number_of_vectors,
2620 vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2621 gimple stmt = stmts[0];
2622 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2626 unsigned j, number_of_places_left_in_vector;
2629 int group_size = stmts.length ();
2630 unsigned int vec_num, i;
2631 unsigned number_of_copies = 1;
2633 voprnds.create (number_of_vectors);
2634 bool constant_p, is_store;
2635 tree neutral_op = NULL;
2636 enum tree_code code = gimple_expr_code (stmt);
2639 gimple_seq ctor_seq = NULL;
2641 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
2642 nunits = TYPE_VECTOR_SUBPARTS (vector_type);
/* For a reduction, redirect OP to the reduction operand and pick the
   neutral element for the reduction code (0 for sums, 1 for products,
   -1 for AND-like codes; see the cases below).
   NOTE(review): sampled extract -- the switch header and some case
   labels around here are elided.  */
2644 if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
2645 && reduc_index != -1)
2647 op_num = reduc_index;
2648 op = gimple_op (stmt, op_num + 1);
2649 /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
2650 we need either neutral operands or the original operands. See
2651 get_initial_def_for_reduction() for details. */
2654 case WIDEN_SUM_EXPR:
2661 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2662 neutral_op = build_real (TREE_TYPE (op), dconst0)
2664 neutral_op = build_int_cst (TREE_TYPE (op), 0);
2669 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2670 neutral_op = build_real (TREE_TYPE (op), dconst1);
2672 neutral_op = build_int_cst (TREE_TYPE (op), 1);
2677 neutral_op = build_int_cst (TREE_TYPE (op), -1);
2680 /* For MIN/MAX we don't have an easy neutral operand but
2681 the initial values can be used fine here. Only for
2682 a reduction chain we have to force a neutral element. */
2685 if (!GROUP_FIRST_ELEMENT (stmt_vinfo))
/* Use the PHI value on the preheader edge as the initial value.  */
2689 def_stmt = SSA_NAME_DEF_STMT (op);
2690 loop = (gimple_bb (stmt))->loop_father;
2691 neutral_op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2692 loop_preheader_edge (loop));
2697 gcc_assert (!GROUP_FIRST_ELEMENT (stmt_vinfo));
2702 if (STMT_VINFO_DATA_REF (stmt_vinfo))
2705 op = gimple_assign_rhs1 (stmt);
2712 if (CONSTANT_CLASS_P (op))
2717 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
2718 created vectors. It is greater than 1 if unrolling is performed.
2720 For example, we have two scalar operands, s1 and s2 (e.g., group of
2721 strided accesses of size two), while NUNITS is four (i.e., four scalars
2722 of this type can be packed in a vector). The output vector will contain
2723 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
2726 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
2727 containing the operands.
2729 For example, NUNITS is four as before, and the group size is 8
2730 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
2731 {s5, s6, s7, s8}. */
2733 number_of_copies = nunits * number_of_vectors / group_size;
2735 number_of_places_left_in_vector = nunits;
2736 elts = XALLOCAVEC (tree, nunits);
2737 bool place_after_defs = false;
/* Fill ELTS back-to-front, emitting one vector each time it becomes
   full.  The operand fetched per scalar stmt depends on the stmt shape
   (store rhs1, COND_EXPR sub-operand, call argument, shift count, ...).  */
2738 for (j = 0; j < number_of_copies; j++)
2740 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
2743 op = gimple_assign_rhs1 (stmt);
2749 if (op_num == 0 || op_num == 1)
2751 tree cond = gimple_assign_rhs1 (stmt);
2752 op = TREE_OPERAND (cond, op_num);
2757 op = gimple_assign_rhs2 (stmt);
2759 op = gimple_assign_rhs3 (stmt);
2764 op = gimple_call_arg (stmt, op_num);
2771 op = gimple_op (stmt, op_num + 1);
2772 /* Unlike the other binary operators, shifts/rotates have
2773 the shift count being int, instead of the same type as
2774 the lhs, so make sure the scalar is the right type if
2775 we are dealing with vectors of
2776 long long/long/short/char. */
2777 if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
2778 op = fold_convert (TREE_TYPE (vector_type), op);
2782 op = gimple_op (stmt, op_num + 1);
2787 if (reduc_index != -1)
2789 loop = (gimple_bb (stmt))->loop_father;
2790 def_stmt = SSA_NAME_DEF_STMT (op);
2794 /* Get the def before the loop. In reduction chain we have only
2795 one initial value. */
2796 if ((j != (number_of_copies - 1)
2797 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2802 op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2803 loop_preheader_edge (loop));
2806 /* Create 'vect_ = {op0,op1,...,opn}'. */
2807 number_of_places_left_in_vector--;
/* If the scalar's type differs from the vector element type, insert a
   VIEW_CONVERT_EXPR -- folded for constants, via a fresh SSA name and a
   deferred stmt (CTOR_SEQ) otherwise.  */
2809 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
2811 if (CONSTANT_CLASS_P (op))
2813 op = fold_unary (VIEW_CONVERT_EXPR,
2814 TREE_TYPE (vector_type), op);
2815 gcc_assert (op && CONSTANT_CLASS_P (op));
2819 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
2821 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type), op);
2823 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR, op);
2824 gimple_seq_add_stmt (&ctor_seq, init_stmt);
2828 elts[number_of_places_left_in_vector] = op;
2829 if (!CONSTANT_CLASS_P (op))
/* Non-constant defined in this BB: the constructor must be emitted
   after that definition rather than at the region start.  */
2831 if (TREE_CODE (orig_op) == SSA_NAME
2832 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
2833 && STMT_VINFO_BB_VINFO (stmt_vinfo)
2834 && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
2835 == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
2836 place_after_defs = true;
2838 if (number_of_places_left_in_vector == 0)
2840 number_of_places_left_in_vector = nunits;
/* All elements constant -> VECTOR_CST; otherwise build a CONSTRUCTOR.  */
2843 vec_cst = build_vector (vector_type, elts);
2846 vec<constructor_elt, va_gc> *v;
2848 vec_alloc (v, nunits);
2849 for (k = 0; k < nunits; ++k)
2850 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
2851 vec_cst = build_constructor (vector_type, v);
2854 gimple_stmt_iterator gsi;
2855 if (place_after_defs)
2858 (vect_find_last_scalar_stmt_in_slp (slp_node));
2859 init = vect_init_vector (stmt, vec_cst, vector_type, &gsi);
2862 init = vect_init_vector (stmt, vec_cst, vector_type, NULL);
2863 if (ctor_seq != NULL)
2865 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
2866 gsi_insert_seq_before_without_update (&gsi, ctor_seq,
2870 voprnds.quick_push (init);
2871 place_after_defs = false;
2876 /* Since the vectors are created in the reverse order, we should invert
2878 vec_num = voprnds.length ();
2879 for (j = vec_num; j != 0; j--)
2881 vop = voprnds[j - 1];
2882 vec_oprnds->quick_push (vop);
2887 /* In case that VF is greater than the unrolling factor needed for the SLP
2888 group of stmts, NUMBER_OF_VECTORS to be created is greater than
2889 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
2890 to replicate the vectors. */
2891 while (number_of_vectors > vec_oprnds->length ())
2893 tree neutral_vec = NULL;
2898 neutral_vec = build_vector_from_val (vector_type, neutral_op);
2900 vec_oprnds->quick_push (neutral_vec);
2904 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
2905 vec_oprnds->quick_push (vop);
2911 /* Get vectorized definitions from SLP_NODE that contains corresponding
2912 vectorized def-stmts. */
2915 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
2918 gimple vec_def_stmt;
/* The node must already have been vectorized; push the LHS of each of
   its vector stmts onto VEC_OPRNDS.  */
2921 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
2923 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
2925 gcc_assert (vec_def_stmt);
2926 vec_oprnd = gimple_get_lhs (vec_def_stmt);
2927 vec_oprnds->quick_push (vec_oprnd);
2932 /* Get vectorized definitions for SLP_NODE.
2933 If the scalar definitions are loop invariants or constants, collect them and
2934 call vect_get_constant_vectors() to create vector stmts.
2935 Otherwise, the def-stmts must be already vectorized and the vectorized stmts
2936 must be stored in the corresponding child of SLP_NODE, and we call
2937 vect_get_slp_vect_defs () to retrieve them. */
2940 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
2941 vec<vec<tree> > *vec_oprnds, int reduc_index)
2944 int number_of_vects = 0, i;
2945 unsigned int child_index = 0;
2946 HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
2947 slp_tree child = NULL;
2950 bool vectorized_defs;
2952 first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
2953 FOR_EACH_VEC_ELT (ops, i, oprnd)
2955 /* For each operand we check if it has vectorized definitions in a child
2956 node or we need to create them (for invariants and constants). We
2957 check if the LHS of the first stmt of the next child matches OPRND.
2958 If it does, we found the correct child. Otherwise, we call
2959 vect_get_constant_vectors (), and not advance CHILD_INDEX in order
2960 to check this child node for the next operand. */
2961 vectorized_defs = false;
2962 if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
2964 child = SLP_TREE_CHILDREN (slp_node)[child_index];
2966 /* We have to check both pattern and original def, if available. */
2969 gimple first_def = SLP_TREE_SCALAR_STMTS (child)[0];
2971 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
2973 if (operand_equal_p (oprnd, gimple_get_lhs (first_def), 0)
2975 && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
2977 /* The number of vector defs is determined by the number of
2978 vector statements in the node from which we get those
2980 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
2981 vectorized_defs = true;
2989 if (!vectorized_defs)
2993 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
2994 /* Number of vector stmts was calculated according to LHS in
2995 vect_schedule_slp_instance (), fix it by replacing LHS with
2996 RHS, if necessary. See vect_get_smallest_scalar_type () for
2998 vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
3000 if (rhs_size_unit != lhs_size_unit)
/* Scale the vector count by the RHS/LHS element-size ratio.  */
3002 number_of_vects *= rhs_size_unit;
3003 number_of_vects /= lhs_size_unit;
3008 /* Allocate memory for vectorized defs. */
3010 vec_defs.create (number_of_vects);
3012 /* For reduction defs we call vect_get_constant_vectors (), since we are
3013 looking for initial loop invariant values. */
3014 if (vectorized_defs && reduc_index == -1)
3015 /* The defs are already vectorized. */
3016 vect_get_slp_vect_defs (child, &vec_defs);
3018 /* Build vectors from scalar defs. */
3019 vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
3020 number_of_vects, reduc_index);
3022 vec_oprnds->quick_push (vec_defs);
3024 /* For reductions, we only need initial values. */
3025 if (reduc_index != -1)
3031 /* Create NCOPIES permutation statements using the mask MASK_BYTES (by
3032 building a vector of type MASK_TYPE from it) and two input vectors placed in
3033 DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy and
3034 shifting by STRIDE elements of DR_CHAIN for every copy.
3035 (STRIDE is the number of vectorized stmts for NODE divided by the number of
3037 VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE, where
3038 the created stmts must be inserted. */
3041 vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
3042 tree mask, int first_vec_indx, int second_vec_indx,
3043 gimple_stmt_iterator *gsi, slp_tree node,
3044 tree vectype, vec<tree> dr_chain,
3045 int ncopies, int vect_stmts_counter)
3048 gimple perm_stmt = NULL;
3049 stmt_vec_info next_stmt_info;
3051 tree first_vec, second_vec, data_ref;
3053 stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
3055 /* Initialize the vect stmts of NODE to properly insert the generated
3057 for (i = SLP_TREE_VEC_STMTS (node).length ();
3058 i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
3059 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
3061 perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
/* Emit one VEC_PERM_EXPR per copy, advancing the DR_CHAIN indices by
   STRIDE between copies.  */
3062 for (i = 0; i < ncopies; i++)
3064 first_vec = dr_chain[first_vec_indx];
3065 second_vec = dr_chain[second_vec_indx];
3067 /* Generate the permute statement. */
3068 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
3069 first_vec, second_vec, mask);
3070 data_ref = make_ssa_name (perm_dest, perm_stmt);
3071 gimple_set_lhs (perm_stmt, data_ref);
3072 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
3074 /* Store the vector statement in NODE. */
3075 SLP_TREE_VEC_STMTS (node)[stride * i + vect_stmts_counter] = perm_stmt;
3077 first_vec_indx += stride;
3078 second_vec_indx += stride;
3081 /* Mark the scalar stmt as vectorized. */
3082 next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
3083 STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
3087 /* Given FIRST_MASK_ELEMENT - the mask element in element representation,
3088 return in CURRENT_MASK_ELEMENT its equivalent in target specific
3089 representation. Check that the mask is valid and return FALSE if not.
3090 Return TRUE in NEED_NEXT_VECTOR if the permutation requires to move to
3091 the next vector, i.e., the current first vector is not needed. */
3094 vect_get_mask_element (gimple stmt, int first_mask_element, int m,
3095 int mask_nunits, bool only_one_vec, int index,
3096 unsigned char *mask, int *current_mask_element,
3097 bool *need_next_vector, int *number_of_mask_fixes,
3098 bool *mask_fixed, bool *needs_first_vector)
3102 /* Convert to target specific representation. */
3103 *current_mask_element = first_mask_element + m;
3104 /* Adjust the value in case it's a mask for second and third vectors. */
3105 *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1);
/* A negative element would reference a vector before the window -- the
   permutation cannot be expressed with two input vectors.  */
3107 if (*current_mask_element < 0)
3109 if (dump_enabled_p ())
3111 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3112 "permutation requires past vector ");
3113 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3114 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3119 if (*current_mask_element < mask_nunits)
3120 *needs_first_vector = true;
3122 /* We have only one input vector to permute but the mask accesses values in
3123 the next vector as well. */
3124 if (only_one_vec && *current_mask_element >= mask_nunits)
3126 if (dump_enabled_p ())
3128 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3129 "permutation requires at least two vectors ");
3130 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3131 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3137 /* The mask requires the next vector. */
3138 while (*current_mask_element >= mask_nunits * 2)
3140 if (*needs_first_vector || *mask_fixed)
3142 /* We either need the first vector too or have already moved to the
3143 next vector. In both cases, this permutation needs three
3145 if (dump_enabled_p ())
3147 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3148 "permutation requires at "
3149 "least three vectors ");
3150 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3151 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3157 /* We move to the next vector, dropping the first one and working with
3158 the second and the third - we need to adjust the values of the mask
3160 *current_mask_element -= mask_nunits * *number_of_mask_fixes;
/* Re-base the mask elements already emitted for this vector.  */
3162 for (i = 0; i < index; i++)
3163 mask[i] -= mask_nunits * *number_of_mask_fixes;
3165 (*number_of_mask_fixes)++;
3169 *need_next_vector = *mask_fixed;
3171 /* This was the last element of this mask. Start a new one. */
3172 if (index == mask_nunits - 1)
3174 *number_of_mask_fixes = 1;
3175 *mask_fixed = false;
3176 *needs_first_vector = false;
3183 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3184 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3185 permute statements for the SLP node NODE of the SLP instance
3186 SLP_NODE_INSTANCE. */
3189 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
3190 gimple_stmt_iterator *gsi, int vf,
3191 slp_instance slp_node_instance, bool analyze_only)
3193 gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3194 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3195 tree mask_element_type = NULL_TREE, mask_type;
3196 int i, j, k, nunits, vec_index = 0, scalar_index;
3197 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3198 gimple next_scalar_stmt;
3199 int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
3200 int first_mask_element;
3201 int index, unroll_factor, current_mask_element, ncopies;
3202 unsigned char *mask;
3203 bool only_one_vec = false, need_next_vector = false;
3204 int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
3205 int number_of_mask_fixes = 1;
3206 bool mask_fixed = false;
3207 bool needs_first_vector = false;
3210 mode = TYPE_MODE (vectype);
3212 if (!can_vec_perm_p (mode, false, NULL))
3214 if (dump_enabled_p ())
3216 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3217 "no vect permute for ");
3218 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3219 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3224 /* The generic VEC_PERM_EXPR code always uses an integral type of the
3225 same size as the vector element being permuted. */
3226 mask_element_type = lang_hooks.types.type_for_mode
3227 (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))), 1);
3228 mask_type = get_vectype_for_scalar_type (mask_element_type);
3229 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3230 mask = XALLOCAVEC (unsigned char, nunits);
3231 unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
3233 /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
3234 unrolling factor. */
3235 orig_vec_stmts_num = group_size *
3236 SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
3237 if (orig_vec_stmts_num == 1)
3238 only_one_vec = true;
3240 /* Number of copies is determined by the final vectorization factor
3241 relatively to SLP_NODE_INSTANCE unrolling factor. */
3242 ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
3244 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3247 stmt_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info));
3249 /* Generate permutation masks for every NODE. Number of masks for each NODE
3250 is equal to GROUP_SIZE.
3251 E.g., we have a group of three nodes with three loads from the same
3252 location in each node, and the vector size is 4. I.e., we have a
3253 a0b0c0a1b1c1... sequence and we need to create the following vectors:
3254 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3255 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3258 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3259 The last mask is illegal since we assume two operands for permute
3260 operation, and the mask element values can't be outside that range.
3261 Hence, the last mask must be converted into {2,5,5,5}.
3262 For the first two permutations we need the first and the second input
3263 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3264 we need the second and the third vectors: {b1,c1,a2,b2} and
3270 vect_stmts_counter = 0;
3272 first_vec_index = vec_index++;
3274 second_vec_index = first_vec_index;
3276 second_vec_index = vec_index++;
3278 for (j = 0; j < unroll_factor; j++)
3280 for (k = 0; k < group_size; k++)
3282 i = SLP_TREE_LOAD_PERMUTATION (node)[k];
3283 first_mask_element = i + j * STMT_VINFO_GROUP_SIZE (stmt_info);
3284 if (!vect_get_mask_element (stmt, first_mask_element, 0,
3285 nunits, only_one_vec, index,
3286 mask, ¤t_mask_element,
3288 &number_of_mask_fixes, &mask_fixed,
3289 &needs_first_vector))
3291 gcc_assert (current_mask_element >= 0
3292 && current_mask_element < 2 * nunits);
3293 mask[index++] = current_mask_element;
3295 if (index == nunits)
3298 if (!can_vec_perm_p (mode, false, mask))
3300 if (dump_enabled_p ())
3302 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
3304 "unsupported vect permute { ");
3305 for (i = 0; i < nunits; ++i)
3306 dump_printf (MSG_MISSED_OPTIMIZATION, "%d ",
3308 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
3316 tree mask_vec, *mask_elts;
3317 mask_elts = XALLOCAVEC (tree, nunits);
3318 for (l = 0; l < nunits; ++l)
3319 mask_elts[l] = build_int_cst (mask_element_type,
3321 mask_vec = build_vector (mask_type, mask_elts);
3323 if (need_next_vector)
3325 first_vec_index = second_vec_index;
3326 second_vec_index = vec_index;
3330 = SLP_TREE_SCALAR_STMTS (node)[scalar_index++];
3332 vect_create_mask_and_perm (stmt, next_scalar_stmt,
3333 mask_vec, first_vec_index, second_vec_index,
3334 gsi, node, vectype, dr_chain,
3335 ncopies, vect_stmts_counter++);
3347 /* Vectorize SLP instance tree in postorder. */
3350 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
3351 unsigned int vectorization_factor)
3354 bool grouped_store, is_store;
3355 gimple_stmt_iterator si;
3356 stmt_vec_info stmt_info;
3357 unsigned int vec_stmts_size, nunits, group_size;
/* Postorder: vectorize all children before this node.  */
3365 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3366 vect_schedule_slp_instance (child, instance, vectorization_factor);
3368 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3369 stmt_info = vinfo_for_stmt (stmt);
3371 /* VECTYPE is the type of the destination. */
3372 vectype = STMT_VINFO_VECTYPE (stmt_info);
3373 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
3374 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
3376 /* For each SLP instance calculate number of vector stmts to be created
3377 for the scalar stmts in each node of the SLP tree. Number of vector
3378 elements in one vector iteration is the number of scalar elements in
3379 one scalar iteration (GROUP_SIZE) multiplied by VF divided by vector
3381 Unless this is a SLP reduction in which case the number of vector
3382 stmts is equal to the number of vector stmts of the children. */
3383 if (GROUP_FIRST_ELEMENT (stmt_info)
3384 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
3385 vec_stmts_size = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[0]);
3387 vec_stmts_size = (vectorization_factor * group_size) / nunits;
3389 if (!SLP_TREE_VEC_STMTS (node).exists ())
3391 SLP_TREE_VEC_STMTS (node).create (vec_stmts_size);
3392 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
3395 if (dump_enabled_p ())
3397 dump_printf_loc (MSG_NOTE,vect_location,
3398 "------>vectorizing SLP node starting from: ");
3399 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3400 dump_printf (MSG_NOTE, "\n");
3403 /* Vectorized stmts go before the last scalar stmt which is where
3404 all uses are ready. */
3405 si = gsi_for_stmt (vect_find_last_scalar_stmt_in_slp (node));
3407 /* Mark the first element of the reduction chain as reduction to properly
3408 transform the node. In the analysis phase only the last element of the
3409 chain is marked as reduction. */
3410 if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
3411 && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
3413 STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
3414 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3417 /* Handle two-operation SLP nodes by vectorizing the group with
3418 both operations and then performing a merge. */
3419 if (SLP_TREE_TWO_OPERATORS (node))
3421 enum tree_code code0 = gimple_assign_rhs_code (stmt);
3422 enum tree_code ocode;
3424 unsigned char *mask = XALLOCAVEC (unsigned char, group_size);
3425 bool allsame = true;
/* Record which stmts use the second opcode OCODE.  */
3426 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt)
3427 if (gimple_assign_rhs_code (ostmt) != code0)
3431 ocode = gimple_assign_rhs_code (ostmt);
3440 tree tmask = NULL_TREE;
/* Vectorize the whole group once per opcode (temporarily rewriting
   the rhs code), saving the two result sets in V0 and V1.  */
3441 vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3442 v0 = SLP_TREE_VEC_STMTS (node).copy ();
3443 SLP_TREE_VEC_STMTS (node).truncate (0);
3444 gimple_assign_set_rhs_code (stmt, ocode);
3445 vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3446 gimple_assign_set_rhs_code (stmt, code0);
3447 v1 = SLP_TREE_VEC_STMTS (node).copy ();
3448 SLP_TREE_VEC_STMTS (node).truncate (0);
3449 tree meltype = build_nonstandard_integer_type
3450 (GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (vectype))), 1);
3451 tree mvectype = get_same_sized_vectype (meltype, vectype);
/* Merge V0 and V1 element-wise with a VEC_PERM_EXPR selecting, per
   lane, the result computed with the lane's original opcode.  */
3453 for (j = 0; j < v0.length (); ++j)
3455 tree *melts = XALLOCAVEC (tree, TYPE_VECTOR_SUBPARTS (vectype));
3456 for (l = 0; l < TYPE_VECTOR_SUBPARTS (vectype); ++l)
3458 if (k >= group_size)
3460 melts[l] = build_int_cst
3461 (meltype, mask[k++] * TYPE_VECTOR_SUBPARTS (vectype) + l);
3463 tmask = build_vector (mvectype, melts);
3465 /* ??? Not all targets support a VEC_PERM_EXPR with a
3466 constant mask that would translate to a vec_merge RTX
3467 (with their vec_perm_const_ok). We can either not
3468 vectorize in that case or let veclower do its job.
3469 Unfortunately that isn't too great and at least for
3470 plus/minus we'd eventually like to match targets
3471 vector addsub instructions. */
3473 vstmt = gimple_build_assign (make_ssa_name (vectype),
3475 gimple_assign_lhs (v0[j]),
3476 gimple_assign_lhs (v1[j]), tmask);
3477 vect_finish_stmt_generation (stmt, vstmt, &si);
3478 SLP_TREE_VEC_STMTS (node).quick_push (vstmt);
3485 is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3489 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
3490 For loop vectorization this is done in vectorizable_call, but for SLP
3491 it needs to be deferred until end of vect_schedule_slp, because multiple
3492 SLP instances may refer to the same scalar stmt. */
3495 vect_remove_slp_scalar_calls (slp_tree node)
3497 gimple stmt, new_stmt;
3498 gimple_stmt_iterator gsi;
3502 stmt_vec_info stmt_info;
/* Recurse into children first, then process this node's scalar stmts.  */
3507 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3508 vect_remove_slp_scalar_calls (child);
3510 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
/* Skip stmts that are not calls, already removed, pattern stmts, or
   not pure SLP.  */
3512 if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
3514 stmt_info = vinfo_for_stmt (stmt);
3515 if (stmt_info == NULL
3516 || is_pattern_stmt_p (stmt_info)
3517 || !PURE_SLP_STMT (stmt_info))
/* Replace the call with "lhs = 0" and transfer the stmt_vec_info to
   the replacement statement.  */
3519 lhs = gimple_call_lhs (stmt);
3520 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
3521 set_vinfo_for_stmt (new_stmt, stmt_info);
3522 set_vinfo_for_stmt (stmt, NULL);
3523 STMT_VINFO_STMT (stmt_info) = new_stmt;
3524 gsi = gsi_for_stmt (stmt);
3525 gsi_replace (&gsi, new_stmt, false);
3526 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
3530 /* Generate vector code for all SLP instances in the loop/basic block. */
3533 vect_schedule_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
3535 vec<slp_instance> slp_instances;
3536 slp_instance instance;
3538 bool is_store = false;
/* Pick instances and VF from whichever vec_info (loop or BB) we got.  */
3542 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
3543 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3547 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
3551 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3553 /* Schedule the tree of INSTANCE. */
3554 is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
3556 if (dump_enabled_p ())
3557 dump_printf_loc (MSG_NOTE, vect_location,
3558 "vectorizing stmts using SLP.\n")
/* Second pass: clean up the now-dead scalar stores (and, for loop
   vectorization, the scalar calls) of each instance.  */
3561 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3563 slp_tree root = SLP_INSTANCE_TREE (instance);
3566 gimple_stmt_iterator gsi;
3568 /* Remove scalar call stmts. Do not do this for basic-block
3569 vectorization as not all uses may be vectorized.
3570 ??? Why should this be necessary? DCE should be able to
3571 remove the stmts itself.
3572 ??? For BB vectorization we can as well remove scalar
3573 stmts starting from the SLP tree root if they have no
3576 vect_remove_slp_scalar_calls (root);
3578 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
3579 && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
3581 if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
3584 if (is_pattern_stmt_p (vinfo_for_stmt (store)))
3585 store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
3586 /* Free the attached stmt_vec_info and remove the stmt. */
3587 gsi = gsi_for_stmt (store);
3588 unlink_stmt_vdef (store);
3589 gsi_remove (&gsi, true);
3590 release_defs (store);
3591 free_stmt_vec_info (store);
3599 /* Vectorize the basic block. */
3602 vect_slp_transform_bb (basic_block bb)
3604 bb_vec_info bb_vinfo = vec_info_for_bb (bb);
3605 gimple_stmt_iterator si;
3607 gcc_assert (bb_vinfo);
3609 if (dump_enabled_p ())
3610 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB\n");
3612 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
3614 gimple stmt = gsi_stmt (si);
3615 stmt_vec_info stmt_info;
3617 if (dump_enabled_p ())
3619 dump_printf_loc (MSG_NOTE, vect_location,
3620 "------>SLPing statement: ");
3621 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3622 dump_printf (MSG_NOTE, "\n");
3625 stmt_info = vinfo_for_stmt (stmt);
3626 gcc_assert (stmt_info);
3628 /* Schedule all the SLP instances when the first SLP stmt is reached. */
3629 if (STMT_SLP_TYPE (stmt_info))
3631 vect_schedule_slp (NULL, bb_vinfo);
3636 if (dump_enabled_p ())
3637 dump_printf_loc (MSG_NOTE, vect_location,
3638 "BASIC BLOCK VECTORIZED\n");
3640 destroy_bb_vec_info (bb_vinfo);