1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
30 #include "tree-pass.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
/* Forward declaration; the definition is not visible in this chunk.  */
50 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
51 slp_tree, stmt_vector_for_cost *);
53 /* Initialize a SLP node. */
/* Constructor: put every vec member into the released (vNULL) state and
   scalar members into their "unset" sentinels (vect_uninitialized_def,
   ERROR_MARK, NULL_TREE, NULL).
   NOTE(review): this chunk is lossily extracted -- the function braces and
   possibly further member initializations are missing here.  */
55 _slp_tree::_slp_tree ()
57 SLP_TREE_SCALAR_STMTS (this) = vNULL;
58 SLP_TREE_SCALAR_OPS (this) = vNULL;
59 SLP_TREE_VEC_STMTS (this) = vNULL;
60 SLP_TREE_VEC_DEFS (this) = vNULL;
61 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
62 SLP_TREE_CHILDREN (this) = vNULL;
63 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
64 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
65 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
66 SLP_TREE_CODE (this) = ERROR_MARK;
67 SLP_TREE_VECTYPE (this) = NULL_TREE;
68 SLP_TREE_REPRESENTATIVE (this) = NULL;
74 /* Tear down a SLP node. */
/* Destructor: releases every vec the constructor initialized to vNULL.
   Mirrors the member list above; releasing a vNULL vec is a no-op.  */
76 _slp_tree::~_slp_tree ()
78 SLP_TREE_CHILDREN (this).release ();
79 SLP_TREE_SCALAR_STMTS (this).release ();
80 SLP_TREE_SCALAR_OPS (this).release ();
81 SLP_TREE_VEC_STMTS (this).release ();
82 SLP_TREE_VEC_DEFS (this).release ();
83 SLP_TREE_LOAD_PERMUTATION (this).release ();
84 SLP_TREE_LANE_PERMUTATION (this).release ();
87 /* Recursively free the memory allocated for the SLP tree rooted at NODE.
88 FINAL_P is true if we have vectorized the instance or if we have
89 made a final decision not to vectorize the statements in any way. */
92 vect_free_slp_tree (slp_tree node, bool final_p)
/* Nodes are reference counted; only the last reference tears the node
   down (children are visited recursively first).  */
97 if (--node->refcnt != 0)
100 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
101 vect_free_slp_tree (child, final_p);
103 /* Don't update STMT_VINFO_NUM_SLP_USES if it isn't relevant.
104 Some statements might no longer exist, after having been
105 removed by vect_transform_stmt. Updating the remaining
106 statements would be redundant. */
109 stmt_vec_info stmt_info;
110 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
112 gcc_assert (STMT_VINFO_NUM_SLP_USES (stmt_info) > 0);
113 STMT_VINFO_NUM_SLP_USES (stmt_info)--;
120 /* Free the memory allocated for the SLP instance. FINAL_P is true if we
121 have vectorized the instance or if we have made a final decision not
122 to vectorize the statements in any way. */
125 vect_free_slp_instance (slp_instance instance, bool final_p)
/* Drops the instance's reference on the root tree, then releases the
   loads vector.  */
127 vect_free_slp_tree (SLP_INSTANCE_TREE (instance), final_p);
128 SLP_INSTANCE_LOADS (instance).release ();
133 /* Create an SLP node for SCALAR_STMTS. */
/* Takes ownership of SCALAR_STMTS.  NOPS pre-sizes the children vector.
   The first scalar stmt becomes the node's representative, and each
   stmt's SLP-use count is bumped (undone in vect_free_slp_tree).  */
136 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
138 slp_tree node = new _slp_tree;
139 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
140 SLP_TREE_CHILDREN (node).create (nops);
141 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
142 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
143 SLP_TREE_LANES (node) = scalar_stmts.length ();
146 stmt_vec_info stmt_info;
147 FOR_EACH_VEC_ELT (scalar_stmts, i, stmt_info)
148 STMT_VINFO_NUM_SLP_USES (stmt_info)++;
153 /* Create an SLP node for OPS. */
/* Overload for external (invariant) operands: takes ownership of OPS and
   marks the node vect_external_def; one lane per operand.  */
156 vect_create_new_slp_node (vec<tree> ops)
158 slp_tree node = new _slp_tree;
159 SLP_TREE_SCALAR_OPS (node) = ops;
160 SLP_TREE_DEF_TYPE (node) = vect_external_def;
161 SLP_TREE_LANES (node) = ops.length ();
166 /* This structure is used in creation of an SLP tree. Each instance
167 corresponds to the same operand in a group of scalar stmts in an SLP
169 typedef struct _slp_oprnd_info
171 /* Def-stmts for the operands. */
172 vec<stmt_vec_info> def_stmts;
175 /* Information about the first statement, its vector def-type, type, the
176 operand itself in case it's constant, and an indication if it's a pattern
179 enum vect_def_type first_dt;
/* NOTE(review): the extraction dropped lines here -- members `ops',
   `first_op_type' and `any_pattern' are referenced by
   vect_create_oprnd_info below but their declarations are missing.  */
184 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
/* Returns a vector of NOPS freshly XNEW-allocated slp_oprnd_info records,
   each with def_stmts/ops vectors pre-sized to GROUP_SIZE and the "first"
   fields reset.  Freed by vect_free_oprnd_info below.  */
186 static vec<slp_oprnd_info>
187 vect_create_oprnd_info (int nops, int group_size)
190 slp_oprnd_info oprnd_info;
191 vec<slp_oprnd_info> oprnds_info;
193 oprnds_info.create (nops);
194 for (i = 0; i < nops; i++)
196 oprnd_info = XNEW (struct _slp_oprnd_info);
197 oprnd_info->def_stmts.create (group_size);
198 oprnd_info->ops.create (group_size);
199 oprnd_info->first_dt = vect_uninitialized_def;
200 oprnd_info->first_op_type = NULL_TREE;
201 oprnd_info->any_pattern = false;
202 oprnds_info.quick_push (oprnd_info);
209 /* Free operands info. */
/* Releases the per-operand vectors, XDELETEs each record, then releases
   the outer vector -- the exact inverse of vect_create_oprnd_info.  */
212 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
215 slp_oprnd_info oprnd_info;
217 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
219 oprnd_info->def_stmts.release ();
220 oprnd_info->ops.release ();
221 XDELETE (oprnd_info);
224 oprnds_info.release ();
228 /* Return true if STMTS contains a pattern statement. */
/* Linear scan; the early-return on a hit (and the final `return false')
   fall in lines dropped by the extraction.  */
231 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
233 stmt_vec_info stmt_info;
235 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
236 if (is_pattern_stmt_p (stmt_info))
241 /* Return true when all lanes in the external or constant NODE have
/* Only valid on constant/external nodes (asserted).  Uniform means every
   scalar op is operand_equal_p to the first.  */
245 vect_slp_tree_uniform_p (slp_tree node)
247 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
248 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
250 /* Pre-existing vectors. */
251 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
255 tree op, first = NULL_TREE;
256 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
259 else if (!operand_equal_p (first, op, 0))
265 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
266 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
/* Walks the DR_GROUP_NEXT_ELEMENT chain accumulating gaps until
   STMT_INFO is found.  Bails out early (-1) if FIRST_STMT_INFO is not
   actually the chain head for STMT_INFO.  */
270 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
271 stmt_vec_info first_stmt_info)
273 stmt_vec_info next_stmt_info = first_stmt_info;
276 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
281 if (next_stmt_info == stmt_info)
283 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
285 result += DR_GROUP_GAP (next_stmt_info);
287 while (next_stmt_info);
292 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
293 using the method implemented by duplicate_and_interleave. Return true
294 if so, returning the number of intermediate vectors in *NVECTORS_OUT
295 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
299 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
300 tree elt_type, unsigned int *nvectors_out,
301 tree *vector_type_out,
/* Strategy: starting from the natural vector type for ELT_TYPE/COUNT,
   repeatedly try to view COUNT/NVECTORS consecutive elements as one wide
   integer element, and test that the target supports the two fixed
   interleaving permutes on that wide-element vector type.  */
304 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
305 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
308 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
309 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
310 unsigned int nvectors = 1;
313 scalar_int_mode int_mode;
314 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
315 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
317 /* Get the natural vector type for this SLP group size. */
318 tree int_type = build_nonstandard_integer_type
319 (GET_MODE_BITSIZE (int_mode), 1);
321 = get_vectype_for_scalar_type (vinfo, int_type, count);
323 && VECTOR_MODE_P (TYPE_MODE (vector_type))
324 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
325 GET_MODE_SIZE (base_vector_mode)))
327 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
328 together into elements of type INT_TYPE and using the result
329 to build NVECTORS vectors. */
330 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
/* Both selectors are built with 2 vectors-per-pattern, 3 elements each,
   as vec_perm_indices requires for variable-length encoding.  */
331 vec_perm_builder sel1 (nelts, 2, 3);
332 vec_perm_builder sel2 (nelts, 2, 3);
333 poly_int64 half_nelts = exact_div (nelts, 2);
334 for (unsigned int i = 0; i < 3; ++i)
337 sel1.quick_push (i + nelts);
338 sel2.quick_push (half_nelts + i);
339 sel2.quick_push (half_nelts + i + nelts);
341 vec_perm_indices indices1 (sel1, 2, nelts);
342 vec_perm_indices indices2 (sel2, 2, nelts);
343 if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
344 && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
347 *nvectors_out = nvectors;
349 *vector_type_out = vector_type;
352 permutes[0] = vect_gen_perm_mask_checked (vector_type,
354 permutes[1] = vect_gen_perm_mask_checked (vector_type,
/* Halve the fused element size and retry; fails once it no longer
   divides evenly.  */
361 if (!multiple_p (elt_bytes, 2, &elt_bytes))
367 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
368 they are of a valid type and that they match the defs of the first stmt of
369 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
370 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero *SWAP
371 indicates swap is required for cond_expr stmts. Specifically, *SWAP
372 is 1 if STMT is cond and operands of comparison need to be swapped;
373 *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
374 If there is any operand swap in this function, *SWAP is set to non-zero
376 If there was a fatal error return -1; if the error could be corrected by
377 swapping operands of father node of this one, return 1; if everything is
380 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
381 vec<stmt_vec_info> stmts, unsigned stmt_num,
382 vec<slp_oprnd_info> *oprnds_info)
384 stmt_vec_info stmt_info = stmts[stmt_num];
386 unsigned int i, number_of_oprnds;
387 enum vect_def_type dt = vect_uninitialized_def;
388 slp_oprnd_info oprnd_info;
389 int first_op_idx = 1;
390 unsigned int commutative_op = -1U;
391 bool first_op_cond = false;
392 bool first = stmt_num == 0;
/* Determine how many operands to inspect: call args for gcalls
   (just the mask for IFN_MASK_LOAD), rhs operands for gassigns.  */
394 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
396 number_of_oprnds = gimple_call_num_args (stmt);
398 if (gimple_call_internal_p (stmt))
400 internal_fn ifn = gimple_call_internal_fn (stmt);
401 commutative_op = first_commutative_argument (ifn);
403 /* Masked load, only look at mask. */
404 if (ifn == IFN_MASK_LOAD)
406 number_of_oprnds = 1;
407 /* Mask operand index. */
412 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
414 enum tree_code code = gimple_assign_rhs_code (stmt);
415 number_of_oprnds = gimple_num_ops (stmt) - 1;
416 /* Swap can only be done for cond_expr if asked to, otherwise we
417 could result in different comparison code to the first stmt. */
418 if (code == COND_EXPR
419 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
421 first_op_cond = true;
425 commutative_op = commutative_tree_code (code) ? 0U : -1U;
430 bool swapped = (*swap != 0);
431 gcc_assert (!swapped || first_op_cond);
432 for (i = 0; i < number_of_oprnds; i++)
437 /* Map indicating how operands of cond_expr should be swapped. */
438 int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
439 int *map = maps[*swap];
442 oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
443 first_op_idx), map[i]);
445 oprnd = gimple_op (stmt_info->stmt, map[i]);
448 oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
449 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
450 oprnd = TREE_OPERAND (oprnd, 0);
452 oprnd_info = (*oprnds_info)[i];
454 stmt_vec_info def_stmt_info;
455 if (!vect_is_simple_use (oprnd, vinfo, &dt, &def_stmt_info))
457 if (dump_enabled_p ())
458 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
459 "Build SLP failed: can't analyze def for %T\n",
465 if (def_stmt_info && is_pattern_stmt_p (def_stmt_info))
466 oprnd_info->any_pattern = true;
/* First group member: record the reference def-type/op-type that every
   later member must match.  */
470 /* For the swapping logic below force vect_reduction_def
471 for the reduction op in a SLP reduction group. */
472 if (!STMT_VINFO_DATA_REF (stmt_info)
473 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
474 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
476 dt = vect_reduction_def;
477 oprnd_info->first_dt = dt;
478 oprnd_info->first_op_type = TREE_TYPE (oprnd);
482 /* Not first stmt of the group, check that the def-stmt/s match
483 the def-stmt/s of the first stmt. Allow different definition
484 types for reduction chains: the first stmt must be a
485 vect_reduction_def (a phi node), and the rest
486 end in the reduction chain. */
487 tree type = TREE_TYPE (oprnd);
488 if ((oprnd_info->first_dt != dt
489 && !(oprnd_info->first_dt == vect_reduction_def
490 && !STMT_VINFO_DATA_REF (stmt_info)
491 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
493 && !STMT_VINFO_DATA_REF (def_stmt_info)
494 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
495 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
496 && !((oprnd_info->first_dt == vect_external_def
497 || oprnd_info->first_dt == vect_constant_def)
498 && (dt == vect_external_def
499 || dt == vect_constant_def)))
500 || !types_compatible_p (oprnd_info->first_op_type, type)
501 || (!STMT_VINFO_DATA_REF (stmt_info)
502 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
504 || STMT_VINFO_DATA_REF (def_stmt_info)
505 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
506 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
507 != (oprnd_info->first_dt != vect_reduction_def))))
509 /* Try swapping operands if we got a mismatch. */
510 if (i == commutative_op && !swapped)
512 if (dump_enabled_p ())
513 dump_printf_loc (MSG_NOTE, vect_location,
514 "trying swapped operands\n");
519 if (dump_enabled_p ())
520 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
521 "Build SLP failed: different types\n");
/* Variable-length vectors: invariant operands must be buildable via
   duplicate_and_interleave, which can_duplicate_and_interleave_p checks
   (booleans are rejected outright).  */
525 if ((dt == vect_constant_def
526 || dt == vect_external_def)
527 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
528 && (TREE_CODE (type) == BOOLEAN_TYPE
529 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
532 if (dump_enabled_p ())
533 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
534 "Build SLP failed: invalid type of def "
535 "for variable-length SLP %T\n", oprnd);
540 /* Check the types of the definitions. */
543 case vect_external_def:
544 /* Make sure to demote the overall operand to external. */
545 oprnd_info->first_dt = vect_external_def;
547 case vect_constant_def:
548 oprnd_info->def_stmts.quick_push (NULL);
549 oprnd_info->ops.quick_push (oprnd);
552 case vect_internal_def:
553 case vect_reduction_def:
554 if (oprnd_info->first_dt == vect_reduction_def
555 && !STMT_VINFO_DATA_REF (stmt_info)
556 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
557 && !STMT_VINFO_DATA_REF (def_stmt_info)
558 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
559 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
561 /* For a SLP reduction chain we want to duplicate the
562 reduction to each of the chain members. That gets
563 us a sane SLP graph (still the stmts are not 100%
564 correct wrt the initial values). */
566 oprnd_info->def_stmts.quick_push (oprnd_info->def_stmts[0]);
567 oprnd_info->ops.quick_push (oprnd_info->ops[0]);
571 case vect_induction_def:
572 oprnd_info->def_stmts.quick_push (def_stmt_info);
573 oprnd_info->ops.quick_push (oprnd);
577 /* FORNOW: Not supported. */
578 if (dump_enabled_p ())
579 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
580 "Build SLP failed: illegal type of def %T\n",
590 if (dump_enabled_p ())
591 dump_printf_loc (MSG_NOTE, vect_location,
592 "swapped operands to match def types in %G",
600 /* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
601 Return true if we can, meaning that this choice doesn't conflict with
602 existing SLP nodes that use STMT_INFO. */
605 vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
/* Fast path: an already-compatible vectype needs no update.  */
607 tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
608 if (old_vectype && useless_type_conversion_p (vectype, old_vectype))
611 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
612 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
614 /* We maintain the invariant that if any statement in the group is
615 used, all other members of the group have the same vector type. */
616 stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
617 stmt_vec_info member_info = first_info;
618 for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
619 if (STMT_VINFO_NUM_SLP_USES (member_info) > 0
620 || is_pattern_stmt_p (member_info))
/* No member is in use yet: safe to retype the whole group.  */
625 for (member_info = first_info; member_info;
626 member_info = DR_GROUP_NEXT_ELEMENT (member_info))
627 STMT_VINFO_VECTYPE (member_info) = vectype;
631 else if (STMT_VINFO_NUM_SLP_USES (stmt_info) == 0
632 && !is_pattern_stmt_p (stmt_info))
634 STMT_VINFO_VECTYPE (stmt_info) = vectype;
638 if (dump_enabled_p ())
640 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
641 "Build SLP failed: incompatible vector"
642 " types for: %G", stmt_info->stmt);
643 dump_printf_loc (MSG_NOTE, vect_location,
644 " old vector type: %T\n", old_vectype);
645 dump_printf_loc (MSG_NOTE, vect_location,
646 " new vector type: %T\n", vectype);
651 /* Return true if call statements CALL1 and CALL2 are similar enough
652 to be combined into the same SLP group. */
655 compatible_calls_p (gcall *call1, gcall *call2)
/* Requires same arg count and same combined fn; then compares lhs/arg
   types for internal fns, or callee + fntype for regular calls.  */
657 unsigned int nargs = gimple_call_num_args (call1);
658 if (nargs != gimple_call_num_args (call2))
661 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
664 if (gimple_call_internal_p (call1))
666 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
667 TREE_TYPE (gimple_call_lhs (call2))))
669 for (unsigned int i = 0; i < nargs; ++i)
670 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
671 TREE_TYPE (gimple_call_arg (call2, i))))
676 if (!operand_equal_p (gimple_call_fn (call1),
677 gimple_call_fn (call2), 0))
680 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
686 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
687 caller's attempt to find the vector type in STMT_INFO with the narrowest
688 element type. Return true if VECTYPE is nonnull and if it is valid
689 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
690 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
691 vect_build_slp_tree. */
694 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
695 unsigned int group_size,
696 tree vectype, poly_uint64 *max_nunits)
700 if (dump_enabled_p ())
701 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
702 "Build SLP failed: unsupported data-type in %G\n",
704 /* Fatal mismatch. */
708 /* If populating the vector type requires unrolling then fail
709 before adjusting *max_nunits for basic-block vectorization. */
710 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
711 unsigned HOST_WIDE_INT const_nunits;
/* BB SLP cannot unroll, so the vector must not hold more lanes than the
   group provides (and the lane count must be compile-time constant).  */
712 if (is_a <bb_vec_info> (vinfo)
713 && (!nunits.is_constant (&const_nunits)
714 || const_nunits > group_size))
716 if (dump_enabled_p ())
717 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
718 "Build SLP failed: unrolling required "
719 "in basic block SLP\n");
720 /* Fatal mismatch. */
724 /* In case of multiple types we need to detect the smallest type. */
725 vect_update_max_nunits (max_nunits, vectype);
729 /* Verify if the scalar stmts STMTS are isomorphic, require data
730 permutation or are of unsupported types of operation. Return
731 true if they are, otherwise return false and indicate in *MATCHES
732 which stmts are not isomorphic to the first one. If MATCHES[0]
733 is false then this indicates the comparison could not be
734 carried out or the stmts will never be vectorized by SLP.
736 Note COND_EXPR is possibly isomorphic to another one after swapping its
737 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
738 the first stmt by swapping the two operands of comparison; set SWAP[i]
739 to 2 if stmt I is isomorphic to the first stmt by inverting the code
740 of comparison. Take A1 >= B1 ? X1 : Y1 as an example, it can be swapped
741 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
744 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
745 vec<stmt_vec_info> stmts, unsigned int group_size,
746 poly_uint64 *max_nunits, bool *matches,
747 bool *two_operators, tree *node_vectype)
750 stmt_vec_info first_stmt_info = stmts[0];
751 enum tree_code first_stmt_code = ERROR_MARK;
752 enum tree_code alt_stmt_code = ERROR_MARK;
753 enum tree_code rhs_code = ERROR_MARK;
754 enum tree_code first_cond_code = ERROR_MARK;
756 bool need_same_oprnds = false;
757 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
760 machine_mode optab_op2_mode;
761 machine_mode vec_mode;
762 stmt_vec_info first_load = NULL, prev_first_load = NULL;
765 /* For every stmt in NODE find its def stmt/s. */
766 stmt_vec_info stmt_info;
767 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
769 gimple *stmt = stmt_info->stmt;
773 if (dump_enabled_p ())
774 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
776 /* Fail to vectorize statements marked as unvectorizable. */
777 if (!STMT_VINFO_VECTORIZABLE (stmt_info))
779 if (dump_enabled_p ())
780 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
781 "Build SLP failed: unvectorizable statement %G",
783 /* Fatal mismatch. */
788 lhs = gimple_get_lhs (stmt);
789 if (lhs == NULL_TREE)
791 if (dump_enabled_p ())
792 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
793 "Build SLP failed: not GIMPLE_ASSIGN nor "
794 "GIMPLE_CALL %G", stmt);
795 /* Fatal mismatch. */
801 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
802 &nunits_vectype, group_size)
804 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
805 nunits_vectype, max_nunits)))
807 /* Fatal mismatch. */
812 gcc_assert (vectype);
814 if (is_a <bb_vec_info> (vinfo)
815 && !vect_update_shared_vectype (stmt_info, vectype))
/* Classify the stmt: calls get rhs_code CALL_EXPR; unsupported call
   shapes (tail call, noreturn, throwing, static chain) are fatal.  */
818 gcall *call_stmt = dyn_cast <gcall *> (stmt);
821 rhs_code = CALL_EXPR;
823 if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
825 else if ((gimple_call_internal_p (call_stmt)
826 && (!vectorizable_internal_fn_p
827 (gimple_call_internal_fn (call_stmt))))
828 || gimple_call_tail_p (call_stmt)
829 || gimple_call_noreturn_p (call_stmt)
830 || !gimple_call_nothrow_p (call_stmt)
831 || gimple_call_chain (call_stmt))
833 if (dump_enabled_p ())
834 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
835 "Build SLP failed: unsupported call type %G",
837 /* Fatal mismatch. */
844 rhs_code = gimple_assign_rhs_code (stmt);
845 load_p = gimple_vuse (stmt);
848 /* Check the operation. */
/* First stmt of the group establishes the reference operation that
   every later stmt must match (or be an allowed alternate of).  */
851 *node_vectype = vectype;
852 first_stmt_code = rhs_code;
854 /* Shift arguments should be equal in all the packed stmts for a
855 vector shift with scalar shift operand. */
856 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
857 || rhs_code == LROTATE_EXPR
858 || rhs_code == RROTATE_EXPR)
860 vec_mode = TYPE_MODE (vectype);
862 /* First see if we have a vector/vector shift. */
863 optab = optab_for_tree_code (rhs_code, vectype,
867 || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
869 /* No vector/vector shift, try for a vector/scalar shift. */
870 optab = optab_for_tree_code (rhs_code, vectype,
875 if (dump_enabled_p ())
876 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
877 "Build SLP failed: no optab.\n");
878 /* Fatal mismatch. */
882 icode = (int) optab_handler (optab, vec_mode);
883 if (icode == CODE_FOR_nothing)
885 if (dump_enabled_p ())
886 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
888 "op not supported by target.\n");
889 /* Fatal mismatch. */
893 optab_op2_mode = insn_data[icode].operand[2].mode;
894 if (!VECTOR_MODE_P (optab_op2_mode))
896 need_same_oprnds = true;
897 first_op1 = gimple_assign_rhs2 (stmt);
901 else if (rhs_code == WIDEN_LSHIFT_EXPR)
903 need_same_oprnds = true;
904 first_op1 = gimple_assign_rhs2 (stmt);
907 && rhs_code == BIT_FIELD_REF)
909 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
910 if (TREE_CODE (vec) != SSA_NAME
911 || !types_compatible_p (vectype, TREE_TYPE (vec)))
913 if (dump_enabled_p ())
914 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
916 "BIT_FIELD_REF not supported\n");
917 /* Fatal mismatch. */
923 && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
925 need_same_oprnds = true;
926 first_op1 = gimple_call_arg (call_stmt, 1);
/* Subsequent stmts: allow real/imagpart mixes, plus/minus pairs
   (recorded via alt_stmt_code -> two_operators), and grouped-access
   reference forms; anything else is a per-lane mismatch.  */
931 if (first_stmt_code != rhs_code
932 && alt_stmt_code == ERROR_MARK)
933 alt_stmt_code = rhs_code;
934 if (first_stmt_code != rhs_code
935 && (first_stmt_code != IMAGPART_EXPR
936 || rhs_code != REALPART_EXPR)
937 && (first_stmt_code != REALPART_EXPR
938 || rhs_code != IMAGPART_EXPR)
939 /* Handle mismatches in plus/minus by computing both
940 and merging the results. */
941 && !((first_stmt_code == PLUS_EXPR
942 || first_stmt_code == MINUS_EXPR)
943 && (alt_stmt_code == PLUS_EXPR
944 || alt_stmt_code == MINUS_EXPR)
945 && rhs_code == alt_stmt_code)
946 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
947 && (first_stmt_code == ARRAY_REF
948 || first_stmt_code == BIT_FIELD_REF
949 || first_stmt_code == INDIRECT_REF
950 || first_stmt_code == COMPONENT_REF
951 || first_stmt_code == MEM_REF)))
953 if (dump_enabled_p ())
955 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
956 "Build SLP failed: different operation "
958 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
959 "original stmt %G", first_stmt_info->stmt)
965 if (need_same_oprnds)
967 tree other_op1 = (call_stmt
968 ? gimple_call_arg (call_stmt, 1)
969 : gimple_assign_rhs2 (stmt));
970 if (!operand_equal_p (first_op1, other_op1, 0))
972 if (dump_enabled_p ())
973 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
974 "Build SLP failed: different shift "
975 "arguments in %G", stmt);
981 && first_stmt_code == BIT_FIELD_REF
982 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
983 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
985 if (dump_enabled_p ())
986 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
987 "Build SLP failed: different BIT_FIELD_REF "
988 "arguments in %G", stmt);
993 if (!load_p && rhs_code == CALL_EXPR)
995 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
996 as_a <gcall *> (stmt)))
998 if (dump_enabled_p ())
999 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1000 "Build SLP failed: different calls in %G",
1008 /* Grouped store or load. */
1009 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1011 if (REFERENCE_CLASS_P (lhs))
1019 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1020 if (prev_first_load)
1022 /* Check that there are no loads from different interleaving
1023 chains in the same node. */
1024 if (prev_first_load != first_load)
1026 if (dump_enabled_p ())
1027 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1029 "Build SLP failed: different "
1030 "interleaving chains in one node %G",
1037 prev_first_load = first_load;
1039 } /* Grouped access. */
1044 /* Not grouped load. */
1045 if (dump_enabled_p ())
1046 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1047 "Build SLP failed: not grouped load %G", stmt);
1049 /* FORNOW: Not grouped loads are not supported. */
1050 /* Fatal mismatch. */
1055 /* Not memory operation. */
1056 if (TREE_CODE_CLASS (rhs_code) != tcc_binary
1057 && TREE_CODE_CLASS (rhs_code) != tcc_unary
1058 && TREE_CODE_CLASS (rhs_code) != tcc_expression
1059 && TREE_CODE_CLASS (rhs_code) != tcc_comparison
1060 && rhs_code != VIEW_CONVERT_EXPR
1061 && rhs_code != CALL_EXPR
1062 && rhs_code != BIT_FIELD_REF)
1064 if (dump_enabled_p ())
1065 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1066 "Build SLP failed: operation unsupported %G",
1068 /* Fatal mismatch. */
1073 if (rhs_code == COND_EXPR)
1075 tree cond_expr = gimple_assign_rhs1 (stmt);
1076 enum tree_code cond_code = TREE_CODE (cond_expr);
1077 enum tree_code swap_code = ERROR_MARK;
1078 enum tree_code invert_code = ERROR_MARK;
1081 first_cond_code = TREE_CODE (cond_expr);
1082 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1084 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1085 swap_code = swap_tree_comparison (cond_code);
1086 invert_code = invert_tree_comparison (cond_code, honor_nans);
1089 if (first_cond_code == cond_code)
1091 /* Isomorphic can be achieved by swapping. */
1092 else if (first_cond_code == swap_code)
1094 /* Isomorphic can be achieved by inverting. */
1095 else if (first_cond_code == invert_code)
1099 if (dump_enabled_p ())
1100 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1101 "Build SLP failed: different"
1102 " operation %G", stmt);
1112 for (i = 0; i < group_size; ++i)
1116 /* If we allowed a two-operation SLP node verify the target can cope
1117 with the permute we are going to use. */
1118 if (alt_stmt_code != ERROR_MARK
1119 && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
1121 *two_operators = true;
1127 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1128 Note we never remove apart from at destruction time so we do not
1129 need a special value for deleted that differs from empty. */
/* A stmt-set key hashes by the gimple UIDs of its members; equality is
   element-wise identity.  empty == deleted == the released (vNULL) vec.
   NOTE(review): the enclosing `struct bst_traits {' line was dropped by
   the extraction.  */
1132 typedef vec <stmt_vec_info> value_type;
1133 typedef vec <stmt_vec_info> compare_type;
1134 static inline hashval_t hash (value_type);
1135 static inline bool equal (value_type existing, value_type candidate);
1136 static inline bool is_empty (value_type x) { return !x.exists (); }
1137 static inline bool is_deleted (value_type x) { return !x.exists (); }
1138 static const bool empty_zero_p = true;
1139 static inline void mark_empty (value_type &x) { x.release (); }
1140 static inline void mark_deleted (value_type &x) { x.release (); }
1141 static inline void remove (value_type &x) { x.release (); }
1144 bst_traits::hash (value_type x)
1147 for (unsigned i = 0; i < x.length (); ++i)
1148 h.add_int (gimple_uid (x[i]->stmt));
1152 bst_traits::equal (value_type existing, value_type candidate)
1154 if (existing.length () != candidate.length ())
1156 for (unsigned i = 0; i < existing.length (); ++i)
1157 if (existing[i] != candidate[i])
/* Map from a stmt set to the (possibly NULL = failed) SLP tree built
   for it; used to share subtrees across SLP instances.  */
1162 typedef hash_map <vec <gimple *>, slp_tree,
1163 simple_hashmap_traits <bst_traits, slp_tree> >
1164 scalar_stmts_to_slp_tree_map_t;
/* Forward declaration of the recursive worker defined below.  */
1167 vect_build_slp_tree_2 (vec_info *vinfo,
1168 vec<stmt_vec_info> stmts, unsigned int group_size,
1169 poly_uint64 *max_nunits,
1170 bool *matches, unsigned *npermutes, unsigned *tree_size,
1171 scalar_stmts_to_slp_tree_map_t *bst_map);
/* Memoizing wrapper: look STMTS up in BST_MAP first and re-use a
   previously built (or previously failed, i.e. NULL) tree, taking an
   extra reference; otherwise build via vect_build_slp_tree_2 and cache
   the result keyed by a copy of STMTS.  */
1174 vect_build_slp_tree (vec_info *vinfo,
1175 vec<stmt_vec_info> stmts, unsigned int group_size,
1176 poly_uint64 *max_nunits,
1177 bool *matches, unsigned *npermutes, unsigned *tree_size,
1178 scalar_stmts_to_slp_tree_map_t *bst_map)
1180 if (slp_tree *leader = bst_map->get (stmts))
1182 if (dump_enabled_p ())
1183 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1184 *leader ? "" : "failed ", *leader);
1187 (*leader)->refcnt++;
1188 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1192 poly_uint64 this_max_nunits = 1;
1193 slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size,
1195 matches, npermutes, tree_size, bst_map);
1198 res->max_nunits = this_max_nunits;
1199 vect_update_max_nunits (max_nunits, this_max_nunits);
1200 /* Keep a reference for the bst_map use. */
1203 bst_map->put (stmts.copy (), res);
1207 /* Recursively build an SLP tree starting from NODE.
1208 Fail (and return a value not equal to zero) if def-stmts are not
1209 isomorphic, require data permutation or are of unsupported types of
1210 operation. Otherwise, return 0.
1211 The value returned is the depth in the SLP tree where a mismatch
1215 vect_build_slp_tree_2 (vec_info *vinfo,
1216 vec<stmt_vec_info> stmts, unsigned int group_size,
1217 poly_uint64 *max_nunits,
1218 bool *matches, unsigned *npermutes, unsigned *tree_size,
1219 scalar_stmts_to_slp_tree_map_t *bst_map)
1221 unsigned nops, i, this_tree_size = 0;
1222 poly_uint64 this_max_nunits = *max_nunits;
/* Determine the number of operands from the representative (first)
   stmt of the group: call arguments, assignment RHS operands, or PHI.  */
1227 stmt_vec_info stmt_info = stmts[0];
1228 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1229 nops = gimple_call_num_args (stmt);
1230 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
1232 nops = gimple_num_ops (stmt) - 1;
1233 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
1236 else if (is_a <gphi *> (stmt_info->stmt))
1241 /* If the SLP node is a PHI (induction or reduction), terminate
1243 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1245 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1246 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
1247 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1251 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1252 /* Induction from different IVs is not supported.  */
1253 if (def_type == vect_induction_def)
1255 stmt_vec_info other_info;
1256 FOR_EACH_VEC_ELT (stmts, i, other_info)
1257 if (stmt_info != other_info)
1260 else if (def_type == vect_reduction_def
1261 || def_type == vect_double_reduction_def
1262 || def_type == vect_nested_cycle)
1264 /* Else def types have to match.  */
1265 stmt_vec_info other_info;
1266 FOR_EACH_VEC_ELT (stmts, i, other_info)
1267 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
/* Build a leaf node for the PHI group; no children are recursed into.  */
1273 node = vect_create_new_slp_node (stmts, 0);
1274 SLP_TREE_VECTYPE (node) = vectype;
/* Verify the stmts of the group are isomorphic; SWAP records per-lane
   operand swaps needed and TWO_OPERATORS flags a mixed +/- group.  */
1279 bool two_operators = false;
1280 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1281 tree vectype = NULL_TREE;
1282 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1283 &this_max_nunits, matches, &two_operators,
1287 /* If the SLP node is a load, terminate the recursion unless masked.  */
1288 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1289 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1291 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1294 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
1299 *max_nunits = this_max_nunits;
1301 node = vect_create_new_slp_node (stmts, 0);
1302 SLP_TREE_VECTYPE (node) = vectype;
1303 /* And compute the load permutation. Whether it is actually
1304 a permutation depends on the unrolling factor which is
1306 vec<unsigned> load_permutation;
1308 stmt_vec_info load_info;
1309 load_permutation.create (group_size);
1310 stmt_vec_info first_stmt_info
1311 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1312 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
/* Record, per lane, the load's position within its interleaving chain.  */
1314 int load_place = vect_get_place_in_interleaving_chain
1315 (load_info, first_stmt_info);
1316 gcc_assert (load_place != -1);
1317 load_permutation.safe_push (load_place);
1319 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
/* A group of BIT_FIELD_REFs of the same SSA-name vector: represent it
   as a lane permutation of a child node holding that vector def.  */
1323 else if (gimple_assign_single_p (stmt_info->stmt)
1324 && !gimple_vuse (stmt_info->stmt)
1325 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1327 /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1328 the same SSA name vector of a compatible type to vectype.  */
1329 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1330 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1331 stmt_vec_info estmt_info;
1332 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1334 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1335 tree bfref = gimple_assign_rhs1 (estmt);
/* Each extracted field must be exactly one vector element, at an
   element-aligned offset; LANE becomes the extracted lane number.  */
1337 if (!known_eq (bit_field_size (bfref),
1338 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1339 || !constant_multiple_p (bit_field_offset (bfref),
1340 bit_field_size (bfref), &lane))
1345 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1347 slp_tree vnode = vect_create_new_slp_node (vNULL);
1348 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
1349 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1350 /* We are always building a permutation node even if it is an identity
1351 permute to shield the rest of the vectorizer from the odd node
1352 representing an actual vector without any scalar ops.
1353 ??? We could hide it completely with making the permute node
1355 node = vect_create_new_slp_node (stmts, 1);
1356 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1357 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1358 SLP_TREE_VECTYPE (node) = vectype;
1359 SLP_TREE_CHILDREN (node).quick_push (vnode);
1363 /* Get at the operands, verifying they are compatible.  */
1364 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
1365 slp_oprnd_info oprnd_info;
1366 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1368 int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
1369 stmts, i, &oprnds_info);
/* res == -1 is a fatal mismatch (recorded on lane 0); otherwise mark
   just this lane as mismatching.  */
1371 matches[(res == -1) ? 0 : i] = false;
1375 for (i = 0; i < group_size; ++i)
1378 vect_free_oprnd_info (oprnds_info);
1382 auto_vec<slp_tree, 4> children;
1384 stmt_info = stmts[0];
1386 /* Create SLP_TREE nodes for the definition node/s.  */
1387 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
1392 if (oprnd_info->first_dt == vect_uninitialized_def)
1394 /* COND_EXPR have one too many eventually if the condition
1396 gcc_assert (i == 3 && nops == 4);
/* External/constant operands become leaf nodes built from the scalar
   ops; only internal, reduction and induction defs are recursed into.  */
1400 if (oprnd_info->first_dt != vect_internal_def
1401 && oprnd_info->first_dt != vect_reduction_def
1402 && oprnd_info->first_dt != vect_induction_def)
1404 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
1405 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
1406 oprnd_info->ops = vNULL;
1407 children.safe_push (invnode);
1411 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1412 group_size, &this_max_nunits,
1414 &this_tree_size, bst_map)) != NULL)
1416 oprnd_info->def_stmts = vNULL;
1417 children.safe_push (child);
1421 /* If the SLP build failed fatally and we analyze a basic-block
1422 simply treat nodes we fail to build as externally defined
1423 (and thus build vectors from the scalar defs).
1424 The cost model will reject outright expensive cases.
1425 ??? This doesn't treat cases where permutation ultimately
1426 fails (or we don't try permutation below). Ideally we'd
1427 even compute a permutation that will end up with the maximum
1429 if (is_a <bb_vec_info> (vinfo)
1431 /* ??? Rejecting patterns this way doesn't work. We'd have to
1432 do extra work to cancel the pattern so the uses see the
1434 && !is_pattern_stmt_p (stmt_info)
1435 && !oprnd_info->any_pattern)
1437 if (dump_enabled_p ())
1438 dump_printf_loc (MSG_NOTE, vect_location,
1439 "Building vector operands from scalars\n");
1441 child = vect_create_new_slp_node (oprnd_info->ops);
1442 children.safe_push (child);
1443 oprnd_info->ops = vNULL;
1444 oprnd_info->def_stmts = vNULL;
1448 /* If the SLP build for operand zero failed and operand zero
1449 and one can be commutated try that for the scalar stmts
1450 that failed the match.  */
1452 /* A first scalar stmt mismatch signals a fatal mismatch.  */
1454 /* ??? For COND_EXPRs we can swap the comparison operands
1455 as well as the arms under some constraints.  */
1457 && oprnds_info[1]->first_dt == vect_internal_def
1458 && is_gimple_assign (stmt_info->stmt)
1459 /* Swapping operands for reductions breaks assumptions later on.  */
1460 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
1461 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
1462 /* Do so only if the number of not successful permutes was no more
1463 than a cut-off as re-trying the recursive match on
1464 possibly each level of the tree would expose exponential
1468 /* See whether we can swap the matching or the non-matching
1470 bool swap_not_matching = true;
1473 for (j = 0; j < group_size; ++j)
1475 if (matches[j] != !swap_not_matching)
1477 stmt_vec_info stmt_info = stmts[j];
1478 /* Verify if we can swap operands of this stmt.  */
1479 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
1481 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
/* Already tried swapping the other set of lanes — give up.  */
1483 if (!swap_not_matching)
1485 swap_not_matching = false;
1490 while (j != group_size);
1492 /* Swap mismatched definition stmts.  */
1493 if (dump_enabled_p ())
1494 dump_printf_loc (MSG_NOTE, vect_location,
1495 "Re-trying with swapped operands of stmts ");
1496 for (j = 0; j < group_size; ++j)
1497 if (matches[j] == !swap_not_matching)
/* Exchange operand 0 and operand 1 for the selected lanes, both in
   the def-stmt and in the scalar-operand vectors.  */
1499 std::swap (oprnds_info[0]->def_stmts[j],
1500 oprnds_info[1]->def_stmts[j]);
1501 std::swap (oprnds_info[0]->ops[j],
1502 oprnds_info[1]->ops[j]);
1503 if (dump_enabled_p ())
1504 dump_printf (MSG_NOTE, "%d ", j);
1506 if (dump_enabled_p ())
1507 dump_printf (MSG_NOTE, "\n");
1508 /* And try again with scratch 'matches' ... */
1509 bool *tem = XALLOCAVEC (bool, group_size);
1510 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1511 group_size, &this_max_nunits,
1513 &this_tree_size, bst_map)) != NULL)
1515 oprnd_info->def_stmts = vNULL;
1516 children.safe_push (child);
/* Operand build failed for good: free what was built so far.  */
1524 gcc_assert (child == NULL);
1525 FOR_EACH_VEC_ELT (children, j, child)
1526 vect_free_slp_tree (child, false);
1527 vect_free_oprnd_info (oprnds_info);
1531 vect_free_oprnd_info (oprnds_info);
1533 /* If we have all children of a child built up from uniform scalars
1534 then just throw that away, causing it to be built up from scalars.
1535 The exception is the SLP node for the vector store.  */
1536 if (is_a <bb_vec_info> (vinfo)
1537 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
1538 /* ??? Rejecting patterns this way doesn't work. We'd have to
1539 do extra work to cancel the pattern so the uses see the
1541 && !is_pattern_stmt_p (stmt_info))
1545 FOR_EACH_VEC_ELT (children, j, child)
1546 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
1547 || !vect_slp_tree_uniform_p (child))
1553 FOR_EACH_VEC_ELT (children, j, child)
1554 vect_free_slp_tree (child, false);
1556 if (dump_enabled_p ())
1557 dump_printf_loc (MSG_NOTE, vect_location,
1558 "Building parent vector operands from "
1559 "scalars instead\n");
1564 *tree_size += this_tree_size + 1;
1565 *max_nunits = this_max_nunits;
/* TWO_OPERATORS case: the group mixes two opcodes (e.g. plus/minus).
   Build both full-width nodes and blend them with a VEC_PERM_EXPR.  */
1569 /* ??? We'd likely want to either cache in bst_map sth like
1570 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
1571 the true { a+b, a+b, a+b, a+b } ... but there we don't have
1572 explicit stmts to put in so the keying on 'stmts' doesn't
1573 work (but we have the same issue with nodes that use 'ops').  */
1574 slp_tree one = new _slp_tree;
1575 slp_tree two = new _slp_tree;
1576 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
1577 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
1578 SLP_TREE_VECTYPE (one) = vectype;
1579 SLP_TREE_VECTYPE (two) = vectype;
1580 SLP_TREE_CHILDREN (one).safe_splice (children);
1581 SLP_TREE_CHILDREN (two).safe_splice (children);
1583 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
1586 /* Here we record the original defs since this
1587 node represents the final lane configuration.  */
1588 node = vect_create_new_slp_node (stmts, 2);
1589 SLP_TREE_VECTYPE (node) = vectype;
1590 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1591 SLP_TREE_CHILDREN (node).quick_push (one);
1592 SLP_TREE_CHILDREN (node).quick_push (two);
1593 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
1594 enum tree_code code0 = gimple_assign_rhs_code (stmt);
1595 enum tree_code ocode = ERROR_MARK;
1596 stmt_vec_info ostmt_info;
/* For each lane pick the node ('one' for CODE0, 'two' for the other
   opcode) the lane's result comes from.  */
1598 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
1600 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
1601 if (gimple_assign_rhs_code (ostmt) != code0)
1603 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
1604 ocode = gimple_assign_rhs_code (ostmt);
1608 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
1610 SLP_TREE_CODE (one) = code0;
1611 SLP_TREE_CODE (two) = ocode;
1612 SLP_TREE_LANES (one) = stmts.length ();
1613 SLP_TREE_LANES (two) = stmts.length ();
1614 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
1615 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
/* Normal case: single-opcode group; attach the built operand nodes.  */
1619 node = vect_create_new_slp_node (stmts, nops);
1620 SLP_TREE_VECTYPE (node) = vectype;
1621 SLP_TREE_CHILDREN (node).splice (children);
1625 /* Dump a single SLP tree NODE to the dump file, one section per
   piece of state: scalar stmts or ops, load/lane permutations and
   child pointers.  Does not recurse into children.  */
1628 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
1633 stmt_vec_info stmt_info;
1636 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
1637 dump_user_location_t user_loc = loc.get_user_location ();
/* Header line: def kind, node address, vector-size bound, refcount.  */
1638 dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
1639 SLP_TREE_DEF_TYPE (node) == vect_external_def
1641 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
1644 estimated_poly_value (node->max_nunits), node->refcnt);
/* One line per scalar stmt, if the node carries stmts.  */
1645 if (SLP_TREE_SCALAR_STMTS (node).exists ())
1646 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1647 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
/* Otherwise dump the scalar operands as a brace-enclosed list.  */
1650 dump_printf_loc (metadata, user_loc, "\t{ ");
1651 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
1652 dump_printf (metadata, "%T%s ", op,
1653 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
1654 dump_printf (metadata, "}\n");
1656 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
1658 dump_printf_loc (metadata, user_loc, "\tload permutation {");
1659 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
1660 dump_printf (dump_kind, " %u", j);
1661 dump_printf (dump_kind, " }\n");
1663 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
1665 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
/* Each lane-permutation entry is a (child index, lane) pair.  */
1666 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
1667 dump_printf (dump_kind, " %u[%u]",
1668 SLP_TREE_LANE_PERMUTATION (node)[i].first,
1669 SLP_TREE_LANE_PERMUTATION (node)[i].second);
1670 dump_printf (dump_kind, " }\n");
1672 if (SLP_TREE_CHILDREN (node).is_empty ())
1674 dump_printf_loc (metadata, user_loc, "\tchildren");
1675 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1676 dump_printf (dump_kind, " %p", (void *)child);
1677 dump_printf (dump_kind, "\n");
/* Debugger helper: print NODE as MSG_NOTE output to stderr via a
   temporary debug dump context.  */
1681 debug (slp_tree node)
1683 debug_dump_context ctx;
1684 vect_print_slp_tree (MSG_NOTE,
1685 dump_location_t::from_location_t (UNKNOWN_LOCATION),
1689 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
1692 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
1693 slp_tree node, hash_set<slp_tree> &visited)
/* VISITED guards against dumping shared sub-trees more than once
   (the SLP "tree" is really a DAG).  */
1698 if (visited.add (node))
1701 vect_print_slp_tree (dump_kind, loc, node);
1703 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1704 vect_print_slp_graph (dump_kind, loc, child, visited);
/* Entry point: dump the whole SLP graph rooted at ENTRY, creating the
   visited set locally.  */
1708 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
1711 hash_set<slp_tree> visited;
1712 vect_print_slp_graph (dump_kind, loc, entry, visited);
1715 /* Mark the tree rooted at NODE with PURE_SLP. */
1718 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
1721 stmt_vec_info stmt_info;
/* Only internal-def nodes carry stmts to mark; skip leaves.  */
1724 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
/* Skip nodes already handled via another path through the DAG.  */
1727 if (visited.add (node))
1730 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1731 STMT_SLP_TYPE (stmt_info) = pure_slp;
1733 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1734 vect_mark_slp_stmts (child, visited);
/* Entry point: mark all stmts in the tree rooted at NODE as pure SLP.  */
1738 vect_mark_slp_stmts (slp_tree node)
1740 hash_set<slp_tree> visited;
1741 vect_mark_slp_stmts (node, visited);
1744 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
1747 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
1750 stmt_vec_info stmt_info;
1753 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1756 if (visited.add (node))
1759 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
/* A stmt must be either unmarked or already marked used-in-scope;
   anything else would indicate inconsistent relevance analysis.  */
1761 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
1762 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
1763 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
1766 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1767 vect_mark_slp_stmts_relevant (child, visited);
/* Entry point: mark all stmts in the tree rooted at NODE as relevant.  */
1771 vect_mark_slp_stmts_relevant (slp_tree node)
1773 hash_set<slp_tree> visited;
1774 vect_mark_slp_stmts_relevant (node, visited);
1777 /* Copy the SLP subtree rooted at NODE. */
1780 slp_copy_subtree (slp_tree node, hash_map<slp_tree, slp_tree> &map)
/* MAP ensures each original node is copied exactly once so sharing in
   the DAG is preserved in the copy.  */
1785 slp_tree ©_ref = map.get_or_insert (node, &existed_p)
1789 copy_ref = new _slp_tree;
1790 slp_tree copy = copy_ref;
/* Copy the scalar per-node state.  */
1791 SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
1792 SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
1793 SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
1794 SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
1795 copy->max_nunits = node->max_nunits;
1797 if (SLP_TREE_SCALAR_STMTS (node).exists ())
1799 SLP_TREE_SCALAR_STMTS (copy) = SLP_TREE_SCALAR_STMTS (node).copy ();
1800 stmt_vec_info stmt_info;
/* The copy adds one more SLP use per scalar stmt.  */
1801 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1802 STMT_VINFO_NUM_SLP_USES (stmt_info)++;
/* Deep-copy the vectors the node owns.  */
1804 if (SLP_TREE_SCALAR_OPS (node).exists ())
1805 SLP_TREE_SCALAR_OPS (copy) = SLP_TREE_SCALAR_OPS (node).copy ();
1806 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
1807 SLP_TREE_LOAD_PERMUTATION (copy) = SLP_TREE_LOAD_PERMUTATION (node).copy ();
1808 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
1809 SLP_TREE_LANE_PERMUTATION (copy) = SLP_TREE_LANE_PERMUTATION (node).copy ();
1810 if (SLP_TREE_CHILDREN (node).exists ())
1811 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node).copy ();
/* Copying is only valid before any vector stmts were generated.  */
1812 gcc_assert (!SLP_TREE_VEC_STMTS (node).exists ());
/* Recurse and replace the child pointers with their copies, taking a
   reference on each.  */
1815 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (copy), i, child)
1817 SLP_TREE_CHILDREN (copy)[i] = slp_copy_subtree (child, map);
1818 SLP_TREE_CHILDREN (copy)[i]->refcnt++;
1823 /* Rearrange the statements of NODE according to PERMUTATION. */
1826 vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
1827 vec<unsigned> permutation,
1828 hash_set<slp_tree> &visited)
/* Each node of the (shared) DAG must be permuted exactly once.  */
1833 if (visited.add (node))
/* Permute children first, then this node's own lane-indexed state.  */
1836 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1837 vect_slp_rearrange_stmts (child, group_size, permutation, visited);
1839 if (SLP_TREE_SCALAR_STMTS (node).exists ())
1841 gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
1842 vec<stmt_vec_info> tmp_stmts;
1843 tmp_stmts.create (group_size);
1844 tmp_stmts.quick_grow (group_size);
1845 stmt_vec_info stmt_info;
/* Scatter: stmt i of the old order lands at permutation[i].  */
1846 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1847 tmp_stmts[permutation[i]] = stmt_info;
1848 SLP_TREE_SCALAR_STMTS (node).release ();
1849 SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
/* Same scatter for scalar operands, when present.  */
1851 if (SLP_TREE_SCALAR_OPS (node).exists ())
1853 gcc_assert (group_size == SLP_TREE_SCALAR_OPS (node).length ());
1855 tmp_ops.create (group_size);
1856 tmp_ops.quick_grow (group_size);
1858 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
1859 tmp_ops[permutation[i]] = op;
1860 SLP_TREE_SCALAR_OPS (node).release ();
1861 SLP_TREE_SCALAR_OPS (node) = tmp_ops;
/* Lane permutations index into the (now re-ordered) lanes: remap the
   second (lane) component of each entry through PERMUTATION.  */
1863 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
1865 gcc_assert (group_size == SLP_TREE_LANE_PERMUTATION (node).length ());
1866 for (i = 0; i < group_size; ++i)
1867 SLP_TREE_LANE_PERMUTATION (node)[i].second
1868 = permutation[SLP_TREE_LANE_PERMUTATION (node)[i].second];
1873 /* Attempt to reorder stmts in a reduction chain so that we don't
1874 require any load permutation. Return true if that was possible,
1875 otherwise return false. */
1878 vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
1882 slp_tree node, load;
1884 if (SLP_INSTANCE_LOADS (slp_instn).is_empty ())
1887 /* Compare all the permutation sequences to the first one. We know
1888 that at least one load is permuted. */
1889 node = SLP_INSTANCE_LOADS (slp_instn)[0];
1890 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
1892 unsigned int group_size = SLP_TREE_LOAD_PERMUTATION (node).length ();
/* All load nodes must share exactly the same permutation, otherwise
   one common re-ordering cannot remove them all.  */
1893 for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
1895 if (!SLP_TREE_LOAD_PERMUTATION (load).exists ()
1896 || SLP_TREE_LOAD_PERMUTATION (load).length () != group_size)
1898 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (load), j, lidx)
1899 if (lidx != SLP_TREE_LOAD_PERMUTATION (node)[j])
1903 /* Check that the loads in the first sequence are different and there
1904 are no gaps between them. */
1905 auto_sbitmap load_index (group_size);
1906 bitmap_clear (load_index);
1907 FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
1909 if (lidx >= group_size)
/* A duplicate index means the permutation is not a bijection.  */
1911 if (bitmap_bit_p (load_index, lidx))
1914 bitmap_set_bit (load_index, lidx);
1916 for (i = 0; i < group_size; i++)
1917 if (!bitmap_bit_p (load_index, i))
1920 /* This permutation is valid for reduction. Since the order of the
1921 statements in the nodes is not important unless they are memory
1922 accesses, we can rearrange the statements in all the nodes
1923 according to the order of the loads. */
1925 /* We have to unshare the SLP tree we modify. */
1926 hash_map<slp_tree, slp_tree> map;
1927 slp_tree unshared = slp_copy_subtree (SLP_INSTANCE_TREE (slp_instn), map);
1928 vect_free_slp_tree (SLP_INSTANCE_TREE (slp_instn), false);
1930 SLP_INSTANCE_TREE (slp_instn) = unshared;
/* Redirect the instance's load-node pointers into the fresh copy.  */
1931 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1932 SLP_INSTANCE_LOADS (slp_instn)[i] = *map.get (node);
1933 node = SLP_INSTANCE_LOADS (slp_instn)[0];
1935 /* Do the actual re-arrangement. */
1936 hash_set<slp_tree> visited;
1937 vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
1938 node->load_permutation, visited);
1940 /* We are done, no actual permutations need to be generated. */
1941 poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
1942 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1944 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
1945 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
1946 /* But we have to keep those permutations that are required because
1947 of handling of gaps. */
1948 if (known_eq (unrolling_factor, 1U)
1949 || (group_size == DR_GROUP_SIZE (first_stmt_info)
1950 && DR_GROUP_GAP (first_stmt_info) == 0))
1951 SLP_TREE_LOAD_PERMUTATION (node).release ();
/* Otherwise reset the node's permutation to the identity.  */
1953 for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
1954 SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
1960 /* Gather loads in the SLP graph NODE and populate the INST loads array. */
1963 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
1964 hash_set<slp_tree> &visited)
1966 if (visited.add (node))
/* Leaves only: a childless internal-def node whose first stmt is a
   grouped read is an SLP load node.  */
1969 if (SLP_TREE_CHILDREN (node).length () == 0)
1971 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1973 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
1974 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1975 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1976 loads.safe_push (node);
1982 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1983 vect_gather_slp_loads (loads, child, visited);
/* Entry point: collect the load nodes of the graph rooted at NODE into
   INST's loads array.  */
1988 vect_gather_slp_loads (slp_instance inst, slp_tree node)
1990 hash_set<slp_tree> visited;
1991 vect_gather_slp_loads (SLP_INSTANCE_LOADS (inst), node, visited);
1995 /* Find the last store in SLP INSTANCE. */
1998 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2000 stmt_vec_info last = NULL;
2001 stmt_vec_info stmt_vinfo;
2003 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
/* Compare original (non-pattern) stmts so ordering uses the actual
   position in the IL.  */
2005 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2006 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2012 /* Find the first stmt in NODE. */
2015 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2017 stmt_vec_info first = NULL;
2018 stmt_vec_info stmt_vinfo;
2020 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
/* As above, order by the original (non-pattern) stmt position.  */
2022 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2024 || get_later_stmt (stmt_vinfo, first) == first)
2031 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2032 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2033 (also containing the first GROUP1_SIZE stmts, since stores are
2034 consecutive), the second containing the remainder.
2035 Return the first stmt in the second group. */
2037 static stmt_vec_info
2038 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2040 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2041 gcc_assert (group1_size > 0);
/* Both resulting groups must be non-empty.  */
2042 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2043 gcc_assert (group2_size > 0);
2044 DR_GROUP_SIZE (first_vinfo) = group1_size;
/* Walk to the last element of the first group; consecutive stores
   have a unit gap between neighbours.  */
2046 stmt_vec_info stmt_info = first_vinfo;
2047 for (unsigned i = group1_size; i > 1; i--)
2049 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2050 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2052 /* STMT is now the last element of the first group. */
2053 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
/* Sever the chain between the two groups.  */
2054 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2056 DR_GROUP_SIZE (group2) = group2_size;
/* Re-point every member of the second group at its new leader.  */
2057 for (stmt_info = group2; stmt_info;
2058 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2060 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2061 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2064 /* For the second group, the DR_GROUP_GAP is that before the original group,
2065 plus skipping over the first vector. */
2066 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2068 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2069 DR_GROUP_GAP (first_vinfo) += group2_size;
2071 if (dump_enabled_p ())
2072 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2073 group1_size, group2_size);
2078 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2079 statements and a vector of NUNITS elements. */
2082 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
/* lcm (nunits, group_size) / group_size — the smallest number of group
   copies that fills whole vectors.  */
2084 return exact_div (common_multiple (nunits, group_size), group_size);
2087 /* Analyze an SLP instance starting from a group of grouped stores. Call
2088 vect_build_slp_tree to build a tree of packed stmts if possible.
2089 Return FALSE if it's impossible to SLP any stmt in the loop. */
2092 vect_analyze_slp_instance (vec_info *vinfo,
2093 scalar_stmts_to_slp_tree_map_t *bst_map,
2094 stmt_vec_info stmt_info, unsigned max_tree_size)
2096 slp_instance new_instance;
2098 unsigned int group_size;
2099 tree vectype, scalar_type = NULL_TREE;
2101 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2102 vec<stmt_vec_info> scalar_stmts;
2103 bool constructor = false;
2105 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2107 scalar_type = TREE_TYPE (DR_REF (dr));
2108 group_size = DR_GROUP_SIZE (stmt_info);
2109 vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
2111 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2113 gcc_assert (is_a <loop_vec_info> (vinfo));
2114 vectype = STMT_VINFO_VECTYPE (stmt_info);
2115 group_size = REDUC_GROUP_SIZE (stmt_info);
2117 else if (is_gimple_assign (stmt_info->stmt)
2118 && gimple_assign_rhs_code (stmt_info->stmt) == CONSTRUCTOR)
2120 vectype = TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt));
2121 group_size = CONSTRUCTOR_NELTS (gimple_assign_rhs1 (stmt_info->stmt));
2126 gcc_assert (is_a <loop_vec_info> (vinfo));
2127 vectype = STMT_VINFO_VECTYPE (stmt_info);
2128 group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
2133 if (dump_enabled_p ())
2134 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2135 "Build SLP failed: unsupported data-type %T\n",
2140 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2142 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
2143 scalar_stmts.create (group_size);
2144 stmt_vec_info next_info = stmt_info;
2145 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2147 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
2150 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2151 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
2154 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2156 /* Collect the reduction stmts and store them in
2157 SLP_TREE_SCALAR_STMTS. */
2160 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2161 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
2163 /* Mark the first element of the reduction chain as reduction to properly
2164 transform the node. In the reduction analysis phase only the last
2165 element of the chain is marked as reduction. */
2166 STMT_VINFO_DEF_TYPE (stmt_info)
2167 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
2168 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
2169 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
2171 else if (constructor)
2173 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
2175 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
2177 if (TREE_CODE (val) == SSA_NAME)
2179 gimple* def = SSA_NAME_DEF_STMT (val);
2180 stmt_vec_info def_info = vinfo->lookup_stmt (def);
2181 /* Value is defined in another basic block. */
2184 def_info = vect_stmt_to_vectorize (def_info);
2185 scalar_stmts.safe_push (def_info);
2190 if (dump_enabled_p ())
2191 dump_printf_loc (MSG_NOTE, vect_location,
2192 "Analyzing vectorizable constructor: %G\n",
2197 /* Collect reduction statements. */
2198 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2199 for (i = 0; reductions.iterate (i, &next_info); i++)
2200 scalar_stmts.safe_push (next_info);
2203 /* Build the tree for the SLP instance. */
2204 bool *matches = XALLOCAVEC (bool, group_size);
2205 unsigned npermutes = 0;
2206 poly_uint64 max_nunits = nunits;
2207 unsigned tree_size = 0;
2208 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2209 &max_nunits, matches, &npermutes,
2210 &tree_size, bst_map);
2213 /* Calculate the unrolling factor based on the smallest type. */
2214 poly_uint64 unrolling_factor
2215 = calculate_unrolling_factor (max_nunits, group_size);
2217 if (maybe_ne (unrolling_factor, 1U)
2218 && is_a <bb_vec_info> (vinfo))
2220 unsigned HOST_WIDE_INT const_max_nunits;
2221 if (!max_nunits.is_constant (&const_max_nunits)
2222 || const_max_nunits > group_size)
2224 if (dump_enabled_p ())
2225 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2226 "Build SLP failed: store group "
2227 "size not a multiple of the vector size "
2228 "in basic block SLP\n");
2229 vect_free_slp_tree (node, false);
2232 /* Fatal mismatch. */
2234 matches[group_size / const_max_nunits * const_max_nunits] = false;
2235 vect_free_slp_tree (node, false);
2239 /* Create a new SLP instance. */
2240 new_instance = XNEW (class _slp_instance);
2241 SLP_INSTANCE_TREE (new_instance) = node;
2242 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2243 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2244 SLP_INSTANCE_ROOT_STMT (new_instance) = constructor ? stmt_info : NULL;
2246 vect_gather_slp_loads (new_instance, node);
2247 if (dump_enabled_p ())
2248 dump_printf_loc (MSG_NOTE, vect_location,
2249 "SLP size %u vs. limit %u.\n",
2250 tree_size, max_tree_size);
2252 /* Check whether any load is possibly permuted. */
2254 bool loads_permuted = false;
2255 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2257 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2260 stmt_vec_info load_info;
2261 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2262 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2264 loads_permuted = true;
2269 /* If the loads and stores can be handled with load/store-lane
2270 instructions do not generate this SLP instance. */
2271 if (is_a <loop_vec_info> (vinfo)
2273 && dr && vect_store_lanes_supported (vectype, group_size, false))
2276 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2278 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2279 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2280 /* Use SLP for strided accesses (or if we can't load-lanes). */
2281 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2282 || ! vect_load_lanes_supported
2283 (STMT_VINFO_VECTYPE (stmt_vinfo),
2284 DR_GROUP_SIZE (stmt_vinfo), false))
2287 if (i == SLP_INSTANCE_LOADS (new_instance).length ())
2289 if (dump_enabled_p ())
2290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2291 "Built SLP cancelled: can use "
2292 "load/store-lanes\n");
2293 vect_free_slp_instance (new_instance, false);
2298 /* If this is a reduction chain with a conversion in front
2299 amend the SLP tree with a node for that. */
2301 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
2302 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2304 /* Get at the conversion stmt - we know it's the single use
2305 of the last stmt of the reduction chain. */
2306 gimple *tem = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
2307 use_operand_p use_p;
2309 bool r = single_imm_use (gimple_assign_lhs (tem),
2312 next_info = vinfo->lookup_stmt (use_stmt);
2313 next_info = vect_stmt_to_vectorize (next_info);
2314 scalar_stmts = vNULL;
2315 scalar_stmts.create (group_size);
2316 for (unsigned i = 0; i < group_size; ++i)
2317 scalar_stmts.quick_push (next_info);
2318 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
2319 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
2320 SLP_TREE_CHILDREN (conv).quick_push (node);
2321 SLP_INSTANCE_TREE (new_instance) = conv;
2322 /* We also have to fake this conversion stmt as SLP reduction
2323 group so we don't have to mess with too much code
2325 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
2326 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
2329 vinfo->slp_instances.safe_push (new_instance);
2331 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2332 the number of scalar stmts in the root in a few places.
2333 Verify that assumption holds. */
2334 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
2335 .length () == group_size);
2337 if (dump_enabled_p ())
2339 dump_printf_loc (MSG_NOTE, vect_location,
2340 "Final SLP tree for instance:\n");
2341 vect_print_slp_graph (MSG_NOTE, vect_location,
2342 SLP_INSTANCE_TREE (new_instance));
2350 /* Failed to SLP. */
2351 /* Free the allocated memory. */
2352 scalar_stmts.release ();
2355 /* For basic block SLP, try to break the group up into multiples of the
2357 unsigned HOST_WIDE_INT const_nunits;
2358 if (is_a <bb_vec_info> (vinfo)
2359 && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2360 && DR_GROUP_FIRST_ELEMENT (stmt_info)
2361 && nunits.is_constant (&const_nunits))
2363 /* We consider breaking the group only on VF boundaries from the existing
2365 for (i = 0; i < group_size; i++)
2366 if (!matches[i]) break;
2368 if (i >= const_nunits && i < group_size)
2370 /* Split into two groups at the first vector boundary before i. */
2371 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2372 unsigned group1_size = i & ~(const_nunits - 1);
2374 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2376 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2378 /* If the first non-match was in the middle of a vector,
2379 skip the rest of that vector. */
2380 if (group1_size < i)
2382 i = group1_size + const_nunits;
2384 rest = vect_split_slp_store_group (rest, const_nunits);
2387 res |= vect_analyze_slp_instance (vinfo, bst_map,
2388 rest, max_tree_size);
2391 /* Even though the first vector did not all match, we might be able to SLP
2392 (some) of the remainder. FORNOW ignore this possibility. */
2399 /* Check if there are stmts in the loop can be vectorized using SLP. Build SLP
2400 trees of packed scalar stmts if SLP is possible. */
2403 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2406 stmt_vec_info first_element;
2408 DUMP_VECT_SCOPE ("vect_analyze_slp");
/* The BST map caches SLP nodes keyed by their set of scalar stmts so
   identical subtrees built from different roots are shared.  */
2410 scalar_stmts_to_slp_tree_map_t *bst_map
2411 = new scalar_stmts_to_slp_tree_map_t ();
2413 /* Find SLP sequences starting from groups of grouped stores. */
2414 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2415 vect_analyze_slp_instance (vinfo, bst_map, first_element, max_tree_size);
2417 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2419 if (loop_vinfo->reduction_chains.length () > 0)
2421 /* Find SLP sequences starting from reduction chains. */
2422 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2423 if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
2426 /* Dissolve reduction chain group. */
/* NOTE: the local 'vinfo' below intentionally(?) shadows the function
   parameter while walking the chain — confirm before renaming.  */
2427 stmt_vec_info vinfo = first_element;
2428 stmt_vec_info last = NULL;
2431 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2432 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2433 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2437 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2438 /* It can be still vectorized as part of an SLP reduction. */
2439 loop_vinfo->reductions.safe_push (last);
2443 /* Find SLP sequences starting from groups of reductions. */
2444 if (loop_vinfo->reductions.length () > 1)
2445 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
2449 /* The map keeps a reference on SLP nodes built, release that. */
2450 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
2451 it != bst_map->end (); ++it)
2453 vect_free_slp_tree ((*it).second, false);
2456 /* Optimize permutations in SLP reductions. */
2457 slp_instance instance;
2458 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
2460 slp_tree node = SLP_INSTANCE_TREE (instance);
2461 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2462 /* Reduction (there are no data-refs in the root).
2463 In reduction chain the order of the loads is not important. */
2464 if (!STMT_VINFO_DATA_REF (stmt_info)
2465 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2466 vect_attempt_slp_rearrange_stmts (instance);
2469 /* Gather all loads in the SLP graph. */
2470 hash_set<slp_tree> visited;
2471 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
2472 vect_gather_slp_loads (vinfo->slp_loads, SLP_INSTANCE_TREE (instance),
2475 return opt_result::success ();
/* Walk all gathered SLP load nodes of VINFO and drop load permutations
   that are not actually needed (the loads can then be code-generated as
   plain contiguous vector loads).  */
2479 vect_optimize_slp (vec_info *vinfo)
2483 FOR_EACH_VEC_ELT (vinfo->slp_loads, i, node)
2485 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
2488 /* In basic block vectorization we allow any subchain of an interleaving
2490 FORNOW: not in loop SLP because of realignment complications. */
2491 if (is_a <bb_vec_info> (vinfo))
2493 bool subchain_p = true;
2494 stmt_vec_info next_load_info = NULL;
2495 stmt_vec_info load_info;
/* A "subchain" is a run of consecutive group members with gap 1;
   such loads need no permutation in BB SLP.  */
2497 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
2500 && (next_load_info != load_info
2501 || DR_GROUP_GAP (load_info) != 1))
2506 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
2510 SLP_TREE_LOAD_PERMUTATION (node).release ();
2516 stmt_vec_info load_info;
2517 bool this_load_permuted = false;
/* Loop SLP: check whether the recorded permutation is the identity.  */
2519 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
2520 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
2522 this_load_permuted = true;
2525 stmt_vec_info first_stmt_info
2526 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0])
2527 if (!this_load_permuted
2528 /* The load requires permutation when unrolling exposes
2529 a gap either because the group is larger than the SLP
2530 group-size or because there is a gap between the groups. */
2531 && (known_eq (LOOP_VINFO_VECT_FACTOR (as_a <loop_vec_info> (vinfo)), 1U)
2532 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
2533 && DR_GROUP_GAP (first_stmt_info) == 0)))
2535 SLP_TREE_LOAD_PERMUTATION (node).release ();
2543 /* For each possible SLP instance decide whether to SLP it and calculate overall
2544 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
2545 least one instance. */
2548 vect_make_slp_decision (loop_vec_info loop_vinfo)
2551 poly_uint64 unrolling_factor = 1;
2552 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2553 slp_instance instance;
2554 int decided_to_slp = 0;
2556 DUMP_VECT_SCOPE ("vect_make_slp_decision");
2558 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2560 /* FORNOW: SLP if you can. */
2561 /* All unroll factors have the form:
2563 GET_MODE_SIZE (vinfo->vector_mode) * X
2565 for some rational X, so they must have a common multiple. */
/* Accumulate the least common multiple of the per-instance unroll
   factors; this becomes the loop-wide SLP unrolling factor.  */
2567 = force_common_multiple (unrolling_factor,
2568 SLP_INSTANCE_UNROLLING_FACTOR (instance));
2570 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
2571 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2572 loop-based vectorization. Such stmts will be marked as HYBRID. */
2573 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
2577 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2579 if (decided_to_slp && dump_enabled_p ())
2581 dump_printf_loc (MSG_NOTE, vect_location,
2582 "Decided to SLP %d instances. Unrolling factor ",
2584 dump_dec (MSG_NOTE, unrolling_factor);
2585 dump_printf (MSG_NOTE, "\n");
2588 return (decided_to_slp > 0);
2591 /* Private data for vect_detect_hybrid_slp. */
/* Loop being analyzed; used to map SSA names back to stmt_vec_infos.  */
2594 loop_vec_info loop_vinfo;
/* Worklist of non-SLP stmts whose operands still need visiting.  */
2595 vec<stmt_vec_info> *worklist;
2598 /* Walker for walk_gimple_op. */
/* Callback invoked on each operand *TP of a worklist stmt: if the
   defining stmt is pure-SLP it must also be loop-vectorized, so demote
   it to hybrid and queue it for further propagation.  */
2601 vect_detect_hybrid_slp (tree *tp, int *, void *data)
2603 walk_stmt_info *wi = (walk_stmt_info *)data;
2604 vdhs_data *dat = (vdhs_data *)wi->info;
2609 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
/* Look through pattern stmts to the stmt that will be vectorized.  */
2612 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
2613 if (PURE_SLP_STMT (def_stmt_info))
2615 if (dump_enabled_p ())
2616 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
2617 def_stmt_info->stmt);
2618 STMT_SLP_TYPE (def_stmt_info) = hybrid;
2619 dat->worklist->safe_push (def_stmt_info);
2625 /* Find stmts that must be both vectorized and SLPed. */
2628 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2630 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
2632 /* All stmts participating in SLP are marked pure_slp, all other
2633 stmts are loop_vect.
2634 First collect all loop_vect stmts into a worklist. */
2635 auto_vec<stmt_vec_info> worklist;
2636 for (unsigned i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2638 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
/* Relevant, non-SLP PHIs seed the worklist.  */
2639 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
2642 gphi *phi = gsi.phi ();
2643 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
2644 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
2645 worklist.safe_push (stmt_info);
2647 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2650 gimple *stmt = gsi_stmt (gsi);
2651 if (is_gimple_debug (stmt))
2653 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
2654 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
/* For pattern stmts, also scan the pattern def sequence and then
   consider the main pattern stmt itself.  */
2656 for (gimple_stmt_iterator gsi2
2657 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2658 !gsi_end_p (gsi2); gsi_next (&gsi2))
2660 stmt_vec_info patt_info
2661 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
2662 if (!STMT_SLP_TYPE (patt_info)
2663 && STMT_VINFO_RELEVANT (patt_info))
2664 worklist.safe_push (patt_info);
2666 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2668 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
2669 worklist.safe_push (stmt_info);
2673 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
2674 mark any SLP vectorized stmt as hybrid.
2675 ??? We're visiting def stmts N times (once for each non-SLP and
2676 once for each hybrid-SLP use). */
2679 dat.worklist = &worklist;
2680 dat.loop_vinfo = loop_vinfo;
2681 memset (&wi, 0, sizeof (wi));
2682 wi.info = (void *)&dat;
2683 while (!worklist.is_empty ())
2685 stmt_vec_info stmt_info = worklist.pop ();
2686 /* Since SSA operands are not set up for pattern stmts we need
2687 to use walk_gimple_op. */
2689 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
2694 /* Initialize a bb_vec_info struct for the statements between
2695 REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive). */
2697 _bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in,
2698 gimple_stmt_iterator region_end_in,
2699 vec_info_shared *shared)
2700 : vec_info (vec_info::bb, init_cost (NULL), shared),
2701 bb (gsi_bb (region_begin_in)),
2702 region_begin (region_begin_in),
2703 region_end (region_end_in)
/* Mark every non-debug stmt in the region with uid 0; the destructor
   resets the uid to -1 when the region is torn down.  */
2705 for (gimple *stmt : this->region_stmts ())
2707 gimple_set_uid (stmt, 0);
2708 if (is_gimple_debug (stmt))
2717 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2718 stmts in the basic block. */
2720 _bb_vec_info::~_bb_vec_info ()
2722 for (gimple *stmt : this->region_stmts ())
2723 /* Reset region marker. */
/* Uid -1 means "not part of any vectorization region".  */
2724 gimple_set_uid (stmt, -1);
2729 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
2730 given then that child nodes have already been processed, and that
2731 their def types currently match their SLP node's def type. */
2734 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
2735 slp_instance node_instance,
2736 stmt_vector_for_cost *cost_vec)
2738 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
2739 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2741 /* Calculate the number of vector statements to be created for the
2742 scalar stmts in this node. For SLP reductions it is equal to the
2743 number of vector statements in the children (which has already been
2744 calculated by the recursive call). Otherwise it is the number of
2745 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
2746 VF divided by the number of elements in a vector. */
2747 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2748 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
/* SLP reduction: copy the vector-stmt count from the first internal
   child (all internal children agree by construction).  */
2750 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
2751 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
2753 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2754 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
/* Otherwise compute the count from VF (1 for BB vectorization),
   the number of lanes and the vector type.  */
2761 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2762 vf = loop_vinfo->vectorization_factor;
2765 unsigned int group_size = SLP_TREE_LANES (node);
2766 tree vectype = SLP_TREE_VECTYPE (node);
2767 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2768 = vect_get_num_vectors (vf * group_size, vectype);
2771 /* Handle purely internal nodes. */
2772 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2773 return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);
2776 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
2777 node, node_instance, cost_vec);
2780 /* Try to build NODE from scalars, returning true on success.
2781 NODE_INSTANCE is the SLP instance that contains NODE. */
2784 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
2785 slp_instance node_instance)
2787 stmt_vec_info stmt_info;
/* Only BB SLP non-root nodes with plain (non-pattern) scalar stmts
   can be demoted to external defs.  */
2790 if (!is_a <bb_vec_info> (vinfo)
2791 || node == SLP_INSTANCE_TREE (node_instance)
2792 || !SLP_TREE_SCALAR_STMTS (node).exists ()
2793 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
2796 if (dump_enabled_p ())
2797 dump_printf_loc (MSG_NOTE, vect_location,
2798 "Building vector operands from scalars instead\n");
2800 /* Don't remove and free the child nodes here, since they could be
2801 referenced by other structures. The analysis and scheduling phases
2802 (need to) ignore child nodes of anything that isn't vect_internal_def. */
2803 unsigned int group_size = SLP_TREE_LANES (node);
2804 SLP_TREE_DEF_TYPE (node) = vect_external_def;
2805 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size);
/* Record each scalar stmt's lhs as the external scalar operand.  */
2806 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2808 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
2809 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
2814 /* Compute the prologue cost for invariant or constant operands represented
2818 vect_prologue_cost_for_slp (slp_tree node,
2819 stmt_vector_for_cost *cost_vec)
2821 /* There's a special case of an existing vector, that costs nothing. */
2822 if (SLP_TREE_SCALAR_OPS (node).length () == 0
2823 && !SLP_TREE_VEC_DEFS (node).is_empty ())
2825 /* Without looking at the actual initializer a vector of
2826 constants can be implemented as load from the constant pool.
2827 When all elements are the same we can use a splat. */
2828 tree vectype = SLP_TREE_VECTYPE (node);
2829 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
2830 unsigned num_vects_to_check;
2831 unsigned HOST_WIDE_INT const_nunits;
2832 unsigned nelt_limit;
/* Constant-length vectors that don't divide evenly by the group size
   must be costed vector-by-vector; otherwise one vector is
   representative of them all.  */
2833 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
2834 && ! multiple_p (const_nunits, group_size))
2836 num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
2837 nelt_limit = const_nunits;
2841 /* If either the vector has variable length or the vectors
2842 are composed of repeated whole groups we only need to
2843 cost construction once. All vectors will be the same. */
2844 num_vects_to_check = 1;
2845 nelt_limit = group_size;
2847 tree elt = NULL_TREE;
2849 for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
2851 unsigned si = j % group_size;
2853 elt = SLP_TREE_SCALAR_OPS (node)[si];
2854 /* ??? We're just tracking whether all operands of a single
2855 vector initializer are the same, ideally we'd check if
2856 we emitted the same one already. */
2857 else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
2860 if (nelt == nelt_limit)
/* ELT non-null here means all lanes were equal: a splat
   (scalar_to_vec) suffices; otherwise a full construct.  */
2862 record_stmt_cost (cost_vec, 1,
2863 SLP_TREE_DEF_TYPE (node) == vect_external_def
2864 ? (elt ? scalar_to_vec : vec_construct)
2866 NULL, vectype, 0, vect_prologue);
2872 /* Analyze statements contained in SLP tree NODE after recursively analyzing
2873 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
2875 Return true if the operations are supported. */
2878 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
2879 slp_instance node_instance,
2880 hash_set<slp_tree> &visited,
2881 hash_set<slp_tree> &lvisited,
2882 stmt_vector_for_cost *cost_vec)
2887 /* Assume we can code-generate all invariants. */
2888 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2891 /* If we already analyzed the exact same set of scalar stmts we're done.
2892 We share the generated vector stmts for those.
2893 The SLP graph is acyclic so not caching whether we failed or succeeded
2894 doesn't result in any issue since we throw away the lvisited set
2896 if (visited.contains (node)
2897 || lvisited.contains (node))
/* Depth-first: analyze all children before the node itself.  */
2901 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2903 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
2904 visited, lvisited, cost_vec);
2911 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
2914 lvisited.add (node);
2917 /* When the node can be vectorized cost invariant nodes it references.
2918 This is not done in DFS order to allow the refering node
2919 vectorizable_* calls to nail down the invariant nodes vector type
2920 and possibly unshare it if it needs a different vector type than
2923 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2924 if ((SLP_TREE_DEF_TYPE (child) == vect_constant_def
2925 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
2926 /* Perform usual caching, note code-generation still
2927 code-gens these nodes multiple times but we expect
2928 to CSE them later. */
2929 && !visited.contains (child)
2930 && !lvisited.add (child))
2932 /* ??? After auditing more code paths make a "default"
2933 and push the vector type from NODE to all children
2934 if it is not already set. */
2935 /* Compute the number of vectors to be generated. */
2936 tree vector_type = SLP_TREE_VECTYPE (child);
2939 /* For shifts with a scalar argument we don't need
2940 to cost or code-generate anything.
2941 ??? Represent this more explicitely. */
2942 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
2943 == shift_vec_info_type)
2947 unsigned group_size = SLP_TREE_LANES (child);
/* VF defaults to 1 for BB vectorization.  */
2949 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2950 vf = loop_vinfo->vectorization_factor;
2951 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
2952 = vect_get_num_vectors (vf * group_size, vector_type);
2953 /* And cost them. */
2954 vect_prologue_cost_for_slp (child, cost_vec);
2957 /* If this node or any of its children can't be vectorized, try pruning
2958 the tree here rather than felling the whole thing. */
2959 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
2961 /* We'll need to revisit this for invariant costing and number
2962 of vectorized stmt setting. */
2970 /* Analyze statements in SLP instances of VINFO. Return true if the
2971 operations are supported. */
2974 vect_slp_analyze_operations (vec_info *vinfo)
2976 slp_instance instance;
2979 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
2981 hash_set<slp_tree> visited;
/* No increment here: failing instances are removed in place via
   ordered_remove, which shifts the next one into slot I.  */
2982 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
2984 hash_set<slp_tree> lvisited;
2985 stmt_vector_for_cost cost_vec;
2986 cost_vec.create (2);
2987 if (!vect_slp_analyze_node_operations (vinfo,
2988 SLP_INSTANCE_TREE (instance),
2989 instance, visited, lvisited,
2991 /* Instances with a root stmt require vectorized defs for the
2993 || (SLP_INSTANCE_ROOT_STMT (instance)
2994 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
2995 != vect_internal_def)))
2997 slp_tree node = SLP_INSTANCE_TREE (instance);
2998 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2999 if (dump_enabled_p ())
3000 dump_printf_loc (MSG_NOTE, vect_location,
3001 "removing SLP instance operations starting from: %G",
3003 vect_free_slp_instance (instance, false);
3004 vinfo->slp_instances.ordered_remove (i);
3005 cost_vec.release ();
/* Success: promote this instance's locally visited nodes to the
   global visited set and commit its costs.  */
3009 for (hash_set<slp_tree>::iterator x = lvisited.begin();
3010 x != lvisited.end(); ++x)
3014 add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
3015 cost_vec.release ();
3019 return !vinfo->slp_instances.is_empty ();
3023 /* Compute the scalar cost of the SLP node NODE and its children
3024 and return it. Do not account defs that are marked in LIFE and
3025 update LIFE according to uses of NODE. */
3028 vect_bb_slp_scalar_cost (vec_info *vinfo,
3029 slp_tree node, vec<bool, va_heap> *life,
3030 stmt_vector_for_cost *cost_vec,
3031 hash_set<slp_tree> &visited)
3034 stmt_vec_info stmt_info;
3037 if (visited.add (node))
3040 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3042 ssa_op_iter op_iter;
3043 def_operand_p def_p;
3048 /* If there is a non-vectorized use of the defs then the scalar
3049 stmt is kept live in which case we do not account it or any
3050 required defs in the SLP children in the scalar cost. This
3051 way we make the vectorization more costly when compared to
3053 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3054 gimple *orig_stmt = orig_stmt_info->stmt;
3055 FOR_EACH_SSA_DEF_OPERAND (def_p, orig_stmt, op_iter, SSA_OP_DEF)
3057 imm_use_iterator use_iter;
3059 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3060 if (!is_gimple_debug (use_stmt))
3062 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
3064 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
3067 BREAK_FROM_IMM_USE_STMT (use_iter);
3074 /* Count scalar stmts only once. */
/* The gimple 'visited' flag is used as the once-only marker; the
   caller resets it after costing.  */
3075 if (gimple_visited_p (orig_stmt))
3077 gimple_set_visited (orig_stmt, true);
3079 vect_cost_for_stmt kind;
3080 if (STMT_VINFO_DATA_REF (orig_stmt_info))
3082 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
3085 kind = scalar_store;
3087 else if (vect_nop_conversion_p (orig_stmt_info))
3091 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info, 0, vect_body);
3094 auto_vec<bool, 20> subtree_life;
3095 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3097 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3099 /* Do not directly pass LIFE to the recursive call, copy it to
3100 confine changes in the callee to the current child/subtree. */
3101 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
/* Map life bits through the lane permutation so each child lane
   gets the liveness of the lane it feeds.  */
3103 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child));
3104 for (unsigned j = 0;
3105 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
3107 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
3108 if (perm.first == i)
3109 subtree_life[perm.second] = (*life)[j];
3114 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
3115 subtree_life.safe_splice (*life);
3117 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
3119 subtree_life.truncate (0);
3124 /* Check if vectorization of the basic block is profitable. */
3127 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
3129 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
3130 slp_instance instance;
3132 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
3133 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
3135 /* Calculate scalar cost. */
3136 stmt_vector_for_cost scalar_costs;
3137 scalar_costs.create (0);
3138 hash_set<slp_tree> visited;
3139 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3141 auto_vec<bool, 20> life;
3142 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)));
3143 vect_bb_slp_scalar_cost (bb_vinfo,
3144 SLP_INSTANCE_TREE (instance),
3145 &life, &scalar_costs, visited);
3147 /* Unset visited flag. */
/* vect_bb_slp_scalar_cost marked each costed stmt via
   gimple_set_visited; clear the marks again here.  */
3148 stmt_info_for_cost *si;
3149 FOR_EACH_VEC_ELT (scalar_costs, i, si)
3150 gimple_set_visited (si->stmt_info->stmt, false);
/* Feed the recorded scalar costs through the target cost model to get
   a comparable total.  */
3152 void *target_cost_data = init_cost (NULL);
3153 add_stmt_costs (bb_vinfo, target_cost_data, &scalar_costs);
3154 scalar_costs.release ();
3156 finish_cost (target_cost_data, &dummy, &scalar_cost, &dummy);
3157 destroy_cost_data (target_cost_data);
3159 /* Complete the target-specific cost calculation. */
3160 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
3161 &vec_inside_cost, &vec_epilogue_cost);
3163 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
3165 if (dump_enabled_p ())
3167 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3168 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
3170 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
3171 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
3172 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
3175 /* Vectorization is profitable if its cost is more than the cost of scalar
3176 version. Note that we err on the vector side for equal cost because
3177 the cost estimate is otherwise quite pessimistic (constant uses are
3178 free on the scalar side but cost a load on the vector side for
3180 if (vec_outside_cost + vec_inside_cost > scalar_cost)
3186 /* Find any vectorizable constructors and add them to the grouped_store
3190 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
3192 for (gimple *stmt : bb_vinfo->region_stmts ())
3194 gassign *assign = dyn_cast<gassign *> (stmt);
3195 if (!assign || gimple_assign_rhs_code (assign) != CONSTRUCTOR)
3198 tree rhs = gimple_assign_rhs1 (assign);
/* Only complete scalar-element CONSTRUCTORs are candidates: the
   element count must match the vector type, elements must not
   themselves be vectors, and uniform vectors (splats) are handled
   elsewhere.  */
3199 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
3200 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
3201 CONSTRUCTOR_NELTS (rhs))
3202 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
3203 || uniform_vector_p (rhs))
3206 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
3207 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
3211 /* Check if the region described by BB_VINFO can be vectorized, returning
3212 true if so. When returning false, set FATAL to true if the same failure
3213 would prevent vectorization at other vector sizes, false if it is still
3214 worth trying other sizes. N_STMTS is the number of statements in the
3218 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal)
3220 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
3222 slp_instance instance;
3224 poly_uint64 min_vf = 2;
3226 /* The first group of checks is independent of the vector size. */
3229 /* Analyze the data references. */
3231 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
3233 if (dump_enabled_p ())
3234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3235 "not vectorized: unhandled data-ref in basic "
3240 if (!vect_analyze_data_ref_accesses (bb_vinfo))
3242 if (dump_enabled_p ())
3243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3244 "not vectorized: unhandled data access in "
/* Vector constructors are treated like grouped stores for SLP
   discovery purposes.  */
3249 vect_slp_check_for_constructors (bb_vinfo);
3251 /* If there are no grouped stores and no constructors in the region
3252 there is no need to continue with pattern recog as vect_analyze_slp
3253 will fail anyway. */
3254 if (bb_vinfo->grouped_stores.is_empty ())
3256 if (dump_enabled_p ())
3257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3258 "not vectorized: no grouped stores in "
3263 /* While the rest of the analysis below depends on it in some way. */
3266 vect_pattern_recog (bb_vinfo);
3268 /* Check the SLP opportunities in the basic block, analyze and build SLP
3270 if (!vect_analyze_slp (bb_vinfo, n_stmts))
3272 if (dump_enabled_p ())
3274 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3275 "Failed to SLP the basic block.\n");
3276 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3277 "not vectorized: failed to find SLP opportunities "
3278 "in basic block.\n");
3283 /* Optimize permutations. */
3284 vect_optimize_slp (bb_vinfo);
3286 vect_record_base_alignments (bb_vinfo);
3288 /* Analyze and verify the alignment of data references and the
3289 dependence in the SLP instances. */
/* No increment: failing instances are removed in place and the next
   instance slides into slot I.  */
3290 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
3292 if (! vect_slp_analyze_and_verify_instance_alignment (bb_vinfo, instance)
3293 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
3295 slp_tree node = SLP_INSTANCE_TREE (instance);
3296 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3297 if (dump_enabled_p ())
3298 dump_printf_loc (MSG_NOTE, vect_location,
3299 "removing SLP instance operations starting from: %G",
3301 vect_free_slp_instance (instance, false);
3302 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
3306 /* Mark all the statements that we want to vectorize as pure SLP and
3308 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
3309 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
3310 if (SLP_INSTANCE_ROOT_STMT (instance))
3311 STMT_SLP_TYPE (SLP_INSTANCE_ROOT_STMT (instance)) = pure_slp;
3315 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
3318 if (!vect_slp_analyze_operations (bb_vinfo))
3320 if (dump_enabled_p ())
3321 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3322 "not vectorized: bad operation in basic block.\n");
3326 /* Cost model: check if the vectorization is worthwhile. */
3327 if (!unlimited_cost_model (NULL)
3328 && !vect_bb_vectorization_profitable_p (bb_vinfo))
3330 if (dump_enabled_p ())
3331 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3332 "not vectorized: vectorization is not "
3337 if (dump_enabled_p ())
3338 dump_printf_loc (MSG_NOTE, vect_location,
3339 "Basic block will be vectorized using SLP\n");
3343 /* Subroutine of vect_slp_bb. Try to vectorize the statements between
3344 REGION_BEGIN (inclusive) and REGION_END (exclusive), returning true
3345 on success. The region has N_STMTS statements and has the datarefs
3346 given by DATAREFS. */
3349 vect_slp_bb_region (gimple_stmt_iterator region_begin,
3350 gimple_stmt_iterator region_end,
3351 vec<data_reference_p> datarefs,
3352 unsigned int n_stmts)
3354 bb_vec_info bb_vinfo;
3355 auto_vector_modes vector_modes;
3357 /* Autodetect first vector size we try. */
/* VOIDmode lets the target pick its preferred mode on the first
   iteration; subsequent iterations walk VECTOR_MODES explicitly.  */
3358 machine_mode next_vector_mode = VOIDmode;
3359 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
3360 unsigned int mode_i = 0;
3362 vec_info_shared shared;
3364 machine_mode autodetected_vector_mode = VOIDmode;
3367 bool vectorized = false;
3369 bb_vinfo = new _bb_vec_info (region_begin, region_end, &shared);
/* Datarefs are analyzed once and verified unchanged on re-tries with
   other vector modes.  */
3371 bool first_time_p = shared.datarefs.is_empty ();
3372 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
3374 bb_vinfo->shared->save_datarefs ();
3376 bb_vinfo->shared->check_datarefs ();
3377 bb_vinfo->vector_mode = next_vector_mode;
3379 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal)
3380 && dbg_cnt (vect_slp))
3382 if (dump_enabled_p ())
3384 dump_printf_loc (MSG_NOTE, vect_location,
3385 "***** Analysis succeeded with vector mode"
3386 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
3387 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
3390 bb_vinfo->shared->check_datarefs ();
3391 vect_schedule_slp (bb_vinfo);
3393 unsigned HOST_WIDE_INT bytes;
3394 if (dump_enabled_p ())
3396 if (GET_MODE_SIZE (bb_vinfo->vector_mode).is_constant (&bytes))
3397 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3398 "basic block part vectorized using %wu byte "
3399 "vectors\n", bytes);
3401 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3402 "basic block part vectorized using variable "
3403 "length vectors\n");
3410 if (dump_enabled_p ())
3411 dump_printf_loc (MSG_NOTE, vect_location,
3412 "***** Analysis failed with vector mode %s\n",
3413 GET_MODE_NAME (bb_vinfo->vector_mode));
3417 autodetected_vector_mode = bb_vinfo->vector_mode;
/* Skip modes that would lead to exactly the same set of statement
   vector types as an already-tried mode.  */
3420 while (mode_i < vector_modes.length ()
3421 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
3423 if (dump_enabled_p ())
3424 dump_printf_loc (MSG_NOTE, vect_location,
3425 "***** The result for vector mode %s would"
3427 GET_MODE_NAME (vector_modes[mode_i]));
/* Also skip a mode that is mutually "related" to the autodetected
   one: analyzing it would just repeat the autodetected analysis.  */
3433 if (mode_i < vector_modes.length ()
3434 && VECTOR_MODE_P (autodetected_vector_mode)
3435 && (related_vector_mode (vector_modes[mode_i],
3436 GET_MODE_INNER (autodetected_vector_mode))
3437 == autodetected_vector_mode)
3438 && (related_vector_mode (autodetected_vector_mode,
3439 GET_MODE_INNER (vector_modes[mode_i]))
3440 == vector_modes[mode_i]))
3442 if (dump_enabled_p ())
3443 dump_printf_loc (MSG_NOTE, vect_location,
3444 "***** Skipping vector mode %s, which would"
3445 " repeat the analysis for %s\n",
3446 GET_MODE_NAME (vector_modes[mode_i]),
3447 GET_MODE_NAME (autodetected_vector_mode))
3452 || mode_i == vector_modes.length ()
3453 || autodetected_vector_mode == VOIDmode
3454 /* If vect_slp_analyze_bb_1 signaled that analysis for all
3455 vector sizes will fail do not bother iterating. */
3459 /* Try the next biggest vector size. */
3460 next_vector_mode = vector_modes[mode_i++];
3461 if (dump_enabled_p ())
3462 dump_printf_loc (MSG_NOTE, vect_location,
3463 "***** Re-trying analysis with vector mode %s\n",
3464 GET_MODE_NAME (next_vector_mode));
3468 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
3469 true if anything in the basic-block was vectorized. */
3472 vect_slp_bb (basic_block bb)
3474 gimple_stmt_iterator gsi;
3475 bool any_vectorized = false;
3477 gsi = gsi_after_labels (bb);
3478 while (!gsi_end_p (gsi))
3480 gimple_stmt_iterator region_begin = gsi;
3481 vec<data_reference_p> datarefs = vNULL;
3484 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3486 gimple *stmt = gsi_stmt (gsi);
3487 if (is_gimple_debug (stmt))
3489 /* Skip leading debug stmts. */
3490 if (gsi_stmt (region_begin) == stmt)
3491 gsi_next (®ion_begin);
3496 if (gimple_location (stmt) != UNKNOWN_LOCATION)
3497 vect_location = stmt;
3499 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs))
3502 if (gsi_end_p (region_begin))
3505 /* Skip leading unhandled stmts. */
3506 if (gsi_stmt (region_begin) == gsi_stmt (gsi))
3512 gimple_stmt_iterator region_end = gsi;
3514 if (insns > param_slp_max_insns_in_bb)
3516 if (dump_enabled_p ())
3517 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3518 "not vectorized: too many instructions in "
3521 else if (vect_slp_bb_region (region_begin, region_end, datarefs, insns))
3522 any_vectorized = true;
3524 if (gsi_end_p (region_end))
3527 /* Skip the unhandled stmt. */
3531 return any_vectorized;
3535 /* Build a variable-length vector in which the elements in ELTS are repeated
3536 to a fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
3537 RESULTS and add any new instructions to SEQ.
3539 The approach we use is:
3541 (1) Find a vector mode VM with integer elements of mode IM.
3543 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3544 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
3545 from small vectors to IM.
3547 (3) Duplicate each ELTS'[I] into a vector of mode VM.
3549 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
3550 correct byte contents.
3552 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
3554 We try to find the largest IM for which this sequence works, in order
3555 to cut down on the number of interleaves. */
3558 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
3559 vec<tree> elts, unsigned int nresults,
3562 unsigned int nelts = elts.length ();
3563 tree element_type = TREE_TYPE (vector_type);
3565 /* (1) Find a vector mode VM with integer elements of mode IM. */
3566 unsigned int nvectors = 1;
3567 tree new_vector_type;
/* can_duplicate_and_interleave_p fills in NVECTORS and NEW_VECTOR_TYPE;
   callers are expected to have checked it can succeed (the failure path
   here falls on an elided line — presumably gcc_unreachable; verify).  */
3569 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
3570 &nvectors, &new_vector_type,
3574 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
3575 unsigned int partial_nelts = nelts / nvectors;
3576 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
3578 tree_vector_builder partial_elts;
/* PIECES is double-buffered: slots [0,nvectors) and [nvectors,2*nvectors)
   alternate as input/output of the interleave rounds below.  */
3579 auto_vec<tree, 32> pieces (nvectors * 2);
3580 pieces.quick_grow (nvectors * 2);
3581 for (unsigned int i = 0; i < nvectors; ++i)
3583 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3584 ELTS' has mode IM. */
3585 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
3586 for (unsigned int j = 0; j < partial_nelts; ++j)
3587 partial_elts.quick_push (elts[i * partial_nelts + j]);
3588 tree t = gimple_build_vector (seq, &partial_elts);
3589 t = gimple_build (seq, VIEW_CONVERT_EXPR,
3590 TREE_TYPE (new_vector_type), t);
3592 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
3593 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
3596 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
3597 correct byte contents.
3599 We need to repeat the following operation log2(nvectors) times:
3601 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
3602 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
3604 However, if each input repeats every N elements and the VF is
3605 a multiple of N * 2, the HI result is the same as the LO. */
3606 unsigned int in_start = 0;
3607 unsigned int out_start = nvectors;
3608 unsigned int hi_start = nvectors / 2;
3609 /* A bound on the number of outputs needed to produce NRESULTS results
3610 in the final iteration. */
3611 unsigned int noutputs_bound = nvectors * nresults;
3612 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
3614 noutputs_bound /= 2;
3615 unsigned int limit = MIN (noutputs_bound, nvectors);
3616 for (unsigned int i = 0; i < limit; ++i)
/* When the HI half would equal the LO half (see comment above), reuse
   the previous output instead of emitting another permute.  */
3619 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
3622 pieces[out_start + i] = pieces[out_start + i - 1];
3626 tree output = make_ssa_name (new_vector_type);
3627 tree input1 = pieces[in_start + (i / 2)];
3628 tree input2 = pieces[in_start + (i / 2) + hi_start];
3629 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
3632 gimple_seq_add_stmt (seq, stmt);
3633 pieces[out_start + i] = output;
/* Flip the double buffer for the next interleave round.  */
3635 std::swap (in_start, out_start);
3638 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
3639 results.reserve (nresults);
3640 for (unsigned int i = 0; i < nresults; ++i)
3642 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
3643 pieces[in_start + i]));
/* Beyond the first NVECTORS results the contents repeat, so reuse the
   earlier entries (the guarding condition sits on an elided line).  */
3645 results.quick_push (results[i - nvectors]);
3649 /* For constant and loop invariant defs in OP_NODE this function creates
3650 vector defs that will be used in the vectorized stmts and stores them
3651 to SLP_TREE_VEC_DEFS of OP_NODE. */
3654 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
3656 unsigned HOST_WIDE_INT nunits;
3658 unsigned j, number_of_places_left_in_vector;
3661 int group_size = op_node->ops.length ();
3662 unsigned int vec_num, i;
3663 unsigned number_of_copies = 1;
3665 gimple_seq ctor_seq = NULL;
3666 auto_vec<tree, 16> permute_results;
3668 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
3669 vector_type = SLP_TREE_VECTYPE (op_node);
3671 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
3672 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
3673 auto_vec<tree> voprnds (number_of_vectors);
3675 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3676 created vectors. It is greater than 1 if unrolling is performed.
3678 For example, we have two scalar operands, s1 and s2 (e.g., group of
3679 strided accesses of size two), while NUNITS is four (i.e., four scalars
3680 of this type can be packed in a vector). The output vector will contain
3681 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
3684 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
3685 containing the operands.
3687 For example, NUNITS is four as before, and the group size is 8
3688 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
3689 {s5, s6, s7, s8}. */
3691 /* When using duplicate_and_interleave, we just need one element for
3692 each scalar statement. */
3693 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
3694 nunits = group_size;
3696 number_of_copies = nunits * number_of_vectors / group_size;
3698 number_of_places_left_in_vector = nunits;
3700 tree_vector_builder elts (vector_type, nunits, 1);
3701 elts.quick_grow (nunits);
3702 stmt_vec_info insert_after = NULL;
/* Walk the scalar operands last-to-first, filling each vector from its
   highest lane downwards, so vectors come out in reverse order (they
   are un-reversed below).  */
3703 for (j = 0; j < number_of_copies; j++)
3706 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
3708 /* Create 'vect_ = {op0,op1,...,opn}'. */
3709 number_of_places_left_in_vector--;
/* If the scalar operand's type differs from the vector element type,
   convert it: fold constants directly, emit a conversion stmt for
   non-constants.  */
3711 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
3713 if (CONSTANT_CLASS_P (op))
3715 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3717 /* Can't use VIEW_CONVERT_EXPR for booleans because
3718 of possibly different sizes of scalar value and
3720 if (integer_zerop (op))
3721 op = build_int_cst (TREE_TYPE (vector_type), 0);
3722 else if (integer_onep (op))
3723 op = build_all_ones_cst (TREE_TYPE (vector_type));
3728 op = fold_unary (VIEW_CONVERT_EXPR,
3729 TREE_TYPE (vector_type), op);
3730 gcc_assert (op && CONSTANT_CLASS_P (op));
3734 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
/* Booleans are converted via COND_EXPR (all-ones / zero) rather than
   VIEW_CONVERT_EXPR, matching the constant case above.  */
3736 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3739 = build_all_ones_cst (TREE_TYPE (vector_type));
3741 = build_zero_cst (TREE_TYPE (vector_type));
3742 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
3743 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
3749 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
3752 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
3755 gimple_seq_add_stmt (&ctor_seq, init_stmt);
3759 elts[number_of_places_left_in_vector] = op;
3760 if (!CONSTANT_CLASS_P (op))
3762 /* For BB vectorization we have to compute an insert location
3763 when a def is inside the analyzed region since we cannot
3764 simply insert at the BB start in this case. */
3765 stmt_vec_info opdef;
3766 if (TREE_CODE (orig_op) == SSA_NAME
3767 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
3768 && is_a <bb_vec_info> (vinfo)
3769 && (opdef = vinfo->lookup_def (orig_op)))
3772 insert_after = opdef;
3774 insert_after = get_later_stmt (insert_after, opdef);
/* A full vector has been assembled: materialize it, flush CTOR_SEQ at
   the computed insert location (or function/region entry), and start
   the next vector.  */
3777 if (number_of_places_left_in_vector == 0)
3780 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
3781 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
3782 vec_cst = gimple_build_vector (&ctor_seq, &elts);
3785 if (permute_results.is_empty ())
3786 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
3787 elts, number_of_vectors,
3789 vec_cst = permute_results[number_of_vectors - j - 1];
3791 if (!gimple_seq_empty_p (ctor_seq))
3795 gimple_stmt_iterator gsi
3796 = gsi_for_stmt (insert_after->stmt);
3797 gsi_insert_seq_after (&gsi, ctor_seq,
3798 GSI_CONTINUE_LINKING);
3801 vinfo->insert_seq_on_entry (NULL, ctor_seq);
3804 voprnds.quick_push (vec_cst);
3805 insert_after = NULL;
3806 number_of_places_left_in_vector = nunits;
3808 elts.new_vector (vector_type, nunits, 1);
3809 elts.quick_grow (nunits);
3814 /* Since the vectors are created in the reverse order, we should invert
3816 vec_num = voprnds.length ();
3817 for (j = vec_num; j != 0; j--)
3819 vop = voprnds[j - 1];
3820 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
3823 /* In case that VF is greater than the unrolling factor needed for the SLP
3824 group of stmts, NUMBER_OF_VECTORS to be created is greater than
3825 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
3826 to replicate the vectors. */
3827 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
3828 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
3830 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
3833 /* Get the Ith vectorized definition from SLP_NODE. */
3836 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
3838 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
3839 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
3841 return SLP_TREE_VEC_DEFS (slp_node)[i];
3844 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
3847 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
3849 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
/* Internal defs: collect the LHS of each generated vector stmt.
   Otherwise (constants/externals) the defs are already trees in
   SLP_TREE_VEC_DEFS, which the splice below copies over.  */
3850 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
3853 gimple *vec_def_stmt;
3854 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
3855 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
3858 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
3861 /* Get N vectorized definitions for SLP_NODE. */
3864 vect_get_slp_defs (vec_info *,
3865 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
/* N is presumably defaulted to all children when the caller passed a
   sentinel (the guard sits on an elided line — confirm against the
   declaration's default argument).  */
3868 n = SLP_TREE_CHILDREN (slp_node).length ();
/* Collect one vec<tree> of vectorized defs per child node; ownership
   of each inner vec transfers to VEC_OPRNDS.  */
3870 for (unsigned i = 0; i < n; ++i)
3872 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
3873 vec<tree> vec_defs = vNULL;
3874 vect_get_slp_defs (child, &vec_defs);
3875 vec_oprnds->quick_push (vec_defs);
3879 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3880 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3881 permute statements for the SLP node NODE. */
3884 vect_transform_slp_perm_load (vec_info *vinfo,
3885 slp_tree node, vec<tree> dr_chain,
3886 gimple_stmt_iterator *gsi, poly_uint64 vf,
3887 bool analyze_only, unsigned *n_perms)
3889 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3891 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3892 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
3893 unsigned int mask_element;
3896 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3899 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
3901 mode = TYPE_MODE (vectype);
3902 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3904 /* Initialize the vect stmts of NODE to properly insert the generated
3907 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
3908 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
3909 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
3911 /* Generate permutation masks for every NODE. Number of masks for each NODE
3912 is equal to GROUP_SIZE.
3913 E.g., we have a group of three nodes with three loads from the same
3914 location in each node, and the vector size is 4. I.e., we have a
3915 a0b0c0a1b1c1... sequence and we need to create the following vectors:
3916 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3917 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3920 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3921 The last mask is illegal since we assume two operands for permute
3922 operation, and the mask element values can't be outside that range.
3923 Hence, the last mask must be converted into {2,5,5,5}.
3924 For the first two permutations we need the first and the second input
3925 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3926 we need the second and the third vectors: {b1,c1,a2,b2} and
3929 int vect_stmts_counter = 0;
3930 unsigned int index = 0;
3931 int first_vec_index = -1;
3932 int second_vec_index = -1;
3936 vec_perm_builder mask;
3937 unsigned int nelts_to_build;
3938 unsigned int nvectors_per_build;
/* REPEATING_P: the load permutation tiles exactly within one vector, so
   a single (possibly variable-length) mask serves every vector stmt.  */
3939 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
3940 && multiple_p (nunits, group_size));
3943 /* A single vector contains a whole number of copies of the node, so:
3944 (a) all permutes can use the same mask; and
3945 (b) the permutes only need a single vector input. */
3946 mask.new_vector (nunits, group_size, 3);
3947 nelts_to_build = mask.encoded_nelts ();
3948 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
3952 /* We need to construct a separate mask for each vector statement. */
3953 unsigned HOST_WIDE_INT const_nunits, const_vf;
3954 if (!nunits.is_constant (&const_nunits)
3955 || !vf.is_constant (&const_vf))
3957 mask.new_vector (const_nunits, const_nunits, 1);
3958 nelts_to_build = const_vf * group_size;
3959 nvectors_per_build = 1;
3962 unsigned int count = mask.encoded_nelts ();
3963 mask.quick_grow (count);
3964 vec_perm_indices indices;
/* Build the mask element by element; every COUNT elements a full mask
   is ready and (unless a no-op) a permute is counted / emitted.  */
3966 for (unsigned int j = 0; j < nelts_to_build; j++)
3968 unsigned int iter_num = j / group_size;
3969 unsigned int stmt_num = j % group_size;
3970 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
3971 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
3974 first_vec_index = 0;
3979 /* Enforced before the loop when !repeating_p. */
3980 unsigned int const_nunits = nunits.to_constant ();
3981 vec_index = i / const_nunits;
3982 mask_element = i % const_nunits;
/* Track which (at most two) input vectors this mask refers to; lanes
   from the second input are offset by CONST_NUNITS.  */
3983 if (vec_index == first_vec_index
3984 || first_vec_index == -1)
3986 first_vec_index = vec_index;
3988 else if (vec_index == second_vec_index
3989 || second_vec_index == -1)
3991 second_vec_index = vec_index;
3992 mask_element += const_nunits;
3996 if (dump_enabled_p ())
3997 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3998 "permutation requires at "
3999 "least three vectors %G",
4001 gcc_assert (analyze_only);
4005 gcc_assert (mask_element < 2 * const_nunits);
4008 if (mask_element != index)
4010 mask[index++] = mask_element;
4012 if (index == count && !noop_p)
4014 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
4015 if (!can_vec_perm_const_p (mode, indices))
4017 if (dump_enabled_p ())
4019 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4021 "unsupported vect permute { ");
4022 for (i = 0; i < count; ++i)
4024 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
4025 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
4027 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
4029 gcc_assert (analyze_only);
4040 tree mask_vec = NULL_TREE;
4043 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
4045 if (second_vec_index == -1)
4046 second_vec_index = first_vec_index;
4048 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
4050 /* Generate the permute statement if necessary. */
4051 tree first_vec = dr_chain[first_vec_index + ri];
4052 tree second_vec = dr_chain[second_vec_index + ri];
4056 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
4058 = vect_create_destination_var (gimple_assign_lhs (stmt),
4060 perm_dest = make_ssa_name (perm_dest);
4062 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
4063 first_vec, second_vec,
4065 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
4069 /* If mask was NULL_TREE generate the requested
4070 identity transform. */
4071 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
4073 /* Store the vector statement in NODE. */
4074 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
/* Reset per-mask state before assembling the next mask.  */
4079 first_vec_index = -1;
4080 second_vec_index = -1;
4089 /* Vectorize the SLP permutations in NODE as specified
4090 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
4091 child number and lane number.
4092 Interleaving of two two-lane two-child SLP subtrees (not supported):
4093 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
4094 A blend of two four-lane two-child SLP subtrees:
4095 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
4096 Highpart of a four-lane one-child SLP subtree (not supported):
4097 [ { 0, 2 }, { 0, 3 } ]
4098 Where currently only a subset is supported by code generating below. */
4101 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
4102 slp_tree node, stmt_vector_for_cost *cost_vec)
4104 tree vectype = SLP_TREE_VECTYPE (node);
4106 /* ??? We currently only support all same vector input and output types
4107 while the SLP IL should really do a concat + select and thus accept
4108 arbitrary mismatches. */
4111 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4112 if (!types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
4114 if (dump_enabled_p ())
4115 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4116 "Unsupported lane permutation\n")
4120 vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
4121 gcc_assert (perm.length () == SLP_TREE_LANES (node));
4122 if (dump_enabled_p ())
4124 dump_printf_loc (MSG_NOTE, vect_location,
4125 "vectorizing permutation");
4126 for (unsigned i = 0; i < perm.length (); ++i)
4127 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
4128 dump_printf (MSG_NOTE, "\n");
/* The code below requires constant vector length and a constant
   vectorization factor.  */
4131 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4132 if (!nunits.is_constant ())
4134 unsigned HOST_WIDE_INT vf = 1;
4135 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
4136 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&vf))
4138 unsigned olanes = vf * SLP_TREE_LANES (node);
4139 gcc_assert (multiple_p (olanes, nunits));
4141 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
4142 from the { SLP operand, scalar lane } permutation as recorded in the
4143 SLP node as intermediate step. This part should already work
4144 with SLP children with arbitrary number of lanes. */
4145 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
4146 auto_vec<unsigned> active_lane;
4147 vperm.create (olanes);
4148 active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length ());
4149 for (unsigned i = 0; i < vf; ++i)
4151 for (unsigned pi = 0; pi < perm.length (); ++pi)
4153 std::pair<unsigned, unsigned> p = perm[pi];
4154 tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
4155 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
4156 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
4157 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
4158 vperm.quick_push (std::make_pair (std::make_pair (p.first, vi), vl));
4160 /* Advance to the next group. */
4161 for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
4162 active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
4165 if (dump_enabled_p ())
4167 dump_printf_loc (MSG_NOTE, vect_location, "as");
4168 for (unsigned i = 0; i < vperm.length (); ++i)
4170 if (i != 0 && multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype)))
4171 dump_printf (MSG_NOTE, ",");
4172 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
4173 vperm[i].first.first, vperm[i].first.second,
4174 vperm[i].first.second);
4176 dump_printf (MSG_NOTE, "\n");
4179 /* We can only handle two-vector permutes, everything else should
4180 be lowered on the SLP level. The following is closely inspired
4181 by vect_transform_slp_perm_load and is supposed to eventually
4183 ??? As intermediate step do code-gen in the SLP tree representation
4185 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
4186 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
4187 unsigned int const_nunits = nunits.to_constant ();
4188 unsigned int index = 0;
4189 unsigned int mask_element;
4190 vec_perm_builder mask;
4191 mask.new_vector (const_nunits, const_nunits, 1);
4192 unsigned int count = mask.encoded_nelts ();
4193 mask.quick_grow (count);
4194 vec_perm_indices indices;
4195 unsigned nperms = 0;
4196 for (unsigned i = 0; i < vperm.length (); ++i)
4198 mask_element = vperm[i].second;
/* As in vect_transform_slp_perm_load: track the at-most-two input
   vectors this mask draws from; second-input lanes are offset by
   CONST_NUNITS.  */
4199 if (first_vec.first == -1U
4200 || first_vec == vperm[i].first)
4201 first_vec = vperm[i].first;
4202 else if (second_vec.first == -1U
4203 || second_vec == vperm[i].first)
4205 second_vec = vperm[i].first;
4206 mask_element += const_nunits;
4210 if (dump_enabled_p ())
4211 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4212 "permutation requires at "
4213 "least three vectors");
4218 mask[index++] = mask_element;
/* Once a full mask is assembled, verify target support (identity
   permutes are always fine) and, when code-generating, emit the
   VEC_PERM_EXPR (or a plain copy for identity).  */
4222 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
4224 bool identity_p = indices.series_p (0, 1, 0, 1);
4226 && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
4228 if (dump_enabled_p ())
4230 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4232 "unsupported vect permute { ");
4233 for (i = 0; i < count; ++i)
4235 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
4236 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
4238 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
4248 if (second_vec.first == -1U)
4249 second_vec = first_vec;
4251 /* Generate the permute statement if necessary. */
4252 slp_tree first_node = SLP_TREE_CHILDREN (node)[first_vec.first];
4254 = vect_get_slp_vect_def (first_node, first_vec.second);
4256 tree perm_dest = make_ssa_name (vectype);
4259 slp_tree second_node
4260 = SLP_TREE_CHILDREN (node)[second_vec.first];
4262 = vect_get_slp_vect_def (second_node, second_vec.second);
4263 tree mask_vec = vect_gen_perm_mask_checked (vectype, indices);
4264 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
4265 first_def, second_def,
4269 /* We need a copy here in case the def was external. */
4270 perm_stmt = gimple_build_assign (perm_dest, first_def);
4271 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
4272 /* Store the vector statement in NODE. */
4273 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
4277 first_vec = std::make_pair (-1U, -1U);
4278 second_vec = std::make_pair (-1U, -1U);
/* Analysis mode: record the permute count in the cost vector.  */
4283 record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);
4288 /* Vectorize SLP instance tree in postorder. */
4291 vect_schedule_slp_instance (vec_info *vinfo,
4292 slp_tree node, slp_instance instance)
4294 gimple_stmt_iterator si;
4298 /* See if we have already vectorized the node in the graph of the
4300 if ((SLP_TREE_DEF_TYPE (node) == vect_internal_def
4301 && SLP_TREE_VEC_STMTS (node).exists ())
4302 || SLP_TREE_VEC_DEFS (node).exists ())
4305 /* Vectorize externals and constants. */
4306 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
4307 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
4309 /* ??? vectorizable_shift can end up using a scalar operand which is
4310 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
4311 node in this case. */
4312 if (!SLP_TREE_VECTYPE (node))
4315 vect_create_constant_vectors (vinfo, node);
/* Postorder: children are scheduled before this node.  */
4319 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4320 vect_schedule_slp_instance (vinfo, child, instance);
4322 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
4323 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
4325 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
4326 if (dump_enabled_p ())
4327 dump_printf_loc (MSG_NOTE, vect_location,
4328 "------>vectorizing SLP node starting from: %G",
/* Choose the insertion point SI for the generated vector stmts.  */
4331 if (STMT_VINFO_DATA_REF (stmt_info)
4332 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
4334 /* Vectorized loads go before the first scalar load to make it
4335 ready early, vectorized stores go before the last scalar
4336 stmt which is where all uses are ready. */
4337 stmt_vec_info last_stmt_info = NULL;
4338 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
4339 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
4340 else /* DR_IS_WRITE */
4341 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
4342 si = gsi_for_stmt (last_stmt_info->stmt);
4344 else if (SLP_TREE_CHILDREN (node).is_empty ())
4346 /* This happens for reduction and induction PHIs where we do not use the
4347 insertion iterator. */
4348 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
4349 == cycle_phi_info_type
4350 || (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
4351 == induc_vec_info_type));
4356 /* Emit other stmts after the children vectorized defs which is
4357 earliest possible. */
4358 gimple *last_stmt = NULL;
4359 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4360 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4362 /* For fold-left reductions we are retaining the scalar
4363 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
4364 set so the representation isn't perfect. Resort to the
4365 last scalar def here. */
4366 if (SLP_TREE_VEC_STMTS (child).is_empty ())
4368 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
4369 == cycle_phi_info_type);
4370 gphi *phi = as_a <gphi *>
4371 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
4373 || vect_stmt_dominates_stmt_p (last_stmt, phi))
4376 /* We are emitting all vectorized stmts in the same place and
4377 the last one is the last.
4378 ??? Unless we have a load permutation applied and that
4379 figures to re-use an earlier generated load. */
4382 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
4384 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
4387 else if (!SLP_TREE_VECTYPE (child))
4389 /* For externals we use unvectorized at all scalar defs. */
4392 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
4393 if (TREE_CODE (def) == SSA_NAME
4394 && !SSA_NAME_IS_DEFAULT_DEF (def))
4396 gimple *stmt = SSA_NAME_DEF_STMT (def);
4398 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
4404 /* For externals we have to look at all defs since their
4405 insertion place is decided per vector. */
4408 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
4409 if (TREE_CODE (vdef) == SSA_NAME
4410 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
4412 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
4414 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
4418 /* This can happen when all children are pre-existing vectors or
4421 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
/* PHIs cannot be insertion points; step to after the labels of their
   block instead.  */
4422 if (is_a <gphi *> (last_stmt))
4423 si = gsi_after_labels (gimple_bb (last_stmt));
4426 si = gsi_for_stmt (last_stmt);
4431 bool done_p = false;
4433 /* Handle purely internal nodes. */
4434 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4436 /* ??? the transform kind is stored to STMT_VINFO_TYPE which might
4437 be shared with different SLP nodes (but usually it's the same
4438 operation apart from the case the stmt is only there for denoting
4439 the actual scalar lane defs ...). So do not call vect_transform_stmt
4440 but open-code it here (partly). */
4441 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
4446 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
4449 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
4450 For loop vectorization this is done in vectorizable_call, but for SLP
4451 it needs to be deferred until end of vect_schedule_slp, because multiple
4452 SLP instances may refer to the same scalar stmt. */
4455 vect_remove_slp_scalar_calls (vec_info *vinfo,
4456 slp_tree node, hash_set<slp_tree> &visited)
4459 gimple_stmt_iterator gsi;
4463 stmt_vec_info stmt_info;
/* Only internal nodes carry scalar stmts to replace; VISITED guards
   against processing shared subtrees twice.  */
4465 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
4468 if (visited.add (node))
4471 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4472 vect_remove_slp_scalar_calls (vinfo, child, visited);
4474 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4476 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
/* Skip non-calls, already-removed stmts, pattern stmts and stmts that
   are not pure-SLP (their scalar result may still be used).  */
4477 if (!stmt || gimple_bb (stmt) == NULL)
4479 if (is_pattern_stmt_p (stmt_info)
4480 || !PURE_SLP_STMT (stmt_info))
4482 lhs = gimple_call_lhs (stmt);
4483 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
4484 gsi = gsi_for_stmt (stmt);
4485 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
4486 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
/* Convenience overload: remove scalar calls in the SLP tree rooted at
   NODE, supplying a fresh visited set to the recursive worker.  */
4491 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
4493 hash_set<slp_tree> visited;
4494 vect_remove_slp_scalar_calls (vinfo, node, visited);
4497 /* Vectorize the instance root. */
4500 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
4502 gassign *rstmt = NULL;
/* One vector stmt: assign its (possibly view-converted) result
   directly to the root stmt's lhs.  */
4504 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
4509 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
4511 tree vect_lhs = gimple_get_lhs (child_stmt);
4512 tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
4513 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
4514 TREE_TYPE (vect_lhs)))
4515 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
4517 rstmt = gimple_build_assign (root_lhs, vect_lhs);
/* Multiple vector stmts: gather their results into a CONSTRUCTOR of
   the root stmt's rhs type.  */
4521 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
4523 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
4526 vec<constructor_elt, va_gc> *v;
4527 vec_alloc (v, nelts);
4529 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
4531 CONSTRUCTOR_APPEND_ELT (v,
4533 gimple_get_lhs (child_stmt));
4535 tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
4536 tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
4537 tree r_constructor = build_constructor (rtype, v);
4538 rstmt = gimple_build_assign (lhs, r_constructor);
/* Swap the built assignment in for the original root stmt.  */
4543 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
4544 gsi_replace (&rgsi, rstmt, true);
4547 /* Generate vector code for all SLP instances in the loop/basic block. */
4550 vect_schedule_slp (vec_info *vinfo)
4552 vec<slp_instance> slp_instances;
4553 slp_instance instance;
4556 slp_instances = vinfo->slp_instances;
4557 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4559 slp_tree node = SLP_INSTANCE_TREE (instance);
4560 /* Schedule the tree of INSTANCE. */
4561 vect_schedule_slp_instance (vinfo, node, instance);
4563 if (SLP_INSTANCE_ROOT_STMT (instance))
4564 vectorize_slp_instance_root_stmt (node, instance);
4566 if (dump_enabled_p ())
4567 dump_printf_loc (MSG_NOTE, vect_location,
4568 "vectorizing stmts using SLP.\n");
4571 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4573 slp_tree root = SLP_INSTANCE_TREE (instance);
4574 stmt_vec_info store_info;
4577 /* Remove scalar call stmts. Do not do this for basic-block
4578 vectorization as not all uses may be vectorized.
4579 ??? Why should this be necessary? DCE should be able to
4580 remove the stmts itself.
4581 ??? For BB vectorization we can as well remove scalar
4582 stmts starting from the SLP tree root if they have no
4584 if (is_a <loop_vec_info> (vinfo))
4585 vect_remove_slp_scalar_calls (vinfo, root);
4587 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
4589 if (!STMT_VINFO_DATA_REF (store_info))
4592 if (SLP_INSTANCE_ROOT_STMT (instance))
4595 store_info = vect_orig_stmt (store_info);
4596 /* Free the attached stmt_vec_info and remove the stmt. */
4597 vinfo->remove_stmt (store_info);