/* Loop Vectorizer
   Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
   Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "dumpfile.h"
#include "tm.h"
#include "ggc.h"
#include "tree.h"
#include "basic-block.h"
#include "gimple-pretty-print.h"
#include "tree-flow.h"
#include "tree-pass.h"
#include "cfgloop.h"
#include "expr.h"
#include "recog.h"
#include "optabs.h"
#include "params.h"
#include "diagnostic-core.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "target.h"
/* Loop Vectorization Pass.

   This pass tries to vectorize loops.

   For example, the vectorizer transforms the following simple loop:

        short a[N]; short b[N]; short c[N]; int i;

        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   as if it was manually vectorized by rewriting the source code into:

        typedef int __attribute__((mode(V8HI))) v8hi;
        short a[N]; short b[N]; short c[N]; int i;
        v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;

        for (i=0; i<N/8; i++){
          *pa = *pb + *pc;
          pa++; pb++; pc++;
        }

   The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   had successfully passed the analysis phase.
   Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMES), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
   (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.
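   For illustration (source-level examples assumed for this description,
   not taken from the pass itself), "a[i]" below is a consecutive,
   supported data-ref, whereas the indirect access "a[b[i]]" does not have
   a simple access pattern:

        for (i=0; i<N; i++)
          s += a[i];         <-- consecutive: vectorizable data-ref
        for (i=0; i<N; i++)
          s += a[b[i]];      <-- indexed through b[]: not supported here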
   Analysis phase:
   ===============
   The driver for the analysis phase is vect_analyze_loop().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.

   During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.
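   For instance, the two records are reached like this (an illustrative
   sketch using the accessors that appear throughout this file):

        stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
        loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);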
   Transformation phase:
   =====================
   The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.  It inserts the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
   attached to S).  This pointer will be used for the vectorization of
   following stmts which use the def of stmt S.  Stmt S is removed if it
   writes to memory; otherwise, we rely on dead code elimination for
   removing it.

   For example, say stmt S1 was vectorized into stmt VS1:

   VS1: vb = px[i];
   S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   S2:  a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

   VS1: vb = px[i];
   S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   VS2: va = vb;
   S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

   Operands that are not SSA_NAMEs, are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.
   Target modeling:
   =================
   Currently the only target specific information that is used is the
   size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
   Targets that can support different sizes of vectors, for now will need
   to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
   flexibility will be added in the future.

   Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.
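   As an illustration (a sketch of such a support query; only the
   optab_handler interface and CODE_FOR_nothing named above are assumed),
   checking whether the target can add two V8HImode vectors looks like:

        enum insn_code icode = optab_handler (add_optab, V8HImode);
        if (icode == CODE_FOR_nothing)
          return false;      <-- no target support: don't vectorize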
   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/
/* Function vect_determine_vectorization_factor

   Determine the vectorization factor (VF).  VF is the number of data elements
   that are operated upon in parallel in a single iteration of the vectorized
   loop.  For example, when vectorizing a loop that operates on 4-byte
   elements, on a target with vector size (VS) of 16 bytes, the VF is set to
   4, since 4 elements can fit in a single vector register.

   We currently support vectorization of loops in which all types operated
   upon are of the same size.  Therefore this function currently sets VF
   according to the size of the types operated upon, and fails if there are
   multiple sizes in the loop.

   VF is also the factor by which the loop iterations are strip-mined, e.g.:
   original loop:
        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   vectorized loop:
        for (i=0; i<N; i+=VF){
          a[i:VF] = b[i:VF] + c[i:VF];
        }
*/
static bool
vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  unsigned int vectorization_factor = 0;
  tree scalar_type;
  gimple phi;
  tree vectype;
  unsigned int nunits;
  stmt_vec_info stmt_info;
  int i;
  HOST_WIDE_INT dummy;
  gimple stmt, pattern_stmt = NULL;
  gimple_seq pattern_def_seq = NULL;
  gimple_stmt_iterator pattern_def_si = gsi_none ();
  bool analyze_pattern_stmt = false;

  if (dump_kind_p (MSG_NOTE))
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_determine_vectorization_factor ===");

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          phi = gsi_stmt (si);
          stmt_info = vinfo_for_stmt (phi);
          if (dump_kind_p (MSG_NOTE))
            {
              dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
            }

          gcc_assert (stmt_info);

          if (STMT_VINFO_RELEVANT_P (stmt_info))
            {
              gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
              scalar_type = TREE_TYPE (PHI_RESULT (phi));

              if (dump_kind_p (MSG_NOTE))
                {
                  dump_printf_loc (MSG_NOTE, vect_location,
                                   "get vectype for scalar type: ");
                  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
                }

              vectype = get_vectype_for_scalar_type (scalar_type);
              if (!vectype)
                {
                  if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "not vectorized: unsupported "
                                       "data-type ");
                      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                         scalar_type);
                    }
                  return false;
                }
              STMT_VINFO_VECTYPE (stmt_info) = vectype;

              if (dump_kind_p (MSG_NOTE))
                {
                  dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
                  dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
                }

              nunits = TYPE_VECTOR_SUBPARTS (vectype);
              if (dump_kind_p (MSG_NOTE))
                dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d", nunits);

              if (!vectorization_factor
                  || (nunits > vectorization_factor))
                vectorization_factor = nunits;
            }
        }
      for (si = gsi_start_bb (bb); !gsi_end_p (si) || analyze_pattern_stmt;)
        {
          tree vf_vectype;

          if (analyze_pattern_stmt)
            stmt = pattern_stmt;
          else
            stmt = gsi_stmt (si);

          stmt_info = vinfo_for_stmt (stmt);

          if (dump_kind_p (MSG_NOTE))
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "==> examining statement: ");
              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
            }

          gcc_assert (stmt_info);

          /* Skip stmts which do not need to be vectorized.  */
          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            {
              if (STMT_VINFO_IN_PATTERN_P (stmt_info)
                  && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
                  && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
                      || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
                {
                  stmt = pattern_stmt;
                  stmt_info = vinfo_for_stmt (pattern_stmt);
                  if (dump_kind_p (MSG_NOTE))
                    {
                      dump_printf_loc (MSG_NOTE, vect_location,
                                       "==> examining pattern statement: ");
                      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
                    }
                }
              else
                {
                  if (dump_kind_p (MSG_NOTE))
                    dump_printf_loc (MSG_NOTE, vect_location, "skip.");
                  gsi_next (&si);
                  continue;
                }
            }
          else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
            analyze_pattern_stmt = true;

          /* If a pattern statement has def stmts, analyze them too.  */
          if (is_pattern_stmt_p (stmt_info))
            {
              if (pattern_def_seq == NULL)
                {
                  pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
                  pattern_def_si = gsi_start (pattern_def_seq);
                }
              else if (!gsi_end_p (pattern_def_si))
                gsi_next (&pattern_def_si);
              if (pattern_def_seq != NULL)
                {
                  gimple pattern_def_stmt = NULL;
                  stmt_vec_info pattern_def_stmt_info = NULL;

                  while (!gsi_end_p (pattern_def_si))
                    {
                      pattern_def_stmt = gsi_stmt (pattern_def_si);
                      pattern_def_stmt_info
                        = vinfo_for_stmt (pattern_def_stmt);
                      if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
                          || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
                        break;
                      gsi_next (&pattern_def_si);
                    }

                  if (!gsi_end_p (pattern_def_si))
                    {
                      if (dump_kind_p (MSG_NOTE))
                        {
                          dump_printf_loc (MSG_NOTE, vect_location,
                                           "==> examining pattern def stmt: ");
                          dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
                                            pattern_def_stmt, 0);
                        }

                      stmt = pattern_def_stmt;
                      stmt_info = pattern_def_stmt_info;
                    }
                  else
                    {
                      pattern_def_si = gsi_none ();
                      analyze_pattern_stmt = false;
                    }
                }
              else
                analyze_pattern_stmt = false;
            }

          if (gimple_get_lhs (stmt) == NULL_TREE)
            {
              if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "not vectorized: irregular stmt.");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
                                    0);
                }
              return false;
            }

          if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
            {
              if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "not vectorized: vector stmt in loop:");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                }
              return false;
            }

          if (STMT_VINFO_VECTYPE (stmt_info))
            {
              /* The only case when a vectype had been already set is for stmts
                 that contain a dataref, or for "pattern-stmts" (stmts
                 generated by the vectorizer to represent/replace a certain
                 pattern).  */
              gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
                          || is_pattern_stmt_p (stmt_info)
                          || !gsi_end_p (pattern_def_si));
              vectype = STMT_VINFO_VECTYPE (stmt_info);
            }
          else
            {
              gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
              scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
              if (dump_kind_p (MSG_NOTE))
                {
                  dump_printf_loc (MSG_NOTE, vect_location,
                                   "get vectype for scalar type: ");
                  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
                }
              vectype = get_vectype_for_scalar_type (scalar_type);
              if (!vectype)
                {
                  if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "not vectorized: unsupported "
                                       "data-type ");
                      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                         scalar_type);
                    }
                  return false;
                }

              STMT_VINFO_VECTYPE (stmt_info) = vectype;
            }

          /* The vectorization factor is according to the smallest
             scalar type (or the largest vector size, but we only
             support one vector size per loop).  */
          scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
                                                       &dummy);
          if (dump_kind_p (MSG_NOTE))
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "get vectype for scalar type: ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
            }
          vf_vectype = get_vectype_for_scalar_type (scalar_type);
          if (!vf_vectype)
            {
              if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "not vectorized: unsupported data-type ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                     scalar_type);
                }
              return false;
            }

          if ((GET_MODE_SIZE (TYPE_MODE (vectype))
               != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
            {
              if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "not vectorized: different sized vector "
                                   "types in statement, ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                     vectype);
                  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                     vf_vectype);
                }
              return false;
            }

          if (dump_kind_p (MSG_NOTE))
            {
              dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
            }

          nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
          if (dump_kind_p (MSG_NOTE))
            dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d", nunits);
          if (!vectorization_factor
              || (nunits > vectorization_factor))
            vectorization_factor = nunits;

          if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
            {
              pattern_def_seq = NULL;
              gsi_next (&si);
            }
        }
    }

  /* TODO: Analyze cost.  Decide if worthwhile to vectorize.  */
  if (dump_kind_p (MSG_NOTE))
    dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d",
                     vectorization_factor);
  if (vectorization_factor <= 1)
    {
      if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: unsupported data-type");
      return false;
    }
  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;

  return true;
}
/* Function vect_is_simple_iv_evolution.

   FORNOW: A simple evolution of an induction variable in the loop is
   considered a polynomial evolution with constant step.  */
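/* For example (illustrative chrec notation of the scalar evolution
   analyzer): an IV updated as "i = i + 4" in loop 1 has the access
   function {0, +, 4}_1 - a polynomial with constant step 4, which this
   predicate accepts - whereas {0, +, j}_1 with a loop-variant j has a
   non-constant step and is rejected below.  */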
static bool
vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
                             tree * step)
{
  tree init_expr;
  tree step_expr;
  tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);

  /* When there is no evolution in this loop, the evolution function
     is constant.  */
  if (evolution_part == NULL_TREE)
    return false;

  /* When the evolution is a polynomial of degree >= 2
     the evolution function is not "simple".  */
  if (tree_is_chrec (evolution_part))
    return false;

  step_expr = evolution_part;
  init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));

  if (dump_kind_p (MSG_NOTE))
    {
      dump_printf_loc (MSG_NOTE, vect_location, "step: ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
      dump_printf (MSG_NOTE, ",  init: ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
    }

  *init = init_expr;
  *step = step_expr;

  if (TREE_CODE (step_expr) != INTEGER_CST)
    {
      if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "step unknown.");
      return false;
    }

  return true;
}
/* Function vect_analyze_scalar_cycles_1.

   Examine the cross-iteration def-use cycles of scalar variables
   in LOOP.  LOOP_VINFO represents the loop that is now being
   considered for vectorization (can be LOOP, or an outer-loop
   enclosing LOOP).  */

static void
vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
{
  basic_block bb = loop->header;
  tree dumy;
  VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
  gimple_stmt_iterator gsi;
  bool double_reduc;

  if (dump_kind_p (MSG_NOTE))
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_analyze_scalar_cycles ===");

  /* First - identify all inductions.  Reduction detection assumes that all the
     inductions have been identified, therefore, this order must not be
     changed.  */
  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple phi = gsi_stmt (gsi);
      tree access_fn = NULL;
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);

      if (dump_kind_p (MSG_NOTE))
        {
          dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
        }

      /* Skip virtual phi's.  The data dependences that are associated with
         virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
      if (virtual_operand_p (def))
        continue;

      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;

      /* Analyze the evolution function.  */
      access_fn = analyze_scalar_evolution (loop, def);
      if (access_fn)
        {
          STRIP_NOPS (access_fn);
          if (dump_kind_p (MSG_NOTE))
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "Access function of PHI: ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
            }
          STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
            = evolution_part_in_loop_num (access_fn, loop->num);
        }

      if (!access_fn
          || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
        {
          VEC_safe_push (gimple, heap, worklist, phi);
          continue;
        }

      gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);

      if (dump_kind_p (MSG_NOTE))
        dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.");
      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
    }


  /* Second - identify all reductions and nested cycles.  */
  while (VEC_length (gimple, worklist) > 0)
    {
      gimple phi = VEC_pop (gimple, worklist);
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
      bool nested_cycle;
      gimple reduc_stmt;

      if (dump_kind_p (MSG_NOTE))
        {
          dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
        }

      gcc_assert (!virtual_operand_p (def)
                  && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);

      nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
      reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
                                                &double_reduc);
      if (reduc_stmt)
        {
          if (double_reduc)
            {
              if (dump_kind_p (MSG_NOTE))
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "Detected double reduction.");

              STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
              STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                                                    vect_double_reduction_def;
            }
          else
            {
              if (nested_cycle)
                {
                  if (dump_kind_p (MSG_NOTE))
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "Detected vectorizable nested cycle.");

                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
                  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                                                             vect_nested_cycle;
                }
              else
                {
                  if (dump_kind_p (MSG_NOTE))
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "Detected reduction.");

                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
                  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                                                           vect_reduction_def;
                  /* Store the reduction cycles for possible vectorization in
                     loop-aware SLP.  */
                  VEC_safe_push (gimple, heap,
                                 LOOP_VINFO_REDUCTIONS (loop_vinfo),
                                 reduc_stmt);
                }
            }
        }
      else
        if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Unknown def-use cycle pattern.");
    }

  VEC_free (gimple, heap, worklist);
}
/* Function vect_analyze_scalar_cycles.

   Examine the cross-iteration def-use cycles of scalar variables, by
   analyzing the loop-header PHIs of scalar variables.  Classify each
   cycle as one of the following: invariant, induction, reduction, unknown.
   We do that for the loop represented by LOOP_VINFO, and also for its
   inner-loop, if it exists.
   Examples of scalar cycles:

   Example1: reduction:

              loop1:
              for (i=0; i<N; i++)
                 sum += a[i];

   Example2: induction:

              loop2:
              for (i=0; i<N; i++)
                 a[i] = i;  */

static void
vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  vect_analyze_scalar_cycles_1 (loop_vinfo, loop);

  /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
     Reductions in such inner-loop therefore have different properties than
     the reductions in the nest that gets vectorized:
     1. When vectorized, they are executed in the same order as in the original
        scalar loop, so we can't change the order of computation when
        vectorizing them.
     2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
        current checks are too strict.  */

  if (loop->inner)
    vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
}
/* Function vect_get_loop_niters.

   Determine how many times the loop is executed.
   If an expression that represents the number of iterations
   can be constructed, place it in NUMBER_OF_ITERATIONS.
   Return the loop exit condition.  */

static gimple
vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
{
  tree niters;

  if (dump_kind_p (MSG_NOTE))
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== get_loop_niters ===");
  niters = number_of_exit_cond_executions (loop);

  if (niters != NULL_TREE
      && niters != chrec_dont_know)
    {
      *number_of_iterations = niters;

      if (dump_kind_p (MSG_NOTE))
        {
          dump_printf_loc (MSG_NOTE, vect_location, "==> get_loop_niters:");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, *number_of_iterations);
        }
    }

  return get_loop_exit_condition (loop);
}
/* Function bb_in_loop_p

   Used as predicate for dfs order traversal of the loop bbs.  */

static bool
bb_in_loop_p (const_basic_block bb, const void *data)
{
  const struct loop *const loop = (const struct loop *)data;
  if (flow_bb_inside_loop_p (loop, bb))
    return true;
  return false;
}
/* Function new_loop_vec_info.

   Create and initialize a new loop_vec_info struct for LOOP, as well as
   stmt_vec_info structs for all the stmts in LOOP.  */

static loop_vec_info
new_loop_vec_info (struct loop *loop)
{
  loop_vec_info res;
  basic_block *bbs;
  gimple_stmt_iterator si;
  unsigned int i, nbbs;

  res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
  LOOP_VINFO_LOOP (res) = loop;

  bbs = get_loop_body (loop);

  /* Create/Update stmt_info for all stmts in the loop.  */
  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];

      /* BBs in a nested inner-loop will have been already processed (because
         we will have called vect_analyze_loop_form for any nested inner-loop).
         Therefore, for stmts in an inner-loop we just want to update the
         STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
         loop_info of the outer-loop we are currently considering to vectorize
         (instead of the loop_info of the inner-loop).
         For stmts in other BBs we need to create a stmt_info from scratch.  */
      if (bb->loop_father != loop)
        {
          /* Inner-loop bb.  */
          gcc_assert (loop->inner && bb->loop_father == loop->inner);
          for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple phi = gsi_stmt (si);
              stmt_vec_info stmt_info = vinfo_for_stmt (phi);
              loop_vec_info inner_loop_vinfo =
                STMT_VINFO_LOOP_VINFO (stmt_info);
              gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
              STMT_VINFO_LOOP_VINFO (stmt_info) = res;
            }
          for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple stmt = gsi_stmt (si);
              stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
              loop_vec_info inner_loop_vinfo =
                STMT_VINFO_LOOP_VINFO (stmt_info);
              gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
              STMT_VINFO_LOOP_VINFO (stmt_info) = res;
            }
        }
      else
        {
          /* bb in current nest.  */
          for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple phi = gsi_stmt (si);
              gimple_set_uid (phi, 0);
              set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
            }

          for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple stmt = gsi_stmt (si);
              gimple_set_uid (stmt, 0);
              set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
            }
        }
    }

  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would be the
     same as reversed postorder traversal, so we are safe.  */

  free (bbs);
  bbs = XCNEWVEC (basic_block, loop->num_nodes);
  nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
                             bbs, loop->num_nodes, loop);
  gcc_assert (nbbs == loop->num_nodes);

  LOOP_VINFO_BBS (res) = bbs;
  LOOP_VINFO_NITERS (res) = NULL;
  LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
  LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
  LOOP_VINFO_VECTORIZABLE_P (res) = 0;
  LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
  LOOP_VINFO_VECT_FACTOR (res) = 0;
  LOOP_VINFO_LOOP_NEST (res) = VEC_alloc (loop_p, heap, 3);
  LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
  LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
  LOOP_VINFO_UNALIGNED_DR (res) = NULL;
  LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
    VEC_alloc (gimple, heap,
               PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
  LOOP_VINFO_MAY_ALIAS_DDRS (res) =
    VEC_alloc (ddr_p, heap,
               PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
  LOOP_VINFO_GROUPED_STORES (res) = VEC_alloc (gimple, heap, 10);
  LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10);
  LOOP_VINFO_REDUCTION_CHAINS (res) = VEC_alloc (gimple, heap, 10);
  LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
  LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
  LOOP_VINFO_PEELING_HTAB (res) = NULL;
  LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
  LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
  LOOP_VINFO_OPERANDS_SWAPPED (res) = false;

  return res;
}
/* Function destroy_loop_vec_info.

   Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
   stmts in the loop.  */

void
destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
{
  struct loop *loop;
  basic_block *bbs;
  int nbbs;
  gimple_stmt_iterator si;
  int j;
  VEC (slp_instance, heap) *slp_instances;
  slp_instance instance;
  bool swapped;

  if (!loop_vinfo)
    return;

  loop = LOOP_VINFO_LOOP (loop_vinfo);

  bbs = LOOP_VINFO_BBS (loop_vinfo);
  nbbs = loop->num_nodes;
  swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);

  if (!clean_stmts)
    {
      free (LOOP_VINFO_BBS (loop_vinfo));
      free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
      free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
      VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
      VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
      VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));

      free (loop_vinfo);
      loop->aux = NULL;
      return;
    }

  for (j = 0; j < nbbs; j++)
    {
      basic_block bb = bbs[j];
      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        free_stmt_vec_info (gsi_stmt (si));

      for (si = gsi_start_bb (bb); !gsi_end_p (si); )
        {
          gimple stmt = gsi_stmt (si);

          /* We may have broken canonical form by moving a constant
             into RHS1 of a commutative op.  Fix such occurrences.  */
          if (swapped && is_gimple_assign (stmt))
            {
              enum tree_code code = gimple_assign_rhs_code (stmt);

              if ((code == PLUS_EXPR
                   || code == POINTER_PLUS_EXPR
                   || code == MULT_EXPR)
                  && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
                swap_tree_operands (stmt,
                                    gimple_assign_rhs1_ptr (stmt),
                                    gimple_assign_rhs2_ptr (stmt));
            }

          /* Free stmt_vec_info.  */
          free_stmt_vec_info (stmt);
          gsi_next (&si);
        }
    }

  free (LOOP_VINFO_BBS (loop_vinfo));
  free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
  free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
  VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
  VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
  VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
  slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  FOR_EACH_VEC_ELT (slp_instance, slp_instances, j, instance)
    vect_free_slp_instance (instance);

  VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
  VEC_free (gimple, heap, LOOP_VINFO_GROUPED_STORES (loop_vinfo));
  VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo));
  VEC_free (gimple, heap, LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo));

  if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
    htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));

  destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));

  free (loop_vinfo);
  loop->aux = NULL;
}
/* Function vect_analyze_loop_1.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  This is a subset of the analyses applied in
   vect_analyze_loop, to be applied on an inner-loop nested in the loop
   that is now considered for (outer-loop) vectorization.  */

static loop_vec_info
vect_analyze_loop_1 (struct loop *loop)
{
  loop_vec_info loop_vinfo;

  if (dump_kind_p (MSG_NOTE))
    dump_printf_loc (MSG_NOTE, vect_location,
                     "===== analyze_loop_nest_1 =====");

  /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.).  */

  loop_vinfo = vect_analyze_loop_form (loop);
  if (!loop_vinfo)
    {
      if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad inner-loop form.");
      return NULL;
    }

  return loop_vinfo;
}
/* Function vect_analyze_loop_form.

   Verify that certain CFG restrictions hold, including:
   - the loop has a pre-header
   - the loop has a single entry and exit
   - the loop exit condition is simple enough, and the number of iterations
     can be analyzed (a countable loop).  */

loop_vec_info
vect_analyze_loop_form (struct loop *loop)
{
  loop_vec_info loop_vinfo;
  gimple loop_cond;
  tree number_of_iterations = NULL;
  loop_vec_info inner_loop_vinfo = NULL;

  if (dump_kind_p (MSG_NOTE))
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_analyze_loop_form ===");

  /* Different restrictions apply when we are considering an inner-most loop,
     vs. an outer (nested) loop.
     (FORNOW. May want to relax some of these restrictions in the future).  */

  if (!loop->inner)
    {
      /* Inner-most loop.  We currently require that the number of BBs is
         exactly 2 (the header and latch).  Vectorizable inner-most loops
         look like this:

                        (pre-header)
                           |
                          header <--------+
                           | |            |
                           | +--> latch --+
                           |
                        (exit-bb)  */

      if (loop->num_nodes != 2)
        {
          if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: control flow in loop.");
          return NULL;
        }

      if (empty_block_p (loop->header))
        {
          if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: empty loop.");
          return NULL;
        }
    }
  else
    {
      struct loop *innerloop = loop->inner;
      edge entryedge;

      /* Nested loop.  We currently require that the loop is doubly-nested,
         contains a single inner loop, and the number of BBs is exactly 5.
         Vectorizable outer-loops look like this:

                        (pre-header)
                           |
                          header <---+
                           |         |
                          inner-loop |
                           |         |
                          tail ------+
                           |
                        (exit-bb)

         The inner-loop has the properties expected of inner-most loops
         as described above.  */

      if ((loop->inner)->inner || (loop->inner)->next)
        {
          if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: multiple nested loops.");
          return NULL;
        }

      /* Analyze the inner-loop.  */
      inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
      if (!inner_loop_vinfo)
        {
          if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: Bad inner loop.");
          return NULL;
        }

      if (!expr_invariant_in_loop_p (loop,
                                     LOOP_VINFO_NITERS (inner_loop_vinfo)))
        {
          if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: inner-loop count not invariant.");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      if (loop->num_nodes != 5)
        {
          if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: control flow in loop.");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
      entryedge = EDGE_PRED (innerloop->header, 0);
      if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
        entryedge = EDGE_PRED (innerloop->header, 1);

      if (entryedge->src != loop->header
          || !single_exit (innerloop)
          || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
        {
          if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: unsupported outerloop form.");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      if (dump_kind_p (MSG_NOTE))
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Considering outer-loop vectorization.");
    }

  if (!single_exit (loop)
      || EDGE_COUNT (loop->header->preds) != 2)
    {
      if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
        {
          if (!single_exit (loop))
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: multiple exits.");
          else if (EDGE_COUNT (loop->header->preds) != 2)
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: too many incoming edges.");
        }
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  /* We assume that the loop exit condition is at the end of the loop, i.e.,
     that the loop is represented as a do-while (with a proper if-guard
     before the loop if needed), where the loop header contains all the
     executable statements, and the latch is empty.  */
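  /* For illustration (a source-level sketch, not code from this pass), the
     supported shape corresponds to

       do { ... i++; } while (i < n);

     with any necessary guard placed before the loop, rather than to a
     top-test while loop whose exit test precedes the body.  */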
  if (!empty_block_p (loop->latch)
      || !gimple_seq_empty_p (phi_nodes (loop->latch)))
    {
      if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: unexpected loop form.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  /* Make sure there exists a single-predecessor exit bb:  */
  if (!single_pred_p (single_exit (loop)->dest))
    {
      edge e = single_exit (loop);
      if (!(e->flags & EDGE_ABNORMAL))
        {
          split_loop_exit_edge (e);
          if (dump_kind_p (MSG_NOTE))
            dump_printf (MSG_NOTE, "split exit edge.");
        }
      else
        {
          if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: abnormal loop exit edge.");
          if (inner_loop_vinfo)
            destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }
    }

  loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
  if (!loop_cond)
    {
      if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: complicated exit condition.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (!number_of_iterations)
    {
      if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: number of iterations cannot be "
                         "computed.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (chrec_contains_undetermined (number_of_iterations))
    {
      if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Infinite number of iterations.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (!NITERS_KNOWN_P (number_of_iterations))
    {
      if (dump_kind_p (MSG_NOTE))
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Symbolic number of iterations is ");
          dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
        }
    }
  else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
    {
      if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: number of iterations = 0.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, false);
      return NULL;
    }

  loop_vinfo = new_loop_vec_info (loop);
  LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;

  STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;

  /* CHECKME: May want to keep it around in the future.  */
  if (inner_loop_vinfo)
    destroy_loop_vec_info (inner_loop_vinfo, false);

  gcc_assert (!loop->aux);
  loop->aux = loop_vinfo;
  return loop_vinfo;
}
/* Function vect_analyze_loop_operations.

   Scan the loop stmts and make sure they are all vectorizable.  */

static bool
vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  unsigned int vectorization_factor = 0;
  int i;
  gimple phi;
  stmt_vec_info stmt_info;
  bool need_to_vectorize = false;
  int min_profitable_iters;
  int min_scalar_loop_bound;
  unsigned int th;
  bool only_slp_in_loop = true, ok;
  HOST_WIDE_INT max_niter;

  if (dump_kind_p (MSG_NOTE))
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_analyze_loop_operations ===");

  gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
  vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  if (slp)
    {
      /* If all the stmts in the loop can be SLPed, we perform only SLP, and
         the vectorization factor of the loop is the unrolling factor required
         by the SLP instances.  If that unrolling factor is 1, we say that we
         perform pure SLP on the loop - cross iteration parallelism is not
         exploited.  */
      for (i = 0; i < nbbs; i++)
        {
          basic_block bb = bbs[i];
          for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple stmt = gsi_stmt (si);
              stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
              gcc_assert (stmt_info);
              if ((STMT_VINFO_RELEVANT_P (stmt_info)
                   || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
                  && !PURE_SLP_STMT (stmt_info))
                /* STMT needs both SLP and loop-based vectorization.  */
                only_slp_in_loop = false;
            }
        }

      if (only_slp_in_loop)
        vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
      else
        vectorization_factor = least_common_multiple (vectorization_factor,
                                LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));

      LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
      if (dump_kind_p (MSG_NOTE))
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Updating vectorization factor to %d ",
                         vectorization_factor);
    }

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          phi = gsi_stmt (si);
          ok = true;

          stmt_info = vinfo_for_stmt (phi);
          if (dump_kind_p (MSG_NOTE))
            {
              dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
            }

          /* Inner-loop loop-closed exit phi in outer-loop vectorization
             (i.e., a phi in the tail of the outer-loop).  */
          if (! is_loop_header_bb_p (bb))
            {
              /* FORNOW: we currently don't support the case that these phis
                 are not used in the outerloop (unless it is double reduction,
                 i.e., this phi is vect_reduction_def), because this case
                 requires us to actually do something here.  */
              if ((!STMT_VINFO_RELEVANT_P (stmt_info)
                   || STMT_VINFO_LIVE_P (stmt_info))
                  && STMT_VINFO_DEF_TYPE (stmt_info)
                     != vect_double_reduction_def)
                {
                  if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Unsupported loop-closed phi in "
                                     "outer-loop.");
                  return false;
                }

              /* If PHI is used in the outer loop, we check that its operand
                 is defined in the inner loop.  */
              if (STMT_VINFO_RELEVANT_P (stmt_info))
                {
                  tree phi_op;
                  gimple op_def_stmt;

                  if (gimple_phi_num_args (phi) != 1)
                    return false;

                  phi_op = PHI_ARG_DEF (phi, 0);
                  if (TREE_CODE (phi_op) != SSA_NAME)
                    return false;

                  op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
                  if (!op_def_stmt
                      || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
                      || !vinfo_for_stmt (op_def_stmt))
                    return false;

                  if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
                        != vect_used_in_outer
                      && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
                           != vect_used_in_outer_by_reduction)
                    return false;
                }

              continue;
            }

          gcc_assert (stmt_info);

          if (STMT_VINFO_LIVE_P (stmt_info))
            {
              /* FORNOW: not yet supported.  */
              if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "not vectorized: value used after loop.");
              return false;
            }

          if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
              && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
            {
              /* A scalar-dependence cycle that we don't support.  */
              if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "not vectorized: scalar dependence cycle.");
              return false;
            }

          if (STMT_VINFO_RELEVANT_P (stmt_info))
            {
              need_to_vectorize = true;
              if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
                ok = vectorizable_induction (phi, NULL, NULL);
            }

          if (!ok)
            {
              if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "not vectorized: relevant phi not "
                                   "supported: ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
                }
              return false;
            }
        }

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple stmt = gsi_stmt (si);
          if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
            return false;
        }
    } /* bbs */

  /* All operations in the loop are either irrelevant (deal with loop
     control, or dead), or only used outside the loop and can be moved
     out of the loop (e.g. invariants, inductions).  The loop can be
     optimized away by scalar optimizations.  We're better off not
     touching this loop.  */
  if (!need_to_vectorize)
    {
      if (dump_kind_p (MSG_NOTE))
        dump_printf_loc (MSG_NOTE, vect_location,
                         "All the computation can be taken out of the loop.");
      if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: redundant loop. no profit to "
                         "vectorize.");
      return false;
    }

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && dump_kind_p (MSG_NOTE))
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vectorization_factor = %d, niters = "
                     HOST_WIDE_INT_PRINT_DEC, vectorization_factor,
                     LOOP_VINFO_INT_NITERS (loop_vinfo));

  if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
       && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
      || ((max_niter = max_stmt_executions_int (loop)) != -1
          && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
    {
      if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: iteration count too small.");
      if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: iteration count smaller than "
                         "vectorization factor.");
      return false;
    }

  /* Analyze cost.  Decide if worthwhile to vectorize.  */

  /* Once VF is set, SLP costs should be updated since the number of created
     vector stmts depends on VF.  */
  vect_update_slp_costs_according_to_vf (loop_vinfo);

  min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
  LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;

  if (min_profitable_iters < 0)
    {
      if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: vectorization not profitable.");
      if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: vector version will never be "
                         "profitable.");
      return false;
    }

  min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
                            * vectorization_factor) - 1);

  /* Use the cost model only if it is more conservative than user specified
     threshold.  */

  th = (unsigned) min_scalar_loop_bound;
  if (min_profitable_iters
      && (!min_scalar_loop_bound
          || min_profitable_iters > min_scalar_loop_bound))
    th = (unsigned) min_profitable_iters;
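  /* Worked example (illustrative numbers, not taken from the sources): if
     PARAM_MIN_VECT_LOOP_BOUND is 1 and the vectorization factor is 4,
     MIN_SCALAR_LOOP_BOUND is 1 * 4 - 1 = 3; if the cost model then reports
     MIN_PROFITABLE_ITERS of 10, the more conservative value 10 becomes the
     threshold TH used below.  */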
  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
    {
      if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: vectorization not profitable.");
      if (dump_kind_p (MSG_NOTE))
        dump_printf_loc (MSG_NOTE, vect_location,
                         "not vectorized: iteration count smaller than user "
                         "specified loop bound parameter or minimum profitable "
                         "iterations (whichever is more conservative).");
      return false;
    }

  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
      || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
    {
      if (dump_kind_p (MSG_NOTE))
        dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required.");
      if (!vect_can_advance_ivs_p (loop_vinfo))
        {
          if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: can't create epilog loop 1.");
          return false;
        }
      if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
        {
          if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: can't create epilog loop 2.");
          return false;
        }
    }

  return true;
}
/* Function vect_analyze_loop_2.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  */
static bool
vect_analyze_loop_2 (loop_vec_info loop_vinfo)
{
  bool ok, slp = false;
  int max_vf = MAX_VECTORIZATION_FACTOR;
  int min_vf = 2;

  /* Find all data references in the loop (which correspond to vdefs/vuses)
     and analyze their evolution in the loop.  Also adjust the minimal
     vectorization factor according to the loads and stores.

     FORNOW: Handle only simple, array references, whose
     alignment can be forced, and aligned pointer-references.  */

  ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
  if (!ok)
    {
      if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad data references.");
      return false;
    }

  /* Classify all cross-iteration scalar data-flow cycles.
     Cross-iteration cycles caused by virtual phis are analyzed separately.  */

  vect_analyze_scalar_cycles (loop_vinfo);

  vect_pattern_recog (loop_vinfo, NULL);

  /* Data-flow analysis to detect stmts that do not need to be vectorized.  */

  ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
  if (!ok)
    {
      if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "unexpected pattern.");
      return false;
    }

  /* Analyze data dependences between the data-refs in the loop
     and adjust the maximum vectorization factor according to
     the dependences.
     FORNOW: fail at the first data dependence that we encounter.  */

  ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf);
  if (!ok
      || max_vf < min_vf)
    {
      if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad data dependence.");
      return false;
    }

  ok = vect_determine_vectorization_factor (loop_vinfo);
  if (!ok)
    {
      if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "can't determine vectorization factor.");
      return false;
    }
  if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
    {
      if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad data dependence.");
      return false;
    }

  /* Analyze the alignment of the data-refs in the loop.
     Fail if a data reference is found that cannot be vectorized.  */

  ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
  if (!ok)
    {
      if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad data alignment.");
      return false;
    }

  /* Analyze the access patterns of the data-refs in the loop (consecutive,
     complex, etc.).  FORNOW: Only handle consecutive access pattern.  */

  ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
  if (!ok)
    {
      if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad data access.");
      return false;
    }

  /* Prune the list of ddrs to be tested at run-time by versioning for alias.
     It is important to call pruning after vect_analyze_data_ref_accesses,
     since we use grouping information gathered by interleaving analysis.  */
  ok = vect_prune_runtime_alias_test_list (loop_vinfo);
  if (!ok)
    {
      if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "too long list of versioning for alias "
                         "run-time tests.");
      return false;
    }

  /* This pass will decide on using loop versioning and/or loop peeling in
     order to enhance the alignment of data references in the loop.  */

  ok = vect_enhance_data_refs_alignment (loop_vinfo);
  if (!ok)
    {
      if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad data alignment.");
      return false;
    }

  /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
  ok = vect_analyze_slp (loop_vinfo, NULL);
  if (ok)
    {
      /* Decide which possible SLP instances to SLP.  */
      slp = vect_make_slp_decision (loop_vinfo);

      /* Find stmts that need to be both vectorized and SLPed.  */
      vect_detect_hybrid_slp (loop_vinfo);
    }
  else
    return false;

  /* Scan all the operations in the loop and make sure they are
     vectorizable.  */

  ok = vect_analyze_loop_operations (loop_vinfo, slp);
  if (!ok)
    {
      if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad operation or unsupported loop bound.");
      return false;
    }

  return true;
}
/* Function vect_analyze_loop.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  */
loop_vec_info
vect_analyze_loop (struct loop *loop)
{
  loop_vec_info loop_vinfo;
  unsigned int vector_sizes;

  /* Autodetect first vector size we try.  */
  current_vector_size = 0;
  vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();

  if (dump_kind_p (MSG_NOTE))
    dump_printf_loc (MSG_NOTE, vect_location,
                     "===== analyze_loop_nest =====");

  if (loop_outer (loop)
      && loop_vec_info_for_loop (loop_outer (loop))
      && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
    {
      if (dump_kind_p (MSG_NOTE))
        dump_printf_loc (MSG_NOTE, vect_location,
                         "outer-loop already vectorized.");
      return NULL;
    }

  while (1)
    {
      /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
      loop_vinfo = vect_analyze_loop_form (loop);
      if (!loop_vinfo)
        {
          if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "bad loop form.");
          return NULL;
        }

      if (vect_analyze_loop_2 (loop_vinfo))
        {
          LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;

          return loop_vinfo;
        }

      destroy_loop_vec_info (loop_vinfo, true);

      vector_sizes &= ~current_vector_size;
      if (vector_sizes == 0
          || current_vector_size == 0)
        return NULL;
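      /* Worked example of the size iteration (illustrative): if the target
         advertises 16- and 8-byte vectors, VECTOR_SIZES starts as the bit
         mask 16 | 8 = 0x18.  After a failed attempt with
         current_vector_size == 16, the mask above becomes 0x8, and
         1 << floor_log2 (0x8) below selects 8 for the retry.  */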
      /* Try the next biggest vector size.  */
      current_vector_size = 1 << floor_log2 (vector_sizes);
      if (dump_kind_p (MSG_NOTE))
        dump_printf_loc (MSG_NOTE, vect_location,
                         "***** Re-trying analysis with "
                         "vector size %d\n", current_vector_size);
    }
}
/* Function reduction_code_for_scalar_code

   Input:
   CODE - tree_code of a reduction operation.

   Output:
   REDUC_CODE - the corresponding tree-code to be used to reduce the
      vector of partial results into a single scalar result (which
      will also reside in a vector), or ERROR_MARK if the operation is
      a supported reduction operation, but does not have such a tree-code.

   Return FALSE if CODE currently cannot be vectorized as a reduction.  */
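/* Illustrative use (a sketch, not an actual call site in this file): for a
   sum reduction the mapping yields the tree code that folds a vector of
   partial sums into a single scalar:

     enum tree_code reduc_code;
     if (reduction_code_for_scalar_code (PLUS_EXPR, &reduc_code))
       gcc_assert (reduc_code == REDUC_PLUS_EXPR);
*/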
static bool
reduction_code_for_scalar_code (enum tree_code code,
                                enum tree_code *reduc_code)
{
  switch (code)
    {
      case MAX_EXPR:
        *reduc_code = REDUC_MAX_EXPR;
        return true;

      case MIN_EXPR:
        *reduc_code = REDUC_MIN_EXPR;
        return true;

      case PLUS_EXPR:
        *reduc_code = REDUC_PLUS_EXPR;
        return true;

      case MULT_EXPR:
      case MINUS_EXPR:
      case BIT_IOR_EXPR:
      case BIT_XOR_EXPR:
      case BIT_AND_EXPR:
        *reduc_code = ERROR_MARK;
        return true;

      default:
        return false;
    }
}
/* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
   STMT is printed with a message MSG.  */

static void
report_vect_op (int msg_type, gimple stmt, const char *msg)
{
  dump_printf_loc (msg_type, vect_location, "%s", msg);
  dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
}
/* Detect SLP reduction of the form:

   #a1 = phi <a5, a0>
   a2 = operation (a1)
   a3 = operation (a2)
   a4 = operation (a3)
   a5 = operation (a4)

   #a = phi <a5>

   PHI is the reduction phi node (#a1 = phi <a5, a0> above)
   FIRST_STMT is the first reduction stmt in the chain
   (a2 = operation (a1)).

   Return TRUE if a reduction chain was detected.  */
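/* For illustration (an assumed source form; any loop that chains several
   uses of the same operation works), a loop such as

     for (i = 0; i < n; i++)
       s = s + a[2*i] + a[2*i+1];

   yields two chained additions per iteration, i.e. a reduction chain of
   size 2 feeding the reduction phi of s.  */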
static bool
vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
{
  struct loop *loop = (gimple_bb (phi))->loop_father;
  struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
  enum tree_code code;
  gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
  stmt_vec_info use_stmt_info, current_stmt_info;
  tree lhs;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  int nloop_uses, size = 0, n_out_of_loop_uses;
  bool found = false;

  if (loop != vect_loop)
    return false;

  lhs = PHI_RESULT (phi);
  code = gimple_assign_rhs_code (first_stmt);
  while (1)
    {
      nloop_uses = 0;
      n_out_of_loop_uses = 0;
      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
        {
          gimple use_stmt = USE_STMT (use_p);
          if (is_gimple_debug (use_stmt))
            continue;

          use_stmt = USE_STMT (use_p);

          /* Check if we got back to the reduction phi.  */
          if (use_stmt == phi)
            {
              loop_use_stmt = use_stmt;
              found = true;
              break;
            }

          if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
            {
              if (vinfo_for_stmt (use_stmt)
                  && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
                {
                  loop_use_stmt = use_stmt;
                  nloop_uses++;
                }
            }
          else
            n_out_of_loop_uses++;

          /* There can be either a single use in the loop or two uses in
             phi nodes.  */
          if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
            return false;
        }

      if (found)
        break;

      /* We reached a statement with no loop uses.  */
      if (nloop_uses == 0)
        return false;

      /* This is a loop exit phi, and we haven't reached the reduction phi.  */
      if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
        return false;

      if (!is_gimple_assign (loop_use_stmt)
          || code != gimple_assign_rhs_code (loop_use_stmt)
          || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
        return false;

      /* Insert USE_STMT into reduction chain.  */
      use_stmt_info = vinfo_for_stmt (loop_use_stmt);
      if (current_stmt)
        {
          current_stmt_info = vinfo_for_stmt (current_stmt);
          GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
          GROUP_FIRST_ELEMENT (use_stmt_info)
            = GROUP_FIRST_ELEMENT (current_stmt_info);
        }
      else
        GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;

      lhs = gimple_assign_lhs (loop_use_stmt);
      current_stmt = loop_use_stmt;
      size++;
    }

  if (!found || loop_use_stmt != phi || size < 2)
    return false;

  /* Swap the operands, if needed, to make the reduction operand be the second
     operand.  */
  lhs = PHI_RESULT (phi);
  next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
  while (next_stmt)
    {
      if (gimple_assign_rhs2 (next_stmt) == lhs)
        {
          tree op = gimple_assign_rhs1 (next_stmt);
          gimple def_stmt = NULL;

          if (TREE_CODE (op) == SSA_NAME)
            def_stmt = SSA_NAME_DEF_STMT (op);

          /* Check that the other def is either defined in the loop
             ("vect_internal_def"), or it's an induction (defined by a
             loop-header phi-node).  */
          if (def_stmt
              && gimple_bb (def_stmt)
              && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
              && (is_gimple_assign (def_stmt)
                  || is_gimple_call (def_stmt)
                  || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
                       == vect_induction_def
                  || (gimple_code (def_stmt) == GIMPLE_PHI
                      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
                           == vect_internal_def
                      && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
            {
              lhs = gimple_assign_lhs (next_stmt);
              next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
              continue;
            }

          return false;
        }
      else
        {
          tree op = gimple_assign_rhs2 (next_stmt);
          gimple def_stmt = NULL;

          if (TREE_CODE (op) == SSA_NAME)
            def_stmt = SSA_NAME_DEF_STMT (op);

          /* Check that the other def is either defined in the loop
             ("vect_internal_def"), or it's an induction (defined by a
             loop-header phi-node).  */
          if (def_stmt
              && gimple_bb (def_stmt)
              && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
              && (is_gimple_assign (def_stmt)
                  || is_gimple_call (def_stmt)
                  || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
                       == vect_induction_def
                  || (gimple_code (def_stmt) == GIMPLE_PHI
                      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
                           == vect_internal_def
                      && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
            {
              if (dump_kind_p (MSG_NOTE))
                {
                  dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
                  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
                }

              swap_tree_operands (next_stmt,
                                  gimple_assign_rhs1_ptr (next_stmt),
                                  gimple_assign_rhs2_ptr (next_stmt));
              update_stmt (next_stmt);

              if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
                LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
            }
          else
            return false;
        }

      lhs = gimple_assign_lhs (next_stmt);
      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
    }

  /* Save the chain for further analysis in SLP detection.  */
  first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
  VEC_safe_push (gimple, heap, LOOP_VINFO_REDUCTION_CHAINS (loop_info), first);
  GROUP_SIZE (vinfo_for_stmt (first)) = size;

  return true;
}
/* Function vect_is_simple_reduction_1

   (1) Detect a cross-iteration def-use cycle that represents a simple
   reduction computation.  We look for the following pattern:

   loop_header:
     a1 = phi < a0, a2 >
     a3 = ...
     a2 = operation (a3, a1)

   such that:
   1. operation is commutative and associative and it is safe to
      change the order of the computation (if CHECK_REDUCTION is true)
   2. no uses for a2 in the loop (a2 is used out of the loop)
   3. no uses of a1 in the loop besides the reduction operation
   4. no uses of a1 outside the loop.

   Conditions 1,4 are tested here.
   Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.

   (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
   nested cycles, if CHECK_REDUCTION is false.

   (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
   reductions:

   a1 = phi < a0, a2 >
   inner loop (def of a3)
   a2 = phi < a3 >

   If MODIFY is true it also tries to rework the code in-place to enable
   detection of more reduction patterns.  For the time being we rewrite
   "res -= RHS" into "res += -RHS" when it seems worthwhile.  */
2070 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2071 bool check_reduction, bool *double_reduc,
2074 struct loop *loop = (gimple_bb (phi))->loop_father;
2075 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2076 edge latch_e = loop_latch_edge (loop);
2077 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2078 gimple def_stmt, def1 = NULL, def2 = NULL;
2079 enum tree_code orig_code, code;
2080 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2084 imm_use_iterator imm_iter;
2085 use_operand_p use_p;
2088 *double_reduc = false;
/* If CHECK_REDUCTION is true, we assume inner-most loop vectorization;
   otherwise, we assume outer-loop vectorization.  */
2092 gcc_assert ((check_reduction && loop == vect_loop)
2093 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2095 name = PHI_RESULT (phi);
2097 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2099 gimple use_stmt = USE_STMT (use_p);
2100 if (is_gimple_debug (use_stmt))
2103 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2105 if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
2106 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2107 "intermediate value used outside loop.");
2112 if (vinfo_for_stmt (use_stmt)
2113 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2117 if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
2118 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2119 "reduction used in loop.");
2124 if (TREE_CODE (loop_arg) != SSA_NAME)
2126 if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
2128 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2129 "reduction: not ssa_name: ");
2130 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2135 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2138 if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
2139 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2140 "reduction: no def_stmt.");
2144 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2146 if (dump_kind_p (MSG_NOTE))
2147 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2151 if (is_gimple_assign (def_stmt))
2153 name = gimple_assign_lhs (def_stmt);
2158 name = PHI_RESULT (def_stmt);
2163 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2165 gimple use_stmt = USE_STMT (use_p);
2166 if (is_gimple_debug (use_stmt))
2168 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2169 && vinfo_for_stmt (use_stmt)
2170 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2174 if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
2175 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2176 "reduction used in loop.");
2181 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2182 defined in the inner loop. */
2185 op1 = PHI_ARG_DEF (def_stmt, 0);
2187 if (gimple_phi_num_args (def_stmt) != 1
2188 || TREE_CODE (op1) != SSA_NAME)
2190 if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
2191 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2192 "unsupported phi node definition.");
2197 def1 = SSA_NAME_DEF_STMT (op1);
2198 if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2200 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2201 && is_gimple_assign (def1))
2203 if (dump_kind_p (MSG_NOTE))
2204 report_vect_op (MSG_NOTE, def_stmt,
2205 "detected double reduction: ");
2207 *double_reduc = true;
2214 code = orig_code = gimple_assign_rhs_code (def_stmt);
/* We can handle "res -= x[i]", which is non-associative, by
   simply rewriting it into "res += -x[i]".  Avoid changing the
   gimple instruction for the first simple tests and only do this
   if we're allowed to change code at all.  */
if (code == MINUS_EXPR
    && modify
    && (op1 = gimple_assign_rhs1 (def_stmt))
    && TREE_CODE (op1) == SSA_NAME
    && SSA_NAME_DEF_STMT (op1) == phi)
  code = PLUS_EXPR;

if (check_reduction
2228 && (!commutative_tree_code (code) || !associative_tree_code (code)))
2230 if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
2231 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2232 "reduction: not commutative/associative: ");
2236 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2238 if (code != COND_EXPR)
2240 if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
2241 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2242 "reduction: not binary operation: ");
2247 op3 = gimple_assign_rhs1 (def_stmt);
2248 if (COMPARISON_CLASS_P (op3))
2250 op4 = TREE_OPERAND (op3, 1);
2251 op3 = TREE_OPERAND (op3, 0);
2254 op1 = gimple_assign_rhs2 (def_stmt);
2255 op2 = gimple_assign_rhs3 (def_stmt);
2257 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2259 if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
2260 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2261 "reduction: uses not ssa_names: ");
2268 op1 = gimple_assign_rhs1 (def_stmt);
2269 op2 = gimple_assign_rhs2 (def_stmt);
2271 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2273 if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
2274 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2275 "reduction: uses not ssa_names: ");
2281 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2282 if ((TREE_CODE (op1) == SSA_NAME
&& !types_compatible_p (type, TREE_TYPE (op1)))
2284 || (TREE_CODE (op2) == SSA_NAME
2285 && !types_compatible_p (type, TREE_TYPE (op2)))
2286 || (op3 && TREE_CODE (op3) == SSA_NAME
2287 && !types_compatible_p (type, TREE_TYPE (op3)))
2288 || (op4 && TREE_CODE (op4) == SSA_NAME
2289 && !types_compatible_p (type, TREE_TYPE (op4))))
2291 if (dump_kind_p (MSG_NOTE))
2293 dump_printf_loc (MSG_NOTE, vect_location,
2294 "reduction: multiple types: operation type: ");
2295 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2296 dump_printf (MSG_NOTE, ", operands types: ");
2297 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2299 dump_printf (MSG_NOTE, ",");
2300 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2304 dump_printf (MSG_NOTE, ",");
2305 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2311 dump_printf (MSG_NOTE, ",");
2312 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2320 /* Check that it's ok to change the order of the computation.
2321 Generally, when vectorizing a reduction we change the order of the
2322 computation. This may change the behavior of the program in some
2323 cases, so we need to check that this is ok. One exception is when
2324 vectorizing an outer-loop: the inner-loop is executed sequentially,
2325 and therefore vectorizing reductions in the inner-loop during
2326 outer-loop vectorization is safe. */
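/* For example (illustrative): with float s, the scalar loop computes
   (...((s + a[0]) + a[1]) + ...) strictly left to right, while the
   vectorized loop keeps one partial sum per lane and combines them in a
   different order in the epilog.  Since FP addition is not associative,
   this is only allowed under -fassociative-math (e.g. via -ffast-math).  */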
2328 /* CHECKME: check for !flag_finite_math_only too? */
if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
    && check_reduction)
2332 /* Changing the order of operations changes the semantics. */
2333 if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
2334 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2335 "reduction: unsafe fp math optimization: ");
else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
         && check_reduction)
2341 /* Changing the order of operations changes the semantics. */
2342 if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
2343 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2344 "reduction: unsafe int math optimization: ");
2347 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2349 /* Changing the order of operations changes the semantics. */
2350 if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
2351 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2352 "reduction: unsafe fixed-point math optimization: ");
2356 /* If we detected "res -= x[i]" earlier, rewrite it into
2357 "res += -x[i]" now. If this turns out to be useless reassoc
2358 will clean it up again. */
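/* Schematically (an illustrative sketch of the rewrite below):

     res_2 = res_1 - x_3;    becomes    neg_4 = -x_3;
                                        res_2 = res_1 + neg_4;

   turning the statement into a commutative/associative PLUS_EXPR.  */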
2359 if (orig_code == MINUS_EXPR)
2361 tree rhs = gimple_assign_rhs2 (def_stmt);
2362 tree negrhs = copy_ssa_name (rhs, NULL);
2363 gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2365 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2366 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2368 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2369 gimple_assign_set_rhs2 (def_stmt, negrhs);
2370 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2371 update_stmt (def_stmt);
2374 /* Reduction is safe. We're dealing with one of the following:
2375 1) integer arithmetic and no trapv
2376 2) floating point arithmetic, and special flags permit this optimization
2377 3) nested cycle (i.e., outer loop vectorization). */
2378 if (TREE_CODE (op1) == SSA_NAME)
2379 def1 = SSA_NAME_DEF_STMT (op1);
2381 if (TREE_CODE (op2) == SSA_NAME)
2382 def2 = SSA_NAME_DEF_STMT (op2);
2384 if (code != COND_EXPR
2385 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2387 if (dump_kind_p (MSG_NOTE))
2388 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2392 /* Check that one def is the reduction def, defined by PHI,
2393 the other def is either defined in the loop ("vect_internal_def"),
2394 or it's an induction (defined by a loop-header phi-node). */
2396 if (def2 && def2 == phi
2397 && (code == COND_EXPR
2398 || !def1 || gimple_nop_p (def1)
2399 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2400 && (is_gimple_assign (def1)
2401 || is_gimple_call (def1)
2402 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2403 == vect_induction_def
2404 || (gimple_code (def1) == GIMPLE_PHI
2405 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2406 == vect_internal_def
2407 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2409 if (dump_kind_p (MSG_NOTE))
2410 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2414 if (def1 && def1 == phi
2415 && (code == COND_EXPR
2416 || !def2 || gimple_nop_p (def2)
2417 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2418 && (is_gimple_assign (def2)
2419 || is_gimple_call (def2)
2420 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2421 == vect_induction_def
2422 || (gimple_code (def2) == GIMPLE_PHI
2423 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2424 == vect_internal_def
2425 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2427 if (check_reduction)
/* Swap operands (just for simplicity - so that the rest of the code
   can assume that the reduction variable is always the last (second)
   argument).  */
2432 if (dump_kind_p (MSG_NOTE))
2433 report_vect_op (MSG_NOTE, def_stmt,
2434 "detected reduction: need to swap operands: ");
2436 swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2437 gimple_assign_rhs2_ptr (def_stmt));
2439 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2440 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2444 if (dump_kind_p (MSG_NOTE))
2445 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2451 /* Try to find SLP reduction chain. */
2452 if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2454 if (dump_kind_p (MSG_NOTE))
2455 report_vect_op (MSG_NOTE, def_stmt,
2456 "reduction: detected reduction chain: ");
2461 if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
2462 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2463 "reduction: unknown pattern: ");
2468 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2469 in-place. Arguments as there. */
2472 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2473 bool check_reduction, bool *double_reduc)
2475 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2476 double_reduc, false);
2479 /* Wrapper around vect_is_simple_reduction_1, which will modify code
in-place if it enables detection of more reductions.  Arguments
as there.  */
2484 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2485 bool check_reduction, bool *double_reduc)
2487 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2488 double_reduc, true);
2491 /* Calculate the cost of one scalar iteration of the loop. */
2493 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
2495 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2496 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2497 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2498 int innerloop_iters, i, stmt_cost;
/* Count statements in scalar loop.  Using this as scalar cost for a single
   iteration for now.

   TODO: Add outer loop support.

   TODO: Consider assigning different costs to different scalar
   statements.  */
innerloop_iters = 1;
if (loop->inner)
  innerloop_iters = 50; /* FIXME */
2513 for (i = 0; i < nbbs; i++)
2515 gimple_stmt_iterator si;
2516 basic_block bb = bbs[i];
if (bb->loop_father == loop->inner)
  factor = innerloop_iters;
else
  factor = 1;
2523 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2525 gimple stmt = gsi_stmt (si);
2526 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2528 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2531 /* Skip stmts that are not vectorized inside the loop. */
2533 && !STMT_VINFO_RELEVANT_P (stmt_info)
2534 && (!STMT_VINFO_LIVE_P (stmt_info)
2535 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2536 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2539 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2541 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2542 stmt_cost = vect_get_stmt_cost (scalar_load);
2544 stmt_cost = vect_get_stmt_cost (scalar_store);
2547 stmt_cost = vect_get_stmt_cost (scalar_stmt);
2549 scalar_single_iter_cost += stmt_cost * factor;
2552 return scalar_single_iter_cost;
2555 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
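/* E.g. (hypothetical numbers): with NITERS = 100, VF = 4 and
   PEEL_ITERS_PROLOGUE = 3, the epilogue gets (100 - 3) % 4 = 1 iteration;
   both peeled loops are costed at SCALAR_SINGLE_ITER_COST per iteration.  */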
2557 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2558 int *peel_iters_epilogue,
2559 int scalar_single_iter_cost,
2560 stmt_vector_for_cost *prologue_cost_vec,
2561 stmt_vector_for_cost *epilogue_cost_vec)
2564 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2566 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2568 *peel_iters_epilogue = vf/2;
2569 if (dump_kind_p (MSG_NOTE))
2570 dump_printf_loc (MSG_NOTE, vect_location,
2571 "cost model: epilogue peel iters set to vf/2 "
2572 "because loop iterations are unknown .");
/* If peeled iterations are known but the number of scalar loop
   iterations is unknown, count a taken branch per peeled loop.  */
2576 retval = record_stmt_cost (prologue_cost_vec, 2, cond_branch_taken,
2577 NULL, 0, vect_prologue);
2581 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2582 peel_iters_prologue = niters < peel_iters_prologue ?
2583 niters : peel_iters_prologue;
2584 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2585 /* If we need to peel for gaps, but no peeling is required, we have to
2586 peel VF iterations. */
2587 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2588 *peel_iters_epilogue = vf;
2591 if (peel_iters_prologue)
2592 retval += record_stmt_cost (prologue_cost_vec,
2593 peel_iters_prologue * scalar_single_iter_cost,
2594 scalar_stmt, NULL, 0, vect_prologue);
2595 if (*peel_iters_epilogue)
2596 retval += record_stmt_cost (epilogue_cost_vec,
2597 *peel_iters_epilogue * scalar_single_iter_cost,
2598 scalar_stmt, NULL, 0, vect_epilogue);
2602 /* Function vect_estimate_min_profitable_iters
2604 Return the number of iterations required for the vector version of the
2605 loop to be profitable relative to the cost of the scalar version of the
2608 TODO: Take profile info into account before making vectorization
2609 decisions, if available. */
2612 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
2614 int min_profitable_iters;
2615 int peel_iters_prologue;
2616 int peel_iters_epilogue;
2617 unsigned vec_inside_cost = 0;
2618 int vec_outside_cost = 0;
2619 unsigned vec_prologue_cost = 0;
2620 unsigned vec_epilogue_cost = 0;
2621 int scalar_single_iter_cost = 0;
2622 int scalar_outside_cost = 0;
2623 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2624 int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2625 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2627 /* Cost model disabled. */
2628 if (!flag_vect_cost_model)
2630 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.");
2634 /* Requires loop versioning tests to handle misalignment. */
2635 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2637 /* FIXME: Make cost depend on complexity of individual check. */
2638 unsigned len = VEC_length (gimple,
2639 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
2640 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2642 dump_printf (MSG_NOTE,
2643 "cost model: Adding cost of checks for loop "
2644 "versioning to treat misalignment.\n");
2647 /* Requires loop versioning with alias checks. */
2648 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2650 /* FIXME: Make cost depend on complexity of individual check. */
2651 unsigned len = VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
2652 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2654 dump_printf (MSG_NOTE,
2655 "cost model: Adding cost of checks for loop "
2656 "versioning aliasing.\n");
2659 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2660 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2661 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
/* Count statements in scalar loop.  Using this as scalar cost for a single
   iteration for now.

   TODO: Add outer loop support.

   TODO: Consider assigning different costs to different scalar
   statements.  */
2672 scalar_single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
/* Add additional cost for the peeled instructions in prologue and epilogue
   loop.
2677 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2678 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2680 TODO: Build an expression that represents peel_iters for prologue and
2681 epilogue to be used in a run-time test. */
2685 peel_iters_prologue = vf/2;
2686 dump_printf (MSG_NOTE, "cost model: "
2687 "prologue peel iters set to vf/2.");
/* If peeling for alignment is unknown, the loop bound of the main loop
   becomes unknown.  */
2691 peel_iters_epilogue = vf/2;
2692 dump_printf (MSG_NOTE, "cost model: "
2693 "epilogue peel iters set to vf/2 because "
2694 "peeling for alignment is unknown.");
2696 /* If peeled iterations are unknown, count a taken branch and a not taken
2697 branch per peeled loop. Even if scalar loop iterations are known,
2698 vector iterations are not known since peeled prologue iterations are
2699 not known. Hence guards remain the same. */
2700 (void) add_stmt_cost (target_cost_data, 2, cond_branch_taken,
2701 NULL, 0, vect_prologue);
2702 (void) add_stmt_cost (target_cost_data, 2, cond_branch_not_taken,
2703 NULL, 0, vect_prologue);
2704 /* FORNOW: Don't attempt to pass individual scalar instructions to
2705 the model; just assume linear cost for scalar iterations. */
2706 (void) add_stmt_cost (target_cost_data,
2707 peel_iters_prologue * scalar_single_iter_cost,
2708 scalar_stmt, NULL, 0, vect_prologue);
2709 (void) add_stmt_cost (target_cost_data,
2710 peel_iters_epilogue * scalar_single_iter_cost,
2711 scalar_stmt, NULL, 0, vect_epilogue);
2715 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2716 stmt_info_for_cost *si;
2718 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2720 prologue_cost_vec = VEC_alloc (stmt_info_for_cost, heap, 2);
2721 epilogue_cost_vec = VEC_alloc (stmt_info_for_cost, heap, 2);
2722 peel_iters_prologue = npeel;
2724 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2725 &peel_iters_epilogue,
2726 scalar_single_iter_cost,
2728 &epilogue_cost_vec);
2730 FOR_EACH_VEC_ELT (stmt_info_for_cost, prologue_cost_vec, j, si)
2732 struct _stmt_vec_info *stmt_info
2733 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2734 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2735 si->misalign, vect_prologue);
2738 FOR_EACH_VEC_ELT (stmt_info_for_cost, epilogue_cost_vec, j, si)
2740 struct _stmt_vec_info *stmt_info
2741 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2742 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2743 si->misalign, vect_epilogue);
2746 VEC_free (stmt_info_for_cost, heap, prologue_cost_vec);
2747 VEC_free (stmt_info_for_cost, heap, epilogue_cost_vec);
/* FORNOW: The scalar outside cost is incremented in one of the
   following ways:
2753 1. The vectorizer checks for alignment and aliasing and generates
2754 a condition that allows dynamic vectorization. A cost model
2755 check is ANDED with the versioning condition. Hence scalar code
2756 path now has the added cost of the versioning check.
     if (cost > th & versioning_check)
       jmp to vector code

   Hence run-time scalar is incremented by not-taken branch cost.
2763 2. The vectorizer then checks if a prologue is required. If the
2764 cost model check was not done before during versioning, it has to
2765 be done before the prologue check.
     if (cost <= th)
       prologue = scalar_iters
     if (prologue == 0)
       jmp to vector code
     else
       execute prologue
     if (prologue == num_iters)
       go to exit
2776 Hence the run-time scalar cost is incremented by a taken branch,
2777 plus a not-taken branch, plus a taken branch cost.
2779 3. The vectorizer then checks if an epilogue is required. If the
2780 cost model check was not done before during prologue check, it
2781 has to be done with the epilogue check.
     if (prologue == 0)
       jmp to vector code
     else
       execute prologue
     if (prologue == num_iters)
       go to exit

     if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
       jmp to vector code

   Hence the run-time scalar cost should be incremented by 2 taken
   branches.
2796 TODO: The back end may reorder the BBS's differently and reverse
2797 conditions/branch directions. Change the estimates below to
2798 something more reasonable. */
2800 /* If the number of iterations is known and we do not do versioning, we can
decide whether to vectorize at compile time.  Hence the scalar version
does not carry cost model guard costs.  */
2803 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2804 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2805 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2807 /* Cost model check occurs at versioning. */
2808 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2809 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2810 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
2813 /* Cost model check occurs at prologue generation. */
2814 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2815 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
2816 + vect_get_stmt_cost (cond_branch_not_taken);
2817 /* Cost model check occurs at epilogue generation. */
2819 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
2823 /* Complete the target-specific cost calculations. */
2824 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
2825 &vec_inside_cost, &vec_epilogue_cost);
2827 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
2829 /* Calculate number of iterations required to make the vector version
profitable, relative to the loop bodies only.  The following condition
must hold true:
2832 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2834 SIC = scalar iteration cost, VIC = vector iteration cost,
2835 VOC = vector outside cost, VF = vectorization factor,
PL_ITERS = prologue iterations, EP_ITERS = epilogue iterations
2837 SOC = scalar outside cost for run time cost model check. */
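/* A worked instance with purely hypothetical costs: SIC = 4, VIC = 6,
   VF = 4, VOC = 20, SOC = 6, PL_ITERS = EP_ITERS = 2.  The quotient below
   is ((20 - 6) * 4 - 6 * 2 - 6 * 2) / (4 * 4 - 6) = 32 / 10 = 3, and the
   correction check 4 * 4 * 3 <= 6 * 3 + (20 - 6) * 4 (i.e. 48 <= 74)
   holds, so min_profitable_iters becomes 4 before the clamp to VF and the
   final decrement below.  */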
2839 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
2841 if (vec_outside_cost <= 0)
2842 min_profitable_iters = 1;
2845 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2846 - vec_inside_cost * peel_iters_prologue
2847 - vec_inside_cost * peel_iters_epilogue)
2848 / ((scalar_single_iter_cost * vf)
2851 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2852 <= (((int) vec_inside_cost * min_profitable_iters)
2853 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
2854 min_profitable_iters++;
2857 /* vector version will never be profitable. */
2860 if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
2861 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2862 "cost model: the vector iteration cost = %d "
2863 "divided by the scalar iteration cost = %d "
2864 "is greater or equal to the vectorization factor = %d.",
2865 vec_inside_cost, scalar_single_iter_cost, vf);
2869 if (dump_kind_p (MSG_NOTE))
2871 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2872 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
2874 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
2876 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
2878 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
2879 scalar_single_iter_cost);
2880 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
2881 scalar_outside_cost);
2882 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
2883 peel_iters_prologue);
2884 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
2885 peel_iters_epilogue);
2886 dump_printf (MSG_NOTE,
2887 " Calculated minimum iters for profitability: %d\n",
2888 min_profitable_iters);
2891 min_profitable_iters =
2892 min_profitable_iters < vf ? vf : min_profitable_iters;
2894 /* Because the condition we create is:
2895 if (niters <= min_profitable_iters)
2896 then skip the vectorized loop. */
2897 min_profitable_iters--;
2899 if (dump_kind_p (MSG_NOTE))
2900 dump_printf_loc (MSG_NOTE, vect_location,
2901 " Profitability threshold = %d\n", min_profitable_iters);
2903 return min_profitable_iters;
2907 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2908 functions. Design better to avoid maintenance issues. */
2910 /* Function vect_model_reduction_cost.
2912 Models cost for a reduction operation, including the vector ops
2913 generated within the strip-mine loop, the initial definition before
2914 the loop, and the epilogue code that must be generated. */
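/* E.g. (an illustrative breakdown, with hypothetical target costs): a
   v4si sum reduction with ncopies = 1 and a direct reduc_code is charged
   one vector_stmt per copy inside the loop, one scalar_to_vec in the
   prologue for the initial definition, and one vector_stmt plus one
   vec_to_scalar in the epilogue for the final reduce and extract.  */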
2917 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2920 int prologue_cost = 0, epilogue_cost = 0;
2921 enum tree_code code;
2924 gimple stmt, orig_stmt;
2926 enum machine_mode mode;
2927 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2928 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2929 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2931 /* Cost of reduction op inside loop. */
2932 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
2933 stmt_info, 0, vect_body);
2934 stmt = STMT_VINFO_STMT (stmt_info);
2936 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2938 case GIMPLE_SINGLE_RHS:
2939 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2940 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2942 case GIMPLE_UNARY_RHS:
2943 reduction_op = gimple_assign_rhs1 (stmt);
2945 case GIMPLE_BINARY_RHS:
2946 reduction_op = gimple_assign_rhs2 (stmt);
2948 case GIMPLE_TERNARY_RHS:
2949 reduction_op = gimple_assign_rhs3 (stmt);
2955 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2958 if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
2960 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2961 "unsupported data-type ");
2962 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2963 TREE_TYPE (reduction_op));
2968 mode = TYPE_MODE (vectype);
2969 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2972 orig_stmt = STMT_VINFO_STMT (stmt_info);
2974 code = gimple_assign_rhs_code (orig_stmt);
2976 /* Add in cost for initial definition. */
2977 prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
2978 stmt_info, 0, vect_prologue);
2980 /* Determine cost of epilogue code.
2982 We have a reduction operator that will reduce the vector in one statement.
2983 Also requires scalar extract. */
2985 if (!nested_in_vect_loop_p (loop, orig_stmt))
2987 if (reduc_code != ERROR_MARK)
2989 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
2990 stmt_info, 0, vect_epilogue);
2991 epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
2992 stmt_info, 0, vect_epilogue);
2996 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2998 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
2999 int element_bitsize = tree_low_cst (bitsize, 1);
3000 int nelements = vec_size_in_bits / element_bitsize;
3002 optab = optab_for_tree_code (code, vectype, optab_default);
3004 /* We have a whole vector shift available. */
3005 if (VECTOR_MODE_P (mode)
3006 && optab_handler (optab, mode) != CODE_FOR_nothing
3007 && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3009 /* Final reduction via vector shifts and the reduction operator.
3010 Also requires scalar extract. */
3011 epilogue_cost += add_stmt_cost (target_cost_data,
3012 exact_log2 (nelements) * 2,
3013 vector_stmt, stmt_info, 0,
3015 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3016 vec_to_scalar, stmt_info, 0,
3020 /* Use extracts and reduction op for final reduction. For N
3021 elements, we have N extracts and N-1 reduction ops. */
3022 epilogue_cost += add_stmt_cost (target_cost_data,
3023 nelements + nelements - 1,
3024 vector_stmt, stmt_info, 0,
3029 if (dump_kind_p (MSG_NOTE))
3030 dump_printf (MSG_NOTE,
3031 "vect_model_reduction_cost: inside_cost = %d, "
3032 "prologue_cost = %d, epilogue_cost = %d .", inside_cost,
3033 prologue_cost, epilogue_cost);
3039 /* Function vect_model_induction_cost.
3041 Models cost for induction operations. */
3044 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3046 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3047 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3048 unsigned inside_cost, prologue_cost;
3050 /* loop cost for vec_loop. */
3051 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3052 stmt_info, 0, vect_body);
3054 /* prologue cost for vec_init and vec_step. */
3055 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3056 stmt_info, 0, vect_prologue);
3058 if (dump_kind_p (MSG_NOTE))
3059 dump_printf_loc (MSG_NOTE, vect_location,
3060 "vect_model_induction_cost: inside_cost = %d, "
3061 "prologue_cost = %d .", inside_cost, prologue_cost);
3065 /* Function get_initial_def_for_induction
3068 STMT - a stmt that performs an induction operation in the loop.
3069 IV_PHI - the initial value of the induction variable
3072 Return a vector variable, initialized with the first VF values of
3073 the induction variable. E.g., for an iv with IV_PHI='X' and
3074 evolution S, for a vector of 4 units, we want to return:
3075 [X, X + S, X + 2*S, X + 3*S]. */
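/* E.g. (illustrative numbers): X = 3, S = 2 and VF = 4 give
   vec_init = {3, 5, 7, 9}; the step vector added on each vector
   iteration is then {VF*S, VF*S, VF*S, VF*S} = {8, 8, 8, 8}.  */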
3078 get_initial_def_for_induction (gimple iv_phi)
3080 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3081 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3082 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3086 edge pe = loop_preheader_edge (loop);
3087 struct loop *iv_loop;
3089 tree vec, vec_init, vec_step, t;
3093 gimple init_stmt, induction_phi, new_stmt;
3094 tree induc_def, vec_def, vec_dest;
3095 tree init_expr, step_expr;
3096 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3101 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3102 bool nested_in_vect_loop = false;
3103 gimple_seq stmts = NULL;
3104 imm_use_iterator imm_iter;
3105 use_operand_p use_p;
3109 gimple_stmt_iterator si;
3110 basic_block bb = gimple_bb (iv_phi);
3114 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3115 if (nested_in_vect_loop_p (loop, iv_phi))
3117 nested_in_vect_loop = true;
3118 iv_loop = loop->inner;
3122 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3124 latch_e = loop_latch_edge (iv_loop);
3125 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3127 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
3128 gcc_assert (access_fn);
3129 STRIP_NOPS (access_fn);
3130 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
3131 &init_expr, &step_expr);
3133 pe = loop_preheader_edge (iv_loop);
3135 scalar_type = TREE_TYPE (init_expr);
3136 vectype = get_vectype_for_scalar_type (scalar_type);
3137 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3138 gcc_assert (vectype);
3139 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3140 ncopies = vf / nunits;
3142 gcc_assert (phi_info);
3143 gcc_assert (ncopies >= 1);
3145 /* Find the first insertion point in the BB. */
3146 si = gsi_after_labels (bb);
3148 /* Create the vector that holds the initial_value of the induction. */
3149 if (nested_in_vect_loop)
/* iv_loop is nested in the loop to be vectorized.  init_expr has already
   been created during vectorization of previous stmts.  We obtain it
   from the STMT_VINFO_VEC_STMT of the defining stmt.  */
3154 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3155 loop_preheader_edge (iv_loop));
3156 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
3160 VEC(constructor_elt,gc) *v;
3162 /* iv_loop is the loop to be vectorized. Create:
3163 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3164 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
3165 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
3168 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3169 gcc_assert (!new_bb);
3172 v = VEC_alloc (constructor_elt, gc, nunits);
3173 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3174 for (i = 1; i < nunits; i++)
3176 /* Create: new_name_i = new_name + step_expr */
3177 enum tree_code code = POINTER_TYPE_P (scalar_type)
3178 ? POINTER_PLUS_EXPR : PLUS_EXPR;
3179 init_stmt = gimple_build_assign_with_ops (code, new_var,
3180 new_name, step_expr);
3181 new_name = make_ssa_name (new_var, init_stmt);
3182 gimple_assign_set_lhs (init_stmt, new_name);
3184 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3185 gcc_assert (!new_bb);
3187 if (dump_kind_p (MSG_NOTE))
3189 dump_printf_loc (MSG_NOTE, vect_location,
3190 "created new init_stmt: ");
3191 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3193 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3195 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3196 vec = build_constructor (vectype, v);
3197 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
3201 /* Create the vector that holds the step of the induction. */
3202 if (nested_in_vect_loop)
3203 /* iv_loop is nested in the loop to be vectorized. Generate:
3204 vec_step = [S, S, S, S] */
3205 new_name = step_expr;
3208 /* iv_loop is the loop to be vectorized. Generate:
3209 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3210 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3211 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3215 t = unshare_expr (new_name);
3216 gcc_assert (CONSTANT_CLASS_P (new_name));
3217 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3218 gcc_assert (stepvectype);
3219 vec = build_vector_from_val (stepvectype, t);
3220 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
/* Create the following def-use cycle:
   loop prolog:
     vec_init
   loop:
     vec_iv = PHI <vec_init, vec_loop>
     ...
     vec_loop = vec_iv + vec_step;  */
3234 /* Create the induction-phi that defines the induction-operand. */
3235 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3236 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3237 set_vinfo_for_stmt (induction_phi,
3238 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3239 induc_def = PHI_RESULT (induction_phi);
3241 /* Create the iv update inside the loop */
3242 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3243 induc_def, vec_step);
3244 vec_def = make_ssa_name (vec_dest, new_stmt);
3245 gimple_assign_set_lhs (new_stmt, vec_def);
3246 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3247 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3250 /* Set the arguments of the phi node: */
3251 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3252 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3256 /* In case that vectorization factor (VF) is bigger than the number
3257 of elements that we can fit in a vectype (nunits), we have to generate
3258 more than one vector stmt - i.e - we need to "unroll" the
3259 vector stmt by a factor VF/nunits. For more details see documentation
3260 in vectorizable_operation. */
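/* E.g. (illustrative): VF = 8 with nunits = 4 gives ncopies = 2; the code
   below builds a second step vector {nunits*S, ...} = {4*S, 4*S, 4*S, 4*S}
   and chains one extra vector add per additional copy.  */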
3264 stmt_vec_info prev_stmt_vinfo;
3265 /* FORNOW. This restriction should be relaxed. */
3266 gcc_assert (!nested_in_vect_loop);
3268 /* Create the vector that holds the step of the induction. */
3269 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3270 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3272 t = unshare_expr (new_name);
3273 gcc_assert (CONSTANT_CLASS_P (new_name));
3274 vec = build_vector_from_val (stepvectype, t);
3275 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
3277 vec_def = induc_def;
3278 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3279 for (i = 1; i < ncopies; i++)
3281 /* vec_i = vec_prev + vec_step */
3282 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3284 vec_def = make_ssa_name (vec_dest, new_stmt);
3285 gimple_assign_set_lhs (new_stmt, vec_def);
3287 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3288 if (!useless_type_conversion_p (resvectype, vectype))
3290 new_stmt = gimple_build_assign_with_ops
3292 vect_get_new_vect_var (resvectype, vect_simple_var,
3294 build1 (VIEW_CONVERT_EXPR, resvectype,
3295 gimple_assign_lhs (new_stmt)), NULL_TREE);
3296 gimple_assign_set_lhs (new_stmt,
3298 (gimple_assign_lhs (new_stmt), new_stmt));
3299 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3301 set_vinfo_for_stmt (new_stmt,
3302 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3303 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3304 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3308 if (nested_in_vect_loop)
3310 /* Find the loop-closed exit-phi of the induction, and record
3311 the final vector of induction results: */
3313 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3315 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
3317 exit_phi = USE_STMT (use_p);
3323 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3324 /* FORNOW. Currently not supporting the case that an inner-loop induction
3325 is not used in the outer-loop (i.e. only outside the outer-loop). */
3326 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3327 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3329 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3330 if (dump_kind_p (MSG_NOTE))
3332 dump_printf_loc (MSG_NOTE, vect_location,
3333 "vector of inductions after inner-loop:");
3334 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3340 if (dump_kind_p (MSG_NOTE))
3342 dump_printf_loc (MSG_NOTE, vect_location,
3343 "transform induction: created def-use cycle: ");
3344 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3345 dump_printf (MSG_NOTE, "\n");
3346 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3347 SSA_NAME_DEF_STMT (vec_def), 0);
3350 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3351 if (!useless_type_conversion_p (resvectype, vectype))
3353 new_stmt = gimple_build_assign_with_ops
3355 vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3356 build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3357 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3358 gimple_assign_set_lhs (new_stmt, induc_def);
3359 si = gsi_start_bb (bb);
3360 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3361 set_vinfo_for_stmt (new_stmt,
3362 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3363 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3364 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3371 /* Function get_initial_def_for_reduction
3374 STMT - a stmt that performs a reduction operation in the loop.
3375 INIT_VAL - the initial value of the reduction variable
3378 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3379 of the reduction (used for adjusting the epilog - see below).
3380 Return a vector variable, initialized according to the operation that STMT
3381 performs. This vector will be used as the initial value of the
3382 vector of partial results.
3384 Option1 (adjust in epilog): Initialize the vector as follows:
3385 add/bit or/xor: [0,0,...,0,0]
3386 mult/bit and: [1,1,...,1,1]
3387 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3388 and when necessary (e.g. add/mult case) let the caller know
3389 that it needs to adjust the result by init_val.
3391 Option2: Initialize the vector as follows:
3392 add/bit or/xor: [init_val,0,0,...,0]
3393 mult/bit and: [init_val,1,1,...,1]
3394 min/max/cond_expr: [init_val,init_val,...,init_val]
3395 and no adjustments are needed.
For example, for the following code:

  double s = 0;
  for (i = 0; i < n; i++)
    s = s + a[i];

STMT is 's = s + a[i]', and the reduction variable is 's'.
3404 For a vector of 4 units, we want to return either [0,0,0,init_val],
3405 or [0,0,0,0] and let the caller know that it needs to adjust
3406 the result at the end by 'init_val'.
3408 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3409 initialization vector is simpler (same element in all entries), if
3410 ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3412 A cost model should help decide between these two schemes. */
3415 get_initial_def_for_reduction (gimple stmt, tree init_val,
3416 tree *adjustment_def)
3418 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3419 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3420 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3421 tree scalar_type = TREE_TYPE (init_val);
3422 tree vectype = get_vectype_for_scalar_type (scalar_type);
3424 enum tree_code code = gimple_assign_rhs_code (stmt);
3429 bool nested_in_vect_loop = false;
3431 REAL_VALUE_TYPE real_init_val = dconst0;
3432 int int_init_val = 0;
3433 gimple def_stmt = NULL;
3435 gcc_assert (vectype);
3436 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3438 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3439 || SCALAR_FLOAT_TYPE_P (scalar_type));
3441 if (nested_in_vect_loop_p (loop, stmt))
3442 nested_in_vect_loop = true;
3444 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3446 /* In case of double reduction we only create a vector variable to be put
3447 in the reduction phi node. The actual statement creation is done in
3448 vect_create_epilog_for_reduction. */
3449 if (adjustment_def && nested_in_vect_loop
3450 && TREE_CODE (init_val) == SSA_NAME
3451 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3452 && gimple_code (def_stmt) == GIMPLE_PHI
3453 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3454 && vinfo_for_stmt (def_stmt)
3455 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3456 == vect_double_reduction_def)
3458 *adjustment_def = NULL;
3459 return vect_create_destination_var (init_val, vectype);
3462 if (TREE_CONSTANT (init_val))
3464 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3465 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3467 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3470 init_value = init_val;
3474 case WIDEN_SUM_EXPR:
/* ADJUSTMENT_DEF is NULL when called from
3483 vect_create_epilog_for_reduction to vectorize double reduction. */
3486 if (nested_in_vect_loop)
3487 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3490 *adjustment_def = init_val;
3493 if (code == MULT_EXPR)
3495 real_init_val = dconst1;
3499 if (code == BIT_AND_EXPR)
3502 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3503 def_for_init = build_real (scalar_type, real_init_val);
3505 def_for_init = build_int_cst (scalar_type, int_init_val);
3507 /* Create a vector of '0' or '1' except the first element. */
3508 elts = XALLOCAVEC (tree, nunits);
3509 for (i = nunits - 2; i >= 0; --i)
3510 elts[i + 1] = def_for_init;
3512 /* Option1: the first element is '0' or '1' as well. */
3515 elts[0] = def_for_init;
3516 init_def = build_vector (vectype, elts);
3520 /* Option2: the first element is INIT_VAL. */
3522 if (TREE_CONSTANT (init_val))
3523 init_def = build_vector (vectype, elts);
3526 VEC(constructor_elt,gc) *v;
3527 v = VEC_alloc (constructor_elt, gc, nunits);
3528 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3529 for (i = 1; i < nunits; ++i)
3530 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3531 init_def = build_constructor (vectype, v);
3541 *adjustment_def = NULL_TREE;
3542 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3546 init_def = build_vector_from_val (vectype, init_value);
3557 /* Function vect_create_epilog_for_reduction
3559 Create code at the loop-epilog to finalize the result of a reduction
3562 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3563 reduction statements.
3564 STMT is the scalar reduction stmt that is being vectorized.
3565 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3566 number of elements that we can fit in a vectype (nunits). In this case
3567 we have to generate more than one vector stmt - i.e - we need to "unroll"
3568 the vector stmt by a factor VF/nunits. For more details see documentation
3569 in vectorizable_operation.
3570 REDUC_CODE is the tree-code for the epilog reduction.
REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
computation.
3573 REDUC_INDEX is the index of the operand in the right hand side of the
3574 statement that is defined by REDUCTION_PHI.
3575 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3576 SLP_NODE is an SLP node containing a group of reduction statements. The
3577 first one in this group is STMT.
1. Creates the reduction def-use cycles: sets the arguments for
   REDUCTION_PHIS:
3582 The loop-entry argument is the vectorized initial-value of the reduction.
The loop-latch argument is taken from VECT_DEFS - the vector of partial
sums.
3585 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3586 by applying the operation specified by REDUC_CODE if available, or by
3587 other means (whole-vector shifts or a scalar loop).
3588 The function also creates a new phi node at the loop exit to preserve
3589 loop-closed form, as illustrated below.
The flow at the entry to this function:

  loop:
    vec_def = phi <null, null>          # REDUCTION_PHI
    VECT_DEF = vector_stmt              # vectorized form of STMT
    s_loop = scalar_stmt                # (scalar) STMT
  loop_exit:
    s_out0 = phi <s_loop>               # (scalar) EXIT_PHI
    use <s_out0>
    use <s_out0>

The above is transformed by this function into:

  loop:
    vec_def = phi <vec_init, VECT_DEF>  # REDUCTION_PHI
    VECT_DEF = vector_stmt              # vectorized form of STMT
    s_loop = scalar_stmt                # (scalar) STMT
  loop_exit:
    s_out0 = phi <s_loop>               # (scalar) EXIT_PHI
    v_out1 = phi <VECT_DEF>             # NEW_EXIT_PHI
    v_out2 = reduce <v_out1>
    s_out3 = extract_field <v_out2, 0>
    s_out4 = adjust_result <s_out3>
    use <s_out4>
    use <s_out4>  */
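/* E.g. (an illustrative run): if the final vector of partial sums is
   v_out1 = {1, 2, 3, 4} and the scalar initial value 5 was kept for the
   'adjust in epilog' scheme, then s_out3 = 1 + 2 + 3 + 4 = 10 and
   s_out4 = 10 + 5 = 15 replaces the original scalar result.  */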
3619 vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
3620 int ncopies, enum tree_code reduc_code,
3621 VEC (gimple, heap) *reduction_phis,
3622 int reduc_index, bool double_reduc,
3625 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3626 stmt_vec_info prev_phi_info;
3628 enum machine_mode mode;
3629 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3630 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3631 basic_block exit_bb;
3634 gimple new_phi = NULL, phi;
3635 gimple_stmt_iterator exit_gsi;
3637 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3638 gimple epilog_stmt = NULL;
3639 enum tree_code code = gimple_assign_rhs_code (stmt);
3641 tree bitsize, bitpos;
3642 tree adjustment_def = NULL;
3643 tree vec_initial_def = NULL;
3644 tree reduction_op, expr, def;
3645 tree orig_name, scalar_result;
3646 imm_use_iterator imm_iter, phi_imm_iter;
3647 use_operand_p use_p, phi_use_p;
3648 bool extract_scalar_result = false;
3649 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3650 bool nested_in_vect_loop = false;
3651 VEC (gimple, heap) *new_phis = NULL;
3652 VEC (gimple, heap) *inner_phis = NULL;
3653 enum vect_def_type dt = vect_unknown_def_type;
3655 VEC (tree, heap) *scalar_results = NULL;
3656 unsigned int group_size = 1, k, ratio;
3657 VEC (tree, heap) *vec_initial_defs = NULL;
3658 VEC (gimple, heap) *phis;
3659 bool slp_reduc = false;
3660 tree new_phi_result;
3661 gimple inner_phi = NULL;
3664 group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (slp_node));
3666 if (nested_in_vect_loop_p (loop, stmt))
3670 nested_in_vect_loop = true;
3671 gcc_assert (!slp_node);
3674 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3676 case GIMPLE_SINGLE_RHS:
3677 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3679 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3681 case GIMPLE_UNARY_RHS:
3682 reduction_op = gimple_assign_rhs1 (stmt);
3684 case GIMPLE_BINARY_RHS:
3685 reduction_op = reduc_index ?
3686 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3688 case GIMPLE_TERNARY_RHS:
3689 reduction_op = gimple_op (stmt, reduc_index + 1);
3695 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3696 gcc_assert (vectype);
3697 mode = TYPE_MODE (vectype);
3699 /* 1. Create the reduction def-use cycle:
Set the arguments of REDUCTION_PHIS, i.e., transform

  loop:
    vec_def = phi <null, null>          # REDUCTION_PHI
    VECT_DEF = vector_stmt              # vectorized form of STMT
    ...

into:

  loop:
    vec_def = phi <vec_init, VECT_DEF>  # REDUCTION_PHI
    VECT_DEF = vector_stmt              # vectorized form of STMT
    ...

(in case of SLP, do it for all the phis).  */
3716 /* Get the loop-entry arguments. */
3718 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3719 NULL, slp_node, reduc_index);
3722 vec_initial_defs = VEC_alloc (tree, heap, 1);
3723 /* For the case of reduction, vect_get_vec_def_for_operand returns
3724 the scalar def before the loop, that defines the initial value
3725 of the reduction variable. */
3726 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3728 VEC_quick_push (tree, vec_initial_defs, vec_initial_def);
3731 /* Set phi nodes arguments. */
3732 FOR_EACH_VEC_ELT (gimple, reduction_phis, i, phi)
3734 tree vec_init_def = VEC_index (tree, vec_initial_defs, i);
3735 tree def = VEC_index (tree, vect_defs, i);
3736 for (j = 0; j < ncopies; j++)
3738 /* Set the loop-entry arg of the reduction-phi. */
3739 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3742 /* Set the loop-latch arg for the reduction-phi. */
3744 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3746 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3748 if (dump_kind_p (MSG_NOTE))
3750 dump_printf_loc (MSG_NOTE, vect_location,
3751 "transform reduction: created def-use cycle: ");
3752 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
3753 dump_printf (MSG_NOTE, "\n");
3754 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
3757 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3761 VEC_free (tree, heap, vec_initial_defs);
3763 /* 2. Create epilog code.
3764 The reduction epilog code operates across the elements of the vector
3765 of partial results computed by the vectorized loop.
3766 The reduction epilog code consists of:
3768 step 1: compute the scalar result in a vector (v_out2)
3769 step 2: extract the scalar result (s_out3) from the vector (v_out2)
3770 step 3: adjust the scalar result (s_out3) if needed.
Step 1 can be accomplished using one of the following three schemes:
3773 (scheme 1) using reduc_code, if available.
3774 (scheme 2) using whole-vector shifts, if available.
(scheme 3) using a scalar loop.  In this case steps 1+2 above are
           combined.
3778 The overall epilog code looks like this:
3780 s_out0 = phi <s_loop> # original EXIT_PHI
3781 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3782 v_out2 = reduce <v_out1> # step 1
3783 s_out3 = extract_field <v_out2, 0> # step 2
3784 s_out4 = adjust_result <s_out3> # step 3
3786 (step 3 is optional, and steps 1 and 2 may be combined).
3787 Lastly, the uses of s_out0 are replaced by s_out4. */
3790 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3791 v_out1 = phi <VECT_DEF>
3792 Store them in NEW_PHIS. */
3794 exit_bb = single_exit (loop)->dest;
3795 prev_phi_info = NULL;
3796 new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
3797 FOR_EACH_VEC_ELT (tree, vect_defs, i, def)
3799 for (j = 0; j < ncopies; j++)
3801 tree new_def = copy_ssa_name (def, NULL);
3802 phi = create_phi_node (new_def, exit_bb);
3803 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3805 VEC_quick_push (gimple, new_phis, phi);
3808 def = vect_get_vec_def_for_stmt_copy (dt, def);
3809 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3812 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3813 prev_phi_info = vinfo_for_stmt (phi);
3817 /* The epilogue is created for the outer-loop, i.e., for the loop being
3818 vectorized. Create exit phis for the outer loop. */
3822 exit_bb = single_exit (loop)->dest;
3823 inner_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
3824 FOR_EACH_VEC_ELT (gimple, new_phis, i, phi)
3826 tree new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3827 gimple outer_phi = create_phi_node (new_result, exit_bb);
3828 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3830 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3832 VEC_quick_push (gimple, inner_phis, phi);
3833 VEC_replace (gimple, new_phis, i, outer_phi);
3834 prev_phi_info = vinfo_for_stmt (outer_phi);
3835 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
3837 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3838 new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3839 outer_phi = create_phi_node (new_result, exit_bb);
3840 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3842 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3844 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
3845 prev_phi_info = vinfo_for_stmt (outer_phi);
3850 exit_gsi = gsi_after_labels (exit_bb);
3852 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3853 (i.e. when reduc_code is not available) and in the final adjustment
3854 code (if needed). Also get the original scalar reduction variable as
3855 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
3856 represents a reduction pattern), the tree-code and scalar-def are
3857 taken from the original stmt that the pattern-stmt (STMT) replaces.
3858 Otherwise (it is a regular reduction) - the tree-code and scalar-def
3859 are taken from STMT. */
3861 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3864 /* Regular reduction */
3869 /* Reduction pattern */
3870 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3871 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3872 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3875 code = gimple_assign_rhs_code (orig_stmt);
3876 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3877 partial results are added and not subtracted. */
if (code == MINUS_EXPR)
  code = PLUS_EXPR;
3881 scalar_dest = gimple_assign_lhs (orig_stmt);
3882 scalar_type = TREE_TYPE (scalar_dest);
3883 scalar_results = VEC_alloc (tree, heap, group_size);
3884 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3885 bitsize = TYPE_SIZE (scalar_type);
3887 /* In case this is a reduction in an inner-loop while vectorizing an outer
3888 loop - we don't need to extract a single scalar result at the end of the
3889 inner-loop (unless it is double reduction, i.e., the use of reduction is
3890 outside the outer-loop). The final vector of partial results will be used
in the vectorized outer-loop, or reduced to a scalar result at the end of
the outer-loop.  */
3893 if (nested_in_vect_loop && !double_reduc)
3894 goto vect_finalize_reduction;
/* SLP reduction without reduction chain, e.g.,
     # a1 = phi <a2, a0>
     # b1 = phi <b2, b0>
     a2 = operation (a1)
     b2 = operation (b1)  */
3901 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
/* In case of reduction chain, e.g.,
     # a1 = phi <a3, a0>
     a2 = operation (a1)
     a3 = operation (a2),
   we may end up with more than one vector result.  Here we reduce them to
   one vector.  */
3910 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
3912 tree first_vect = PHI_RESULT (VEC_index (gimple, new_phis, 0));
3914 gimple new_vec_stmt = NULL;
3916 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3917 for (k = 1; k < VEC_length (gimple, new_phis); k++)
3919 gimple next_phi = VEC_index (gimple, new_phis, k);
3920 tree second_vect = PHI_RESULT (next_phi);
3922 tmp = build2 (code, vectype, first_vect, second_vect);
3923 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
3924 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
3925 gimple_assign_set_lhs (new_vec_stmt, first_vect);
3926 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
3929 new_phi_result = first_vect;
3932 VEC_truncate (gimple, new_phis, 0);
3933 VEC_safe_push (gimple, heap, new_phis, new_vec_stmt);
3937 new_phi_result = PHI_RESULT (VEC_index (gimple, new_phis, 0));
3939 /* 2.3 Create the reduction code, using one of the three schemes described
3940 above. In SLP we simply need to extract all the elements from the
3941 vector (without reducing them), so we use scalar shifts. */
3942 if (reduc_code != ERROR_MARK && !slp_reduc)
3946 /*** Case 1: Create:
3947 v_out2 = reduc_expr <v_out1> */
3949 if (dump_kind_p (MSG_NOTE))
3950 dump_printf_loc (MSG_NOTE, vect_location,
3951 "Reduce using direct vector reduction.");
3953 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3954 tmp = build1 (reduc_code, vectype, new_phi_result);
3955 epilog_stmt = gimple_build_assign (vec_dest, tmp);
3956 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3957 gimple_assign_set_lhs (epilog_stmt, new_temp);
3958 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3960 extract_scalar_result = true;
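/* Illustration (not part of the compiler code): on a target that provides
   a direct add-reduction, Case 1 computes in a single vector operation
   what this scalar C sketch computes for a four-element vector:

     int reduc_plus_sketch (const int v[4])
     { return v[0] + v[1] + v[2] + v[3]; }

   The combined value is left in one element of v_out2 and is extracted
   in step 2.4 below.  */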
3961 }
3962 else
3963 {
3964 enum tree_code shift_code = ERROR_MARK;
3965 bool have_whole_vector_shift = true;
3966 int bit_offset;
3967 int element_bitsize = tree_low_cst (bitsize, 1);
3968 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3969 tree vec_temp;
3971 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3972 shift_code = VEC_RSHIFT_EXPR;
3973 else
3974 have_whole_vector_shift = false;
3976 /* Regardless of whether we have a whole vector shift, if we're
3977 emulating the operation via tree-vect-generic, we don't want
3978 to use it. Only the first round of the reduction is likely
3979 to still be profitable via emulation. */
3980 /* ??? It might be better to emit a reduction tree code here, so that
3981 tree-vect-generic can expand the first round via bit tricks. */
3982 if (!VECTOR_MODE_P (mode))
3983 have_whole_vector_shift = false;
3984 else
3985 {
3986 optab optab = optab_for_tree_code (code, vectype, optab_default);
3987 if (optab_handler (optab, mode) == CODE_FOR_nothing)
3988 have_whole_vector_shift = false;
3989 }
3991 if (have_whole_vector_shift && !slp_reduc)
3992 {
3993 /*** Case 2: Create:
3994 for (offset = VS/2; offset >= element_size; offset/=2)
3995 {
3996 Create: va' = vec_shift <va, offset>
3997 Create: va = vop <va, va'>
3998 }  */
4000 if (dump_kind_p (MSG_NOTE))
4001 dump_printf_loc (MSG_NOTE, vect_location,
4002 "Reduce using vector shifts");
4004 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4005 new_temp = new_phi_result;
4006 for (bit_offset = vec_size_in_bits/2;
4007 bit_offset >= element_bitsize;
4008 bit_offset /= 2)
4009 {
4010 tree bitpos = size_int (bit_offset);
4012 epilog_stmt = gimple_build_assign_with_ops (shift_code,
4013 vec_dest, new_temp, bitpos);
4014 new_name = make_ssa_name (vec_dest, epilog_stmt);
4015 gimple_assign_set_lhs (epilog_stmt, new_name);
4016 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4018 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
4019 new_name, new_temp);
4020 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4021 gimple_assign_set_lhs (epilog_stmt, new_temp);
4022 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4025 extract_scalar_result = true;
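/* Scalar sketch of the whole-vector-shift scheme above (illustration
   only; the code above works with bit offsets, this sketch works with
   element offsets):

     int v[4];                        // partial sums
     for (int off = 4 / 2; off >= 1; off /= 2)
       for (int k = 0; k + off < 4; k++)
         v[k] += v[k + off];          // va' = vec_shift; va = vop <va, va'>
     // v[0] now holds the reduction result.

   Only log2(nunits) shift/op pairs are needed, versus nunits - 1 scalar
   operations in Case 3.  */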
4026 }
4028 else
4029 {
4031 /*** Case 3: Create:
4032 s = extract_field <v_out2, 0>
4033 for (offset = element_size;
4034 offset < vector_size;
4035 offset += element_size)
4036 {
4037 Create: s' = extract_field <v_out2, offset>
4038 Create: s = op <s, s'>  // For non SLP cases
4039 }  */
4041 if (dump_kind_p (MSG_NOTE))
4042 dump_printf_loc (MSG_NOTE, vect_location,
4043 "Reduce using scalar code. ");
4045 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
4046 FOR_EACH_VEC_ELT (gimple, new_phis, i, new_phi)
4047 {
4048 if (gimple_code (new_phi) == GIMPLE_PHI)
4049 vec_temp = PHI_RESULT (new_phi);
4050 else
4051 vec_temp = gimple_assign_lhs (new_phi);
4052 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4053 bitsize_zero_node);
4054 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4055 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4056 gimple_assign_set_lhs (epilog_stmt, new_temp);
4057 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4059 /* In SLP we don't need to apply the reduction operation, so we just
4060 collect s' values in SCALAR_RESULTS.  */
4061 if (slp_reduc)
4062 VEC_safe_push (tree, heap, scalar_results, new_temp);
4064 for (bit_offset = element_bitsize;
4065 bit_offset < vec_size_in_bits;
4066 bit_offset += element_bitsize)
4068 tree bitpos = bitsize_int (bit_offset);
4069 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4070 bitsize, bitpos);
4072 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4073 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4074 gimple_assign_set_lhs (epilog_stmt, new_name);
4075 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4077 if (slp_reduc)
4078 {
4079 /* In SLP we don't need to apply the reduction operation, so
4080 we just collect s' values in SCALAR_RESULTS.  */
4081 new_temp = new_name;
4082 VEC_safe_push (tree, heap, scalar_results, new_name);
4083 }
4084 else
4085 {
4086 epilog_stmt = gimple_build_assign_with_ops (code,
4087 new_scalar_dest, new_name, new_temp);
4088 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4089 gimple_assign_set_lhs (epilog_stmt, new_temp);
4090 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4091 }
4092 }
4093 }
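/* Scalar sketch of Case 3 above (illustration only): each element is
   extracted with a BIT_FIELD_REF and, outside SLP, folded into a single
   accumulator:

     int s = v[0];
     for (int k = 1; k < 4; k++)
       s = s + v[k];                  // s = op <s, s'>
*/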
4095 /* The only case where we need to reduce scalar results in SLP is
4096 unrolling.  If the size of SCALAR_RESULTS is greater than
4097 GROUP_SIZE, we reduce them combining elements modulo
4098 GROUP_SIZE.  */
4099 if (slp_reduc)
4100 {
4101 tree res, first_res, new_res;
4102 gimple new_stmt;
4104 /* Reduce multiple scalar results in case of SLP unrolling. */
4105 for (j = group_size; VEC_iterate (tree, scalar_results, j, res);
4106 j++)
4107 {
4108 first_res = VEC_index (tree, scalar_results, j % group_size);
4109 new_stmt = gimple_build_assign_with_ops (code,
4110 new_scalar_dest, first_res, res);
4111 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4112 gimple_assign_set_lhs (new_stmt, new_res);
4113 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4114 VEC_replace (tree, scalar_results, j % group_size, new_res);
4115 }
4116 }
4117 else
4118 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
4119 VEC_safe_push (tree, heap, scalar_results, new_temp);
4121 extract_scalar_result = false;
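/* E.g. (sketch): with GROUP_SIZE == 2 and four scalar results s0..s3
   produced by unrolling, the loop above combines them modulo the group
   size, as in

     for (j = 2; j < 4; j++)
       res[j % 2] = op (res[j % 2], res[j]);   // s0 op s2, s1 op s3

   leaving one result per element of the group.  */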
4125 /* 2.4 Extract the final scalar result. Create:
4126 s_out3 = extract_field <v_out2, bitpos> */
4128 if (extract_scalar_result)
4129 {
4130 tree rhs;
4132 if (dump_kind_p (MSG_NOTE))
4133 dump_printf_loc (MSG_NOTE, vect_location,
4134 "extract scalar result");
4136 if (BYTES_BIG_ENDIAN)
4137 bitpos = size_binop (MULT_EXPR,
4138 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
4139 TYPE_SIZE (scalar_type));
4140 else
4141 bitpos = bitsize_zero_node;
4143 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
4144 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4145 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4146 gimple_assign_set_lhs (epilog_stmt, new_temp);
4147 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4148 VEC_safe_push (tree, heap, scalar_results, new_temp);
4149 }
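/* E.g.: for a V4SI vector of 32-bit ints, the bitpos used above is 0 on
   a little-endian target and (4 - 1) * 32 == 96 on a big-endian one,
   selecting the element that holds the final result.  */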
4151 vect_finalize_reduction:
4156 /* 2.5 Adjust the final result by the initial value of the reduction
4157 variable. (When such adjustment is not needed, then
4158 'adjustment_def' is zero). For example, if code is PLUS we create:
4159 new_temp = loop_exit_def + adjustment_def */
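/* E.g. (sketch): if the scalar loop computed
     s = init;  for (...)  s += a[i];
   and 'init' could not be folded into the initial vector, the epilog
   above reduced only the a[i] terms, so here we emit
     new_temp = partial_result + init
   to fold the initial value back in.  */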
4161 if (adjustment_def)
4162 {
4163 gcc_assert (!slp_reduc);
4164 if (nested_in_vect_loop)
4166 new_phi = VEC_index (gimple, new_phis, 0);
4167 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4168 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4169 new_dest = vect_create_destination_var (scalar_dest, vectype);
4170 }
4171 else
4172 {
4173 new_temp = VEC_index (tree, scalar_results, 0);
4174 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4175 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4176 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4177 }
4179 epilog_stmt = gimple_build_assign (new_dest, expr);
4180 new_temp = make_ssa_name (new_dest, epilog_stmt);
4181 gimple_assign_set_lhs (epilog_stmt, new_temp);
4182 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
4183 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4184 if (nested_in_vect_loop)
4185 {
4186 set_vinfo_for_stmt (epilog_stmt,
4187 new_stmt_vec_info (epilog_stmt, loop_vinfo,
4188 NULL));
4189 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4190 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4192 if (!double_reduc)
4193 VEC_quick_push (tree, scalar_results, new_temp);
4194 else
4195 VEC_replace (tree, scalar_results, 0, new_temp);
4196 }
4197 else
4198 VEC_replace (tree, scalar_results, 0, new_temp);
4200 VEC_replace (gimple, new_phis, 0, epilog_stmt);
4201 }
4203 /* 2.6 Handle the loop-exit phis.  Replace the uses of scalar loop-exit
4204 phis with new adjusted scalar results, i.e., replace use <s_out0>
4205 with use <s_out4>.
4207 Transform:
4208 loop_exit:
4209 s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4210 v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4211 v_out2 = reduce <v_out1>
4212 s_out3 = extract_field <v_out2, 0>
4213 s_out4 = adjust_result <s_out3>
4214 use <s_out0>
4215 use <s_out0>
4217 into:
4219 loop_exit:
4220 s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4221 v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4222 v_out2 = reduce <v_out1>
4223 s_out3 = extract_field <v_out2, 0>
4224 s_out4 = adjust_result <s_out3>
4225 use <s_out4>
4226 use <s_out4>  */
4229 /* In SLP reduction chain we reduce vector results into one vector if
4230 necessary, hence we set here GROUP_SIZE to 1. SCALAR_DEST is the LHS of
4231 the last stmt in the reduction chain, since we are looking for the loop
4232 exit phi node.  */
4233 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4234 {
4235 scalar_dest = gimple_assign_lhs (VEC_index (gimple,
4236 SLP_TREE_SCALAR_STMTS (slp_node),
4237 group_size - 1));
4238 group_size = 1;
4239 }
4241 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4242 case GROUP_SIZE is greater than the vectorization factor).  Therefore, we
4243 need to match SCALAR_RESULTS with corresponding statements. The first
4244 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4245 the first vector stmt, etc.
4246 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
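/* E.g.: GROUP_SIZE == 8 with two new vector stmts gives RATIO == 4:
   scalar results 0..3 are matched with the first vector stmt and
   results 4..7 with the second.  */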
4247 if (group_size > VEC_length (gimple, new_phis))
4248 {
4249 ratio = group_size / VEC_length (gimple, new_phis);
4250 gcc_assert (!(group_size % VEC_length (gimple, new_phis)));
4251 }
4252 else
4253 ratio = 1;
4255 for (k = 0; k < group_size; k++)
4256 {
4257 if (k % ratio == 0)
4258 {
4259 epilog_stmt = VEC_index (gimple, new_phis, k / ratio);
4260 reduction_phi = VEC_index (gimple, reduction_phis, k / ratio);
4261 if (double_reduc)
4262 inner_phi = VEC_index (gimple, inner_phis, k / ratio);
4263 }
4265 if (slp_reduc)
4266 {
4267 gimple current_stmt = VEC_index (gimple,
4268 SLP_TREE_SCALAR_STMTS (slp_node), k);
4270 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4271 /* SLP statements can't participate in patterns.  */
4272 gcc_assert (!orig_stmt);
4273 scalar_dest = gimple_assign_lhs (current_stmt);
4274 }
4276 phis = VEC_alloc (gimple, heap, 3);
4277 /* Find the loop-closed-use at the loop exit of the original scalar
4278 result. (The reduction result is expected to have two immediate uses -
4279 one at the latch block, and one at the loop exit). */
4280 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4281 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4282 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
4284 /* We expect to have found an exit_phi because of loop-closed-ssa
4285 form.  */
4286 gcc_assert (!VEC_empty (gimple, phis));
4288 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
4292 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4295 /* FORNOW.  Currently not supporting the case that an inner-loop
4296 reduction is not used in the outer-loop (but only outside the
4297 outer-loop), unless it is a double reduction.  */
4298 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4299 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4300 || double_reduc);
4302 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4303 if (!double_reduc
4304 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4305 != vect_double_reduction_def)
4306 continue;
4308 /* Handle double reduction:
4310 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4311 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4312 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4313 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4315 At that point the regular reduction (stmt2 and stmt3) is
4316 already vectorized, as well as the exit phi node, stmt4.
4317 Here we vectorize the phi node of double reduction, stmt1, and
4318 update all relevant statements. */
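/* A source-level example of such a double reduction (sketch):

     int s = 0;
     for (i = 0; i < N; i++)       // outer loop: stmt1, stmt4
       for (j = 0; j < M; j++)     // inner loop: stmt2, stmt3
         s += a[i][j];
*/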
4320 /* Go through all the uses of s2 to find double reduction phi
4321 node, i.e., stmt1 above. */
4322 orig_name = PHI_RESULT (exit_phi);
4323 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4325 stmt_vec_info use_stmt_vinfo;
4326 stmt_vec_info new_phi_vinfo;
4327 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4328 basic_block bb = gimple_bb (use_stmt);
4331 /* Check that USE_STMT is really a double reduction phi
4332 node.  */
4333 if (gimple_code (use_stmt) != GIMPLE_PHI
4334 || gimple_phi_num_args (use_stmt) != 2
4335 || bb->loop_father != outer_loop)
4336 continue;
4337 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4338 if (!use_stmt_vinfo
4339 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4340 != vect_double_reduction_def)
4341 continue;
4343 /* Create vector phi node for double reduction:
4344 vs1 = phi <vs0, vs2>
4345 vs1 was created previously in this function by a call to
4346 vect_get_vec_def_for_operand and is stored in
4347 VEC_INITIAL_DEF;
4348 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4349 vs0 is created here. */
4351 /* Create vector phi node. */
4352 vect_phi = create_phi_node (vec_initial_def, bb);
4353 new_phi_vinfo = new_stmt_vec_info (vect_phi,
4354 loop_vec_info_for_loop (outer_loop), NULL);
4355 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4357 /* Create vs0 - initial def of the double reduction phi. */
4358 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4359 loop_preheader_edge (outer_loop));
4360 init_def = get_initial_def_for_reduction (stmt,
4361 preheader_arg, NULL);
4362 vect_phi_init = vect_init_vector (use_stmt, init_def,
4363 vectype, NULL);
4365 /* Update phi node arguments with vs0 and vs2. */
4366 add_phi_arg (vect_phi, vect_phi_init,
4367 loop_preheader_edge (outer_loop),
4368 UNKNOWN_LOCATION);
4369 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4370 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4371 if (dump_kind_p (MSG_NOTE))
4372 {
4373 dump_printf_loc (MSG_NOTE, vect_location,
4374 "created double reduction phi node: ");
4375 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4376 }
4378 vect_phi_res = PHI_RESULT (vect_phi);
4380 /* Replace the use, i.e., set the correct vs1 in the regular
4381 reduction phi node. FORNOW, NCOPIES is always 1, so the
4382 loop is redundant. */
4383 use = reduction_phi;
4384 for (j = 0; j < ncopies; j++)
4386 edge pr_edge = loop_preheader_edge (loop);
4387 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4388 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4394 VEC_free (gimple, heap, phis);
4395 if (nested_in_vect_loop)
4396 {
4397 if (double_reduc)
4398 loop = outer_loop;
4399 else
4400 continue;
4401 }
4402 }
4403 phis = VEC_alloc (gimple, heap, 3);
4404 /* Find the loop-closed-use at the loop exit of the original scalar
4405 result. (The reduction result is expected to have two immediate uses,
4406 one at the latch block, and one at the loop exit). For double
4407 reductions we are looking for exit phis of the outer loop. */
4408 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4410 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4411 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
4414 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4416 tree phi_res = PHI_RESULT (USE_STMT (use_p));
4418 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4420 if (!flow_bb_inside_loop_p (loop,
4421 gimple_bb (USE_STMT (phi_use_p))))
4422 VEC_safe_push (gimple, heap, phis,
4423 USE_STMT (phi_use_p));
4429 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
4431 /* Replace the uses: */
4432 orig_name = PHI_RESULT (exit_phi);
4433 scalar_result = VEC_index (tree, scalar_results, k);
4434 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4435 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4436 SET_USE (use_p, scalar_result);
4439 VEC_free (gimple, heap, phis);
4442 VEC_free (tree, heap, scalar_results);
4443 VEC_free (gimple, heap, new_phis);
4447 /* Function vectorizable_reduction.
4449 Check if STMT performs a reduction operation that can be vectorized.
4450 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4451 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4452 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4454 This function also handles reduction idioms (patterns) that have been
4455 recognized in advance during vect_pattern_recog. In this case, STMT may be
4457 X = pattern_expr (arg0, arg1, ..., X)
4458 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4459 sequence that had been detected and replaced by the pattern-stmt (STMT).
4461 In some cases of reduction patterns, the type of the reduction variable X is
4462 different than the type of the other arguments of STMT.
4463 In such cases, the vectype that is used when transforming STMT into a vector
4464 stmt is different than the vectype that is used to determine the
4465 vectorization factor, because it consists of a different number of elements
4466 than the actual number of elements that are being operated upon in parallel.
4468 For example, consider an accumulation of shorts into an int accumulator.
4469 On some targets it's possible to vectorize this pattern operating on 8
4470 shorts at a time (hence, the vectype for purposes of determining the
4471 vectorization factor should be V8HI); on the other hand, the vectype that
4472 is used to create the vector form is actually V4SI (the type of the result).
4474 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4475 indicates what is the actual level of parallelism (V8HI in the example), so
4476 that the right vectorization factor would be derived. This vectype
4477 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4478 be used to create the vectorized stmt. The right vectype for the vectorized
4479 stmt is obtained from the type of the result X:
4480 get_vectype_for_scalar_type (TREE_TYPE (X))
4482 This means that, contrary to "regular" reductions (or "regular" stmts in
4483 general), the following equation:
4484 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4485 does *NOT* necessarily hold for reduction patterns. */
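/* For instance (illustration): for the widening summation
     short *x;  int sum;   ...  sum += (int) x[i];
   STMT_VINFO_VECTYPE is V8HI (eight shorts determine the vectorization
   factor), while the vectorized stmt itself is created with
   get_vectype_for_scalar_type (int), i.e. V4SI.  */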
4487 bool
4488 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4489 gimple *vec_stmt, slp_tree slp_node)
4490 {
4493 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4494 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4495 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4496 tree vectype_in = NULL_TREE;
4497 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4498 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4499 enum tree_code code, orig_code, epilog_reduc_code;
4500 enum machine_mode vec_mode;
4502 optab optab, reduc_optab;
4503 tree new_temp = NULL_TREE;
4506 enum vect_def_type dt;
4507 gimple new_phi = NULL;
4511 stmt_vec_info orig_stmt_info;
4512 tree expr = NULL_TREE;
4516 stmt_vec_info prev_stmt_info, prev_phi_info;
4517 bool single_defuse_cycle = false;
4518 tree reduc_def = NULL_TREE;
4519 gimple new_stmt = NULL;
4522 bool nested_cycle = false, found_nested_cycle_def = false;
4523 gimple reduc_def_stmt = NULL;
4524 /* The default is that the reduction variable is the last operand in the statement.  */
4525 int reduc_index = 2;
4526 bool double_reduc = false, dummy;
4528 struct loop * def_stmt_loop, *outer_loop = NULL;
4530 gimple def_arg_stmt;
4531 VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vect_defs = NULL;
4532 VEC (gimple, heap) *phis = NULL;
4534 tree def0, def1, tem, op0, op1 = NULL_TREE;
4536 /* In case of reduction chain we switch to the first stmt in the chain, but
4537 we don't update STMT_INFO, since only the last stmt is marked as reduction
4538 and has reduction properties. */
4539 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4540 stmt = GROUP_FIRST_ELEMENT (stmt_info);
4542 if (nested_in_vect_loop_p (loop, stmt))
4543 {
4544 outer_loop = loop;
4545 loop = loop->inner;
4546 nested_cycle = true;
4547 }
4549 /* 1. Is vectorizable reduction? */
4550 /* Not supportable if the reduction variable is used in the loop, unless
4551 it's a reduction chain. */
4552 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4553 && !GROUP_FIRST_ELEMENT (stmt_info))
4554 return false;
4556 /* Reductions that are not used even in an enclosing outer-loop
4557 are expected to be "live" (used out of the loop).  */
4558 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4559 && !STMT_VINFO_LIVE_P (stmt_info))
4560 return false;
4562 /* Make sure it was already recognized as a reduction computation. */
4563 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4564 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4565 return false;
4567 /* 2. Has this been recognized as a reduction pattern?
4569 Check if STMT represents a pattern that has been recognized
4570 in earlier analysis stages. For stmts that represent a pattern,
4571 the STMT_VINFO_RELATED_STMT field records the last stmt in
4572 the original sequence that constitutes the pattern. */
4574 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4575 if (orig_stmt)
4576 {
4577 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4578 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
4579 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4580 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4581 }
4583 /* 3. Check the operands of the operation. The first operands are defined
4584 inside the loop body. The last operand is the reduction variable,
4585 which is defined by the loop-header-phi. */
4587 gcc_assert (is_gimple_assign (stmt));
4590 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4592 case GIMPLE_SINGLE_RHS:
4593 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4594 if (op_type == ternary_op)
4595 {
4596 tree rhs = gimple_assign_rhs1 (stmt);
4597 ops[0] = TREE_OPERAND (rhs, 0);
4598 ops[1] = TREE_OPERAND (rhs, 1);
4599 ops[2] = TREE_OPERAND (rhs, 2);
4600 code = TREE_CODE (rhs);
4601 }
4602 else
4603 return false;
4604 break;
4606 case GIMPLE_BINARY_RHS:
4607 code = gimple_assign_rhs_code (stmt);
4608 op_type = TREE_CODE_LENGTH (code);
4609 gcc_assert (op_type == binary_op);
4610 ops[0] = gimple_assign_rhs1 (stmt);
4611 ops[1] = gimple_assign_rhs2 (stmt);
4612 break;
4614 case GIMPLE_TERNARY_RHS:
4615 code = gimple_assign_rhs_code (stmt);
4616 op_type = TREE_CODE_LENGTH (code);
4617 gcc_assert (op_type == ternary_op);
4618 ops[0] = gimple_assign_rhs1 (stmt);
4619 ops[1] = gimple_assign_rhs2 (stmt);
4620 ops[2] = gimple_assign_rhs3 (stmt);
4621 break;
4623 case GIMPLE_UNARY_RHS:
4624 return false;
4626 default:
4627 gcc_unreachable ();
4628 }
4630 if (code == COND_EXPR && slp_node)
4631 return false;
4633 scalar_dest = gimple_assign_lhs (stmt);
4634 scalar_type = TREE_TYPE (scalar_dest);
4635 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4636 && !SCALAR_FLOAT_TYPE_P (scalar_type))
4637 return false;
4639 /* Do not try to vectorize bit-precision reductions. */
4640 if ((TYPE_PRECISION (scalar_type)
4641 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4642 return false;
4644 /* All uses but the last are expected to be defined in the loop.
4645 The last use is the reduction variable. In case of nested cycle this
4646 assumption is not true: we use reduc_index to record the index of the
4647 reduction variable. */
4648 for (i = 0; i < op_type-1; i++)
4649 {
4650 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4651 if (i == 0 && code == COND_EXPR)
4652 continue;
4654 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4655 &def_stmt, &def, &dt, &tem);
4658 gcc_assert (is_simple_use);
4660 if (dt != vect_internal_def
4661 && dt != vect_external_def
4662 && dt != vect_constant_def
4663 && dt != vect_induction_def
4664 && !(dt == vect_nested_cycle && nested_cycle))
4665 return false;
4667 if (dt == vect_nested_cycle)
4668 {
4669 found_nested_cycle_def = true;
4670 reduc_def_stmt = def_stmt;
4671 }
4672 }
4675 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4676 &def_stmt, &def, &dt, &tem);
4679 gcc_assert (is_simple_use);
4680 gcc_assert (dt == vect_reduction_def
4681 || dt == vect_nested_cycle
4682 || ((dt == vect_internal_def || dt == vect_external_def
4683 || dt == vect_constant_def || dt == vect_induction_def)
4684 && nested_cycle && found_nested_cycle_def));
4685 if (!found_nested_cycle_def)
4686 reduc_def_stmt = def_stmt;
4688 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4689 if (orig_stmt)
4690 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4691 reduc_def_stmt,
4692 !nested_cycle,
4693 &dummy));
4694 else
4695 {
4696 gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4697 !nested_cycle, &dummy);
4698 /* We changed STMT to be the first stmt in reduction chain, hence we
4699 check that in this case the first element in the chain is STMT. */
4700 gcc_assert (stmt == tmp
4701 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4702 }
4704 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4705 return false;
4707 if (slp_node || PURE_SLP_STMT (stmt_info))
4708 ncopies = 1;
4709 else
4710 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4711 / TYPE_VECTOR_SUBPARTS (vectype_in));
4713 gcc_assert (ncopies >= 1);
4715 vec_mode = TYPE_MODE (vectype_in);
4717 if (code == COND_EXPR)
4718 {
4719 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
4720 {
4721 if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
4722 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4723 "unsupported condition in reduction");
4725 return false;
4726 }
4727 }
4728 else
4729 {
4730 /* 4. Supportable by target? */
4732 /* 4.1. check support for the operation in the loop */
4733 optab = optab_for_tree_code (code, vectype_in, optab_default);
4736 if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
4737 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4743 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4745 if (dump_kind_p (MSG_NOTE))
4746 dump_printf (MSG_NOTE, "op not supported by target.");
4748 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4749 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4750 < vect_min_worthwhile_factor (code))
4753 if (dump_kind_p (MSG_NOTE))
4754 dump_printf (MSG_NOTE, "proceeding using word mode.");
4757 /* Worthwhile without SIMD support? */
4758 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4759 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4760 < vect_min_worthwhile_factor (code))
4762 if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
4763 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4764 "not worthwhile without SIMD support.");
4770 /* 4.2. Check support for the epilog operation.
4772 If STMT represents a reduction pattern, then the type of the
4773 reduction variable may be different than the type of the rest
4774 of the arguments. For example, consider the case of accumulation
4775 of shorts into an int accumulator; the original code:
4776 S1: int_a = (int) short_a;
4777 orig_stmt-> S2: int_acc = plus <int_a, int_acc>;
4779 was replaced by:
4780 STMT: int_acc = widen_sum <short_a, int_acc>
4782 This means that:
4783 1. The tree-code that is used to create the vector operation in the
4784 epilog code (that reduces the partial results) is not the
4785 tree-code of STMT, but is rather the tree-code of the original
4786 stmt from the pattern that STMT is replacing. I.e, in the example
4787 above we want to use 'widen_sum' in the loop, but 'plus' in the
4788 epilog.
4789 2. The type (mode) we use to check available target support
4790 for the vector operation to be created in the *epilog*, is
4791 determined by the type of the reduction variable (in the example
4792 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
4793 However the type (mode) we use to check available target support
4794 for the vector operation to be created *inside the loop*, is
4795 determined by the type of the other arguments to STMT (in the
4796 example we'd check this: optab_handler (widen_sum_optab,
4797 vect_short_mode)).
4799 This is contrary to "regular" reductions, in which the types of all
4800 the arguments are the same as the type of the reduction variable.
4801 For "regular" reductions we can therefore use the same vector type
4802 (and also the same tree-code) when generating the epilog code and
4803 when generating the code inside the loop. */
4805 if (orig_stmt)
4806 {
4807 /* This is a reduction pattern: get the vectype from the type of the
4808 reduction variable, and get the tree-code from orig_stmt. */
4809 orig_code = gimple_assign_rhs_code (orig_stmt);
4810 gcc_assert (vectype_out);
4811 vec_mode = TYPE_MODE (vectype_out);
4812 }
4813 else
4814 {
4815 /* Regular reduction: the same vectype and tree-code as used for
4816 the vector code inside the loop can be used for the epilog code.  */
4817 orig_code = code;
4818 }
4820 if (nested_cycle)
4821 {
4822 def_bb = gimple_bb (reduc_def_stmt);
4823 def_stmt_loop = def_bb->loop_father;
4824 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4825 loop_preheader_edge (def_stmt_loop));
4826 if (TREE_CODE (def_arg) == SSA_NAME
4827 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4828 && gimple_code (def_arg_stmt) == GIMPLE_PHI
4829 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4830 && vinfo_for_stmt (def_arg_stmt)
4831 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4832 == vect_double_reduction_def)
4833 double_reduc = true;
4834 }
4836 epilog_reduc_code = ERROR_MARK;
4837 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4838 {
4839 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4840 optab_default);
4841 if (!reduc_optab)
4842 {
4843 if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
4844 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4845 "no optab for reduction.");
4847 epilog_reduc_code = ERROR_MARK;
4848 }
4850 if (reduc_optab
4851 && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4852 {
4853 if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
4854 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4855 "reduc op not supported by target.");
4857 epilog_reduc_code = ERROR_MARK;
4858 }
4859 }
4860 else
4861 {
4862 if (!nested_cycle || double_reduc)
4863 {
4864 if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
4865 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4866 "no reduc code for scalar code.");
4872 if (double_reduc && ncopies > 1)
4874 if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
4875 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4876 "multiple types in double reduction");
4881 /* In case of widening multiplication by a constant, we update the type
4882 of the constant to be the type of the other operand.  We check that the
4883 constant fits the type in the pattern recognition pass.  */
4884 if (code == DOT_PROD_EXPR
4885 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
4886 {
4887 if (TREE_CODE (ops[0]) == INTEGER_CST)
4888 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
4889 else if (TREE_CODE (ops[1]) == INTEGER_CST)
4890 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
4891 else
4892 {
4893 if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
4894 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4895 "invalid types in dot-prod");
4896 return false;
4897 }
4898 }
4899 }
4901 if (!vec_stmt) /* transformation not required.  */
4902 {
4903 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
4904 return false;
4905 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4906 return true;
4907 }
4909 /** Transform.  **/
4911 if (dump_kind_p (MSG_NOTE))
4912 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.");
4914 /* FORNOW: Multiple types are not supported for condition. */
4915 if (code == COND_EXPR)
4916 gcc_assert (ncopies == 1);
4918 /* Create the destination vector */
4919 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4921 /* In case the vectorization factor (VF) is bigger than the number
4922 of elements that we can fit in a vectype (nunits), we have to generate
4923 more than one vector stmt - i.e., we need to "unroll" the
4924 vector stmt by a factor VF/nunits. For more details see documentation
4925 in vectorizable_operation. */
4927 /* If the reduction is used in an outer loop we need to generate
4928 VF intermediate results, like so (e.g. for ncopies=2):
4933 (i.e. we generate VF results in 2 registers).
4934 In this case we have a separate def-use cycle for each copy, and therefore
4935 for each copy we get the vector def for the reduction variable from the
4936 respective phi node created for this copy.
4938 Otherwise (the reduction is unused in the loop nest), we can combine
4939 together intermediate results, like so (e.g. for ncopies=2):
4943 (i.e. we generate VF/2 results in a single register).
4944 In this case for each copy we get the vector def for the reduction variable
4945 from the vectorized reduction operation generated in the previous
4946 iteration.  */
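/* Sketch for ncopies == 2 (illustration).  Separate def-use cycles:
     vd0 = phi <...>;  vr0 = vd0 op va0;     // copy 0
     vd1 = phi <...>;  vr1 = vd1 op va1;     // copy 1
   give VF intermediate results in two registers, while a single
   def-use cycle
     vd = phi <...>;
     vt = vd op va0;  vr = vt op va1;
   accumulates VF/2 results in one register.  */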
4948 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
4949 {
4950 single_defuse_cycle = true;
4951 epilog_copies = 1;
4952 }
4953 else
4954 epilog_copies = ncopies;
4956 prev_stmt_info = NULL;
4957 prev_phi_info = NULL;
4958 if (slp_node)
4959 {
4960 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4961 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
4962 == TYPE_VECTOR_SUBPARTS (vectype_in));
4963 }
4964 else
4965 vec_num = 1;
4967 vec_oprnds0 = VEC_alloc (tree, heap, 1);
4968 if (op_type == ternary_op)
4969 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4972 phis = VEC_alloc (gimple, heap, vec_num);
4973 vect_defs = VEC_alloc (tree, heap, vec_num);
4974 if (!slp_node)
4975 VEC_quick_push (tree, vect_defs, NULL_TREE);
4977 for (j = 0; j < ncopies; j++)
4979 if (j == 0 || !single_defuse_cycle)
4981 for (i = 0; i < vec_num; i++)
4983 /* Create the reduction-phi that defines the reduction
4985 new_phi = create_phi_node (vec_dest, loop->header);
4986 set_vinfo_for_stmt (new_phi,
4987 new_stmt_vec_info (new_phi, loop_vinfo,
4989 if (j == 0 || slp_node)
4990 VEC_quick_push (gimple, phis, new_phi);
4994 if (code == COND_EXPR)
4996 gcc_assert (!slp_node);
4997 vectorizable_condition (stmt, gsi, vec_stmt,
4998 PHI_RESULT (VEC_index (gimple, phis, 0)),
5000 /* Multiple types are not supported for condition. */
5007 op0 = ops[!reduc_index];
5008 if (op_type == ternary_op)
5010 if (reduc_index == 0)
5017 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5021 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5023 VEC_quick_push (tree, vec_oprnds0, loop_vec_def0);
5024 if (op_type == ternary_op)
5026 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5028 VEC_quick_push (tree, vec_oprnds1, loop_vec_def1);
5036 enum vect_def_type dt;
5040 vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5041 &dummy_stmt, &dummy, &dt);
5042 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5044 VEC_replace (tree, vec_oprnds0, 0, loop_vec_def0);
5045 if (op_type == ternary_op)
5047 vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5049 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5051 VEC_replace (tree, vec_oprnds1, 0, loop_vec_def1);
5055 if (single_defuse_cycle)
5056 reduc_def = gimple_assign_lhs (new_stmt);
5058 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5061 FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, def0)
5064 reduc_def = PHI_RESULT (VEC_index (gimple, phis, i));
5067 if (!single_defuse_cycle || j == 0)
5068 reduc_def = PHI_RESULT (new_phi);
5071 def1 = ((op_type == ternary_op)
5072 ? VEC_index (tree, vec_oprnds1, i) : NULL);
5073 if (op_type == binary_op)
5074 {
5075 if (reduc_index == 0)
5076 expr = build2 (code, vectype_out, reduc_def, def0);
5077 else
5078 expr = build2 (code, vectype_out, def0, reduc_def);
5079 }
5080 else
5081 {
5082 if (reduc_index == 0)
5083 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5084 else
5085 {
5086 if (reduc_index == 1)
5087 expr = build3 (code, vectype_out, def0, reduc_def, def1);
5088 else
5089 expr = build3 (code, vectype_out, def0, def1, reduc_def);
5090 }
5091 }
5093 new_stmt = gimple_build_assign (vec_dest, expr);
5094 new_temp = make_ssa_name (vec_dest, new_stmt);
5095 gimple_assign_set_lhs (new_stmt, new_temp);
5096 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5100 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
5101 VEC_quick_push (tree, vect_defs, new_temp);
5104 VEC_replace (tree, vect_defs, 0, new_temp);
5111 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5113 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5115 prev_stmt_info = vinfo_for_stmt (new_stmt);
5116 prev_phi_info = vinfo_for_stmt (new_phi);
5119 /* Finalize the reduction-phi (set its arguments) and create the
5120 epilog reduction code. */
5121 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5122 {
5123 new_temp = gimple_assign_lhs (*vec_stmt);
5124 VEC_replace (tree, vect_defs, 0, new_temp);
5125 }
5127 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5128 epilog_reduc_code, phis, reduc_index,
5129 double_reduc, slp_node);
5131 VEC_free (gimple, heap, phis);
5132 VEC_free (tree, heap, vec_oprnds0);
5133 if (vec_oprnds1)
5134 VEC_free (tree, heap, vec_oprnds1);
5136 return true;
5137 }
5139 /* Function vect_min_worthwhile_factor.
5141 For a loop where we could vectorize the operation indicated by CODE,
5142 return the minimum vectorization factor that makes it worthwhile
5143 to use generic vectors. */
5144 int
5145 vect_min_worthwhile_factor (enum tree_code code)
5146 {
5147 switch (code)
5148 {
5149 case PLUS_EXPR:
5150 case MINUS_EXPR:
5151 case NEGATE_EXPR:
5152 return 4;
5154 case BIT_AND_EXPR:
5155 case BIT_IOR_EXPR:
5156 case BIT_XOR_EXPR:
5157 case BIT_NOT_EXPR:
5158 return 2;
5160 default:
5161 return INT_MAX;
5162 }
5163 }
5166 /* Function vectorizable_induction
5168 Check if PHI performs an induction computation that can be vectorized.
5169 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5170 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5171 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5173 bool
5174 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5175 gimple *vec_stmt)
5176 {
5177 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5178 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5179 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5180 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5181 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5182 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5185 gcc_assert (ncopies >= 1);
5186 /* FORNOW. These restrictions should be relaxed. */
5187 if (nested_in_vect_loop_p (loop, phi))
5189 imm_use_iterator imm_iter;
5190 use_operand_p use_p;
5197 if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
5198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5199 "multiple types in nested loop.");
5204 latch_e = loop_latch_edge (loop->inner);
5205 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5206 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5208 if (!flow_bb_inside_loop_p (loop->inner,
5209 gimple_bb (USE_STMT (use_p))))
5211 exit_phi = USE_STMT (use_p);
5217 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5218 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5219 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5221 if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
5222 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5223 "inner-loop induction only used outside "
5224 "of the outer vectorized loop.");
5230 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5231 return false;
5233 /* FORNOW: SLP not supported. */
5234 if (STMT_SLP_TYPE (stmt_info))
5235 return false;
5237 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5239 if (gimple_code (phi) != GIMPLE_PHI)
5240 return false;
5242 if (!vec_stmt) /* transformation not required. */
5244 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5245 if (dump_kind_p (MSG_NOTE))
5246 dump_printf_loc (MSG_NOTE, vect_location,
5247 "=== vectorizable_induction ===");
5248 vect_model_induction_cost (stmt_info, ncopies);
5254 if (dump_kind_p (MSG_NOTE))
5255 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.");
5257 vec_def = get_initial_def_for_induction (phi);
5258 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5259 return true;
5260 }
5262 /* Function vectorizable_live_operation.
5264 STMT computes a value that is used outside the loop. Check if
5265 it can be supported. */
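/* E.g. (sketch):
     for (i = 0; i < n; i++)  last = a[i];
     use (last);
   'last' is live: only the value it has after the final iteration is
   used outside the loop.  */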
5267 bool
5268 vectorizable_live_operation (gimple stmt,
5269 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5270 gimple *vec_stmt ATTRIBUTE_UNUSED)
5271 {
5272 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5273 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5274 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5280 enum vect_def_type dt;
5281 enum tree_code code;
5282 enum gimple_rhs_class rhs_class;
5284 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5286 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5287 return false;
5289 if (!is_gimple_assign (stmt))
5290 return false;
5292 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5293 return false;
5295 /* FORNOW. CHECKME. */
5296 if (nested_in_vect_loop_p (loop, stmt))
5297 return false;
5299 code = gimple_assign_rhs_code (stmt);
5300 op_type = TREE_CODE_LENGTH (code);
5301 rhs_class = get_gimple_rhs_class (code);
5302 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5303 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5305 /* FORNOW: support only if all uses are invariant. This means
5306 that the scalar operations can remain in place, unvectorized.
5307 The original last scalar value that they compute will be used. */
5309 for (i = 0; i < op_type; i++)
5311 if (rhs_class == GIMPLE_SINGLE_RHS)
5312 op = TREE_OPERAND (gimple_op (stmt, 1), i);
5314 op = gimple_op (stmt, i + 1);
5316 && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5319 if (dump_kind_p (MSG_MISSED_OPTIMIZATION))
5320 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5325 if (dt != vect_external_def && dt != vect_constant_def)
5329 /* No transformation is required for the cases we currently support.  */
5330 return true;
5331 }
5333 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
5335 static void
5336 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5337 {
5338 ssa_op_iter op_iter;
5339 imm_use_iterator imm_iter;
5340 def_operand_p def_p;
5343 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5345 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5349 if (!is_gimple_debug (ustmt))
5352 bb = gimple_bb (ustmt);
5354 if (!flow_bb_inside_loop_p (loop, bb))
5356 if (gimple_debug_bind_p (ustmt))
5358 if (dump_kind_p (MSG_NOTE))
5359 dump_printf_loc (MSG_NOTE, vect_location,
5360 "killing debug use");
5362 gimple_debug_bind_reset_value (ustmt);
5363 update_stmt (ustmt);
5372 /* Function vect_transform_loop.
5374 The analysis phase has determined that the loop is vectorizable.
5375 Vectorize the loop - create vectorized stmts to replace the scalar
5376 stmts in the loop, and update the loop exit condition.  */
5378 void
5379 vect_transform_loop (loop_vec_info loop_vinfo)
5380 {
5381 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5382 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5383 int nbbs = loop->num_nodes;
5384 gimple_stmt_iterator si;
5387 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5389 bool slp_scheduled = false;
5390 unsigned int nunits;
5391 gimple stmt, pattern_stmt;
5392 gimple_seq pattern_def_seq = NULL;
5393 gimple_stmt_iterator pattern_def_si = gsi_none ();
5394 bool transform_pattern_stmt = false;
5395 bool check_profitability = false;
5398 if (dump_kind_p (MSG_NOTE))
5399 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===");
5401 /* Use the more conservative vectorization threshold. If the number
5402 of iterations is constant assume the cost check has been performed
5403 by our caller.  If the threshold makes all loops profitable that
5404 run at least the vectorization factor number of times, checking
5405 is pointless, too.  */
5406 th = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
5407 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
5408 th = MAX (th, LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo));
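/* E.g. (illustration): with PARAM_MIN_VECT_LOOP_BOUND == 2, a
   vectorization factor of 4 and a cost-model minimum of 10 iterations,
   th = MAX (2 * 4 - 1, 10) == 10.  */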
5409 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5410 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5411 {
5412 if (dump_kind_p (MSG_NOTE))
5413 dump_printf_loc (MSG_NOTE, vect_location,
5414 "Profitability threshold is %d loop iterations.", th);
5415 check_profitability = true;
5416 }
5418 /* Peel the loop if there are data refs with unknown alignment.
5419 Only one data ref with unknown store is allowed. */
5421 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
5422 {
5423 vect_do_peeling_for_alignment (loop_vinfo, th, check_profitability);
5424 check_profitability = false;
5425 }
5427 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5428 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5429 {
5430 vect_loop_versioning (loop_vinfo, th, check_profitability);
5431 check_profitability = false;
5432 }
5434 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5435 compile time constant), or it is a constant that is not divisible by the
5436 vectorization factor, then an epilog loop needs to be created.
5437 We therefore duplicate the loop: the original loop will be vectorized,
5438 and will compute the first (n/VF) iterations. The second copy of the loop
5439 will remain scalar and will compute the remaining (n%VF) iterations.
5440 (VF is the vectorization factor). */
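/* E.g.: n == 100 and VF == 8: the vectorized loop executes
   100 / 8 == 12 iterations (96 elements) and the scalar epilog loop
   computes the remaining 100 % 8 == 4.  */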
5442 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5443 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5444 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)
5445 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5446 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
5447 th, check_profitability);
5448 else
5449 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5450 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5452 /* 1) Make sure the loop header has exactly two entries
5453 2) Make sure we have a preheader basic block. */
5455 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5457 split_edge (loop_preheader_edge (loop));
5459 /* FORNOW: the vectorizer supports only loops whose body consists
5460 of one basic block (header + empty latch).  When the vectorizer
5461 supports more involved loop forms, the order in which the BBs are
5462 traversed will need to be reconsidered.  */
5464 for (i = 0; i < nbbs; i++)
5465 {
5466 basic_block bb = bbs[i];
5467 stmt_vec_info stmt_info;
5470 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5472 phi = gsi_stmt (si);
5473 if (dump_kind_p (MSG_NOTE))
5475 dump_printf_loc (MSG_NOTE, vect_location,
5476 "------>vectorizing phi: ");
5477 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5479 stmt_info = vinfo_for_stmt (phi);
5483 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5484 vect_loop_kill_debug_uses (loop, phi);
5486 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5487 && !STMT_VINFO_LIVE_P (stmt_info))
5488 continue;
5490 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5491 != (unsigned HOST_WIDE_INT) vectorization_factor)
5492 && dump_kind_p (MSG_NOTE))
5493 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.");
5495 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5497 if (dump_kind_p (MSG_NOTE))
5498 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.");
5499 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5503 pattern_stmt = NULL;
5504 for (si = gsi_start_bb (bb); !gsi_end_p (si) || transform_pattern_stmt;)
5508 if (transform_pattern_stmt)
5509 stmt = pattern_stmt;
5511 stmt = gsi_stmt (si);
5513 if (dump_kind_p (MSG_NOTE))
5514 {
5515 dump_printf_loc (MSG_NOTE, vect_location,
5516 "------>vectorizing statement: ");
5517 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
5518 }
5520 stmt_info = vinfo_for_stmt (stmt);
5522 /* vector stmts created in the outer-loop during vectorization of
5523 stmts in an inner-loop may not have a stmt_info, and do not
5524 need to be vectorized. */
5531 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5532 vect_loop_kill_debug_uses (loop, stmt);
5534 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5535 && !STMT_VINFO_LIVE_P (stmt_info))
5537 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5538 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5539 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5540 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5542 stmt = pattern_stmt;
5543 stmt_info = vinfo_for_stmt (stmt);
5551 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5552 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5553 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5554 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5555 transform_pattern_stmt = true;
5557 /* If pattern statement has def stmts, vectorize them too. */
5558 if (is_pattern_stmt_p (stmt_info))
5560 if (pattern_def_seq == NULL)
5562 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
5563 pattern_def_si = gsi_start (pattern_def_seq);
5565 else if (!gsi_end_p (pattern_def_si))
5566 gsi_next (&pattern_def_si);
5567 if (pattern_def_seq != NULL)
5569 gimple pattern_def_stmt = NULL;
5570 stmt_vec_info pattern_def_stmt_info = NULL;
5572 while (!gsi_end_p (pattern_def_si))
5573 {
5574 pattern_def_stmt = gsi_stmt (pattern_def_si);
5575 pattern_def_stmt_info
5576 = vinfo_for_stmt (pattern_def_stmt);
5577 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
5578 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
5579 break;
5580 gsi_next (&pattern_def_si);
5581 }
5583 if (!gsi_end_p (pattern_def_si))
5585 if (dump_kind_p (MSG_NOTE))
5587 dump_printf_loc (MSG_NOTE, vect_location,
5588 "==> vectorizing pattern def "
5590 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
5591 pattern_def_stmt, 0);
5594 stmt = pattern_def_stmt;
5595 stmt_info = pattern_def_stmt_info;
5599 pattern_def_si = gsi_none ();
5600 transform_pattern_stmt = false;
5604 transform_pattern_stmt = false;
5607 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
5608 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (
5609 STMT_VINFO_VECTYPE (stmt_info));
5610 if (!STMT_SLP_TYPE (stmt_info)
5611 && nunits != (unsigned int) vectorization_factor
5612 && dump_kind_p (MSG_NOTE))
5613 /* For SLP VF is set according to unrolling factor, and not to
5614 vector size, hence for SLP this print is not valid. */
5615 dump_printf_loc (MSG_NOTE, vect_location,
5618 /* SLP.  Schedule all the SLP instances when the first SLP stmt is
5619 reached.  */
5620 if (STMT_SLP_TYPE (stmt_info))
5621 {
5622 if (!slp_scheduled)
5623 {
5624 slp_scheduled = true;
5626 if (dump_kind_p (MSG_NOTE))
5627 dump_printf_loc (MSG_NOTE, vect_location,
5628 "=== scheduling SLP instances ===");
5630 vect_schedule_slp (loop_vinfo, NULL);
5633 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
5634 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
5636 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5638 pattern_def_seq = NULL;
5645 /* -------- vectorize statement ------------ */
5646 if (dump_kind_p (MSG_NOTE))
5647 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.");
5649 grouped_store = false;
5650 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
5653 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
5655 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
5656 interleaving chain was completed - free all the stores in
5659 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
5664 /* Free the attached stmt_vec_info and remove the stmt. */
5665 gimple store = gsi_stmt (si);
5666 free_stmt_vec_info (store);
5667 unlink_stmt_vdef (store);
5668 gsi_remove (&si, true);
5669 release_defs (store);
5674 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5676 pattern_def_seq = NULL;
5682 slpeel_make_loop_iterate_ntimes (loop, ratio);
5684 /* The memory tags and pointers in vectorized statements need to
5685 have their SSA forms updated. FIXME, why can't this be delayed
5686 until all the loops have been transformed? */
5687 update_ssa (TODO_update_ssa);
5689 if (dump_kind_p (MSG_OPTIMIZED_LOCATIONS))
5690 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, "LOOP VECTORIZED.");
5691 if (loop->inner && dump_kind_p (MSG_OPTIMIZED_LOCATIONS))
5692 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
5693 "OUTER LOOP VECTORIZED.");