1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2013 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
31 #include "basic-block.h"
32 #include "gimple-pretty-print.h"
33 #include "tree-flow.h"
36 #include "tree-chrec.h"
37 #include "tree-scalar-evolution.h"
38 #include "tree-vectorizer.h"
39 #include "diagnostic-core.h"
41 /* Need to include rtl.h, expr.h, etc. for optabs. */
45 /* Return true if load- or store-lanes optab OPTAB is implemented for
46 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
49 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
50 tree vectype, unsigned HOST_WIDE_INT count)
52 enum machine_mode mode, array_mode;
55 mode = TYPE_MODE (vectype);
56 limit_p = !targetm.array_mode_supported_p (mode, count);
57 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
60 if (array_mode == BLKmode)
62 if (dump_enabled_p ())
63 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
64 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]",
65 GET_MODE_NAME (mode), count);
69 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
71 if (dump_enabled_p ())
72 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
73 "cannot use %s<%s><%s>", name,
74 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
78 if (dump_enabled_p ())
79 dump_printf_loc (MSG_NOTE, vect_location,
80 "can use %s<%s><%s>", name, GET_MODE_NAME (array_mode),
81 GET_MODE_NAME (mode));
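/* For illustration only (a sketch, not part of the vectorizer): a loop
   accessing two interleaved fields per iteration, the kind of pattern for
   which this predicate would be queried with COUNT == 2.  The names below
   are made up for the example:

     struct point { float x, y; };

     void
     sum_fields (struct point *p, float *sx, float *sy, int n)
     {
       int i;
       for (i = 0; i < n; i++)
         {
           *sx += p[i].x;
           *sy += p[i].y;
         }
     }

   With a four-float vectype the question is whether the target provides a
   load-lanes handler for an array mode of 2 * GET_MODE_BITSIZE (mode) bits,
   which is the check performed above.  */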
87 /* Return the smallest scalar part of STMT.
88 This is used to determine the vectype of the stmt. We generally set the
89 vectype according to the type of the result (lhs). For stmts whose
90 result-type is different than the type of the arguments (e.g., demotion,
91 promotion), vectype will be reset appropriately (later). Note that we have
92 to visit the smallest datatype in this function, because that determines the
93 VF. If the smallest datatype in the loop is present only as the rhs of a
94 promotion operation - we'd miss it.
95 Such a case, where a variable of this datatype does not appear in the lhs
96 anywhere in the loop, can only occur if it's an invariant: e.g.:
97 'int_x = (int) short_inv', which we'd expect to have been optimized away by
98 invariant motion. However, we cannot rely on invariant motion to always
99 take invariants out of the loop, and so in the case of promotion we also
100 have to check the rhs.
101 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
105 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
106 HOST_WIDE_INT *rhs_size_unit)
108 tree scalar_type = gimple_expr_type (stmt);
109 HOST_WIDE_INT lhs, rhs;
111 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
113 if (is_gimple_assign (stmt)
114 && (gimple_assign_cast_p (stmt)
115 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
116 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
117 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
119 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
121 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
123 scalar_type = rhs_type;
126 *lhs_size_unit = lhs;
127 *rhs_size_unit = rhs;
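/* For illustration only (a sketch): in a loop such as

     short *a;
     int *b;
     ...
     for (i = 0; i < n; i++)
       b[i] = (int) a[i] * 2;

   the conversion of a[i] produces an 'int' result from a 'short' operand,
   so for that statement the function above returns the 'short' type with
   *rhs_size_unit == 2 and *lhs_size_unit == 4; the vectorization factor
   has to be derived from the smaller type.  */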
132 /* Check if the data references pointed to by DR_I and DR_J are the same or
133 belong to the same interleaving group.  Return FALSE if they are
134 different, otherwise return TRUE. */
137 vect_same_range_drs (data_reference_p dr_i, data_reference_p dr_j)
139 gimple stmt_i = DR_STMT (dr_i);
140 gimple stmt_j = DR_STMT (dr_j);
142 if (operand_equal_p (DR_REF (dr_i), DR_REF (dr_j), 0)
143 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i))
144 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j))
145 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i))
146 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j)))))
152 /* If address ranges represented by DDR_I and DDR_J are equal,
153 return TRUE, otherwise return FALSE. */
156 vect_vfa_range_equal (ddr_p ddr_i, ddr_p ddr_j)
158 if ((vect_same_range_drs (DDR_A (ddr_i), DDR_A (ddr_j))
159 && vect_same_range_drs (DDR_B (ddr_i), DDR_B (ddr_j)))
160 || (vect_same_range_drs (DDR_A (ddr_i), DDR_B (ddr_j))
161 && vect_same_range_drs (DDR_B (ddr_i), DDR_A (ddr_j))))
167 /* Insert DDR into the LOOP_VINFO list of ddrs that may alias and need to be
168 tested at run-time.  Return TRUE if DDR was successfully inserted.
169 Return FALSE if versioning is not supported. */
172 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
174 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
176 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
179 if (dump_enabled_p ())
181 dump_printf_loc (MSG_NOTE, vect_location,
182 "mark for run-time aliasing test between ");
183 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
184 dump_printf (MSG_NOTE, " and ");
185 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
188 if (optimize_loop_nest_for_size_p (loop))
190 if (dump_enabled_p ())
191 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
192 "versioning not supported when optimizing for size.");
196 /* FORNOW: We don't support versioning with outer-loop vectorization. */
199 if (dump_enabled_p ())
200 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
201 "versioning not yet supported for outer-loops.");
205 /* FORNOW: We don't support creating runtime alias tests for non-constant
207 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
208 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
210 if (dump_enabled_p ())
211 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
212 "versioning not yet supported for non-constant "
217 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
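/* For illustration only (a sketch): a loop that typically needs such a
   run-time test, because the pointers may alias:

     void
     add_one (int *a, int *b, int n)
     {
       int i;
       for (i = 0; i < n; i++)
         a[i] = b[i] + 1;
     }

   Loop versioning then guards the vectorized copy with a check that the
   ranges accessed through 'a' and 'b' do not overlap, and falls back to
   the scalar loop otherwise.  */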
222 /* Function vect_analyze_data_ref_dependence.
224 Return TRUE if there (might) exist a dependence between a memory-reference
225 DRA and a memory-reference DRB.  When versioning for alias can handle the
226 dependence with a run-time check, record the check and return FALSE.
227 Adjust *MAX_VF according to the data dependence. */
230 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
231 loop_vec_info loop_vinfo, int *max_vf)
234 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
235 struct data_reference *dra = DDR_A (ddr);
236 struct data_reference *drb = DDR_B (ddr);
237 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
238 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
239 lambda_vector dist_v;
240 unsigned int loop_depth;
242 /* In loop analysis all data references should be vectorizable. */
243 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
244 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
247 /* Independent data accesses. */
248 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
252 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
255 /* Unknown data dependence. */
256 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
258 if (STMT_VINFO_GATHER_P (stmtinfo_a)
259 || STMT_VINFO_GATHER_P (stmtinfo_b))
261 if (dump_enabled_p ())
263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
264 "versioning for alias not supported for: "
265 "can't determine dependence between ");
266 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
268 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
269 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
275 if (dump_enabled_p ())
277 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
278 "versioning for alias required: "
279 "can't determine dependence between ");
280 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
282 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
283 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
287 /* Add to list of ddrs that need to be tested at run-time. */
288 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
291 /* Known data dependence. */
292 if (DDR_NUM_DIST_VECTS (ddr) == 0)
294 if (STMT_VINFO_GATHER_P (stmtinfo_a)
295 || STMT_VINFO_GATHER_P (stmtinfo_b))
297 if (dump_enabled_p ())
299 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
300 "versioning for alias not supported for: "
301 "bad dist vector for ");
302 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
304 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
305 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
311 if (dump_enabled_p ())
313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
314 "versioning for alias required: "
315 "bad dist vector for ");
316 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
317 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
318 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
320 /* Add to list of ddrs that need to be tested at run-time. */
321 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
324 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
325 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
327 int dist = dist_v[loop_depth];
329 if (dump_enabled_p ())
330 dump_printf_loc (MSG_NOTE, vect_location,
331 "dependence distance = %d.", dist);
335 if (dump_enabled_p ())
337 dump_printf_loc (MSG_NOTE, vect_location,
338 "dependence distance == 0 between ");
339 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
340 dump_printf (MSG_NOTE, " and ");
341 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
344 /* When we perform grouped accesses and perform implicit CSE
345 by detecting equal accesses and doing disambiguation with
346 runtime alias tests like for
354 where we will end up loading { a[i], a[i+1] } once, make
355 sure that inserting group loads before the first load and
356 stores after the last store will do the right thing. */
357 if ((STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
358 && GROUP_SAME_DR_STMT (stmtinfo_a))
359 || (STMT_VINFO_GROUPED_ACCESS (stmtinfo_b)
360 && GROUP_SAME_DR_STMT (stmtinfo_b)))
363 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
365 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
367 if (dump_enabled_p ())
368 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
369 "READ_WRITE dependence in interleaving.");
377 if (dist > 0 && DDR_REVERSED_P (ddr))
379 /* If DDR_REVERSED_P the order of the data-refs in DDR was
380 reversed (to make distance vector positive), and the actual
381 distance is negative. */
382 if (dump_enabled_p ())
383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
384 "dependence distance negative.");
389 && abs (dist) < *max_vf)
391 /* The dependence distance requires reduction of the maximal
392 vectorization factor. */
393 *max_vf = abs (dist);
394 if (dump_enabled_p ())
395 dump_printf_loc (MSG_NOTE, vect_location,
396 "adjusting maximal vectorization factor to %i",
400 if (abs (dist) >= *max_vf)
402 /* Dependence distance does not create dependence, as far as
403 vectorization is concerned, in this case. */
404 if (dump_enabled_p ())
405 dump_printf_loc (MSG_NOTE, vect_location,
406 "dependence distance >= VF.");
410 if (dump_enabled_p ())
412 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
413 "not vectorized, possible dependence "
414 "between data-refs ");
415 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
416 dump_printf (MSG_NOTE, " and ");
417 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
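/* For illustration only (a sketch): in

     for (i = 0; i < n; i++)
       a[i + 4] = a[i] + c[i];

   the store is 4 iterations ahead of the load, i.e. the dependence
   distance is 4.  The code above would clamp *max_vf to 4; with a
   vectorization factor of at most 4 every vector load still reads values
   stored in earlier vector iterations, so the dependence is respected.  */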
426 /* Function vect_analyze_data_ref_dependences.
428 Examine all the data references in the loop, and make sure there do not
429 exist any data dependences between them. Set *MAX_VF according to
430 the maximum vectorization factor the data dependences allow. */
433 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
436 struct data_dependence_relation *ddr;
438 if (dump_enabled_p ())
439 dump_printf_loc (MSG_NOTE, vect_location,
440 "=== vect_analyze_data_ref_dependences ===");
442 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
443 &LOOP_VINFO_DDRS (loop_vinfo),
444 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
447 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
448 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
455 /* Function vect_slp_analyze_data_ref_dependence.
457 Return TRUE if there (might) exist a dependence between a memory-reference
458 DRA and a memory-reference DRB that would prevent basic-block (SLP)
459 vectorization.  Return FALSE if the references are independent or the
460 dependence does not prevent vectorization. */
463 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
465 struct data_reference *dra = DDR_A (ddr);
466 struct data_reference *drb = DDR_B (ddr);
468 /* We need to check dependences of statements marked as unvectorizable
469 as well, they still can prohibit vectorization. */
471 /* Independent data accesses. */
472 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
478 /* Read-read is OK. */
479 if (DR_IS_READ (dra) && DR_IS_READ (drb))
482 /* If dra and drb are part of the same interleaving chain consider
484 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
485 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
486 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
489 /* Unknown data dependence. */
490 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
494 if (dump_enabled_p ())
496 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
497 "can't determine dependence between ");
498 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
499 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
500 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
503 /* We do not vectorize basic blocks with write-write dependencies. */
504 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
507 /* Check that it's not a load-after-store dependence. */
508 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
509 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
515 if (dump_enabled_p ())
517 dump_printf_loc (MSG_NOTE, vect_location,
518 "determined dependence between ");
519 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
520 dump_printf (MSG_NOTE, " and ");
521 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
524 /* Do not vectorize basic blocks with write-write dependences. */
525 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
528 /* Check dependence between DRA and DRB for basic block vectorization.
529 If the accesses share the same base and offset, we can compare their initial
530 constant offsets to decide whether they differ or not. In case of a read-
531 write dependence we check that the load is before the store to ensure that
532 vectorization will not change the order of the accesses. */
534 HOST_WIDE_INT type_size_a, type_size_b, init_a, init_b;
537 /* Check that the data-refs have the same base and offset.  If not, we can't
538 determine if they are dependent. */
539 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
540 || !dr_equal_offsets_p (dra, drb))
543 /* Check the types. */
544 type_size_a = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))));
545 type_size_b = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
547 if (type_size_a != type_size_b
548 || !types_compatible_p (TREE_TYPE (DR_REF (dra)),
549 TREE_TYPE (DR_REF (drb))))
552 init_a = TREE_INT_CST_LOW (DR_INIT (dra));
553 init_b = TREE_INT_CST_LOW (DR_INIT (drb));
555 /* Two different locations - no dependence. */
556 if (init_a != init_b)
559 /* We have a read-write dependence.  Check that the load is before the store.
560 When we vectorize basic blocks, a vector load can only be placed before the
561 corresponding scalar load, and a vector store can only be placed after its
562 corresponding scalar store.  So the order of the accesses is preserved if
563 the load is before the store. */
564 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
565 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
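/* For illustration only (a sketch): in straight-line code such as

     a[0] = x;
     ...
     y = a[0];

   the load of a[0] depends on the earlier store, so moving grouped loads
   before the stores would change the value read; this is the
   load-after-store case rejected above.  */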
572 /* Function vect_slp_analyze_data_ref_dependences.
574 Examine all the data references in the basic-block, and make sure there
575 do not exist any data dependences between them.  Return FALSE if a
576 dependence that prevents vectorization is found. */
579 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
581 struct data_dependence_relation *ddr;
584 if (dump_enabled_p ())
585 dump_printf_loc (MSG_NOTE, vect_location,
586 "=== vect_slp_analyze_data_ref_dependences ===");
588 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
589 &BB_VINFO_DDRS (bb_vinfo),
593 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
594 if (vect_slp_analyze_data_ref_dependence (ddr))
601 /* Function vect_compute_data_ref_alignment
603 Compute the misalignment of the data reference DR.
606 1. If during the misalignment computation it is found that the data reference
607 cannot be vectorized then false is returned.
608 2. DR_MISALIGNMENT (DR) is defined.
610 FOR NOW: No analysis is actually performed. Misalignment is calculated
611 only for trivial cases. TODO. */
614 vect_compute_data_ref_alignment (struct data_reference *dr)
616 gimple stmt = DR_STMT (dr);
617 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
618 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
619 struct loop *loop = NULL;
620 tree ref = DR_REF (dr);
622 tree base, base_addr;
625 tree aligned_to, alignment;
627 if (dump_enabled_p ())
628 dump_printf_loc (MSG_NOTE, vect_location,
629 "vect_compute_data_ref_alignment:");
632 loop = LOOP_VINFO_LOOP (loop_vinfo);
634 /* Initialize misalignment to unknown. */
635 SET_DR_MISALIGNMENT (dr, -1);
637 /* Strided loads perform only component accesses, misalignment information
638 is irrelevant for them. */
639 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
642 misalign = DR_INIT (dr);
643 aligned_to = DR_ALIGNED_TO (dr);
644 base_addr = DR_BASE_ADDRESS (dr);
645 vectype = STMT_VINFO_VECTYPE (stmt_info);
647 /* In case the dataref is in an inner-loop of the loop that is being
648 vectorized (LOOP), we use the base and misalignment information
649 relative to the outer-loop (LOOP). This is ok only if the misalignment
650 stays the same throughout the execution of the inner-loop, which is why
651 we have to check that the stride of the dataref in the inner-loop is a
652 multiple of the vector size. */
653 if (loop && nested_in_vect_loop_p (loop, stmt))
655 tree step = DR_STEP (dr);
656 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
658 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
660 if (dump_enabled_p ())
661 dump_printf_loc (MSG_NOTE, vect_location,
662 "inner step divides the vector-size.");
663 misalign = STMT_VINFO_DR_INIT (stmt_info);
664 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
665 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
669 if (dump_enabled_p ())
670 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
671 "inner step doesn't divide the vector-size.");
672 misalign = NULL_TREE;
676 /* Similarly, if we're doing basic-block vectorization, we can only use
677 base and misalignment information relative to an innermost loop if the
678 misalignment stays the same throughout the execution of the loop.
679 As above, this is the case if the stride of the dataref is a multiple
680 of the vector size. */
683 tree step = DR_STEP (dr);
684 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
686 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
688 if (dump_enabled_p ())
689 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
690 "SLP: step doesn't divide the vector-size.");
691 misalign = NULL_TREE;
695 base = build_fold_indirect_ref (base_addr);
696 alignment = ssize_int (TYPE_ALIGN (vectype) / BITS_PER_UNIT);
698 if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0)
701 if (dump_enabled_p ())
703 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
704 "Unknown alignment for access: ");
705 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, base);
711 && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base)),
713 || (TREE_CODE (base_addr) == SSA_NAME
714 && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
715 TREE_TYPE (base_addr)))),
717 || (get_pointer_alignment (base_addr) >= TYPE_ALIGN (vectype)))
720 base_aligned = false;
724 /* Do not change the alignment of global variables here if
725 flag_section_anchors is enabled as we already generated
726 RTL for other functions. Most global variables should
727 have been aligned during the IPA increase_alignment pass. */
728 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))
729 || (TREE_STATIC (base) && flag_section_anchors))
731 if (dump_enabled_p ())
733 dump_printf_loc (MSG_NOTE, vect_location,
734 "can't force alignment of ref: ");
735 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
740 /* Force the alignment of the decl.
741 NOTE: This is the only change to the code we make during
742 the analysis phase, before deciding to vectorize the loop. */
743 if (dump_enabled_p ())
745 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
746 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
749 DECL_ALIGN (base) = TYPE_ALIGN (vectype);
750 DECL_USER_ALIGN (base) = 1;
753 /* At this point we assume that the base is aligned. */
754 gcc_assert (base_aligned
755 || (TREE_CODE (base) == VAR_DECL
756 && DECL_ALIGN (base) >= TYPE_ALIGN (vectype)));
758 /* If this is a backward running DR then the first access in the larger
759 vectype actually is N-1 elements before the address in the DR.
760 Adjust misalign accordingly. */
761 if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
763 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
764 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
765 otherwise we wouldn't be here. */
766 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
767 /* PLUS because DR_STEP was negative. */
768 misalign = size_binop (PLUS_EXPR, misalign, offset);
771 /* Modulo alignment. */
772 misalign = size_binop (FLOOR_MOD_EXPR, misalign, alignment);
774 if (!host_integerp (misalign, 1))
776 /* Negative or overflowed misalignment value. */
777 if (dump_enabled_p ())
778 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
779 "unexpected misalign value");
783 SET_DR_MISALIGNMENT (dr, TREE_INT_CST_LOW (misalign));
785 if (dump_enabled_p ())
787 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
788 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
789 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
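/* For illustration only (a sketch; the exact numbers depend on the target):
   given

     float a[N] __attribute__ ((aligned (16)));
     ...
     for (i = 0; i < n; i++)
       sum += a[i + 1];

   with a 16-byte vector type, the base is 16-byte aligned and DR_INIT is 4,
   so the computation above would record DR_MISALIGNMENT == 4 bytes for this
   access.  */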
796 /* Function vect_compute_data_refs_alignment
798 Compute the misalignment of data references in the loop.
799 Return FALSE if a data reference is found that cannot be vectorized. */
802 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
803 bb_vec_info bb_vinfo)
805 vec<data_reference_p> datarefs;
806 struct data_reference *dr;
810 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
812 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
814 FOR_EACH_VEC_ELT (datarefs, i, dr)
815 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
816 && !vect_compute_data_ref_alignment (dr))
820 /* Mark unsupported statement as unvectorizable. */
821 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
832 /* Function vect_update_misalignment_for_peel
834 DR - the data reference whose misalignment is to be adjusted.
835 DR_PEEL - the data reference whose misalignment is being made
836 zero in the vector loop by the peel.
837 NPEEL - the number of iterations in the peel loop if the misalignment
838 of DR_PEEL is known at compile time. */
841 vect_update_misalignment_for_peel (struct data_reference *dr,
842 struct data_reference *dr_peel, int npeel)
845 vec<dr_p> same_align_drs;
846 struct data_reference *current_dr;
847 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
848 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
849 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
850 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
852 /* For interleaved data accesses the step in the loop must be multiplied by
853 the size of the interleaving group. */
854 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
855 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
856 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
857 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
859 /* It can be assumed that the data refs with the same alignment as dr_peel
860 are aligned in the vector loop. */
862 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
863 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
865 if (current_dr != dr)
867 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
868 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
869 SET_DR_MISALIGNMENT (dr, 0);
873 if (known_alignment_for_access_p (dr)
874 && known_alignment_for_access_p (dr_peel))
876 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
877 int misal = DR_MISALIGNMENT (dr);
878 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
879 misal += negative ? -npeel * dr_size : npeel * dr_size;
880 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
881 SET_DR_MISALIGNMENT (dr, misal);
885 if (dump_enabled_p ())
886 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.");
887 SET_DR_MISALIGNMENT (dr, -1);
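/* For illustration only (a sketch): suppose DR has DR_MISALIGNMENT == 8,
   a 4-byte element, a forward step, a 16-byte vector type, and we peel
   NPEEL == 3 iterations to align some other reference.  The update above
   then computes (8 + 3 * 4) & 15 == 4 as the new misalignment of DR.  */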
891 /* Function vect_verify_datarefs_alignment
893 Return TRUE if all data references in the loop can be
894 handled with respect to alignment. */
897 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
899 vec<data_reference_p> datarefs;
900 struct data_reference *dr;
901 enum dr_alignment_support supportable_dr_alignment;
905 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
907 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
909 FOR_EACH_VEC_ELT (datarefs, i, dr)
911 gimple stmt = DR_STMT (dr);
912 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
914 if (!STMT_VINFO_RELEVANT_P (stmt_info))
917 /* For interleaving, only the alignment of the first access matters.
918 Skip statements marked as not vectorizable. */
919 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
920 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
921 || !STMT_VINFO_VECTORIZABLE (stmt_info))
924 /* Strided loads perform only component accesses, alignment is
925 irrelevant for them. */
926 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
929 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
930 if (!supportable_dr_alignment)
932 if (dump_enabled_p ())
935 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
936 "not vectorized: unsupported unaligned load.");
938 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
939 "not vectorized: unsupported unaligned "
942 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
947 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
948 dump_printf_loc (MSG_NOTE, vect_location,
949 "Vectorizing an unaligned access.");
954 /* Given a memory reference EXP, return whether its alignment is less
958 not_size_aligned (tree exp)
960 if (!host_integerp (TYPE_SIZE (TREE_TYPE (exp)), 1))
963 return (TREE_INT_CST_LOW (TYPE_SIZE (TREE_TYPE (exp)))
964 > get_object_alignment (exp));
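/* For illustration only (a sketch): an access for which the function above
   returns true:

     struct __attribute__ ((packed)) s { char c; int i; };
     int get (struct s *p) { return p->i; }

   The field 'i' is 32 bits wide but only guaranteed byte alignment, so its
   size exceeds get_object_alignment of the reference.  */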
967 /* Function vector_alignment_reachable_p
969 Return true if vector alignment for DR is reachable by peeling
970 a few loop iterations. Return false otherwise. */
973 vector_alignment_reachable_p (struct data_reference *dr)
975 gimple stmt = DR_STMT (dr);
976 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
977 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
979 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
981 /* For interleaved accesses we peel only if the number of iterations in
982 the prolog loop (VF - misalignment) is a multiple of the
983 number of interleaved accesses. */
984 int elem_size, mis_in_elements;
985 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
987 /* FORNOW: handle only known alignment. */
988 if (!known_alignment_for_access_p (dr))
991 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
992 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
994 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
998 /* If the misalignment is known at compile time then allow peeling
999 only if natural alignment is reachable through peeling. */
1000 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1002 HOST_WIDE_INT elmsize =
1003 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1004 if (dump_enabled_p ())
1006 dump_printf_loc (MSG_NOTE, vect_location,
1007 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1008 dump_printf (MSG_NOTE,
1009 ". misalignment = %d. ", DR_MISALIGNMENT (dr));
1011 if (DR_MISALIGNMENT (dr) % elmsize)
1013 if (dump_enabled_p ())
1014 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1015 "data size does not divide the misalignment.\n");
1020 if (!known_alignment_for_access_p (dr))
1022 tree type = TREE_TYPE (DR_REF (dr));
1023 bool is_packed = not_size_aligned (DR_REF (dr));
1024 if (dump_enabled_p ())
1025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1026 "Unknown misalignment, is_packed = %d", is_packed);
1027 if (targetm.vectorize.vector_alignment_reachable (type, is_packed))
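/* For illustration only (a sketch): peeling cannot reach alignment for an
   'int' access that sits 2 bytes off a 16-byte boundary; no whole number of
   scalar iterations makes it aligned, which is what the "data size does not
   divide the misalignment" check above rejects.  */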
1037 /* Calculate the cost of the memory access represented by DR. */
1040 vect_get_data_access_cost (struct data_reference *dr,
1041 unsigned int *inside_cost,
1042 unsigned int *outside_cost,
1043 stmt_vector_for_cost *body_cost_vec)
1045 gimple stmt = DR_STMT (dr);
1046 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1047 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1048 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1049 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1050 int ncopies = vf / nunits;
1052 if (DR_IS_READ (dr))
1053 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1054 NULL, body_cost_vec, false);
1056 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1058 if (dump_enabled_p ())
1059 dump_printf_loc (MSG_NOTE, vect_location,
1060 "vect_get_data_access_cost: inside_cost = %d, "
1061 "outside_cost = %d.", *inside_cost, *outside_cost);
1066 vect_peeling_hash (const void *elem)
1068 const struct _vect_peel_info *peel_info;
1070 peel_info = (const struct _vect_peel_info *) elem;
1071 return (hashval_t) peel_info->npeel;
1076 vect_peeling_hash_eq (const void *elem1, const void *elem2)
1078 const struct _vect_peel_info *a, *b;
1080 a = (const struct _vect_peel_info *) elem1;
1081 b = (const struct _vect_peel_info *) elem2;
1082 return (a->npeel == b->npeel);
1086 /* Insert DR into peeling hash table with NPEEL as key. */
1089 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1092 struct _vect_peel_info elem, *slot;
1094 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1097 slot = (vect_peel_info) htab_find (LOOP_VINFO_PEELING_HTAB (loop_vinfo),
1103 slot = XNEW (struct _vect_peel_info);
1104 slot->npeel = npeel;
1107 new_slot = htab_find_slot (LOOP_VINFO_PEELING_HTAB (loop_vinfo), slot,
1112 if (!supportable_dr_alignment && !flag_vect_cost_model)
1113 slot->count += VECT_MAX_COST;
1117 /* Traverse the peeling hash table to find the peeling option that aligns the
1118 maximum number of data accesses. */
1121 vect_peeling_hash_get_most_frequent (void **slot, void *data)
1123 vect_peel_info elem = (vect_peel_info) *slot;
1124 vect_peel_extended_info max = (vect_peel_extended_info) data;
1126 if (elem->count > max->peel_info.count
1127 || (elem->count == max->peel_info.count
1128 && max->peel_info.npeel > elem->npeel))
1130 max->peel_info.npeel = elem->npeel;
1131 max->peel_info.count = elem->count;
1132 max->peel_info.dr = elem->dr;
1139 /* Traverse the peeling hash table and calculate the cost of each peeling option.
1140 Find the one with the lowest cost. */
1143 vect_peeling_hash_get_lowest_cost (void **slot, void *data)
1145 vect_peel_info elem = (vect_peel_info) *slot;
1146 vect_peel_extended_info min = (vect_peel_extended_info) data;
1147 int save_misalignment, dummy;
1148 unsigned int inside_cost = 0, outside_cost = 0, i;
1149 gimple stmt = DR_STMT (elem->dr);
1150 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1151 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1152 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1153 struct data_reference *dr;
1154 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1155 int single_iter_cost;
1157 prologue_cost_vec.create (2);
1158 body_cost_vec.create (2);
1159 epilogue_cost_vec.create (2);
1161 FOR_EACH_VEC_ELT (datarefs, i, dr)
1163 stmt = DR_STMT (dr);
1164 stmt_info = vinfo_for_stmt (stmt);
1165 /* For interleaving, only the alignment of the first access
1167 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1168 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1171 save_misalignment = DR_MISALIGNMENT (dr);
1172 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1173 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1175 SET_DR_MISALIGNMENT (dr, save_misalignment);
1178 single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
1179 outside_cost += vect_get_known_peeling_cost (loop_vinfo, elem->npeel,
1180 &dummy, single_iter_cost,
1182 &epilogue_cost_vec);
1184 /* Prologue and epilogue costs are added to the target model later.
1185 These costs depend only on the scalar iteration cost, the
1186 number of peeling iterations finally chosen, and the number of
1187 misaligned statements. So discard the information found here. */
1188 prologue_cost_vec.release ();
1189 epilogue_cost_vec.release ();
1191 if (inside_cost < min->inside_cost
1192 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1194 min->inside_cost = inside_cost;
1195 min->outside_cost = outside_cost;
1196 min->body_cost_vec.release ();
1197 min->body_cost_vec = body_cost_vec;
1198 min->peel_info.dr = elem->dr;
1199 min->peel_info.npeel = elem->npeel;
1202 body_cost_vec.release ();
1208 /* Choose the best peeling option by traversing the peeling hash table and either
1209 choosing an option with the lowest cost (if cost model is enabled) or the
1210 option that aligns as many accesses as possible. */
1212 static struct data_reference *
1213 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1214 unsigned int *npeel,
1215 stmt_vector_for_cost *body_cost_vec)
1217 struct _vect_peel_extended_info res;
1219 res.peel_info.dr = NULL;
1220 res.body_cost_vec = stmt_vector_for_cost();
1222 if (flag_vect_cost_model)
1224 res.inside_cost = INT_MAX;
1225 res.outside_cost = INT_MAX;
1226 htab_traverse (LOOP_VINFO_PEELING_HTAB (loop_vinfo),
1227 vect_peeling_hash_get_lowest_cost, &res);
1231 res.peel_info.count = 0;
1232 htab_traverse (LOOP_VINFO_PEELING_HTAB (loop_vinfo),
1233 vect_peeling_hash_get_most_frequent, &res);
1236 *npeel = res.peel_info.npeel;
1237 *body_cost_vec = res.body_cost_vec;
1238 return res.peel_info.dr;
1242 /* Function vect_enhance_data_refs_alignment
1244 This pass will use loop versioning and loop peeling in order to enhance
1245 the alignment of data references in the loop.
1247 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1248 original loop is to be vectorized. Any other loops that are created by
1249 the transformations performed in this pass are not supposed to be
1250 vectorized. This restriction will be relaxed.
1252 This pass will require a cost model to guide it whether to apply peeling
1253 or versioning or a combination of the two. For example, the scheme that
1254 Intel uses when given a loop with several memory accesses is as follows:
1255 choose one memory access ('p') whose alignment you want to force by doing
1256 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1257 other accesses are not necessarily aligned, or (2) use loop versioning to
1258 generate one loop in which all accesses are aligned, and another loop in
1259 which only 'p' is necessarily aligned.
1261 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1262 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1263 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1265 Devising a cost model is the most critical aspect of this work. It will
1266 guide us on which access to peel for, whether to use loop versioning, how
1267 many versions to create, etc. The cost model will probably consist of
1268 generic considerations as well as target specific considerations (on
1269 powerpc for example, misaligned stores are more painful than misaligned
1272 Here are the general steps involved in alignment enhancements:
1274 -- original loop, before alignment analysis:
1275 for (i=0; i<N; i++){
1276 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1277 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1280 -- After vect_compute_data_refs_alignment:
1281 for (i=0; i<N; i++){
1282 x = q[i]; # DR_MISALIGNMENT(q) = 3
1283 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1286 -- Possibility 1: we do loop versioning:
1288 for (i=0; i<N; i++){ # loop 1A
1289 x = q[i]; # DR_MISALIGNMENT(q) = 3
1290 p[i] = y; # DR_MISALIGNMENT(p) = 0
1294 for (i=0; i<N; i++){ # loop 1B
1295 x = q[i]; # DR_MISALIGNMENT(q) = 3
1296 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1300 -- Possibility 2: we do loop peeling:
1301 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1305 for (i = 3; i < N; i++){ # loop 2A
1306 x = q[i]; # DR_MISALIGNMENT(q) = 0
1307 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1310 -- Possibility 3: combination of loop peeling and versioning:
1311 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1316 for (i = 3; i<N; i++){ # loop 3A
1317 x = q[i]; # DR_MISALIGNMENT(q) = 0
1318 p[i] = y; # DR_MISALIGNMENT(p) = 0
1322 for (i = 3; i<N; i++){ # loop 3B
1323 x = q[i]; # DR_MISALIGNMENT(q) = 0
1324 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1328 These loops are later passed to loop_transform to be vectorized. The
1329 vectorizer will use the alignment information to guide the transformation
1330 (whether to generate regular loads/stores, or with special handling for
1334 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1336 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1337 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1338 enum dr_alignment_support supportable_dr_alignment;
1339 struct data_reference *dr0 = NULL, *first_store = NULL;
1340 struct data_reference *dr;
1342 bool do_peeling = false;
1343 bool do_versioning = false;
1346 stmt_vec_info stmt_info;
1347 int vect_versioning_for_alias_required;
1348 unsigned int npeel = 0;
1349 bool all_misalignments_unknown = true;
1350 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1351 unsigned possible_npeel_number = 1;
1353 unsigned int nelements, mis, same_align_drs_max = 0;
1354 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost();
1356 if (dump_enabled_p ())
1357 dump_printf_loc (MSG_NOTE, vect_location,
1358 "=== vect_enhance_data_refs_alignment ===");
1360 /* While cost model enhancements are expected in the future, the high level
1361 view of the code at this time is as follows:
1363 A) If there is a misaligned access then see if peeling to align
1364 this access can make all data references satisfy
1365 vect_supportable_dr_alignment. If so, update data structures
1366 as needed and return true.
1368 B) If peeling wasn't possible and there is a data reference with an
1369 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1370 then see if loop versioning checks can be used to make all data
1371 references satisfy vect_supportable_dr_alignment. If so, update
1372 data structures as needed and return true.
1374 C) If neither peeling nor versioning were successful then return false if
1375 any data reference does not satisfy vect_supportable_dr_alignment.
1377 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1379 Note, Possibility 3 above (which is peeling and versioning together) is not
1380 being done at this time. */
1382 /* (1) Peeling to force alignment. */
1384 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1386 + How many accesses will become aligned due to the peeling
1387 - How many accesses will become unaligned due to the peeling,
1388 and the cost of misaligned accesses.
1389 - The cost of peeling (the extra runtime checks, the increase
1392 FOR_EACH_VEC_ELT (datarefs, i, dr)
1394 stmt = DR_STMT (dr);
1395 stmt_info = vinfo_for_stmt (stmt);
1397 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1400 /* For interleaving, only the alignment of the first access
1402 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1403 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1406 /* For invariant accesses there is nothing to enhance. */
1407 if (integer_zerop (DR_STEP (dr)))
1410 /* Strided loads perform only component accesses, alignment is
1411 irrelevant for them. */
1412 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1415 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1416 do_peeling = vector_alignment_reachable_p (dr);
1419 if (known_alignment_for_access_p (dr))
1421 unsigned int npeel_tmp;
1422 bool negative = tree_int_cst_compare (DR_STEP (dr),
1423 size_zero_node) < 0;
1425 /* Save info about DR in the hash table. */
1426 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo))
1427 LOOP_VINFO_PEELING_HTAB (loop_vinfo) =
1428 htab_create (1, vect_peeling_hash,
1429 vect_peeling_hash_eq, free);
1431 vectype = STMT_VINFO_VECTYPE (stmt_info);
1432 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1433 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1434 TREE_TYPE (DR_REF (dr))));
1435 npeel_tmp = (negative
1436 ? (mis - nelements) : (nelements - mis))
1439 /* For multiple types, it is possible that the bigger type access
1440 will have more than one peeling option.  E.g., a loop with two
1441 types: one of size (vector size / 4), and the other one of
1442 size (vector size / 8).  The vectorization factor will be 8.  If both
1443 accesses are misaligned by 3, the first one needs one scalar
1444 iteration to be aligned, and the second one needs 5.  But the
1445 first one will also be aligned by peeling 5 scalar
1446 iterations, and in that case both accesses will be aligned.
1447 Hence, except for the immediate peeling amount, we also want
1448 to try to add a full vector size, as long as we don't exceed the
1449 vectorization factor.
1450 We do this automatically for the cost model, since we calculate the cost
1451 of every peeling option. */
1452 if (!flag_vect_cost_model)
1453 possible_npeel_number = vf / nelements;
1455 /* Handle the aligned case. We may decide to align some other
1456 access, making DR unaligned. */
1457 if (DR_MISALIGNMENT (dr) == 0)
1460 if (!flag_vect_cost_model)
1461 possible_npeel_number++;
1464 for (j = 0; j < possible_npeel_number; j++)
1466 gcc_assert (npeel_tmp <= vf);
1467 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1468 npeel_tmp += nelements;
1471 all_misalignments_unknown = false;
1472 /* Data-ref that was chosen for the case that all the
1473 misalignments are unknown is not relevant anymore, since we
1474 have a data-ref with known alignment. */
1479 /* If we don't know any misalignment values, we prefer
1480 peeling for the data-ref that has the maximum number of data-refs
1481 with the same alignment, unless the target prefers to align
1482 stores over loads. */
1483 if (all_misalignments_unknown)
1485 unsigned same_align_drs
1486 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1488 || same_align_drs_max < same_align_drs)
1490 same_align_drs_max = same_align_drs;
1493 /* For data-refs with the same number of related
1494 accesses prefer the one where the misalign
1495 computation will be invariant in the outermost loop. */
1496 else if (same_align_drs_max == same_align_drs)
1498 struct loop *ivloop0, *ivloop;
1499 ivloop0 = outermost_invariant_loop_for_expr
1500 (loop, DR_BASE_ADDRESS (dr0));
1501 ivloop = outermost_invariant_loop_for_expr
1502 (loop, DR_BASE_ADDRESS (dr));
1503 if ((ivloop && !ivloop0)
1504 || (ivloop && ivloop0
1505 && flow_loop_nested_p (ivloop, ivloop0)))
1509 if (!first_store && DR_IS_WRITE (dr))
1513 /* If there are both known and unknown misaligned accesses in the
1514 loop, we choose peeling amount according to the known
1516 if (!supportable_dr_alignment)
1519 if (!first_store && DR_IS_WRITE (dr))
1526 if (!aligned_access_p (dr))
1528 if (dump_enabled_p ())
1529 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1530 "vector alignment may not be reachable");
1536 vect_versioning_for_alias_required
1537 = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
1539 /* Temporarily, if versioning for alias is required, we disable peeling
1540 until we support peeling and versioning. Often peeling for alignment
1541 will require peeling for loop-bound, which in turn requires that we
1542 know how to adjust the loop ivs after the loop. */
1543 if (vect_versioning_for_alias_required
1544 || !vect_can_advance_ivs_p (loop_vinfo)
1545 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1548 if (do_peeling && all_misalignments_unknown
1549 && vect_supportable_dr_alignment (dr0, false))
1552 /* Check whether the target prefers aligning stores over loads, i.e., whether
1553 misaligned stores are more expensive than misaligned loads (taking
1554 drs with the same alignment into account). */
1555 if (first_store && DR_IS_READ (dr0))
1557 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1558 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1559 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1560 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1561 stmt_vector_for_cost dummy;
1564 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1566 vect_get_data_access_cost (first_store, &store_inside_cost,
1567 &store_outside_cost, &dummy);
1571 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1572 aligning the load DR0). */
1573 load_inside_penalty = store_inside_cost;
1574 load_outside_penalty = store_outside_cost;
1576 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1577 DR_STMT (first_store))).iterate (i, &dr);
1579 if (DR_IS_READ (dr))
1581 load_inside_penalty += load_inside_cost;
1582 load_outside_penalty += load_outside_cost;
1586 load_inside_penalty += store_inside_cost;
1587 load_outside_penalty += store_outside_cost;
1590 /* Calculate the penalty for leaving DR0 unaligned (by
1591 aligning the FIRST_STORE). */
1592 store_inside_penalty = load_inside_cost;
1593 store_outside_penalty = load_outside_cost;
1595 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1596 DR_STMT (dr0))).iterate (i, &dr);
1598 if (DR_IS_READ (dr))
1600 store_inside_penalty += load_inside_cost;
1601 store_outside_penalty += load_outside_cost;
1605 store_inside_penalty += store_inside_cost;
1606 store_outside_penalty += store_outside_cost;
1609 if (load_inside_penalty > store_inside_penalty
1610 || (load_inside_penalty == store_inside_penalty
1611 && load_outside_penalty > store_outside_penalty))
1615 /* In case there are only loads with different unknown misalignments, use
1616 peeling only if it may help to align other accesses in the loop. */
1618 && !STMT_VINFO_SAME_ALIGN_REFS (
1619 vinfo_for_stmt (DR_STMT (dr0))).length ()
1620 && vect_supportable_dr_alignment (dr0, false)
1621 != dr_unaligned_supported)
1625 if (do_peeling && !dr0)
1627 /* Peeling is possible, but there is no data access that is unsupported
1628 when unaligned.  So we try to choose the best possible peeling. */
1630 /* We should get here only if there are drs with known misalignment. */
1631 gcc_assert (!all_misalignments_unknown);
1633 /* Choose the best peeling from the hash table. */
1634 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1642 stmt = DR_STMT (dr0);
1643 stmt_info = vinfo_for_stmt (stmt);
1644 vectype = STMT_VINFO_VECTYPE (stmt_info);
1645 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1647 if (known_alignment_for_access_p (dr0))
1649 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1650 size_zero_node) < 0;
1653 /* Since it's known at compile time, compute the number of
1654 iterations in the peeled loop (the peeling factor) for use in
1655 updating DR_MISALIGNMENT values. The peeling factor is the
1656 vectorization factor minus the misalignment as an element
1658 mis = DR_MISALIGNMENT (dr0);
1659 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1660 npeel = ((negative ? mis - nelements : nelements - mis)
1664 /* For interleaved data access every iteration accesses all the
1665 members of the group, therefore we divide the number of iterations
1666 by the group size. */
1667 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1668 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1669 npeel /= GROUP_SIZE (stmt_info);
1671 if (dump_enabled_p ())
1672 dump_printf_loc (MSG_NOTE, vect_location,
1673 "Try peeling by %d", npeel);
1676 /* Ensure that all data refs can be vectorized after the peel. */
1677 FOR_EACH_VEC_ELT (datarefs, i, dr)
1679 int save_misalignment;
1684 stmt = DR_STMT (dr);
1685 stmt_info = vinfo_for_stmt (stmt);
1686 /* For interleaving, only the alignment of the first access
1688 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1689 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1692 /* Strided loads perform only component accesses, alignment is
1693 irrelevant for them. */
1694 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1697 save_misalignment = DR_MISALIGNMENT (dr);
1698 vect_update_misalignment_for_peel (dr, dr0, npeel);
1699 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1700 SET_DR_MISALIGNMENT (dr, save_misalignment);
1702 if (!supportable_dr_alignment)
1709 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1711 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1716 body_cost_vec.release ();
1723 stmt_info_for_cost *si;
1724 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
1726 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1727 If the misalignment of DR_i is identical to that of dr0 then set
1728 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1729 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1730 by the peeling factor times the element size of DR_i (MOD the
1731 vectorization factor times the size). Otherwise, the
1732 misalignment of DR_i must be set to unknown. */
1733 FOR_EACH_VEC_ELT (datarefs, i, dr)
1735 vect_update_misalignment_for_peel (dr, dr0, npeel);
1737 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1739 LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1741 LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = DR_MISALIGNMENT (dr0);
1742 SET_DR_MISALIGNMENT (dr0, 0);
1743 if (dump_enabled_p ())
1745 dump_printf_loc (MSG_NOTE, vect_location,
1746 "Alignment of access forced using peeling.");
1747 dump_printf_loc (MSG_NOTE, vect_location,
1748 "Peeling for alignment will be applied.");
1750 /* We've delayed passing the inside-loop peeling costs to the
1751 target cost model until we were sure peeling would happen.
1753 if (body_cost_vec.exists ())
1755 FOR_EACH_VEC_ELT (body_cost_vec, i, si)
1757 struct _stmt_vec_info *stmt_info
1758 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1759 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1760 si->misalign, vect_body);
1762 body_cost_vec.release ();
1765 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1771 body_cost_vec.release ();
1773 /* (2) Versioning to force alignment. */
1775 /* Try versioning if:
1776 1) flag_tree_vect_loop_version is TRUE
1777 2) optimize loop for speed
1778 3) there is at least one unsupported misaligned data ref with an unknown
1780 4) all misaligned data refs with a known misalignment are supported, and
1781 5) the number of runtime alignment checks is within reason. */
1784 flag_tree_vect_loop_version
1785 && optimize_loop_nest_for_speed_p (loop)
1786 && (!loop->inner); /* FORNOW */
1790 FOR_EACH_VEC_ELT (datarefs, i, dr)
1792 stmt = DR_STMT (dr);
1793 stmt_info = vinfo_for_stmt (stmt);
1795 /* For interleaving, only the alignment of the first access
1797 if (aligned_access_p (dr)
1798 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1799 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1802 /* Strided loads perform only component accesses, alignment is
1803 irrelevant for them. */
1804 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1807 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1809 if (!supportable_dr_alignment)
1815 if (known_alignment_for_access_p (dr)
1816 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1817 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1819 do_versioning = false;
1823 stmt = DR_STMT (dr);
1824 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1825 gcc_assert (vectype);
1827 /* The rightmost bits of an aligned address must be zeros.
1828 Construct the mask needed for this test. For example,
1829 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1830 mask must be 15 = 0xf. */
1831 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1833 /* FORNOW: use the same mask to test all potentially unaligned
1834 references in the loop. The vectorizer currently supports
1835 a single vector size, see the reference to
1836 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1837 vectorization factor is computed. */
1838 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1839 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1840 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1841 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1846 /* Versioning requires at least one misaligned data reference. */
1847 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1848 do_versioning = false;
1849 else if (!do_versioning)
1850 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1855 vec<gimple> may_misalign_stmts
1856 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1859 /* It can now be assumed that the data references in the statements
1860 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1861 of the loop being vectorized. */
1862 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1864 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1865 dr = STMT_VINFO_DATA_REF (stmt_info);
1866 SET_DR_MISALIGNMENT (dr, 0);
1867 if (dump_enabled_p ())
1868 dump_printf_loc (MSG_NOTE, vect_location,
1869 "Alignment of access forced using versioning.");
1872 if (dump_enabled_p ())
1873 dump_printf_loc (MSG_NOTE, vect_location,
1874 "Versioning for alignment will be applied.");
1876 /* Peeling and versioning can't be done together at this time. */
1877 gcc_assert (! (do_peeling && do_versioning));
1879 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1884 /* This point is reached if neither peeling nor versioning is being done. */
1885 gcc_assert (! (do_peeling || do_versioning));
1887 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1892 /* Function vect_find_same_alignment_drs.
1894 Update group and alignment relations according to the chosen
1895 vectorization factor. */
1898 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1899 loop_vec_info loop_vinfo)
1902 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1903 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1904 struct data_reference *dra = DDR_A (ddr);
1905 struct data_reference *drb = DDR_B (ddr);
1906 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1907 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1908 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1909 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1910 lambda_vector dist_v;
1911 unsigned int loop_depth;
1913 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1919 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1922 /* Loop-based vectorization and known data dependence. */
1923 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1926 /* Data-dependence analysis reports a distance vector of zero
1927 for data-references that overlap only in the first iteration
1928 but have steps of different sign (see PR45764).
1929 So as a sanity check require equal DR_STEP. */
1930 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1933 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1934 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1936 int dist = dist_v[loop_depth];
1938 if (dump_enabled_p ())
1939 dump_printf_loc (MSG_NOTE, vect_location,
1940 "dependence distance = %d.", dist);
1942 /* Same loop iteration. */
1944 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1946 /* Two references with distance zero have the same alignment. */
1947 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1948 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
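/* Editorial example (not GCC code): a concrete case of the test above.
   With vectorization factor 4 and two int references a[i] and a[i + 4]
   (equal DR_STEP, equal element size), the dependence distance is 4;
   since 4 % vf == 0 the accesses are a whole number of vectors apart and
   therefore share the same misalignment:

     void
     shift_by_four (int *a, int n)
     {
       for (int i = 0; i < n - 4; i++)
         a[i] = a[i + 4];        // distance 4 == vf -> same alignment
     }
*/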
1949 if (dump_enabled_p ())
1951 dump_printf_loc (MSG_NOTE, vect_location,
1952 "accesses have the same alignment.");
1953 dump_printf (MSG_NOTE,
1954 "dependence distance modulo vf == 0 between ");
1955 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1956 dump_printf (MSG_NOTE, " and ");
1957 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1964 /* Function vect_analyze_data_refs_alignment
1966 Analyze the alignment of the data-references in the loop.
1967 Return FALSE if a data reference is found that cannot be vectorized. */
1970 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1971 bb_vec_info bb_vinfo)
1973 if (dump_enabled_p ())
1974 dump_printf_loc (MSG_NOTE, vect_location,
1975 "=== vect_analyze_data_refs_alignment ===");
1977 /* Mark groups of data references with same alignment using
1978 data dependence information. */
1981 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
1982 struct data_dependence_relation *ddr;
1985 FOR_EACH_VEC_ELT (ddrs, i, ddr)
1986 vect_find_same_alignment_drs (ddr, loop_vinfo);
1989 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
1991 if (dump_enabled_p ())
1992 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1993 "not vectorized: can't calculate alignment "
2002 /* Analyze groups of accesses: check that DR belongs to a group of
2003 accesses of legal size, step, etc. Detect gaps, single element
2004 interleaving, and other special cases. Set grouped access info.
2005 Collect groups of strided stores for further use in SLP analysis. */
2008 vect_analyze_group_access (struct data_reference *dr)
2010 tree step = DR_STEP (dr);
2011 tree scalar_type = TREE_TYPE (DR_REF (dr));
2012 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2013 gimple stmt = DR_STMT (dr);
2014 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2015 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2016 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2017 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2018 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2019 bool slp_impossible = false;
2020 struct loop *loop = NULL;
2023 loop = LOOP_VINFO_LOOP (loop_vinfo);
2025 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2026 size of the interleaving group (including gaps). */
2027 groupsize = dr_step / type_size;
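/* Editorial example (not GCC code): a store group with no gaps.  Every data
   reference below has DR_STEP == 4 * sizeof (int) == 16 and type_size == 4,
   so GROUPSIZE is 4 -- an interleaving group of four elements per scalar
   iteration:

     void
     splat4 (int *a, int x, int n)
     {
       for (int i = 0; i < n; i++)
         {
           a[4 * i + 0] = x;     // group element 0
           a[4 * i + 1] = x;     // group element 1
           a[4 * i + 2] = x;     // group element 2
           a[4 * i + 3] = x;     // group element 3
         }
     }
*/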
2029 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2030 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2032 /* Check if this DR is part of an interleaving group, and is a single
2033 element of the group that is accessed in the loop. */
2035 /* Gaps are supported only for loads. STEP must be a multiple of the type
2036 size. The size of the group must be a power of 2. */
2038 && (dr_step % type_size) == 0
2040 && exact_log2 (groupsize) != -1)
2042 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2043 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2044 if (dump_enabled_p ())
2046 dump_printf_loc (MSG_NOTE, vect_location,
2047 "Detected single element interleaving ");
2048 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2049 dump_printf (MSG_NOTE, " step ");
2050 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2055 if (dump_enabled_p ())
2056 dump_printf_loc (MSG_NOTE, vect_location,
2057 "Data access with gaps requires scalar "
2061 if (dump_enabled_p ())
2062 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2063 "Peeling for outer loop is not"
2068 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
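/* Editorial example (not GCC code): a loop that takes the
   single-element-interleaving path above.  Only the even elements are
   loaded, so DR_STEP == 2 * sizeof (float), GROUPSIZE == 2 and the odd
   elements form a gap; the gap is why a scalar epilogue is required and
   LOOP_VINFO_PEELING_FOR_GAPS is set:

     float
     sum_even (const float *a, int n)
     {
       float s = 0.0f;
       for (int i = 0; i < n; i++)
         s += a[2 * i];          // one element accessed, one element gap
       return s;
     }
*/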
2074 if (dump_enabled_p ())
2076 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2077 "not consecutive access ");
2078 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2083 /* Mark the statement as unvectorizable. */
2084 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2091 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2093 /* First stmt in the interleaving chain. Check the chain. */
2094 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2095 struct data_reference *data_ref = dr;
2096 unsigned int count = 1;
2098 tree prev_init = DR_INIT (data_ref);
2100 HOST_WIDE_INT diff, count_in_bytes, gaps = 0;
2104 /* Skip same data-refs. In case two or more stmts share a data-ref
2105 (supported only for loads), we vectorize only the first
2106 stmt, and the rest get their vectorized loads from the first
2108 if (!tree_int_cst_compare (DR_INIT (data_ref),
2109 DR_INIT (STMT_VINFO_DATA_REF (
2110 vinfo_for_stmt (next)))))
2112 if (DR_IS_WRITE (data_ref))
2114 if (dump_enabled_p ())
2115 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2116 "Two store stmts share the same dr.");
2120 /* For loads, use the same data-ref load. */
2121 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2124 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2130 /* Check that all the accesses have the same STEP. */
2131 next_step = DR_STEP (STMT_VINFO_DATA_REF (vinfo_for_stmt (next)));
2132 if (tree_int_cst_compare (step, next_step))
2134 if (dump_enabled_p ())
2135 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2136 "not consecutive access in interleaving");
2140 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2141 /* Check that the distance between two accesses is equal to the type
2142 size. Otherwise, we have gaps. */
2143 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2144 - TREE_INT_CST_LOW (prev_init)) / type_size;
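/* Editorial example (not GCC code): a load group with an internal gap.
   For the loads a[3*i] and a[3*i + 2] (int elements, DR_STEP == 12), the
   second member has DR_INIT == 8, so DIFF == (8 - 0) / 4 == 2: there is a
   one-element gap between the members, GROUP_GAP of the second member
   becomes 2 and SLP of the group becomes impossible:

     int
     sum_two_of_three (const int *a, int n)
     {
       int s = 0;
       for (int i = 0; i < n; i++)
         s += a[3 * i] + a[3 * i + 2];   // gap at a[3*i + 1]
       return s;
     }
*/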
2147 /* FORNOW: SLP of accesses with gaps is not supported. */
2148 slp_impossible = true;
2149 if (DR_IS_WRITE (data_ref))
2151 if (dump_enabled_p ())
2152 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2153 "interleaved store with gaps");
2160 last_accessed_element += diff;
2162 /* Store the gap from the previous member of the group. If there is no
2163 gap in the access, GROUP_GAP is always 1. */
2164 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2166 prev_init = DR_INIT (data_ref);
2167 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2168 /* Count the number of data-refs in the chain. */
2172 /* COUNT is the number of accesses found; we multiply it by the size of
2173 the type to get COUNT_IN_BYTES. */
2174 count_in_bytes = type_size * count;
2176 /* Check that the size of the interleaving (including gaps) is not
2177 greater than STEP. */
2178 if (dr_step && dr_step < count_in_bytes + gaps * type_size)
2180 if (dump_enabled_p ())
2182 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2183 "interleaving size is greater than step for ");
2184 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dr));
2189 /* Check that the size of the interleaving is equal to STEP for stores,
2190 i.e., that there are no gaps. */
2191 if (dr_step && dr_step != count_in_bytes)
2193 if (DR_IS_READ (dr))
2195 slp_impossible = true;
2196 /* There is a gap after the last load in the group. This gap is the
2197 difference between the groupsize and the number of elements.
2198 When there is no gap, this difference should be 0. */
2199 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
2203 if (dump_enabled_p ())
2204 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2205 "interleaved store with gaps");
2210 /* Check that STEP is a multiple of type size. */
2211 if (dr_step && (dr_step % type_size) != 0)
2213 if (dump_enabled_p ())
2215 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2216 "step is not a multiple of type size: step ");
2217 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
2218 dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
2219 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2220 TYPE_SIZE_UNIT (scalar_type));
2228 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2229 if (dump_enabled_p ())
2230 dump_printf_loc (MSG_NOTE, vect_location,
2231 "Detected interleaving of size %d", (int)groupsize);
2233 /* SLP: create an SLP data structure for every interleaving group of
2234 stores for further analysis in vect_analyse_slp. */
2235 if (DR_IS_WRITE (dr) && !slp_impossible)
2238 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2240 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2243 /* There is a gap at the end of the group. */
2244 if (groupsize - last_accessed_element > 0 && loop_vinfo)
2246 if (dump_enabled_p ())
2247 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2248 "Data access with gaps requires scalar "
2252 if (dump_enabled_p ())
2253 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2254 "Peeling for outer loop is not supported");
2258 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2266 /* Analyze the access pattern of the data-reference DR.
2267 In case of non-consecutive accesses call vect_analyze_group_access() to
2268 analyze groups of accesses. */
2271 vect_analyze_data_ref_access (struct data_reference *dr)
2273 tree step = DR_STEP (dr);
2274 tree scalar_type = TREE_TYPE (DR_REF (dr));
2275 gimple stmt = DR_STMT (dr);
2276 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2277 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2278 struct loop *loop = NULL;
2281 loop = LOOP_VINFO_LOOP (loop_vinfo);
2283 if (loop_vinfo && !step)
2285 if (dump_enabled_p ())
2286 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2287 "bad data-ref access in loop");
2291 /* Allow invariant loads in loops. */
2292 if (loop_vinfo && integer_zerop (step))
2294 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2295 return DR_IS_READ (dr);
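/* Editorial example (not GCC code): an invariant load with zero step.
   The read of *scale below has DR_STEP == 0; it is accepted here because
   it is a load, whereas a zero-step store would make the access
   unvectorizable:

     void
     scale_all (float *out, const float *in, const float *scale, int n)
     {
       for (int i = 0; i < n; i++)
         out[i] = *scale * in[i];        // *scale is invariant in the loop
     }
*/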
2298 if (loop && nested_in_vect_loop_p (loop, stmt))
2300 /* Interleaved accesses are not yet supported within outer-loop
2301 vectorization for references in the inner-loop. */
2302 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2304 /* For the rest of the analysis we use the outer-loop step. */
2305 step = STMT_VINFO_DR_STEP (stmt_info);
2306 if (integer_zerop (step))
2308 if (dump_enabled_p ())
2309 dump_printf_loc (MSG_NOTE, vect_location,
2310 "zero step in outer loop.");
2311 if (DR_IS_READ (dr))
2319 if (TREE_CODE (step) == INTEGER_CST)
2321 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2322 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2324 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2326 /* Mark that it is not interleaving. */
2327 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2332 if (loop && nested_in_vect_loop_p (loop, stmt))
2334 if (dump_enabled_p ())
2335 dump_printf_loc (MSG_NOTE, vect_location,
2336 "grouped access in outer loop.");
2340 /* Assume this is a DR handled by the non-constant strided load case. */
2341 if (TREE_CODE (step) != INTEGER_CST)
2342 return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
2344 /* Not a consecutive access - check whether it is part of an interleaving group. */
2345 return vect_analyze_group_access (dr);
2348 /* Comparison function for sorting data-references DRA and DRB, so that
2349 related accesses end up adjacent, in chunks suitable for grouping. */
2352 dr_group_sort_cmp (const void *dra_, const void *drb_)
2354 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2355 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2359 /* Stabilize sort. */
2363 /* Ordering of DRs according to base. */
2364 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2366 h1 = iterative_hash_expr (DR_BASE_ADDRESS (dra), 0);
2367 h2 = iterative_hash_expr (DR_BASE_ADDRESS (drb), 0);
2369 return h1 < h2 ? -1 : 1;
2372 /* And according to DR_OFFSET. */
2373 if (!dr_equal_offsets_p (dra, drb))
2375 h1 = iterative_hash_expr (DR_OFFSET (dra), 0);
2376 h2 = iterative_hash_expr (DR_OFFSET (drb), 0);
2378 return h1 < h2 ? -1 : 1;
2381 /* Put reads before writes. */
2382 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2383 return DR_IS_READ (dra) ? -1 : 1;
2385 /* Then sort after access size. */
2386 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2387 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2389 h1 = iterative_hash_expr (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))), 0);
2390 h2 = iterative_hash_expr (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0);
2392 return h1 < h2 ? -1 : 1;
2395 /* And after step. */
2396 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2398 h1 = iterative_hash_expr (DR_STEP (dra), 0);
2399 h2 = iterative_hash_expr (DR_STEP (drb), 0);
2401 return h1 < h2 ? -1 : 1;
2404 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2405 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2407 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2411 /* Function vect_analyze_data_ref_accesses.
2413 Analyze the access pattern of all the data references in the loop.
2415 FORNOW: the only access pattern that is considered vectorizable is a
2416 simple step 1 (consecutive) access.
2418 FORNOW: handle only arrays and pointer accesses. */
2421 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2424 vec<data_reference_p> datarefs;
2425 struct data_reference *dr;
2427 if (dump_enabled_p ())
2428 dump_printf_loc (MSG_NOTE, vect_location,
2429 "=== vect_analyze_data_ref_accesses ===");
2432 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2434 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2436 if (datarefs.is_empty ())
2439 /* Sort the array of datarefs to make building the interleaving chains
2441 qsort (datarefs.address(), datarefs.length (),
2442 sizeof (data_reference_p), dr_group_sort_cmp);
2444 /* Build the interleaving chains. */
2445 for (i = 0; i < datarefs.length () - 1;)
2447 data_reference_p dra = datarefs[i];
2448 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2449 stmt_vec_info lastinfo = NULL;
2450 for (i = i + 1; i < datarefs.length (); ++i)
2452 data_reference_p drb = datarefs[i];
2453 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2455 /* ??? Imperfect sorting (non-compatible types, non-modulo
2456 accesses, same accesses) can lead to a group being artificially
2457 split here as we don't just skip over those. If it really
2458 matters we can push those to a worklist and re-iterate
2459 over them. Then we can just skip ahead to the next DR here. */
2461 /* Check that the data-refs have same first location (except init)
2462 and they are both either store or load (not load and store). */
2463 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2464 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2465 DR_BASE_ADDRESS (drb), 0)
2466 || !dr_equal_offsets_p (dra, drb))
2469 /* Check that the data-refs have the same constant size and step. */
2470 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2471 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2472 if (!host_integerp (sza, 1)
2473 || !host_integerp (szb, 1)
2474 || !tree_int_cst_equal (sza, szb)
2475 || !host_integerp (DR_STEP (dra), 0)
2476 || !host_integerp (DR_STEP (drb), 0)
2477 || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb)))
2480 /* Do not place the same access in the interleaving chain twice. */
2481 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2484 /* Check the types are compatible.
2485 ??? We don't distinguish this during sorting. */
2486 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2487 TREE_TYPE (DR_REF (drb))))
2490 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2491 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2492 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2493 gcc_assert (init_a < init_b);
2495 /* If init_b == init_a + the size of the type * k, we have an
2496 interleaving, and DRA is accessed before DRB. */
2497 HOST_WIDE_INT type_size_a = TREE_INT_CST_LOW (sza);
2498 if ((init_b - init_a) % type_size_a != 0)
2501 /* Require the step (if not zero) to be greater than the difference
2502 between the data-refs' inits; this splits groups into suitable sizes. */
2503 HOST_WIDE_INT step = TREE_INT_CST_LOW (DR_STEP (dra));
2504 if (step != 0 && step <= (init_b - init_a))
2507 if (dump_enabled_p ())
2509 dump_printf_loc (MSG_NOTE, vect_location,
2510 "Detected interleaving ");
2511 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2512 dump_printf (MSG_NOTE, " and ");
2513 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2516 /* Link the found element into the group list. */
2517 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2519 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2520 lastinfo = stmtinfo_a;
2522 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2523 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2524 lastinfo = stmtinfo_b;
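/* Editorial example (not GCC code): two stores that the code above chains
   into one interleaving group.  They share base, offset, element size and
   step (8 bytes for int), and their DR_INITs differ by 4, a multiple of
   the element size that is smaller than the step:

     void
     interleave (int *a, const int *x, const int *y, int n)
     {
       for (int i = 0; i < n; i++)
         {
           a[2 * i]     = x[i];  // DRA, DR_INIT 0, becomes the group leader
           a[2 * i + 1] = y[i];  // DRB, DR_INIT 4, linked after DRA
         }
     }
*/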
2528 FOR_EACH_VEC_ELT (datarefs, i, dr)
2529 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2530 && !vect_analyze_data_ref_access (dr))
2532 if (dump_enabled_p ())
2533 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2534 "not vectorized: complicated access pattern.");
2538 /* Mark the statement as not vectorizable. */
2539 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2549 /* Function vect_prune_runtime_alias_test_list.
2551 Prune a list of ddrs to be tested at run-time by versioning for alias.
2552 Return FALSE if the resulting list of ddrs is longer than allowed by
2553 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
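/* Editorial sketch (not GCC code): what each surviving ddr is turned into.
   Versioning for alias guards the vectorized loop copy with one run-time
   test per remaining ddr, checking that the two accessed ranges do not
   overlap; conceptually, for two ranges of LEN bytes starting at A and B:

     // Hypothetical illustration of the generated condition.
     static int
     ranges_disjoint_p (const char *a, const char *b, long len)
     {
       return a + len <= b || b + len <= a;
     }

   Pruning ddrs with equal ranges keeps the number of such checks within
   PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS.  */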
2556 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2559 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2562 if (dump_enabled_p ())
2563 dump_printf_loc (MSG_NOTE, vect_location,
2564 "=== vect_prune_runtime_alias_test_list ===");
2566 for (i = 0; i < ddrs.length (); )
2574 for (j = 0; j < i; j++)
2576 ddr_p ddr_j = ddrs[j];
2578 if (vect_vfa_range_equal (ddr_i, ddr_j))
2580 if (dump_enabled_p ())
2582 dump_printf_loc (MSG_NOTE, vect_location,
2583 "found equal ranges ");
2584 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr_i)));
2585 dump_printf (MSG_NOTE, ", ");
2586 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr_i)));
2587 dump_printf (MSG_NOTE, " and ");
2588 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr_j)));
2589 dump_printf (MSG_NOTE, ", ");
2590 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr_j)));
2599 ddrs.ordered_remove (i);
2605 if (ddrs.length () >
2606 (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2608 if (dump_enabled_p ())
2610 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2611 "disable versioning for alias - max number of "
2612 "generated checks exceeded.");
2615 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).truncate (0);
2623 /* Check whether a non-affine read in stmt is suitable for gather load
2624 and if so, return a builtin decl for that operation. */
2627 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2628 tree *offp, int *scalep)
2630 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2631 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2632 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2633 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2634 tree offtype = NULL_TREE;
2635 tree decl, base, off;
2636 enum machine_mode pmode;
2637 int punsignedp, pvolatilep;
2639 /* The gather builtins need address of the form
2640 loop_invariant + vector * {1, 2, 4, 8}
2642 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
2643 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
2644 of loop invariants/SSA_NAMEs defined in the loop, with casts,
2645 multiplications and additions in it. To get a vector, we need
2646 a single SSA_NAME that will be defined in the loop and will
2647 contain everything that is not loop invariant and that can be
2648 vectorized. The following code attempts to find such a preexisting
2649 SSA_NAME OFF and put the loop invariants into a tree BASE
2650 that can be gimplified before the loop. */
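/* Editorial example (not GCC code): a typical non-affine read that this
   analysis tries to decompose into BASE + OFF * SCALE for a gather load:

     void
     gather_add (double *restrict out, const double *table,
                 const int *idx, int n)
     {
       for (int i = 0; i < n; i++)
         out[i] += table[idx[i]];   // table + sign_extend (idx[i]) * 8
     }

   Here TABLE is the loop-invariant BASE, the value loaded from idx[i] is
   the SSA_NAME defined in the loop that becomes OFF, and SCALE is
   sizeof (double) == 8.  */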
2651 base = get_inner_reference (DR_REF (dr), &pbitsize, &pbitpos, &off,
2652 &pmode, &punsignedp, &pvolatilep, false);
2653 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
2655 if (TREE_CODE (base) == MEM_REF)
2657 if (!integer_zerop (TREE_OPERAND (base, 1)))
2659 if (off == NULL_TREE)
2661 double_int moff = mem_ref_offset (base);
2662 off = double_int_to_tree (sizetype, moff);
2665 off = size_binop (PLUS_EXPR, off,
2666 fold_convert (sizetype, TREE_OPERAND (base, 1)));
2668 base = TREE_OPERAND (base, 0);
2671 base = build_fold_addr_expr (base);
2673 if (off == NULL_TREE)
2674 off = size_zero_node;
2676 /* If BASE is not loop invariant, we can only proceed when OFF is zero;
2677 in that case we continue with the old BASE as OFF and with just the
2678 constant offset in the loop-invariant BASE. Otherwise give up.
2679 We could handle that case by gimplifying the addition of base + off
2680 into some SSA_NAME and use that as off, but for now punt. */
2681 if (!expr_invariant_in_loop_p (loop, base))
2683 if (!integer_zerop (off))
2686 base = size_int (pbitpos / BITS_PER_UNIT);
2688 /* Otherwise put base + constant offset into the loop invariant BASE
2689 and continue with OFF. */
2692 base = fold_convert (sizetype, base);
2693 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
2696 /* OFF at this point may be either a SSA_NAME or some tree expression
2697 from get_inner_reference. Try to peel off loop invariants from it
2698 into BASE as long as possible. */
2700 while (offtype == NULL_TREE)
2702 enum tree_code code;
2703 tree op0, op1, add = NULL_TREE;
2705 if (TREE_CODE (off) == SSA_NAME)
2707 gimple def_stmt = SSA_NAME_DEF_STMT (off);
2709 if (expr_invariant_in_loop_p (loop, off))
2712 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
2715 op0 = gimple_assign_rhs1 (def_stmt);
2716 code = gimple_assign_rhs_code (def_stmt);
2717 op1 = gimple_assign_rhs2 (def_stmt);
2721 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
2723 code = TREE_CODE (off);
2724 extract_ops_from_tree (off, &code, &op0, &op1);
2728 case POINTER_PLUS_EXPR:
2730 if (expr_invariant_in_loop_p (loop, op0))
2735 add = fold_convert (sizetype, add);
2737 add = size_binop (MULT_EXPR, add, size_int (scale));
2738 base = size_binop (PLUS_EXPR, base, add);
2741 if (expr_invariant_in_loop_p (loop, op1))
2749 if (expr_invariant_in_loop_p (loop, op1))
2751 add = fold_convert (sizetype, op1);
2752 add = size_binop (MINUS_EXPR, size_zero_node, add);
2758 if (scale == 1 && host_integerp (op1, 0))
2760 scale = tree_low_cst (op1, 0);
2769 if (!POINTER_TYPE_P (TREE_TYPE (op0))
2770 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
2772 if (TYPE_PRECISION (TREE_TYPE (op0))
2773 == TYPE_PRECISION (TREE_TYPE (off)))
2778 if (TYPE_PRECISION (TREE_TYPE (op0))
2779 < TYPE_PRECISION (TREE_TYPE (off)))
2782 offtype = TREE_TYPE (off);
2793 /* If at the end OFF still isn't a SSA_NAME or isn't
2794 defined in the loop, punt. */
2795 if (TREE_CODE (off) != SSA_NAME
2796 || expr_invariant_in_loop_p (loop, off))
2799 if (offtype == NULL_TREE)
2800 offtype = TREE_TYPE (off);
2802 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
2804 if (decl == NULL_TREE)
2816 /* Function vect_analyze_data_refs.
2818 Find all the data references in the loop or basic block.
2820 The general structure of the analysis of data refs in the vectorizer is as
2822 1- vect_analyze_data_refs(loop/bb): call
2823 compute_data_dependences_for_loop/bb to find and analyze all data-refs
2824 in the loop/bb and their dependences.
2825 2- vect_analyze_dependences(): apply dependence testing using ddrs.
2826 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
2827 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
2832 vect_analyze_data_refs (loop_vec_info loop_vinfo,
2833 bb_vec_info bb_vinfo,
2836 struct loop *loop = NULL;
2837 basic_block bb = NULL;
2839 vec<data_reference_p> datarefs;
2840 struct data_reference *dr;
2843 if (dump_enabled_p ())
2844 dump_printf_loc (MSG_NOTE, vect_location,
2845 "=== vect_analyze_data_refs ===\n");
2849 loop = LOOP_VINFO_LOOP (loop_vinfo);
2850 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo))
2851 || find_data_references_in_loop
2852 (loop, &LOOP_VINFO_DATAREFS (loop_vinfo)))
2854 if (dump_enabled_p ())
2855 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2856 "not vectorized: loop contains function calls"
2857 " or data references that cannot be analyzed");
2861 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2865 gimple_stmt_iterator gsi;
2867 bb = BB_VINFO_BB (bb_vinfo);
2868 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2870 gimple stmt = gsi_stmt (gsi);
2871 if (!find_data_references_in_stmt (NULL, stmt,
2872 &BB_VINFO_DATAREFS (bb_vinfo)))
2874 /* Mark the rest of the basic-block as unvectorizable. */
2875 for (; !gsi_end_p (gsi); gsi_next (&gsi))
2877 stmt = gsi_stmt (gsi);
2878 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
2884 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2887 /* Go through the data-refs, check that the analysis succeeded. Update
2888 pointer from stmt_vec_info struct to DR and vectype. */
2890 FOR_EACH_VEC_ELT (datarefs, i, dr)
2893 stmt_vec_info stmt_info;
2894 tree base, offset, init;
2895 bool gather = false;
2898 if (!dr || !DR_REF (dr))
2900 if (dump_enabled_p ())
2901 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2902 "not vectorized: unhandled data-ref ");
2906 stmt = DR_STMT (dr);
2907 stmt_info = vinfo_for_stmt (stmt);
2909 /* Check that analysis of the data-ref succeeded. */
2910 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
2913 /* If target supports vector gather loads, see if they can't
2917 && !TREE_THIS_VOLATILE (DR_REF (dr))
2918 && targetm.vectorize.builtin_gather != NULL
2919 && !nested_in_vect_loop_p (loop, stmt))
2921 struct data_reference *newdr
2922 = create_data_ref (NULL, loop_containing_stmt (stmt),
2923 DR_REF (dr), stmt, true);
2924 gcc_assert (newdr != NULL && DR_REF (newdr));
2925 if (DR_BASE_ADDRESS (newdr)
2926 && DR_OFFSET (newdr)
2929 && integer_zerop (DR_STEP (newdr)))
2935 free_data_ref (newdr);
2940 if (dump_enabled_p ())
2942 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2943 "not vectorized: data ref analysis "
2945 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2955 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
2957 if (dump_enabled_p ())
2958 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2959 "not vectorized: base addr of dr is a "
2970 if (TREE_THIS_VOLATILE (DR_REF (dr)))
2972 if (dump_enabled_p ())
2974 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2975 "not vectorized: volatile type ");
2976 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2985 if (stmt_can_throw_internal (stmt))
2987 if (dump_enabled_p ())
2989 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2990 "not vectorized: statement can throw an "
2992 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3003 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3004 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3006 if (dump_enabled_p ())
3008 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3009 "not vectorized: statement is bitfield "
3011 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3022 base = unshare_expr (DR_BASE_ADDRESS (dr));
3023 offset = unshare_expr (DR_OFFSET (dr));
3024 init = unshare_expr (DR_INIT (dr));
3026 if (is_gimple_call (stmt))
3028 if (dump_enabled_p ())
3030 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3031 "not vectorized: dr in a call ");
3032 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3043 /* Update DR field in stmt_vec_info struct. */
3045 /* If the dataref is in an inner-loop of the loop that is considered for
3046 vectorization, we also want to analyze the access relative to
3047 the outer-loop (DR contains information only relative to the
3048 inner-most enclosing loop). We do that by building a reference to the
3049 first location accessed by the inner-loop, and analyze it relative to
3051 if (loop && nested_in_vect_loop_p (loop, stmt))
3053 tree outer_step, outer_base, outer_init;
3054 HOST_WIDE_INT pbitsize, pbitpos;
3056 enum machine_mode pmode;
3057 int punsignedp, pvolatilep;
3058 affine_iv base_iv, offset_iv;
3061 /* Build a reference to the first location accessed by the
3062 inner-loop: *(BASE+INIT). (The first location is actually
3063 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3064 tree inner_base = build_fold_indirect_ref
3065 (fold_build_pointer_plus (base, init));
3067 if (dump_enabled_p ())
3069 dump_printf_loc (MSG_NOTE, vect_location,
3070 "analyze in outer-loop: ");
3071 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3074 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3075 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3076 gcc_assert (outer_base != NULL_TREE);
3078 if (pbitpos % BITS_PER_UNIT != 0)
3080 if (dump_enabled_p ())
3081 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3082 "failed: bit offset alignment.\n");
3086 outer_base = build_fold_addr_expr (outer_base);
3087 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3090 if (dump_enabled_p ())
3091 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3092 "failed: evolution of base is not affine.\n");
3099 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3107 offset_iv.base = ssize_int (0);
3108 offset_iv.step = ssize_int (0);
3110 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3113 if (dump_enabled_p ())
3114 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3115 "evolution of offset is not affine.\n");
3119 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3120 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3121 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3122 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3123 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3125 outer_step = size_binop (PLUS_EXPR,
3126 fold_convert (ssizetype, base_iv.step),
3127 fold_convert (ssizetype, offset_iv.step));
3129 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3130 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3131 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3132 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3133 STMT_VINFO_DR_OFFSET (stmt_info) =
3134 fold_convert (ssizetype, offset_iv.base);
3135 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3136 size_int (highest_pow2_factor (offset_iv.base));
3138 if (dump_enabled_p ())
3140 dump_printf_loc (MSG_NOTE, vect_location,
3141 "\touter base_address: ");
3142 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3143 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3144 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3145 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3146 STMT_VINFO_DR_OFFSET (stmt_info));
3147 dump_printf (MSG_NOTE,
3148 "\n\touter constant offset from base address: ");
3149 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3150 STMT_VINFO_DR_INIT (stmt_info));
3151 dump_printf (MSG_NOTE, "\n\touter step: ");
3152 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3153 STMT_VINFO_DR_STEP (stmt_info));
3154 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3155 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3156 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3160 if (STMT_VINFO_DATA_REF (stmt_info))
3162 if (dump_enabled_p ())
3164 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3165 "not vectorized: more than one data ref "
3167 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3178 STMT_VINFO_DATA_REF (stmt_info) = dr;
3180 /* Set vectype for STMT. */
3181 scalar_type = TREE_TYPE (DR_REF (dr));
3182 STMT_VINFO_VECTYPE (stmt_info) =
3183 get_vectype_for_scalar_type (scalar_type);
3184 if (!STMT_VINFO_VECTYPE (stmt_info))
3186 if (dump_enabled_p ())
3188 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3189 "not vectorized: no vectype for stmt: ");
3190 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3191 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3192 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3201 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3208 if (dump_enabled_p ())
3210 dump_printf_loc (MSG_NOTE, vect_location,
3211 "got vectype for stmt: ");
3212 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3213 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3214 STMT_VINFO_VECTYPE (stmt_info));
3218 /* Adjust the minimal vectorization factor according to the
3220 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
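/* Editorial example (not GCC code): how the data references constrain the
   minimal vectorization factor.  Assuming 16-byte vectors, the float
   reference below maps to V4SF (4 subparts) and the double reference to
   V2DF (2 subparts), so the loop's vectorization factor must be at
   least 4:

     void
     mixed (double *restrict d, const float *restrict f, int n)
     {
       for (int i = 0; i < n; i++)
         d[i] += f[i];   // f[i]: 4 elements per vector; d[i]: 2
     }
*/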
3228 gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3230 && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3234 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3236 if (dump_enabled_p ())
3238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3239 "not vectorized: not suitable for gather "
3241 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3247 STMT_VINFO_GATHER_P (stmt_info) = true;
3250 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3252 if (nested_in_vect_loop_p (loop, stmt)
3253 || !DR_IS_READ (dr))
3255 if (dump_enabled_p ())
3257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3258 "not vectorized: not suitable for strided "
3260 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3264 STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
3268 /* If, when trying to vectorize a basic block, we stopped the analysis at
3269 the first dataref we could not analyze, mark the rest of the datarefs
3270 as not vectorizable and truncate the vector of datarefs. That
3271 avoids spending useless time in analyzing their dependence. */
3272 if (i != datarefs.length ())
3274 gcc_assert (bb_vinfo != NULL);
3275 for (unsigned j = i; j < datarefs.length (); ++j)
3277 data_reference_p dr = datarefs[j];
3278 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3281 datarefs.truncate (i);
3288 /* Function vect_get_new_vect_var.
3290 Returns a name for a new variable. The current naming scheme appends the
3291 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3292 the name of vectorizer generated variables, and appends that to NAME if
3296 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3303 case vect_simple_var:
3306 case vect_scalar_var:
3309 case vect_pointer_var:
3318 char* tmp = concat (prefix, "_", name, NULL);
3319 new_vect_var = create_tmp_reg (type, tmp);
3323 new_vect_var = create_tmp_reg (type, prefix);
3325 return new_vect_var;
3329 /* Function vect_create_addr_base_for_vector_ref.
3331 Create an expression that computes the address of the first memory location
3332 that will be accessed for a data reference.
3335 STMT: The statement containing the data reference.
3336 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3337 OFFSET: Optional. If supplied, it is added to the initial address.
3338 LOOP: Specify relative to which loop-nest should the address be computed.
3339 For example, when the dataref is in an inner-loop nested in an
3340 outer-loop that is now being vectorized, LOOP can be either the
3341 outer-loop, or the inner-loop. The first memory location accessed
3342 by the following dataref ('in' points to short):
3349 if LOOP=i_loop: &in (relative to i_loop)
3350 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3353 1. Return an SSA_NAME whose value is the address of the memory location of
3354 the first vector of the data reference.
3355 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3356 these statement(s) which define the returned SSA_NAME.
3358 FORNOW: We are only handling array accesses with step 1. */
3361 vect_create_addr_base_for_vector_ref (gimple stmt,
3362 gimple_seq *new_stmt_list,
3366 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3367 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3369 const char *base_name;
3372 gimple_seq seq = NULL;
3376 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3377 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3379 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3381 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3383 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3385 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3386 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3387 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3391 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3392 base_offset = unshare_expr (DR_OFFSET (dr));
3393 init = unshare_expr (DR_INIT (dr));
3397 base_name = get_name (data_ref_base);
3400 base_offset = ssize_int (0);
3401 init = ssize_int (0);
3402 base_name = get_name (DR_REF (dr));
3405 /* Create base_offset */
3406 base_offset = size_binop (PLUS_EXPR,
3407 fold_convert (sizetype, base_offset),
3408 fold_convert (sizetype, init));
3412 offset = fold_build2 (MULT_EXPR, sizetype,
3413 fold_convert (sizetype, offset), step);
3414 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3415 base_offset, offset);
3418 /* base + base_offset */
3420 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3423 addr_base = build1 (ADDR_EXPR,
3424 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3425 unshare_expr (DR_REF (dr)));
3428 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3429 addr_base = fold_convert (vect_ptr_type, addr_base);
3430 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3431 addr_base = force_gimple_operand (addr_base, &seq, false, dest);
3432 gimple_seq_add_seq (new_stmt_list, seq);
3434 if (DR_PTR_INFO (dr)
3435 && TREE_CODE (addr_base) == SSA_NAME)
3437 duplicate_ssa_name_ptr_info (addr_base, DR_PTR_INFO (dr));
3439 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3442 if (dump_enabled_p ())
3444 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3445 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3452 /* Function vect_create_data_ref_ptr.
3454 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3455 location accessed in the loop by STMT, along with the def-use update
3456 chain to appropriately advance the pointer through the loop iterations.
3457 Also set aliasing information for the pointer. This pointer is used by
3458 the callers to this function to create a memory reference expression for
3459 vector load/store access.
3462 1. STMT: a stmt that references memory. Expected to be of the form
3463 GIMPLE_ASSIGN <name, data-ref> or
3464 GIMPLE_ASSIGN <data-ref, name>.
3465 2. AGGR_TYPE: the type of the reference, which should be either a vector
3467 3. AT_LOOP: the loop where the vector memref is to be created.
3468 4. OFFSET (optional): an offset to be added to the initial address accessed
3469 by the data-ref in STMT.
3470 5. BSI: location where the new stmts are to be placed if there is no loop
3471 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
3472 pointing to the initial address.
3475 1. Declare a new ptr to vector_type, and have it point to the base of the
3476 data reference (the initial address accessed by the data reference).
3477 For example, for vector of type V8HI, the following code is generated:
3480 ap = (v8hi *)initial_address;
3482 if OFFSET is not supplied:
3483 initial_address = &a[init];
3484 if OFFSET is supplied:
3485 initial_address = &a[init + OFFSET];
3487 Return the initial_address in INITIAL_ADDRESS.
3489 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
3490 update the pointer in each iteration of the loop.
3492 Return the increment stmt that updates the pointer in PTR_INCR.
3494 3. Set INV_P to true if the access pattern of the data reference in the
3495 vectorized loop is invariant. Set it to false otherwise.
3497 4. Return the pointer. */
3500 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
3501 tree offset, tree *initial_address,
3502 gimple_stmt_iterator *gsi, gimple *ptr_incr,
3503 bool only_init, bool *inv_p)
3505 const char *base_name;
3506 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3507 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3508 struct loop *loop = NULL;
3509 bool nested_in_vect_loop = false;
3510 struct loop *containing_loop = NULL;
3515 gimple_seq new_stmt_list = NULL;
3519 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3521 gimple_stmt_iterator incr_gsi;
3524 tree indx_before_incr, indx_after_incr;
3527 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
3529 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
3530 || TREE_CODE (aggr_type) == VECTOR_TYPE);
3534 loop = LOOP_VINFO_LOOP (loop_vinfo);
3535 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
3536 containing_loop = (gimple_bb (stmt))->loop_father;
3537 pe = loop_preheader_edge (loop);
3541 gcc_assert (bb_vinfo);
3546 /* Check the step (evolution) of the load in LOOP, and record
3547 whether it's invariant. */
3548 if (nested_in_vect_loop)
3549 step = STMT_VINFO_DR_STEP (stmt_info);
3551 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
3553 if (tree_int_cst_compare (step, size_zero_node) == 0)
3557 negative = tree_int_cst_compare (step, size_zero_node) < 0;
3559 /* Create an expression for the first address accessed by this load
3561 base_name = get_name (DR_BASE_ADDRESS (dr));
3563 if (dump_enabled_p ())
3565 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
3566 dump_printf_loc (MSG_NOTE, vect_location,
3567 "create %s-pointer variable to type: ",
3568 tree_code_name[(int) TREE_CODE (aggr_type)]);
3569 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
3570 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
3571 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
3572 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
3573 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
3574 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
3575 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
3577 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
3578 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
3581 /* (1) Create the new aggregate-pointer variable.
3582 Vector and array types inherit the alias set of their component
3583 type by default so we need to use a ref-all pointer if the data
3584 reference does not conflict with the created aggregated data
3585 reference because it is not addressable. */
3586 bool need_ref_all = false;
3587 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
3588 get_alias_set (DR_REF (dr))))
3589 need_ref_all = true;
3590 /* Likewise for any of the data references in the stmt group. */
3591 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
3593 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
3596 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
3597 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
3598 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
3599 get_alias_set (DR_REF (sdr))))
3601 need_ref_all = true;
3604 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
3608 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
3610 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
3613 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
3614 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
3615 def-use update cycles for the pointer: one relative to the outer-loop
3616 (LOOP), which is what steps (3) and (4) below do. The other is relative
3617 to the inner-loop (which is the inner-most loop containing the dataref),
3618 and this is done by step (5) below.
3620 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
3621 inner-most loop, and so steps (3),(4) work the same, and step (5) is
3622 redundant. Steps (3),(4) create the following:
3625 LOOP: vp1 = phi(vp0,vp2)
3631 If there is an inner-loop nested in loop, then step (5) will also be
3632 applied, and an additional update in the inner-loop will be created:
3635 LOOP: vp1 = phi(vp0,vp2)
3637 inner: vp3 = phi(vp1,vp4)
3638 vp4 = vp3 + inner_step
3644 /* (2) Calculate the initial address of the aggregate-pointer, and set
3645 the aggregate-pointer to point to it before the loop. */
3647 /* Create: (&(base[init_val+offset]) in the loop preheader. */
3649 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
3655 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
3656 gcc_assert (!new_bb);
3659 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
3662 *initial_address = new_temp;
3664 /* Create: p = (aggr_type *) initial_base */
3665 if (TREE_CODE (new_temp) != SSA_NAME
3666 || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
3668 vec_stmt = gimple_build_assign (aggr_ptr,
3669 fold_convert (aggr_ptr_type, new_temp));
3670 aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
3671 /* Copy the points-to information if it exists. */
3672 if (DR_PTR_INFO (dr))
3673 duplicate_ssa_name_ptr_info (aggr_ptr_init, DR_PTR_INFO (dr));
3674 gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
3677 new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
3678 gcc_assert (!new_bb);
3681 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
3684 aggr_ptr_init = new_temp;
3686 /* (3) Handle the updating of the aggregate-pointer inside the loop.
3687 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
3688 inner-loop nested in LOOP (during outer-loop vectorization). */
3690 /* No update in loop is required. */
3691 if (only_init && (!loop_vinfo || at_loop == loop))
3692 aptr = aggr_ptr_init;
3695 /* The step of the aggregate pointer is the type size. */
3696 tree step = TYPE_SIZE_UNIT (aggr_type);
3697 /* One exception to the above is when the scalar step of the load in
3698 LOOP is zero. In this case the step here is also zero. */
3700 step = size_zero_node;
3702 step = fold_build1 (NEGATE_EXPR, TREE_TYPE (step), step);
3704 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
3706 create_iv (aggr_ptr_init,
3707 fold_convert (aggr_ptr_type, step),
3708 aggr_ptr, loop, &incr_gsi, insert_after,
3709 &indx_before_incr, &indx_after_incr);
3710 incr = gsi_stmt (incr_gsi);
3711 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
3713 /* Copy the points-to information if it exists. */
3714 if (DR_PTR_INFO (dr))
3716 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
3717 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
3722 aptr = indx_before_incr;
3725 if (!nested_in_vect_loop || only_init)
3729 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
3730 nested in LOOP, if it exists. */
3732 gcc_assert (nested_in_vect_loop);
3735 standard_iv_increment_position (containing_loop, &incr_gsi,
3737 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
3738 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
3740 incr = gsi_stmt (incr_gsi);
3741 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
3743 /* Copy the points-to information if it exists. */
3744 if (DR_PTR_INFO (dr))
3746 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
3747 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
3752 return indx_before_incr;
3759 /* Function bump_vector_ptr
3761 Increment a pointer (to a vector type) by vector-size. If requested,
3762 i.e. if PTR-INCR is given, then also connect the new increment stmt
3763 to the existing def-use update-chain of the pointer, by modifying
3764 the PTR_INCR as illustrated below:
3766 The pointer def-use update-chain before this function:
3767 DATAREF_PTR = phi (p_0, p_2)
3769 PTR_INCR: p_2 = DATAREF_PTR + step
3771 The pointer def-use update-chain after this function:
3772 DATAREF_PTR = phi (p_0, p_2)
3774 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
3776 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
3779 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
3781 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
3782 the loop. The increment amount across iterations is expected
3784 BSI - location where the new update stmt is to be placed.
3785 STMT - the original scalar memory-access stmt that is being vectorized.
3786 BUMP - optional. The offset by which to bump the pointer. If not given,
3787 the offset is assumed to be vector_size.
3789 Output: Return NEW_DATAREF_PTR as illustrated above.
3794 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
3795 gimple stmt, tree bump)
3797 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3798 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3799 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3800 tree update = TYPE_SIZE_UNIT (vectype);
3803 use_operand_p use_p;
3804 tree new_dataref_ptr;
3809 new_dataref_ptr = copy_ssa_name (dataref_ptr, NULL);
3810 incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, new_dataref_ptr,
3811 dataref_ptr, update);
3812 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
3814 /* Copy the points-to information if it exists. */
3815 if (DR_PTR_INFO (dr))
3817 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
3818 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
3822 return new_dataref_ptr;
3824 /* Update the vector-pointer's cross-iteration increment. */
3825 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
3827 tree use = USE_FROM_PTR (use_p);
3829 if (use == dataref_ptr)
3830 SET_USE (use_p, new_dataref_ptr);
3832 gcc_assert (tree_int_cst_compare (use, update) == 0);
3835 return new_dataref_ptr;
3839 /* Function vect_create_destination_var.
3841 Create a new temporary of type VECTYPE. */
3844 vect_create_destination_var (tree scalar_dest, tree vectype)
3850 enum vect_var_kind kind;
3852 kind = vectype ? vect_simple_var : vect_scalar_var;
3853 type = vectype ? vectype : TREE_TYPE (scalar_dest);
3855 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
3857 name = get_name (scalar_dest);
3859 asprintf (&new_name, "%s_%u", name, SSA_NAME_VERSION (scalar_dest));
3861 asprintf (&new_name, "_%u", SSA_NAME_VERSION (scalar_dest));
3862 vec_dest = vect_get_new_vect_var (type, kind, new_name);
3868 /* Function vect_grouped_store_supported.
3870 Returns TRUE if interleave high and interleave low permutations
3871 are supported, and FALSE otherwise. */
3874 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
3876 enum machine_mode mode = TYPE_MODE (vectype);
3878 /* vect_permute_store_chain requires the group size to be a power of two. */
3879 if (exact_log2 (count) == -1)
3881 if (dump_enabled_p ())
3882 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3883 "the size of the group of accesses"
3884 " is not a power of 2");
3888 /* Check that the permutation is supported. */
3889 if (VECTOR_MODE_P (mode))
3891 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3892 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3893 for (i = 0; i < nelt / 2; i++)
3896 sel[i * 2 + 1] = i + nelt;
3898 if (can_vec_perm_p (mode, false, sel))
3900 for (i = 0; i < nelt; i++)
3902 if (can_vec_perm_p (mode, false, sel))
3907 if (dump_enabled_p ())
3908 dump_printf (MSG_MISSED_OPTIMIZATION,
3909 "interleave op not supported by target.");
3914 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
3918 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
3920 return vect_lanes_optab_supported_p ("vec_store_lanes",
3921 vec_store_lanes_optab,
3926 /* Function vect_permute_store_chain.
3928 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
3929 a power of 2, generate interleave_high/low stmts to reorder the data
3930 correctly for the stores. Return the final references for stores in
3933 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
3934 The input is 4 vectors each containing 8 elements. We assign a number to
3935 each element, the input sequence is:
3937 1st vec: 0 1 2 3 4 5 6 7
3938 2nd vec: 8 9 10 11 12 13 14 15
3939 3rd vec: 16 17 18 19 20 21 22 23
3940 4th vec: 24 25 26 27 28 29 30 31
3942 The output sequence should be:
3944 1st vec: 0 8 16 24 1 9 17 25
3945 2nd vec: 2 10 18 26 3 11 19 27
3946 3rd vec: 4 12 20 28 5 13 21 29
3947 4th vec: 6 14 22 30 7 15 23 31
3949 i.e., we interleave the contents of the four vectors in their order.
3951 We use interleave_high/low instructions to create such output. The input of
3952 each interleave_high/low operation is two vectors:
3955 the even elements of the result vector are obtained left-to-right from the
3956 high/low elements of the first vector. The odd elements of the result are
3957 obtained left-to-right from the high/low elements of the second vector.
3958 The output of interleave_high will be: 0 4 1 5
3959 and of interleave_low: 2 6 3 7
3962 The permutation is done in log LENGTH stages. In each stage interleave_high
3963 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
3964 where the first argument is taken from the first half of DR_CHAIN and the
3965 second argument from its second half.
3968 I1: interleave_high (1st vec, 3rd vec)
3969 I2: interleave_low (1st vec, 3rd vec)
3970 I3: interleave_high (2nd vec, 4th vec)
3971 I4: interleave_low (2nd vec, 4th vec)
3973 The output for the first stage is:
3975 I1: 0 16 1 17 2 18 3 19
3976 I2: 4 20 5 21 6 22 7 23
3977 I3: 8 24 9 25 10 26 11 27
3978 I4: 12 28 13 29 14 30 15 31
3980 The output of the second stage, i.e. the final result is:
3982 I1: 0 8 16 24 1 9 17 25
3983 I2: 2 10 18 26 3 11 19 27
3984 I3: 4 12 20 28 5 13 21 29
3985 I4: 6 14 22 30 7 15 23 31. */
3988 vect_permute_store_chain (vec<tree> dr_chain,
3989 unsigned int length,
3991 gimple_stmt_iterator *gsi,
3992 vec<tree> *result_chain)
3994 tree vect1, vect2, high, low;
3996 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
3997 tree perm_mask_low, perm_mask_high;
3999 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4000 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4002 result_chain->quick_grow (length);
4003 memcpy (result_chain->address (), dr_chain.address (),
4004 length * sizeof (tree));
4006 for (i = 0, n = nelt / 2; i < n; i++)
4009 sel[i * 2 + 1] = i + nelt;
4011 perm_mask_high = vect_gen_perm_mask (vectype, sel);
4012 gcc_assert (perm_mask_high != NULL);
4014 for (i = 0; i < nelt; i++)
4016 perm_mask_low = vect_gen_perm_mask (vectype, sel);
4017 gcc_assert (perm_mask_low != NULL);
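/* Editorial note (not GCC code): concrete selector values for V8HI vectors
   (nelt == 8).  PERM_MASK_HIGH is {0, 8, 1, 9, 2, 10, 3, 11} and
   PERM_MASK_LOW is {4, 12, 5, 13, 6, 14, 7, 15}, so for two inputs
   A = {a0..a7} and B = {b0..b7}:

     VEC_PERM_EXPR <A, B, {0,8,1,9,2,10,3,11}>   == {a0,b0,a1,b1,a2,b2,a3,b3}
     VEC_PERM_EXPR <A, B, {4,12,5,13,6,14,7,15}> == {a4,b4,a5,b5,a6,b6,a7,b7}

   i.e. the two masks interleave the first and the second halves of the
   input vectors respectively, as used in the loop below.  */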
4019 for (i = 0, n = exact_log2 (length); i < n; i++)
4021 for (j = 0; j < length/2; j++)
4023 vect1 = dr_chain[j];
4024 vect2 = dr_chain[j+length/2];
4026 /* Create interleaving stmt:
4027 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> */
4028 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4030 = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
4031 vect1, vect2, perm_mask_high);
4032 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4033 (*result_chain)[2*j] = high;
4035 /* Create interleaving stmt:
4036 low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
4037 nelt*3/2+1, ...}> */
4038 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4040 = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
4041 vect1, vect2, perm_mask_low);
4042 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4043 (*result_chain)[2*j+1] = low;
4045 memcpy (dr_chain.address (), result_chain->address (),
4046 length * sizeof (tree));
4050 /* Function vect_setup_realignment
4052 This function is called when vectorizing an unaligned load using
4053 the dr_explicit_realign[_optimized] scheme.
4054 This function generates the following code at the loop prolog:
4057 x msq_init = *(floor(p)); # prolog load
4058 realignment_token = call target_builtin;
4060 x msq = phi (msq_init, ---)
4062 The stmts marked with x are generated only for the case of
4063 dr_explicit_realign_optimized.
4065 The code above sets up a new (vector) pointer, pointing to the first
4066 location accessed by STMT, and a "floor-aligned" load using that pointer.
4067 It also generates code to compute the "realignment-token" (if the relevant
4068 target hook was defined), and creates a phi-node at the loop-header bb
4069 whose arguments are the result of the prolog-load (created by this
4070 function) and the result of a load that takes place in the loop (to be
4071 created by the caller to this function).
4073 For the case of dr_explicit_realign_optimized:
4074 The caller to this function uses the phi-result (msq) to create the
4075 realignment code inside the loop, and sets up the missing phi argument,
4078 msq = phi (msq_init, lsq)
4079 lsq = *(floor(p')); # load in loop
4080 result = realign_load (msq, lsq, realignment_token);
4082 For the case of dr_explicit_realign:
4083 loop:
4084 msq = *(floor(p)); # load in loop
4085 p' = p + (VS-1);
4086 lsq = *(floor(p')); # load in loop
4087 result = realign_load (msq, lsq, realignment_token);
4089 Input:
4090 STMT - (scalar) load stmt to be vectorized. This load accesses
4091 a memory location that may be unaligned.
4092 BSI - place where new code is to be inserted.
4093 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4094 is used.
4096 Output:
4097 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4098 target hook, if defined.
4099 Return value - the result of the loop-header phi node. */
4101 tree
4102 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4103 tree *realignment_token,
4104 enum dr_alignment_support alignment_support_scheme,
4105 tree init_addr,
4106 struct loop **at_loop)
4107 {
4108 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4109 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4110 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4111 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4112 struct loop *loop = NULL;
4114 tree scalar_dest = gimple_assign_lhs (stmt);
4121 tree msq_init = NULL_TREE;
4124 tree msq = NULL_TREE;
4125 gimple_seq stmts = NULL;
4127 bool compute_in_loop = false;
4128 bool nested_in_vect_loop = false;
4129 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4130 struct loop *loop_for_initial_load = NULL;
4134 loop = LOOP_VINFO_LOOP (loop_vinfo);
4135 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4138 gcc_assert (alignment_support_scheme == dr_explicit_realign
4139 || alignment_support_scheme == dr_explicit_realign_optimized);
4141 /* We need to generate three things:
4142 1. the misalignment computation
4143 2. the extra vector load (for the optimized realignment scheme).
4144 3. the phi node for the two vectors from which the realignment is
4145 done (for the optimized realignment scheme). */
4147 /* 1. Determine where to generate the misalignment computation.
4149 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4150 calculation will be generated by this function, outside the loop (in the
4151 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4152 caller, inside the loop.
4154 Background: If the misalignment remains fixed throughout the iterations of
4155 the loop, then both realignment schemes are applicable, and also the
4156 misalignment computation can be done outside LOOP. This is because we are
4157 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4158 are a multiple of VS (the Vector Size), and therefore the misalignment in
4159 different vectorized LOOP iterations is always the same.
4160 The problem arises only if the memory access is in an inner-loop nested
4161 inside LOOP, which is now being vectorized using outer-loop vectorization.
4162 This is the only case when the misalignment of the memory access may not
4163 remain fixed throughout the iterations of the inner-loop (as explained in
4164 detail in vect_supportable_dr_alignment). In this case, not only is the
4165 optimized realignment scheme not applicable, but also the misalignment
4166 computation (and generation of the realignment token that is passed to
4167 REALIGN_LOAD) have to be done inside the loop.
4169 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4170 or not, which in turn determines if the misalignment is computed inside
4171 the inner-loop, or outside LOOP. */
4173 if (init_addr != NULL_TREE || !loop_vinfo)
4175 compute_in_loop = true;
4176 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4180 /* 2. Determine where to generate the extra vector load.
4182 For the optimized realignment scheme, instead of generating two vector
4183 loads in each iteration, we generate a single extra vector load in the
4184 preheader of the loop, and in each iteration reuse the result of the
4185 vector load from the previous iteration. In case the memory access is in
4186 an inner-loop nested inside LOOP, which is now being vectorized using
4187 outer-loop vectorization, we need to determine whether this initial vector
4188 load should be generated at the preheader of the inner-loop, or can be
4189 generated at the preheader of LOOP. If the memory access has no evolution
4190 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4191 to be generated inside LOOP (in the preheader of the inner-loop). */
4193 if (nested_in_vect_loop)
4195 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4196 bool invariant_in_outerloop =
4197 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4198 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4199 }
4200 else
4201 loop_for_initial_load = loop;
4203 *at_loop = loop_for_initial_load;
4205 if (loop_for_initial_load)
4206 pe = loop_preheader_edge (loop_for_initial_load);
4208 /* 3. For the case of the optimized realignment, create the first vector
4209 load at the loop preheader. */
4211 if (alignment_support_scheme == dr_explicit_realign_optimized)
4213 /* Create msq_init = *(floor(p1)) in the loop preheader */
4215 gcc_assert (!compute_in_loop);
4216 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4217 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4218 NULL_TREE, &init_addr, NULL, &inc,
4219 false, &inv_p);
4220 new_temp = copy_ssa_name (ptr, NULL);
4221 new_stmt = gimple_build_assign_with_ops
4222 (BIT_AND_EXPR, new_temp, ptr,
4223 build_int_cst (TREE_TYPE (ptr),
4224 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4225 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4226 gcc_assert (!new_bb);
4227 data_ref
4228 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4229 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4230 new_stmt = gimple_build_assign (vec_dest, data_ref);
4231 new_temp = make_ssa_name (vec_dest, new_stmt);
4232 gimple_assign_set_lhs (new_stmt, new_temp);
4235 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4236 gcc_assert (!new_bb);
4239 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4241 msq_init = gimple_assign_lhs (new_stmt);
4244 /* 4. Create realignment token using a target builtin, if available.
4245 It is done either inside the containing loop, or before LOOP (as
4246 determined above). */
4248 if (targetm.vectorize.builtin_mask_for_load)
4252 /* Compute INIT_ADDR - the initial address accessed by this memref. */
4255 /* Generate the INIT_ADDR computation outside LOOP. */
4256 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4260 pe = loop_preheader_edge (loop);
4261 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4262 gcc_assert (!new_bb);
4265 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4268 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4269 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4270 vec_dest =
4271 vect_create_destination_var (scalar_dest,
4272 gimple_call_return_type (new_stmt));
4273 new_temp = make_ssa_name (vec_dest, new_stmt);
4274 gimple_call_set_lhs (new_stmt, new_temp);
4276 if (compute_in_loop)
4277 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4278 else
4279 {
4280 /* Generate the misalignment computation outside LOOP. */
4281 pe = loop_preheader_edge (loop);
4282 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4283 gcc_assert (!new_bb);
4284 }
4286 *realignment_token = gimple_call_lhs (new_stmt);
4288 /* The result of the CALL_EXPR to this builtin is determined from
4289 the value of the parameter and no global variables are touched
4290 which makes the builtin a "const" function. Requiring the
4291 builtin to have the "const" attribute makes it unnecessary
4292 to call mark_call_clobbered. */
4293 gcc_assert (TREE_READONLY (builtin_decl));
4296 if (alignment_support_scheme == dr_explicit_realign)
4297 return msq;
4299 gcc_assert (!compute_in_loop);
4300 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4303 /* 5. Create msq = phi <msq_init, lsq> in loop */
4305 pe = loop_preheader_edge (containing_loop);
4306 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4307 msq = make_ssa_name (vec_dest, NULL);
4308 phi_stmt = create_phi_node (msq, containing_loop->header);
4309 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4311 return msq;
4312 }
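/* Illustrative sketch (not GCC code) of what the generated REALIGN_LOAD
   sequence computes.  The exact semantics of REALIGN_LOAD and of the
   realignment token are target-defined; the common case emulated below is
   "concatenate the two aligned loads and take VS elements starting at the
   misalignment".  VS, mem and the element-indexed misalignment are
   assumptions made for the example only.

     #include <stdio.h>

     #define VS 4   // vector size in elements

     // Emulate: result = realign_load (msq, lsq, misalign)
     static void realign_load (const int *msq, const int *lsq,
                               int misalign, int *result)
     {
       for (int i = 0; i < VS; i++)
         result[i] = misalign + i < VS ? msq[misalign + i]
                                       : lsq[misalign + i - VS];
     }

     int main (void)
     {
       int mem[3 * VS];
       for (int i = 0; i < 3 * VS; i++)
         mem[i] = i;

       int misalign = 1;          // the unaligned access starts at &mem[1]
       const int *msq = &mem[0];  // msq = *(floor(p));   aligned load
       const int *lsq = &mem[VS]; // lsq = *(floor(p'));  next aligned load
       int va[VS];

       realign_load (msq, lsq, misalign, va);
       for (int i = 0; i < VS; i++)
         printf ("%d ", va[i]);   // prints: 1 2 3 4
       printf ("\n");
       return 0;
     }  */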
4315 /* Function vect_grouped_load_supported.
4317 Returns TRUE if even and odd permutations are supported,
4318 and FALSE otherwise. */
4320 bool
4321 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4322 {
4323 enum machine_mode mode = TYPE_MODE (vectype);
4325 /* vect_permute_load_chain requires the group size to be a power of two. */
4326 if (exact_log2 (count) == -1)
4328 if (dump_enabled_p ())
4329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4330 "the size of the group of accesses"
4331 " is not a power of 2");
4335 /* Check that the permutation is supported. */
4336 if (VECTOR_MODE_P (mode))
4338 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4339 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4341 for (i = 0; i < nelt; i++)
4342 sel[i] = i * 2;
4343 if (can_vec_perm_p (mode, false, sel))
4344 {
4345 for (i = 0; i < nelt; i++)
4346 sel[i] = i * 2 + 1;
4347 if (can_vec_perm_p (mode, false, sel))
4348 return true;
4349 }
4350 }
4352 if (dump_enabled_p ())
4353 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4354 "extract even/odd not supported by target");
4358 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
4359 type VECTYPE. */
4361 bool
4362 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4363 {
4364 return vect_lanes_optab_supported_p ("vec_load_lanes",
4365 vec_load_lanes_optab,
4366 vectype, count);
4367 }
4369 /* Function vect_permute_load_chain.
4371 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
4372 a power of 2, generate extract_even/odd stmts to reorder the input data
4373 correctly. Return the final references for loads in RESULT_CHAIN.
4375 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4376 The input is 4 vectors each containing 8 elements. We assign a number to each
4377 element; the input sequence is:
4379 1st vec: 0 1 2 3 4 5 6 7
4380 2nd vec: 8 9 10 11 12 13 14 15
4381 3rd vec: 16 17 18 19 20 21 22 23
4382 4th vec: 24 25 26 27 28 29 30 31
4384 The output sequence should be:
4386 1st vec: 0 4 8 12 16 20 24 28
4387 2nd vec: 1 5 9 13 17 21 25 29
4388 3rd vec: 2 6 10 14 18 22 26 30
4389 4th vec: 3 7 11 15 19 23 27 31
4391 i.e., the first output vector should contain the first elements of each
4392 interleaving group, etc.
4394 We use extract_even/odd instructions to create such output. The input of
4395 each extract_even/odd operation is two vectors
4396 1st vec 2nd vec
4397 0 1 2 3 4 5 6 7
4399 and the output is the vector of extracted even/odd elements. The output of
4400 extract_even will be: 0 2 4 6
4401 and of extract_odd: 1 3 5 7
4404 The permutation is done in log LENGTH stages. In each stage extract_even
4405 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
4406 their order. In our example,
4408 E1: extract_even (1st vec, 2nd vec)
4409 E2: extract_odd (1st vec, 2nd vec)
4410 E3: extract_even (3rd vec, 4th vec)
4411 E4: extract_odd (3rd vec, 4th vec)
4413 The output for the first stage will be:
4415 E1: 0 2 4 6 8 10 12 14
4416 E2: 1 3 5 7 9 11 13 15
4417 E3: 16 18 20 22 24 26 28 30
4418 E4: 17 19 21 23 25 27 29 31
4420 In order to proceed and create the correct sequence for the next stage (or
4421 for the correct output, if the second stage is the last one, as in our
4422 example), we first put the output of extract_even operation and then the
4423 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
4424 The input for the second stage is:
4426 1st vec (E1): 0 2 4 6 8 10 12 14
4427 2nd vec (E3): 16 18 20 22 24 26 28 30
4428 3rd vec (E2): 1 3 5 7 9 11 13 15
4429 4th vec (E4): 17 19 21 23 25 27 29 31
4431 The output of the second stage:
4433 E1: 0 4 8 12 16 20 24 28
4434 E2: 2 6 10 14 18 22 26 30
4435 E3: 1 5 9 13 17 21 25 29
4436 E4: 3 7 11 15 19 23 27 31
4438 And RESULT_CHAIN after reordering:
4440 1st vec (E1): 0 4 8 12 16 20 24 28
4441 2nd vec (E3): 1 5 9 13 17 21 25 29
4442 3rd vec (E2): 2 6 10 14 18 22 26 30
4443 4th vec (E4): 3 7 11 15 19 23 27 31. */
4445 static void
4446 vect_permute_load_chain (vec<tree> dr_chain,
4447 unsigned int length,
4448 gimple stmt,
4449 gimple_stmt_iterator *gsi,
4450 vec<tree> *result_chain)
4451 {
4452 tree data_ref, first_vect, second_vect;
4453 tree perm_mask_even, perm_mask_odd;
4454 gimple perm_stmt;
4455 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4456 unsigned int i, j, log_length = exact_log2 (length);
4457 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
4458 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4460 result_chain->quick_grow (length);
4461 memcpy (result_chain->address (), dr_chain.address (),
4462 length * sizeof (tree));
4464 for (i = 0; i < nelt; ++i)
4465 sel[i] = i * 2;
4466 perm_mask_even = vect_gen_perm_mask (vectype, sel);
4467 gcc_assert (perm_mask_even != NULL);
4469 for (i = 0; i < nelt; ++i)
4470 sel[i] = i * 2 + 1;
4471 perm_mask_odd = vect_gen_perm_mask (vectype, sel);
4472 gcc_assert (perm_mask_odd != NULL);
4474 for (i = 0; i < log_length; i++)
4476 for (j = 0; j < length; j += 2)
4478 first_vect = dr_chain[j];
4479 second_vect = dr_chain[j+1];
4481 /* data_ref = permute_even (first_data_ref, second_data_ref); */
4482 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
4483 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4484 first_vect, second_vect,
4486 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4487 (*result_chain)[j/2] = data_ref;
4489 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
4490 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
4491 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4492 first_vect, second_vect,
4494 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4495 (*result_chain)[j/2+length/2] = data_ref;
4497 memcpy (dr_chain.address (), result_chain->address (),
4498 length * sizeof (tree));
4499 }
4500 }
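/* For reference, a standalone sketch (not GCC code) of the extract_even /
   extract_odd selectors built above, again with a hypothetical apply_perm
   helper standing in for VEC_PERM_EXPR and NELT == 8.  It reproduces the
   first-stage vectors E1 and E2 from the comment before
   vect_permute_load_chain.

     #include <stdio.h>

     #define NELT 8

     // Emulate VEC_PERM_EXPR <a, b, sel> on the concatenation {a, b}.
     static void apply_perm (const int *a, const int *b,
                             const unsigned char *sel, int *out)
     {
       for (int i = 0; i < NELT; i++)
         out[i] = sel[i] < NELT ? a[sel[i]] : b[sel[i] - NELT];
     }

     int main (void)
     {
       unsigned char sel_even[NELT], sel_odd[NELT];
       // Same selectors as built in vect_permute_load_chain.
       for (int i = 0; i < NELT; i++)
         {
           sel_even[i] = i * 2;
           sel_odd[i] = i * 2 + 1;
         }
       int v1[NELT] = { 0, 1, 2, 3, 4, 5, 6, 7 };       // 1st vec
       int v2[NELT] = { 8, 9, 10, 11, 12, 13, 14, 15 }; // 2nd vec
       int even[NELT], odd[NELT];
       apply_perm (v1, v2, sel_even, even); // E1: 0 2 4 6 8 10 12 14
       apply_perm (v1, v2, sel_odd, odd);   // E2: 1 3 5 7 9 11 13 15
       for (int i = 0; i < NELT; i++)
         printf ("%d ", even[i]);
       printf ("\n");
       for (int i = 0; i < NELT; i++)
         printf ("%d ", odd[i]);
       printf ("\n");
       return 0;
     }  */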
4503 /* Function vect_transform_grouped_load.
4505 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
4506 to perform their permutation and assign the resulting vectorized statements
4507 to the scalar statements.
4510 void
4511 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
4512 gimple_stmt_iterator *gsi)
4513 {
4514 vec<tree> result_chain = vNULL;
4516 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
4517 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
4518 vectors, that are ready for vector computation. */
4519 result_chain.create (size);
4520 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
4521 vect_record_grouped_load_vectors (stmt, result_chain);
4522 result_chain.release ();
4523 }
4525 /* RESULT_CHAIN contains the output of a group of grouped loads that were
4526 generated as part of the vectorization of STMT. Assign the statement
4527 for each vector to the associated scalar statement. */
4529 void
4530 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
4531 {
4532 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
4533 gimple next_stmt, new_stmt;
4534 unsigned int i, gap_count;
4535 tree tmp_data_ref;
4537 /* Put a permuted data-ref in the VECTORIZED_STMT field.
4538 Since we scan the chain starting from its first node, their order
4539 corresponds to the order of data-refs in RESULT_CHAIN. */
4540 next_stmt = first_stmt;
4541 gap_count = 1;
4542 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
4543 {
4544 if (!next_stmt)
4545 break;
4547 /* Skip the gaps. Loads created for the gaps will be removed by dead
4548 code elimination pass later. No need to check for the first stmt in
4549 the group, since it always exists.
4550 GROUP_GAP is the number of steps in elements from the previous
4551 access (if there is no gap GROUP_GAP is 1). We skip loads that
4552 correspond to the gaps. */
4553 if (next_stmt != first_stmt
4554 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
4555 {
4556 gap_count++;
4557 continue;
4558 }
4560 while (next_stmt)
4561 {
4562 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
4563 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
4564 copies, and we put the new vector statement in the first available
4565 RELATED_STMT field. */
4566 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
4567 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
4568 else
4569 {
4570 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
4571 {
4572 gimple prev_stmt =
4573 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
4574 gimple rel_stmt =
4575 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
4576 while (rel_stmt)
4577 {
4578 prev_stmt = rel_stmt;
4579 rel_stmt =
4580 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
4581 }
4583 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
4584 new_stmt;
4585 }
4586 }
4588 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
4589 gap_count = 1;
4590 /* If NEXT_STMT accesses the same DR as the previous statement,
4591 put the same TMP_DATA_REF as its vectorized statement; otherwise
4592 get the next data-ref from RESULT_CHAIN. */
4593 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
4594 break;
4595 }
4596 }
4597 }
4599 /* Function vect_can_force_dr_alignment_p.
4601 Returns whether the alignment of a DECL can be forced to be aligned
4602 on ALIGNMENT bit boundary. */
4604 bool
4605 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
4606 {
4607 if (TREE_CODE (decl) != VAR_DECL)
4608 return false;
4610 /* We cannot change alignment of common or external symbols as another
4611 translation unit may contain a definition with lower alignment.
4612 The rules of common symbol linking mean that the definition
4613 will override the common symbol. The same is true for constant
4614 pool entries which may be shared and are not properly merged
4615 by code. */
4616 if (DECL_EXTERNAL (decl)
4617 || DECL_COMMON (decl)
4618 || DECL_IN_CONSTANT_POOL (decl))
4619 return false;
4621 if (TREE_ASM_WRITTEN (decl))
4622 return false;
4624 /* Do not override the alignment as specified by the ABI when the used
4625 attribute is set. */
4626 if (DECL_PRESERVE_P (decl))
4627 return false;
4629 /* Do not override explicit alignment set by the user when an explicit
4630 section name is also used. This is a common idiom used by many
4631 software projects. */
4632 if (DECL_SECTION_NAME (decl) != NULL_TREE
4633 && !DECL_HAS_IMPLICIT_SECTION_NAME_P (decl))
4634 return false;
4636 if (TREE_STATIC (decl))
4637 return (alignment <= MAX_OFILE_ALIGNMENT);
4639 return (alignment <= MAX_STACK_ALIGNMENT);
4640 }
4643 /* Return whether the data reference DR is supported with respect to its
4644 alignment.
4645 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
4646 if it is aligned, i.e., check if it is possible to vectorize it with different
4647 alignment. */
4649 enum dr_alignment_support
4650 vect_supportable_dr_alignment (struct data_reference *dr,
4651 bool check_aligned_accesses)
4652 {
4653 gimple stmt = DR_STMT (dr);
4654 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4655 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4656 enum machine_mode mode = TYPE_MODE (vectype);
4657 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4658 struct loop *vect_loop = NULL;
4659 bool nested_in_vect_loop = false;
4661 if (aligned_access_p (dr) && !check_aligned_accesses)
4662 return dr_aligned;
4664 if (loop_vinfo)
4665 {
4666 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
4667 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
4668 }
4670 /* Possibly unaligned access. */
4672 /* We can choose between using the implicit realignment scheme (generating
4673 a misaligned_move stmt) and the explicit realignment scheme (generating
4674 aligned loads with a REALIGN_LOAD). There are two variants to the
4675 explicit realignment scheme: optimized, and unoptimized.
4676 We can optimize the realignment only if the step between consecutive
4677 vector loads is equal to the vector size. Since the vector memory
4678 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
4679 is guaranteed that the misalignment amount remains the same throughout the
4680 execution of the vectorized loop. Therefore, we can create the
4681 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
4682 at the loop preheader.
4684 However, in the case of outer-loop vectorization, when vectorizing a
4685 memory access in the inner-loop nested within the LOOP that is now being
4686 vectorized, while it is guaranteed that the misalignment of the
4687 vectorized memory access will remain the same in different outer-loop
4688 iterations, it is *not* guaranteed that it will remain the same throughout
4689 the execution of the inner-loop. This is because the inner-loop advances
4690 with the original scalar step (and not in steps of VS). If the inner-loop
4691 step happens to be a multiple of VS, then the misalignment remains fixed
4692 and we can use the optimized realignment scheme. For example:
4694 for (i=0; i<N; i++)
4695 for (j=0; j<M; j++)
4696 s += a[i+j];
4698 When vectorizing the i-loop in the above example, the step between
4699 consecutive vector loads is 1, and so the misalignment does not remain
4700 fixed across the execution of the inner-loop, and the realignment cannot
4701 be optimized (as illustrated in the following pseudo vectorized loop):
4703 for (i=0; i<N; i+=4)
4704 for (j=0; j<M; j++){
4705 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
4706 // when j is {0,1,2,3,4,5,6,7,...} respectively.
4707 // (assuming that we start from an aligned address).
4708 }
4710 We therefore have to use the unoptimized realignment scheme:
4712 for (i=0; i<N; i+=4)
4713 for (j=k; j<M; j+=4)
4714 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
4715 // that the misalignment of the initial address is
4716 // 0).
4718 The loop can then be vectorized as follows:
4720 for (k=0; k<4; k++){
4721 rt = get_realignment_token (&vp[k]);
4722 for (i=0; i<N; i+=4){
4723 v1 = vp[i+k];
4724 for (j=k; j<M; j+=4){
4725 v2 = vp[i+j+VS-1];
4726 va = REALIGN_LOAD <v1,v2,rt>;
4727 vs += va;
4728 v1 = v2;
4729 }
4730 }
4731 } */
4733 if (DR_IS_READ (dr))
4735 bool is_packed = false;
4736 tree type = (TREE_TYPE (DR_REF (dr)));
4738 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
4739 && (!targetm.vectorize.builtin_mask_for_load
4740 || targetm.vectorize.builtin_mask_for_load ()))
4742 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4743 if ((nested_in_vect_loop
4744 && (TREE_INT_CST_LOW (DR_STEP (dr))
4745 != GET_MODE_SIZE (TYPE_MODE (vectype))))
4746 || !loop_vinfo)
4747 return dr_explicit_realign;
4748 else
4749 return dr_explicit_realign_optimized;
4750 }
4751 if (!known_alignment_for_access_p (dr))
4752 is_packed = not_size_aligned (DR_REF (dr));
4754 if (targetm.vectorize.
4755 support_vector_misalignment (mode, type,
4756 DR_MISALIGNMENT (dr), is_packed))
4757 /* Can't software pipeline the loads, but can at least do them. */
4758 return dr_unaligned_supported;
4759 }
4760 else
4761 {
4762 bool is_packed = false;
4763 tree type = (TREE_TYPE (DR_REF (dr)));
4765 if (!known_alignment_for_access_p (dr))
4766 is_packed = not_size_aligned (DR_REF (dr));
4768 if (targetm.vectorize.
4769 support_vector_misalignment (mode, type,
4770 DR_MISALIGNMENT (dr), is_packed))
4771 return dr_unaligned_supported;
4772 }
4774 /* Unsupported. */
4775 return dr_unaligned_unsupported;
4776 }
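/* A small standalone sketch (not GCC code) of the misalignment reasoning in
   the comment above.  With a hypothetical element size of 4 bytes and a
   16-byte vector (VS = 4 elements), an inner-loop step of one element makes
   the misalignment of &vp[i+j] cycle through 0, 4, 8, 12, ... bytes, so only
   dr_explicit_realign applies; a step that is a multiple of VS keeps the
   misalignment fixed, which is what dr_explicit_realign_optimized relies on.

     #include <stdio.h>

     int main (void)
     {
       const int elem_size = 4;   // bytes per element (assumed)
       const int vs_bytes = 16;   // vector size in bytes (assumed)

       printf ("step of 1 element : ");
       for (int j = 0; j < 8; j++)      // misalignment varies per iteration
         printf ("%d ", (j * elem_size) % vs_bytes);
       printf ("\n");

       printf ("step of 4 elements: ");
       for (int j = 0; j < 32; j += 4)  // misalignment stays fixed at 0
         printf ("%d ", (j * elem_size) % vs_bytes);
       printf ("\n");
       return 0;
     }  */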