1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2016 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
33 #include "optabs-tree.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
41 #include "gimple-iterator.h"
42 #include "gimplify-me.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "tree-ssa-loop-manip.h"
45 #include "tree-ssa-loop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
53 /* Return true if load- or store-lanes optab OPTAB is implemented for
54 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
57 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
58 tree vectype, unsigned HOST_WIDE_INT count)
60 machine_mode mode, array_mode;
63 mode = TYPE_MODE (vectype);
64 limit_p = !targetm.array_mode_supported_p (mode, count);
65 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
68 if (array_mode == BLKmode)
70 if (dump_enabled_p ())
71 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
72 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
73 GET_MODE_NAME (mode), count);
77 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
79 if (dump_enabled_p ())
80 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
81 "cannot use %s<%s><%s>\n", name,
82 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_NOTE, vect_location,
88 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
89 GET_MODE_NAME (mode));
95 /* Return the smallest scalar part of STMT.
96 This is used to determine the vectype of the stmt. We generally set the
97 vectype according to the type of the result (lhs). For stmts whose
98 result-type is different than the type of the arguments (e.g., demotion,
99 promotion), vectype will be reset appropriately (later). Note that we have
100 to visit the smallest datatype in this function, because that determines the
101 VF. If the smallest datatype in the loop is present only as the rhs of a
102 promotion operation - we'd miss it.
103 Such a case, where a variable of this datatype does not appear in the lhs
104 anywhere in the loop, can only occur if it's an invariant: e.g.:
105 'int_x = (int) short_inv', which we'd expect to have been optimized away by
106 invariant motion. However, we cannot rely on invariant motion to always
107 take invariants out of the loop, and so in the case of promotion we also
108 have to check the rhs.
109 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
113 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
114 HOST_WIDE_INT *rhs_size_unit)
116 tree scalar_type = gimple_expr_type (stmt);
117 HOST_WIDE_INT lhs, rhs;
119 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
121 if (is_gimple_assign (stmt)
122 && (gimple_assign_cast_p (stmt)
123 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
124 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
125 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
127 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
129 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
131 scalar_type = rhs_type;
134 *lhs_size_unit = lhs;
135 *rhs_size_unit = rhs;
140 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
141 tested at run-time. Return TRUE if DDR was successfully inserted.
142 Return false if versioning is not supported. */
145 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
147 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
149 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
152 if (dump_enabled_p ())
154 dump_printf_loc (MSG_NOTE, vect_location,
155 "mark for run-time aliasing test between ");
156 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
157 dump_printf (MSG_NOTE, " and ");
158 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
159 dump_printf (MSG_NOTE, "\n");
162 if (optimize_loop_nest_for_size_p (loop))
164 if (dump_enabled_p ())
165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
166 "versioning not supported when optimizing"
171 /* FORNOW: We don't support versioning with outer-loop vectorization. */
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
176 "versioning not yet supported for outer-loops.\n");
180 /* FORNOW: We don't support creating runtime alias tests for non-constant
182 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
183 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
185 if (dump_enabled_p ())
186 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
187 "versioning not yet supported for non-constant "
192 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
197 /* Function vect_analyze_data_ref_dependence.
199 Return TRUE if there (might) exist a dependence between a memory-reference
200 DRA and a memory-reference DRB. When versioning for alias may check a
201 dependence at run-time, return FALSE. Adjust *MAX_VF according to
202 the data dependence. */
205 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
206 loop_vec_info loop_vinfo, int *max_vf)
209 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
210 struct data_reference *dra = DDR_A (ddr);
211 struct data_reference *drb = DDR_B (ddr);
212 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
213 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
214 lambda_vector dist_v;
215 unsigned int loop_depth;
217 /* In loop analysis all data references should be vectorizable. */
218 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
219 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
222 /* Independent data accesses. */
223 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
227 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
230 /* Even if we have an anti-dependence then, as the vectorized loop covers at
231 least two scalar iterations, there is always also a true dependence.
232 As the vectorizer does not re-order loads and stores we can ignore
233 the anti-dependence if TBAA can disambiguate both DRs similar to the
234 case with known negative distance anti-dependences (positive
235 distance anti-dependences would violate TBAA constraints). */
236 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
237 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
238 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
239 get_alias_set (DR_REF (drb))))
242 /* Unknown data dependence. */
243 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
245 /* If user asserted safelen consecutive iterations can be
246 executed concurrently, assume independence. */
247 if (loop->safelen >= 2)
249 if (loop->safelen < *max_vf)
250 *max_vf = loop->safelen;
251 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
255 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
256 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
258 if (dump_enabled_p ())
260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
261 "versioning for alias not supported for: "
262 "can't determine dependence between ");
263 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
265 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
266 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
268 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
273 if (dump_enabled_p ())
275 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
276 "versioning for alias required: "
277 "can't determine dependence between ");
278 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
280 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
281 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
283 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
286 /* Add to list of ddrs that need to be tested at run-time. */
287 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
290 /* Known data dependence. */
291 if (DDR_NUM_DIST_VECTS (ddr) == 0)
293 /* If user asserted safelen consecutive iterations can be
294 executed concurrently, assume independence. */
295 if (loop->safelen >= 2)
297 if (loop->safelen < *max_vf)
298 *max_vf = loop->safelen;
299 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
303 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
304 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
306 if (dump_enabled_p ())
308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
309 "versioning for alias not supported for: "
310 "bad dist vector for ");
311 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
313 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
314 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
316 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
321 if (dump_enabled_p ())
323 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
324 "versioning for alias required: "
325 "bad dist vector for ");
326 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
327 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
328 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
329 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
331 /* Add to list of ddrs that need to be tested at run-time. */
332 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
335 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
336 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
338 int dist = dist_v[loop_depth];
340 if (dump_enabled_p ())
341 dump_printf_loc (MSG_NOTE, vect_location,
342 "dependence distance = %d.\n", dist);
346 if (dump_enabled_p ())
348 dump_printf_loc (MSG_NOTE, vect_location,
349 "dependence distance == 0 between ");
350 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
351 dump_printf (MSG_NOTE, " and ");
352 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
353 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
356 /* When we perform grouped accesses and perform implicit CSE
357 by detecting equal accesses and doing disambiguation with
358 runtime alias tests like for
366 where we will end up loading { a[i], a[i+1] } once, make
367 sure that inserting group loads before the first load and
368 stores after the last store will do the right thing.
369 Similar for groups like
373 where loads from the group interleave with the store. */
374 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
375 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
377 gimple *earlier_stmt;
378 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
380 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
382 if (dump_enabled_p ())
383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
384 "READ_WRITE dependence in interleaving."
393 if (dist > 0 && DDR_REVERSED_P (ddr))
395 /* If DDR_REVERSED_P the order of the data-refs in DDR was
396 reversed (to make distance vector positive), and the actual
397 distance is negative. */
398 if (dump_enabled_p ())
399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
400 "dependence distance negative.\n");
401 /* Record a negative dependence distance to later limit the
402 amount of stmt copying / unrolling we can perform.
403 Only need to handle read-after-write dependence. */
405 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
406 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
407 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
412 && abs (dist) < *max_vf)
414 /* The dependence distance requires reduction of the maximal
415 vectorization factor. */
416 *max_vf = abs (dist);
417 if (dump_enabled_p ())
418 dump_printf_loc (MSG_NOTE, vect_location,
419 "adjusting maximal vectorization factor to %i\n",
423 if (abs (dist) >= *max_vf)
425 /* Dependence distance does not create dependence, as far as
426 vectorization is concerned, in this case. */
427 if (dump_enabled_p ())
428 dump_printf_loc (MSG_NOTE, vect_location,
429 "dependence distance >= VF.\n");
433 if (dump_enabled_p ())
435 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
436 "not vectorized, possible dependence "
437 "between data-refs ");
438 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
439 dump_printf (MSG_NOTE, " and ");
440 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
441 dump_printf (MSG_NOTE, "\n");
450 /* Function vect_analyze_data_ref_dependences.
452 Examine all the data references in the loop, and make sure there do not
453 exist any data dependences between them. Set *MAX_VF according to
454 the maximum vectorization factor the data dependences allow. */
457 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
460 struct data_dependence_relation *ddr;
462 if (dump_enabled_p ())
463 dump_printf_loc (MSG_NOTE, vect_location,
464 "=== vect_analyze_data_ref_dependences ===\n");
466 LOOP_VINFO_DDRS (loop_vinfo)
467 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
468 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
469 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
470 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
471 &LOOP_VINFO_DDRS (loop_vinfo),
472 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
475 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
476 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
483 /* Function vect_slp_analyze_data_ref_dependence.
485 Return TRUE if there (might) exist a dependence between a memory-reference
486 DRA and a memory-reference DRB. When versioning for alias may check a
487 dependence at run-time, return FALSE. Adjust *MAX_VF according to
488 the data dependence. */
491 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
493 struct data_reference *dra = DDR_A (ddr);
494 struct data_reference *drb = DDR_B (ddr);
496 /* We need to check dependences of statements marked as unvectorizable
497 as well, they still can prohibit vectorization. */
499 /* Independent data accesses. */
500 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
506 /* Read-read is OK. */
507 if (DR_IS_READ (dra) && DR_IS_READ (drb))
510 /* If dra and drb are part of the same interleaving chain consider
512 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
513 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
514 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
517 /* Unknown data dependence. */
518 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
520 if (dump_enabled_p ())
522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
523 "can't determine dependence between ");
524 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
525 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
526 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
527 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
530 else if (dump_enabled_p ())
532 dump_printf_loc (MSG_NOTE, vect_location,
533 "determined dependence between ");
534 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
535 dump_printf (MSG_NOTE, " and ");
536 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
537 dump_printf (MSG_NOTE, "\n");
544 /* Analyze dependences involved in the transform of SLP NODE. STORES
545 contain the vector of scalar stores of this instance if we are
546 disambiguating the loads. */
549 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
550 vec<gimple *> stores, gimple *last_store)
552 /* This walks over all stmts involved in the SLP load/store done
553 in NODE verifying we can sink them up to the last stmt in the
555 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
556 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
558 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
559 if (access == last_access)
561 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
562 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
563 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
565 gimple *stmt = gsi_stmt (gsi);
566 if (! gimple_vuse (stmt)
567 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
570 /* If we couldn't record a (single) data reference for this
571 stmt we have to give up. */
572 /* ??? Here and below if dependence analysis fails we can resort
573 to the alias oracle which can handle more kinds of stmts. */
574 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
578 /* If we run into a store of this same instance (we've just
579 marked those) then delay dependence checking until we run
580 into the last store because this is where it will have
581 been sunk to (and we verify if we can do that as well). */
582 if (gimple_visited_p (stmt))
584 if (stmt != last_store)
588 FOR_EACH_VEC_ELT (stores, i, store)
590 data_reference *store_dr
591 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
592 ddr_p ddr = initialize_data_dependence_relation
593 (dr_a, store_dr, vNULL);
594 if (vect_slp_analyze_data_ref_dependence (ddr))
596 free_dependence_relation (ddr);
599 free_dependence_relation (ddr);
603 ddr_p ddr = initialize_data_dependence_relation (dr_a, dr_b, vNULL);
604 if (vect_slp_analyze_data_ref_dependence (ddr))
606 free_dependence_relation (ddr);
609 free_dependence_relation (ddr);
616 /* Function vect_analyze_data_ref_dependences.
618 Examine all the data references in the basic-block, and make sure there
619 do not exist any data dependences between them. Set *MAX_VF according to
620 the maximum vectorization factor the data dependences allow. */
623 vect_slp_analyze_instance_dependence (slp_instance instance)
625 if (dump_enabled_p ())
626 dump_printf_loc (MSG_NOTE, vect_location,
627 "=== vect_slp_analyze_instance_dependence ===\n");
629 /* The stores of this instance are at the root of the SLP tree. */
630 slp_tree store = SLP_INSTANCE_TREE (instance);
631 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
634 /* Verify we can sink stores to the vectorized stmt insert location. */
635 gimple *last_store = NULL;
638 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
641 /* Mark stores in this instance and remember the last one. */
642 last_store = vect_find_last_scalar_stmt_in_slp (store);
643 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
644 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
649 /* Verify we can sink loads to the vectorized stmt insert location,
650 special-casing stores of this instance. */
653 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
654 if (! vect_slp_analyze_node_dependences (instance, load,
656 ? SLP_TREE_SCALAR_STMTS (store)
657 : vNULL, last_store))
663 /* Unset the visited flag. */
665 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
666 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
671 /* Function vect_compute_data_ref_alignment
673 Compute the misalignment of the data reference DR.
676 1. If during the misalignment computation it is found that the data reference
677 cannot be vectorized then false is returned.
678 2. DR_MISALIGNMENT (DR) is defined.
680 FOR NOW: No analysis is actually performed. Misalignment is calculated
681 only for trivial cases. TODO. */
684 vect_compute_data_ref_alignment (struct data_reference *dr)
686 gimple *stmt = DR_STMT (dr);
687 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
688 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
689 struct loop *loop = NULL;
690 tree ref = DR_REF (dr);
692 tree base, base_addr;
693 tree misalign = NULL_TREE;
696 unsigned HOST_WIDE_INT alignment;
698 if (dump_enabled_p ())
699 dump_printf_loc (MSG_NOTE, vect_location,
700 "vect_compute_data_ref_alignment:\n");
703 loop = LOOP_VINFO_LOOP (loop_vinfo);
705 /* Initialize misalignment to unknown. */
706 SET_DR_MISALIGNMENT (dr, -1);
708 if (tree_fits_shwi_p (DR_STEP (dr)))
709 misalign = DR_INIT (dr);
710 aligned_to = DR_ALIGNED_TO (dr);
711 base_addr = DR_BASE_ADDRESS (dr);
712 vectype = STMT_VINFO_VECTYPE (stmt_info);
714 /* In case the dataref is in an inner-loop of the loop that is being
715 vectorized (LOOP), we use the base and misalignment information
716 relative to the outer-loop (LOOP). This is ok only if the misalignment
717 stays the same throughout the execution of the inner-loop, which is why
718 we have to check that the stride of the dataref in the inner-loop evenly
719 divides by the vector size. */
720 if (loop && nested_in_vect_loop_p (loop, stmt))
722 tree step = DR_STEP (dr);
724 if (tree_fits_shwi_p (step)
725 && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
727 if (dump_enabled_p ())
728 dump_printf_loc (MSG_NOTE, vect_location,
729 "inner step divides the vector-size.\n");
730 misalign = STMT_VINFO_DR_INIT (stmt_info);
731 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
732 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
736 if (dump_enabled_p ())
737 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
738 "inner step doesn't divide the vector-size.\n");
739 misalign = NULL_TREE;
743 /* Similarly we can only use base and misalignment information relative to
744 an innermost loop if the misalignment stays the same throughout the
745 execution of the loop. As above, this is the case if the stride of
746 the dataref evenly divides by the vector size. */
749 tree step = DR_STEP (dr);
750 unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
752 if (tree_fits_shwi_p (step)
753 && ((tree_to_shwi (step) * vf)
754 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
756 if (dump_enabled_p ())
757 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
758 "step doesn't divide the vector-size.\n");
759 misalign = NULL_TREE;
763 /* To look at alignment of the base we have to preserve an inner MEM_REF
764 as that carries alignment information of the actual access. */
766 while (handled_component_p (base))
767 base = TREE_OPERAND (base, 0);
768 unsigned int base_alignment;
769 unsigned HOST_WIDE_INT base_bitpos;
770 get_object_alignment_1 (base, &base_alignment, &base_bitpos);
771 /* As data-ref analysis strips the MEM_REF down to its base operand
772 to form DR_BASE_ADDRESS and adds the offset to DR_INIT we have to
773 adjust things to make base_alignment valid as the alignment of
775 if (TREE_CODE (base) == MEM_REF)
777 base_bitpos -= mem_ref_offset (base).to_short_addr () * BITS_PER_UNIT;
778 base_bitpos &= (base_alignment - 1);
780 if (base_bitpos != 0)
781 base_alignment = base_bitpos & -base_bitpos;
782 /* Also look at the alignment of the base address DR analysis
784 unsigned int base_addr_alignment = get_pointer_alignment (base_addr);
785 if (base_addr_alignment > base_alignment)
786 base_alignment = base_addr_alignment;
788 if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
789 DR_VECT_AUX (dr)->base_element_aligned = true;
791 alignment = TYPE_ALIGN_UNIT (vectype);
793 if ((compare_tree_int (aligned_to, alignment) < 0)
796 if (dump_enabled_p ())
798 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
799 "Unknown alignment for access: ");
800 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
801 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
806 if (base_alignment < TYPE_ALIGN (vectype))
809 if (TREE_CODE (base) == ADDR_EXPR)
810 base = TREE_OPERAND (base, 0);
811 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
813 if (dump_enabled_p ())
815 dump_printf_loc (MSG_NOTE, vect_location,
816 "can't force alignment of ref: ");
817 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
818 dump_printf (MSG_NOTE, "\n");
823 /* Force the alignment of the decl.
824 NOTE: This is the only change to the code we make during
825 the analysis phase, before deciding to vectorize the loop. */
826 if (dump_enabled_p ())
828 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
829 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
830 dump_printf (MSG_NOTE, "\n");
833 DR_VECT_AUX (dr)->base_decl = base;
834 DR_VECT_AUX (dr)->base_misaligned = true;
835 DR_VECT_AUX (dr)->base_element_aligned = true;
838 if (loop && nested_in_vect_loop_p (loop, stmt))
839 step = STMT_VINFO_DR_STEP (stmt_info);
842 /* If this is a backward running DR then first access in the larger
843 vectype actually is N-1 elements before the address in the DR.
844 Adjust misalign accordingly. */
845 if (tree_int_cst_sgn (step) < 0)
847 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
848 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
849 otherwise we wouldn't be here. */
850 offset = fold_build2 (MULT_EXPR, ssizetype, offset, step);
851 /* PLUS because STEP was negative. */
852 misalign = size_binop (PLUS_EXPR, misalign, offset);
855 SET_DR_MISALIGNMENT (dr,
856 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
858 if (dump_enabled_p ())
860 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
861 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
862 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
863 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
870 /* Function vect_update_misalignment_for_peel
872 DR - the data reference whose misalignment is to be adjusted.
873 DR_PEEL - the data reference whose misalignment is being made
874 zero in the vector loop by the peel.
875 NPEEL - the number of iterations in the peel loop if the misalignment
876 of DR_PEEL is known at compile time. */
879 vect_update_misalignment_for_peel (struct data_reference *dr,
880 struct data_reference *dr_peel, int npeel)
883 vec<dr_p> same_align_drs;
884 struct data_reference *current_dr;
885 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
886 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
887 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
888 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
890 /* For interleaved data accesses the step in the loop must be multiplied by
891 the size of the interleaving group. */
892 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
893 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
894 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
895 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
897 /* It can be assumed that the data refs with the same alignment as dr_peel
898 are aligned in the vector loop. */
900 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
901 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
903 if (current_dr != dr)
905 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
906 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
907 SET_DR_MISALIGNMENT (dr, 0);
911 if (known_alignment_for_access_p (dr)
912 && known_alignment_for_access_p (dr_peel))
914 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
915 int misal = DR_MISALIGNMENT (dr);
916 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
917 misal += negative ? -npeel * dr_size : npeel * dr_size;
918 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
919 SET_DR_MISALIGNMENT (dr, misal);
923 if (dump_enabled_p ())
924 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
925 SET_DR_MISALIGNMENT (dr, -1);
929 /* Function verify_data_ref_alignment
931 Return TRUE if DR can be handled with respect to alignment. */
934 verify_data_ref_alignment (data_reference_p dr)
936 enum dr_alignment_support supportable_dr_alignment
937 = vect_supportable_dr_alignment (dr, false);
938 if (!supportable_dr_alignment)
940 if (dump_enabled_p ())
943 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
944 "not vectorized: unsupported unaligned load.");
946 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
947 "not vectorized: unsupported unaligned "
950 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
952 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
957 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
958 dump_printf_loc (MSG_NOTE, vect_location,
959 "Vectorizing an unaligned access.\n");
964 /* Function vect_verify_datarefs_alignment
966 Return TRUE if all data references in the loop can be
967 handled with respect to alignment. */
970 vect_verify_datarefs_alignment (loop_vec_info vinfo)
972 vec<data_reference_p> datarefs = vinfo->datarefs;
973 struct data_reference *dr;
976 FOR_EACH_VEC_ELT (datarefs, i, dr)
978 gimple *stmt = DR_STMT (dr);
979 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
981 if (!STMT_VINFO_RELEVANT_P (stmt_info))
984 /* For interleaving, only the alignment of the first access matters. */
985 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
986 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
989 /* Strided accesses perform only component accesses, alignment is
990 irrelevant for them. */
991 if (STMT_VINFO_STRIDED_P (stmt_info)
992 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
995 if (! verify_data_ref_alignment (dr))
1002 /* Given an memory reference EXP return whether its alignment is less
1006 not_size_aligned (tree exp)
1008 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1011 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1012 > get_object_alignment (exp));
1015 /* Function vector_alignment_reachable_p
1017 Return true if vector alignment for DR is reachable by peeling
1018 a few loop iterations. Return false otherwise. */
1021 vector_alignment_reachable_p (struct data_reference *dr)
1023 gimple *stmt = DR_STMT (dr);
1024 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1025 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1027 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1029 /* For interleaved access we peel only if number of iterations in
1030 the prolog loop ({VF - misalignment}), is a multiple of the
1031 number of the interleaved accesses. */
1032 int elem_size, mis_in_elements;
1033 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1035 /* FORNOW: handle only known alignment. */
1036 if (!known_alignment_for_access_p (dr))
1039 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1040 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1042 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1046 /* If misalignment is known at the compile time then allow peeling
1047 only if natural alignment is reachable through peeling. */
1048 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1050 HOST_WIDE_INT elmsize =
1051 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1052 if (dump_enabled_p ())
1054 dump_printf_loc (MSG_NOTE, vect_location,
1055 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1056 dump_printf (MSG_NOTE,
1057 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1059 if (DR_MISALIGNMENT (dr) % elmsize)
1061 if (dump_enabled_p ())
1062 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1063 "data size does not divide the misalignment.\n");
1068 if (!known_alignment_for_access_p (dr))
1070 tree type = TREE_TYPE (DR_REF (dr));
1071 bool is_packed = not_size_aligned (DR_REF (dr));
1072 if (dump_enabled_p ())
1073 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1074 "Unknown misalignment, is_packed = %d\n",is_packed);
1075 if ((TYPE_USER_ALIGN (type) && !is_packed)
1076 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1086 /* Calculate the cost of the memory access represented by DR. */
1089 vect_get_data_access_cost (struct data_reference *dr,
1090 unsigned int *inside_cost,
1091 unsigned int *outside_cost,
1092 stmt_vector_for_cost *body_cost_vec)
1094 gimple *stmt = DR_STMT (dr);
1095 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1096 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1097 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1098 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1099 int ncopies = vf / nunits;
1101 if (DR_IS_READ (dr))
1102 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1103 NULL, body_cost_vec, false);
1105 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1107 if (dump_enabled_p ())
1108 dump_printf_loc (MSG_NOTE, vect_location,
1109 "vect_get_data_access_cost: inside_cost = %d, "
1110 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1114 typedef struct _vect_peel_info
1117 struct data_reference *dr;
1121 typedef struct _vect_peel_extended_info
1123 struct _vect_peel_info peel_info;
1124 unsigned int inside_cost;
1125 unsigned int outside_cost;
1126 stmt_vector_for_cost body_cost_vec;
1127 } *vect_peel_extended_info;
1130 /* Peeling hashtable helpers. */
1132 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1134 static inline hashval_t hash (const _vect_peel_info *);
1135 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1139 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1141 return (hashval_t) peel_info->npeel;
1145 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1147 return (a->npeel == b->npeel);
1151 /* Insert DR into peeling hash table with NPEEL as key. */
1154 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1155 loop_vec_info loop_vinfo, struct data_reference *dr,
1158 struct _vect_peel_info elem, *slot;
1159 _vect_peel_info **new_slot;
1160 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1163 slot = peeling_htab->find (&elem);
1168 slot = XNEW (struct _vect_peel_info);
1169 slot->npeel = npeel;
1172 new_slot = peeling_htab->find_slot (slot, INSERT);
1176 if (!supportable_dr_alignment
1177 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1178 slot->count += VECT_MAX_COST;
1182 /* Traverse peeling hash table to find peeling option that aligns maximum
1183 number of data accesses. */
1186 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1187 _vect_peel_extended_info *max)
1189 vect_peel_info elem = *slot;
1191 if (elem->count > max->peel_info.count
1192 || (elem->count == max->peel_info.count
1193 && max->peel_info.npeel > elem->npeel))
1195 max->peel_info.npeel = elem->npeel;
1196 max->peel_info.count = elem->count;
1197 max->peel_info.dr = elem->dr;
1204 /* Traverse peeling hash table and calculate cost for each peeling option.
1205 Find the one with the lowest cost. */
1208 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1209 _vect_peel_extended_info *min)
1211 vect_peel_info elem = *slot;
1212 int save_misalignment, dummy;
1213 unsigned int inside_cost = 0, outside_cost = 0, i;
1214 gimple *stmt = DR_STMT (elem->dr);
1215 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1216 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1217 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1218 struct data_reference *dr;
1219 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1221 prologue_cost_vec.create (2);
1222 body_cost_vec.create (2);
1223 epilogue_cost_vec.create (2);
1225 FOR_EACH_VEC_ELT (datarefs, i, dr)
1227 stmt = DR_STMT (dr);
1228 stmt_info = vinfo_for_stmt (stmt);
1229 /* For interleaving, only the alignment of the first access
1231 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1232 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1235 /* Strided accesses perform only component accesses, alignment is
1236 irrelevant for them. */
1237 if (STMT_VINFO_STRIDED_P (stmt_info)
1238 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1241 save_misalignment = DR_MISALIGNMENT (dr);
1242 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1243 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1245 SET_DR_MISALIGNMENT (dr, save_misalignment);
1248 outside_cost += vect_get_known_peeling_cost
1249 (loop_vinfo, elem->npeel, &dummy,
1250 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1251 &prologue_cost_vec, &epilogue_cost_vec);
1253 /* Prologue and epilogue costs are added to the target model later.
1254 These costs depend only on the scalar iteration cost, the
1255 number of peeling iterations finally chosen, and the number of
1256 misaligned statements. So discard the information found here. */
1257 prologue_cost_vec.release ();
1258 epilogue_cost_vec.release ();
1260 if (inside_cost < min->inside_cost
1261 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1263 min->inside_cost = inside_cost;
1264 min->outside_cost = outside_cost;
1265 min->body_cost_vec.release ();
1266 min->body_cost_vec = body_cost_vec;
1267 min->peel_info.dr = elem->dr;
1268 min->peel_info.npeel = elem->npeel;
1271 body_cost_vec.release ();
1277 /* Choose best peeling option by traversing peeling hash table and either
1278 choosing an option with the lowest cost (if cost model is enabled) or the
1279 option that aligns as many accesses as possible. */
1281 static struct data_reference *
1282 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1283 loop_vec_info loop_vinfo,
1284 unsigned int *npeel,
1285 stmt_vector_for_cost *body_cost_vec)
1287 struct _vect_peel_extended_info res;
1289 res.peel_info.dr = NULL;
1290 res.body_cost_vec = stmt_vector_for_cost ();
1292 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1294 res.inside_cost = INT_MAX;
1295 res.outside_cost = INT_MAX;
1296 peeling_htab->traverse <_vect_peel_extended_info *,
1297 vect_peeling_hash_get_lowest_cost> (&res);
1301 res.peel_info.count = 0;
1302 peeling_htab->traverse <_vect_peel_extended_info *,
1303 vect_peeling_hash_get_most_frequent> (&res);
1306 *npeel = res.peel_info.npeel;
1307 *body_cost_vec = res.body_cost_vec;
1308 return res.peel_info.dr;
1312 /* Function vect_enhance_data_refs_alignment
1314 This pass will use loop versioning and loop peeling in order to enhance
1315 the alignment of data references in the loop.
1317 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1318 original loop is to be vectorized. Any other loops that are created by
1319 the transformations performed in this pass - are not supposed to be
1320 vectorized. This restriction will be relaxed.
1322 This pass will require a cost model to guide it whether to apply peeling
1323 or versioning or a combination of the two. For example, the scheme that
1324 intel uses when given a loop with several memory accesses, is as follows:
1325 choose one memory access ('p') which alignment you want to force by doing
1326 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1327 other accesses are not necessarily aligned, or (2) use loop versioning to
1328 generate one loop in which all accesses are aligned, and another loop in
1329 which only 'p' is necessarily aligned.
1331 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1332 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1333 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1335 Devising a cost model is the most critical aspect of this work. It will
1336 guide us on which access to peel for, whether to use loop versioning, how
1337 many versions to create, etc. The cost model will probably consist of
1338 generic considerations as well as target specific considerations (on
1339 powerpc for example, misaligned stores are more painful than misaligned
1342 Here are the general steps involved in alignment enhancements:
1344 -- original loop, before alignment analysis:
1345 for (i=0; i<N; i++){
1346 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1347 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1350 -- After vect_compute_data_refs_alignment:
1351 for (i=0; i<N; i++){
1352 x = q[i]; # DR_MISALIGNMENT(q) = 3
1353 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1356 -- Possibility 1: we do loop versioning:
1358 for (i=0; i<N; i++){ # loop 1A
1359 x = q[i]; # DR_MISALIGNMENT(q) = 3
1360 p[i] = y; # DR_MISALIGNMENT(p) = 0
1364 for (i=0; i<N; i++){ # loop 1B
1365 x = q[i]; # DR_MISALIGNMENT(q) = 3
1366 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1370 -- Possibility 2: we do loop peeling:
1371 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1375 for (i = 3; i < N; i++){ # loop 2A
1376 x = q[i]; # DR_MISALIGNMENT(q) = 0
1377 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1380 -- Possibility 3: combination of loop peeling and versioning:
1381 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1386 for (i = 3; i<N; i++){ # loop 3A
1387 x = q[i]; # DR_MISALIGNMENT(q) = 0
1388 p[i] = y; # DR_MISALIGNMENT(p) = 0
1392 for (i = 3; i<N; i++){ # loop 3B
1393 x = q[i]; # DR_MISALIGNMENT(q) = 0
1394 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1398 These loops are later passed to loop_transform to be vectorized. The
1399 vectorizer will use the alignment information to guide the transformation
1400 (whether to generate regular loads/stores, or with special handling for
1404 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1406 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1407 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1408 enum dr_alignment_support supportable_dr_alignment;
1409 struct data_reference *dr0 = NULL, *first_store = NULL;
1410 struct data_reference *dr;
1412 bool do_peeling = false;
1413 bool do_versioning = false;
1416 stmt_vec_info stmt_info;
1417 unsigned int npeel = 0;
1418 bool all_misalignments_unknown = true;
1419 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1420 unsigned possible_npeel_number = 1;
1422 unsigned int nelements, mis, same_align_drs_max = 0;
1423 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1424 hash_table<peel_info_hasher> peeling_htab (1);
1426 if (dump_enabled_p ())
1427 dump_printf_loc (MSG_NOTE, vect_location,
1428 "=== vect_enhance_data_refs_alignment ===\n");
1430 /* Reset data so we can safely be called multiple times. */
1431 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1432 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1434 /* While cost model enhancements are expected in the future, the high level
1435 view of the code at this time is as follows:
1437 A) If there is a misaligned access then see if peeling to align
1438 this access can make all data references satisfy
1439 vect_supportable_dr_alignment. If so, update data structures
1440 as needed and return true.
1442 B) If peeling wasn't possible and there is a data reference with an
1443 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1444 then see if loop versioning checks can be used to make all data
1445 references satisfy vect_supportable_dr_alignment. If so, update
1446 data structures as needed and return true.
1448 C) If neither peeling nor versioning were successful then return false if
1449 any data reference does not satisfy vect_supportable_dr_alignment.
1451 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1453 Note, Possibility 3 above (which is peeling and versioning together) is not
1454 being done at this time. */
1456 /* (1) Peeling to force alignment. */
1458 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1460 + How many accesses will become aligned due to the peeling
1461 - How many accesses will become unaligned due to the peeling,
1462 and the cost of misaligned accesses.
1463 - The cost of peeling (the extra runtime checks, the increase
1466 FOR_EACH_VEC_ELT (datarefs, i, dr)
1468 stmt = DR_STMT (dr);
1469 stmt_info = vinfo_for_stmt (stmt);
1471 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1474 /* For interleaving, only the alignment of the first access
1476 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1477 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1480 /* For invariant accesses there is nothing to enhance. */
1481 if (integer_zerop (DR_STEP (dr)))
1484 /* Strided accesses perform only component accesses, alignment is
1485 irrelevant for them. */
1486 if (STMT_VINFO_STRIDED_P (stmt_info)
1487 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1490 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1491 do_peeling = vector_alignment_reachable_p (dr);
1494 if (known_alignment_for_access_p (dr))
1496 unsigned int npeel_tmp;
1497 bool negative = tree_int_cst_compare (DR_STEP (dr),
1498 size_zero_node) < 0;
1500 /* Save info about DR in the hash table. */
1501 vectype = STMT_VINFO_VECTYPE (stmt_info);
1502 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1503 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1504 TREE_TYPE (DR_REF (dr))));
1505 npeel_tmp = (negative
1506 ? (mis - nelements) : (nelements - mis))
1509 /* For multiple types, it is possible that the bigger type access
1510 will have more than one peeling option. E.g., a loop with two
1511 types: one of size (vector size / 4), and the other one of
1512 size (vector size / 8). Vectorization factor will 8. If both
1513 access are misaligned by 3, the first one needs one scalar
1514 iteration to be aligned, and the second one needs 5. But the
1515 first one will be aligned also by peeling 5 scalar
1516 iterations, and in that case both accesses will be aligned.
1517 Hence, except for the immediate peeling amount, we also want
1518 to try to add full vector size, while we don't exceed
1519 vectorization factor.
1520 We do this automtically for cost model, since we calculate cost
1521 for every peeling option. */
1522 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1524 if (STMT_SLP_TYPE (stmt_info))
1525 possible_npeel_number
1526 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1528 possible_npeel_number = vf / nelements;
1531 /* Handle the aligned case. We may decide to align some other
1532 access, making DR unaligned. */
1533 if (DR_MISALIGNMENT (dr) == 0)
1536 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1537 possible_npeel_number++;
1540 for (j = 0; j < possible_npeel_number; j++)
1542 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1544 npeel_tmp += nelements;
1547 all_misalignments_unknown = false;
1548 /* Data-ref that was chosen for the case that all the
1549 misalignments are unknown is not relevant anymore, since we
1550 have a data-ref with known alignment. */
1555 /* If we don't know any misalignment values, we prefer
1556 peeling for data-ref that has the maximum number of data-refs
1557 with the same alignment, unless the target prefers to align
1558 stores over load. */
1559 if (all_misalignments_unknown)
1561 unsigned same_align_drs
1562 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1564 || same_align_drs_max < same_align_drs)
1566 same_align_drs_max = same_align_drs;
1569 /* For data-refs with the same number of related
1570 accesses prefer the one where the misalign
1571 computation will be invariant in the outermost loop. */
1572 else if (same_align_drs_max == same_align_drs)
1574 struct loop *ivloop0, *ivloop;
1575 ivloop0 = outermost_invariant_loop_for_expr
1576 (loop, DR_BASE_ADDRESS (dr0));
1577 ivloop = outermost_invariant_loop_for_expr
1578 (loop, DR_BASE_ADDRESS (dr));
1579 if ((ivloop && !ivloop0)
1580 || (ivloop && ivloop0
1581 && flow_loop_nested_p (ivloop, ivloop0)))
1585 if (!first_store && DR_IS_WRITE (dr))
1589 /* If there are both known and unknown misaligned accesses in the
1590 loop, we choose peeling amount according to the known
1592 if (!supportable_dr_alignment)
1595 if (!first_store && DR_IS_WRITE (dr))
1602 if (!aligned_access_p (dr))
1604 if (dump_enabled_p ())
1605 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1606 "vector alignment may not be reachable\n");
1612 /* Check if we can possibly peel the loop. */
1613 if (!vect_can_advance_ivs_p (loop_vinfo)
1614 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1619 && all_misalignments_unknown
1620 && vect_supportable_dr_alignment (dr0, false))
1622 /* Check if the target requires to prefer stores over loads, i.e., if
1623 misaligned stores are more expensive than misaligned loads (taking
1624 drs with same alignment into account). */
1625 if (first_store && DR_IS_READ (dr0))
1627 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1628 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1629 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1630 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1631 stmt_vector_for_cost dummy;
1634 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1636 vect_get_data_access_cost (first_store, &store_inside_cost,
1637 &store_outside_cost, &dummy);
1641 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1642 aligning the load DR0). */
1643 load_inside_penalty = store_inside_cost;
1644 load_outside_penalty = store_outside_cost;
1646 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1647 DR_STMT (first_store))).iterate (i, &dr);
1649 if (DR_IS_READ (dr))
1651 load_inside_penalty += load_inside_cost;
1652 load_outside_penalty += load_outside_cost;
1656 load_inside_penalty += store_inside_cost;
1657 load_outside_penalty += store_outside_cost;
1660 /* Calculate the penalty for leaving DR0 unaligned (by
1661 aligning the FIRST_STORE). */
1662 store_inside_penalty = load_inside_cost;
1663 store_outside_penalty = load_outside_cost;
1665 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1666 DR_STMT (dr0))).iterate (i, &dr);
1668 if (DR_IS_READ (dr))
1670 store_inside_penalty += load_inside_cost;
1671 store_outside_penalty += load_outside_cost;
1675 store_inside_penalty += store_inside_cost;
1676 store_outside_penalty += store_outside_cost;
1679 if (load_inside_penalty > store_inside_penalty
1680 || (load_inside_penalty == store_inside_penalty
1681 && load_outside_penalty > store_outside_penalty))
1685 /* In case there are only loads with different unknown misalignments, use
1686 peeling only if it may help to align other accesses in the loop or
1687 if it may help improving load bandwith when we'd end up using
1689 tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1691 && !STMT_VINFO_SAME_ALIGN_REFS (
1692 vinfo_for_stmt (DR_STMT (dr0))).length ()
1693 && (vect_supportable_dr_alignment (dr0, false)
1694 != dr_unaligned_supported
1695 || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1696 == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
1700 if (do_peeling && !dr0)
1702 /* Peeling is possible, but there is no data access that is not supported
1703 unless aligned. So we try to choose the best possible peeling. */
1705 /* We should get here only if there are drs with known misalignment. */
1706 gcc_assert (!all_misalignments_unknown);
1708 /* Choose the best peeling from the hash table. */
1709 dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab,
1718 stmt = DR_STMT (dr0);
1719 stmt_info = vinfo_for_stmt (stmt);
1720 vectype = STMT_VINFO_VECTYPE (stmt_info);
1721 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1723 if (known_alignment_for_access_p (dr0))
1725 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1726 size_zero_node) < 0;
1729 /* Since it's known at compile time, compute the number of
1730 iterations in the peeled loop (the peeling factor) for use in
1731 updating DR_MISALIGNMENT values. The peeling factor is the
1732 vectorization factor minus the misalignment as an element
1734 mis = DR_MISALIGNMENT (dr0);
1735 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1736 npeel = ((negative ? mis - nelements : nelements - mis)
1740 /* For interleaved data access every iteration accesses all the
1741 members of the group, therefore we divide the number of iterations
1742 by the group size. */
1743 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1744 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1745 npeel /= GROUP_SIZE (stmt_info);
1747 if (dump_enabled_p ())
1748 dump_printf_loc (MSG_NOTE, vect_location,
1749 "Try peeling by %d\n", npeel);
1752 /* Ensure that all data refs can be vectorized after the peel. */
1753 FOR_EACH_VEC_ELT (datarefs, i, dr)
1755 int save_misalignment;
1760 stmt = DR_STMT (dr);
1761 stmt_info = vinfo_for_stmt (stmt);
1762 /* For interleaving, only the alignment of the first access
1764 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1765 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1768 /* Strided accesses perform only component accesses, alignment is
1769 irrelevant for them. */
1770 if (STMT_VINFO_STRIDED_P (stmt_info)
1771 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1774 save_misalignment = DR_MISALIGNMENT (dr);
1775 vect_update_misalignment_for_peel (dr, dr0, npeel);
1776 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1777 SET_DR_MISALIGNMENT (dr, save_misalignment);
1779 if (!supportable_dr_alignment)
1786 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1788 stat = vect_verify_datarefs_alignment (loop_vinfo);
1793 body_cost_vec.release ();
1798 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1801 unsigned max_allowed_peel
1802 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1803 if (max_allowed_peel != (unsigned)-1)
1805 unsigned max_peel = npeel;
1808 gimple *dr_stmt = DR_STMT (dr0);
1809 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1810 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1811 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1813 if (max_peel > max_allowed_peel)
1816 if (dump_enabled_p ())
1817 dump_printf_loc (MSG_NOTE, vect_location,
1818 "Disable peeling, max peels reached: %d\n", max_peel);
1823 /* Cost model #2 - if peeling may result in a remaining loop not
1824 iterating enough to be vectorized then do not peel. */
1826 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1829 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1830 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1831 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1837 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1838 If the misalignment of DR_i is identical to that of dr0 then set
1839 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1840 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1841 by the peeling factor times the element size of DR_i (MOD the
1842 vectorization factor times the size). Otherwise, the
1843 misalignment of DR_i must be set to unknown. */
1844 FOR_EACH_VEC_ELT (datarefs, i, dr)
1847 /* Strided accesses perform only component accesses, alignment
1848 is irrelevant for them. */
1849 stmt_info = vinfo_for_stmt (DR_STMT (dr));
1850 if (STMT_VINFO_STRIDED_P (stmt_info)
1851 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1854 vect_update_misalignment_for_peel (dr, dr0, npeel);
1857 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1859 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1861 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1862 = DR_MISALIGNMENT (dr0);
1863 SET_DR_MISALIGNMENT (dr0, 0);
1864 if (dump_enabled_p ())
1866 dump_printf_loc (MSG_NOTE, vect_location,
1867 "Alignment of access forced using peeling.\n");
1868 dump_printf_loc (MSG_NOTE, vect_location,
1869 "Peeling for alignment will be applied.\n");
1871 /* The inside-loop cost will be accounted for in vectorizable_load
1872 and vectorizable_store correctly with adjusted alignments.
1873 Drop the body_cst_vec on the floor here. */
1874 body_cost_vec.release ();
1876 stat = vect_verify_datarefs_alignment (loop_vinfo);
1882 body_cost_vec.release ();
1884 /* (2) Versioning to force alignment. */
1886 /* Try versioning if:
1887 1) optimize loop for speed
1888 2) there is at least one unsupported misaligned data ref with an unknown
1890 3) all misaligned data refs with a known misalignment are supported, and
1891 4) the number of runtime alignment checks is within reason. */
1894 optimize_loop_nest_for_speed_p (loop)
1895 && (!loop->inner); /* FORNOW */
1899 FOR_EACH_VEC_ELT (datarefs, i, dr)
1901 stmt = DR_STMT (dr);
1902 stmt_info = vinfo_for_stmt (stmt);
1904 /* For interleaving, only the alignment of the first access
1906 if (aligned_access_p (dr)
1907 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1908 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1911 if (STMT_VINFO_STRIDED_P (stmt_info))
1913 /* Strided loads perform only component accesses, alignment is
1914 irrelevant for them. */
1915 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1917 do_versioning = false;
1921 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1923 if (!supportable_dr_alignment)
1929 if (known_alignment_for_access_p (dr)
1930 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1931 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1933 do_versioning = false;
1937 stmt = DR_STMT (dr);
1938 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1939 gcc_assert (vectype);
1941 /* The rightmost bits of an aligned address must be zeros.
1942 Construct the mask needed for this test. For example,
1943 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1944 mask must be 15 = 0xf. */
1945 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1947 /* FORNOW: use the same mask to test all potentially unaligned
1948 references in the loop. The vectorizer currently supports
1949 a single vector size, see the reference to
1950 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1951 vectorization factor is computed. */
1952 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1953 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1954 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1955 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1960 /* Versioning requires at least one misaligned data reference. */
1961 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1962 do_versioning = false;
1963 else if (!do_versioning)
1964 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1969 vec<gimple *> may_misalign_stmts
1970 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1973 /* It can now be assumed that the data references in the statements
1974 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1975 of the loop being vectorized. */
1976 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1978 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1979 dr = STMT_VINFO_DATA_REF (stmt_info);
1980 SET_DR_MISALIGNMENT (dr, 0);
1981 if (dump_enabled_p ())
1982 dump_printf_loc (MSG_NOTE, vect_location,
1983 "Alignment of access forced using versioning.\n");
1986 if (dump_enabled_p ())
1987 dump_printf_loc (MSG_NOTE, vect_location,
1988 "Versioning for alignment will be applied.\n");
1990 /* Peeling and versioning can't be done together at this time. */
1991 gcc_assert (! (do_peeling && do_versioning));
1993 stat = vect_verify_datarefs_alignment (loop_vinfo);
1998 /* This point is reached if neither peeling nor versioning is being done. */
1999 gcc_assert (! (do_peeling || do_versioning));
2001 stat = vect_verify_datarefs_alignment (loop_vinfo);
2006 /* Function vect_find_same_alignment_drs.
2008 Update group and alignment relations according to the chosen
2009 vectorization factor. */
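/* For example (hypothetical), with a vectorization factor of 4 and 4-byte
   ints, the references a[i] and a[i+4] have dependence distance 4, which is
   0 modulo the VF and involves equal element sizes, so the two drs are
   recorded in each other's STMT_VINFO_SAME_ALIGN_REFS below. */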
2012 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
2013 loop_vec_info loop_vinfo)
2016 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2017 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2018 struct data_reference *dra = DDR_A (ddr);
2019 struct data_reference *drb = DDR_B (ddr);
2020 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2021 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2022 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
2023 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
2024 lambda_vector dist_v;
2025 unsigned int loop_depth;
2027 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2033 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
2036 /* Loop-based vectorization and known data dependence. */
2037 if (DDR_NUM_DIST_VECTS (ddr) == 0)
2040 /* Data-dependence analysis reports a distance vector of zero
2041 for data-references that overlap only in the first iteration
2042 but have steps with different signs (see PR45764).
2043 So as a sanity check require equal DR_STEP. */
2044 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2047 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
2048 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
2050 int dist = dist_v[loop_depth];
2052 if (dump_enabled_p ())
2053 dump_printf_loc (MSG_NOTE, vect_location,
2054 "dependence distance = %d.\n", dist);
2056 /* Same loop iteration. */
2058 || (dist % vectorization_factor == 0 && dra_size == drb_size))
2060 /* Two references with distance zero have the same alignment. */
2061 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2062 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2063 if (dump_enabled_p ())
2065 dump_printf_loc (MSG_NOTE, vect_location,
2066 "accesses have the same alignment.\n");
2067 dump_printf (MSG_NOTE,
2068 "dependence distance modulo vf == 0 between ");
2069 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2070 dump_printf (MSG_NOTE, " and ");
2071 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2072 dump_printf (MSG_NOTE, "\n");
2079 /* Function vect_analyze_data_refs_alignment
2081 Analyze the alignment of the data-references in the loop.
2082 Return FALSE if a data reference is found that cannot be vectorized. */
2085 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2087 if (dump_enabled_p ())
2088 dump_printf_loc (MSG_NOTE, vect_location,
2089 "=== vect_analyze_data_refs_alignment ===\n");
2091 /* Mark groups of data references with same alignment using
2092 data dependence information. */
2093 vec<ddr_p> ddrs = vinfo->ddrs;
2094 struct data_dependence_relation *ddr;
2097 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2098 vect_find_same_alignment_drs (ddr, vinfo);
2100 vec<data_reference_p> datarefs = vinfo->datarefs;
2101 struct data_reference *dr;
2103 FOR_EACH_VEC_ELT (datarefs, i, dr)
2105 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2106 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2107 && !vect_compute_data_ref_alignment (dr))
2109 /* Strided accesses perform only component accesses, misalignment
2110 information is irrelevant for them. */
2111 if (STMT_VINFO_STRIDED_P (stmt_info)
2112 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2115 if (dump_enabled_p ())
2116 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2117 "not vectorized: can't calculate alignment "
2128 /* Analyze alignment of DRs of stmts in NODE. */
2131 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2133 /* We vectorize from the first scalar stmt in the node unless
2134 the node is permuted, in which case we start from the first
2135 element in the group. */
2136 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2137 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2138 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2139 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2141 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2142 if (! vect_compute_data_ref_alignment (dr)
2143 /* For creating the data-ref pointer we need alignment of the
2144 first element anyway. */
2146 && ! vect_compute_data_ref_alignment (first_dr))
2147 || ! verify_data_ref_alignment (dr))
2149 if (dump_enabled_p ())
2150 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2151 "not vectorized: bad data alignment in basic "
2159 /* Function vect_slp_analyze_and_verify_instance_alignment
2161 Analyze the alignment of the data-references in the SLP instance.
2162 Return FALSE if a data reference is found that cannot be vectorized. */
2165 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2167 if (dump_enabled_p ())
2168 dump_printf_loc (MSG_NOTE, vect_location,
2169 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2173 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2174 if (! vect_slp_analyze_and_verify_node_alignment (node))
2177 node = SLP_INSTANCE_TREE (instance);
2178 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2179 && ! vect_slp_analyze_and_verify_node_alignment
2180 (SLP_INSTANCE_TREE (instance)))
2187 /* Analyze groups of accesses: check that DR belongs to a group of
2188 accesses of legal size, step, etc. Detect gaps, single element
2189 interleaving, and other special cases. Set grouped access info.
2190 Collect groups of strided stores for further use in SLP analysis.
2191 Worker for vect_analyze_group_access. */
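/* A hypothetical example of the interleaving this detects (loads only,
   since gaps are not supported for stores):

     for (i = 0; i < n; i++)
       sum += a[3*i] + a[3*i + 2];

   Here DR_STEP is 3 elements, so GROUP_SIZE == 3; the load of a[3*i+2]
   gets GROUP_GAP == 2 because element 3*i+1 is skipped, and there is no
   gap left after the last member of the group. */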
2194 vect_analyze_group_access_1 (struct data_reference *dr)
2196 tree step = DR_STEP (dr);
2197 tree scalar_type = TREE_TYPE (DR_REF (dr));
2198 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2199 gimple *stmt = DR_STMT (dr);
2200 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2201 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2202 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2203 HOST_WIDE_INT dr_step = -1;
2204 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2205 bool slp_impossible = false;
2207 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2208 size of the interleaving group (including gaps). */
2209 if (tree_fits_shwi_p (step))
2211 dr_step = tree_to_shwi (step);
2212 /* Check that STEP is a multiple of type size. Otherwise there is
2213 a non-element-sized gap at the end of the group which we
2214 cannot represent in GROUP_GAP or GROUP_SIZE.
2215 ??? As we can handle non-constant step fine here we should
2216 simply remove uses of GROUP_GAP between the last and first
2217 element and instead rely on DR_STEP. GROUP_SIZE then would
2218 simply not include that gap. */
2219 if ((dr_step % type_size) != 0)
2221 if (dump_enabled_p ())
2223 dump_printf_loc (MSG_NOTE, vect_location,
2225 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2226 dump_printf (MSG_NOTE,
2227 " is not a multiple of the element size for ");
2228 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2229 dump_printf (MSG_NOTE, "\n");
2233 groupsize = absu_hwi (dr_step) / type_size;
2238 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2239 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2241 /* Check if this DR is part of an interleaving group, and is a single
2242 element of the group that is accessed in the loop. */
2244 /* Gaps are supported only for loads. STEP must be a multiple of the type
2245 size. The size of the group must be a power of 2. */
2247 && (dr_step % type_size) == 0
2249 && exact_log2 (groupsize) != -1)
2251 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2252 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2253 GROUP_GAP (stmt_info) = groupsize - 1;
2254 if (dump_enabled_p ())
2256 dump_printf_loc (MSG_NOTE, vect_location,
2257 "Detected single element interleaving ");
2258 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2259 dump_printf (MSG_NOTE, " step ");
2260 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2261 dump_printf (MSG_NOTE, "\n");
2267 if (dump_enabled_p ())
2269 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2270 "not consecutive access ");
2271 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2276 /* Mark the statement as unvectorizable. */
2277 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2281 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2282 STMT_VINFO_STRIDED_P (stmt_info) = true;
2286 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2288 /* First stmt in the interleaving chain. Check the chain. */
2289 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2290 struct data_reference *data_ref = dr;
2291 unsigned int count = 1;
2292 tree prev_init = DR_INIT (data_ref);
2293 gimple *prev = stmt;
2294 HOST_WIDE_INT diff, gaps = 0;
2298 /* Skip same data-refs. In case that two or more stmts share
2299 data-ref (supported only for loads), we vectorize only the first
2300 stmt, and the rest get their vectorized loads from the first
2302 if (!tree_int_cst_compare (DR_INIT (data_ref),
2303 DR_INIT (STMT_VINFO_DATA_REF (
2304 vinfo_for_stmt (next)))))
2306 if (DR_IS_WRITE (data_ref))
2308 if (dump_enabled_p ())
2309 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2310 "Two store stmts share the same dr.\n");
2314 if (dump_enabled_p ())
2315 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2316 "Two or more load stmts share the same dr.\n");
2318 /* For load use the same data-ref load. */
2319 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2322 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2327 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2329 /* All group members have the same STEP by construction. */
2330 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2332 /* Check that the distance between two accesses is equal to the type
2333 size. Otherwise, we have gaps. */
2334 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2335 - TREE_INT_CST_LOW (prev_init)) / type_size;
2338 /* FORNOW: SLP of accesses with gaps is not supported. */
2339 slp_impossible = true;
2340 if (DR_IS_WRITE (data_ref))
2342 if (dump_enabled_p ())
2343 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2344 "interleaved store with gaps\n");
2351 last_accessed_element += diff;
2353 /* Store the gap from the previous member of the group. If there is no
2354 gap in the access, GROUP_GAP is always 1. */
2355 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2357 prev_init = DR_INIT (data_ref);
2358 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2359 /* Count the number of data-refs in the chain. */
2364 groupsize = count + gaps;
2366 if (groupsize > UINT_MAX)
2368 if (dump_enabled_p ())
2369 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2370 "group is too large\n");
2374 /* Check that the size of the interleaving is equal to count for stores,
2375 i.e., that there are no gaps. */
2376 if (groupsize != count
2377 && !DR_IS_READ (dr))
2379 if (dump_enabled_p ())
2380 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2381 "interleaved store with gaps\n");
2385 /* If there is a gap after the last load in the group it is the
2386 difference between the groupsize and the last accessed
2388 When there is no gap, this difference should be 0. */
2389 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2391 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2392 if (dump_enabled_p ())
2394 dump_printf_loc (MSG_NOTE, vect_location,
2395 "Detected interleaving ");
2396 if (DR_IS_READ (dr))
2397 dump_printf (MSG_NOTE, "load ");
2399 dump_printf (MSG_NOTE, "store ");
2400 dump_printf (MSG_NOTE, "of size %u starting with ",
2401 (unsigned)groupsize);
2402 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2403 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2404 dump_printf_loc (MSG_NOTE, vect_location,
2405 "There is a gap of %u elements after the group\n",
2406 GROUP_GAP (vinfo_for_stmt (stmt)));
2409 /* SLP: create an SLP data structure for every interleaving group of
2410 stores for further analysis in vect_analyze_slp. */
2411 if (DR_IS_WRITE (dr) && !slp_impossible)
2414 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2416 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2423 /* Analyze groups of accesses: check that DR belongs to a group of
2424 accesses of legal size, step, etc. Detect gaps, single element
2425 interleaving, and other special cases. Set grouped access info.
2426 Collect groups of strided stores for further use in SLP analysis. */
2429 vect_analyze_group_access (struct data_reference *dr)
2431 if (!vect_analyze_group_access_1 (dr))
2433 /* Dissolve the group if present. */
2435 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2438 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2439 next = GROUP_NEXT_ELEMENT (vinfo);
2440 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2441 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2449 /* Analyze the access pattern of the data-reference DR.
2450 In case of non-consecutive accesses call vect_analyze_group_access() to
2451 analyze groups of accesses. */
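/* Hypothetical examples of how accesses are classified here:

     a[i] = x;          step == element size: simple consecutive access
     x = b[2*i];        constant step != element size: analyzed as a
                        (possibly single-element) interleaving group
     x = c[i*stride];   non-constant step: only handled when the stmt was
                        already marked STMT_VINFO_STRIDED_P. */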
2454 vect_analyze_data_ref_access (struct data_reference *dr)
2456 tree step = DR_STEP (dr);
2457 tree scalar_type = TREE_TYPE (DR_REF (dr));
2458 gimple *stmt = DR_STMT (dr);
2459 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2460 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2461 struct loop *loop = NULL;
2464 loop = LOOP_VINFO_LOOP (loop_vinfo);
2466 if (loop_vinfo && !step)
2468 if (dump_enabled_p ())
2469 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2470 "bad data-ref access in loop\n");
2474 /* Allow loads with zero step in inner-loop vectorization. */
2475 if (loop_vinfo && integer_zerop (step))
2477 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2478 if (!nested_in_vect_loop_p (loop, stmt))
2479 return DR_IS_READ (dr);
2480 /* Allow references with zero step for outer loops marked
2481 with pragma omp simd only - it guarantees absence of
2482 loop-carried dependencies between inner loop iterations. */
2483 if (!loop->force_vectorize)
2485 if (dump_enabled_p ())
2486 dump_printf_loc (MSG_NOTE, vect_location,
2487 "zero step in inner loop of nest\n");
2492 if (loop && nested_in_vect_loop_p (loop, stmt))
2494 /* Interleaved accesses are not yet supported within outer-loop
2495 vectorization for references in the inner-loop. */
2496 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2498 /* For the rest of the analysis we use the outer-loop step. */
2499 step = STMT_VINFO_DR_STEP (stmt_info);
2500 if (integer_zerop (step))
2502 if (dump_enabled_p ())
2503 dump_printf_loc (MSG_NOTE, vect_location,
2504 "zero step in outer loop.\n");
2505 return DR_IS_READ (dr);
2510 if (TREE_CODE (step) == INTEGER_CST)
2512 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2513 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2515 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2517 /* Mark that it is not interleaving. */
2518 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2523 if (loop && nested_in_vect_loop_p (loop, stmt))
2525 if (dump_enabled_p ())
2526 dump_printf_loc (MSG_NOTE, vect_location,
2527 "grouped access in outer loop.\n");
2532 /* Assume this is a DR handled by the non-constant strided load case. */
2533 if (TREE_CODE (step) != INTEGER_CST)
2534 return (STMT_VINFO_STRIDED_P (stmt_info)
2535 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2536 || vect_analyze_group_access (dr)));
2538 /* Non-consecutive access - check if it is part of an interleaving group. */
2539 return vect_analyze_group_access (dr);
2544 /* A helper function used in the comparator function to sort data
2545 references. T1 and T2 are two trees to be compared.
2546 The function returns -1, 0, or 1. */
2549 compare_tree (tree t1, tree t2)
2552 enum tree_code code;
2565 if (TREE_CODE (t1) != TREE_CODE (t2))
2566 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2568 code = TREE_CODE (t1);
2571 /* For const values, we can just use hash values for comparisons. */
2579 hashval_t h1 = iterative_hash_expr (t1, 0);
2580 hashval_t h2 = iterative_hash_expr (t2, 0);
2582 return h1 < h2 ? -1 : 1;
2587 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2591 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2592 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2596 tclass = TREE_CODE_CLASS (code);
2598 /* For var-decl, we could compare their UIDs. */
2599 if (tclass == tcc_declaration)
2601 if (DECL_UID (t1) != DECL_UID (t2))
2602 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2606 /* For expressions with operands, compare their operands recursively. */
2607 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2609 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2619 /* Compare two data-references DRA and DRB to sort them into chunks
2620 suitable for grouping. */
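/* For instance (hypothetical accesses), after sorting with this comparator
   all drs with the same base address and offset end up adjacent, reads
   before writes and ordered by DR_INIT, so e.g. the loads of a[i] and
   a[i+1] appear next to each other ahead of a store to a[i], while
   accesses to an unrelated array land in a separate run. */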
2623 dr_group_sort_cmp (const void *dra_, const void *drb_)
2625 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2626 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2629 /* Stabilize sort. */
2633 /* DRs in different loops never belong to the same group. */
2634 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2635 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2637 return loopa->num < loopb->num ? -1 : 1;
2639 /* Ordering of DRs according to base. */
2640 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2642 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2647 /* And according to DR_OFFSET. */
2648 if (!dr_equal_offsets_p (dra, drb))
2650 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2655 /* Put reads before writes. */
2656 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2657 return DR_IS_READ (dra) ? -1 : 1;
2659 /* Then sort by access size. */
2660 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2661 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2663 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2664 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2669 /* And by step. */
2670 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2672 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2677 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
2678 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2680 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2684 /* Function vect_analyze_data_ref_accesses.
2686 Analyze the access pattern of all the data references in the loop.
2688 FORNOW: the only access pattern that is considered vectorizable is a
2689 simple step 1 (consecutive) access.
2691 FORNOW: handle only arrays and pointer accesses. */
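/* For example (hypothetical), for the stores in

     for (i = 0; i < n; i++)
       { p[2*i] = x; p[2*i + 1] = y; }

   the sorted dataref copy places the two stores next to each other, and the
   loop below links them into one chain via GROUP_FIRST_ELEMENT /
   GROUP_NEXT_ELEMENT, which vect_analyze_group_access later recognizes as
   an interleaved store group of size 2. */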
2694 vect_analyze_data_ref_accesses (vec_info *vinfo)
2697 vec<data_reference_p> datarefs = vinfo->datarefs;
2698 struct data_reference *dr;
2700 if (dump_enabled_p ())
2701 dump_printf_loc (MSG_NOTE, vect_location,
2702 "=== vect_analyze_data_ref_accesses ===\n");
2704 if (datarefs.is_empty ())
2707 /* Sort the array of datarefs to make building the interleaving chains
2708 linear. Don't modify the original vector's order; it is needed for
2709 determining what dependencies are reversed. */
2710 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2711 datarefs_copy.qsort (dr_group_sort_cmp);
2713 /* Build the interleaving chains. */
2714 for (i = 0; i < datarefs_copy.length () - 1;)
2716 data_reference_p dra = datarefs_copy[i];
2717 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2718 stmt_vec_info lastinfo = NULL;
2719 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2724 for (i = i + 1; i < datarefs_copy.length (); ++i)
2726 data_reference_p drb = datarefs_copy[i];
2727 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2728 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2731 /* ??? Imperfect sorting (non-compatible types, non-modulo
2732 accesses, same accesses) can lead to a group being artificially
2733 split here as we don't just skip over those. If it really
2734 matters we can push those to a worklist and re-iterate
2735 over them. Then we can just skip ahead to the next DR here. */
2737 /* DRs in a different loop should not be put into the same
2738 interleaving group. */
2739 if (gimple_bb (DR_STMT (dra))->loop_father
2740 != gimple_bb (DR_STMT (drb))->loop_father)
2743 /* Check that the data-refs have the same first location (except init)
2744 and they are both either store or load (not load and store,
2745 not masked loads or stores). */
2746 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2747 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2748 DR_BASE_ADDRESS (drb), 0)
2749 || !dr_equal_offsets_p (dra, drb)
2750 || !gimple_assign_single_p (DR_STMT (dra))
2751 || !gimple_assign_single_p (DR_STMT (drb)))
2754 /* Check that the data-refs have the same constant size. */
2755 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2756 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2757 if (!tree_fits_uhwi_p (sza)
2758 || !tree_fits_uhwi_p (szb)
2759 || !tree_int_cst_equal (sza, szb))
2762 /* Check that the data-refs have the same step. */
2763 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2766 /* Do not place the same access in the interleaving chain twice. */
2767 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2770 /* Check the types are compatible.
2771 ??? We don't distinguish this during sorting. */
2772 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2773 TREE_TYPE (DR_REF (drb))))
2776 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2777 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2778 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2779 gcc_assert (init_a <= init_b);
2781 /* If init_b == init_a + the size of the type * k, we have an
2782 interleaving, and DRA is accessed before DRB. */
2783 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2784 if (type_size_a == 0
2785 || (init_b - init_a) % type_size_a != 0)
2788 /* If we have a store, the accesses are adjacent. This splits
2789 groups into chunks we support (we don't support vectorization
2790 of stores with gaps). */
2791 if (!DR_IS_READ (dra)
2792 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2793 (DR_INIT (datarefs_copy[i-1]))
2797 /* If the step (when constant and nonzero) is greater than the
2798 difference between the data-refs' inits, this splits groups into
2800 if (tree_fits_shwi_p (DR_STEP (dra)))
2802 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2803 if (step != 0 && step <= (init_b - init_a))
2807 if (dump_enabled_p ())
2809 dump_printf_loc (MSG_NOTE, vect_location,
2810 "Detected interleaving ");
2811 if (DR_IS_READ (dra))
2812 dump_printf (MSG_NOTE, "load ");
2814 dump_printf (MSG_NOTE, "store ");
2815 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2816 dump_printf (MSG_NOTE, " and ");
2817 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2818 dump_printf (MSG_NOTE, "\n");
2821 /* Link the found element into the group list. */
2822 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2824 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2825 lastinfo = stmtinfo_a;
2827 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2828 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2829 lastinfo = stmtinfo_b;
2833 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2834 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2835 && !vect_analyze_data_ref_access (dr))
2837 if (dump_enabled_p ())
2838 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2839 "not vectorized: complicated access pattern.\n");
2841 if (is_a <bb_vec_info> (vinfo))
2843 /* Mark the statement as not vectorizable. */
2844 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2849 datarefs_copy.release ();
2854 datarefs_copy.release ();
2859 /* Operator == between two dr_with_seg_len objects.
2861 This equality operator is used to make sure two data refs
2862 are the same one, so that we will consider combining the
2863 aliasing checks of those two pairs of data dependent data
2867 operator == (const dr_with_seg_len& d1,
2868 const dr_with_seg_len& d2)
2870 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2871 DR_BASE_ADDRESS (d2.dr), 0)
2872 && compare_tree (d1.offset, d2.offset) == 0
2873 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2876 /* Function comp_dr_with_seg_len_pair.
2878 Comparison function for sorting objects of dr_with_seg_len_pair_t
2879 so that we can combine aliasing checks in one scan. */
2882 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2884 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2885 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2887 const dr_with_seg_len &p11 = p1->first,
2892 /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2893 if a and c have the same base address and step, and b and d have the same
2894 address and step. Therefore, if either a&c or b&d don't have the same address
2895 and step, we don't care about the order of those two pairs after sorting. */
2898 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2899 DR_BASE_ADDRESS (p21.dr))) != 0)
2901 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2902 DR_BASE_ADDRESS (p22.dr))) != 0)
2904 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2906 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2908 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2910 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2916 /* Function vect_vfa_segment_size.
2918 Create an expression that computes the size of the segment
2919 that will be accessed for a data reference. The function takes into
2920 account that realignment loads may access one more vector.
2923 DR: The data reference.
2924 LENGTH_FACTOR: segment length to consider.
2926 Return an expression whose value is the size of the segment which will be
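/* For instance (hypothetical numbers): a dr over 4-byte ints with
   DR_STEP == 4 and LENGTH_FACTOR == 4 gets a segment of 16 bytes; if the
   access uses the dr_explicit_realign_optimized scheme, one more vector
   (e.g. another 16 bytes) is added, giving 32. */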
2930 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2932 tree segment_length;
2934 if (integer_zerop (DR_STEP (dr)))
2935 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2937 segment_length = size_binop (MULT_EXPR,
2938 fold_convert (sizetype, DR_STEP (dr)),
2939 fold_convert (sizetype, length_factor));
2941 if (vect_supportable_dr_alignment (dr, false)
2942 == dr_explicit_realign_optimized)
2944 tree vector_size = TYPE_SIZE_UNIT
2945 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2947 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2949 return segment_length;
2952 /* Function vect_prune_runtime_alias_test_list.
2954 Prune a list of ddrs to be tested at run-time by versioning for alias.
2955 Merge several alias checks into one if possible.
2956 Return FALSE if the resulting list of ddrs is longer than allowed by
2957 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2960 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2962 vec<ddr_p> may_alias_ddrs =
2963 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2964 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2965 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2966 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2967 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2973 if (dump_enabled_p ())
2974 dump_printf_loc (MSG_NOTE, vect_location,
2975 "=== vect_prune_runtime_alias_test_list ===\n");
2977 if (may_alias_ddrs.is_empty ())
2980 /* Basically, for each pair of dependent data refs store_ptr_0
2981 and load_ptr_0, we create an expression:
2983 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2984 || ((load_ptr_0 + load_segment_length_0) <= store_ptr_0)
2986 for aliasing checks. However, in some cases we can decrease
2987 the number of checks by combining two checks into one. For
2988 example, suppose we have another pair of data refs store_ptr_0
2989 and load_ptr_1, and if the following condition is satisfied:
2991 load_ptr_0 < load_ptr_1 &&
2992 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2994 (this condition means, in each iteration of vectorized loop,
2995 the accessed memory of store_ptr_0 cannot be between the memory
2996 of load_ptr_0 and load_ptr_1.)
2998 we then can use only the following expression to finish the
2999 aliasing checks between store_ptr_0 & load_ptr_0 and
3000 store_ptr_0 & load_ptr_1:
3002 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
3003 || ((load_ptr_1 + load_segment_length_1) <= store_ptr_0)
3005 Note that we only consider that load_ptr_0 and load_ptr_1 have the
3006 same base address. */
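/* As a concrete (hypothetical) instance: in

     for (i = 0; i < n; i++)
       p[i] = q[i] + q[i + 1];

   the pairs (p, q[i]) and (p, q[i+1]) share the same store, and the two
   loads have the same base address with a small constant offset between
   them, so the two range checks against p can usually be merged into a
   single check whose segment covers both loads. */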
3008 comp_alias_ddrs.create (may_alias_ddrs.length ());
3010 /* First, we collect all data ref pairs for aliasing checks. */
3011 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3013 struct data_reference *dr_a, *dr_b;
3014 gimple *dr_group_first_a, *dr_group_first_b;
3015 tree segment_length_a, segment_length_b;
3016 gimple *stmt_a, *stmt_b;
3019 stmt_a = DR_STMT (DDR_A (ddr));
3020 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3021 if (dr_group_first_a)
3023 stmt_a = dr_group_first_a;
3024 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3028 stmt_b = DR_STMT (DDR_B (ddr));
3029 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3030 if (dr_group_first_b)
3032 stmt_b = dr_group_first_b;
3033 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3036 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3037 length_factor = scalar_loop_iters;
3039 length_factor = size_int (vect_factor);
3040 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3041 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3043 dr_with_seg_len_pair_t dr_with_seg_len_pair
3044 (dr_with_seg_len (dr_a, segment_length_a),
3045 dr_with_seg_len (dr_b, segment_length_b));
3047 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
3048 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3050 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3053 /* Second, we sort the collected data ref pairs so that we can scan
3054 them once to combine all possible aliasing checks. */
3055 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
3057 /* Third, we scan the sorted dr pairs and check if we can combine
3058 alias checks of two neighboring dr pairs. */
3059 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
3061 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
3062 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
3063 *dr_b1 = &comp_alias_ddrs[i-1].second,
3064 *dr_a2 = &comp_alias_ddrs[i].first,
3065 *dr_b2 = &comp_alias_ddrs[i].second;
3067 /* Remove duplicate data ref pairs. */
3068 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
3070 if (dump_enabled_p ())
3072 dump_printf_loc (MSG_NOTE, vect_location,
3073 "found equal ranges ");
3074 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3075 DR_REF (dr_a1->dr));
3076 dump_printf (MSG_NOTE, ", ");
3077 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3078 DR_REF (dr_b1->dr));
3079 dump_printf (MSG_NOTE, " and ");
3080 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3081 DR_REF (dr_a2->dr));
3082 dump_printf (MSG_NOTE, ", ");
3083 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3084 DR_REF (dr_b2->dr));
3085 dump_printf (MSG_NOTE, "\n");
3088 comp_alias_ddrs.ordered_remove (i--);
3092 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
3094 /* We consider the case that DR_B1 and DR_B2 are the same memrefs,
3095 and DR_A1 and DR_A2 are two consecutive memrefs. */
3096 if (*dr_a1 == *dr_a2)
3098 std::swap (dr_a1, dr_b1);
3099 std::swap (dr_a2, dr_b2);
3102 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
3103 DR_BASE_ADDRESS (dr_a2->dr),
3105 || !tree_fits_shwi_p (dr_a1->offset)
3106 || !tree_fits_shwi_p (dr_a2->offset))
3109 /* Make sure dr_a1 starts left of dr_a2. */
3110 if (tree_int_cst_lt (dr_a2->offset, dr_a1->offset))
3111 std::swap (*dr_a1, *dr_a2);
3113 unsigned HOST_WIDE_INT diff
3114 = tree_to_shwi (dr_a2->offset) - tree_to_shwi (dr_a1->offset);
3117 bool do_remove = false;
3119 /* If the left segment does not extend beyond the start of the
3120 right segment, the new segment length is that of the right
3121 plus the segment distance. */
3122 if (tree_fits_uhwi_p (dr_a1->seg_len)
3123 && compare_tree_int (dr_a1->seg_len, diff) <= 0)
3125 dr_a1->seg_len = size_binop (PLUS_EXPR, dr_a2->seg_len,
3129 /* Generally the new segment length is the maximum of the
3130 left segment size and the right segment size plus the distance.
3131 ??? We can also build tree MAX_EXPR here but it's not clear this
3133 else if (tree_fits_uhwi_p (dr_a1->seg_len)
3134 && tree_fits_uhwi_p (dr_a2->seg_len))
3136 unsigned HOST_WIDE_INT seg_len_a1 = tree_to_uhwi (dr_a1->seg_len);
3137 unsigned HOST_WIDE_INT seg_len_a2 = tree_to_uhwi (dr_a2->seg_len);
3138 dr_a1->seg_len = size_int (MAX (seg_len_a1, diff + seg_len_a2));
3141 /* Now we check if the following condition is satisfied:
3143 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
3145 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
3146 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
3147 have to make a best estimation. We can get the minimum value
3148 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
3149 then either of the following two conditions can guarantee the
3152 1: DIFF <= MIN_SEG_LEN_B
3153 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B */
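/* E.g. (hypothetical values): with DIFF == 8, SEGMENT_LENGTH_A == 16 and
   MIN_SEG_LEN_B == 16, condition 1 already holds (8 <= 16), so the two
   checks can be merged; with DIFF == 400 and the same segment lengths,
   neither condition holds and both checks are kept. */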
3156 unsigned HOST_WIDE_INT min_seg_len_b
3157 = (tree_fits_uhwi_p (dr_b1->seg_len)
3158 ? tree_to_uhwi (dr_b1->seg_len)
3161 if (diff <= min_seg_len_b
3162 || (tree_fits_uhwi_p (dr_a1->seg_len)
3163 && diff - tree_to_uhwi (dr_a1->seg_len) < min_seg_len_b))
3165 dr_a1->seg_len = size_binop (PLUS_EXPR,
3166 dr_a2->seg_len, size_int (diff));
3173 if (dump_enabled_p ())
3175 dump_printf_loc (MSG_NOTE, vect_location,
3176 "merging ranges for ");
3177 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a1->dr));
3178 dump_printf (MSG_NOTE, ", ");
3179 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b1->dr));
3180 dump_printf (MSG_NOTE, " and ");
3181 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a2->dr));
3182 dump_printf (MSG_NOTE, ", ");
3183 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b2->dr));
3184 dump_printf (MSG_NOTE, "\n");
3186 comp_alias_ddrs.ordered_remove (i--);
3191 dump_printf_loc (MSG_NOTE, vect_location,
3192 "improved number of alias checks from %d to %d\n",
3193 may_alias_ddrs.length (), comp_alias_ddrs.length ());
3194 if ((int) comp_alias_ddrs.length () >
3195 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3201 /* Check whether a non-affine read or write in stmt is suitable for gather load
3202 or scatter store and if so, return a builtin decl for that operation. */
3205 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo, tree *basep,
3206 tree *offp, int *scalep)
3208 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3209 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3210 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3211 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3212 tree offtype = NULL_TREE;
3213 tree decl, base, off;
3215 int punsignedp, reversep, pvolatilep = 0;
3218 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3219 see if we can use the def stmt of the address. */
3220 if (is_gimple_call (stmt)
3221 && gimple_call_internal_p (stmt)
3222 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3223 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3224 && TREE_CODE (base) == MEM_REF
3225 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3226 && integer_zerop (TREE_OPERAND (base, 1))
3227 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3229 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3230 if (is_gimple_assign (def_stmt)
3231 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3232 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3235 /* The gather and scatter builtins need an address of the form
3236 loop_invariant + vector * {1, 2, 4, 8}
3238 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3239 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3240 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3241 multiplications and additions in it. To get a vector, we need
3242 a single SSA_NAME that will be defined in the loop and will
3243 contain everything that is not loop invariant and that can be
3244 vectorized. The following code attempts to find such a preexisting
3245 SSA_NAME OFF and put the loop invariants into a tree BASE
3246 that can be gimplified before the loop. */
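/* E.g., for a hypothetical loop

     for (i = 0; i < n; i++)
       sum += data[idx[i]];

   the load has the non-affine address &data + (sizetype) idx[i] * 4
   (4-byte elements assumed), so BASE ends up as &data, OFF as the
   SSA_NAME carrying idx[i] (possibly sign-extended), and SCALE as 4. */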
3247 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3248 &punsignedp, &reversep, &pvolatilep, false);
3249 gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3251 if (TREE_CODE (base) == MEM_REF)
3253 if (!integer_zerop (TREE_OPERAND (base, 1)))
3255 if (off == NULL_TREE)
3257 offset_int moff = mem_ref_offset (base);
3258 off = wide_int_to_tree (sizetype, moff);
3261 off = size_binop (PLUS_EXPR, off,
3262 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3264 base = TREE_OPERAND (base, 0);
3267 base = build_fold_addr_expr (base);
3269 if (off == NULL_TREE)
3270 off = size_zero_node;
3272 /* If base is not loop invariant, then either off is 0, in which case we start
3273 with just the constant offset in the loop invariant BASE and continue with
3274 base as OFF, or we give up.
3275 We could handle that case by gimplifying the addition of base + off
3276 into some SSA_NAME and use that as off, but for now punt. */
3277 if (!expr_invariant_in_loop_p (loop, base))
3279 if (!integer_zerop (off))
3282 base = size_int (pbitpos / BITS_PER_UNIT);
3284 /* Otherwise put base + constant offset into the loop invariant BASE
3285 and continue with OFF. */
3288 base = fold_convert (sizetype, base);
3289 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3292 /* OFF at this point may be either a SSA_NAME or some tree expression
3293 from get_inner_reference. Try to peel off loop invariants from it
3294 into BASE as long as possible. */
3296 while (offtype == NULL_TREE)
3298 enum tree_code code;
3299 tree op0, op1, add = NULL_TREE;
3301 if (TREE_CODE (off) == SSA_NAME)
3303 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3305 if (expr_invariant_in_loop_p (loop, off))
3308 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3311 op0 = gimple_assign_rhs1 (def_stmt);
3312 code = gimple_assign_rhs_code (def_stmt);
3313 op1 = gimple_assign_rhs2 (def_stmt);
3317 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3319 code = TREE_CODE (off);
3320 extract_ops_from_tree (off, &code, &op0, &op1);
3324 case POINTER_PLUS_EXPR:
3326 if (expr_invariant_in_loop_p (loop, op0))
3331 add = fold_convert (sizetype, add);
3333 add = size_binop (MULT_EXPR, add, size_int (scale));
3334 base = size_binop (PLUS_EXPR, base, add);
3337 if (expr_invariant_in_loop_p (loop, op1))
3345 if (expr_invariant_in_loop_p (loop, op1))
3347 add = fold_convert (sizetype, op1);
3348 add = size_binop (MINUS_EXPR, size_zero_node, add);
3354 if (scale == 1 && tree_fits_shwi_p (op1))
3356 scale = tree_to_shwi (op1);
3365 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3366 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3368 if (TYPE_PRECISION (TREE_TYPE (op0))
3369 == TYPE_PRECISION (TREE_TYPE (off)))
3374 if (TYPE_PRECISION (TREE_TYPE (op0))
3375 < TYPE_PRECISION (TREE_TYPE (off)))
3378 offtype = TREE_TYPE (off);
3389 /* If at the end OFF still isn't a SSA_NAME or isn't
3390 defined in the loop, punt. */
3391 if (TREE_CODE (off) != SSA_NAME
3392 || expr_invariant_in_loop_p (loop, off))
3395 if (offtype == NULL_TREE)
3396 offtype = TREE_TYPE (off);
3398 if (DR_IS_READ (dr))
3399 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3402 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3405 if (decl == NULL_TREE)
3417 /* Function vect_analyze_data_refs.
3419 Find all the data references in the loop or basic block.
3421 The general structure of the analysis of data refs in the vectorizer is as
3423 1- vect_analyze_data_refs(loop/bb): call
3424 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3425 in the loop/bb and their dependences.
3426 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3427 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3428 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
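/* For example (hypothetical), for the loop body a[i] = b[i] + x, step 1
   above records one data reference for the store to a[i] and one for the
   load from b[i]; the scalar x is not a memory reference and gets no dr. */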
3433 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3435 struct loop *loop = NULL;
3437 struct data_reference *dr;
3440 if (dump_enabled_p ())
3441 dump_printf_loc (MSG_NOTE, vect_location,
3442 "=== vect_analyze_data_refs ===\n");
3444 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3445 loop = LOOP_VINFO_LOOP (loop_vinfo);
3447 /* Go through the data-refs, check that the analysis succeeded. Update
3448 pointer from stmt_vec_info struct to DR and vectype. */
3450 vec<data_reference_p> datarefs = vinfo->datarefs;
3451 FOR_EACH_VEC_ELT (datarefs, i, dr)
3454 stmt_vec_info stmt_info;
3455 tree base, offset, init;
3456 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3457 bool simd_lane_access = false;
3461 if (!dr || !DR_REF (dr))
3463 if (dump_enabled_p ())
3464 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3465 "not vectorized: unhandled data-ref\n");
3469 stmt = DR_STMT (dr);
3470 stmt_info = vinfo_for_stmt (stmt);
3472 /* Discard clobbers from the dataref vector. We will remove
3473 clobber stmts during vectorization. */
3474 if (gimple_clobber_p (stmt))
3477 if (i == datarefs.length () - 1)
3482 datarefs.ordered_remove (i);
3487 /* Check that analysis of the data-ref succeeded. */
3488 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3493 && !TREE_THIS_VOLATILE (DR_REF (dr))
3494 && targetm.vectorize.builtin_gather != NULL;
3497 && !TREE_THIS_VOLATILE (DR_REF (dr))
3498 && targetm.vectorize.builtin_scatter != NULL;
3499 bool maybe_simd_lane_access
3500 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3502 /* If the target supports vector gather loads or scatter stores, or if
3503 this might be a SIMD lane access, see whether they can be used. */
3504 if (is_a <loop_vec_info> (vinfo)
3505 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3506 && !nested_in_vect_loop_p (loop, stmt))
3508 struct data_reference *newdr
3509 = create_data_ref (NULL, loop_containing_stmt (stmt),
3510 DR_REF (dr), stmt, maybe_scatter ? false : true);
3511 gcc_assert (newdr != NULL && DR_REF (newdr));
3512 if (DR_BASE_ADDRESS (newdr)
3513 && DR_OFFSET (newdr)
3516 && integer_zerop (DR_STEP (newdr)))
3518 if (maybe_simd_lane_access)
3520 tree off = DR_OFFSET (newdr);
3522 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3523 && TREE_CODE (off) == MULT_EXPR
3524 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3526 tree step = TREE_OPERAND (off, 1);
3527 off = TREE_OPERAND (off, 0);
3529 if (CONVERT_EXPR_P (off)
3530 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3532 < TYPE_PRECISION (TREE_TYPE (off)))
3533 off = TREE_OPERAND (off, 0);
3534 if (TREE_CODE (off) == SSA_NAME)
3536 gimple *def = SSA_NAME_DEF_STMT (off);
3537 tree reft = TREE_TYPE (DR_REF (newdr));
3538 if (is_gimple_call (def)
3539 && gimple_call_internal_p (def)
3540 && (gimple_call_internal_fn (def)
3541 == IFN_GOMP_SIMD_LANE))
3543 tree arg = gimple_call_arg (def, 0);
3544 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3545 arg = SSA_NAME_VAR (arg);
3546 if (arg == loop->simduid
3548 && tree_int_cst_equal
3549 (TYPE_SIZE_UNIT (reft),
3552 DR_OFFSET (newdr) = ssize_int (0);
3553 DR_STEP (newdr) = step;
3554 DR_ALIGNED_TO (newdr)
3555 = size_int (BIGGEST_ALIGNMENT);
3557 simd_lane_access = true;
3563 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3567 gatherscatter = GATHER;
3569 gatherscatter = SCATTER;
3572 if (gatherscatter == SG_NONE && !simd_lane_access)
3573 free_data_ref (newdr);
3576 if (gatherscatter == SG_NONE && !simd_lane_access)
3578 if (dump_enabled_p ())
3580 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3581 "not vectorized: data ref analysis "
3583 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3584 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3587 if (is_a <bb_vec_info> (vinfo))
3594 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3596 if (dump_enabled_p ())
3597 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3598 "not vectorized: base addr of dr is a "
3601 if (is_a <bb_vec_info> (vinfo))
3604 if (gatherscatter != SG_NONE || simd_lane_access)
3609 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3611 if (dump_enabled_p ())
3613 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3614 "not vectorized: volatile type ");
3615 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3616 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3619 if (is_a <bb_vec_info> (vinfo))
3625 if (stmt_can_throw_internal (stmt))
3627 if (dump_enabled_p ())
3629 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3630 "not vectorized: statement can throw an "
3632 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3633 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3636 if (is_a <bb_vec_info> (vinfo))
3639 if (gatherscatter != SG_NONE || simd_lane_access)
3644 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3645 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3647 if (dump_enabled_p ())
3649 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3650 "not vectorized: statement is bitfield "
3652 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3653 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3656 if (is_a <bb_vec_info> (vinfo))
3659 if (gatherscatter != SG_NONE || simd_lane_access)
3664 base = unshare_expr (DR_BASE_ADDRESS (dr));
3665 offset = unshare_expr (DR_OFFSET (dr));
3666 init = unshare_expr (DR_INIT (dr));
3668 if (is_gimple_call (stmt)
3669 && (!gimple_call_internal_p (stmt)
3670 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3671 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3673 if (dump_enabled_p ())
3675 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3676 "not vectorized: dr in a call ");
3677 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3678 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3681 if (is_a <bb_vec_info> (vinfo))
3684 if (gatherscatter != SG_NONE || simd_lane_access)
3689 /* Update DR field in stmt_vec_info struct. */
3691 /* If the dataref is in an inner-loop of the loop that is considered
3692 for vectorization, we also want to analyze the access relative to
3693 the outer-loop (DR contains information only relative to the
3694 inner-most enclosing loop). We do that by building a reference to the
3695 first location accessed by the inner-loop, and analyze it relative to
3697 if (loop && nested_in_vect_loop_p (loop, stmt))
3699 tree outer_step, outer_base, outer_init;
3700 HOST_WIDE_INT pbitsize, pbitpos;
3703 int punsignedp, preversep, pvolatilep;
3704 affine_iv base_iv, offset_iv;
3707 /* Build a reference to the first location accessed by the
3708 inner-loop: *(BASE+INIT). (The first location is actually
3709 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3710 tree inner_base = build_fold_indirect_ref
3711 (fold_build_pointer_plus (base, init));
3713 if (dump_enabled_p ())
3715 dump_printf_loc (MSG_NOTE, vect_location,
3716 "analyze in outer-loop: ");
3717 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3718 dump_printf (MSG_NOTE, "\n");
3721 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3722 &poffset, &pmode, &punsignedp,
3723 &preversep, &pvolatilep, false);
3724 gcc_assert (outer_base != NULL_TREE);
3726 if (pbitpos % BITS_PER_UNIT != 0)
3728 if (dump_enabled_p ())
3729 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3730 "failed: bit offset alignment.\n");
3736 if (dump_enabled_p ())
3737 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3738 "failed: reverse storage order.\n");
3742 outer_base = build_fold_addr_expr (outer_base);
3743 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3746 if (dump_enabled_p ())
3747 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3748 "failed: evolution of base is not affine.\n");
3755 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3763 offset_iv.base = ssize_int (0);
3764 offset_iv.step = ssize_int (0);
3766 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3769 if (dump_enabled_p ())
3770 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3771 "evolution of offset is not affine.\n");
3775 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3776 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3777 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3778 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3779 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3781 outer_step = size_binop (PLUS_EXPR,
3782 fold_convert (ssizetype, base_iv.step),
3783 fold_convert (ssizetype, offset_iv.step));
3785 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3786 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3787 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3788 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3789 STMT_VINFO_DR_OFFSET (stmt_info) =
3790 fold_convert (ssizetype, offset_iv.base);
3791 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3792 size_int (highest_pow2_factor (offset_iv.base));
3794 if (dump_enabled_p ())
3796 dump_printf_loc (MSG_NOTE, vect_location,
3797 "\touter base_address: ");
3798 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3799 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3800 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3801 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3802 STMT_VINFO_DR_OFFSET (stmt_info));
3803 dump_printf (MSG_NOTE,
3804 "\n\touter constant offset from base address: ");
3805 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3806 STMT_VINFO_DR_INIT (stmt_info));
3807 dump_printf (MSG_NOTE, "\n\touter step: ");
3808 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3809 STMT_VINFO_DR_STEP (stmt_info));
3810 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3811 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3812 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3813 dump_printf (MSG_NOTE, "\n");
3817 if (STMT_VINFO_DATA_REF (stmt_info))
3819 if (dump_enabled_p ())
3821 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3822 "not vectorized: more than one data ref "
3824 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3825 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3828 if (is_a <bb_vec_info> (vinfo))
3831 if (gatherscatter != SG_NONE || simd_lane_access)
3836 STMT_VINFO_DATA_REF (stmt_info) = dr;
3837 if (simd_lane_access)
3839 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3840 free_data_ref (datarefs[i]);
3844 /* Set vectype for STMT. */
3845 scalar_type = TREE_TYPE (DR_REF (dr));
3846 STMT_VINFO_VECTYPE (stmt_info)
3847 = get_vectype_for_scalar_type (scalar_type);
3848 if (!STMT_VINFO_VECTYPE (stmt_info))
3850 if (dump_enabled_p ())
3852 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3853 "not vectorized: no vectype for stmt: ");
3854 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3855 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3856 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3858 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3861 if (is_a <bb_vec_info> (vinfo))
3863 /* Having no vector type is fine - the ref can still participate
3864 in dependence analysis; we just can't vectorize it. */
3865 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3869 if (gatherscatter != SG_NONE || simd_lane_access)
3871 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3872 if (gatherscatter != SG_NONE)
3879 if (dump_enabled_p ())
3881 dump_printf_loc (MSG_NOTE, vect_location,
3882 "got vectype for stmt: ");
3883 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3884 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3885 STMT_VINFO_VECTYPE (stmt_info));
3886 dump_printf (MSG_NOTE, "\n");
3890 /* Adjust the minimal vectorization factor according to the
3892 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3896 if (gatherscatter != SG_NONE)
3899 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3901 || get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3903 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3905 if (dump_enabled_p ())
3907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3908 (gatherscatter == GATHER) ?
3909 "not vectorized: not suitable for gather "
3911 "not vectorized: not suitable for scatter "
3913 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3914 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3919 free_data_ref (datarefs[i]);
3921 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3924 else if (is_a <loop_vec_info> (vinfo)
3925 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3927 if (nested_in_vect_loop_p (loop, stmt))
3929 if (dump_enabled_p ())
3931 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3932 "not vectorized: not suitable for strided "
3934 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3935 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3939 STMT_VINFO_STRIDED_P (stmt_info) = true;
3943 /* If we stopped analysis at the first dataref we could not analyze
3944 when trying to vectorize a basic-block, mark the rest of the datarefs
3945 as not vectorizable and truncate the vector of datarefs. That
3946 avoids spending useless time analyzing their dependences. */
3947 if (i != datarefs.length ())
3949 gcc_assert (is_a <bb_vec_info> (vinfo));
3950 for (unsigned j = i; j < datarefs.length (); ++j)
3952 data_reference_p dr = datarefs[j];
3953 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3956 datarefs.truncate (i);
3963 /* Function vect_get_new_vect_var.
3965 Returns a name for a new variable. The current naming scheme uses the
3966 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) for
3967 vectorizer generated variables, prepending that prefix to NAME if
3971 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3978 case vect_simple_var:
3981 case vect_scalar_var:
3987 case vect_pointer_var:
3996 char* tmp = concat (prefix, "_", name, NULL);
3997 new_vect_var = create_tmp_reg (type, tmp);
4001 new_vect_var = create_tmp_reg (type, prefix);
4003 return new_vect_var;
4006 /* Like vect_get_new_vect_var but return an SSA name. */
4009 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4016 case vect_simple_var:
4019 case vect_scalar_var:
4022 case vect_pointer_var:
4031 char* tmp = concat (prefix, "_", name, NULL);
4032 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4036 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4038 return new_vect_var;
4041 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
4044 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
4045 stmt_vec_info stmt_info)
4047 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4048 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
4049 int misalign = DR_MISALIGNMENT (dr);
4051 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4053 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
4056 /* Function vect_create_addr_base_for_vector_ref.
4058 Create an expression that computes the address of the first memory location
4059 that will be accessed for a data reference.
4062 STMT: The statement containing the data reference.
4063 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4064 OFFSET: Optional. If supplied, it is added to the initial address.
4065 LOOP: Specifies relative to which loop-nest the address should be computed.
4066 For example, when the dataref is in an inner-loop nested in an
4067 outer-loop that is now being vectorized, LOOP can be either the
4068 outer-loop, or the inner-loop. The first memory location accessed
4069 by the following dataref ('in' points to short):
4076 if LOOP=i_loop: &in (relative to i_loop)
4077 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4078 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4079 initial address. Unlike OFFSET, which is number of elements to
4080 be added, BYTE_OFFSET is measured in bytes.
4083 1. Return an SSA_NAME whose value is the address of the memory location of
4084 the first vector of the data reference.
4085 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4086 these statement(s) which define the returned SSA_NAME.
4088 FORNOW: We are only handling array accesses with step 1. */
4091 vect_create_addr_base_for_vector_ref (gimple *stmt,
4092 gimple_seq *new_stmt_list,
4097 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4098 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4100 const char *base_name;
4103 gimple_seq seq = NULL;
4107 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4108 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4110 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
4112 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
4114 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
4116 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4117 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
4118 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
4122 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
4123 base_offset = unshare_expr (DR_OFFSET (dr));
4124 init = unshare_expr (DR_INIT (dr));
4128 base_name = get_name (data_ref_base);
4131 base_offset = ssize_int (0);
4132 init = ssize_int (0);
4133 base_name = get_name (DR_REF (dr));
4136 /* Create base_offset */
4137 base_offset = size_binop (PLUS_EXPR,
4138 fold_convert (sizetype, base_offset),
4139 fold_convert (sizetype, init));
4143 offset = fold_build2 (MULT_EXPR, sizetype,
4144 fold_convert (sizetype, offset), step);
4145 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4146 base_offset, offset);
4150 byte_offset = fold_convert (sizetype, byte_offset);
4151 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4152 base_offset, byte_offset);
4155 /* base + base_offset */
4157 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4160 addr_base = build1 (ADDR_EXPR,
4161 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4162 unshare_expr (DR_REF (dr)));
4165 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4166 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4167 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4168 gimple_seq_add_seq (new_stmt_list, seq);
4170 if (DR_PTR_INFO (dr)
4171 && TREE_CODE (addr_base) == SSA_NAME
4172 && !SSA_NAME_PTR_INFO (addr_base))
4174 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4175 if (offset || byte_offset)
4176 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4179 if (dump_enabled_p ())
4181 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4182 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4183 dump_printf (MSG_NOTE, "\n");
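/* Illustrative sketch, added for exposition only; it is not part of the
   vectorizer and the sketch_* name is made up.  It restates the address
   arithmetic performed above as plain integer math: DR_OFFSET and DR_INIT
   are folded into a byte offset, an optional OFFSET counted in elements of
   STEP bytes and an optional BYTE_OFFSET are added, and the result is
   applied to the base address.  */

static unsigned long
sketch_addr_base (unsigned long base, unsigned long dr_offset,
                  unsigned long dr_init, unsigned long offset,
                  unsigned long step, unsigned long byte_offset)
{
  unsigned long base_offset = dr_offset + dr_init;  /* DR_OFFSET + DR_INIT.  */
  base_offset += offset * step;                      /* Optional OFFSET.  */
  base_offset += byte_offset;                        /* Optional BYTE_OFFSET.  */
  return base + base_offset;                         /* base + base_offset.  */
}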
4190 /* Function vect_create_data_ref_ptr.
4192 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4193 location accessed in the loop by STMT, along with the def-use update
4194 chain to appropriately advance the pointer through the loop iterations.
4195 Also set aliasing information for the pointer. This pointer is used by
4196 the callers to this function to create a memory reference expression for
4197 vector load/store access.
4200 1. STMT: a stmt that references memory. Expected to be of the form
4201 GIMPLE_ASSIGN <name, data-ref> or
4202 GIMPLE_ASSIGN <data-ref, name>.
4203 2. AGGR_TYPE: the type of the reference, which should be either a vector
4205 3. AT_LOOP: the loop where the vector memref is to be created.
4206 4. OFFSET (optional): an offset to be added to the initial address accessed
4207 by the data-ref in STMT.
4208 5. BSI: location where the new stmts are to be placed if there is no loop
4209 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4210 pointing to the initial address.
4211 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4212 to the initial address accessed by the data-ref in STMT. This is
4213 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4217 1. Declare a new ptr to vector_type, and have it point to the base of the
4218 data reference (initial address accessed by the data reference).
4219 For example, for vector of type V8HI, the following code is generated:
4222 ap = (v8hi *)initial_address;
4224 if OFFSET is not supplied:
4225 initial_address = &a[init];
4226 if OFFSET is supplied:
4227 initial_address = &a[init + OFFSET];
4228 if BYTE_OFFSET is supplied:
4229 initial_address = &a[init] + BYTE_OFFSET;
4231 Return the initial_address in INITIAL_ADDRESS.
4233 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4234 update the pointer in each iteration of the loop.
4236 Return the increment stmt that updates the pointer in PTR_INCR.
4238 3. Set INV_P to true if the access pattern of the data reference in the
4239 vectorized loop is invariant. Set it to false otherwise.
4241 4. Return the pointer. */
4244 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4245 tree offset, tree *initial_address,
4246 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4247 bool only_init, bool *inv_p, tree byte_offset)
4249 const char *base_name;
4250 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4251 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4252 struct loop *loop = NULL;
4253 bool nested_in_vect_loop = false;
4254 struct loop *containing_loop = NULL;
4258 gimple_seq new_stmt_list = NULL;
4262 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4264 gimple_stmt_iterator incr_gsi;
4266 tree indx_before_incr, indx_after_incr;
4269 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4271 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4272 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4276 loop = LOOP_VINFO_LOOP (loop_vinfo);
4277 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4278 containing_loop = (gimple_bb (stmt))->loop_father;
4279 pe = loop_preheader_edge (loop);
4283 gcc_assert (bb_vinfo);
4288 /* Check the step (evolution) of the load in LOOP, and record
4289 whether it's invariant. */
4290 if (nested_in_vect_loop)
4291 step = STMT_VINFO_DR_STEP (stmt_info);
4293 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4295 if (integer_zerop (step))
4300 /* Create an expression for the first address accessed by this load
4302 base_name = get_name (DR_BASE_ADDRESS (dr));
4304 if (dump_enabled_p ())
4306 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4307 dump_printf_loc (MSG_NOTE, vect_location,
4308 "create %s-pointer variable to type: ",
4309 get_tree_code_name (TREE_CODE (aggr_type)));
4310 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4311 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4312 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4313 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4314 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4315 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4316 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4318 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4319 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4320 dump_printf (MSG_NOTE, "\n");
4323 /* (1) Create the new aggregate-pointer variable.
4324 Vector and array types inherit the alias set of their component
4325 type by default so we need to use a ref-all pointer if the data
4326 reference does not conflict with the created aggregated data
4327 reference because it is not addressable. */
4328 bool need_ref_all = false;
4329 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4330 get_alias_set (DR_REF (dr))))
4331 need_ref_all = true;
4332 /* Likewise for any of the data references in the stmt group. */
4333 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4335 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4338 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4339 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4340 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4341 get_alias_set (DR_REF (sdr))))
4343 need_ref_all = true;
4346 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4350 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4352 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4355 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4356 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4357 def-use update cycles for the pointer: one relative to the outer-loop
4358 (LOOP), which is what steps (3) and (4) below do. The other is relative
4359 to the inner-loop (which is the inner-most loop containing the dataref),
4360 and this is done by step (5) below.
4362 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4363 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4364 redundant. Steps (3),(4) create the following:
4367 LOOP: vp1 = phi(vp0,vp2)
4373 If there is an inner-loop nested in loop, then step (5) will also be
4374 applied, and an additional update in the inner-loop will be created:
4377 LOOP: vp1 = phi(vp0,vp2)
4379 inner: vp3 = phi(vp1,vp4)
4380 vp4 = vp3 + inner_step
4386 /* (2) Calculate the initial address of the aggregate-pointer, and set
4387 the aggregate-pointer to point to it before the loop. */
4389 /* Create &base[init_val + offset] + byte_offset in the loop preheader. */
4391 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4392 offset, loop, byte_offset);
4397 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4398 gcc_assert (!new_bb);
4401 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4404 *initial_address = new_temp;
4405 aggr_ptr_init = new_temp;
4407 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4408 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4409 inner-loop nested in LOOP (during outer-loop vectorization). */
4411 /* No update in loop is required. */
4412 if (only_init && (!loop_vinfo || at_loop == loop))
4413 aptr = aggr_ptr_init;
4416 /* The step of the aggregate pointer is the type size. */
4417 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4418 /* One exception to the above is when the scalar step of the load in
4419 LOOP is zero. In this case the step here is also zero. */
4421 iv_step = size_zero_node;
4422 else if (tree_int_cst_sgn (step) == -1)
4423 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4425 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4427 create_iv (aggr_ptr_init,
4428 fold_convert (aggr_ptr_type, iv_step),
4429 aggr_ptr, loop, &incr_gsi, insert_after,
4430 &indx_before_incr, &indx_after_incr);
4431 incr = gsi_stmt (incr_gsi);
4432 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4434 /* Copy the points-to information if it exists. */
4435 if (DR_PTR_INFO (dr))
4437 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4438 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4443 aptr = indx_before_incr;
4446 if (!nested_in_vect_loop || only_init)
4450 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4451 nested in LOOP, if it exists. */
4453 gcc_assert (nested_in_vect_loop);
4456 standard_iv_increment_position (containing_loop, &incr_gsi,
4458 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4459 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4461 incr = gsi_stmt (incr_gsi);
4462 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4464 /* Copy the points-to information if it exists. */
4465 if (DR_PTR_INFO (dr))
4467 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4468 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4473 return indx_before_incr;
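/* Illustrative sketch, added for exposition only; it is not part of the
   vectorizer and the names below are made up.  Assuming a plain float
   array and four-element vectors, it shows in ordinary C the def-use
   cycle that vect_create_data_ref_ptr arranges: the aggregate pointer is
   initialized in the preheader and bumped by the aggregate type size on
   every iteration of the vectorized loop.  */

static float
sketch_ptr_update (const float *base, int n)
{
  float sum = 0.0f;
  const float *vp = base;                /* vp0 = &base, set in preheader.  */
  for (int i = 0; i + 4 <= n; i += 4)    /* Vectorized loop.  */
    {
      for (int j = 0; j < 4; j++)        /* Stands in for one vector load.  */
        sum += vp[j];                    /* Access through vp1.  */
      vp += 4;                           /* vp2 = vp1 + iv_step.  */
    }
  return sum;
}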
4480 /* Function bump_vector_ptr
4482 Increment a pointer (to a vector type) by vector-size. If requested,
4483 i.e. if PTR-INCR is given, then also connect the new increment stmt
4484 to the existing def-use update-chain of the pointer, by modifying
4485 the PTR_INCR as illustrated below:
4487 The pointer def-use update-chain before this function:
4488 DATAREF_PTR = phi (p_0, p_2)
4490 PTR_INCR: p_2 = DATAREF_PTR + step
4492 The pointer def-use update-chain after this function:
4493 DATAREF_PTR = phi (p_0, p_2)
4495 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4497 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4500 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4502 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4503 the loop. The increment amount across iterations is expected
4505 BSI - location where the new update stmt is to be placed.
4506 STMT - the original scalar memory-access stmt that is being vectorized.
4507 BUMP - optional. The offset by which to bump the pointer. If not given,
4508 the offset is assumed to be vector_size.
4510 Output: Return NEW_DATAREF_PTR as illustrated above.
4515 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4516 gimple *stmt, tree bump)
4518 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4519 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4520 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4521 tree update = TYPE_SIZE_UNIT (vectype);
4524 use_operand_p use_p;
4525 tree new_dataref_ptr;
4530 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4531 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4533 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4534 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4535 dataref_ptr, update);
4536 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4538 /* Copy the points-to information if it exists. */
4539 if (DR_PTR_INFO (dr))
4541 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4542 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4546 return new_dataref_ptr;
4548 /* Update the vector-pointer's cross-iteration increment. */
4549 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4551 tree use = USE_FROM_PTR (use_p);
4553 if (use == dataref_ptr)
4554 SET_USE (use_p, new_dataref_ptr);
4556 gcc_assert (tree_int_cst_compare (use, update) == 0);
4559 return new_dataref_ptr;
4563 /* Function vect_create_destination_var.
4565 Create a new temporary of type VECTYPE. */
4568 vect_create_destination_var (tree scalar_dest, tree vectype)
4574 enum vect_var_kind kind;
4577 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4581 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4583 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4585 name = get_name (scalar_dest);
4587 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4589 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4590 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4596 /* Function vect_grouped_store_supported.
4598 Returns TRUE if interleave high and interleave low permutations
4599 are supported, and FALSE otherwise. */
4602 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4604 machine_mode mode = TYPE_MODE (vectype);
4606 /* vect_permute_store_chain requires the group size to be equal to 3 or
4607 be a power of two. */
4608 if (count != 3 && exact_log2 (count) == -1)
4610 if (dump_enabled_p ())
4611 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4612 "the size of the group of accesses"
4613 " is not a power of 2 or not eqaul to 3\n");
4617 /* Check that the permutation is supported. */
4618 if (VECTOR_MODE_P (mode))
4620 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4621 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4625 unsigned int j0 = 0, j1 = 0, j2 = 0;
4628 for (j = 0; j < 3; j++)
4630 int nelt0 = ((3 - j) * nelt) % 3;
4631 int nelt1 = ((3 - j) * nelt + 1) % 3;
4632 int nelt2 = ((3 - j) * nelt + 2) % 3;
4633 for (i = 0; i < nelt; i++)
4635 if (3 * i + nelt0 < nelt)
4636 sel[3 * i + nelt0] = j0++;
4637 if (3 * i + nelt1 < nelt)
4638 sel[3 * i + nelt1] = nelt + j1++;
4639 if (3 * i + nelt2 < nelt)
4640 sel[3 * i + nelt2] = 0;
4642 if (!can_vec_perm_p (mode, false, sel))
4644 if (dump_enabled_p ())
4645 dump_printf (MSG_MISSED_OPTIMIZATION,
4646 "permutaion op not supported by target.\n");
4650 for (i = 0; i < nelt; i++)
4652 if (3 * i + nelt0 < nelt)
4653 sel[3 * i + nelt0] = 3 * i + nelt0;
4654 if (3 * i + nelt1 < nelt)
4655 sel[3 * i + nelt1] = 3 * i + nelt1;
4656 if (3 * i + nelt2 < nelt)
4657 sel[3 * i + nelt2] = nelt + j2++;
4659 if (!can_vec_perm_p (mode, false, sel))
4661 if (dump_enabled_p ())
4662 dump_printf (MSG_MISSED_OPTIMIZATION,
4663 "permutaion op not supported by target.\n");
4671 /* If length is not equal to 3 then only power of 2 is supported. */
4672 gcc_assert (exact_log2 (count) != -1);
4674 for (i = 0; i < nelt / 2; i++)
4677 sel[i * 2 + 1] = i + nelt;
4679 if (can_vec_perm_p (mode, false, sel))
4681 for (i = 0; i < nelt; i++)
4683 if (can_vec_perm_p (mode, false, sel))
4689 if (dump_enabled_p ())
4690 dump_printf (MSG_MISSED_OPTIMIZATION,
4691 "permutaion op not supported by target.\n");
4696 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4700 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4702 return vect_lanes_optab_supported_p ("vec_store_lanes",
4703 vec_store_lanes_optab,
4708 /* Function vect_permute_store_chain.
4710 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4711 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4712 the data correctly for the stores. Return the final references for stores
4715 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4716 The input is 4 vectors each containing 8 elements. We assign a number to
4717 each element, the input sequence is:
4719 1st vec: 0 1 2 3 4 5 6 7
4720 2nd vec: 8 9 10 11 12 13 14 15
4721 3rd vec: 16 17 18 19 20 21 22 23
4722 4th vec: 24 25 26 27 28 29 30 31
4724 The output sequence should be:
4726 1st vec: 0 8 16 24 1 9 17 25
4727 2nd vec: 2 10 18 26 3 11 19 27
4728 3rd vec: 4 12 20 28 5 13 21 29
4729 4th vec: 6 14 22 30 7 15 23 31
4731 i.e., we interleave the contents of the four vectors in their order.
4733 We use interleave_high/low instructions to create such output. The input of
4734 each interleave_high/low operation is two vectors:
4737 the even elements of the result vector are obtained left-to-right from the
4738 high/low elements of the first vector. The odd elements of the result are
4739 obtained left-to-right from the high/low elements of the second vector.
4740 The output of interleave_high will be: 0 4 1 5
4741 and of interleave_low: 2 6 3 7
4744 The permutation is done in log LENGTH stages. In each stage interleave_high
4745 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4746 where the first argument is taken from the first half of DR_CHAIN and the
4747 second argument from its second half.
4750 I1: interleave_high (1st vec, 3rd vec)
4751 I2: interleave_low (1st vec, 3rd vec)
4752 I3: interleave_high (2nd vec, 4th vec)
4753 I4: interleave_low (2nd vec, 4th vec)
4755 The output for the first stage is:
4757 I1: 0 16 1 17 2 18 3 19
4758 I2: 4 20 5 21 6 22 7 23
4759 I3: 8 24 9 25 10 26 11 27
4760 I4: 12 28 13 29 14 30 15 31
4762 The output of the second stage, i.e. the final result is:
4764 I1: 0 8 16 24 1 9 17 25
4765 I2: 2 10 18 26 3 11 19 27
4766 I3: 4 12 20 28 5 13 21 29
4767 I4: 6 14 22 30 7 15 23 31. */
4770 vect_permute_store_chain (vec<tree> dr_chain,
4771 unsigned int length,
4773 gimple_stmt_iterator *gsi,
4774 vec<tree> *result_chain)
4776 tree vect1, vect2, high, low;
4778 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4779 tree perm_mask_low, perm_mask_high;
4781 tree perm3_mask_low, perm3_mask_high;
4782 unsigned int i, n, log_length = exact_log2 (length);
4783 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4784 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4786 result_chain->quick_grow (length);
4787 memcpy (result_chain->address (), dr_chain.address (),
4788 length * sizeof (tree));
4792 unsigned int j0 = 0, j1 = 0, j2 = 0;
4794 for (j = 0; j < 3; j++)
4796 int nelt0 = ((3 - j) * nelt) % 3;
4797 int nelt1 = ((3 - j) * nelt + 1) % 3;
4798 int nelt2 = ((3 - j) * nelt + 2) % 3;
4800 for (i = 0; i < nelt; i++)
4802 if (3 * i + nelt0 < nelt)
4803 sel[3 * i + nelt0] = j0++;
4804 if (3 * i + nelt1 < nelt)
4805 sel[3 * i + nelt1] = nelt + j1++;
4806 if (3 * i + nelt2 < nelt)
4807 sel[3 * i + nelt2] = 0;
4809 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4811 for (i = 0; i < nelt; i++)
4813 if (3 * i + nelt0 < nelt)
4814 sel[3 * i + nelt0] = 3 * i + nelt0;
4815 if (3 * i + nelt1 < nelt)
4816 sel[3 * i + nelt1] = 3 * i + nelt1;
4817 if (3 * i + nelt2 < nelt)
4818 sel[3 * i + nelt2] = nelt + j2++;
4820 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4822 vect1 = dr_chain[0];
4823 vect2 = dr_chain[1];
4825 /* Create interleaving stmt:
4826 low = VEC_PERM_EXPR <vect1, vect2,
4827 {j, nelt, *, j + 1, nelt + j + 1, *,
4828 j + 2, nelt + j + 2, *, ...}> */
4829 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4830 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4831 vect2, perm3_mask_low);
4832 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4835 vect2 = dr_chain[2];
4836 /* Create interleaving stmt:
4837 low = VEC_PERM_EXPR <vect1, vect2,
4838 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4839 6, 7, nelt + j + 2, ...}> */
4840 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4841 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4842 vect2, perm3_mask_high);
4843 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4844 (*result_chain)[j] = data_ref;
4849 /* If length is not equal to 3 then only power of 2 is supported. */
4850 gcc_assert (exact_log2 (length) != -1);
4852 for (i = 0, n = nelt / 2; i < n; i++)
4855 sel[i * 2 + 1] = i + nelt;
4857 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4859 for (i = 0; i < nelt; i++)
4861 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4863 for (i = 0, n = log_length; i < n; i++)
4865 for (j = 0; j < length/2; j++)
4867 vect1 = dr_chain[j];
4868 vect2 = dr_chain[j+length/2];
4870 /* Create interleaving stmt:
4871 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4873 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4874 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4875 vect2, perm_mask_high);
4876 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4877 (*result_chain)[2*j] = high;
4879 /* Create interleaving stmt:
4880 low = VEC_PERM_EXPR <vect1, vect2,
4881 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4883 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4884 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4885 vect2, perm_mask_low);
4886 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4887 (*result_chain)[2*j+1] = low;
4889 memcpy (dr_chain.address (), result_chain->address (),
4890 length * sizeof (tree));
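/* Illustrative sketch, added for exposition only; it is not part of the
   vectorizer and the sketch_* name is made up.  It is a scalar reference
   model of the power-of-two path above: each of the log2 (LENGTH) stages
   pairs vector J with vector J + LENGTH/2 and emits the interleave-high
   and interleave-low results at positions 2*J and 2*J + 1, exactly as the
   VEC_PERM_EXPRs generated above.  VECS holds LENGTH rows of NELT
   elements, flattened, and is permuted in place; the scratch buffer
   assumes LENGTH * NELT <= 64.  Running it on the example in the comment
   above (LENGTH 4, NELT 8, values 0..31) yields 0 8 16 24 1 9 17 25, ...  */

static void
sketch_store_chain (int *vecs, unsigned int length, unsigned int nelt)
{
  int next[64];
  for (unsigned int stage = 1; stage < length; stage *= 2)
    {
      for (unsigned int j = 0; j < length / 2; j++)
        for (unsigned int i = 0; i < nelt / 2; i++)
          {
            int *a = vecs + j * nelt;
            int *b = vecs + (j + length / 2) * nelt;
            /* interleave_high (vec j, vec j + length/2).  */
            next[(2 * j) * nelt + 2 * i] = a[i];
            next[(2 * j) * nelt + 2 * i + 1] = b[i];
            /* interleave_low of the same pair.  */
            next[(2 * j + 1) * nelt + 2 * i] = a[nelt / 2 + i];
            next[(2 * j + 1) * nelt + 2 * i + 1] = b[nelt / 2 + i];
          }
      for (unsigned int i = 0; i < length * nelt; i++)
        vecs[i] = next[i];
    }
}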
4895 /* Function vect_setup_realignment
4897 This function is called when vectorizing an unaligned load using
4898 the dr_explicit_realign[_optimized] scheme.
4899 This function generates the following code at the loop prolog:
4902 x msq_init = *(floor(p)); # prolog load
4903 realignment_token = call target_builtin;
4905 x msq = phi (msq_init, ---)
4907 The stmts marked with x are generated only for the case of
4908 dr_explicit_realign_optimized.
4910 The code above sets up a new (vector) pointer, pointing to the first
4911 location accessed by STMT, and a "floor-aligned" load using that pointer.
4912 It also generates code to compute the "realignment-token" (if the relevant
4913 target hook was defined), and creates a phi-node at the loop-header bb
4914 whose arguments are the result of the prolog-load (created by this
4915 function) and the result of a load that takes place in the loop (to be
4916 created by the caller to this function).
4918 For the case of dr_explicit_realign_optimized:
4919 The caller to this function uses the phi-result (msq) to create the
4920 realignment code inside the loop, and sets up the missing phi argument,
4923 msq = phi (msq_init, lsq)
4924 lsq = *(floor(p')); # load in loop
4925 result = realign_load (msq, lsq, realignment_token);
4927 For the case of dr_explicit_realign:
4929 msq = *(floor(p)); # load in loop
4931 lsq = *(floor(p')); # load in loop
4932 result = realign_load (msq, lsq, realignment_token);
4935 STMT - (scalar) load stmt to be vectorized. This load accesses
4936 a memory location that may be unaligned.
4937 BSI - place where new code is to be inserted.
4938 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4942 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4943 target hook, if defined.
4944 Return value - the result of the loop-header phi node. */
4947 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4948 tree *realignment_token,
4949 enum dr_alignment_support alignment_support_scheme,
4951 struct loop **at_loop)
4953 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4954 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4955 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4956 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4957 struct loop *loop = NULL;
4959 tree scalar_dest = gimple_assign_lhs (stmt);
4965 tree msq_init = NULL_TREE;
4968 tree msq = NULL_TREE;
4969 gimple_seq stmts = NULL;
4971 bool compute_in_loop = false;
4972 bool nested_in_vect_loop = false;
4973 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4974 struct loop *loop_for_initial_load = NULL;
4978 loop = LOOP_VINFO_LOOP (loop_vinfo);
4979 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4982 gcc_assert (alignment_support_scheme == dr_explicit_realign
4983 || alignment_support_scheme == dr_explicit_realign_optimized);
4985 /* We need to generate three things:
4986 1. the misalignment computation
4987 2. the extra vector load (for the optimized realignment scheme).
4988 3. the phi node for the two vectors from which the realignment is
4989 done (for the optimized realignment scheme). */
4991 /* 1. Determine where to generate the misalignment computation.
4993 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4994 calculation will be generated by this function, outside the loop (in the
4995 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4996 caller, inside the loop.
4998 Background: If the misalignment remains fixed throughout the iterations of
4999 the loop, then both realignment schemes are applicable, and also the
5000 misalignment computation can be done outside LOOP. This is because we are
5001 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5002 are a multiple of VS (the Vector Size), and therefore the misalignment in
5003 different vectorized LOOP iterations is always the same.
5004 The problem arises only if the memory access is in an inner-loop nested
5005 inside LOOP, which is now being vectorized using outer-loop vectorization.
5006 This is the only case when the misalignment of the memory access may not
5007 remain fixed throughout the iterations of the inner-loop (as explained in
5008 detail in vect_supportable_dr_alignment). In this case, not only is the
5009 optimized realignment scheme not applicable, but also the misalignment
5010 computation (and generation of the realignment token that is passed to
5011 REALIGN_LOAD) have to be done inside the loop.
5013 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5014 or not, which in turn determines if the misalignment is computed inside
5015 the inner-loop, or outside LOOP. */
5017 if (init_addr != NULL_TREE || !loop_vinfo)
5019 compute_in_loop = true;
5020 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5024 /* 2. Determine where to generate the extra vector load.
5026 For the optimized realignment scheme, instead of generating two vector
5027 loads in each iteration, we generate a single extra vector load in the
5028 preheader of the loop, and in each iteration reuse the result of the
5029 vector load from the previous iteration. In case the memory access is in
5030 an inner-loop nested inside LOOP, which is now being vectorized using
5031 outer-loop vectorization, we need to determine whether this initial vector
5032 load should be generated at the preheader of the inner-loop, or can be
5033 generated at the preheader of LOOP. If the memory access has no evolution
5034 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5035 to be generated inside LOOP (in the preheader of the inner-loop). */
5037 if (nested_in_vect_loop)
5039 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5040 bool invariant_in_outerloop =
5041 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5042 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5045 loop_for_initial_load = loop;
5047 *at_loop = loop_for_initial_load;
5049 if (loop_for_initial_load)
5050 pe = loop_preheader_edge (loop_for_initial_load);
5052 /* 3. For the case of the optimized realignment, create the first vector
5053 load at the loop preheader. */
5055 if (alignment_support_scheme == dr_explicit_realign_optimized)
5057 /* Create msq_init = *(floor(p1)) in the loop preheader */
5060 gcc_assert (!compute_in_loop);
5061 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5062 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5063 NULL_TREE, &init_addr, NULL, &inc,
5065 if (TREE_CODE (ptr) == SSA_NAME)
5066 new_temp = copy_ssa_name (ptr);
5068 new_temp = make_ssa_name (TREE_TYPE (ptr));
5069 new_stmt = gimple_build_assign
5070 (new_temp, BIT_AND_EXPR, ptr,
5071 build_int_cst (TREE_TYPE (ptr),
5072 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
5073 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5074 gcc_assert (!new_bb);
5076 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5077 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5078 new_stmt = gimple_build_assign (vec_dest, data_ref);
5079 new_temp = make_ssa_name (vec_dest, new_stmt);
5080 gimple_assign_set_lhs (new_stmt, new_temp);
5083 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5084 gcc_assert (!new_bb);
5087 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5089 msq_init = gimple_assign_lhs (new_stmt);
5092 /* 4. Create realignment token using a target builtin, if available.
5093 It is done either inside the containing loop, or before LOOP (as
5094 determined above). */
5096 if (targetm.vectorize.builtin_mask_for_load)
5101 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5104 /* Generate the INIT_ADDR computation outside LOOP. */
5105 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5109 pe = loop_preheader_edge (loop);
5110 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5111 gcc_assert (!new_bb);
5114 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5117 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5118 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5120 vect_create_destination_var (scalar_dest,
5121 gimple_call_return_type (new_stmt));
5122 new_temp = make_ssa_name (vec_dest, new_stmt);
5123 gimple_call_set_lhs (new_stmt, new_temp);
5125 if (compute_in_loop)
5126 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5129 /* Generate the misalignment computation outside LOOP. */
5130 pe = loop_preheader_edge (loop);
5131 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5132 gcc_assert (!new_bb);
5135 *realignment_token = gimple_call_lhs (new_stmt);
5137 /* The result of the CALL_EXPR to this builtin is determined from
5138 the value of the parameter and no global variables are touched
5139 which makes the builtin a "const" function. Requiring the
5140 builtin to have the "const" attribute makes it unnecessary
5141 to call mark_call_clobbered. */
5142 gcc_assert (TREE_READONLY (builtin_decl));
5145 if (alignment_support_scheme == dr_explicit_realign)
5148 gcc_assert (!compute_in_loop);
5149 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5152 /* 5. Create msq = phi <msq_init, lsq> in loop */
5154 pe = loop_preheader_edge (containing_loop);
5155 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5156 msq = make_ssa_name (vec_dest);
5157 phi_stmt = create_phi_node (msq, containing_loop->header);
5158 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
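/* Illustrative sketch, added for exposition only; it is not part of the
   vectorizer and the sketch_* name is made up.  Assuming four-element int
   vectors and a misalignment of OFF elements (0 <= OFF < 4), it shows the
   dr_explicit_realign_optimized scheme in scalar form: MSQ carries the
   previous "floor" load across iterations, so each iteration needs only
   one new load (LSQ) plus a realignment that picks elements from the two.
   Unlike the real scheme, which issues full aligned vector loads, this
   scalar model only touches the elements it actually copies; forming the
   address P - OFF mirrors the floor (p) computation above.  */

static void
sketch_realign_load (const int *p, int off, int n, int *out)
{
  const int *ap = p - off;               /* floor (p), the aligned address.  */
  const int *msq = ap;                   /* msq_init = *(floor (p)).  */
  const int *lsq = ap;
  for (int i = 0; i + 4 <= n; i += 4, msq = lsq)
    {
      lsq = ap + i + 4;                  /* lsq = *(floor (p')).  */
      for (int j = 0; j < 4; j++)        /* realign_load (msq, lsq, rt).  */
        out[i + j] = (off + j < 4) ? msq[off + j] : lsq[off + j - 4];
    }
}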
5164 /* Function vect_grouped_load_supported.
5166 Returns TRUE if even and odd permutations are supported,
5167 and FALSE otherwise. */
5170 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
5172 machine_mode mode = TYPE_MODE (vectype);
5174 /* vect_permute_load_chain requires the group size to be equal to 3 or
5175 be a power of two. */
5176 if (count != 3 && exact_log2 (count) == -1)
5178 if (dump_enabled_p ())
5179 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5180 "the size of the group of accesses"
5181 " is not a power of 2 or not equal to 3\n");
5185 /* Check that the permutation is supported. */
5186 if (VECTOR_MODE_P (mode))
5188 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5189 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5194 for (k = 0; k < 3; k++)
5196 for (i = 0; i < nelt; i++)
5197 if (3 * i + k < 2 * nelt)
5201 if (!can_vec_perm_p (mode, false, sel))
5203 if (dump_enabled_p ())
5204 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5205 "shuffle of 3 loads is not supported by"
5209 for (i = 0, j = 0; i < nelt; i++)
5210 if (3 * i + k < 2 * nelt)
5213 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5214 if (!can_vec_perm_p (mode, false, sel))
5216 if (dump_enabled_p ())
5217 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5218 "shuffle of 3 loads is not supported by"
5227 /* If length is not equal to 3 then only power of 2 is supported. */
5228 gcc_assert (exact_log2 (count) != -1);
5229 for (i = 0; i < nelt; i++)
5231 if (can_vec_perm_p (mode, false, sel))
5233 for (i = 0; i < nelt; i++)
5235 if (can_vec_perm_p (mode, false, sel))
5241 if (dump_enabled_p ())
5242 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5243 "extract even/odd not supported by target\n");
5247 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5251 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5253 return vect_lanes_optab_supported_p ("vec_load_lanes",
5254 vec_load_lanes_optab,
5258 /* Function vect_permute_load_chain.
5260 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5261 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5262 the input data correctly. Return the final references for loads in
5265 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5266 The input is 4 vectors each containing 8 elements. We assign a number to each
5267 element, the input sequence is:
5269 1st vec: 0 1 2 3 4 5 6 7
5270 2nd vec: 8 9 10 11 12 13 14 15
5271 3rd vec: 16 17 18 19 20 21 22 23
5272 4th vec: 24 25 26 27 28 29 30 31
5274 The output sequence should be:
5276 1st vec: 0 4 8 12 16 20 24 28
5277 2nd vec: 1 5 9 13 17 21 25 29
5278 3rd vec: 2 6 10 14 18 22 26 30
5279 4th vec: 3 7 11 15 19 23 27 31
5281 i.e., the first output vector should contain the first elements of each
5282 interleaving group, etc.
5284 We use extract_even/odd instructions to create such output. The input of
5285 each extract_even/odd operation is two vectors
5289 and the output is the vector of extracted even/odd elements. The output of
5290 extract_even will be: 0 2 4 6
5291 and of extract_odd: 1 3 5 7
5294 The permutation is done in log LENGTH stages. In each stage extract_even
5295 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5296 their order. In our example,
5298 E1: extract_even (1st vec, 2nd vec)
5299 E2: extract_odd (1st vec, 2nd vec)
5300 E3: extract_even (3rd vec, 4th vec)
5301 E4: extract_odd (3rd vec, 4th vec)
5303 The output for the first stage will be:
5305 E1: 0 2 4 6 8 10 12 14
5306 E2: 1 3 5 7 9 11 13 15
5307 E3: 16 18 20 22 24 26 28 30
5308 E4: 17 19 21 23 25 27 29 31
5310 In order to proceed and create the correct sequence for the next stage (or
5311 for the correct output, if the second stage is the last one, as in our
5312 example), we first put the output of extract_even operation and then the
5313 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5314 The input for the second stage is:
5316 1st vec (E1): 0 2 4 6 8 10 12 14
5317 2nd vec (E3): 16 18 20 22 24 26 28 30
5318 3rd vec (E2): 1 3 5 7 9 11 13 15
5319 4th vec (E4): 17 19 21 23 25 27 29 31
5321 The output of the second stage:
5323 E1: 0 4 8 12 16 20 24 28
5324 E2: 2 6 10 14 18 22 26 30
5325 E3: 1 5 9 13 17 21 25 29
5326 E4: 3 7 11 15 19 23 27 31
5328 And RESULT_CHAIN after reordering:
5330 1st vec (E1): 0 4 8 12 16 20 24 28
5331 2nd vec (E3): 1 5 9 13 17 21 25 29
5332 3rd vec (E2): 2 6 10 14 18 22 26 30
5333 4th vec (E4): 3 7 11 15 19 23 27 31. */
5336 vect_permute_load_chain (vec<tree> dr_chain,
5337 unsigned int length,
5339 gimple_stmt_iterator *gsi,
5340 vec<tree> *result_chain)
5342 tree data_ref, first_vect, second_vect;
5343 tree perm_mask_even, perm_mask_odd;
5344 tree perm3_mask_low, perm3_mask_high;
5346 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5347 unsigned int i, j, log_length = exact_log2 (length);
5348 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5349 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5351 result_chain->quick_grow (length);
5352 memcpy (result_chain->address (), dr_chain.address (),
5353 length * sizeof (tree));
5359 for (k = 0; k < 3; k++)
5361 for (i = 0; i < nelt; i++)
5362 if (3 * i + k < 2 * nelt)
5366 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5368 for (i = 0, j = 0; i < nelt; i++)
5369 if (3 * i + k < 2 * nelt)
5372 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5374 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5376 first_vect = dr_chain[0];
5377 second_vect = dr_chain[1];
5379 /* Create interleaving stmt (low part of):
5380 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5382 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5383 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5384 second_vect, perm3_mask_low);
5385 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5387 /* Create interleaving stmt (high part of):
5388 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5390 first_vect = data_ref;
5391 second_vect = dr_chain[2];
5392 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5393 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5394 second_vect, perm3_mask_high);
5395 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5396 (*result_chain)[k] = data_ref;
5401 /* If length is not equal to 3 then only power of 2 is supported. */
5402 gcc_assert (exact_log2 (length) != -1);
5404 for (i = 0; i < nelt; ++i)
5406 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5408 for (i = 0; i < nelt; ++i)
5410 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5412 for (i = 0; i < log_length; i++)
5414 for (j = 0; j < length; j += 2)
5416 first_vect = dr_chain[j];
5417 second_vect = dr_chain[j+1];
5419 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5420 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5421 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5422 first_vect, second_vect,
5424 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5425 (*result_chain)[j/2] = data_ref;
5427 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5428 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5429 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5430 first_vect, second_vect,
5432 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5433 (*result_chain)[j/2+length/2] = data_ref;
5435 memcpy (dr_chain.address (), result_chain->address (),
5436 length * sizeof (tree));
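/* Illustrative sketch, added for exposition only; it is not part of the
   vectorizer and the sketch_* name is made up.  It is a scalar reference
   model of the power-of-two path above: each stage takes the pair
   (vec j, vec j + 1), writes the extract_even result at position J/2 and
   the extract_odd result at position J/2 + LENGTH/2, and feeds the result
   back in, mirroring how RESULT_CHAIN is copied back to DR_CHAIN above.
   VECS holds LENGTH rows of NELT elements, flattened; the scratch buffer
   assumes LENGTH * NELT <= 64.  On the example in the comment above
   (LENGTH 4, NELT 8, values 0..31) it produces 0 4 8 12 16 20 24 28, ...  */

static void
sketch_load_chain (int *vecs, unsigned int length, unsigned int nelt)
{
  int next[64];
  for (unsigned int stage = 1; stage < length; stage *= 2)
    {
      for (unsigned int j = 0; j < length; j += 2)
        {
          int *a = vecs + j * nelt;
          int *b = vecs + (j + 1) * nelt;
          for (unsigned int i = 0; i < nelt / 2; i++)
            {
              /* extract_even (vec j, vec j + 1) -> result position j/2.  */
              next[(j / 2) * nelt + i] = a[2 * i];
              next[(j / 2) * nelt + nelt / 2 + i] = b[2 * i];
              /* extract_odd (vec j, vec j + 1) -> position j/2 + length/2.  */
              next[(j / 2 + length / 2) * nelt + i] = a[2 * i + 1];
              next[(j / 2 + length / 2) * nelt + nelt / 2 + i] = b[2 * i + 1];
            }
        }
      for (unsigned int i = 0; i < length * nelt; i++)
        vecs[i] = next[i];
    }
}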
5441 /* Function vect_shift_permute_load_chain.
5443 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5444 sequence of stmts to reorder the input data accordingly.
5445 Return the final references for loads in RESULT_CHAIN.
5446 Return true if successful, false otherwise.
5448 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5449 The input is 3 vectors each containing 8 elements. We assign a
5450 number to each element, the input sequence is:
5452 1st vec: 0 1 2 3 4 5 6 7
5453 2nd vec: 8 9 10 11 12 13 14 15
5454 3rd vec: 16 17 18 19 20 21 22 23
5456 The output sequence should be:
5458 1st vec: 0 3 6 9 12 15 18 21
5459 2nd vec: 1 4 7 10 13 16 19 22
5460 3rd vec: 2 5 8 11 14 17 20 23
5462 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5464 First we shuffle all 3 vectors to get correct elements order:
5466 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5467 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5468 3rd vec: (16 19 22) (17 20 23) (18 21)
5470 Next we combine and shift the vectors 3 times:
5473 shift right by 6 the concatenation of:
5474 "1st vec" and "2nd vec"
5475 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5476 "2nd vec" and "3rd vec"
5477 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5478 "3rd vec" and "1st vec"
5479 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5482 So that now new vectors are:
5484 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5485 2nd vec: (10 13) (16 19 22) (17 20 23)
5486 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5489 shift right by 5 the concatenation of:
5490 "1st vec" and "3rd vec"
5491 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5492 "2nd vec" and "1st vec"
5493 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5494 "3rd vec" and "2nd vec"
5495 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5498 So that now new vectors are:
5500 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5501 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5502 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5505 shift right by 5 the concatenation of:
5506 "1st vec" and "1st vec"
5507 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5508 shift right by 3 the concatenation of:
5509 "2nd vec" and "2nd vec"
5510 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5513 So that now all vectors are READY:
5514 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5515 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5516 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5518 This algorithm is faster than the one in vect_permute_load_chain if:
5519 1. "shift of a concatenation" is faster than general permutation.
5521 2. The TARGET machine can't execute vector instructions in parallel.
5522 This is because each step of the algorithm depends on the previous one.
5523 The algorithm in vect_permute_load_chain is much more parallel.
5525 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5529 vect_shift_permute_load_chain (vec<tree> dr_chain,
5530 unsigned int length,
5532 gimple_stmt_iterator *gsi,
5533 vec<tree> *result_chain)
5535 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5536 tree perm2_mask1, perm2_mask2, perm3_mask;
5537 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5540 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5542 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5543 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5544 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5545 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5547 result_chain->quick_grow (length);
5548 memcpy (result_chain->address (), dr_chain.address (),
5549 length * sizeof (tree));
5551 if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5553 unsigned int j, log_length = exact_log2 (length);
5554 for (i = 0; i < nelt / 2; ++i)
5556 for (i = 0; i < nelt / 2; ++i)
5557 sel[nelt / 2 + i] = i * 2 + 1;
5558 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5560 if (dump_enabled_p ())
5561 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5562 "shuffle of 2 fields structure is not \
5563 supported by target\n");
5566 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5568 for (i = 0; i < nelt / 2; ++i)
5570 for (i = 0; i < nelt / 2; ++i)
5571 sel[nelt / 2 + i] = i * 2;
5572 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5574 if (dump_enabled_p ())
5575 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5576 "shuffle of 2 fields structure is not \
5577 supported by target\n");
5580 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5582 /* Generating permutation constant to shift all elements.
5583 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5584 for (i = 0; i < nelt; i++)
5585 sel[i] = nelt / 2 + i;
5586 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5588 if (dump_enabled_p ())
5589 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5590 "shift permutation is not supported by target\n");
5593 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5595 /* Generating permutation constant to select vector from 2.
5596 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5597 for (i = 0; i < nelt / 2; i++)
5599 for (i = nelt / 2; i < nelt; i++)
5601 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5603 if (dump_enabled_p ())
5604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5605 "select is not supported by target\n");
5608 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5610 for (i = 0; i < log_length; i++)
5612 for (j = 0; j < length; j += 2)
5614 first_vect = dr_chain[j];
5615 second_vect = dr_chain[j + 1];
5617 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5618 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5619 first_vect, first_vect,
5621 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5624 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5625 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5626 second_vect, second_vect,
5628 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5631 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5632 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5633 vect[0], vect[1], shift1_mask);
5634 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5635 (*result_chain)[j/2 + length/2] = data_ref;
5637 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5638 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5639 vect[0], vect[1], select_mask);
5640 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5641 (*result_chain)[j/2] = data_ref;
5643 memcpy (dr_chain.address (), result_chain->address (),
5644 length * sizeof (tree));
5648 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5650 unsigned int k = 0, l = 0;
5652 /* Generating permutation constant to get all elements in right order.
5653 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5654 for (i = 0; i < nelt; i++)
5656 if (3 * k + (l % 3) >= nelt)
5659 l += (3 - (nelt % 3));
5661 sel[i] = 3 * k + (l % 3);
5664 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5666 if (dump_enabled_p ())
5667 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5668 "shuffle of 3 fields structure is not \
5669 supported by target\n");
5672 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5674 /* Generating permutation constant to shift all elements.
5675 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5676 for (i = 0; i < nelt; i++)
5677 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5678 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5680 if (dump_enabled_p ())
5681 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5682 "shift permutation is not supported by target\n");
5685 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5687 /* Generating permutation constant to shift all elements.
5688 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5689 for (i = 0; i < nelt; i++)
5690 sel[i] = 2 * (nelt / 3) + 1 + i;
5691 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5693 if (dump_enabled_p ())
5694 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5695 "shift permutation is not supported by target\n");
5698 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5700 /* Generating permutation constant to shift all elements.
5701 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5702 for (i = 0; i < nelt; i++)
5703 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5704 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5706 if (dump_enabled_p ())
5707 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5708 "shift permutation is not supported by target\n");
5711 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5713 /* Generating permutation constant to shift all elements.
5714 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5715 for (i = 0; i < nelt; i++)
5716 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5717 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5719 if (dump_enabled_p ())
5720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5721 "shift permutation is not supported by target\n");
5724 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5726 for (k = 0; k < 3; k++)
5728 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5729 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5730 dr_chain[k], dr_chain[k],
5732 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5736 for (k = 0; k < 3; k++)
5738 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5739 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5740 vect[k % 3], vect[(k + 1) % 3],
5742 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5743 vect_shift[k] = data_ref;
5746 for (k = 0; k < 3; k++)
5748 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5749 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5750 vect_shift[(4 - k) % 3],
5751 vect_shift[(3 - k) % 3],
5753 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5757 (*result_chain)[3 - (nelt % 3)] = vect[2];
5759 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5760 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5761 vect[0], shift3_mask);
5762 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5763 (*result_chain)[nelt % 3] = data_ref;
5765 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5766 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5767 vect[1], shift4_mask);
5768 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5769 (*result_chain)[0] = data_ref;
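/* Illustrative sketch, added for exposition only; it is not part of the
   vectorizer and the sketch_* name is made up.  It states the end result
   the LENGTH == 3 sequence above must produce, as a plain 3-way
   de-interleave: field K of element I ends up at position I of output
   vector K.  The shift-based construction above reaches the same layout
   using only whole-vector shuffles and shifts of concatenations, which is
   what makes it profitable on the targets described in the comment.  */

static void
sketch_three_way_deinterleave (const int *in, unsigned int nelt, int *out)
{
  for (unsigned int i = 0; i < nelt; i++)
    for (unsigned int k = 0; k < 3; k++)
      out[k * nelt + i] = in[3 * i + k];   /* Field k of element i.  */
}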
5775 /* Function vect_transform_grouped_load.
5777 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5778 to perform their permutation and ascribe the result vectorized statements to
5779 the scalar statements.
5783 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5784 gimple_stmt_iterator *gsi)
5787 vec<tree> result_chain = vNULL;
5789 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5790 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5791 vectors, that are ready for vector computation. */
5792 result_chain.create (size);
5794 /* If the reassociation width for the vector type is 2 or greater, the target
5795 machine can execute 2 or more vector instructions in parallel. Otherwise try
5796 to get the chain for the load group using vect_shift_permute_load_chain. */
5797 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5798 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5799 || exact_log2 (size) != -1
5800 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5801 gsi, &result_chain))
5802 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5803 vect_record_grouped_load_vectors (stmt, result_chain);
5804 result_chain.release ();
5807 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5808 generated as part of the vectorization of STMT. Assign the statement
5809 for each vector to the associated scalar statement. */
5812 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5814 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5815 gimple *next_stmt, *new_stmt;
5816 unsigned int i, gap_count;
5819 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5820 Since we scan the chain starting from its first node, their order
5821 corresponds to the order of data-refs in RESULT_CHAIN. */
5822 next_stmt = first_stmt;
5824 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5829 /* Skip the gaps. Loads created for the gaps will be removed by dead
5830 code elimination pass later. No need to check for the first stmt in
5831 the group, since it always exists.
5832 GROUP_GAP is the number of steps in elements from the previous
5833 access (if there is no gap GROUP_GAP is 1). We skip loads that
5834 correspond to the gaps. */
5835 if (next_stmt != first_stmt
5836 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5844 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5845 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5846 copies, and we put the new vector statement in the first available
5848 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5849 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5852 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5855 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5857 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5860 prev_stmt = rel_stmt;
5862 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5865 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5870 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5872 /* If NEXT_STMT accesses the same DR as the previous statement,
5873 put the same TMP_DATA_REF as its vectorized statement; otherwise
5874 get the next data-ref from RESULT_CHAIN. */
5875 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5881 /* Function vect_can_force_dr_alignment_p.
5883 Returns whether the alignment of a DECL can be forced to be aligned
5884 on ALIGNMENT bit boundary. */
5887 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5889 if (TREE_CODE (decl) != VAR_DECL)
5892 if (decl_in_symtab_p (decl)
5893 && !symtab_node::get (decl)->can_increase_alignment_p ())
5896 if (TREE_STATIC (decl))
5897 return (alignment <= MAX_OFILE_ALIGNMENT);
5899 return (alignment <= MAX_STACK_ALIGNMENT);
5903 /* Return whether the data reference DR is supported with respect to its
5905 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5906 if it is aligned, i.e., check if it is possible to vectorize it with different
enum dr_alignment_support
vect_supportable_dr_alignment (struct data_reference *dr,
                               bool check_aligned_accesses)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  machine_mode mode = TYPE_MODE (vectype);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *vect_loop = NULL;
  bool nested_in_vect_loop = false;

  if (aligned_access_p (dr) && !check_aligned_accesses)
    return dr_aligned;

  /* For now assume all conditional loads/stores support unaligned
     access without any special code.  */
  if (is_gimple_call (stmt)
      && gimple_call_internal_p (stmt)
      && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
          || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
    return dr_unaligned_supported;

  if (loop_vinfo)
    {
      vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
    }
  /* Possibly unaligned access.  */

  /* We can choose between using the implicit realignment scheme (generating
     a misaligned_move stmt) and the explicit realignment scheme (generating
     aligned loads with a REALIGN_LOAD).  There are two variants to the
     explicit realignment scheme: optimized, and unoptimized.
     We can optimize the realignment only if the step between consecutive
     vector loads is equal to the vector size.  Since the vector memory
     accesses advance in steps of VS (Vector Size) in the vectorized loop, it
     is guaranteed that the misalignment amount remains the same throughout the
     execution of the vectorized loop.  Therefore, we can create the
     "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
     at the loop preheader.

     However, in the case of outer-loop vectorization, when vectorizing a
     memory access in the inner-loop nested within the LOOP that is now being
     vectorized, while it is guaranteed that the misalignment of the
     vectorized memory access will remain the same in different outer-loop
     iterations, it is *not* guaranteed that it will remain the same throughout
     the execution of the inner-loop.  This is because the inner-loop advances
     with the original scalar step (and not in steps of VS).  If the inner-loop
     step happens to be a multiple of VS, then the misalignment remains fixed
     and we can use the optimized realignment scheme.  For example:

      for (i=0; i<N; i++)
        for (j=0; j<M; j++)
          s += a[i+j];

     When vectorizing the i-loop in the above example, the step between
     consecutive vector loads is 1, and so the misalignment does not remain
     fixed across the execution of the inner-loop, and the realignment cannot
     be optimized (as illustrated in the following pseudo vectorized loop):

      for (i=0; i<N; i+=4)
        for (j=0; j<M; j++){
          vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
                         // when j is {0,1,2,3,4,5,6,7,...} respectively.
                         // (assuming that we start from an aligned address).
          }

     We therefore have to use the unoptimized realignment scheme:

      for (i=0; i<N; i+=4)
        for (j=k; j<M; j+=4)
          vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
                         // that the misalignment of the initial address is
                         // 0).

     The loop can then be vectorized as follows:

      for (k=0; k<4; k++){
        rt = get_realignment_token (&vp[k]);
        for (i=0; i<N; i+=4){
          v1 = vp[i+k];
          for (j=k; j<M; j+=4){
            v2 = vp[i+j+VS-1];
            va = REALIGN_LOAD <v1,v2,rt>;
            vs += va;
            v1 = v2;
          }
        }
      }  */
  if (DR_IS_READ (dr))
    {
      bool is_packed = false;
      tree type = TREE_TYPE (DR_REF (dr));

      if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
          && (!targetm.vectorize.builtin_mask_for_load
              || targetm.vectorize.builtin_mask_for_load ()))
        {
          tree vectype = STMT_VINFO_VECTYPE (stmt_info);

          /* If we are doing SLP then the accesses need not have the
             same alignment, instead it depends on the SLP group size.  */
          if (loop_vinfo
              && STMT_SLP_TYPE (stmt_info)
              && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
                  * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
                  % TYPE_VECTOR_SUBPARTS (vectype) != 0))
            ;
          else if (!loop_vinfo
                   || (nested_in_vect_loop
                       && (TREE_INT_CST_LOW (DR_STEP (dr))
                           != GET_MODE_SIZE (TYPE_MODE (vectype)))))
            return dr_explicit_realign;
          else
            return dr_explicit_realign_optimized;
        }
      if (!known_alignment_for_access_p (dr))
        is_packed = not_size_aligned (DR_REF (dr));

      if ((TYPE_USER_ALIGN (type) && !is_packed)
          || targetm.vectorize.support_vector_misalignment (mode, type,
                                                            DR_MISALIGNMENT (dr),
                                                            is_packed))
        /* Can't software pipeline the loads, but can at least do them.  */
        return dr_unaligned_supported;
    }
  else
    {
      bool is_packed = false;
      tree type = TREE_TYPE (DR_REF (dr));

      if (!known_alignment_for_access_p (dr))
        is_packed = not_size_aligned (DR_REF (dr));

      if ((TYPE_USER_ALIGN (type) && !is_packed)
          || targetm.vectorize.support_vector_misalignment (mode, type,
                                                            DR_MISALIGNMENT (dr),
                                                            is_packed))
        return dr_unaligned_supported;
    }

  /* Unsupported.  */
  return dr_unaligned_unsupported;
}
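/* Hedged sketch of how a caller typically dispatches on the result of
   vect_supportable_dr_alignment (loosely modelled on the load/store
   vectorization code; the names are placeholders, not from this file):

     switch (vect_supportable_dr_alignment (dr, false))
       {
       case dr_aligned:
       case dr_unaligned_supported:
         // Emit an ordinary (possibly misaligned) vector access.
         break;
       case dr_explicit_realign:
       case dr_explicit_realign_optimized:
         // Loads only: emit aligned loads combined by REALIGN_LOAD,
         // computing the realignment token in the preheader in the
         // optimized variant.
         break;
       case dr_unaligned_unsupported:
         // The access cannot be vectorized with this misalignment.
         break;
       }  */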