gcc/tree-vect-loop-manip.c

   1 /* Vectorizer Specific Loop Manipulations
   2    Copyright (C) 2003-2019 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "tree.h"
  27 #include "gimple.h"
  28 #include "cfghooks.h"
  29 #include "tree-pass.h"
  30 #include "ssa.h"
  31 #include "fold-const.h"
  32 #include "cfganal.h"
  33 #include "gimplify.h"
  34 #include "gimple-iterator.h"
  35 #include "gimplify-me.h"
  36 #include "tree-cfg.h"
  37 #include "tree-ssa-loop-manip.h"
  38 #include "tree-into-ssa.h"
  39 #include "tree-ssa.h"
  40 #include "cfgloop.h"
  41 #include "tree-scalar-evolution.h"
  42 #include "tree-vectorizer.h"
  43 #include "tree-ssa-loop-ivopts.h"
  44 #include "gimple-fold.h"
  45 #include "tree-ssa-loop-niter.h"
  46 #include "internal-fn.h"
  47 #include "stor-layout.h"
  48 #include "optabs-query.h"
  49 #include "vec-perm-indices.h"
  50
  51 /*************************************************************************
  52   Simple Loop Peeling Utilities
  53
  54   Utilities to support loop peeling for vectorization purposes.
  55  *************************************************************************/
  56
  57
  58 /* Renames the use *OP_P.  */
  59
  60 static void
  61 rename_use_op (use_operand_p op_p)
  62 {
  63   tree new_name;
  64
  65   if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
  66     return;
  67
  68   new_name = get_current_def (USE_FROM_PTR (op_p));
  69
  70   /* Something defined outside of the loop.  */
  71   if (!new_name)
  72     return;
  73
  74   /* An ordinary ssa name defined in the loop.  */
  75
  76   SET_USE (op_p, new_name);
  77 }
  78
  79
  80 /* Renames the variables in basic block BB.  Allow renaming  of PHI arguments
  81    on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
  82    true.  */
  83
  84 static void
  85 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
  86 {
  87   gimple *stmt;
  88   use_operand_p use_p;
  89   ssa_op_iter iter;
  90   edge e;
  91   edge_iterator ei;
  92   class loop *loop = bb->loop_father;
  93   class loop *outer_loop = NULL;
  94
  95   if (rename_from_outer_loop)
  96     {
  97       gcc_assert (loop);
  98       outer_loop = loop_outer (loop);
  99     }
 100
 101   for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
 102        gsi_next (&gsi))
 103     {
 104       stmt = gsi_stmt (gsi);
 105       FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
 106         rename_use_op (use_p);
 107     }
 108
 109   FOR_EACH_EDGE (e, ei, bb->preds)
 110     {
 111       if (!flow_bb_inside_loop_p (loop, e->src))
 112         {
 113           if (!rename_from_outer_loop)
 114             continue;
 115           if (e->src != outer_loop->header)
 116             {
 117               if (outer_loop->inner->next)
 118                 {
 119                   /* If outer_loop has 2 inner loops, allow there to
 120                      be an extra basic block which decides which of the
 121                      two loops to use using LOOP_VECTORIZED.  */
 122                   if (!single_pred_p (e->src)
 123                       || single_pred (e->src) != outer_loop->header)
 124                     continue;
 125                 }
 126             }
 127         }
 128       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
 129            gsi_next (&gsi))
 130         rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
 131     }
 132 }
 133
 134
 135 struct adjust_info
 136 {
 137   tree from, to;
 138   basic_block bb;
 139 };
 140
 141 /* A stack of values to be adjusted in debug stmts.  We have to
 142    process them LIFO, so that the closest substitution applies.  If we
 143    processed them FIFO, without the stack, we might substitute uses
 144    with a PHI DEF that would soon become non-dominant, and when we got
 145    to the suitable one, it wouldn't have anything to substitute any
 146    more.  */
 147 static vec<adjust_info, va_heap> adjust_vec;
 148
 149 /* Adjust any debug stmts that referenced AI->from values to use the
 150    loop-closed AI->to, if the references are dominated by AI->bb and
 151    not by the definition of AI->from.  */
 152
 153 static void
 154 adjust_debug_stmts_now (adjust_info *ai)
 155 {
 156   basic_block bbphi = ai->bb;
 157   tree orig_def = ai->from;
 158   tree new_def = ai->to;
 159   imm_use_iterator imm_iter;
 160   gimple *stmt;
 161   basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
 162
 163   gcc_assert (dom_info_available_p (CDI_DOMINATORS));
 164
 165   /* Adjust any debug stmts that held onto non-loop-closed
 166      references.  */
 167   FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
 168     {
 169       use_operand_p use_p;
 170       basic_block bbuse;
 171
 172       if (!is_gimple_debug (stmt))
 173         continue;
 174
 175       gcc_assert (gimple_debug_bind_p (stmt));
 176
 177       bbuse = gimple_bb (stmt);
 178
 179       if ((bbuse == bbphi
 180            || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
 181           && !(bbuse == bbdef
 182                || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
 183         {
 184           if (new_def)
 185             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
 186               SET_USE (use_p, new_def);
 187           else
 188             {
 189               gimple_debug_bind_reset_value (stmt);
 190               update_stmt (stmt);
 191             }
 192         }
 193     }
 194 }
 195
 196 /* Adjust debug stmts as scheduled before.  */
 197
 198 static void
 199 adjust_vec_debug_stmts (void)
 200 {
 201   if (!MAY_HAVE_DEBUG_BIND_STMTS)
 202     return;
 203
 204   gcc_assert (adjust_vec.exists ());
 205
 206   while (!adjust_vec.is_empty ())
 207     {
 208       adjust_debug_stmts_now (&adjust_vec.last ());
 209       adjust_vec.pop ();
 210     }
 211 }
 212
 213 /* Adjust any debug stmts that referenced FROM values to use the
 214    loop-closed TO, if the references are dominated by BB and not by
 215    the definition of FROM.  If adjust_vec is non-NULL, adjustments
 216    will be postponed until adjust_vec_debug_stmts is called.  */
 217
 218 static void
 219 adjust_debug_stmts (tree from, tree to, basic_block bb)
 220 {
 221   adjust_info ai;
 222
 223   if (MAY_HAVE_DEBUG_BIND_STMTS
 224       && TREE_CODE (from) == SSA_NAME
 225       && ! SSA_NAME_IS_DEFAULT_DEF (from)
 226       && ! virtual_operand_p (from))
 227     {
 228       ai.from = from;
 229       ai.to = to;
 230       ai.bb = bb;
 231
 232       if (adjust_vec.exists ())
 233         adjust_vec.safe_push (ai);
 234       else
 235         adjust_debug_stmts_now (&ai);
 236     }
 237 }
 238
 239 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
 240    to adjust any debug stmts that referenced the old phi arg,
 241    presumably non-loop-closed references left over from other
 242    transformations.  */
 243
 244 static void
 245 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
 246 {
 247   tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
 248
 249   SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
 250
 251   if (MAY_HAVE_DEBUG_BIND_STMTS)
 252     adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
 253                         gimple_bb (update_phi));
 254 }
 255
 256 /* Define one loop mask MASK from loop LOOP.  INIT_MASK is the value that
 257    the mask should have during the first iteration and NEXT_MASK is the
 258    value that it should have on subsequent iterations.  */
 259
 260 static void
 261 vect_set_loop_mask (class loop *loop, tree mask, tree init_mask,
 262                     tree next_mask)
 263 {
 264   gphi *phi = create_phi_node (mask, loop->header);
 265   add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION);
 266   add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION);
 267 }
 268
 269 /* Add SEQ to the end of LOOP's preheader block.  */
 270
 271 static void
 272 add_preheader_seq (class loop *loop, gimple_seq seq)
 273 {
 274   if (seq)
 275     {
 276       edge pe = loop_preheader_edge (loop);
 277       basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
 278       gcc_assert (!new_bb);
 279     }
 280 }
 281
 282 /* Add SEQ to the beginning of LOOP's header block.  */
 283
 284 static void
 285 add_header_seq (class loop *loop, gimple_seq seq)
 286 {
 287   if (seq)
 288     {
 289       gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
 290       gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
 291     }
 292 }
 293
 294 /* Return true if the target can interleave elements of two vectors.
 295    OFFSET is 0 if the first half of the vectors should be interleaved
 296    or 1 if the second half should.  When returning true, store the
 297    associated permutation in INDICES.  */
 298
 299 static bool
 300 interleave_supported_p (vec_perm_indices *indices, tree vectype,
 301                         unsigned int offset)
 302 {
 303   poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
 304   poly_uint64 base = exact_div (nelts, 2) * offset;
 305   vec_perm_builder sel (nelts, 2, 3);
 306   for (unsigned int i = 0; i < 3; ++i)
 307     {
 308       sel.quick_push (base + i);
 309       sel.quick_push (base + i + nelts);
 310     }
 311   indices->new_vector (sel, 2, nelts);
 312   return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
 313 }
 314
 315 /* Try to use permutes to define the masks in DEST_RGM using the masks
 316    in SRC_RGM, given that the former has twice as many masks as the
 317    latter.  Return true on success, adding any new statements to SEQ.  */
 318
 319 static bool
 320 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
 321                                rgroup_masks *src_rgm)
 322 {
 323   tree src_masktype = src_rgm->mask_type;
 324   tree dest_masktype = dest_rgm->mask_type;
 325   machine_mode src_mode = TYPE_MODE (src_masktype);
 326   if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
 327       && optab_handler (vec_unpacku_hi_optab, src_mode) != CODE_FOR_nothing
 328       && optab_handler (vec_unpacku_lo_optab, src_mode) != CODE_FOR_nothing)
 329     {
 330       /* Unpacking the source masks gives at least as many mask bits as
 331          we need.  We can then VIEW_CONVERT any excess bits away.  */
 332       tree unpack_masktype = vect_halve_mask_nunits (src_masktype);
 333       for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
 334         {
 335           tree src = src_rgm->masks[i / 2];
 336           tree dest = dest_rgm->masks[i];
 337           tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
 338                             ? VEC_UNPACK_HI_EXPR
 339                             : VEC_UNPACK_LO_EXPR);
 340           gassign *stmt;
 341           if (dest_masktype == unpack_masktype)
 342             stmt = gimple_build_assign (dest, code, src);
 343           else
 344             {
 345               tree temp = make_ssa_name (unpack_masktype);
 346               stmt = gimple_build_assign (temp, code, src);
 347               gimple_seq_add_stmt (seq, stmt);
 348               stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
 349                                           build1 (VIEW_CONVERT_EXPR,
 350                                                   dest_masktype, temp));
 351             }
 352           gimple_seq_add_stmt (seq, stmt);
 353         }
 354       return true;
 355     }
 356   vec_perm_indices indices[2];
 357   if (dest_masktype == src_masktype
 358       && interleave_supported_p (&indices[0], src_masktype, 0)
 359       && interleave_supported_p (&indices[1], src_masktype, 1))
 360     {
 361       /* The destination requires twice as many mask bits as the source, so
 362          we can use interleaving permutes to double up the number of bits.  */
 363       tree masks[2];
 364       for (unsigned int i = 0; i < 2; ++i)
 365         masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
 366       for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
 367         {
 368           tree src = src_rgm->masks[i / 2];
 369           tree dest = dest_rgm->masks[i];
 370           gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
 371                                               src, src, masks[i & 1]);
 372           gimple_seq_add_stmt (seq, stmt);
 373         }
 374       return true;
 375     }
 376   return false;
 377 }
 378
 379 /* Helper for vect_set_loop_condition_masked.  Generate definitions for
 380    all the masks in RGM and return a mask that is nonzero when the loop
 381    needs to iterate.  Add any new preheader statements to PREHEADER_SEQ.
 382    Use LOOP_COND_GSI to insert code before the exit gcond.
 383
 384    RGM belongs to loop LOOP.  The loop originally iterated NITERS
 385    times and has been vectorized according to LOOP_VINFO.
 386
 387    If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
 388    starts with NITERS_SKIP dummy iterations of the scalar loop before
 389    the real work starts.  The mask elements for these dummy iterations
 390    must be 0, to ensure that the extra iterations do not have an effect.
 391
 392    It is known that:
 393
 394      NITERS * RGM->max_nscalars_per_iter
 395
 396    does not overflow.  However, MIGHT_WRAP_P says whether an induction
 397    variable that starts at 0 and has step:
 398
 399      VF * RGM->max_nscalars_per_iter
 400
 401    might overflow before hitting a value above:
 402
 403      (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter
 404
 405    This means that we cannot guarantee that such an induction variable
 406    would ever hit a value that produces a set of all-false masks for RGM.  */
 407
 408 static tree
 409 vect_set_loop_masks_directly (class loop *loop, loop_vec_info loop_vinfo,
 410                               gimple_seq *preheader_seq,
 411                               gimple_stmt_iterator loop_cond_gsi,
 412                               rgroup_masks *rgm, tree niters, tree niters_skip,
 413                               bool might_wrap_p)
 414 {
 415   tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
 416   tree iv_type = LOOP_VINFO_MASK_IV_TYPE (loop_vinfo);
 417   tree mask_type = rgm->mask_type;
 418   unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
 419   poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
 420   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
 421
 422   /* Calculate the maximum number of scalar values that the rgroup
 423      handles in total, the number that it handles for each iteration
 424      of the vector loop, and the number that it should skip during the
 425      first iteration of the vector loop.  */
 426   tree nscalars_total = niters;
 427   tree nscalars_step = build_int_cst (iv_type, vf);
 428   tree nscalars_skip = niters_skip;
 429   if (nscalars_per_iter != 1)
 430     {
 431       /* We checked before choosing to use a fully-masked loop that these
 432          multiplications don't overflow.  */
 433       tree compare_factor = build_int_cst (compare_type, nscalars_per_iter);
 434       tree iv_factor = build_int_cst (iv_type, nscalars_per_iter);
 435       nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
 436                                      nscalars_total, compare_factor);
 437       nscalars_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
 438                                     nscalars_step, iv_factor);
 439       if (nscalars_skip)
 440         nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
 441                                       nscalars_skip, compare_factor);
 442     }
 443
 444   /* Create an induction variable that counts the number of scalars
 445      processed.  */
 446   tree index_before_incr, index_after_incr;
 447   gimple_stmt_iterator incr_gsi;
 448   bool insert_after;
 449   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
 450   create_iv (build_int_cst (iv_type, 0), nscalars_step, NULL_TREE, loop,
 451              &incr_gsi, insert_after, &index_before_incr, &index_after_incr);
 452
 453   tree zero_index = build_int_cst (compare_type, 0);
 454   tree test_index, test_limit, first_limit;
 455   gimple_stmt_iterator *test_gsi;
 456   if (might_wrap_p)
 457     {
 458       /* In principle the loop should stop iterating once the incremented
 459          IV reaches a value greater than or equal to:
 460
 461            NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP
 462
 463          However, there's no guarantee that this addition doesn't overflow
 464          the comparison type, or that the IV hits a value above it before
 465          wrapping around.  We therefore adjust the limit down by one
 466          IV step:
 467
 468            (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
 469            -[infinite-prec] NSCALARS_STEP
 470
 471          and compare the IV against this limit _before_ incrementing it.
 472          Since the comparison type is unsigned, we actually want the
 473          subtraction to saturate at zero:
 474
 475            (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
 476            -[sat] NSCALARS_STEP
 477
 478          And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:
 479
 480            NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)
 481
 482          where the rightmost subtraction can be done directly in
 483          COMPARE_TYPE.  */
 484       test_index = index_before_incr;
 485       tree adjust = gimple_convert (preheader_seq, compare_type,
 486                                     nscalars_step);
 487       if (nscalars_skip)
 488         adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
 489                                adjust, nscalars_skip);
 490       test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
 491                                  nscalars_total, adjust);
 492       test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
 493                                  test_limit, adjust);
 494       test_gsi = &incr_gsi;
 495
 496       /* Get a safe limit for the first iteration.  */
 497       if (nscalars_skip)
 498         {
 499           /* The first vector iteration can handle at most NSCALARS_STEP
 500              scalars.  NSCALARS_STEP <= CONST_LIMIT, and adding
 501              NSCALARS_SKIP to that cannot overflow.  */
 502           tree const_limit = build_int_cst (compare_type,
 503                                             LOOP_VINFO_VECT_FACTOR (loop_vinfo)
 504                                             * nscalars_per_iter);
 505           first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
 506                                       nscalars_total, const_limit);
 507           first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
 508                                       first_limit, nscalars_skip);
 509         }
 510       else
 511         /* For the first iteration it doesn't matter whether the IV hits
 512            a value above NSCALARS_TOTAL.  That only matters for the latch
 513            condition.  */
 514         first_limit = nscalars_total;
 515     }
 516   else
 517     {
 518       /* Test the incremented IV, which will always hit a value above
 519          the bound before wrapping.  */
 520       test_index = index_after_incr;
 521       test_limit = nscalars_total;
 522       if (nscalars_skip)
 523         test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
 524                                    test_limit, nscalars_skip);
 525       test_gsi = &loop_cond_gsi;
 526
 527       first_limit = test_limit;
 528     }
 529
 530   /* Convert the IV value to the comparison type (either a no-op or
 531      a demotion).  */
 532   gimple_seq test_seq = NULL;
 533   test_index = gimple_convert (&test_seq, compare_type, test_index);
 534   gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
 535
 536   /* Provide a definition of each mask in the group.  */
 537   tree next_mask = NULL_TREE;
 538   tree mask;
 539   unsigned int i;
 540   FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
 541     {
 542       /* Previous masks will cover BIAS scalars.  This mask covers the
 543          next batch.  */
 544       poly_uint64 bias = nscalars_per_mask * i;
 545       tree bias_tree = build_int_cst (compare_type, bias);
 546       gimple *tmp_stmt;
 547
 548       /* See whether the first iteration of the vector loop is known
 549          to have a full mask.  */
 550       poly_uint64 const_limit;
 551       bool first_iteration_full
 552         = (poly_int_tree_p (first_limit, &const_limit)
 553            && known_ge (const_limit, (i + 1) * nscalars_per_mask));
 554
 555       /* Rather than have a new IV that starts at BIAS and goes up to
 556          TEST_LIMIT, prefer to use the same 0-based IV for each mask
 557          and adjust the bound down by BIAS.  */
 558       tree this_test_limit = test_limit;
 559       if (i != 0)
 560         {
 561           this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
 562                                           compare_type, this_test_limit,
 563                                           bias_tree);
 564           this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
 565                                           compare_type, this_test_limit,
 566                                           bias_tree);
 567         }
 568
 569       /* Create the initial mask.  First include all scalars that
 570          are within the loop limit.  */
 571       tree init_mask = NULL_TREE;
 572       if (!first_iteration_full)
 573         {
 574           tree start, end;
 575           if (first_limit == test_limit)
 576             {
 577               /* Use a natural test between zero (the initial IV value)
 578                  and the loop limit.  The "else" block would be valid too,
 579                  but this choice can avoid the need to load BIAS_TREE into
 580                  a register.  */
 581               start = zero_index;
 582               end = this_test_limit;
 583             }
 584           else
 585             {
 586               /* FIRST_LIMIT is the maximum number of scalars handled by the
 587                  first iteration of the vector loop.  Test the portion
 588                  associated with this mask.  */
 589               start = bias_tree;
 590               end = first_limit;
 591             }
 592
 593           init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
 594           tmp_stmt = vect_gen_while (init_mask, start, end);
 595           gimple_seq_add_stmt (preheader_seq, tmp_stmt);
 596         }
 597
 598       /* Now AND out the bits that are within the number of skipped
 599          scalars.  */
 600       poly_uint64 const_skip;
 601       if (nscalars_skip
 602           && !(poly_int_tree_p (nscalars_skip, &const_skip)
 603                && known_le (const_skip, bias)))
 604         {
 605           tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type,
 606                                                     bias_tree, nscalars_skip);
 607           if (init_mask)
 608             init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type,
 609                                       init_mask, unskipped_mask);
 610           else
 611             init_mask = unskipped_mask;
 612         }
 613
 614       if (!init_mask)
 615         /* First iteration is full.  */
 616         init_mask = build_minus_one_cst (mask_type);
 617
 618       /* Get the mask value for the next iteration of the loop.  */
 619       next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");
 620       gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
 621       gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
 622
 623       vect_set_loop_mask (loop, mask, init_mask, next_mask);
 624     }
 625   return next_mask;
 626 }
 627
 628 /* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
 629    LOOP_VINFO describes the vectorization of LOOP.  NITERS is the
 630    number of iterations of the original scalar loop that should be
 631    handled by the vector loop.  NITERS_MAYBE_ZERO and FINAL_IV are
 632    as for vect_set_loop_condition.
 633
 634    Insert the branch-back condition before LOOP_COND_GSI and return the
 635    final gcond.  */
 636
 637 static gcond *
 638 vect_set_loop_condition_masked (class loop *loop, loop_vec_info loop_vinfo,
 639                                 tree niters, tree final_iv,
 640                                 bool niters_maybe_zero,
 641                                 gimple_stmt_iterator loop_cond_gsi)
 642 {
 643   gimple_seq preheader_seq = NULL;
 644   gimple_seq header_seq = NULL;
 645
 646   tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
 647   unsigned int compare_precision = TYPE_PRECISION (compare_type);
 648   tree orig_niters = niters;
 649
 650   /* Type of the initial value of NITERS.  */
 651   tree ni_actual_type = TREE_TYPE (niters);
 652   unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
 653   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
 654
 655   /* Convert NITERS to the same size as the compare.  */
 656   if (compare_precision > ni_actual_precision
 657       && niters_maybe_zero)
 658     {
 659       /* We know that there is always at least one iteration, so if the
 660          count is zero then it must have wrapped.  Cope with this by
 661          subtracting 1 before the conversion and adding 1 to the result.  */
 662       gcc_assert (TYPE_UNSIGNED (ni_actual_type));
 663       niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
 664                              niters, build_minus_one_cst (ni_actual_type));
 665       niters = gimple_convert (&preheader_seq, compare_type, niters);
 666       niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
 667                              niters, build_one_cst (compare_type));
 668     }
 669   else
 670     niters = gimple_convert (&preheader_seq, compare_type, niters);
 671
 672   widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
 673
 674   /* Iterate over all the rgroups and fill in their masks.  We could use
 675      the first mask from any rgroup for the loop condition; here we
 676      arbitrarily pick the last.  */
 677   tree test_mask = NULL_TREE;
 678   rgroup_masks *rgm;
 679   unsigned int i;
 680   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
 681   FOR_EACH_VEC_ELT (*masks, i, rgm)
 682     if (!rgm->masks.is_empty ())
 683       {
 684         /* First try using permutes.  This adds a single vector
 685            instruction to the loop for each mask, but needs no extra
 686            loop invariants or IVs.  */
 687         unsigned int nmasks = i + 1;
 688         if ((nmasks & 1) == 0)
 689           {
 690             rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
 691             if (!half_rgm->masks.is_empty ()
 692                 && vect_maybe_permute_loop_masks (&header_seq, rgm, half_rgm))
 693               continue;
 694           }
 695
 696         /* See whether zero-based IV would ever generate all-false masks
 697            before wrapping around.  */
 698         bool might_wrap_p
 699           = (iv_limit == -1
 700              || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
 701                                     UNSIGNED)
 702                  > compare_precision));
 703
 704         /* Set up all masks for this group.  */
 705         test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
 706                                                   &preheader_seq,
 707                                                   loop_cond_gsi, rgm,
 708                                                   niters, niters_skip,
 709                                                   might_wrap_p);
 710       }
 711
 712   /* Emit all accumulated statements.  */
 713   add_preheader_seq (loop, preheader_seq);
 714   add_header_seq (loop, header_seq);
 715
 716   /* Get a boolean result that tells us whether to iterate.  */
 717   edge exit_edge = single_exit (loop);
 718   tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
 719   tree zero_mask = build_zero_cst (TREE_TYPE (test_mask));
 720   gcond *cond_stmt = gimple_build_cond (code, test_mask, zero_mask,
 721                                         NULL_TREE, NULL_TREE);
 722   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
 723
 724   /* The loop iterates (NITERS - 1) / VF + 1 times.
 725      Subtract one from this to get the latch count.  */
 726   tree step = build_int_cst (compare_type,
 727                              LOOP_VINFO_VECT_FACTOR (loop_vinfo));
 728   tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
 729                                        build_minus_one_cst (compare_type));
 730   loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
 731                                      niters_minus_one, step);
 732
 733   if (final_iv)
 734     {
 735       gassign *assign = gimple_build_assign (final_iv, orig_niters);
 736       gsi_insert_on_edge_immediate (single_exit (loop), assign);
 737     }
 738
 739   return cond_stmt;
 740 }
 741
 742 /* Like vect_set_loop_condition, but handle the case in which there
 743    are no loop masks.  */
 744
 745 static gcond *
 746 vect_set_loop_condition_unmasked (class loop *loop, tree niters,
 747                                   tree step, tree final_iv,
 748                                   bool niters_maybe_zero,
 749                                   gimple_stmt_iterator loop_cond_gsi)
 750 {
 751   tree indx_before_incr, indx_after_incr;
 752   gcond *cond_stmt;
 753   gcond *orig_cond;
 754   edge pe = loop_preheader_edge (loop);
 755   edge exit_edge = single_exit (loop);
 756   gimple_stmt_iterator incr_gsi;
 757   bool insert_after;
 758   enum tree_code code;
 759   tree niters_type = TREE_TYPE (niters);
 760
 761   orig_cond = get_loop_exit_condition (loop);
 762   gcc_assert (orig_cond);
 763   loop_cond_gsi = gsi_for_stmt (orig_cond);
 764
 765   tree init, limit;
 766   if (!niters_maybe_zero && integer_onep (step))
 767     {
 768       /* In this case we can use a simple 0-based IV:
 769
 770          A:
 771            x = 0;
 772            do
 773              {
 774                ...
 775                x += 1;
 776              }
 777            while (x < NITERS);  */
 778       code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
 779       init = build_zero_cst (niters_type);
 780       limit = niters;
 781     }
 782   else
 783     {
 784       /* The following works for all values of NITERS except 0:
 785
 786          B:
 787            x = 0;
 788            do
 789              {
 790                ...
 791                x += STEP;
 792              }
 793            while (x <= NITERS - STEP);
 794
 795          so that the loop continues to iterate if x + STEP - 1 < NITERS
 796          but stops if x + STEP - 1 >= NITERS.
 797
 798          However, if NITERS is zero, x never hits a value above NITERS - STEP
 799          before wrapping around.  There are two obvious ways of dealing with
 800          this:
 801
 802          - start at STEP - 1 and compare x before incrementing it
 803          - start at -1 and compare x after incrementing it
 804
 805          The latter is simpler and is what we use.  The loop in this case
 806          looks like:
 807
 808          C:
 809            x = -1;
 810            do
 811              {
 812                ...
 813                x += STEP;
 814              }
 815            while (x < NITERS - STEP);
 816
 817          In both cases the loop limit is NITERS - STEP.  */
 818       gimple_seq seq = NULL;
 819       limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
 820       limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
 821       if (seq)
 822         {
 823           basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
 824           gcc_assert (!new_bb);
 825         }
 826       if (niters_maybe_zero)
 827         {
 828           /* Case C.  */
 829           code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
 830           init = build_all_ones_cst (niters_type);
 831         }
 832       else
 833         {
 834           /* Case B.  */
 835           code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
 836           init = build_zero_cst (niters_type);
 837         }
 838     }
 839
 840   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
 841   create_iv (init, step, NULL_TREE, loop,
 842              &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
 843   indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
 844                                               true, NULL_TREE, true,
 845                                               GSI_SAME_STMT);
 846   limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
 847                                      true, GSI_SAME_STMT);
 848
 849   cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
 850                                  NULL_TREE);
 851
 852   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
 853
 854   /* Record the number of latch iterations.  */
 855   if (limit == niters)
 856     /* Case A: the loop iterates NITERS times.  Subtract one to get the
 857        latch count.  */
 858     loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
 859                                        build_int_cst (niters_type, 1));
 860   else
 861     /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
 862        Subtract one from this to get the latch count.  */
 863     loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
 864                                        limit, step);
 865
 866   if (final_iv)
 867     {
 868       gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
 869                                              indx_after_incr, init);
 870       gsi_insert_on_edge_immediate (single_exit (loop), assign);
 871     }
 872
 873   return cond_stmt;
 874 }
 875
 876 /* If we're using fully-masked loops, make LOOP iterate:
 877
 878       N == (NITERS - 1) / STEP + 1
 879
 880    times.  When NITERS is zero, this is equivalent to making the loop
 881    execute (1 << M) / STEP times, where M is the precision of NITERS.
 882    NITERS_MAYBE_ZERO is true if this last case might occur.
 883
 884    If we're not using fully-masked loops, make LOOP iterate:
 885
 886       N == (NITERS - STEP) / STEP + 1
 887
 888    times, where NITERS is known to be outside the range [1, STEP - 1].
 889    This is equivalent to making the loop execute NITERS / STEP times
 890    when NITERS is nonzero and (1 << M) / STEP times otherwise.
 891    NITERS_MAYBE_ZERO again indicates whether this last case might occur.
 892
 893    If FINAL_IV is nonnull, it is an SSA name that should be set to
 894    N * STEP on exit from the loop.
 895
 896    Assumption: the exit-condition of LOOP is the last stmt in the loop.  */
 897
 898 void
 899 vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
 900                          tree niters, tree step, tree final_iv,
 901                          bool niters_maybe_zero)
 902 {
 903   gcond *cond_stmt;
 904   gcond *orig_cond = get_loop_exit_condition (loop);
 905   gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
 906
 907   if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
 908     cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
 909                                                 final_iv, niters_maybe_zero,
 910                                                 loop_cond_gsi);
 911   else
 912     cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step,
 913                                                   final_iv, niters_maybe_zero,
 914                                                   loop_cond_gsi);
 915
 916   /* Remove old loop exit test.  */
 917   stmt_vec_info orig_cond_info;
 918   if (loop_vinfo
 919       && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
 920     loop_vinfo->remove_stmt (orig_cond_info);
 921   else
 922     gsi_remove (&loop_cond_gsi, true);
 923
 924   if (dump_enabled_p ())
 925     dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
 926                      cond_stmt);
 927 }
 928
 929 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
 930    For all PHI arguments in FROM->dest and TO->dest from those
 931    edges ensure that TO->dest PHI arguments have current_def
 932    to that in from.  */
 933
 934 static void
 935 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
 936 {
 937   gimple_stmt_iterator gsi_from, gsi_to;
 938
 939   for (gsi_from = gsi_start_phis (from->dest),
 940        gsi_to = gsi_start_phis (to->dest);
 941        !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
 942     {
 943       gimple *from_phi = gsi_stmt (gsi_from);
 944       gimple *to_phi = gsi_stmt (gsi_to);
 945       tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
 946       tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
 947       if (virtual_operand_p (from_arg))
 948         {
 949           gsi_next (&gsi_from);
 950           continue;
 951         }
 952       if (virtual_operand_p (to_arg))
 953         {
 954           gsi_next (&gsi_to);
 955           continue;
 956         }
 957       if (TREE_CODE (from_arg) != SSA_NAME)
 958         gcc_assert (operand_equal_p (from_arg, to_arg, 0));
 959       else if (TREE_CODE (to_arg) == SSA_NAME
 960                && from_arg != to_arg)
 961         {
 962           if (get_current_def (to_arg) == NULL_TREE)
 963             {
 964               gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
 965                                               TREE_TYPE (get_current_def
 966                                                            (from_arg))));
 967               set_current_def (to_arg, get_current_def (from_arg));
 968             }
 969         }
 970       gsi_next (&gsi_from);
 971       gsi_next (&gsi_to);
 972     }
 973
 974   gphi *from_phi = get_virtual_phi (from->dest);
 975   gphi *to_phi = get_virtual_phi (to->dest);
 976   if (from_phi)
 977     set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
 978                      get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
 979 }
 980
 981
 982 /* Given LOOP this function generates a new copy of it and puts it
 983    on E which is either the entry or exit of LOOP.  If SCALAR_LOOP is
 984    non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
 985    basic blocks from SCALAR_LOOP instead of LOOP, but to either the
 986    entry or exit of LOOP.  */
 987
 988 class loop *
 989 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
 990                                         class loop *scalar_loop, edge e)
 991 {
 992   class loop *new_loop;
 993   basic_block *new_bbs, *bbs, *pbbs;
 994   bool at_exit;
 995   bool was_imm_dom;
 996   basic_block exit_dest;
 997   edge exit, new_exit;
 998   bool duplicate_outer_loop = false;
 999
1000   exit = single_exit (loop);
1001   at_exit = (e == exit);
1002   if (!at_exit && e != loop_preheader_edge (loop))
1003     return NULL;
1004
1005   if (scalar_loop == NULL)
1006     scalar_loop = loop;
1007
1008   bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1009   pbbs = bbs + 1;
1010   get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1011   /* Allow duplication of outer loops.  */
1012   if (scalar_loop->inner)
1013     duplicate_outer_loop = true;
1014   /* Check whether duplication is possible.  */
1015   if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
1016     {
1017       free (bbs);
1018       return NULL;
1019     }
1020
1021   /* Generate new loop structure.  */
1022   new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1023   duplicate_subloops (scalar_loop, new_loop);
1024
1025   exit_dest = exit->dest;
1026   was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1027                                           exit_dest) == loop->header ?
1028                  true : false);
1029
1030   /* Also copy the pre-header, this avoids jumping through hoops to
1031      duplicate the loop entry PHI arguments.  Create an empty
1032      pre-header unconditionally for this.  */
1033   basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1034   edge entry_e = single_pred_edge (preheader);
1035   bbs[0] = preheader;
1036   new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1037
1038   exit = single_exit (scalar_loop);
1039   copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1040             &exit, 1, &new_exit, NULL,
1041             at_exit ? loop->latch : e->src, true);
1042   exit = single_exit (loop);
1043   basic_block new_preheader = new_bbs[0];
1044
1045   add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1046
1047   if (scalar_loop != loop)
1048     {
1049       /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1050          SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1051          but LOOP will not.  slpeel_update_phi_nodes_for_guard{1,2} expects
1052          the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1053          header) to have current_def set, so copy them over.  */
1054       slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1055                                                 exit);
1056       slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1057                                                            0),
1058                                                 EDGE_SUCC (loop->latch, 0));
1059     }
1060
1061   if (at_exit) /* Add the loop copy at exit.  */
1062     {
1063       if (scalar_loop != loop)
1064         {
1065           gphi_iterator gsi;
1066           new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1067
1068           for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1069                gsi_next (&gsi))
1070             {
1071               gphi *phi = gsi.phi ();
1072               tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1073               location_t orig_locus
1074                 = gimple_phi_arg_location_from_edge (phi, e);
1075
1076               add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1077             }
1078         }
1079       redirect_edge_and_branch_force (e, new_preheader);
1080       flush_pending_stmts (e);
1081       set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1082       if (was_imm_dom || duplicate_outer_loop)
1083         set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1084
1085       /* And remove the non-necessary forwarder again.  Keep the other
1086          one so we have a proper pre-header for the loop at the exit edge.  */
1087       redirect_edge_pred (single_succ_edge (preheader),
1088                           single_pred (preheader));
1089       delete_basic_block (preheader);
1090       set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1091                                loop_preheader_edge (scalar_loop)->src);
1092     }
1093   else /* Add the copy at entry.  */
1094     {
1095       if (scalar_loop != loop)
1096         {
1097           /* Remove the non-necessary forwarder of scalar_loop again.  */
1098           redirect_edge_pred (single_succ_edge (preheader),
1099                               single_pred (preheader));
1100           delete_basic_block (preheader);
1101           set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1102                                    loop_preheader_edge (scalar_loop)->src);
1103           preheader = split_edge (loop_preheader_edge (loop));
1104           entry_e = single_pred_edge (preheader);
1105         }
1106
1107       redirect_edge_and_branch_force (entry_e, new_preheader);
1108       flush_pending_stmts (entry_e);
1109       set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1110
1111       redirect_edge_and_branch_force (new_exit, preheader);
1112       flush_pending_stmts (new_exit);
1113       set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1114
1115       /* And remove the non-necessary forwarder again.  Keep the other
1116          one so we have a proper pre-header for the loop at the exit edge.  */
1117       redirect_edge_pred (single_succ_edge (new_preheader),
1118                           single_pred (new_preheader));
1119       delete_basic_block (new_preheader);
1120       set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1121                                loop_preheader_edge (new_loop)->src);
1122     }
1123
1124   /* Skip new preheader since it's deleted if copy loop is added at entry.  */
1125   for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1126     rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1127
1128   if (scalar_loop != loop)
1129     {
1130       /* Update new_loop->header PHIs, so that on the preheader
1131          edge they are the ones from loop rather than scalar_loop.  */
1132       gphi_iterator gsi_orig, gsi_new;
1133       edge orig_e = loop_preheader_edge (loop);
1134       edge new_e = loop_preheader_edge (new_loop);
1135
1136       for (gsi_orig = gsi_start_phis (loop->header),
1137            gsi_new = gsi_start_phis (new_loop->header);
1138            !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1139            gsi_next (&gsi_orig), gsi_next (&gsi_new))
1140         {
1141           gphi *orig_phi = gsi_orig.phi ();
1142           gphi *new_phi = gsi_new.phi ();
1143           tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1144           location_t orig_locus
1145             = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1146
1147           add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1148         }
1149     }
1150
1151   free (new_bbs);
1152   free (bbs);
1153
1154   checking_verify_dominators (CDI_DOMINATORS);
1155
1156   return new_loop;
1157 }
1158
1159
1160 /* Given the condition expression COND, put it as the last statement of
1161    GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1162    DOM_BB; return the skip edge.  GUARD_TO is the target basic block to
1163    skip the loop.  PROBABILITY is the skip edge's probability.  Mark the
1164    new edge as irreducible if IRREDUCIBLE_P is true.  */
1165
1166 static edge
1167 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1168                        basic_block guard_to, basic_block dom_bb,
1169                        profile_probability probability, bool irreducible_p)
1170 {
1171   gimple_stmt_iterator gsi;
1172   edge new_e, enter_e;
1173   gcond *cond_stmt;
1174   gimple_seq gimplify_stmt_list = NULL;
1175
1176   enter_e = EDGE_SUCC (guard_bb, 0);
1177   enter_e->flags &= ~EDGE_FALLTHRU;
1178   enter_e->flags |= EDGE_FALSE_VALUE;
1179   gsi = gsi_last_bb (guard_bb);
1180
1181   cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
1182                                  NULL_TREE);
1183   if (gimplify_stmt_list)
1184     gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1185
1186   cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1187   gsi = gsi_last_bb (guard_bb);
1188   gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1189
1190   /* Add new edge to connect guard block to the merge/loop-exit block.  */
1191   new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1192
1193   new_e->probability = probability;
1194   if (irreducible_p)
1195     new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1196
1197   enter_e->probability = probability.invert ();
1198   set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1199
1200   /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS.  */
1201   if (enter_e->dest->loop_father->header == enter_e->dest)
1202     split_edge (enter_e);
1203
1204   return new_e;
1205 }
1206
1207
1208 /* This function verifies that the following restrictions apply to LOOP:
1209    (1) it consists of exactly 2 basic blocks - header, and an empty latch
1210        for innermost loop and 5 basic blocks for outer-loop.
1211    (2) it is single entry, single exit
1212    (3) its exit condition is the last stmt in the header
1213    (4) E is the entry/exit edge of LOOP.
1214  */
1215
1216 bool
1217 slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
1218 {
1219   edge exit_e = single_exit (loop);
1220   edge entry_e = loop_preheader_edge (loop);
1221   gcond *orig_cond = get_loop_exit_condition (loop);
1222   gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1223   unsigned int num_bb = loop->inner? 5 : 2;
1224
1225   /* All loops have an outer scope; the only case loop->outer is NULL is for
1226      the function itself.  */
1227   if (!loop_outer (loop)
1228       || loop->num_nodes != num_bb
1229       || !empty_block_p (loop->latch)
1230       || !single_exit (loop)
1231       /* Verify that new loop exit condition can be trivially modified.  */
1232       || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1233       || (e != exit_e && e != entry_e))
1234     return false;
1235
1236   return true;
1237 }
1238
1239 /* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
1240    in the exit bb and rename all the uses after the loop.  This simplifies
1241    the *guard[12] routines, which assume loop closed SSA form for all PHIs
1242    (but normally loop closed SSA form doesn't require virtual PHIs to be
1243    in the same form).  Doing this early simplifies the checking what
1244    uses should be renamed.  */
1245
1246 static void
1247 create_lcssa_for_virtual_phi (class loop *loop)
1248 {
1249   gphi_iterator gsi;
1250   edge exit_e = single_exit (loop);
1251
1252   for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1253     if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1254       {
1255         gphi *phi = gsi.phi ();
1256         for (gsi = gsi_start_phis (exit_e->dest);
1257              !gsi_end_p (gsi); gsi_next (&gsi))
1258           if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1259             break;
1260         if (gsi_end_p (gsi))
1261           {
1262             tree new_vop = copy_ssa_name (PHI_RESULT (phi));
1263             gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
1264             tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
1265             imm_use_iterator imm_iter;
1266             gimple *stmt;
1267             use_operand_p use_p;
1268
1269             SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1270               = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
1271             add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
1272             gimple_phi_set_result (new_phi, new_vop);
1273             FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
1274               if (stmt != new_phi
1275                   && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
1276                 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1277                   SET_USE (use_p, new_vop);
1278           }
1279         break;
1280       }
1281
1282 }
1283
1284 /* Function vect_get_loop_location.
1285
1286    Extract the location of the loop in the source code.
1287    If the loop is not well formed for vectorization, an estimated
1288    location is calculated.
1289    Return the loop location if succeed and NULL if not.  */
1290
1291 dump_user_location_t
1292 find_loop_location (class loop *loop)
1293 {
1294   gimple *stmt = NULL;
1295   basic_block bb;
1296   gimple_stmt_iterator si;
1297
1298   if (!loop)
1299     return dump_user_location_t ();
1300
1301   stmt = get_loop_exit_condition (loop);
1302
1303   if (stmt
1304       && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1305     return stmt;
1306
1307   /* If we got here the loop is probably not "well formed",
1308      try to estimate the loop location */
1309
1310   if (!loop->header)
1311     return dump_user_location_t ();
1312
1313   bb = loop->header;
1314
1315   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1316     {
1317       stmt = gsi_stmt (si);
1318       if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1319         return stmt;
1320     }
1321
1322   return dump_user_location_t ();
1323 }
1324
1325 /* Return true if the phi described by STMT_INFO defines an IV of the
1326    loop to be vectorized.  */
1327
1328 static bool
1329 iv_phi_p (stmt_vec_info stmt_info)
1330 {
1331   gphi *phi = as_a <gphi *> (stmt_info->stmt);
1332   if (virtual_operand_p (PHI_RESULT (phi)))
1333     return false;
1334
1335   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1336       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1337     return false;
1338
1339   return true;
1340 }
1341
1342 /* Function vect_can_advance_ivs_p
1343
1344    In case the number of iterations that LOOP iterates is unknown at compile
1345    time, an epilog loop will be generated, and the loop induction variables
1346    (IVs) will be "advanced" to the value they are supposed to take just before
1347    the epilog loop.  Here we check that the access function of the loop IVs
1348    and the expression that represents the loop bound are simple enough.
1349    These restrictions will be relaxed in the future.  */
1350
1351 bool
1352 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1353 {
1354   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1355   basic_block bb = loop->header;
1356   gphi_iterator gsi;
1357
1358   /* Analyze phi functions of the loop header.  */
1359
1360   if (dump_enabled_p ())
1361     dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1362   for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1363     {
1364       tree evolution_part;
1365
1366       gphi *phi = gsi.phi ();
1367       stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1368       if (dump_enabled_p ())
1369         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
1370                          phi_info->stmt);
1371
1372       /* Skip virtual phi's. The data dependences that are associated with
1373          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1374
1375          Skip reduction phis.  */
1376       if (!iv_phi_p (phi_info))
1377         {
1378           if (dump_enabled_p ())
1379             dump_printf_loc (MSG_NOTE, vect_location,
1380                              "reduc or virtual phi. skip.\n");
1381           continue;
1382         }
1383
1384       /* Analyze the evolution function.  */
1385
1386       evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1387       if (evolution_part == NULL_TREE)
1388         {
1389           if (dump_enabled_p ())
1390             dump_printf (MSG_MISSED_OPTIMIZATION,
1391                          "No access function or evolution.\n");
1392           return false;
1393         }
1394
1395       /* FORNOW: We do not transform initial conditions of IVs
1396          which evolution functions are not invariants in the loop.  */
1397
1398       if (!expr_invariant_in_loop_p (loop, evolution_part))
1399         {
1400           if (dump_enabled_p ())
1401             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1402                              "evolution not invariant in loop.\n");
1403           return false;
1404         }
1405
1406       /* FORNOW: We do not transform initial conditions of IVs
1407          which evolution functions are a polynomial of degree >= 2.  */
1408
1409       if (tree_is_chrec (evolution_part))
1410         {
1411           if (dump_enabled_p ())
1412             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1413                              "evolution is chrec.\n");
1414           return false;
1415         }
1416     }
1417
1418   return true;
1419 }
1420
1421
1422 /*   Function vect_update_ivs_after_vectorizer.
1423
1424      "Advance" the induction variables of LOOP to the value they should take
1425      after the execution of LOOP.  This is currently necessary because the
1426      vectorizer does not handle induction variables that are used after the
1427      loop.  Such a situation occurs when the last iterations of LOOP are
1428      peeled, because:
1429      1. We introduced new uses after LOOP for IVs that were not originally used
1430         after LOOP: the IVs of LOOP are now used by an epilog loop.
1431      2. LOOP is going to be vectorized; this means that it will iterate N/VF
1432         times, whereas the loop IVs should be bumped N times.
1433
1434      Input:
1435      - LOOP - a loop that is going to be vectorized. The last few iterations
1436               of LOOP were peeled.
1437      - NITERS - the number of iterations that LOOP executes (before it is
1438                 vectorized). i.e, the number of times the ivs should be bumped.
1439      - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1440                   coming out from LOOP on which there are uses of the LOOP ivs
1441                   (this is the path from LOOP->exit to epilog_loop->preheader).
1442
1443                   The new definitions of the ivs are placed in LOOP->exit.
1444                   The phi args associated with the edge UPDATE_E in the bb
1445                   UPDATE_E->dest are updated accordingly.
1446
1447      Assumption 1: Like the rest of the vectorizer, this function assumes
1448      a single loop exit that has a single predecessor.
1449
1450      Assumption 2: The phi nodes in the LOOP header and in update_bb are
1451      organized in the same order.
1452
1453      Assumption 3: The access function of the ivs is simple enough (see
1454      vect_can_advance_ivs_p).  This assumption will be relaxed in the future.
1455
1456      Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1457      coming out of LOOP on which the ivs of LOOP are used (this is the path
1458      that leads to the epilog loop; other paths skip the epilog loop).  This
1459      path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1460      needs to have its phis updated.
1461  */
1462
1463 static void
1464 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1465                                   tree niters, edge update_e)
1466 {
1467   gphi_iterator gsi, gsi1;
1468   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1469   basic_block update_bb = update_e->dest;
1470   basic_block exit_bb = single_exit (loop)->dest;
1471
1472   /* Make sure there exists a single-predecessor exit bb:  */
1473   gcc_assert (single_pred_p (exit_bb));
1474   gcc_assert (single_succ_edge (exit_bb) == update_e);
1475
1476   for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1477        !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1478        gsi_next (&gsi), gsi_next (&gsi1))
1479     {
1480       tree init_expr;
1481       tree step_expr, off;
1482       tree type;
1483       tree var, ni, ni_name;
1484       gimple_stmt_iterator last_gsi;
1485
1486       gphi *phi = gsi.phi ();
1487       gphi *phi1 = gsi1.phi ();
1488       stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1489       if (dump_enabled_p ())
1490         dump_printf_loc (MSG_NOTE, vect_location,
1491                          "vect_update_ivs_after_vectorizer: phi: %G", phi);
1492
1493       /* Skip reduction and virtual phis.  */
1494       if (!iv_phi_p (phi_info))
1495         {
1496           if (dump_enabled_p ())
1497             dump_printf_loc (MSG_NOTE, vect_location,
1498                              "reduc or virtual phi. skip.\n");
1499           continue;
1500         }
1501
1502       type = TREE_TYPE (gimple_phi_result (phi));
1503       step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1504       step_expr = unshare_expr (step_expr);
1505
1506       /* FORNOW: We do not support IVs whose evolution function is a polynomial
1507          of degree >= 2 or exponential.  */
1508       gcc_assert (!tree_is_chrec (step_expr));
1509
1510       init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1511
1512       off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1513                          fold_convert (TREE_TYPE (step_expr), niters),
1514                          step_expr);
1515       if (POINTER_TYPE_P (type))
1516         ni = fold_build_pointer_plus (init_expr, off);
1517       else
1518         ni = fold_build2 (PLUS_EXPR, type,
1519                           init_expr, fold_convert (type, off));
1520
1521       var = create_tmp_var (type, "tmp");
1522
1523       last_gsi = gsi_last_bb (exit_bb);
1524       gimple_seq new_stmts = NULL;
1525       ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1526       /* Exit_bb shouldn't be empty.  */
1527       if (!gsi_end_p (last_gsi))
1528         gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1529       else
1530         gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1531
1532       /* Fix phi expressions in the successor bb.  */
1533       adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1534     }
1535 }
1536
1537 /* Return a gimple value containing the misalignment (measured in vector
1538    elements) for the loop described by LOOP_VINFO, i.e. how many elements
1539    it is away from a perfectly aligned address.  Add any new statements
1540    to SEQ.  */
1541
1542 static tree
1543 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1544 {
1545   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1546   stmt_vec_info stmt_info = dr_info->stmt;
1547   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1548
1549   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1550   unsigned HOST_WIDE_INT target_align_c;
1551   tree target_align_minus_1;
1552
1553   bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1554                                         size_zero_node) < 0;
1555   tree offset = (negative
1556                  ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1557                  : size_zero_node);
1558   tree start_addr = vect_create_addr_base_for_vector_ref (stmt_info, seq,
1559                                                           offset);
1560   tree type = unsigned_type_for (TREE_TYPE (start_addr));
1561   if (target_align.is_constant (&target_align_c))
1562     target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1563   else
1564     {
1565       tree vla = build_int_cst (type, target_align);
1566       tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1567                                     fold_build2 (MINUS_EXPR, type,
1568                                                  build_int_cst (type, 0), vla));
1569       target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1570                                           build_int_cst (type, 1));
1571     }
1572
1573   HOST_WIDE_INT elem_size
1574     = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1575   tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1576
1577   /* Create:  misalign_in_bytes = addr & (target_align - 1).  */
1578   tree int_start_addr = fold_convert (type, start_addr);
1579   tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1580                                         target_align_minus_1);
1581
1582   /* Create:  misalign_in_elems = misalign_in_bytes / element_size.  */
1583   tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1584                                         elem_size_log);
1585
1586   return misalign_in_elems;
1587 }
1588
1589 /* Function vect_gen_prolog_loop_niters
1590
1591    Generate the number of iterations which should be peeled as prolog for the
1592    loop represented by LOOP_VINFO.  It is calculated as the misalignment of
1593    DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1594    As a result, after the execution of this loop, the data reference DR will
1595    refer to an aligned location.  The following computation is generated:
1596
1597    If the misalignment of DR is known at compile time:
1598      addr_mis = int mis = DR_MISALIGNMENT (dr);
1599    Else, compute address misalignment in bytes:
1600      addr_mis = addr & (target_align - 1)
1601
1602    prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1603
1604    (elem_size = element type size; an element is the scalar element whose type
1605    is the inner type of the vectype)
1606
1607    The computations will be emitted at the end of BB.  We also compute and
1608    store upper bound (included) of the result in BOUND.
1609
1610    When the step of the data-ref in the loop is not 1 (as in interleaved data
1611    and SLP), the number of iterations of the prolog must be divided by the step
1612    (which is equal to the size of interleaved group).
1613
1614    The above formulas assume that VF == number of elements in the vector. This
1615    may not hold when there are multiple-types in the loop.
1616    In this case, for some data-references in the loop the VF does not represent
1617    the number of elements that fit in the vector.  Therefore, instead of VF we
1618    use TYPE_VECTOR_SUBPARTS.  */
1619
1620 static tree
1621 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1622                              basic_block bb, int *bound)
1623 {
1624   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1625   tree var;
1626   tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1627   gimple_seq stmts = NULL, new_stmts = NULL;
1628   tree iters, iters_name;
1629   stmt_vec_info stmt_info = dr_info->stmt;
1630   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1631   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1632
1633   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1634     {
1635       int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1636
1637       if (dump_enabled_p ())
1638         dump_printf_loc (MSG_NOTE, vect_location,
1639                          "known peeling = %d.\n", npeel);
1640
1641       iters = build_int_cst (niters_type, npeel);
1642       *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1643     }
1644   else
1645     {
1646       tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1647       tree type = TREE_TYPE (misalign_in_elems);
1648       HOST_WIDE_INT elem_size
1649         = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1650       /* We only do prolog peeling if the target alignment is known at compile
1651          time.  */
1652       poly_uint64 align_in_elems =
1653         exact_div (target_align, elem_size);
1654       tree align_in_elems_minus_1 =
1655         build_int_cst (type, align_in_elems - 1);
1656       tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1657
1658       /* Create:  (niters_type) ((align_in_elems - misalign_in_elems)
1659                                  & (align_in_elems - 1)).  */
1660       bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1661                                             size_zero_node) < 0;
1662       if (negative)
1663         iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1664                              align_in_elems_tree);
1665       else
1666         iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1667                              misalign_in_elems);
1668       iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1669       iters = fold_convert (niters_type, iters);
1670       unsigned HOST_WIDE_INT align_in_elems_c;
1671       if (align_in_elems.is_constant (&align_in_elems_c))
1672         *bound = align_in_elems_c - 1;
1673       else
1674         *bound = -1;
1675     }
1676
1677   if (dump_enabled_p ())
1678     dump_printf_loc (MSG_NOTE, vect_location,
1679                      "niters for prolog loop: %T\n", iters);
1680
1681   var = create_tmp_var (niters_type, "prolog_loop_niters");
1682   iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1683
1684   if (new_stmts)
1685     gimple_seq_add_seq (&stmts, new_stmts);
1686   if (stmts)
1687     {
1688       gcc_assert (single_succ_p (bb));
1689       gimple_stmt_iterator gsi = gsi_last_bb (bb);
1690       if (gsi_end_p (gsi))
1691         gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1692       else
1693         gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1694     }
1695   return iters_name;
1696 }
1697
1698
1699 /* Function vect_update_init_of_dr
1700
1701    If CODE is PLUS, the vector loop starts NITERS iterations after the
1702    scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1703    iterations before the scalar one (using masking to skip inactive
1704    elements).  This function updates the information recorded in DR to
1705    account for the difference.  Specifically, it updates the OFFSET
1706    field of DR.  */
1707
1708 static void
1709 vect_update_init_of_dr (struct data_reference *dr, tree niters, tree_code code)
1710 {
1711   tree offset = DR_OFFSET (dr);
1712
1713   niters = fold_build2 (MULT_EXPR, sizetype,
1714                         fold_convert (sizetype, niters),
1715                         fold_convert (sizetype, DR_STEP (dr)));
1716   offset = fold_build2 (code, sizetype,
1717                         fold_convert (sizetype, offset), niters);
1718   DR_OFFSET (dr) = offset;
1719 }
1720
1721
1722 /* Function vect_update_inits_of_drs
1723
1724    Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO.
1725    CODE and NITERS are as for vect_update_inits_of_dr.  */
1726
1727 static void
1728 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1729                           tree_code code)
1730 {
1731   unsigned int i;
1732   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1733   struct data_reference *dr;
1734
1735   DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
1736
1737   /* Adjust niters to sizetype and insert stmts on loop preheader edge.  */
1738   if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1739     {
1740       gimple_seq seq;
1741       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1742       tree var = create_tmp_var (sizetype, "prolog_loop_adjusted_niters");
1743
1744       niters = fold_convert (sizetype, niters);
1745       niters = force_gimple_operand (niters, &seq, false, var);
1746       if (seq)
1747         {
1748           basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
1749           gcc_assert (!new_bb);
1750         }
1751     }
1752
1753   FOR_EACH_VEC_ELT (datarefs, i, dr)
1754     {
1755       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1756       if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt))
1757         vect_update_init_of_dr (dr, niters, code);
1758     }
1759 }
1760
1761 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1762    by masking.  This involves calculating the number of iterations to
1763    be peeled and then aligning all memory references appropriately.  */
1764
1765 void
1766 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1767 {
1768   tree misalign_in_elems;
1769   tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
1770
1771   gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1772
1773   /* From the information recorded in LOOP_VINFO get the number of iterations
1774      that need to be skipped via masking.  */
1775   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1776     {
1777       poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1778                              - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1779       misalign_in_elems = build_int_cst (type, misalign);
1780     }
1781   else
1782     {
1783       gimple_seq seq1 = NULL, seq2 = NULL;
1784       misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1785       misalign_in_elems = fold_convert (type, misalign_in_elems);
1786       misalign_in_elems = force_gimple_operand (misalign_in_elems,
1787                                                 &seq2, true, NULL_TREE);
1788       gimple_seq_add_seq (&seq1, seq2);
1789       if (seq1)
1790         {
1791           edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1792           basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1793           gcc_assert (!new_bb);
1794         }
1795     }
1796
1797   if (dump_enabled_p ())
1798     dump_printf_loc (MSG_NOTE, vect_location,
1799                      "misalignment for fully-masked loop: %T\n",
1800                      misalign_in_elems);
1801
1802   LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1803
1804   vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1805 }
1806
1807 /* This function builds ni_name = number of iterations.  Statements
1808    are emitted on the loop preheader edge.  If NEW_VAR_P is not NULL, set
1809    it to TRUE if new ssa_var is generated.  */
1810
1811 tree
1812 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1813 {
1814   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1815   if (TREE_CODE (ni) == INTEGER_CST)
1816     return ni;
1817   else
1818     {
1819       tree ni_name, var;
1820       gimple_seq stmts = NULL;
1821       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1822
1823       var = create_tmp_var (TREE_TYPE (ni), "niters");
1824       ni_name = force_gimple_operand (ni, &stmts, false, var);
1825       if (stmts)
1826         {
1827           gsi_insert_seq_on_edge_immediate (pe, stmts);
1828           if (new_var_p != NULL)
1829             *new_var_p = true;
1830         }
1831
1832       return ni_name;
1833     }
1834 }
1835
1836 /* Calculate the number of iterations above which vectorized loop will be
1837    preferred than scalar loop.  NITERS_PROLOG is the number of iterations
1838    of prolog loop.  If it's integer const, the integer number is also passed
1839    in INT_NITERS_PROLOG.  BOUND_PROLOG is the upper bound (inclusive) of the
1840    number of iterations of the prolog loop.  BOUND_EPILOG is the corresponding
1841    value for the epilog loop.  If CHECK_PROFITABILITY is true, TH is the
1842    threshold below which the scalar (rather than vectorized) loop will be
1843    executed.  This function stores the upper bound (inclusive) of the result
1844    in BOUND_SCALAR.  */
1845
1846 static tree
1847 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1848                              int bound_prolog, poly_int64 bound_epilog, int th,
1849                              poly_uint64 *bound_scalar,
1850                              bool check_profitability)
1851 {
1852   tree type = TREE_TYPE (niters_prolog);
1853   tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1854                              build_int_cst (type, bound_epilog));
1855
1856   *bound_scalar = bound_prolog + bound_epilog;
1857   if (check_profitability)
1858     {
1859       /* TH indicates the minimum niters of vectorized loop, while we
1860          compute the maximum niters of scalar loop.  */
1861       th--;
1862       /* Peeling for constant times.  */
1863       if (int_niters_prolog >= 0)
1864         {
1865           *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1866           return build_int_cst (type, *bound_scalar);
1867         }
1868       /* Peeling an unknown number of times.  Note that both BOUND_PROLOG
1869          and BOUND_EPILOG are inclusive upper bounds.  */
1870       if (known_ge (th, bound_prolog + bound_epilog))
1871         {
1872           *bound_scalar = th;
1873           return build_int_cst (type, th);
1874         }
1875       /* Need to do runtime comparison.  */
1876       else if (maybe_gt (th, bound_epilog))
1877         {
1878           *bound_scalar = upper_bound (*bound_scalar, th);
1879           return fold_build2 (MAX_EXPR, type,
1880                               build_int_cst (type, th), niters);
1881         }
1882     }
1883   return niters;
1884 }
1885
1886 /* NITERS is the number of times that the original scalar loop executes
1887    after peeling.  Work out the maximum number of iterations N that can
1888    be handled by the vectorized form of the loop and then either:
1889
1890    a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1891
1892         niters_vector = N
1893
1894    b) set *STEP_VECTOR_PTR to one and generate:
1895
1896         niters_vector = N / vf
1897
1898    In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1899    any new statements on the loop preheader edge.  NITERS_NO_OVERFLOW
1900    is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero).  */
1901
1902 void
1903 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1904                              tree *niters_vector_ptr, tree *step_vector_ptr,
1905                              bool niters_no_overflow)
1906 {
1907   tree ni_minus_gap, var;
1908   tree niters_vector, step_vector, type = TREE_TYPE (niters);
1909   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1910   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1911   tree log_vf = NULL_TREE;
1912
1913   /* If epilogue loop is required because of data accesses with gaps, we
1914      subtract one iteration from the total number of iterations here for
1915      correct calculation of RATIO.  */
1916   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1917     {
1918       ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1919                                   build_one_cst (type));
1920       if (!is_gimple_val (ni_minus_gap))
1921         {
1922           var = create_tmp_var (type, "ni_gap");
1923           gimple *stmts = NULL;
1924           ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1925                                                true, var);
1926           gsi_insert_seq_on_edge_immediate (pe, stmts);
1927         }
1928     }
1929   else
1930     ni_minus_gap = niters;
1931
1932   unsigned HOST_WIDE_INT const_vf;
1933   if (vf.is_constant (&const_vf)
1934       && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1935     {
1936       /* Create: niters >> log2(vf) */
1937       /* If it's known that niters == number of latch executions + 1 doesn't
1938          overflow, we can generate niters >> log2(vf); otherwise we generate
1939          (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
1940          will be at least one.  */
1941       log_vf = build_int_cst (type, exact_log2 (const_vf));
1942       if (niters_no_overflow)
1943         niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
1944       else
1945         niters_vector
1946           = fold_build2 (PLUS_EXPR, type,
1947                          fold_build2 (RSHIFT_EXPR, type,
1948                                       fold_build2 (MINUS_EXPR, type,
1949                                                    ni_minus_gap,
1950                                                    build_int_cst (type, vf)),
1951                                       log_vf),
1952                          build_int_cst (type, 1));
1953       step_vector = build_one_cst (type);
1954     }
1955   else
1956     {
1957       niters_vector = ni_minus_gap;
1958       step_vector = build_int_cst (type, vf);
1959     }
1960
1961   if (!is_gimple_val (niters_vector))
1962     {
1963       var = create_tmp_var (type, "bnd");
1964       gimple_seq stmts = NULL;
1965       niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
1966       gsi_insert_seq_on_edge_immediate (pe, stmts);
1967       /* Peeling algorithm guarantees that vector loop bound is at least ONE,
1968          we set range information to make niters analyzer's life easier.  */
1969       if (stmts != NULL && log_vf)
1970         set_range_info (niters_vector, VR_RANGE,
1971                         wi::to_wide (build_int_cst (type, 1)),
1972                         wi::to_wide (fold_build2 (RSHIFT_EXPR, type,
1973                                                   TYPE_MAX_VALUE (type),
1974                                                   log_vf)));
1975     }
1976   *niters_vector_ptr = niters_vector;
1977   *step_vector_ptr = step_vector;
1978
1979   return;
1980 }
1981
1982 /* Given NITERS_VECTOR which is the number of iterations for vectorized
1983    loop specified by LOOP_VINFO after vectorization, compute the number
1984    of iterations before vectorization (niters_vector * vf) and store it
1985    to NITERS_VECTOR_MULT_VF_PTR.  */
1986
1987 static void
1988 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
1989                                      tree niters_vector,
1990                                      tree *niters_vector_mult_vf_ptr)
1991 {
1992   /* We should be using a step_vector of VF if VF is variable.  */
1993   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
1994   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1995   tree type = TREE_TYPE (niters_vector);
1996   tree log_vf = build_int_cst (type, exact_log2 (vf));
1997   basic_block exit_bb = single_exit (loop)->dest;
1998
1999   gcc_assert (niters_vector_mult_vf_ptr != NULL);
2000   tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2001                                             niters_vector, log_vf);
2002   if (!is_gimple_val (niters_vector_mult_vf))
2003     {
2004       tree var = create_tmp_var (type, "niters_vector_mult_vf");
2005       gimple_seq stmts = NULL;
2006       niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2007                                                     &stmts, true, var);
2008       gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2009       gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2010     }
2011   *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2012 }
2013
2014 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplciates FIRST/SECOND
2015    from SECOND/FIRST and puts it at the original loop's preheader/exit
2016    edge, the two loops are arranged as below:
2017
2018        preheader_a:
2019      first_loop:
2020        header_a:
2021          i_1 = PHI<i_0, i_2>;
2022          ...
2023          i_2 = i_1 + 1;
2024          if (cond_a)
2025            goto latch_a;
2026          else
2027            goto between_bb;
2028        latch_a:
2029          goto header_a;
2030
2031        between_bb:
2032          ;; i_x = PHI<i_2>;   ;; LCSSA phi node to be created for FIRST,
2033
2034      second_loop:
2035        header_b:
2036          i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2037                                  or with i_2 if no LCSSA phi is created
2038                                  under condition of CREATE_LCSSA_FOR_IV_PHIS.
2039          ...
2040          i_4 = i_3 + 1;
2041          if (cond_b)
2042            goto latch_b;
2043          else
2044            goto exit_bb;
2045        latch_b:
2046          goto header_b;
2047
2048        exit_bb:
2049
2050    This function creates loop closed SSA for the first loop; update the
2051    second loop's PHI nodes by replacing argument on incoming edge with the
2052    result of newly created lcssa PHI nodes.  IF CREATE_LCSSA_FOR_IV_PHIS
2053    is false, Loop closed ssa phis will only be created for non-iv phis for
2054    the first loop.
2055
2056    This function assumes exit bb of the first loop is preheader bb of the
2057    second loop, i.e, between_bb in the example code.  With PHIs updated,
2058    the second loop will execute rest iterations of the first.  */
2059
2060 static void
2061 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2062                                    class loop *first, class loop *second,
2063                                    bool create_lcssa_for_iv_phis)
2064 {
2065   gphi_iterator gsi_update, gsi_orig;
2066   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2067
2068   edge first_latch_e = EDGE_SUCC (first->latch, 0);
2069   edge second_preheader_e = loop_preheader_edge (second);
2070   basic_block between_bb = single_exit (first)->dest;
2071
2072   gcc_assert (between_bb == second_preheader_e->src);
2073   gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2074   /* Either the first loop or the second is the loop to be vectorized.  */
2075   gcc_assert (loop == first || loop == second);
2076
2077   for (gsi_orig = gsi_start_phis (first->header),
2078        gsi_update = gsi_start_phis (second->header);
2079        !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2080        gsi_next (&gsi_orig), gsi_next (&gsi_update))
2081     {
2082       gphi *orig_phi = gsi_orig.phi ();
2083       gphi *update_phi = gsi_update.phi ();
2084
2085       tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2086       /* Generate lcssa PHI node for the first loop.  */
2087       gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2088       stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2089       if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2090         {
2091           tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2092           gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2093           add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2094           arg = new_res;
2095         }
2096
2097       /* Update PHI node in the second loop by replacing arg on the loop's
2098          incoming edge.  */
2099       adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2100     }
2101 }
2102
2103 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2104    of SKIP_LOOP to the beginning of UPDATE_LOOP.  GUARD_EDGE and MERGE_EDGE
2105    are two pred edges of the merge point before UPDATE_LOOP.  The two loops
2106    appear like below:
2107
2108        guard_bb:
2109          if (cond)
2110            goto merge_bb;
2111          else
2112            goto skip_loop;
2113
2114      skip_loop:
2115        header_a:
2116          i_1 = PHI<i_0, i_2>;
2117          ...
2118          i_2 = i_1 + 1;
2119          if (cond_a)
2120            goto latch_a;
2121          else
2122            goto exit_a;
2123        latch_a:
2124          goto header_a;
2125
2126        exit_a:
2127          i_5 = PHI<i_2>;
2128
2129        merge_bb:
2130          ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2131
2132      update_loop:
2133        header_b:
2134          i_3 = PHI<i_5, i_4>;  ;; Use of i_5 to be replaced with i_x.
2135          ...
2136          i_4 = i_3 + 1;
2137          if (cond_b)
2138            goto latch_b;
2139          else
2140            goto exit_bb;
2141        latch_b:
2142          goto header_b;
2143
2144        exit_bb:
2145
2146    This function creates PHI nodes at merge_bb and replaces the use of i_5
2147    in the update_loop's PHI node with the result of new PHI result.  */
2148
2149 static void
2150 slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2151                                     class loop *update_loop,
2152                                     edge guard_edge, edge merge_edge)
2153 {
2154   location_t merge_loc, guard_loc;
2155   edge orig_e = loop_preheader_edge (skip_loop);
2156   edge update_e = loop_preheader_edge (update_loop);
2157   gphi_iterator gsi_orig, gsi_update;
2158
2159   for ((gsi_orig = gsi_start_phis (skip_loop->header),
2160         gsi_update = gsi_start_phis (update_loop->header));
2161        !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2162        gsi_next (&gsi_orig), gsi_next (&gsi_update))
2163     {
2164       gphi *orig_phi = gsi_orig.phi ();
2165       gphi *update_phi = gsi_update.phi ();
2166
2167       /* Generate new phi node at merge bb of the guard.  */
2168       tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2169       gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2170
2171       /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE.  Set the
2172          args in NEW_PHI for these edges.  */
2173       tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2174       tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2175       merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2176       guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2177       add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2178       add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2179
2180       /* Update phi in UPDATE_PHI.  */
2181       adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2182     }
2183 }
2184
2185 /* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP,
2186    this function searches for the corresponding lcssa phi node in exit
2187    bb of LOOP.  If it is found, return the phi result; otherwise return
2188    NULL.  */
2189
2190 static tree
2191 find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
2192                 gphi *lcssa_phi)
2193 {
2194   gphi_iterator gsi;
2195   edge e = single_exit (loop);
2196
2197   gcc_assert (single_pred_p (e->dest));
2198   for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2199     {
2200       gphi *phi = gsi.phi ();
2201       if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2202                            PHI_ARG_DEF (lcssa_phi, 0), 0))
2203         return PHI_RESULT (phi);
2204     }
2205   return NULL_TREE;
2206 }
2207
2208 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2209    from LOOP.  Function slpeel_add_loop_guard adds guard skipping from a
2210    point between the two loops to the end of EPILOG.  Edges GUARD_EDGE
2211    and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2212    The CFG looks like:
2213
2214      loop:
2215        header_a:
2216          i_1 = PHI<i_0, i_2>;
2217          ...
2218          i_2 = i_1 + 1;
2219          if (cond_a)
2220            goto latch_a;
2221          else
2222            goto exit_a;
2223        latch_a:
2224          goto header_a;
2225
2226        exit_a:
2227
2228        guard_bb:
2229          if (cond)
2230            goto merge_bb;
2231          else
2232            goto epilog_loop;
2233
2234        ;; fall_through_bb
2235
2236      epilog_loop:
2237        header_b:
2238          i_3 = PHI<i_2, i_4>;
2239          ...
2240          i_4 = i_3 + 1;
2241          if (cond_b)
2242            goto latch_b;
2243          else
2244            goto merge_bb;
2245        latch_b:
2246          goto header_b;
2247
2248        merge_bb:
2249          ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2250
2251        exit_bb:
2252          i_x = PHI<i_4>;  ;Use of i_4 to be replaced with i_y in merge_bb.
2253
2254    For each name used out side EPILOG (i.e - for each name that has a lcssa
2255    phi in exit_bb) we create a new PHI in merge_bb.  The new PHI has two
2256    args corresponding to GUARD_EDGE and MERGE_EDGE.  Arg for MERGE_EDGE is
2257    the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2258    by LOOP and is found in the exit bb of LOOP.  Arg of the original PHI
2259    in exit_bb will also be updated.  */
2260
2261 static void
2262 slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
2263                                     edge guard_edge, edge merge_edge)
2264 {
2265   gphi_iterator gsi;
2266   basic_block merge_bb = guard_edge->dest;
2267
2268   gcc_assert (single_succ_p (merge_bb));
2269   edge e = single_succ_edge (merge_bb);
2270   basic_block exit_bb = e->dest;
2271   gcc_assert (single_pred_p (exit_bb));
2272   gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2273
2274   for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2275     {
2276       gphi *update_phi = gsi.phi ();
2277       tree old_arg = PHI_ARG_DEF (update_phi, 0);
2278       /* This loop-closed-phi actually doesn't represent a use out of the
2279          loop - the phi arg is a constant.  */
2280       if (TREE_CODE (old_arg) != SSA_NAME)
2281         continue;
2282
2283       tree merge_arg = get_current_def (old_arg);
2284       if (!merge_arg)
2285         merge_arg = old_arg;
2286
2287       tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2288       /* If the var is live after loop but not a reduction, we simply
2289          use the old arg.  */
2290       if (!guard_arg)
2291         guard_arg = old_arg;
2292
2293       /* Create new phi node in MERGE_BB:  */
2294       tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2295       gphi *merge_phi = create_phi_node (new_res, merge_bb);
2296
2297       /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE, Set
2298          the two PHI args in merge_phi for these edges.  */
2299       add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2300       add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2301
2302       /* Update the original phi in exit_bb.  */
2303       adjust_phi_and_debug_stmts (update_phi, e, new_res);
2304     }
2305 }
2306
2307 /* EPILOG loop is duplicated from the original loop for vectorizing,
2308    the arg of its loop closed ssa PHI needs to be updated.  */
2309
2310 static void
2311 slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
2312 {
2313   gphi_iterator gsi;
2314   basic_block exit_bb = single_exit (epilog)->dest;
2315
2316   gcc_assert (single_pred_p (exit_bb));
2317   edge e = EDGE_PRED (exit_bb, 0);
2318   for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2319     rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2320 }
2321
2322 /* Function vect_do_peeling.
2323
2324    Input:
2325    - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2326
2327        preheader:
2328      LOOP:
2329        header_bb:
2330          loop_body
2331          if (exit_loop_cond) goto exit_bb
2332          else                goto header_bb
2333        exit_bb:
2334
2335    - NITERS: The number of iterations of the loop.
2336    - NITERSM1: The number of iterations of the loop's latch.
2337    - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2338    - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2339                               CHECK_PROFITABILITY is true.
2340    Output:
2341    - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2342      iterate after vectorization; see vect_set_loop_condition for details.
2343    - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2344      should be set to the number of scalar iterations handled by the
2345      vector loop.  The SSA name is only used on exit from the loop.
2346
2347    This function peels prolog and epilog from the loop, adds guards skipping
2348    PROLOG and EPILOG for various conditions.  As a result, the changed CFG
2349    would look like:
2350
2351        guard_bb_1:
2352          if (prefer_scalar_loop) goto merge_bb_1
2353          else                    goto guard_bb_2
2354
2355        guard_bb_2:
2356          if (skip_prolog) goto merge_bb_2
2357          else             goto prolog_preheader
2358
2359        prolog_preheader:
2360      PROLOG:
2361        prolog_header_bb:
2362          prolog_body
2363          if (exit_prolog_cond) goto prolog_exit_bb
2364          else                  goto prolog_header_bb
2365        prolog_exit_bb:
2366
2367        merge_bb_2:
2368
2369        vector_preheader:
2370      VECTOR LOOP:
2371        vector_header_bb:
2372          vector_body
2373          if (exit_vector_cond) goto vector_exit_bb
2374          else                  goto vector_header_bb
2375        vector_exit_bb:
2376
2377        guard_bb_3:
2378          if (skip_epilog) goto merge_bb_3
2379          else             goto epilog_preheader
2380
2381        merge_bb_1:
2382
2383        epilog_preheader:
2384      EPILOG:
2385        epilog_header_bb:
2386          epilog_body
2387          if (exit_epilog_cond) goto merge_bb_3
2388          else                  goto epilog_header_bb
2389
2390        merge_bb_3:
2391
2392    Note this function peels prolog and epilog only if it's necessary,
2393    as well as guards.
2394    Returns created epilogue or NULL.
2395
2396    TODO: Guard for prefer_scalar_loop should be emitted along with
2397    versioning conditions if loop versioning is needed.  */
2398
2399
2400 class loop *
2401 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2402                  tree *niters_vector, tree *step_vector,
2403                  tree *niters_vector_mult_vf_var, int th,
2404                  bool check_profitability, bool niters_no_overflow)
2405 {
2406   edge e, guard_e;
2407   tree type = TREE_TYPE (niters), guard_cond;
2408   basic_block guard_bb, guard_to;
2409   profile_probability prob_prolog, prob_vector, prob_epilog;
2410   int estimated_vf;
2411   int prolog_peeling = 0;
2412   /* We currently do not support prolog peeling if the target alignment is not
2413      known at compile time.  'vect_gen_prolog_loop_niters' depends on the
2414      target alignment being constant.  */
2415   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2416   if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2417     return NULL;
2418
2419   if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2420     prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2421
2422   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2423   poly_uint64 bound_epilog = 0;
2424   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2425       && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2426     bound_epilog += vf - 1;
2427   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2428     bound_epilog += 1;
2429   bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2430   poly_uint64 bound_scalar = bound_epilog;
2431
2432   if (!prolog_peeling && !epilog_peeling)
2433     return NULL;
2434
2435   prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2436   estimated_vf = vect_vf_for_cost (loop_vinfo);
2437   if (estimated_vf == 2)
2438     estimated_vf = 3;
2439   prob_prolog = prob_epilog = profile_probability::guessed_always ()
2440                         .apply_scale (estimated_vf - 1, estimated_vf);
2441
2442   class loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo);
2443   class loop *first_loop = loop;
2444   bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2445   create_lcssa_for_virtual_phi (loop);
2446   update_ssa (TODO_update_ssa_only_virtuals);
2447
2448   if (MAY_HAVE_DEBUG_BIND_STMTS)
2449     {
2450       gcc_assert (!adjust_vec.exists ());
2451       adjust_vec.create (32);
2452     }
2453   initialize_original_copy_tables ();
2454
2455   /* Record the anchor bb at which the guard should be placed if the scalar
2456      loop might be preferred.  */
2457   basic_block anchor = loop_preheader_edge (loop)->src;
2458
2459   /* Generate the number of iterations for the prolog loop.  We do this here
2460      so that we can also get the upper bound on the number of iterations.  */
2461   tree niters_prolog;
2462   int bound_prolog = 0;
2463   if (prolog_peeling)
2464     niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2465                                                  &bound_prolog);
2466   else
2467     niters_prolog = build_int_cst (type, 0);
2468
2469   /* Prolog loop may be skipped.  */
2470   bool skip_prolog = (prolog_peeling != 0);
2471   /* Skip to epilog if scalar loop may be preferred.  It's only needed
2472      when we peel for epilog loop and when it hasn't been checked with
2473      loop versioning.  */
2474   bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2475                       ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2476                                   bound_prolog + bound_epilog)
2477                       : !LOOP_REQUIRES_VERSIONING (loop_vinfo));
2478   /* Epilog loop must be executed if the number of iterations for epilog
2479      loop is known at compile time, otherwise we need to add a check at
2480      the end of vector loop and skip to the end of epilog loop.  */
2481   bool skip_epilog = (prolog_peeling < 0
2482                       || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2483                       || !vf.is_constant ());
2484   /* PEELING_FOR_GAPS is special because epilog loop must be executed.  */
2485   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2486     skip_epilog = false;
2487
2488   if (skip_vector)
2489     {
2490       split_edge (loop_preheader_edge (loop));
2491
2492       /* Due to the order in which we peel prolog and epilog, we first
2493          propagate probability to the whole loop.  The purpose is to
2494          avoid adjusting probabilities of both prolog and vector loops
2495          separately.  Note in this case, the probability of epilog loop
2496          needs to be scaled back later.  */
2497       basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2498       if (prob_vector.initialized_p ())
2499         {
2500           scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2501           scale_loop_profile (loop, prob_vector, 0);
2502         }
2503     }
2504
2505   dump_user_location_t loop_loc = find_loop_location (loop);
2506   class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2507   if (prolog_peeling)
2508     {
2509       e = loop_preheader_edge (loop);
2510       if (!slpeel_can_duplicate_loop_p (loop, e))
2511         {
2512           dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2513                            "loop can't be duplicated to preheader edge.\n");
2514           gcc_unreachable ();
2515         }
2516       /* Peel prolog and put it on preheader edge of loop.  */
2517       prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2518       if (!prolog)
2519         {
2520           dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2521                            "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2522           gcc_unreachable ();
2523         }
2524       prolog->force_vectorize = false;
2525       slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2526       first_loop = prolog;
2527       reset_original_copy_tables ();
2528
2529       /* Update the number of iterations for prolog loop.  */
2530       tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2531       vect_set_loop_condition (prolog, NULL, niters_prolog,
2532                                step_prolog, NULL_TREE, false);
2533
2534       /* Skip the prolog loop.  */
2535       if (skip_prolog)
2536         {
2537           guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2538                                     niters_prolog, build_int_cst (type, 0));
2539           guard_bb = loop_preheader_edge (prolog)->src;
2540           basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2541           guard_to = split_edge (loop_preheader_edge (loop));
2542           guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2543                                            guard_to, guard_bb,
2544                                            prob_prolog.invert (),
2545                                            irred_flag);
2546           e = EDGE_PRED (guard_to, 0);
2547           e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2548           slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2549
2550           scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2551           scale_loop_profile (prolog, prob_prolog, bound_prolog);
2552         }
2553       /* Update init address of DRs.  */
2554       vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2555       /* Update niters for vector loop.  */
2556       LOOP_VINFO_NITERS (loop_vinfo)
2557         = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2558       LOOP_VINFO_NITERSM1 (loop_vinfo)
2559         = fold_build2 (MINUS_EXPR, type,
2560                        LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2561       bool new_var_p = false;
2562       niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2563       /* It's guaranteed that vector loop bound before vectorization is at
2564          least VF, so set range information for newly generated var.  */
2565       if (new_var_p)
2566         set_range_info (niters, VR_RANGE,
2567                         wi::to_wide (build_int_cst (type, vf)),
2568                         wi::to_wide (TYPE_MAX_VALUE (type)));
2569
2570       /* Prolog iterates at most bound_prolog times, latch iterates at
2571          most bound_prolog - 1 times.  */
2572       record_niter_bound (prolog, bound_prolog - 1, false, true);
2573       delete_update_ssa ();
2574       adjust_vec_debug_stmts ();
2575       scev_reset ();
2576     }
2577
2578   if (epilog_peeling)
2579     {
2580       e = single_exit (loop);
2581       if (!slpeel_can_duplicate_loop_p (loop, e))
2582         {
2583           dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2584                            "loop can't be duplicated to exit edge.\n");
2585           gcc_unreachable ();
2586         }
2587       /* Peel epilog and put it on exit edge of loop.  */
2588       epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2589       if (!epilog)
2590         {
2591           dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2592                            "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2593           gcc_unreachable ();
2594         }
2595       epilog->force_vectorize = false;
2596       slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2597
2598       /* Scalar version loop may be preferred.  In this case, add guard
2599          and skip to epilog.  Note this only happens when the number of
2600          iterations of loop is unknown at compile time, otherwise this
2601          won't be vectorized.  */
2602       if (skip_vector)
2603         {
2604           /* Additional epilogue iteration is peeled if gap exists.  */
2605           tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2606                                                 bound_prolog, bound_epilog,
2607                                                 th, &bound_scalar,
2608                                                 check_profitability);
2609           /* Build guard against NITERSM1 since NITERS may overflow.  */
2610           guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2611           guard_bb = anchor;
2612           guard_to = split_edge (loop_preheader_edge (epilog));
2613           guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2614                                            guard_to, guard_bb,
2615                                            prob_vector.invert (),
2616                                            irred_flag);
2617           e = EDGE_PRED (guard_to, 0);
2618           e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2619           slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2620
2621           /* Simply propagate profile info from guard_bb to guard_to which is
2622              a merge point of control flow.  */
2623           guard_to->count = guard_bb->count;
2624
2625           /* Scale probability of epilog loop back.
2626              FIXME: We should avoid scaling down and back up.  Profile may
2627              get lost if we scale down to 0.  */
2628           basic_block *bbs = get_loop_body (epilog);
2629           for (unsigned int i = 0; i < epilog->num_nodes; i++)
2630             bbs[i]->count = bbs[i]->count.apply_scale
2631                                  (bbs[i]->count,
2632                                   bbs[i]->count.apply_probability
2633                                     (prob_vector));
2634           free (bbs);
2635         }
2636
2637       basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2638       tree niters_vector_mult_vf;
2639       /* If loop is peeled for non-zero constant times, now niters refers to
2640          orig_niters - prolog_peeling, it won't overflow even the orig_niters
2641          overflows.  */
2642       niters_no_overflow |= (prolog_peeling > 0);
2643       vect_gen_vector_loop_niters (loop_vinfo, niters,
2644                                    niters_vector, step_vector,
2645                                    niters_no_overflow);
2646       if (!integer_onep (*step_vector))
2647         {
2648           /* On exit from the loop we will have an easy way of calcalating
2649              NITERS_VECTOR / STEP * STEP.  Install a dummy definition
2650              until then.  */
2651           niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2652           SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2653           *niters_vector_mult_vf_var = niters_vector_mult_vf;
2654         }
2655       else
2656         vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2657                                              &niters_vector_mult_vf);
2658       /* Update IVs of original loop as if they were advanced by
2659          niters_vector_mult_vf steps.  */
2660       gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2661       edge update_e = skip_vector ? e : loop_preheader_edge (epilog);
2662       vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2663                                         update_e);
2664
2665       if (skip_epilog)
2666         {
2667           guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2668                                     niters, niters_vector_mult_vf);
2669           guard_bb = single_exit (loop)->dest;
2670           guard_to = split_edge (single_exit (epilog));
2671           guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
2672                                            skip_vector ? anchor : guard_bb,
2673                                            prob_epilog.invert (),
2674                                            irred_flag);
2675           slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
2676                                               single_exit (epilog));
2677           /* Only need to handle basic block before epilog loop if it's not
2678              the guard_bb, which is the case when skip_vector is true.  */
2679           if (guard_bb != bb_before_epilog)
2680             {
2681               prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
2682
2683               scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
2684             }
2685           scale_loop_profile (epilog, prob_epilog, 0);
2686         }
2687       else
2688         slpeel_update_phi_nodes_for_lcssa (epilog);
2689
2690       unsigned HOST_WIDE_INT bound;
2691       if (bound_scalar.is_constant (&bound))
2692         {
2693           gcc_assert (bound != 0);
2694           /* -1 to convert loop iterations to latch iterations.  */
2695           record_niter_bound (epilog, bound - 1, false, true);
2696         }
2697
2698       delete_update_ssa ();
2699       adjust_vec_debug_stmts ();
2700       scev_reset ();
2701     }
2702   adjust_vec.release ();
2703   free_original_copy_tables ();
2704
2705   return epilog;
2706 }
2707
2708 /* Function vect_create_cond_for_niters_checks.
2709
2710    Create a conditional expression that represents the run-time checks for
2711    loop's niter.  The loop is guaranteed to terminate if the run-time
2712    checks hold.
2713
2714    Input:
2715    COND_EXPR  - input conditional expression.  New conditions will be chained
2716                 with logical AND operation.  If it is NULL, then the function
2717                 is used to return the number of alias checks.
2718    LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
2719                 to be checked.
2720
2721    Output:
2722    COND_EXPR - conditional expression.
2723
2724    The returned COND_EXPR is the conditional expression to be used in the
2725    if statement that controls which version of the loop gets executed at
2726    runtime.  */
2727
2728 static void
2729 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
2730 {
2731   tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
2732
2733   if (*cond_expr)
2734     *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2735                               *cond_expr, part_cond_expr);
2736   else
2737     *cond_expr = part_cond_expr;
2738 }
2739
2740 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2741    and PART_COND_EXPR are true.  Treat a null *COND_EXPR as "true".  */
2742
2743 static void
2744 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
2745 {
2746   if (*cond_expr)
2747     *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2748                               *cond_expr, part_cond_expr);
2749   else
2750     *cond_expr = part_cond_expr;
2751 }
2752
2753 /* Function vect_create_cond_for_align_checks.
2754
2755    Create a conditional expression that represents the alignment checks for
2756    all of data references (array element references) whose alignment must be
2757    checked at runtime.
2758
2759    Input:
2760    COND_EXPR  - input conditional expression.  New conditions will be chained
2761                 with logical AND operation.
2762    LOOP_VINFO - two fields of the loop information are used.
2763                 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
2764                 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
2765
2766    Output:
2767    COND_EXPR_STMT_LIST - statements needed to construct the conditional
2768                          expression.
2769    The returned value is the conditional expression to be used in the if
2770    statement that controls which version of the loop gets executed at runtime.
2771
2772    The algorithm makes two assumptions:
2773      1) The number of bytes "n" in a vector is a power of 2.
2774      2) An address "a" is aligned if a%n is zero and that this
2775         test can be done as a&(n-1) == 0.  For example, for 16
2776         byte vectors the test is a&0xf == 0.  */
2777
2778 static void
2779 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
2780                                    tree *cond_expr,
2781                                    gimple_seq *cond_expr_stmt_list)
2782 {
2783   vec<stmt_vec_info> may_misalign_stmts
2784     = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2785   stmt_vec_info stmt_info;
2786   int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
2787   tree mask_cst;
2788   unsigned int i;
2789   tree int_ptrsize_type;
2790   char tmp_name[20];
2791   tree or_tmp_name = NULL_TREE;
2792   tree and_tmp_name;
2793   gimple *and_stmt;
2794   tree ptrsize_zero;
2795   tree part_cond_expr;
2796
2797   /* Check that mask is one less than a power of 2, i.e., mask is
2798      all zeros followed by all ones.  */
2799   gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
2800
2801   int_ptrsize_type = signed_type_for (ptr_type_node);
2802
2803   /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
2804      of the first vector of the i'th data reference. */
2805
2806   FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2807     {
2808       gimple_seq new_stmt_list = NULL;
2809       tree addr_base;
2810       tree addr_tmp_name;
2811       tree new_or_tmp_name;
2812       gimple *addr_stmt, *or_stmt;
2813       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2814       bool negative = tree_int_cst_compare
2815         (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
2816       tree offset = negative
2817         ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
2818
2819       /* create: addr_tmp = (int)(address_of_first_vector) */
2820       addr_base =
2821         vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
2822                                               offset);
2823       if (new_stmt_list != NULL)
2824         gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
2825
2826       sprintf (tmp_name, "addr2int%d", i);
2827       addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
2828       addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
2829       gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
2830
2831       /* The addresses are OR together.  */
2832
2833       if (or_tmp_name != NULL_TREE)
2834         {
2835           /* create: or_tmp = or_tmp | addr_tmp */
2836           sprintf (tmp_name, "orptrs%d", i);
2837           new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
2838           or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
2839                                          or_tmp_name, addr_tmp_name);
2840           gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
2841           or_tmp_name = new_or_tmp_name;
2842         }
2843       else
2844         or_tmp_name = addr_tmp_name;
2845
2846     } /* end for i */
2847
2848   mask_cst = build_int_cst (int_ptrsize_type, mask);
2849
2850   /* create: and_tmp = or_tmp & mask  */
2851   and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
2852
2853   and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
2854                                   or_tmp_name, mask_cst);
2855   gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
2856
2857   /* Make and_tmp the left operand of the conditional test against zero.
2858      if and_tmp has a nonzero bit then some address is unaligned.  */
2859   ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
2860   part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
2861                                 and_tmp_name, ptrsize_zero);
2862   chain_cond_expr (cond_expr, part_cond_expr);
2863 }
2864
2865 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
2866    create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
2867    Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2868    and this new condition are true.  Treat a null *COND_EXPR as "true".  */
2869
2870 static void
2871 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
2872 {
2873   vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
2874   unsigned int i;
2875   vec_object_pair *pair;
2876   FOR_EACH_VEC_ELT (pairs, i, pair)
2877     {
2878       tree addr1 = build_fold_addr_expr (pair->first);
2879       tree addr2 = build_fold_addr_expr (pair->second);
2880       tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
2881                                          addr1, addr2);
2882       chain_cond_expr (cond_expr, part_cond_expr);
2883     }
2884 }
2885
2886 /* Create an expression that is true when all lower-bound conditions for
2887    the vectorized loop are met.  Chain this condition with *COND_EXPR.  */
2888
2889 static void
2890 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
2891 {
2892   vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
2893   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
2894     {
2895       tree expr = lower_bounds[i].expr;
2896       tree type = unsigned_type_for (TREE_TYPE (expr));
2897       expr = fold_convert (type, expr);
2898       poly_uint64 bound = lower_bounds[i].min_value;
2899       if (!lower_bounds[i].unsigned_p)
2900         {
2901           expr = fold_build2 (PLUS_EXPR, type, expr,
2902                               build_int_cstu (type, bound - 1));
2903           bound += bound - 1;
2904         }
2905       tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
2906                                          build_int_cstu (type, bound));
2907       chain_cond_expr (cond_expr, part_cond_expr);
2908     }
2909 }
2910
2911 /* Function vect_create_cond_for_alias_checks.
2912
2913    Create a conditional expression that represents the run-time checks for
2914    overlapping of address ranges represented by a list of data references
2915    relations passed as input.
2916
2917    Input:
2918    COND_EXPR  - input conditional expression.  New conditions will be chained
2919                 with logical AND operation.  If it is NULL, then the function
2920                 is used to return the number of alias checks.
2921    LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
2922                 to be checked.
2923
2924    Output:
2925    COND_EXPR - conditional expression.
2926
2927    The returned COND_EXPR is the conditional expression to be used in the if
2928    statement that controls which version of the loop gets executed at runtime.
2929 */
2930
2931 void
2932 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
2933 {
2934   vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
2935     LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2936
2937   if (comp_alias_ddrs.is_empty ())
2938     return;
2939
2940   create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
2941                                &comp_alias_ddrs, cond_expr);
2942   if (dump_enabled_p ())
2943     dump_printf_loc (MSG_NOTE, vect_location,
2944                      "created %u versioning for alias checks.\n",
2945                      comp_alias_ddrs.length ());
2946 }
2947
2948
2949 /* Function vect_loop_versioning.
2950
2951    If the loop has data references that may or may not be aligned or/and
2952    has data reference relations whose independence was not proven then
2953    two versions of the loop need to be generated, one which is vectorized
2954    and one which isn't.  A test is then generated to control which of the
2955    loops is executed.  The test checks for the alignment of all of the
2956    data references that may or may not be aligned.  An additional
2957    sequence of runtime tests is generated for each pairs of DDRs whose
2958    independence was not proven.  The vectorized version of loop is
2959    executed only if both alias and alignment tests are passed.
2960
2961    The test generated to check which version of loop is executed
2962    is modified to also check for profitability as indicated by the
2963    cost model threshold TH.
2964
2965    The versioning precondition(s) are placed in *COND_EXPR and
2966    *COND_EXPR_STMT_LIST.  */
2967
2968 class loop *
2969 vect_loop_versioning (loop_vec_info loop_vinfo,
2970                       unsigned int th, bool check_profitability,
2971                       poly_uint64 versioning_threshold)
2972 {
2973   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
2974   class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2975   basic_block condition_bb;
2976   gphi_iterator gsi;
2977   gimple_stmt_iterator cond_exp_gsi;
2978   basic_block merge_bb;
2979   basic_block new_exit_bb;
2980   edge new_exit_e, e;
2981   gphi *orig_phi, *new_phi;
2982   tree cond_expr = NULL_TREE;
2983   gimple_seq cond_expr_stmt_list = NULL;
2984   tree arg;
2985   profile_probability prob = profile_probability::likely ();
2986   gimple_seq gimplify_stmt_list = NULL;
2987   tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2988   bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
2989   bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
2990   bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
2991   tree version_simd_if_cond
2992     = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
2993
2994   if (check_profitability)
2995     cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
2996                              build_int_cst (TREE_TYPE (scalar_loop_iters),
2997                                             th - 1));
2998   if (maybe_ne (versioning_threshold, 0U))
2999     {
3000       tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3001                                build_int_cst (TREE_TYPE (scalar_loop_iters),
3002                                               versioning_threshold - 1));
3003       if (cond_expr)
3004         cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3005                                  expr, cond_expr);
3006       else
3007         cond_expr = expr;
3008     }
3009
3010   if (version_niter)
3011     vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3012
3013   if (cond_expr)
3014     cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3015                                         &cond_expr_stmt_list,
3016                                         is_gimple_condexpr, NULL_TREE);
3017
3018   if (version_align)
3019     vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3020                                        &cond_expr_stmt_list);
3021
3022   if (version_alias)
3023     {
3024       vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3025       vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3026       vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3027     }
3028
3029   if (version_simd_if_cond)
3030     {
3031       gcc_assert (dom_info_available_p (CDI_DOMINATORS));
3032       if (flag_checking)
3033         if (basic_block bb
3034             = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
3035           gcc_assert (bb != loop->header
3036                       && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
3037                       && (scalar_loop == NULL
3038                           || (bb != scalar_loop->header
3039                               && dominated_by_p (CDI_DOMINATORS,
3040                                                  scalar_loop->header, bb))));
3041       tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
3042       tree c = fold_build2 (NE_EXPR, boolean_type_node,
3043                             version_simd_if_cond, zero);
3044       if (cond_expr)
3045         cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3046                                  c, cond_expr);
3047       else
3048         cond_expr = c;
3049       if (dump_enabled_p ())
3050         dump_printf_loc (MSG_NOTE, vect_location,
3051                          "created versioning for simd if condition check.\n");
3052     }
3053
3054   cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3055                                       &gimplify_stmt_list,
3056                                       is_gimple_condexpr, NULL_TREE);
3057   gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3058
3059   /* Compute the outermost loop cond_expr and cond_expr_stmt_list are
3060      invariant in.  */
3061   class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
3062   for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
3063        !gsi_end_p (gsi); gsi_next (&gsi))
3064     {
3065       gimple *stmt = gsi_stmt (gsi);
3066       update_stmt (stmt);
3067       ssa_op_iter iter;
3068       use_operand_p use_p;
3069       basic_block def_bb;
3070       FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
3071         if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
3072             && flow_bb_inside_loop_p (outermost, def_bb))
3073           outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
3074     }
3075
3076   /* Search for the outermost loop we can version.  Avoid versioning of
3077      non-perfect nests but allow if-conversion versioned loops inside.  */
3078   class loop *loop_to_version = loop;
3079   if (flow_loop_nested_p (outermost, loop))
3080     {
3081       if (dump_enabled_p ())
3082         dump_printf_loc (MSG_NOTE, vect_location,
3083                          "trying to apply versioning to outer loop %d\n",
3084                          outermost->num);
3085       if (outermost->num == 0)
3086         outermost = superloop_at_depth (loop, 1);
3087       /* And avoid applying versioning on non-perfect nests.  */
3088       while (loop_to_version != outermost
3089              && single_exit (loop_outer (loop_to_version))
3090              && (!loop_outer (loop_to_version)->inner->next
3091                  || vect_loop_vectorized_call (loop_to_version))
3092              && (!loop_outer (loop_to_version)->inner->next
3093                  || !loop_outer (loop_to_version)->inner->next->next))
3094         loop_to_version = loop_outer (loop_to_version);
3095     }
3096
3097   /* Apply versioning.  If there is already a scalar version created by
3098      if-conversion re-use that.  Note we cannot re-use the copy of
3099      an if-converted outer-loop when vectorizing the inner loop only.  */
3100   gcond *cond;
3101   gimple *call;
3102   if ((!loop_to_version->inner || loop == loop_to_version)
3103       && (call = vect_loop_vectorized_call (loop_to_version, &cond)))
3104     {
3105       gcc_assert (scalar_loop);
3106       condition_bb = gimple_bb (cond);
3107       gimple_cond_set_condition_from_tree (cond, cond_expr);
3108       update_stmt (cond);
3109
3110       if (cond_expr_stmt_list)
3111         {
3112           cond_exp_gsi = gsi_for_stmt (call);
3113           gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3114                                  GSI_SAME_STMT);
3115         }
3116
3117       /* if-conversion uses profile_probability::always () for both paths,
3118          reset the paths probabilities appropriately.  */
3119       edge te, fe;
3120       extract_true_false_edges_from_block (condition_bb, &te, &fe);
3121       te->probability = prob;
3122       fe->probability = prob.invert ();
3123       /* We can scale loops counts immediately but have to postpone
3124          scaling the scalar loop because we re-use it during peeling.  */
3125       scale_loop_frequencies (loop_to_version, te->probability);
3126       LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;
3127
3128       nloop = scalar_loop;
3129       if (dump_enabled_p ())
3130         dump_printf_loc (MSG_NOTE, vect_location,
3131                          "reusing %sloop version created by if conversion\n",
3132                          loop_to_version != loop ? "outer " : "");
3133     }
3134   else
3135     {
3136       if (loop_to_version != loop
3137           && dump_enabled_p ())
3138         dump_printf_loc (MSG_NOTE, vect_location,
3139                          "applying loop versioning to outer loop %d\n",
3140                          loop_to_version->num);
3141
3142       initialize_original_copy_tables ();
3143       nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
3144                             prob, prob.invert (), prob, prob.invert (), true);
3145       gcc_assert (nloop);
3146       nloop = get_loop_copy (loop);
3147
3148       /* Kill off IFN_LOOP_VECTORIZED_CALL in the copy, nobody will
3149          reap those otherwise;  they also refer to the original
3150          loops.  */
3151       class loop *l = loop;
3152       while (gimple *call = vect_loop_vectorized_call (l))
3153         {
3154           call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
3155           fold_loop_internal_call (call, boolean_false_node);
3156           l = loop_outer (l);
3157         }
3158       free_original_copy_tables ();
3159
3160       if (cond_expr_stmt_list)
3161         {
3162           cond_exp_gsi = gsi_last_bb (condition_bb);
3163           gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3164                                  GSI_SAME_STMT);
3165         }
3166
3167       /* Loop versioning violates an assumption we try to maintain during
3168          vectorization - that the loop exit block has a single predecessor.
3169          After versioning, the exit block of both loop versions is the same
3170          basic block (i.e. it has two predecessors). Just in order to simplify
3171          following transformations in the vectorizer, we fix this situation
3172          here by adding a new (empty) block on the exit-edge of the loop,
3173          with the proper loop-exit phis to maintain loop-closed-form.
3174          If loop versioning wasn't done from loop, but scalar_loop instead,
3175          merge_bb will have already just a single successor.  */
3176
3177       merge_bb = single_exit (loop_to_version)->dest;
3178       if (EDGE_COUNT (merge_bb->preds) >= 2)
3179         {
3180           gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3181           new_exit_bb = split_edge (single_exit (loop_to_version));
3182           new_exit_e = single_exit (loop_to_version);
3183           e = EDGE_SUCC (new_exit_bb, 0);
3184
3185           for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
3186                gsi_next (&gsi))
3187             {
3188               tree new_res;
3189               orig_phi = gsi.phi ();
3190               new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3191               new_phi = create_phi_node (new_res, new_exit_bb);
3192               arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3193               add_phi_arg (new_phi, arg, new_exit_e,
3194                            gimple_phi_arg_location_from_edge (orig_phi, e));
3195               adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3196             }
3197         }
3198
3199       update_ssa (TODO_update_ssa);
3200     }
3201
3202   if (version_niter)
3203     {
3204       /* The versioned loop could be infinite, we need to clear existing
3205          niter information which is copied from the original loop.  */
3206       gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3207       vect_free_loop_info_assumptions (nloop);
3208       /* And set constraint LOOP_C_INFINITE for niter analyzer.  */
3209       loop_constraint_set (loop, LOOP_C_INFINITE);
3210     }
3211
3212   if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
3213       && dump_enabled_p ())
3214     {
3215       if (version_alias)
3216         dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3217                          vect_location,
3218                          "loop versioned for vectorization because of "
3219                          "possible aliasing\n");
3220       if (version_align)
3221         dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3222                          vect_location,
3223                          "loop versioned for vectorization to enhance "
3224                          "alignment\n");
3225
3226     }
3227
3228   return nloop;
3229 }