gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2018 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "cfghooks.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "diagnostic-core.h"
  35 #include "fold-const.h"
  36 #include "stor-layout.h"
  37 #include "cfganal.h"
  38 #include "gimplify.h"
  39 #include "gimple-iterator.h"
  40 #include "gimplify-me.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "tree-ssa-loop.h"
  45 #include "cfgloop.h"
  46 #include "params.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "tree-vectorizer.h"
  49 #include "gimple-fold.h"
  50 #include "cgraph.h"
  51 #include "tree-cfg.h"
  52 #include "tree-if-conv.h"
  53 #include "internal-fn.h"
  54 #include "tree-vector-builder.h"
  55 #include "vec-perm-indices.h"
  56
  57 /* Loop Vectorization Pass.
  58
  59    This pass tries to vectorize loops.
  60
  61    For example, the vectorizer transforms the following simple loop:
  62
  63         short a[N]; short b[N]; short c[N]; int i;
  64
  65         for (i=0; i<N; i++){
  66           a[i] = b[i] + c[i];
  67         }
  68
  69    as if it was manually vectorized by rewriting the source code into:
  70
  71         typedef int __attribute__((mode(V8HI))) v8hi;
  72         short a[N];  short b[N]; short c[N];   int i;
  73         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  74         v8hi va, vb, vc;
  75
  76         for (i=0; i<N/8; i++){
  77           vb = pb[i];
  78           vc = pc[i];
  79           va = vb + vc;
  80           pa[i] = va;
  81         }
  82
  83         The main entry to this pass is vectorize_loops(), in which
  84    the vectorizer applies a set of analyses on a given set of loops,
  85    followed by the actual vectorization transformation for the loops that
  86    had successfully passed the analysis phase.
  87         Throughout this pass we make a distinction between two types of
  88    data: scalars (which are represented by SSA_NAMES), and memory references
  89    ("data-refs").  These two types of data require different handling both
  90    during analysis and transformation.  The types of data-refs that the
  91    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
  92    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  93    accesses are required to have a simple (consecutive) access pattern.
  94
  95    Analysis phase:
  96    ===============
  97         The driver for the analysis phase is vect_analyze_loop().
  98    It applies a set of analyses, some of which rely on the scalar evolution
  99    analyzer (scev) developed by Sebastian Pop.
 100
 101         During the analysis phase the vectorizer records some information
 102    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 103    loop, as well as general information about the loop as a whole, which is
 104    recorded in a "loop_vec_info" struct attached to each loop.
 105
 106    Transformation phase:
 107    =====================
 108         The loop transformation phase scans all the stmts in the loop, and
 109    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 110    the loop that needs to be vectorized.  It inserts the vector code sequence
 111    just before the scalar stmt S, and records a pointer to the vector code
 112    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 113    attached to S).  This pointer will be used for the vectorization of following
 114    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 115    otherwise, we rely on dead code elimination for removing it.
 116
 117         For example, say stmt S1 was vectorized into stmt VS1:
 118
 119    VS1: vb = px[i];
 120    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 121    S2:  a = b;
 122
 123    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 124    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 125    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 126    resulting sequence would be:
 127
 128    VS1: vb = px[i];
 129    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 130    VS2: va = vb;
 131    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 132
 133         Operands that are not SSA_NAMEs, are data-refs that appear in
 134    load/store operations (like 'x[i]' in S1), and are handled differently.
 135
 136    Target modeling:
 137    =================
 138         Currently the only target specific information that is used is the
 139    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 140    Targets that can support different sizes of vectors, for now will need
 141    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 142    flexibility will be added in the future.
 143
 144         Since we only vectorize operations whose vector form can be
 145    expressed using existing tree codes, to verify that an operation is
 146    supported, the vectorizer checks the relevant optab at the relevant
 147    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
 148    the value found is CODE_FOR_nothing, then there's no target support, and
 149    we can't vectorize the stmt.
 150
 151    For additional information on this project see:
 152    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 153 */
 154
 155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 156
 157 /* Function vect_determine_vectorization_factor
 158
 159    Determine the vectorization factor (VF).  VF is the number of data elements
 160    that are operated upon in parallel in a single iteration of the vectorized
 161    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 162    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 163    elements can fit in a single vector register.
 164
 165    We currently support vectorization of loops in which all types operated upon
 166    are of the same size.  Therefore this function currently sets VF according to
 167    the size of the types operated upon, and fails if there are multiple sizes
 168    in the loop.
 169
 170    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 171    original loop:
 172         for (i=0; i<N; i++){
 173           a[i] = b[i] + c[i];
 174         }
 175
 176    vectorized loop:
 177         for (i=0; i<N; i+=VF){
 178           a[i:VF] = b[i:VF] + c[i:VF];
 179         }

        On success the function returns true after storing the factor in
        LOOP_VINFO_VECT_FACTOR (LOOP_VINFO), recording vector types in
        STMT_VINFO_VECTYPE along the way.  It returns false when a needed
        vector or mask type cannot be obtained, when the operands of a
        mask-producing statement disagree, when a non-call statement has no
        lhs (calls without lhs are skipped; MASK_STORE is accepted), when the
        type of a statement's expression already has a vector mode, when a
        statement's vectype and the vectype used for the VF differ in mode
        size, or when the final factor is known to be at most 1.

        NOTE(review): the body accumulates the factor through
        vect_update_max_nunits and only compares vector sizes within a single
        statement, so the "fails if there are multiple sizes in the loop"
        wording above may be out of date -- confirm.
 180 */
 181
 182 static bool
 183 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 184 {
 185   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 186   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 187   unsigned nbbs = loop->num_nodes;
 188   poly_uint64 vectorization_factor = 1;
 189   tree scalar_type = NULL_TREE;
 190   gphi *phi;
 191   tree vectype;
 192   stmt_vec_info stmt_info;
 193   unsigned i;
 194   HOST_WIDE_INT dummy;
 195   gimple *stmt, *pattern_stmt = NULL;
 196   gimple_seq pattern_def_seq = NULL;
 197   gimple_stmt_iterator pattern_def_si = gsi_none ();
 198   bool analyze_pattern_stmt = false;
 199   bool bool_result;
 200   auto_vec<stmt_vec_info> mask_producers;
       /* PATTERN_STMT, PATTERN_DEF_SEQ, PATTERN_DEF_SI and ANALYZE_PATTERN_STMT
          persist across iterations of the statement loop below, which may
          process the same position of its iterator more than once (for the
          statement there, for the def statements of its pattern statement and
          for the pattern statement itself).  MASK_PRODUCERS collects
          boolean-valued assignments whose vector type is assigned only after
          the loop, once the VF has been recorded.  */
 201
 202   if (dump_enabled_p ())
 203     dump_printf_loc (MSG_NOTE, vect_location,
 204                      "=== vect_determine_vectorization_factor ===\n");
 205
 206   for (i = 0; i < nbbs; i++)
 207     {
 208       basic_block bb = bbs[i];
 209
           /* PHIs first: each relevant or live PHI gets the vector type of
              its result type, which is then fed to vect_update_max_nunits.  */
 210       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 211            gsi_next (&si))
 212         {
 213           phi = si.phi ();
 214           stmt_info = vinfo_for_stmt (phi);
 215           if (dump_enabled_p ())
 216             {
 217               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 218               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 219             }
 220
 221           gcc_assert (stmt_info);
 222
 223           if (STMT_VINFO_RELEVANT_P (stmt_info)
 224               || STMT_VINFO_LIVE_P (stmt_info))
 225             {
 226               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 227               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 228
 229               if (dump_enabled_p ())
 230                 {
 231                   dump_printf_loc (MSG_NOTE, vect_location,
 232                                    "get vectype for scalar type:  ");
 233                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 234                   dump_printf (MSG_NOTE, "\n");
 235                 }
 236
 237               vectype = get_vectype_for_scalar_type (scalar_type);
 238               if (!vectype)
 239                 {
 240                   if (dump_enabled_p ())
 241                     {
 242                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 243                                        "not vectorized: unsupported "
 244                                        "data-type ");
 245                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 246                                          scalar_type);
 247                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 248                     }
 249                   return false;
 250                 }
 251               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 252
 253               if (dump_enabled_p ())
 254                 {
 255                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 256                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 257                   dump_printf (MSG_NOTE, "\n");
 258                 }
 259
 260               if (dump_enabled_p ())
 261                 dump_printf_loc (MSG_NOTE, vect_location,
 262                                  "nunits = " HOST_WIDE_INT_PRINT_DEC "\n",
 263                                  TYPE_VECTOR_SUBPARTS (vectype));
 264
 265               vect_update_max_nunits (&vectorization_factor, vectype);
 266             }
 267         }
 268
           /* Then the statements.  SI is advanced explicitly inside the body
              (there is no increment clause) so that pattern statements and
              their def sequences can be visited before moving on.  */
 269       for (gimple_stmt_iterator si = gsi_start_bb (bb);
 270            !gsi_end_p (si) || analyze_pattern_stmt;)
 271         {
 272           tree vf_vectype;
 273
 274           if (analyze_pattern_stmt)
 275             stmt = pattern_stmt;
 276           else
 277             stmt = gsi_stmt (si);
 278
 279           stmt_info = vinfo_for_stmt (stmt);
 280
 281           if (dump_enabled_p ())
 282             {
 283               dump_printf_loc (MSG_NOTE, vect_location,
 284                                "==> examining statement: ");
 285               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 286             }
 287
 288           gcc_assert (stmt_info);
 289
 290           /* Skip stmts which do not need to be vectorized.  */
 291           if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 292                && !STMT_VINFO_LIVE_P (stmt_info))
 293               || gimple_clobber_p (stmt))
 294             {
 295               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 296                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 297                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 298                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 299                 {
 300                   stmt = pattern_stmt;
 301                   stmt_info = vinfo_for_stmt (pattern_stmt);
 302                   if (dump_enabled_p ())
 303                     {
 304                       dump_printf_loc (MSG_NOTE, vect_location,
 305                                        "==> examining pattern statement: ");
 306                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 307                     }
 308                 }
 309               else
 310                 {
 311                   if (dump_enabled_p ())
 312                     dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 313                   gsi_next (&si);
 314                   continue;
 315                 }
 316             }
 317           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 318                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 319                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 320                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 321             analyze_pattern_stmt = true;
 322
 323           /* If a pattern statement has def stmts, analyze them too.  */
 324           if (is_pattern_stmt_p (stmt_info))
 325             {
 326               if (pattern_def_seq == NULL)
 327                 {
 328                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 329                   pattern_def_si = gsi_start (pattern_def_seq);
 330                 }
 331               else if (!gsi_end_p (pattern_def_si))
 332                 gsi_next (&pattern_def_si);
 333               if (pattern_def_seq != NULL)
 334                 {
 335                   gimple *pattern_def_stmt = NULL;
 336                   stmt_vec_info pattern_def_stmt_info = NULL;
 337
 338                   while (!gsi_end_p (pattern_def_si))
 339                     {
 340                       pattern_def_stmt = gsi_stmt (pattern_def_si);
 341                       pattern_def_stmt_info
 342                         = vinfo_for_stmt (pattern_def_stmt);
 343                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
 344                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
 345                         break;
 346                       gsi_next (&pattern_def_si);
 347                     }
 348
 349                   if (!gsi_end_p (pattern_def_si))
 350                     {
 351                       if (dump_enabled_p ())
 352                         {
 353                           dump_printf_loc (MSG_NOTE, vect_location,
 354                                            "==> examining pattern def stmt: ");
 355                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 356                                             pattern_def_stmt, 0);
 357                         }
 358
 359                       stmt = pattern_def_stmt;
 360                       stmt_info = pattern_def_stmt_info;
 361                     }
 362                   else
 363                     {
 364                       pattern_def_si = gsi_none ();
 365                       analyze_pattern_stmt = false;
 366                     }
 367                 }
 368               else
 369                 analyze_pattern_stmt = false;
 370             }
 371
 372           if (gimple_get_lhs (stmt) == NULL_TREE
 373               /* MASK_STORE has no lhs, but is ok.  */
 374               && (!is_gimple_call (stmt)
 375                   || !gimple_call_internal_p (stmt)
 376                   || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
 377             {
 378               if (is_gimple_call (stmt))
 379                 {
 380                   /* Ignore calls with no lhs.  These must be calls to
 381                      #pragma omp simd functions, and what vectorization factor
 382                      it really needs can't be determined until
 383                      vectorizable_simd_clone_call.  */
 384                   if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 385                     {
 386                       pattern_def_seq = NULL;
 387                       gsi_next (&si);
 388                     }
 389                   continue;
 390                 }
 391               if (dump_enabled_p ())
 392                 {
 393                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 394                                    "not vectorized: irregular stmt.");
 395                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 396                                     0);
 397                 }
 398               return false;
 399             }
 400
 401           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 402             {
 403               if (dump_enabled_p ())
 404                 {
 405                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 406                                    "not vectorized: vector stmt in loop:");
 407                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 408                 }
 409               return false;
 410             }
 411
 412           bool_result = false;
 413
 414           if (STMT_VINFO_VECTYPE (stmt_info))
 415             {
 416               /* The only case when a vectype had been already set is for stmts
 417                  that contain a dataref, or for "pattern-stmts" (stmts
 418                  generated by the vectorizer to represent/replace a certain
 419                  idiom).  */
 420               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 421                           || is_pattern_stmt_p (stmt_info)
 422                           || !gsi_end_p (pattern_def_si));
 423               vectype = STMT_VINFO_VECTYPE (stmt_info);
 424             }
 425           else
 426             {
 427               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
 428               if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
 429                 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
 430               else
 431                 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 432
 433               /* Bool ops don't participate in vectorization factor
 434                  computation.  For comparison use compared types to
 435                  compute a factor.  */
 436               if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
 437                   && is_gimple_assign (stmt)
 438                   && gimple_assign_rhs_code (stmt) != COND_EXPR)
 439                 {
 440                   if (STMT_VINFO_RELEVANT_P (stmt_info)
 441                       || STMT_VINFO_LIVE_P (stmt_info))
 442                     mask_producers.safe_push (stmt_info);
 443                   bool_result = true;
 444
 445                   if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
 446                       == tcc_comparison
 447                       && !VECT_SCALAR_BOOLEAN_TYPE_P
 448                             (TREE_TYPE (gimple_assign_rhs1 (stmt))))
 449                     scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 450                   else
 451                     {
 452                       if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 453                         {
 454                           pattern_def_seq = NULL;
 455                           gsi_next (&si);
 456                         }
 457                       continue;
 458                     }
 459                 }
 460
 461               if (dump_enabled_p ())
 462                 {
 463                   dump_printf_loc (MSG_NOTE, vect_location,
 464                                    "get vectype for scalar type:  ");
 465                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 466                   dump_printf (MSG_NOTE, "\n");
 467                 }
 468               vectype = get_vectype_for_scalar_type (scalar_type);
 469               if (!vectype)
 470                 {
 471                   if (dump_enabled_p ())
 472                     {
 473                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 474                                        "not vectorized: unsupported "
 475                                        "data-type ");
 476                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 477                                          scalar_type);
 478                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 479                     }
 480                   return false;
 481                 }
 482
 483               if (!bool_result)
 484                 STMT_VINFO_VECTYPE (stmt_info) = vectype;
 485
 486               if (dump_enabled_p ())
 487                 {
 488                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 489                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 490                   dump_printf (MSG_NOTE, "\n");
 491                 }
 492             }
 493
 494           /* Don't try to compute VF from scalar types if the stmt
 495              produces a boolean vector.  Use the result vectype instead.  */
 496           if (VECTOR_BOOLEAN_TYPE_P (vectype))
 497             vf_vectype = vectype;
 498           else
 499             {
 500               /* The vectorization factor is according to the smallest
 501                  scalar type (or the largest vector size, but we only
 502                  support one vector size per loop).  */
 503               if (!bool_result)
 504                 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 505                                                              &dummy);
 506               if (dump_enabled_p ())
 507                 {
 508                   dump_printf_loc (MSG_NOTE, vect_location,
 509                                    "get vectype for scalar type:  ");
 510                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 511                   dump_printf (MSG_NOTE, "\n");
 512                 }
 513               vf_vectype = get_vectype_for_scalar_type (scalar_type);
 514             }
 515           if (!vf_vectype)
 516             {
 517               if (dump_enabled_p ())
 518                 {
 519                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 520                                    "not vectorized: unsupported data-type ");
 521                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 522                                      scalar_type);
 523                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 524                 }
 525               return false;
 526             }
 527
 528           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
 529                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 530             {
 531               if (dump_enabled_p ())
 532                 {
 533                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 534                                    "not vectorized: different sized vector "
 535                                    "types in statement, ");
 536                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 537                                      vectype);
 538                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 539                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 540                                      vf_vectype);
 541                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 542                 }
 543               return false;
 544             }
 545
 546           if (dump_enabled_p ())
 547             {
 548               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 549               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
 550               dump_printf (MSG_NOTE, "\n");
 551             }
 552
 553           if (dump_enabled_p ())
 554             dump_printf_loc (MSG_NOTE, vect_location,
 555                              "nunits = " HOST_WIDE_INT_PRINT_DEC "\n",
 556                              TYPE_VECTOR_SUBPARTS (vf_vectype));
 557
 558           vect_update_max_nunits (&vectorization_factor, vf_vectype);
 559
               /* Move to the next statement only when no pattern statement is
                  pending and the pattern def sequence is exhausted.  */
 560           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 561             {
 562               pattern_def_seq = NULL;
 563               gsi_next (&si);
 564             }
 565         }
 566     }
 567
 568   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 569   if (dump_enabled_p ())
 570     {
 571       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
 572       dump_dec (MSG_NOTE, vectorization_factor);
 573       dump_printf (MSG_NOTE, "\n");
 574     }
 575
       /* VECTORIZATION_FACTOR started at 1; if it is still known to be at most
          1, give up.  */
 576   if (known_le (vectorization_factor, 1U))
 577     {
 578       if (dump_enabled_p ())
 579         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 580                          "not vectorized: unsupported data-type\n");
 581       return false;
 582     }
 583   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 584
       /* With the VF recorded, compute a mask type for every statement that
          was queued in MASK_PRODUCERS and store it as that statement's
          vectype.  */
 585   for (i = 0; i < mask_producers.length (); i++)
 586     {
 587       tree mask_type = NULL;
 588
 589       stmt = STMT_VINFO_STMT (mask_producers[i]);
 590
 591       if (is_gimple_assign (stmt)
 592           && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
 593           && !VECT_SCALAR_BOOLEAN_TYPE_P
 594                                       (TREE_TYPE (gimple_assign_rhs1 (stmt))))
 595         {
 596           scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 597           mask_type = get_mask_type_for_scalar_type (scalar_type);
 598
 599           if (!mask_type)
 600             {
 601               if (dump_enabled_p ())
 602                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 603                                  "not vectorized: unsupported mask\n");
 604               return false;
 605             }
 606         }
 607       else
 608         {
 609           tree rhs;
 610           ssa_op_iter iter;
 611           gimple *def_stmt;
 612           enum vect_def_type dt;
 613
 614           FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
 615             {
 616               if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
 617                                        &def_stmt, &dt, &vectype))
 618                 {
 619                   if (dump_enabled_p ())
 620                     {
 621                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 622                                        "not vectorized: can't compute mask type "
 623                                        "for statement, ");
 624                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 625                                         0);
 626                     }
 627                   return false;
 628                 }
 629
 630               /* No vectype probably means external definition.
 631                  Allow it in case there is another operand which
 632                  allows to determine mask type.  */
 633               if (!vectype)
 634                 continue;
 635
 636               if (!mask_type)
 637                 mask_type = vectype;
 638               else if (TYPE_VECTOR_SUBPARTS (mask_type)
 639                        != TYPE_VECTOR_SUBPARTS (vectype))
 640                 {
 641                   if (dump_enabled_p ())
 642                     {
 643                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 644                                        "not vectorized: different sized masks "
 645                                        "types in statement, ");
 646                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 647                                          mask_type);
 648                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 649                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 650                                          vectype);
 651                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 652                     }
 653                   return false;
 654                 }
 655               else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
 656                        != VECTOR_BOOLEAN_TYPE_P (vectype))
 657                 {
 658                   if (dump_enabled_p ())
 659                     {
 660                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 661                                        "not vectorized: mixed mask and "
 662                                        "nonmask vector types in statement, ");
 663                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 664                                          mask_type);
 665                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 666                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 667                                          vectype);
 668                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 669                     }
 670                   return false;
 671                 }
 672             }
 673
 674           /* We may compare boolean value loaded as vector of integers.
 675              Fix mask_type in such case.  */
 676           if (mask_type
 677               && !VECTOR_BOOLEAN_TYPE_P (mask_type)
 678               && gimple_code (stmt) == GIMPLE_ASSIGN
 679               && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
 680             mask_type = build_same_sized_truth_vector_type (mask_type);
 681         }
 682
 683       /* No mask_type should mean loop invariant predicate.
 684          This is probably a subject for optimization in
 685          if-conversion.  */
 686       if (!mask_type)
 687         {
 688           if (dump_enabled_p ())
 689             {
 690               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 691                                "not vectorized: can't compute mask type "
 692                                "for statement, ");
 693               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 694                                 0);
 695             }
 696           return false;
 697         }
 698
 699       STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
 700     }
 701
 702   return true;
 703 }
 704
 705
 706 /* Function vect_is_simple_iv_evolution.
 707
 708    FORNOW: A simple evolution of an induction variables in the loop is
 709    considered a polynomial evolution.  */
 710
 711 static bool
 712 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 713                              tree * step)
 714 {
 715   tree init_expr;
 716   tree step_expr;
 717   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 718   basic_block bb;
 719
 720   /* When there is no evolution in this loop, the evolution function
 721      is not "simple".  */
 722   if (evolution_part == NULL_TREE)
 723     return false;
 724
 725   /* When the evolution is a polynomial of degree >= 2
 726      the evolution function is not "simple".  */
 727   if (tree_is_chrec (evolution_part))
 728     return false;
 729
 730   step_expr = evolution_part;
 731   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 732
 733   if (dump_enabled_p ())
 734     {
 735       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 736       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 737       dump_printf (MSG_NOTE, ",  init: ");
 738       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 739       dump_printf (MSG_NOTE, "\n");
 740     }
 741
 742   *init = init_expr;
 743   *step = step_expr;
 744
 745   if (TREE_CODE (step_expr) != INTEGER_CST
 746       && (TREE_CODE (step_expr) != SSA_NAME
 747           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 748               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 749           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 750               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 751                   || !flag_associative_math)))
 752       && (TREE_CODE (step_expr) != REAL_CST
 753           || !flag_associative_math))
 754     {
 755       if (dump_enabled_p ())
 756         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 757                          "step unknown.\n");
 758       return false;
 759     }
 760
 761   return true;
 762 }
 763
 764 /* Function vect_analyze_scalar_cycles_1.
 765
 766    Examine the cross iteration def-use cycles of scalar variables
 767    in LOOP.  LOOP_VINFO represents the loop that is now being
 768    considered for vectorization (can be LOOP, or an outer-loop
 769    enclosing LOOP).  */
 770
 771 static void
 772 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 773 {
 774   basic_block bb = loop->header;
 775   tree init, step;
 776   auto_vec<gimple *, 64> worklist;
 777   gphi_iterator gsi;
 778   bool double_reduc;
 779
 780   if (dump_enabled_p ())
 781     dump_printf_loc (MSG_NOTE, vect_location,
 782                      "=== vect_analyze_scalar_cycles ===\n");
 783
 784   /* First - identify all inductions.  Reduction detection assumes that all the
 785      inductions have been identified, therefore, this order must not be
 786      changed.  */
 787   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 788     {
 789       gphi *phi = gsi.phi ();
 790       tree access_fn = NULL;
 791       tree def = PHI_RESULT (phi);
 792       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 793
 794       if (dump_enabled_p ())
 795         {
 796           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 797           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 798         }
 799
 800       /* Skip virtual phi's.  The data dependences that are associated with
 801          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 802       if (virtual_operand_p (def))
 803         continue;
 804
 805       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 806
 807       /* Analyze the evolution function.  */
 808       access_fn = analyze_scalar_evolution (loop, def);
 809       if (access_fn)
 810         {
 811           STRIP_NOPS (access_fn);
 812           if (dump_enabled_p ())
 813             {
 814               dump_printf_loc (MSG_NOTE, vect_location,
 815                                "Access function of PHI: ");
 816               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 817               dump_printf (MSG_NOTE, "\n");
 818             }
 819           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 820             = initial_condition_in_loop_num (access_fn, loop->num);
 821           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 822             = evolution_part_in_loop_num (access_fn, loop->num);
 823         }
 824
 825       if (!access_fn
 826           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 827           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 828               && TREE_CODE (step) != INTEGER_CST))
 829         {
 830           worklist.safe_push (phi);
 831           continue;
 832         }
 833
 834       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 835                   != NULL_TREE);
 836       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 837
 838       if (dump_enabled_p ())
 839         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 840       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 841     }
 842
 843
 844   /* Second - identify all reductions and nested cycles.  */
 845   while (worklist.length () > 0)
 846     {
 847       gimple *phi = worklist.pop ();
 848       tree def = PHI_RESULT (phi);
 849       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 850       gimple *reduc_stmt;
 851
 852       if (dump_enabled_p ())
 853         {
 854           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 855           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 856         }
 857
 858       gcc_assert (!virtual_operand_p (def)
 859                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 860
 861       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
 862                                                 &double_reduc, false);
 863       if (reduc_stmt)
 864         {
 865           if (double_reduc)
 866             {
 867               if (dump_enabled_p ())
 868                 dump_printf_loc (MSG_NOTE, vect_location,
 869                                  "Detected double reduction.\n");
 870
 871               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 872               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 873                                                     vect_double_reduction_def;
 874             }
 875           else
 876             {
 877               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 878                 {
 879                   if (dump_enabled_p ())
 880                     dump_printf_loc (MSG_NOTE, vect_location,
 881                                      "Detected vectorizable nested cycle.\n");
 882
 883                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 884                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 885                                                              vect_nested_cycle;
 886                 }
 887               else
 888                 {
 889                   if (dump_enabled_p ())
 890                     dump_printf_loc (MSG_NOTE, vect_location,
 891                                      "Detected reduction.\n");
 892
 893                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 894                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 895                                                            vect_reduction_def;
 896                   /* Store the reduction cycles for possible vectorization in
 897                      loop-aware SLP if it was not detected as reduction
 898                      chain.  */
 899                   if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
 900                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 901                 }
 902             }
 903         }
 904       else
 905         if (dump_enabled_p ())
 906           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 907                            "Unknown def-use cycle pattern.\n");
 908     }
 909 }
 910
 911
 912 /* Function vect_analyze_scalar_cycles.
 913
 914    Examine the cross iteration def-use cycles of scalar variables, by
 915    analyzing the loop-header PHIs of scalar variables.  Classify each
 916    cycle as one of the following: invariant, induction, reduction, unknown.
 917    We do that for the loop represented by LOOP_VINFO, and also to its
 918    inner-loop, if exists.
 919    Examples for scalar cycles:
 920
 921    Example1: reduction:
 922
 923               loop1:
 924               for (i=0; i<N; i++)
 925                  sum += a[i];
 926
 927    Example2: induction:
 928
 929               loop2:
 930               for (i=0; i<N; i++)
 931                  a[i] = i;  */
 932
 933 static void
 934 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 935 {
 936   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 937
 938   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 939
 940   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 941      Reductions in such inner-loop therefore have different properties than
 942      the reductions in the nest that gets vectorized:
 943      1. When vectorized, they are executed in the same order as in the original
 944         scalar loop, so we can't change the order of computation when
 945         vectorizing them.
 946      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 947         current checks are too strict.  */
 948
 949   if (loop->inner)
 950     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 951 }
 952
 953 /* Transfer group and reduction information from STMT to its pattern stmt.  */
 954
 955 static void
 956 vect_fixup_reduc_chain (gimple *stmt)
 957 {
 958   gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 959   gimple *stmtp;
 960   gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
 961               && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
 962   GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
 963   do
 964     {
 965       stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 966       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
 967       stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
 968       if (stmt)
 969         GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
 970           = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 971     }
 972   while (stmt);
 973   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
 974 }
 975
 976 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 977
 978 static void
 979 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 980 {
 981   gimple *first;
 982   unsigned i;
 983
 984   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 985     if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
 986       {
 987         gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
 988         while (next)
 989           {
 990             if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
 991               break;
 992             next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
 993           }
 994         /* If not all stmt in the chain are patterns try to handle
 995            the chain without patterns.  */
 996         if (! next)
 997           {
 998             vect_fixup_reduc_chain (first);
 999             LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
1000               = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
1001           }
1002       }
1003 }
1004
1005 /* Function vect_get_loop_niters.
1006
1007    Determine how many iterations the loop is executed and place it
1008    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
1009    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
1010    niter information holds in ASSUMPTIONS.
1011
1012    Return the loop exit condition.  */
1013
1014
1015 static gcond *
1016 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1017                       tree *number_of_iterations, tree *number_of_iterationsm1)
1018 {
1019   edge exit = single_exit (loop);
1020   struct tree_niter_desc niter_desc;
1021   tree niter_assumptions, niter, may_be_zero;
1022   gcond *cond = get_loop_exit_condition (loop);
1023
1024   *assumptions = boolean_true_node;
1025   *number_of_iterationsm1 = chrec_dont_know;
1026   *number_of_iterations = chrec_dont_know;
1027   if (dump_enabled_p ())
1028     dump_printf_loc (MSG_NOTE, vect_location,
1029                      "=== get_loop_niters ===\n");
1030
1031   if (!exit)
1032     return cond;
1033
1034   niter = chrec_dont_know;
1035   may_be_zero = NULL_TREE;
1036   niter_assumptions = boolean_true_node;
1037   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1038       || chrec_contains_undetermined (niter_desc.niter))
1039     return cond;
1040
1041   niter_assumptions = niter_desc.assumptions;
1042   may_be_zero = niter_desc.may_be_zero;
1043   niter = niter_desc.niter;
1044
1045   if (may_be_zero && integer_zerop (may_be_zero))
1046     may_be_zero = NULL_TREE;
1047
1048   if (may_be_zero)
1049     {
1050       if (COMPARISON_CLASS_P (may_be_zero))
1051         {
1052           /* Try to combine may_be_zero with assumptions, this can simplify
1053              computation of niter expression.  */
1054           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1055             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1056                                              niter_assumptions,
1057                                              fold_build1 (TRUTH_NOT_EXPR,
1058                                                           boolean_type_node,
1059                                                           may_be_zero));
1060           else
1061             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1062                                  build_int_cst (TREE_TYPE (niter), 0), niter);
1063
1064           may_be_zero = NULL_TREE;
1065         }
1066       else if (integer_nonzerop (may_be_zero))
1067         {
1068           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1069           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1070           return cond;
1071         }
1072       else
1073         return cond;
1074     }
1075
1076   *assumptions = niter_assumptions;
1077   *number_of_iterationsm1 = niter;
1078
1079   /* We want the number of loop header executions which is the number
1080      of latch executions plus one.
1081      ???  For UINT_MAX latch executions this number overflows to zero
1082      for loops like do { n++; } while (n != 0);  */
1083   if (niter && !chrec_contains_undetermined (niter))
1084     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1085                           build_int_cst (TREE_TYPE (niter), 1));
1086   *number_of_iterations = niter;
1087
1088   return cond;
1089 }
1090
1091 /* Function bb_in_loop_p
1092
1093    Used as predicate for dfs order traversal of the loop bbs.  */
1094
1095 static bool
1096 bb_in_loop_p (const_basic_block bb, const void *data)
1097 {
1098   const struct loop *const loop = (const struct loop *)data;
1099   if (flow_bb_inside_loop_p (loop, bb))
1100     return true;
1101   return false;
1102 }
1103
1104
1105 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1106    stmt_vec_info structs for all the stmts in LOOP_IN.  */
1107
1108 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
1109   : vec_info (vec_info::loop, init_cost (loop_in)),
1110     loop (loop_in),
1111     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1112     num_itersm1 (NULL_TREE),
1113     num_iters (NULL_TREE),
1114     num_iters_unchanged (NULL_TREE),
1115     num_iters_assumptions (NULL_TREE),
1116     th (0),
1117     versioning_threshold (0),
1118     vectorization_factor (0),
1119     max_vectorization_factor (0),
1120     unaligned_dr (NULL),
1121     peeling_for_alignment (0),
1122     ptr_mask (0),
1123     slp_unrolling_factor (1),
1124     single_scalar_iteration_cost (0),
1125     vectorizable (false),
1126     peeling_for_gaps (false),
1127     peeling_for_niter (false),
1128     operands_swapped (false),
1129     no_data_dependencies (false),
1130     has_mask_store (false),
1131     scalar_loop (NULL),
1132     orig_loop_info (NULL)
1133 {
1134   /* Create/Update stmt_info for all stmts in the loop.  */
1135   basic_block *body = get_loop_body (loop);
1136   for (unsigned int i = 0; i < loop->num_nodes; i++)
1137     {
1138       basic_block bb = body[i];
1139       gimple_stmt_iterator si;
1140
1141       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1142         {
1143           gimple *phi = gsi_stmt (si);
1144           gimple_set_uid (phi, 0);
1145           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
1146         }
1147
1148       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1149         {
1150           gimple *stmt = gsi_stmt (si);
1151           gimple_set_uid (stmt, 0);
1152           set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
1153         }
1154     }
1155   free (body);
1156
1157   /* CHECKME: We want to visit all BBs before their successors (except for
1158      latch blocks, for which this assertion wouldn't hold).  In the simple
1159      case of the loop forms we allow, a dfs order of the BBs would the same
1160      as reversed postorder traversal, so we are safe.  */
1161
1162   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1163                                           bbs, loop->num_nodes, loop);
1164   gcc_assert (nbbs == loop->num_nodes);
1165 }
1166
1167
1168 /* Free all memory used by the _loop_vec_info, as well as all the
1169    stmt_vec_info structs of all the stmts in the loop.  */
1170
1171 _loop_vec_info::~_loop_vec_info ()
1172 {
1173   int nbbs;
1174   gimple_stmt_iterator si;
1175   int j;
1176
1177   nbbs = loop->num_nodes;
1178   for (j = 0; j < nbbs; j++)
1179     {
1180       basic_block bb = bbs[j];
1181       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1182         free_stmt_vec_info (gsi_stmt (si));
1183
1184       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1185         {
1186           gimple *stmt = gsi_stmt (si);
1187
1188           /* We may have broken canonical form by moving a constant
1189              into RHS1 of a commutative op.  Fix such occurrences.  */
1190           if (operands_swapped && is_gimple_assign (stmt))
1191             {
1192               enum tree_code code = gimple_assign_rhs_code (stmt);
1193
1194               if ((code == PLUS_EXPR
1195                    || code == POINTER_PLUS_EXPR
1196                    || code == MULT_EXPR)
1197                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1198                 swap_ssa_operands (stmt,
1199                                    gimple_assign_rhs1_ptr (stmt),
1200                                    gimple_assign_rhs2_ptr (stmt));
1201               else if (code == COND_EXPR
1202                        && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
1203                 {
1204                   tree cond_expr = gimple_assign_rhs1 (stmt);
1205                   enum tree_code cond_code = TREE_CODE (cond_expr);
1206
1207                   if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1208                     {
1209                       bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
1210                                                                   0));
1211                       cond_code = invert_tree_comparison (cond_code,
1212                                                           honor_nans);
1213                       if (cond_code != ERROR_MARK)
1214                         {
1215                           TREE_SET_CODE (cond_expr, cond_code);
1216                           swap_ssa_operands (stmt,
1217                                              gimple_assign_rhs2_ptr (stmt),
1218                                              gimple_assign_rhs3_ptr (stmt));
1219                         }
1220                     }
1221                 }
1222             }
1223
1224           /* Free stmt_vec_info.  */
1225           free_stmt_vec_info (stmt);
1226           gsi_next (&si);
1227         }
1228     }
1229
1230   free (bbs);
1231
1232   loop->aux = NULL;
1233 }
1234
1235
1236 /* Calculate the cost of one scalar iteration of the loop.  */
1237 static void
1238 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1239 {
1240   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1241   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1242   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1243   int innerloop_iters, i;
1244
1245   /* Count statements in scalar loop.  Using this as scalar cost for a single
1246      iteration for now.
1247
1248      TODO: Add outer loop support.
1249
1250      TODO: Consider assigning different costs to different scalar
1251      statements.  */
1252
1253   /* FORNOW.  */
1254   innerloop_iters = 1;
1255   if (loop->inner)
1256     innerloop_iters = 50; /* FIXME */
1257
1258   for (i = 0; i < nbbs; i++)
1259     {
1260       gimple_stmt_iterator si;
1261       basic_block bb = bbs[i];
1262
1263       if (bb->loop_father == loop->inner)
1264         factor = innerloop_iters;
1265       else
1266         factor = 1;
1267
1268       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1269         {
1270           gimple *stmt = gsi_stmt (si);
1271           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1272
1273           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1274             continue;
1275
1276           /* Skip stmts that are not vectorized inside the loop.  */
1277           if (stmt_info
1278               && !STMT_VINFO_RELEVANT_P (stmt_info)
1279               && (!STMT_VINFO_LIVE_P (stmt_info)
1280                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1281               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1282             continue;
1283
1284           vect_cost_for_stmt kind;
1285           if (STMT_VINFO_DATA_REF (stmt_info))
1286             {
1287               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1288                kind = scalar_load;
1289              else
1290                kind = scalar_store;
1291             }
1292           else
1293             kind = scalar_stmt;
1294
1295           scalar_single_iter_cost
1296             += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1297                                  factor, kind, stmt_info, 0, vect_prologue);
1298         }
1299     }
1300   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1301     = scalar_single_iter_cost;
1302 }
1303
1304
1305 /* Function vect_analyze_loop_form_1.
1306
1307    Verify that certain CFG restrictions hold, including:
1308    - the loop has a pre-header
1309    - the loop has a single entry and exit
1310    - the loop exit condition is simple enough
1311    - the number of iterations can be analyzed, i.e, a countable loop.  The
1312      niter could be analyzed under some assumptions.  */
1313
1314 bool
1315 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1316                           tree *assumptions, tree *number_of_iterationsm1,
1317                           tree *number_of_iterations, gcond **inner_loop_cond)
1318 {
1319   if (dump_enabled_p ())
1320     dump_printf_loc (MSG_NOTE, vect_location,
1321                      "=== vect_analyze_loop_form ===\n");
1322
1323   /* Different restrictions apply when we are considering an inner-most loop,
1324      vs. an outer (nested) loop.
1325      (FORNOW. May want to relax some of these restrictions in the future).  */
1326
1327   if (!loop->inner)
1328     {
1329       /* Inner-most loop.  We currently require that the number of BBs is
1330          exactly 2 (the header and latch).  Vectorizable inner-most loops
1331          look like this:
1332
1333                         (pre-header)
1334                            |
1335                           header <--------+
1336                            | |            |
1337                            | +--> latch --+
1338                            |
1339                         (exit-bb)  */
1340
1341       if (loop->num_nodes != 2)
1342         {
1343           if (dump_enabled_p ())
1344             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1345                              "not vectorized: control flow in loop.\n");
1346           return false;
1347         }
1348
1349       if (empty_block_p (loop->header))
1350         {
1351           if (dump_enabled_p ())
1352             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1353                              "not vectorized: empty loop.\n");
1354           return false;
1355         }
1356     }
1357   else
1358     {
1359       struct loop *innerloop = loop->inner;
1360       edge entryedge;
1361
1362       /* Nested loop. We currently require that the loop is doubly-nested,
1363          contains a single inner loop, and the number of BBs is exactly 5.
1364          Vectorizable outer-loops look like this:
1365
1366                         (pre-header)
1367                            |
1368                           header <---+
1369                            |         |
1370                           inner-loop |
1371                            |         |
1372                           tail ------+
1373                            |
1374                         (exit-bb)
1375
1376          The inner-loop has the properties expected of inner-most loops
1377          as described above.  */
1378
1379       if ((loop->inner)->inner || (loop->inner)->next)
1380         {
1381           if (dump_enabled_p ())
1382             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1383                              "not vectorized: multiple nested loops.\n");
1384           return false;
1385         }
1386
1387       if (loop->num_nodes != 5)
1388         {
1389           if (dump_enabled_p ())
1390             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1391                              "not vectorized: control flow in loop.\n");
1392           return false;
1393         }
1394
1395       entryedge = loop_preheader_edge (innerloop);
1396       if (entryedge->src != loop->header
1397           || !single_exit (innerloop)
1398           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1399         {
1400           if (dump_enabled_p ())
1401             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1402                              "not vectorized: unsupported outerloop form.\n");
1403           return false;
1404         }
1405
1406       /* Analyze the inner-loop.  */
1407       tree inner_niterm1, inner_niter, inner_assumptions;
1408       if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1409                                       &inner_assumptions, &inner_niterm1,
1410                                       &inner_niter, NULL)
1411           /* Don't support analyzing niter under assumptions for inner
1412              loop.  */
1413           || !integer_onep (inner_assumptions))
1414         {
1415           if (dump_enabled_p ())
1416             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1417                              "not vectorized: Bad inner loop.\n");
1418           return false;
1419         }
1420
1421       if (!expr_invariant_in_loop_p (loop, inner_niter))
1422         {
1423           if (dump_enabled_p ())
1424             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1425                              "not vectorized: inner-loop count not"
1426                              " invariant.\n");
1427           return false;
1428         }
1429
1430       if (dump_enabled_p ())
1431         dump_printf_loc (MSG_NOTE, vect_location,
1432                          "Considering outer-loop vectorization.\n");
1433     }
1434
1435   if (!single_exit (loop)
1436       || EDGE_COUNT (loop->header->preds) != 2)
1437     {
1438       if (dump_enabled_p ())
1439         {
1440           if (!single_exit (loop))
1441             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1442                              "not vectorized: multiple exits.\n");
1443           else if (EDGE_COUNT (loop->header->preds) != 2)
1444             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1445                              "not vectorized: too many incoming edges.\n");
1446         }
1447       return false;
1448     }
1449
1450   /* We assume that the loop exit condition is at the end of the loop. i.e,
1451      that the loop is represented as a do-while (with a proper if-guard
1452      before the loop if needed), where the loop header contains all the
1453      executable statements, and the latch is empty.  */
1454   if (!empty_block_p (loop->latch)
1455       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1456     {
1457       if (dump_enabled_p ())
1458         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1459                          "not vectorized: latch block not empty.\n");
1460       return false;
1461     }
1462
1463   /* Make sure the exit is not abnormal.  */
1464   edge e = single_exit (loop);
1465   if (e->flags & EDGE_ABNORMAL)
1466     {
1467       if (dump_enabled_p ())
1468         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1469                          "not vectorized: abnormal loop exit edge.\n");
1470       return false;
1471     }
1472
1473   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1474                                      number_of_iterationsm1);
1475   if (!*loop_cond)
1476     {
1477       if (dump_enabled_p ())
1478         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1479                          "not vectorized: complicated exit condition.\n");
1480       return false;
1481     }
1482
1483   if (integer_zerop (*assumptions)
1484       || !*number_of_iterations
1485       || chrec_contains_undetermined (*number_of_iterations))
1486     {
1487       if (dump_enabled_p ())
1488         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1489                          "not vectorized: number of iterations cannot be "
1490                          "computed.\n");
1491       return false;
1492     }
1493
1494   if (integer_zerop (*number_of_iterations))
1495     {
1496       if (dump_enabled_p ())
1497         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1498                          "not vectorized: number of iterations = 0.\n");
1499       return false;
1500     }
1501
1502   return true;
1503 }
1504
1505 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1506
1507 loop_vec_info
1508 vect_analyze_loop_form (struct loop *loop)
1509 {
1510   tree assumptions, number_of_iterations, number_of_iterationsm1;
1511   gcond *loop_cond, *inner_loop_cond = NULL;
1512
1513   if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1514                                   &assumptions, &number_of_iterationsm1,
1515                                   &number_of_iterations, &inner_loop_cond))
1516     return NULL;
1517
1518   loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1519   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1520   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1521   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1522   if (!integer_onep (assumptions))
1523     {
1524       /* We consider to vectorize this loop by versioning it under
1525          some assumptions.  In order to do this, we need to clear
1526          existing information computed by scev and niter analyzer.  */
1527       scev_reset_htab ();
1528       free_numbers_of_iterations_estimates (loop);
1529       /* Also set flag for this loop so that following scev and niter
1530          analysis are done under the assumptions.  */
1531       loop_constraint_set (loop, LOOP_C_FINITE);
1532       /* Also record the assumptions for versioning.  */
1533       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1534     }
1535
1536   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1537     {
1538       if (dump_enabled_p ())
1539         {
1540           dump_printf_loc (MSG_NOTE, vect_location,
1541                            "Symbolic number of iterations is ");
1542           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1543           dump_printf (MSG_NOTE, "\n");
1544         }
1545     }
1546
1547   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1548   if (inner_loop_cond)
1549     STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1550       = loop_exit_ctrl_vec_info_type;
1551
1552   gcc_assert (!loop->aux);
1553   loop->aux = loop_vinfo;
1554   return loop_vinfo;
1555 }
1556
1557
1558
1559 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1560    statements update the vectorization factor.  */
1561
1562 static void
1563 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1564 {
1565   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1566   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1567   int nbbs = loop->num_nodes;
1568   poly_uint64 vectorization_factor;
1569   int i;
1570
1571   if (dump_enabled_p ())
1572     dump_printf_loc (MSG_NOTE, vect_location,
1573                      "=== vect_update_vf_for_slp ===\n");
1574
1575   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1576   gcc_assert (known_ne (vectorization_factor, 0U));
1577
1578   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1579      vectorization factor of the loop is the unrolling factor required by
1580      the SLP instances.  If that unrolling factor is 1, we say, that we
1581      perform pure SLP on loop - cross iteration parallelism is not
1582      exploited.  */
1583   bool only_slp_in_loop = true;
1584   for (i = 0; i < nbbs; i++)
1585     {
1586       basic_block bb = bbs[i];
1587       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1588            gsi_next (&si))
1589         {
1590           gimple *stmt = gsi_stmt (si);
1591           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1592           if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1593               && STMT_VINFO_RELATED_STMT (stmt_info))
1594             {
1595               stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1596               stmt_info = vinfo_for_stmt (stmt);
1597             }
1598           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1599                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1600               && !PURE_SLP_STMT (stmt_info))
1601             /* STMT needs both SLP and loop-based vectorization.  */
1602             only_slp_in_loop = false;
1603         }
1604     }
1605
1606   if (only_slp_in_loop)
1607     {
1608       dump_printf_loc (MSG_NOTE, vect_location,
1609                        "Loop contains only SLP stmts\n");
1610       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1611     }
1612   else
1613     {
1614       dump_printf_loc (MSG_NOTE, vect_location,
1615                        "Loop contains SLP and non-SLP stmts\n");
1616       /* Both the vectorization factor and unroll factor have the form
1617          current_vector_size * X for some rational X, so they must have
1618          a common multiple.  */
1619       vectorization_factor
1620         = force_common_multiple (vectorization_factor,
1621                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1622     }
1623
1624   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1625   if (dump_enabled_p ())
1626     {
1627       dump_printf_loc (MSG_NOTE, vect_location,
1628                        "Updating vectorization factor to ");
1629       dump_dec (MSG_NOTE, vectorization_factor);
1630       dump_printf (MSG_NOTE, ".\n");
1631     }
1632 }
1633
1634 /* Function vect_analyze_loop_operations.
1635
1636    Scan the loop stmts and make sure they are all vectorizable.  */
1637
1638 static bool
1639 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1640 {
1641   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1642   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1643   int nbbs = loop->num_nodes;
1644   int i;
1645   stmt_vec_info stmt_info;
1646   bool need_to_vectorize = false;
1647   bool ok;
1648
1649   if (dump_enabled_p ())
1650     dump_printf_loc (MSG_NOTE, vect_location,
1651                      "=== vect_analyze_loop_operations ===\n");
1652
1653   for (i = 0; i < nbbs; i++)
1654     {
1655       basic_block bb = bbs[i];
1656
1657       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1658            gsi_next (&si))
1659         {
1660           gphi *phi = si.phi ();
1661           ok = true;
1662
1663           stmt_info = vinfo_for_stmt (phi);
1664           if (dump_enabled_p ())
1665             {
1666               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1667               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1668             }
1669           if (virtual_operand_p (gimple_phi_result (phi)))
1670             continue;
1671
1672           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1673              (i.e., a phi in the tail of the outer-loop).  */
1674           if (! is_loop_header_bb_p (bb))
1675             {
1676               /* FORNOW: we currently don't support the case that these phis
1677                  are not used in the outerloop (unless it is double reduction,
1678                  i.e., this phi is vect_reduction_def), cause this case
1679                  requires to actually do something here.  */
1680               if (STMT_VINFO_LIVE_P (stmt_info)
1681                   && STMT_VINFO_DEF_TYPE (stmt_info)
1682                      != vect_double_reduction_def)
1683                 {
1684                   if (dump_enabled_p ())
1685                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1686                                      "Unsupported loop-closed phi in "
1687                                      "outer-loop.\n");
1688                   return false;
1689                 }
1690
1691               /* If PHI is used in the outer loop, we check that its operand
1692                  is defined in the inner loop.  */
1693               if (STMT_VINFO_RELEVANT_P (stmt_info))
1694                 {
1695                   tree phi_op;
1696                   gimple *op_def_stmt;
1697
1698                   if (gimple_phi_num_args (phi) != 1)
1699                     return false;
1700
1701                   phi_op = PHI_ARG_DEF (phi, 0);
1702                   if (TREE_CODE (phi_op) != SSA_NAME)
1703                     return false;
1704
1705                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1706                   if (gimple_nop_p (op_def_stmt)
1707                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1708                       || !vinfo_for_stmt (op_def_stmt))
1709                     return false;
1710
1711                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1712                         != vect_used_in_outer
1713                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1714                            != vect_used_in_outer_by_reduction)
1715                     return false;
1716                 }
1717
1718               continue;
1719             }
1720
1721           gcc_assert (stmt_info);
1722
1723           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1724                || STMT_VINFO_LIVE_P (stmt_info))
1725               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1726             {
1727               /* A scalar-dependence cycle that we don't support.  */
1728               if (dump_enabled_p ())
1729                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1730                                  "not vectorized: scalar dependence cycle.\n");
1731               return false;
1732             }
1733
1734           if (STMT_VINFO_RELEVANT_P (stmt_info))
1735             {
1736               need_to_vectorize = true;
1737               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1738                   && ! PURE_SLP_STMT (stmt_info))
1739                 ok = vectorizable_induction (phi, NULL, NULL, NULL);
1740               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1741                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1742                        && ! PURE_SLP_STMT (stmt_info))
1743                 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL);
1744             }
1745
1746           if (ok && STMT_VINFO_LIVE_P (stmt_info))
1747             ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1748
1749           if (!ok)
1750             {
1751               if (dump_enabled_p ())
1752                 {
1753                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1754                                    "not vectorized: relevant phi not "
1755                                    "supported: ");
1756                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1757                 }
1758               return false;
1759             }
1760         }
1761
1762       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1763            gsi_next (&si))
1764         {
1765           gimple *stmt = gsi_stmt (si);
1766           if (!gimple_clobber_p (stmt)
1767               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL))
1768             return false;
1769         }
1770     } /* bbs */
1771
1772   /* All operations in the loop are either irrelevant (deal with loop
1773      control, or dead), or only used outside the loop and can be moved
1774      out of the loop (e.g. invariants, inductions).  The loop can be
1775      optimized away by scalar optimizations.  We're better off not
1776      touching this loop.  */
1777   if (!need_to_vectorize)
1778     {
1779       if (dump_enabled_p ())
1780         dump_printf_loc (MSG_NOTE, vect_location,
1781                          "All the computation can be taken out of the loop.\n");
1782       if (dump_enabled_p ())
1783         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1784                          "not vectorized: redundant loop. no profit to "
1785                          "vectorize.\n");
1786       return false;
1787     }
1788
1789   return true;
1790 }
1791
1792
1793 /* Function vect_analyze_loop_2.
1794
1795    Apply a set of analyses on the loop described by LOOP_VINFO.  The
1796    different analyses will record information in LOOP_VINFO.
1797    Return true if the loop passed all of them and false otherwise.
        FATAL is set to true on entry and cleared once the first group of
        checks, which is independent of the vector size, has passed.  If a
        later check fails while SLP is in use, the analysis may be re-run
        from START_OVER with SLP disabled before giving up.  */
1798 static bool
1799 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1800 {
1801   bool ok;
1802   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1803   poly_uint64 min_vf = 2;
1804   unsigned int n_stmts = 0;
1805
1806   /* The first group of checks is independent of the vector size.  */
1807   fatal = true;
1808
1809   /* Find all data references in the loop (which correspond to vdefs/vuses)
1810      and analyze their evolution in the loop.  */
1811
1812   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1813
1814   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1815   if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1816     {
1817       if (dump_enabled_p ())
1818         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1819                          "not vectorized: loop nest containing two "
1820                          "or more consecutive inner loops cannot be "
1821                          "vectorized\n");
1822       return false;
1823     }
1824
       /* Collect the data references statement by statement, skipping debug
          statements and counting all the others in N_STMTS.  */
1825   for (unsigned i = 0; i < loop->num_nodes; i++)
1826     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1827          !gsi_end_p (gsi); gsi_next (&gsi))
1828       {
1829         gimple *stmt = gsi_stmt (gsi);
1830         if (is_gimple_debug (stmt))
1831           continue;
1832         ++n_stmts;
1833         if (!find_data_references_in_stmt (loop, stmt,
1834                                            &LOOP_VINFO_DATAREFS (loop_vinfo)))
1835           {
1836             if (is_gimple_call (stmt) && loop->safelen)
1837               {
1838                 tree fndecl = gimple_call_fndecl (stmt), op;
1839                 if (fndecl != NULL_TREE)
1840                   {
1841                     cgraph_node *node = cgraph_node::get (fndecl);
1842                     if (node != NULL && node->simd_clones != NULL)
1843                       {
1844                         unsigned int j, n = gimple_call_num_args (stmt);
1845                         for (j = 0; j < n; j++)
1846                           {
1847                             op = gimple_call_arg (stmt, j);
1848                             if (DECL_P (op)
1849                                 || (REFERENCE_CLASS_P (op)
1850                                     && get_base_address (op)))
1851                               break;
1852                           }
1853                         op = gimple_call_lhs (stmt);
1854                         /* Ignore #pragma omp declare simd functions
1855                            if they don't have data references in the
1856                            call stmt itself.  */
1857                         if (j == n
1858                             && !(op
1859                                  && (DECL_P (op)
1860                                      || (REFERENCE_CLASS_P (op)
1861                                          && get_base_address (op)))))
1862                           continue;
1863                       }
1864                   }
1865               }
1866             if (dump_enabled_p ())
1867               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1868                                "not vectorized: loop contains function "
1869                                "calls or data references that cannot "
1870                                "be analyzed\n");
1871             return false;
1872           }
1873       }
1874
1875   /* Analyze the data references and also adjust the minimal
1876      vectorization factor according to the loads and stores.  */
1877
1878   ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1879   if (!ok)
1880     {
1881       if (dump_enabled_p ())
1882         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1883                          "bad data references.\n");
1884       return false;
1885     }
1886
1887   /* Classify all cross-iteration scalar data-flow cycles.
1888      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1889   vect_analyze_scalar_cycles (loop_vinfo);
1890
1891   vect_pattern_recog (loop_vinfo);
1892
1893   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1894
1895   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1896      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1897
1898   ok = vect_analyze_data_ref_accesses (loop_vinfo);
1899   if (!ok)
1900     {
1901       if (dump_enabled_p ())
1902         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1903                          "bad data access.\n");
1904       return false;
1905     }
1906
1907   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1908
1909   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1910   if (!ok)
1911     {
1912       if (dump_enabled_p ())
1913         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1914                          "unexpected pattern.\n");
1915       return false;
1916     }
1917
1918   /* The rest of the analysis below depends on the vector size in some way.  */
1919   fatal = false;
1920
1921   /* Analyze data dependences between the data-refs in the loop
1922      and adjust the maximum vectorization factor according to
1923      the dependences.
1924      FORNOW: fail at the first data dependence that we encounter.  */
1925
1926   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1927   if (!ok
1928       || (max_vf != MAX_VECTORIZATION_FACTOR
1929           && maybe_lt (max_vf, min_vf)))
1930     {
1931       if (dump_enabled_p ())
1932             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1933                              "bad data dependence.\n");
1934       return false;
1935     }
1936   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1937
1938   ok = vect_determine_vectorization_factor (loop_vinfo);
1939   if (!ok)
1940     {
1941       if (dump_enabled_p ())
1942         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1943                          "can't determine vectorization factor.\n");
1944       return false;
1945     }
       /* Check the dependence-imposed maximum again, this time against the
          vectorization factor that was just determined.  */
1946   if (max_vf != MAX_VECTORIZATION_FACTOR
1947       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1948     {
1949       if (dump_enabled_p ())
1950         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1951                          "bad data dependence.\n");
1952       return false;
1953     }
1954
1955   /* Compute the scalar iteration cost.  */
1956   vect_compute_single_scalar_iteration_cost (loop_vinfo);
1957
       /* Remember the vectorization factor as it is before any SLP decision;
          it is restored when the analysis is retried with SLP disabled.  */
1958   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1959   HOST_WIDE_INT estimated_niter;
1960   unsigned th;
1961   int min_scalar_loop_bound;
1962
1963   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1964   ok = vect_analyze_slp (loop_vinfo, n_stmts);
1965   if (!ok)
1966     return false;
1967
1968   /* If there are any SLP instances mark them as pure_slp.  */
1969   bool slp = vect_make_slp_decision (loop_vinfo);
1970   if (slp)
1971     {
1972       /* Find stmts that need to be both vectorized and SLPed.  */
1973       vect_detect_hybrid_slp (loop_vinfo);
1974
1975       /* Update the vectorization factor based on the SLP decision.  */
1976       vect_update_vf_for_slp (loop_vinfo);
1977     }
1978
1979   /* This is the point where we can re-start analysis with SLP forced off.  */
1980 start_over:
1981
1982   /* Now the vectorization factor is final.  */
1983   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1984   gcc_assert (known_ne (vectorization_factor, 0U));
1985   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1986
1987   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1988     {
1989       dump_printf_loc (MSG_NOTE, vect_location,
1990                        "vectorization_factor = ");
1991       dump_dec (MSG_NOTE, vectorization_factor);
1992       dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
1993                    LOOP_VINFO_INT_NITERS (loop_vinfo));
1994     }
1995
       /* Give up if the known iteration count, or the value returned by
          likely_max_stmt_executions_int when it is not -1, is smaller than
          ASSUMED_VF.  */
1996   HOST_WIDE_INT max_niter
1997     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1998   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1999        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < assumed_vf))
2000       || (max_niter != -1
2001           && (unsigned HOST_WIDE_INT) max_niter < assumed_vf))
2002     {
2003       if (dump_enabled_p ())
2004         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2005                          "not vectorized: iteration count smaller than "
2006                          "vectorization factor.\n");
2007       return false;
2008     }
2009
2010   /* Analyze the alignment of the data-refs in the loop.
2011      Fail if a data reference is found that cannot be vectorized.  */
2012
2013   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2014   if (!ok)
2015     {
2016       if (dump_enabled_p ())
2017         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2018                          "bad data alignment.\n");
2019       return false;
2020     }
2021
2022   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2023      It is important to call pruning after vect_analyze_data_ref_accesses,
2024      since we use grouping information gathered by interleaving analysis.  */
2025   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2026   if (!ok)
2027     return false;
2028
2029   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2030      vectorization.  */
2031   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2032     {
2033     /* This pass will decide on using loop versioning and/or loop peeling in
2034        order to enhance the alignment of data references in the loop.  */
2035     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2036     if (!ok)
2037       {
2038         if (dump_enabled_p ())
2039           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2040                            "bad data alignment.\n");
2041         return false;
2042       }
2043     }
2044
2045   if (slp)
2046     {
2047       /* Analyze operations in the SLP instances.  Note this may
2048          remove unsupported SLP instances which makes the above
2049          SLP kind detection invalid, so take the retry path then.  */
2050       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2051       vect_slp_analyze_operations (loop_vinfo);
2052       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2053         goto again;
2054     }
2055
2056   /* Scan all the remaining operations in the loop that are not subject
2057      to SLP and make sure they are vectorizable.  */
2058   ok = vect_analyze_loop_operations (loop_vinfo);
2059   if (!ok)
2060     {
2061       if (dump_enabled_p ())
2062         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2063                          "bad operation or unsupported loop bound.\n");
2064       return false;
2065     }
2066
2067   /* If epilog loop is required because of data accesses with gaps,
2068      one additional iteration needs to be peeled.  Check if there is
2069      enough iterations for vectorization.  */
2070   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2071       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2072     {
2073       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2074       tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2075
2076       if (known_lt (wi::to_widest (scalar_niters), vf))
2077         {
2078           if (dump_enabled_p ())
2079             dump_printf_loc (MSG_NOTE, vect_location,
2080                              "loop has no enough iterations to support"
2081                              " peeling for gaps.\n");
2082           return false;
2083         }
2084     }
2085
2086   /* Analyze cost.  Decide if worth while to vectorize.  */
2087   int min_profitable_estimate, min_profitable_iters;
2088   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2089                                       &min_profitable_estimate);
2090
2091   if (min_profitable_iters < 0)
2092     {
2093       if (dump_enabled_p ())
2094         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2095                          "not vectorized: vectorization not profitable.\n");
2096       if (dump_enabled_p ())
2097         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2098                          "not vectorized: vector version will never be "
2099                          "profitable.\n");
2100       goto again;
2101     }
2102
2103   min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2104                            * assumed_vf);
2105
2106   /* Use the cost model only if it is more conservative than user specified
2107      threshold.  */
2108   th = (unsigned) MAX (min_scalar_loop_bound, min_profitable_iters);
2109
2110   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2111
2112   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2113       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2114     {
2115       if (dump_enabled_p ())
2116         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2117                          "not vectorized: vectorization not profitable.\n");
2118       if (dump_enabled_p ())
2119         dump_printf_loc (MSG_NOTE, vect_location,
2120                          "not vectorized: iteration count smaller than user "
2121                          "specified loop bound parameter or minimum profitable "
2122                          "iterations (whichever is more conservative).\n");
2123       goto again;
2124     }
2125
       /* Compare the estimated iteration count against the larger of TH and
          MIN_PROFITABLE_ESTIMATE, falling back to MAX_NITER when no estimate
          is available (-1).  */
2126   estimated_niter
2127     = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2128   if (estimated_niter == -1)
2129     estimated_niter = max_niter;
2130   if (estimated_niter != -1
2131       && ((unsigned HOST_WIDE_INT) estimated_niter
2132           < MAX (th, (unsigned) min_profitable_estimate)))
2133     {
2134       if (dump_enabled_p ())
2135         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2136                          "not vectorized: estimated iteration count too "
2137                          "small.\n");
2138       if (dump_enabled_p ())
2139         dump_printf_loc (MSG_NOTE, vect_location,
2140                          "not vectorized: estimated iteration count smaller "
2141                          "than specified loop bound parameter or minimum "
2142                          "profitable iterations (whichever is more "
2143                          "conservative).\n");
2144       goto again;
2145     }
2146
2147   /* Decide whether we need to create an epilogue loop to handle
2148      remaining scalar iterations.  */
2149   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2150
2151   unsigned HOST_WIDE_INT const_vf;
2152   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2153       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2154     {
2155       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2156                        - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2157                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2158         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2159     }
2160   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2161            || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2162            || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2163                 < (unsigned) exact_log2 (const_vf))
2164                /* In case of versioning, check if the maximum number of
2165                   iterations is greater than th.  If they are identical,
2166                   the epilogue is unnecessary.  */
2167                && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2168                    || ((unsigned HOST_WIDE_INT) max_niter
2169                        > (th / const_vf) * const_vf))))
2170     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2171
2172   /* If an epilogue loop is required make sure we can create one.  */
2173   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2174       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2175     {
2176       if (dump_enabled_p ())
2177         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2178       if (!vect_can_advance_ivs_p (loop_vinfo)
2179           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2180                                            single_exit (LOOP_VINFO_LOOP
2181                                                          (loop_vinfo))))
2182         {
2183           if (dump_enabled_p ())
2184             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2185                              "not vectorized: can't create required "
2186                              "epilog loop\n");
2187           goto again;
2188         }
2189     }
2190
2191   /* During peeling, we need to check if number of loop iterations is
2192      enough for both peeled prolog loop and vector loop.  This check
2193      can be merged along with threshold check of loop versioning, so
2194      increase threshold for this case if necessary.  */
2195   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2196     {
2197       poly_uint64 niters_th;
2198
2199       /* Niters for peeled prolog loop.  */
2200       if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2201         {
2202           struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2203           tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2204
2205           niters_th = TYPE_VECTOR_SUBPARTS (vectype) - 1;
2206         }
2207       else
2208         niters_th = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2209
2210       /* Niters for at least one iteration of vectorized loop.  */
2211       niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2212       /* One additional iteration because of peeling for gap.  */
2213       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2214         niters_th += 1;
2215       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2216     }
2217
2218   gcc_assert (known_eq (vectorization_factor,
2219                         LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2220
2221   /* Ok to vectorize!  */
2222   return true;
2223
2224 again:
2225   /* Try again with SLP forced off but if we didn't do any SLP there is
2226      no point in re-trying.  */
2227   if (!slp)
2228     return false;
2229
2230   /* If there are reduction chains re-trying will fail anyway.  */
2231   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2232     return false;
2233
2234   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2235      via interleaving or lane instructions.  */
2236   slp_instance instance;
2237   slp_tree node;
2238   unsigned i, j;
2239   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2240     {
2241       stmt_vec_info vinfo;
2242       vinfo = vinfo_for_stmt
2243           (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2244       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2245         continue;
2246       vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2247       unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2248       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2249       if (! vect_store_lanes_supported (vectype, size)
2250           && ! vect_grouped_store_supported (vectype, size))
2251         return false;
2252       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2253         {
2254           vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2255           vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2256           bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2257           size = STMT_VINFO_GROUP_SIZE (vinfo);
2258           vectype = STMT_VINFO_VECTYPE (vinfo);
2259           if (! vect_load_lanes_supported (vectype, size)
2260               && ! vect_grouped_load_supported (vectype, single_element_p,
2261                                                 size))
2262             return false;
2263         }
2264     }
2265
2266   if (dump_enabled_p ())
2267     dump_printf_loc (MSG_NOTE, vect_location,
2268                      "re-trying with SLP disabled\n");
2269
2270   /* Roll back state appropriately.  No SLP this time.  */
2271   slp = false;
2272   /* Restore vectorization factor as it were without SLP.  */
2273   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2274   /* Free the SLP instances.  */
2275   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2276     vect_free_slp_instance (instance);
2277   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2278   /* Reset SLP type to loop_vect on all stmts.  */
2279   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2280     {
2281       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2282       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2283            !gsi_end_p (si); gsi_next (&si))
2284         {
2285           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2286           STMT_SLP_TYPE (stmt_info) = loop_vect;
2287         }
2288       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2289            !gsi_end_p (si); gsi_next (&si))
2290         {
2291           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2292           STMT_SLP_TYPE (stmt_info) = loop_vect;
2293           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2294             {
2295               stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2296               STMT_SLP_TYPE (stmt_info) = loop_vect;
2297               for (gimple_stmt_iterator pi
2298                      = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2299                    !gsi_end_p (pi); gsi_next (&pi))
2300                 {
2301                   gimple *pstmt = gsi_stmt (pi);
2302                   STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2303                 }
2304             }
2305         }
2306     }
2307   /* Free optimized alias test DDRS.  */
2308   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2309   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2310   /* Reset target cost data.  */
2311   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2312   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2313     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2314   /* Reset assorted flags.  */
2315   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2316   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2317   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2318   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2319
2320   goto start_over;
2321 }
2322
2323 /* Function vect_analyze_loop.
2324
2325    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2326    for it.  The different analyses will record information in the
2327    loop_vec_info struct.  If ORIG_LOOP_VINFO is not NULL epilogue must
2328    be vectorized.  */
2329 loop_vec_info
2330 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2331 {
2332   loop_vec_info loop_vinfo;
2333   auto_vector_sizes vector_sizes;
2334
2335   /* Autodetect first vector size we try.  */
2336   current_vector_size = 0;
2337   targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2338   unsigned int next_size = 0;
2339
2340   if (dump_enabled_p ())
2341     dump_printf_loc (MSG_NOTE, vect_location,
2342                      "===== analyze_loop_nest =====\n");
2343
2344   if (loop_outer (loop)
2345       && loop_vec_info_for_loop (loop_outer (loop))
2346       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2347     {
2348       if (dump_enabled_p ())
2349         dump_printf_loc (MSG_NOTE, vect_location,
2350                          "outer-loop already vectorized.\n");
2351       return NULL;
2352     }
2353
2354   poly_uint64 autodetected_vector_size = 0;
2355   while (1)
2356     {
2357       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2358       loop_vinfo = vect_analyze_loop_form (loop);
2359       if (!loop_vinfo)
2360         {
2361           if (dump_enabled_p ())
2362             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2363                              "bad loop form.\n");
2364           return NULL;
2365         }
2366
2367       bool fatal = false;
2368
2369       if (orig_loop_vinfo)
2370         LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2371
2372       if (vect_analyze_loop_2 (loop_vinfo, fatal))
2373         {
2374           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2375
2376           return loop_vinfo;
2377         }
2378
2379       delete loop_vinfo;
2380
2381       if (next_size == 0)
2382         autodetected_vector_size = current_vector_size;
2383
2384       if (next_size < vector_sizes.length ()
2385           && known_eq (vector_sizes[next_size], autodetected_vector_size))
2386         next_size += 1;
2387
2388       if (fatal
2389           || next_size == vector_sizes.length ()
2390           || known_eq (current_vector_size, 0U))
2391         return NULL;
2392
2393       /* Try the next biggest vector size.  */
2394       current_vector_size = vector_sizes[next_size++];
2395       if (dump_enabled_p ())
2396         {
2397           dump_printf_loc (MSG_NOTE, vect_location,
2398                            "***** Re-trying analysis with "
2399                            "vector size ");
2400           dump_dec (MSG_NOTE, current_vector_size);
2401           dump_printf (MSG_NOTE, "\n");
2402         }
2403     }
2404 }
2405
2406
2407 /* Function reduction_fn_for_scalar_code
2408
2409    Input:
2410    CODE - tree_code of a reduction operations.
2411
2412    Output:
2413    REDUC_FN - the corresponding internal function to be used to reduce the
2414       vector of partial results into a single scalar result, or IFN_LAST
2415       if the operation is a supported reduction operation, but does not have
2416       such an internal function.
2417
2418    Return FALSE if CODE currently cannot be vectorized as reduction.  */
2419
2420 static bool
2421 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2422 {
2423   switch (code)
2424     {
2425       case MAX_EXPR:
2426         *reduc_fn = IFN_REDUC_MAX;
2427         return true;
2428
2429       case MIN_EXPR:
2430         *reduc_fn = IFN_REDUC_MIN;
2431         return true;
2432
2433       case PLUS_EXPR:
2434         *reduc_fn = IFN_REDUC_PLUS;
2435         return true;
2436
2437       case MULT_EXPR:
2438       case MINUS_EXPR:
2439       case BIT_IOR_EXPR:
2440       case BIT_XOR_EXPR:
2441       case BIT_AND_EXPR:
2442         *reduc_fn = IFN_LAST;
2443         return true;
2444
2445       default:
2446        return false;
2447     }
2448 }
2449
2450
2451 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2452    STMT is printed with a message MSG. */
2453
2454 static void
2455 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2456 {
2457   dump_printf_loc (msg_type, vect_location, "%s", msg);
2458   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2459 }
2460
2461
2462 /* Detect SLP reduction of the form:
2463
2464    #a1 = phi <a5, a0>
2465    a2 = operation (a1)
2466    a3 = operation (a2)
2467    a4 = operation (a3)
2468    a5 = operation (a4)
2469
2470    #a = phi <a5>
2471
2472    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2473    FIRST_STMT is the first reduction stmt in the chain
2474    (a2 = operation (a1)).
2475
2476    Return TRUE if a reduction chain was detected.  In that case the chain
        members have been linked through GROUP_FIRST_ELEMENT and
        GROUP_NEXT_ELEMENT, their operands have been swapped where needed so
        that the reduction value is the second operand, the first member has
        been pushed onto LOOP_VINFO_REDUCTION_CHAINS of LOOP_INFO and its
        GROUP_SIZE has been set to the chain length.

        Note that the group links are written while the chain is still being
        walked; the failure paths in this function return FALSE without
        resetting them.  */
2477
2478 static bool
2479 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2480                        gimple *first_stmt)
2481 {
2482   struct loop *loop = (gimple_bb (phi))->loop_father;
2483   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2484   enum tree_code code;
2485   gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2486   stmt_vec_info use_stmt_info, current_stmt_info;
2487   tree lhs;
2488   imm_use_iterator imm_iter;
2489   use_operand_p use_p;
2490   int nloop_uses, size = 0, n_out_of_loop_uses;
2491   bool found = false;
2492
       /* Only a PHI whose containing loop is the loop described by LOOP_INFO
          is handled.  */
2493   if (loop != vect_loop)
2494     return false;
2495
       /* Walk forward from the PHI result.  In every round LHS is the value
          produced by the previous chain member (initially the PHI result)
          and its in-loop use becomes the next member.  CODE is the rhs code
          that every member has to share with FIRST_STMT.  */
2496   lhs = PHI_RESULT (phi);
2497   code = gimple_assign_rhs_code (first_stmt);
2498   while (1)
2499     {
2500       nloop_uses = 0;
2501       n_out_of_loop_uses = 0;
2502       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2503         {
2504           gimple *use_stmt = USE_STMT (use_p);
2505           if (is_gimple_debug (use_stmt))
2506             continue;
2507
2508           /* Check if we got back to the reduction phi.  */
2509           if (use_stmt == phi)
2510             {
2511               loop_use_stmt = use_stmt;
2512               found = true;
2513               break;
2514             }
2515
2516           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2517             {
2518               loop_use_stmt = use_stmt;
2519               nloop_uses++;
2520             }
2521            else
2522              n_out_of_loop_uses++;
2523
2524            /* There can be either a single use in the loop or two uses in
2525               phi nodes.  */
2526            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2527              return false;
2528         }
2529
           /* LHS feeds the reduction PHI again, so the cycle is closed.  */
2530       if (found)
2531         break;
2532
2533       /* We reached a statement with no loop uses.  */
2534       if (nloop_uses == 0)
2535         return false;
2536
2537       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2538       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2539         return false;
2540
           /* The next chain member must be an assignment inside the loop that
              performs the same operation as FIRST_STMT.  */
2541       if (!is_gimple_assign (loop_use_stmt)
2542           || code != gimple_assign_rhs_code (loop_use_stmt)
2543           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2544         return false;
2545
2546       /* Insert USE_STMT into reduction chain.  */
           /* The first member is its own GROUP_FIRST_ELEMENT; every later
              member is appended through GROUP_NEXT_ELEMENT of its predecessor
              and inherits the predecessor's GROUP_FIRST_ELEMENT.  */
2547       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2548       if (current_stmt)
2549         {
2550           current_stmt_info = vinfo_for_stmt (current_stmt);
2551           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2552           GROUP_FIRST_ELEMENT (use_stmt_info)
2553             = GROUP_FIRST_ELEMENT (current_stmt_info);
2554         }
2555       else
2556         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2557
2558       lhs = gimple_assign_lhs (loop_use_stmt);
2559       current_stmt = loop_use_stmt;
2560       size++;
2561    }
2562
       /* A chain is only recorded when the walk ended at PHI and collected at
          least two statements.  */
2563   if (!found || loop_use_stmt != phi || size < 2)
2564     return false;
2565
2566   /* Swap the operands, if needed, to make the reduction operand be the second
2567      operand.  */
       /* LHS again tracks the value coming from the previous chain member,
          starting with the PHI result.  */
2568   lhs = PHI_RESULT (phi);
2569   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2570   while (next_stmt)
2571     {
2572       if (gimple_assign_rhs2 (next_stmt) == lhs)
2573         {
               /* The reduction value already is the second operand; only the
                  other operand (rhs1) has to be validated.  */
2574           tree op = gimple_assign_rhs1 (next_stmt);
2575           gimple *def_stmt = NULL;
2576
2577           if (TREE_CODE (op) == SSA_NAME)
2578             def_stmt = SSA_NAME_DEF_STMT (op);
2579
2580           /* Check that the other def is either defined in the loop
2581              ("vect_internal_def"), or it's an induction (defined by a
2582              loop-header phi-node).  */
2583           if (def_stmt
2584               && gimple_bb (def_stmt)
2585               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2586               && (is_gimple_assign (def_stmt)
2587                   || is_gimple_call (def_stmt)
2588                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2589                            == vect_induction_def
2590                   || (gimple_code (def_stmt) == GIMPLE_PHI
2591                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2592                                   == vect_internal_def
2593                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2594             {
2595               lhs = gimple_assign_lhs (next_stmt);
2596               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2597               continue;
2598             }
2599
2600           return false;
2601         }
2602       else
2603         {
               /* Here the reduction value is expected to be the first operand,
                  so the other operand is rhs2.  It is validated in the same
                  way as above and the two operands are then exchanged.  */
2604           tree op = gimple_assign_rhs2 (next_stmt);
2605           gimple *def_stmt = NULL;
2606
2607           if (TREE_CODE (op) == SSA_NAME)
2608             def_stmt = SSA_NAME_DEF_STMT (op);
2609
2610           /* Check that the other def is either defined in the loop
2611             ("vect_internal_def"), or it's an induction (defined by a
2612             loop-header phi-node).  */
2613           if (def_stmt
2614               && gimple_bb (def_stmt)
2615               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2616               && (is_gimple_assign (def_stmt)
2617                   || is_gimple_call (def_stmt)
2618                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2619                               == vect_induction_def
2620                   || (gimple_code (def_stmt) == GIMPLE_PHI
2621                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2622                                   == vect_internal_def
2623                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2624             {
2625               if (dump_enabled_p ())
2626                 {
2627                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2628                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2629                 }
2630
2631               swap_ssa_operands (next_stmt,
2632                                  gimple_assign_rhs1_ptr (next_stmt),
2633                                  gimple_assign_rhs2_ptr (next_stmt));
2634               update_stmt (next_stmt);
2635
                   /* The swap moved a constant into the first operand position;
                      record in LOOP_INFO that operands have been swapped.  */
2636               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2637                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2638             }
2639           else
2640             return false;
2641         }
2642
2643       lhs = gimple_assign_lhs (next_stmt);
2644       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2645     }
2646
2647   /* Save the chain for further analysis in SLP detection.  */
2648   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2649   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2650   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2651
2652   return true;
2653 }
2654
2655
2656 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2657    reduction operation CODE has a handled computation expression.

        The function first searches, depth first over the SSA use operands of
        definitions located inside LOOP, for a path that leads from LOOP_ARG
        back to the result of PHI.  It then checks every step of that path:
        the value must have a single use, the statement using it must be an
        assignment, and the rhs code of that assignment must be CODE.  When
        CODE is PLUS_EXPR a MINUS_EXPR is accepted as well, but the path is
        rejected if the reduction value is the subtrahend an odd number of
        times.  LOC is only used for dump output.  */
2658
2659 bool
2660 check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg,
2661                       enum tree_code code)
2662 {
       /* PATH is the stack of the search: every entry pairs a use operand
          with the state of the operand iterator that produced it, so that
          iteration can be resumed after backtracking.  VISITED holds the
          SSA name versions that have already been considered.  */
2663   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2664   auto_bitmap visited;
2665   tree lookfor = PHI_RESULT (phi);
2666   ssa_op_iter curri;
       /* Position CURR on the argument of PHI that equals LOOP_ARG.  */
2667   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2668   while (USE_FROM_PTR (curr) != loop_arg)
2669     curr = op_iter_next_use (&curri);
       /* NOTE(review): presumably this marks the PHI iterator as exhausted so
          that backtracking to this root entry yields no further PHI
          arguments -- confirm against the ssa_op_iter implementation.  */
2670   curri.i = curri.numops;
2671   do
2672     {
           /* Each round pushes the operand under consideration.  The search
              stops when that operand is the PHI result or when backtracking
              has emptied PATH.  */
2673       path.safe_push (std::make_pair (curri, curr));
2674       tree use = USE_FROM_PTR (curr);
2675       if (use == lookfor)
2676         break;
2677       gimple *def = SSA_NAME_DEF_STMT (use);
2678       if (gimple_nop_p (def)
2679           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2680         {
               /* USE has no definition inside LOOP, which makes it a dead end.
                  Backtrack: drop entries from PATH until one of them still
                  offers an operand that has not been looked at.  This label
                  is also the target of the goto in the else branch below.  */
2681 pop:
2682           do
2683             {
2684               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2685               curri = x.first;
2686               curr = x.second;
2687               do
2688                 curr = op_iter_next_use (&curri);
2689               /* Skip already visited or non-SSA operands (from iterating
2690                  over PHI args).  */
2691               while (curr != NULL_USE_OPERAND_P
2692                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2693                          || ! bitmap_set_bit (visited,
2694                                               SSA_NAME_VERSION
2695                                                 (USE_FROM_PTR (curr)))));
2696             }
2697           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
               /* Nothing is left to try: PATH is empty and the search has
                  failed.  */
2698           if (curr == NULL_USE_OPERAND_P)
2699             break;
2700         }
2701       else
2702         {
               /* Descend into DEF: iterate over its use operands and take the
                  first SSA name that has not been seen before.  */
2703           if (gimple_code (def) == GIMPLE_PHI)
2704             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2705           else
2706             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2707           while (curr != NULL_USE_OPERAND_P
2708                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2709                      || ! bitmap_set_bit (visited,
2710                                           SSA_NAME_VERSION
2711                                             (USE_FROM_PTR (curr)))))
2712             curr = op_iter_next_use (&curri);
               /* DEF offers no such operand, so backtrack.  */
2713           if (curr == NULL_USE_OPERAND_P)
2714             goto pop;
2715         }
2716     }
2717   while (1);
2718   if (dump_file && (dump_flags & TDF_DETAILS))
2719     {
2720       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2721       unsigned i;
2722       std::pair<ssa_op_iter, use_operand_p> *x;
2723       FOR_EACH_VEC_ELT (path, i, x)
2724         {
2725           dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
2726           dump_printf (MSG_NOTE, " ");
2727         }
2728       dump_printf (MSG_NOTE, "\n");
2729     }
2730
2731   /* Check whether the reduction path detected is valid.  */
       /* An empty PATH means that no path back to the PHI result was found.
          Entry 0 is the use in PHI itself and is not checked; for every
          other entry OP is the value on the path and USE_STMT the statement
          consuming it.  */
2732   bool fail = path.length () == 0;
2733   bool neg = false;
2734   for (unsigned i = 1; i < path.length (); ++i)
2735     {
2736       gimple *use_stmt = USE_STMT (path[i].second);
2737       tree op = USE_FROM_PTR (path[i].second);
2738       if (! has_single_use (op)
2739           || ! is_gimple_assign (use_stmt))
2740         {
2741           fail = true;
2742           break;
2743         }
2744       if (gimple_assign_rhs_code (use_stmt) != code)
2745         {
2746           if (code == PLUS_EXPR
2747               && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2748             {
2749               /* Track whether we negate the reduction value each iteration.  */
2750               if (gimple_assign_rhs2 (use_stmt) == op)
2751                 neg = ! neg;
2752             }
2753           else
2754             {
2755               fail = true;
2756               break;
2757             }
2758         }
2759     }
       /* The path is accepted only if every step passed and the reduction
          value is not left negated.  */
2760   return ! fail && ! neg;
2761 }
2762
2763
2764 /* Function vect_is_simple_reduction
2765
2766    (1) Detect a cross-iteration def-use cycle that represents a simple
2767    reduction computation.  We look for the following pattern:
2768
2769    loop_header:
2770      a1 = phi < a0, a2 >
2771      a3 = ...
2772      a2 = operation (a3, a1)
2773
2774    or
2775
2776    a3 = ...
2777    loop_header:
2778      a1 = phi < a0, a2 >
2779      a2 = operation (a3, a1)
2780
2781    such that:
2782    1. operation is commutative and associative and it is safe to
2783       change the order of the computation
2784    2. no uses for a2 in the loop (a2 is used out of the loop)
2785    3. no uses of a1 in the loop besides the reduction operation
2786    4. no uses of a1 outside the loop.
2787
2788    Conditions 1,4 are tested here.
2789    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2790
2791    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2792    nested cycles.
2793
2794    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2795    reductions:
2796
2797      a1 = phi < a0, a2 >
2798      inner loop (def of a3)
2799      a2 = phi < a3 >
2800
2801    (4) Detect condition expressions, ie:
2802      for (int i = 0; i < N; i++)
2803        if (a[i] < val)
2804         ret_val = a[i];
2805
2806 */
2807
2808 static gimple *
2809 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2810                           bool *double_reduc,
2811                           bool need_wrapping_integral_overflow,
2812                           enum vect_reduction_type *v_reduc_type)
2813 {
2814   struct loop *loop = (gimple_bb (phi))->loop_father;
2815   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2816   gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2817   enum tree_code orig_code, code;
2818   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2819   tree type;
2820   int nloop_uses;
2821   tree name;
2822   imm_use_iterator imm_iter;
2823   use_operand_p use_p;
2824   bool phi_def;
2825
2826   *double_reduc = false;
2827   *v_reduc_type = TREE_CODE_REDUCTION;
2828
2829   tree phi_name = PHI_RESULT (phi);
2830   /* ???  If there are no uses of the PHI result the inner loop reduction
2831      won't be detected as possibly double-reduction by vectorizable_reduction
2832      because that tries to walk the PHI arg from the preheader edge which
2833      can be constant.  See PR60382.  */
2834   if (has_zero_uses (phi_name))
2835     return NULL;
2836   nloop_uses = 0;
2837   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2838     {
2839       gimple *use_stmt = USE_STMT (use_p);
2840       if (is_gimple_debug (use_stmt))
2841         continue;
2842
2843       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2844         {
2845           if (dump_enabled_p ())
2846             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2847                              "intermediate value used outside loop.\n");
2848
2849           return NULL;
2850         }
2851
2852       nloop_uses++;
2853       if (nloop_uses > 1)
2854         {
2855           if (dump_enabled_p ())
2856             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2857                              "reduction value used in loop.\n");
2858           return NULL;
2859         }
2860
2861       phi_use_stmt = use_stmt;
2862     }
2863
2864   edge latch_e = loop_latch_edge (loop);
2865   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2866   if (TREE_CODE (loop_arg) != SSA_NAME)
2867     {
2868       if (dump_enabled_p ())
2869         {
2870           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2871                            "reduction: not ssa_name: ");
2872           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2873           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2874         }
2875       return NULL;
2876     }
2877
2878   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2879   if (is_gimple_assign (def_stmt))
2880     {
2881       name = gimple_assign_lhs (def_stmt);
2882       phi_def = false;
2883     }
2884   else if (gimple_code (def_stmt) == GIMPLE_PHI)
2885     {
2886       name = PHI_RESULT (def_stmt);
2887       phi_def = true;
2888     }
2889   else
2890     {
2891       if (dump_enabled_p ())
2892         {
2893           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2894                            "reduction: unhandled reduction operation: ");
2895           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
2896         }
2897       return NULL;
2898     }
2899
2900   if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
2901     return NULL;
2902
2903   nloop_uses = 0;
2904   auto_vec<gphi *, 3> lcphis;
2905   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2906     {
2907       gimple *use_stmt = USE_STMT (use_p);
2908       if (is_gimple_debug (use_stmt))
2909         continue;
2910       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2911         nloop_uses++;
2912       else
2913         /* We can have more than one loop-closed PHI.  */
2914         lcphis.safe_push (as_a <gphi *> (use_stmt));
2915       if (nloop_uses > 1)
2916         {
2917           if (dump_enabled_p ())
2918             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2919                              "reduction used in loop.\n");
2920           return NULL;
2921         }
2922     }
2923
2924   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2925      defined in the inner loop.  */
2926   if (phi_def)
2927     {
2928       op1 = PHI_ARG_DEF (def_stmt, 0);
2929
2930       if (gimple_phi_num_args (def_stmt) != 1
2931           || TREE_CODE (op1) != SSA_NAME)
2932         {
2933           if (dump_enabled_p ())
2934             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2935                              "unsupported phi node definition.\n");
2936
2937           return NULL;
2938         }
2939
2940       def1 = SSA_NAME_DEF_STMT (op1);
2941       if (gimple_bb (def1)
2942           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2943           && loop->inner
2944           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2945           && is_gimple_assign (def1)
2946           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2947         {
2948           if (dump_enabled_p ())
2949             report_vect_op (MSG_NOTE, def_stmt,
2950                             "detected double reduction: ");
2951
2952           *double_reduc = true;
2953           return def_stmt;
2954         }
2955
2956       return NULL;
2957     }
2958
2959   /* If we are vectorizing an inner reduction we are executing that
2960      in the original order only in case we are not dealing with a
2961      double reduction.  */
2962   bool check_reduction = true;
2963   if (flow_loop_nested_p (vect_loop, loop))
2964     {
2965       gphi *lcphi;
2966       unsigned i;
2967       check_reduction = false;
2968       FOR_EACH_VEC_ELT (lcphis, i, lcphi)
2969         FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
2970           {
2971             gimple *use_stmt = USE_STMT (use_p);
2972             if (is_gimple_debug (use_stmt))
2973               continue;
2974             if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
2975               check_reduction = true;
2976           }
2977     }
2978
2979   bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2980   code = orig_code = gimple_assign_rhs_code (def_stmt);
2981
2982   /* We can handle "res -= x[i]", which is non-associative by
2983      simply rewriting this into "res += -x[i]".  Avoid changing
2984      gimple instruction for the first simple tests and only do this
2985      if we're allowed to change code at all.  */
2986   if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
2987     code = PLUS_EXPR;
2988
2989   if (code == COND_EXPR)
2990     {
2991       if (! nested_in_vect_loop)
2992         *v_reduc_type = COND_REDUCTION;
2993
2994       op3 = gimple_assign_rhs1 (def_stmt);
2995       if (COMPARISON_CLASS_P (op3))
2996         {
2997           op4 = TREE_OPERAND (op3, 1);
2998           op3 = TREE_OPERAND (op3, 0);
2999         }
3000       if (op3 == phi_name || op4 == phi_name)
3001         {
3002           if (dump_enabled_p ())
3003             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3004                             "reduction: condition depends on previous"
3005                             " iteration: ");
3006           return NULL;
3007         }
3008
3009       op1 = gimple_assign_rhs2 (def_stmt);
3010       op2 = gimple_assign_rhs3 (def_stmt);
3011     }
3012   else if (!commutative_tree_code (code) || !associative_tree_code (code))
3013     {
3014       if (dump_enabled_p ())
3015         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3016                         "reduction: not commutative/associative: ");
3017       return NULL;
3018     }
3019   else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3020     {
3021       op1 = gimple_assign_rhs1 (def_stmt);
3022       op2 = gimple_assign_rhs2 (def_stmt);
3023     }
3024   else
3025     {
3026       if (dump_enabled_p ())
3027         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3028                         "reduction: not handled operation: ");
3029       return NULL;
3030     }
3031
3032   if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3033     {
3034       if (dump_enabled_p ())
3035         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3036                         "reduction: both uses not ssa_names: ");
3037
3038       return NULL;
3039     }
3040
3041   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3042   if ((TREE_CODE (op1) == SSA_NAME
3043        && !types_compatible_p (type,TREE_TYPE (op1)))
3044       || (TREE_CODE (op2) == SSA_NAME
3045           && !types_compatible_p (type, TREE_TYPE (op2)))
3046       || (op3 && TREE_CODE (op3) == SSA_NAME
3047           && !types_compatible_p (type, TREE_TYPE (op3)))
3048       || (op4 && TREE_CODE (op4) == SSA_NAME
3049           && !types_compatible_p (type, TREE_TYPE (op4))))
3050     {
3051       if (dump_enabled_p ())
3052         {
3053           dump_printf_loc (MSG_NOTE, vect_location,
3054                            "reduction: multiple types: operation type: ");
3055           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3056           dump_printf (MSG_NOTE, ", operands types: ");
3057           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3058                              TREE_TYPE (op1));
3059           dump_printf (MSG_NOTE, ",");
3060           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3061                              TREE_TYPE (op2));
3062           if (op3)
3063             {
3064               dump_printf (MSG_NOTE, ",");
3065               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3066                                  TREE_TYPE (op3));
3067             }
3068
3069           if (op4)
3070             {
3071               dump_printf (MSG_NOTE, ",");
3072               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3073                                  TREE_TYPE (op4));
3074             }
3075           dump_printf (MSG_NOTE, "\n");
3076         }
3077
3078       return NULL;
3079     }
3080
3081   /* Check that it's ok to change the order of the computation.
3082      Generally, when vectorizing a reduction we change the order of the
3083      computation.  This may change the behavior of the program in some
3084      cases, so we need to check that this is ok.  One exception is when
3085      vectorizing an outer-loop: the inner-loop is executed sequentially,
3086      and therefore vectorizing reductions in the inner-loop during
3087      outer-loop vectorization is safe.  */
3088
3089   if (*v_reduc_type != COND_REDUCTION
3090       && check_reduction)
3091     {
3092       /* CHECKME: check for !flag_finite_math_only too?  */
3093       if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math)
3094         {
3095           /* Changing the order of operations changes the semantics.  */
3096           if (dump_enabled_p ())
3097             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3098                         "reduction: unsafe fp math optimization: ");
3099           return NULL;
3100         }
3101       else if (INTEGRAL_TYPE_P (type))
3102         {
3103           if (!operation_no_trapping_overflow (type, code))
3104             {
3105               /* Changing the order of operations changes the semantics.  */
3106               if (dump_enabled_p ())
3107                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3108                                 "reduction: unsafe int math optimization"
3109                                 " (overflow traps): ");
3110               return NULL;
3111             }
3112           if (need_wrapping_integral_overflow
3113               && !TYPE_OVERFLOW_WRAPS (type)
3114               && operation_can_overflow (code))
3115             {
3116               /* Changing the order of operations changes the semantics.  */
3117               if (dump_enabled_p ())
3118                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3119                                 "reduction: unsafe int math optimization"
3120                                 " (overflow doesn't wrap): ");
3121               return NULL;
3122             }
3123         }
3124       else if (SAT_FIXED_POINT_TYPE_P (type))
3125         {
3126           /* Changing the order of operations changes the semantics.  */
3127           if (dump_enabled_p ())
3128           report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3129                           "reduction: unsafe fixed-point math optimization: ");
3130           return NULL;
3131         }
3132     }
3133
3134   /* Reduction is safe. We're dealing with one of the following:
3135      1) integer arithmetic and no trapv
3136      2) floating point arithmetic, and special flags permit this optimization
3137      3) nested cycle (i.e., outer loop vectorization).  */
3138   if (TREE_CODE (op1) == SSA_NAME)
3139     def1 = SSA_NAME_DEF_STMT (op1);
3140
3141   if (TREE_CODE (op2) == SSA_NAME)
3142     def2 = SSA_NAME_DEF_STMT (op2);
3143
3144   if (code != COND_EXPR
3145       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3146     {
3147       if (dump_enabled_p ())
3148         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3149       return NULL;
3150     }
3151
3152   /* Check that one def is the reduction def, defined by PHI,
3153      the other def is either defined in the loop ("vect_internal_def"),
3154      or it's an induction (defined by a loop-header phi-node).  */
3155
3156   if (def2 && def2 == phi
3157       && (code == COND_EXPR
3158           || !def1 || gimple_nop_p (def1)
3159           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3160           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3161               && (is_gimple_assign (def1)
3162                   || is_gimple_call (def1)
3163                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3164                       == vect_induction_def
3165                   || (gimple_code (def1) == GIMPLE_PHI
3166                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3167                           == vect_internal_def
3168                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
3169     {
3170       if (dump_enabled_p ())
3171         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3172       return def_stmt;
3173     }
3174
3175   if (def1 && def1 == phi
3176       && (code == COND_EXPR
3177           || !def2 || gimple_nop_p (def2)
3178           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3179           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3180               && (is_gimple_assign (def2)
3181                   || is_gimple_call (def2)
3182                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3183                        == vect_induction_def
3184                   || (gimple_code (def2) == GIMPLE_PHI
3185                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3186                            == vect_internal_def
3187                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
3188     {
3189       if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3190         {
3191           /* Check if we can swap operands (just for simplicity - so that
3192              the rest of the code can assume that the reduction variable
3193              is always the last (second) argument).  */
3194           if (code == COND_EXPR)
3195             {
3196               /* Swap cond_expr by inverting the condition.  */
3197               tree cond_expr = gimple_assign_rhs1 (def_stmt);
3198               enum tree_code invert_code = ERROR_MARK;
3199               enum tree_code cond_code = TREE_CODE (cond_expr);
3200
3201               if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3202                 {
3203                   bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3204                   invert_code = invert_tree_comparison (cond_code, honor_nans);
3205                 }
3206               if (invert_code != ERROR_MARK)
3207                 {
3208                   TREE_SET_CODE (cond_expr, invert_code);
3209                   swap_ssa_operands (def_stmt,
3210                                      gimple_assign_rhs2_ptr (def_stmt),
3211                                      gimple_assign_rhs3_ptr (def_stmt));
3212                 }
3213               else
3214                 {
3215                   if (dump_enabled_p ())
3216                     report_vect_op (MSG_NOTE, def_stmt,
3217                                     "detected reduction: cannot swap operands "
3218                                     "for cond_expr");
3219                   return NULL;
3220                 }
3221             }
3222           else
3223             swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3224                                gimple_assign_rhs2_ptr (def_stmt));
3225
3226           if (dump_enabled_p ())
3227             report_vect_op (MSG_NOTE, def_stmt,
3228                             "detected reduction: need to swap operands: ");
3229
3230           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3231             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3232         }
3233       else
3234         {
3235           if (dump_enabled_p ())
3236             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3237         }
3238
3239       return def_stmt;
3240     }
3241
3242   /* Try to find SLP reduction chain.  */
3243   if (! nested_in_vect_loop
3244       && code != COND_EXPR
3245       && orig_code != MINUS_EXPR
3246       && vect_is_slp_reduction (loop_info, phi, def_stmt))
3247     {
3248       if (dump_enabled_p ())
3249         report_vect_op (MSG_NOTE, def_stmt,
3250                         "reduction: detected reduction chain: ");
3251
3252       return def_stmt;
3253     }
3254
3255   /* Dissolve group eventually half-built by vect_is_slp_reduction.  */
3256   gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3257   while (first)
3258     {
3259       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3260       GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3261       GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3262       first = next;
3263     }
3264
3265   /* Look for the expression computing loop_arg from loop PHI result.  */
3266   if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3267                             code))
3268     return def_stmt;
3269
3270   if (dump_enabled_p ())
3271     {
3272       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3273                       "reduction: unknown pattern: ");
3274     }
3275
3276   return NULL;
3277 }
3278
3279 /* Wrapper around vect_is_simple_reduction, which will modify code
3280    in-place if it enables detection of more reductions.  Arguments
3281    as there.  */
3282
3283 gimple *
3284 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3285                              bool *double_reduc,
3286                              bool need_wrapping_integral_overflow)
3287 {
3288   enum vect_reduction_type v_reduc_type;
3289   gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3290                                           need_wrapping_integral_overflow,
3291                                           &v_reduc_type);
3292   if (def)
3293     {
3294       stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3295       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3296       STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3297       reduc_def_info = vinfo_for_stmt (def);
3298       STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3299     }
3300   return def;
3301 }
3302
3303 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3304 int
3305 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3306                              int *peel_iters_epilogue,
3307                              stmt_vector_for_cost *scalar_cost_vec,
3308                              stmt_vector_for_cost *prologue_cost_vec,
3309                              stmt_vector_for_cost *epilogue_cost_vec)
3310 {
3311   int retval = 0;
3312   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3313
3314   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3315     {
3316       *peel_iters_epilogue = assumed_vf / 2;
3317       if (dump_enabled_p ())
3318         dump_printf_loc (MSG_NOTE, vect_location,
3319                          "cost model: epilogue peel iters set to vf/2 "
3320                          "because loop iterations are unknown .\n");
3321
3322       /* If peeled iterations are known but number of scalar loop
3323          iterations are unknown, count a taken branch per peeled loop.  */
3324       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3325                                  NULL, 0, vect_prologue);
3326       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3327                                  NULL, 0, vect_epilogue);
3328     }
3329   else
3330     {
3331       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3332       peel_iters_prologue = niters < peel_iters_prologue ?
3333                             niters : peel_iters_prologue;
3334       *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3335       /* If we need to peel for gaps, but no peeling is required, we have to
3336          peel VF iterations.  */
3337       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3338         *peel_iters_epilogue = assumed_vf;
3339     }
3340
3341   stmt_info_for_cost *si;
3342   int j;
3343   if (peel_iters_prologue)
3344     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3345         {
3346           stmt_vec_info stmt_info
3347             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3348           retval += record_stmt_cost (prologue_cost_vec,
3349                                       si->count * peel_iters_prologue,
3350                                       si->kind, stmt_info, si->misalign,
3351                                       vect_prologue);
3352         }
3353   if (*peel_iters_epilogue)
3354     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3355         {
3356           stmt_vec_info stmt_info
3357             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3358           retval += record_stmt_cost (epilogue_cost_vec,
3359                                       si->count * *peel_iters_epilogue,
3360                                       si->kind, stmt_info, si->misalign,
3361                                       vect_epilogue);
3362         }
3363
3364   return retval;
3365 }
3366
3367 /* Function vect_estimate_min_profitable_iters
3368
3369    Compute the number of iterations required for the vector version of the
3370    loop to be profitable relative to the cost of the scalar version of the
3371    loop.  Both results are 0 when the cost model is disabled.
3372
3373    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3374    of iterations for vectorization.  -1 value means loop vectorization
3375    is not profitable.  This returned value may be used for dynamic
3376    profitability check.
3377
3378    *RET_MIN_PROFITABLE_ESTIMATE is the threshold for the static check against
3379    the estimated number of iterations; it is likewise -1 if unprofitable.  */
3380
3381 static void
3382 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3383                                     int *ret_min_profitable_niters,
3384                                     int *ret_min_profitable_estimate)
3385 {
3386   int min_profitable_iters;  /* Runtime threshold.  */
3387   int min_profitable_estimate;  /* Static threshold.  */
3388   int peel_iters_prologue;
3389   int peel_iters_epilogue;
3390   unsigned vec_inside_cost = 0;
3391   int vec_outside_cost = 0;  /* Prologue plus epilogue cost.  */
3392   unsigned vec_prologue_cost = 0;
3393   unsigned vec_epilogue_cost = 0;
3394   int scalar_single_iter_cost = 0;
3395   int scalar_outside_cost = 0;
3396   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3397   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3398   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3399
3400   /* Cost model disabled.  */
3401   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3402     {
3403       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3404       *ret_min_profitable_niters = 0;
3405       *ret_min_profitable_estimate = 0;
3406       return;
3407     }
3408
3409   /* Requires loop versioning tests to handle misalignment.  */
3410   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3411     {
3412       /*  FIXME: Make cost depend on complexity of individual check.  */
3413       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3414       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3415                             vect_prologue);
3416       dump_printf (MSG_NOTE,
3417                    "cost model: Adding cost of checks for loop "
3418                    "versioning to treat misalignment.\n");
3419     }
3420
3421   /* Requires loop versioning with alias checks.  */
3422   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3423     {
3424       /*  FIXME: Make cost depend on complexity of individual check.  */
3425       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3426       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3427                             vect_prologue);
3428       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3429       if (len)
3430         /* Count LEN - 1 ANDs and LEN comparisons.  */
3431         (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3432                               NULL, 0, vect_prologue);
3433       dump_printf (MSG_NOTE,
3434                    "cost model: Adding cost of checks for loop "
3435                    "versioning aliasing.\n");
3436     }
3437
3438   /* Requires loop versioning with niter checks.  */
3439   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3440     {
3441       /*  FIXME: Make cost depend on complexity of individual check.  */
3442       (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3443                             vect_prologue);
3444       dump_printf (MSG_NOTE,
3445                    "cost model: Adding cost of checks for loop "
3446                    "versioning niters.\n");
3447     }
3448
3449   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3450     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3451                           vect_prologue);
3452
3453   /* Count statements in scalar loop.  Using this as scalar cost for a single
3454      iteration for now.
3455
3456      TODO: Add outer loop support.
3457
3458      TODO: Consider assigning different costs to different scalar
3459      statements.  */
3460
3461   scalar_single_iter_cost
3462     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3463
3464   /* Add additional cost for the peeled instructions in prologue and epilogue
3465      loop.  A negative NPEEL is handled as an unknown prologue peel count.
3466
3467      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3468      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3469
3470      TODO: Build an expression that represents peel_iters for prologue and
3471      epilogue to be used in a run-time test.  */
3472
3473   if (npeel  < 0)
3474     {
3475       peel_iters_prologue = assumed_vf / 2;
3476       dump_printf (MSG_NOTE, "cost model: "
3477                    "prologue peel iters set to vf/2.\n");
3478
3479       /* If peeling for alignment is unknown, loop bound of main loop becomes
3480          unknown.  */
3481       peel_iters_epilogue = assumed_vf / 2;
3482       dump_printf (MSG_NOTE, "cost model: "
3483                    "epilogue peel iters set to vf/2 because "
3484                    "peeling for alignment is unknown.\n");
3485
3486       /* If peeled iterations are unknown, count a taken branch and a not taken
3487          branch per peeled loop. Even if scalar loop iterations are known,
3488          vector iterations are not known since peeled prologue iterations are
3489          not known. Hence guards remain the same.  */
3490       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3491                             NULL, 0, vect_prologue);
3492       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3493                             NULL, 0, vect_prologue);
3494       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3495                             NULL, 0, vect_epilogue);
3496       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3497                             NULL, 0, vect_epilogue);
3498       stmt_info_for_cost *si;
3499       int j;
3500       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3501         {
3502           struct _stmt_vec_info *stmt_info
3503             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3504           (void) add_stmt_cost (target_cost_data,
3505                                 si->count * peel_iters_prologue,
3506                                 si->kind, stmt_info, si->misalign,
3507                                 vect_prologue);
3508           (void) add_stmt_cost (target_cost_data,
3509                                 si->count * peel_iters_epilogue,
3510                                 si->kind, stmt_info, si->misalign,
3511                                 vect_epilogue);
3512         }
3513     }
3514   else
3515     {
3516       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3517       stmt_info_for_cost *si;
3518       int j;
3519       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3520
3521       prologue_cost_vec.create (2);
3522       epilogue_cost_vec.create (2);
3523       peel_iters_prologue = npeel;
3524
3525       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3526                                           &peel_iters_epilogue,
3527                                           &LOOP_VINFO_SCALAR_ITERATION_COST
3528                                             (loop_vinfo),
3529                                           &prologue_cost_vec,
3530                                           &epilogue_cost_vec);
3531
3532       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3533         {
3534           struct _stmt_vec_info *stmt_info
3535             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3536           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3537                                 si->misalign, vect_prologue);
3538         }
3539
3540       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3541         {
3542           struct _stmt_vec_info *stmt_info
3543             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3544           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3545                                 si->misalign, vect_epilogue);
3546         }
3547
3548       prologue_cost_vec.release ();
3549       epilogue_cost_vec.release ();
3550     }
3551
3552   /* FORNOW: The scalar outside cost is incremented in one of the
3553      following ways:
3554
3555      1. The vectorizer checks for alignment and aliasing and generates
3556      a condition that allows dynamic vectorization.  A cost model
3557      check is ANDED with the versioning condition.  Hence scalar code
3558      path now has the added cost of the versioning check.
3559
3560        if (cost > th & versioning_check)
3561          jmp to vector code
3562
3563      Hence run-time scalar is incremented by not-taken branch cost.
3564
3565      2. The vectorizer then checks if a prologue is required.  If the
3566      cost model check was not done before during versioning, it has to
3567      be done before the prologue check.
3568
3569        if (cost <= th)
3570          prologue = scalar_iters
3571        if (prologue == 0)
3572          jmp to vector code
3573        else
3574          execute prologue
3575        if (prologue == num_iters)
3576          go to exit
3577
3578      Hence the run-time scalar cost is incremented by a taken branch,
3579      plus a not-taken branch, plus a taken branch cost.
3580
3581      3. The vectorizer then checks if an epilogue is required.  If the
3582      cost model check was not done before during prologue check, it
3583      has to be done with the epilogue check.
3584
3585        if (prologue == 0)
3586          jmp to vector code
3587        else
3588          execute prologue
3589        if (prologue == num_iters)
3590          go to exit
3591        vector code:
3592          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3593            jmp to epilogue
3594
3595      Hence the run-time scalar cost should be incremented by 2 taken
3596      branches.
3597
3598      TODO: The back end may reorder the BBs differently and reverse
3599      conditions/branch directions.  Change the estimates below to
3600      something more reasonable.  */
3601
3602   /* If the number of iterations is known and we do not do versioning, we can
3603      decide whether to vectorize at compile time.  Hence the scalar version
3604      does not carry cost model guard costs.  */
3605   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3606       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3607     {
3608       /* Cost model check occurs at versioning.  */
3609       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3610         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3611       else
3612         {
3613           /* Cost model check occurs at prologue generation.  */
3614           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3615             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3616               + vect_get_stmt_cost (cond_branch_not_taken);
3617           /* Cost model check occurs at epilogue generation.  */
3618           else
3619             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3620         }
3621     }
3622
3623   /* Complete the target-specific cost calculations.  */
3624   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3625                &vec_inside_cost, &vec_epilogue_cost);
3626
3627   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3628
3629   if (dump_enabled_p ())
3630     {
3631       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3632       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3633                    vec_inside_cost);
3634       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3635                    vec_prologue_cost);
3636       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3637                    vec_epilogue_cost);
3638       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3639                    scalar_single_iter_cost);
3640       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3641                    scalar_outside_cost);
3642       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3643                    vec_outside_cost);
3644       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3645                    peel_iters_prologue);
3646       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3647                    peel_iters_epilogue);
3648     }
3649
3650   /* Calculate number of iterations required to make the vector version
3651      profitable, relative to the loop bodies only.  This must hold:
3652      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3653      where SIC = scalar iteration cost, VIC = vector iteration cost,
3654      VOC = vector outside cost, VF = vectorization factor,
3655      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3656      SOC = scalar outside cost for run time cost model check.
3657      NOTE(review): vec_inside_cost is unsigned, so the quotient below is
3658      done in unsigned arithmetic; confirm the numerator cannot be negative.  */
3659
3660   if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3661     {
3662       if (vec_outside_cost <= 0)
3663         min_profitable_iters = 0;
3664       else
3665         {
3666           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3667                                   * assumed_vf
3668                                   - vec_inside_cost * peel_iters_prologue
3669                                   - vec_inside_cost * peel_iters_epilogue)
3670                                  / ((scalar_single_iter_cost * assumed_vf)
3671                                     - vec_inside_cost);
3672
3673           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3674               <= (((int) vec_inside_cost * min_profitable_iters)
3675                   + (((int) vec_outside_cost - scalar_outside_cost)
3676                      * assumed_vf)))
3677             min_profitable_iters++;
3678         }
3679     }
3680   /* Vector version will never be profitable.  */
3681   else
3682     {
3683       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3684         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3685                     "did not happen for a simd loop");
3686
3687       if (dump_enabled_p ())
3688         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3689                          "cost model: the vector iteration cost = %d "
3690                          "divided by the scalar iteration cost = %d "
3691                          "is greater or equal to the vectorization factor = %d"
3692                          ".\n",
3693                          vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3694       *ret_min_profitable_niters = -1;
3695       *ret_min_profitable_estimate = -1;
3696       return;
3697     }
3698
3699   dump_printf (MSG_NOTE,
3700                "  Calculated minimum iters for profitability: %d\n",
3701                min_profitable_iters);
3702
3703   /* We want the vectorized loop to execute at least once.  */
3704   if (min_profitable_iters < (assumed_vf + peel_iters_prologue))
3705     min_profitable_iters = assumed_vf + peel_iters_prologue;
3706
3707   if (dump_enabled_p ())
3708     dump_printf_loc (MSG_NOTE, vect_location,
3709                      "  Runtime profitability threshold = %d\n",
3710                      min_profitable_iters);
3711
3712   *ret_min_profitable_niters = min_profitable_iters;
3713
3714   /* Calculate number of iterations required to make the vector version
3715      profitable, relative to the loop bodies only.
3716
3717      Non-vectorized variant costs SIC * niters and the vector variant must
3718      beat it on the expected loop trip count.  This must hold:
3719      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3720
3721   if (vec_outside_cost <= 0)
3722     min_profitable_estimate = 0;
3723   else
3724     {
3725       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3726                                  * assumed_vf
3727                                  - vec_inside_cost * peel_iters_prologue
3728                                  - vec_inside_cost * peel_iters_epilogue)
3729                                  / ((scalar_single_iter_cost * assumed_vf)
3730                                    - vec_inside_cost);
3731     }
3732   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3733   if (dump_enabled_p ())
3734     dump_printf_loc (MSG_NOTE, vect_location,
3735                      "  Static estimate profitability threshold = %d\n",
3736                      min_profitable_estimate);
3737
3738   *ret_min_profitable_estimate = min_profitable_estimate;
3739 }
3740
3741 /* Build in SEL the vec_perm selector that has the same effect as a vec_shr
3742    by OFFSET vector elements (not bits) on a vector of NELT elements.  */
3743 static void
3744 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3745                               vec_perm_builder *sel)
3746 {
3747   /* One pattern of three stepped elements is enough to encode the selector;
3748      vec_perm_indices takes care of any wrap-around.  */
3749   sel->new_vector (nelt, 1, 3);
3750   for (unsigned int idx = offset, left = 3; left != 0; idx++, left--)
3751     sel->quick_push (idx);
3752 }
3753
3754 /* Checks whether the target supports whole-vector shifts for vectors of mode
3755    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3756    it supports vec_perm_const with masks for all necessary shift amounts.  */
3757 static bool
3758 have_whole_vector_shift (machine_mode mode)
3759 {
3760   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3761     return true;
3762
3763   /* Variable-length vectors should be handled via the optab.  */
3764   unsigned int nelt;
3765   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3766     return false;
3767
3768   vec_perm_builder sel;
3769   vec_perm_indices indices;
3770   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3771     {
3772       calc_vec_perm_mask_for_shift (i, nelt, &sel);
3773       indices.new_vector (sel, 2, nelt);
3774       if (!can_vec_perm_const_p (mode, indices, false))
3775         return false;
3776     }
3777   return true;
3778 }
3779
3780 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3781    functions. Design better to avoid maintenance issues.  */
3782
3783 /* Function vect_model_reduction_cost.
3784
3785    Models cost for a reduction operation, including the vector ops
3786    generated within the strip-mine loop, the initial definition before
3787    the loop, and the epilogue code that must be generated.  */
3788
3789 static void
3790 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3791                            int ncopies)
3792 {
3793   int prologue_cost = 0, epilogue_cost = 0;
3794   enum tree_code code;
3795   optab optab;
3796   tree vectype;
3797   gimple *orig_stmt;
3798   machine_mode mode;
3799   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3800   struct loop *loop = NULL;
3801   void *target_cost_data;
3802
3803   if (loop_vinfo)
3804     {
3805       loop = LOOP_VINFO_LOOP (loop_vinfo);
3806       target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3807     }
3808   else
3809     target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3810
3811   /* Condition reductions generate two reductions in the loop.  */
3812   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3813     ncopies *= 2;
3814
3815   /* Cost of reduction op inside loop.  */
3816   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3817                                         stmt_info, 0, vect_body);
3818
3819   vectype = STMT_VINFO_VECTYPE (stmt_info);
3820   mode = TYPE_MODE (vectype);
3821   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3822
3823   if (!orig_stmt)
3824     orig_stmt = STMT_VINFO_STMT (stmt_info);
3825
3826   code = gimple_assign_rhs_code (orig_stmt);
3827
3828   /* Add in cost for initial definition.
3829      For cond reduction we have four vectors: initial index, step, initial
3830      result of the data reduction, initial value of the index reduction.  */
3831   int prologue_stmts = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
3832                        == COND_REDUCTION ? 4 : 1;
3833   prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
3834                                   scalar_to_vec, stmt_info, 0,
3835                                   vect_prologue);
3836
3837   /* Determine cost of epilogue code.
3838
3839      We have a reduction operator that will reduce the vector in one statement.
3840      Also requires scalar extract.  */
3841
3842   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3843     {
3844       if (reduc_fn != IFN_LAST)
3845         {
3846           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3847             {
3848               /* An EQ stmt and an COND_EXPR stmt.  */
3849               epilogue_cost += add_stmt_cost (target_cost_data, 2,
3850                                               vector_stmt, stmt_info, 0,
3851                                               vect_epilogue);
3852               /* Reduction of the max index and a reduction of the found
3853                  values.  */
3854               epilogue_cost += add_stmt_cost (target_cost_data, 2,
3855                                               vec_to_scalar, stmt_info, 0,
3856                                               vect_epilogue);
3857               /* A broadcast of the max value.  */
3858               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3859                                               scalar_to_vec, stmt_info, 0,
3860                                               vect_epilogue);
3861             }
3862           else
3863             {
3864               epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3865                                               stmt_info, 0, vect_epilogue);
3866               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3867                                               vec_to_scalar, stmt_info, 0,
3868                                               vect_epilogue);
3869             }
3870         }
3871       else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3872         {
3873           unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3874           /* Extraction of scalar elements.  */
3875           epilogue_cost += add_stmt_cost (target_cost_data,
3876                                           2 * estimated_nunits,
3877                                           vec_to_scalar, stmt_info, 0,
3878                                           vect_epilogue);
3879           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
3880           epilogue_cost += add_stmt_cost (target_cost_data,
3881                                           2 * estimated_nunits - 3,
3882                                           scalar_stmt, stmt_info, 0,
3883                                           vect_epilogue);
3884         }
3885       else
3886         {
3887           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3888           tree bitsize =
3889             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3890           int element_bitsize = tree_to_uhwi (bitsize);
3891           int nelements = vec_size_in_bits / element_bitsize;
3892
3893           if (code == COND_EXPR)
3894             code = MAX_EXPR;
3895
3896           optab = optab_for_tree_code (code, vectype, optab_default);
3897
3898           /* We have a whole vector shift available.  */
3899           if (optab != unknown_optab
3900               && VECTOR_MODE_P (mode)
3901               && optab_handler (optab, mode) != CODE_FOR_nothing
3902               && have_whole_vector_shift (mode))
3903             {
3904               /* Final reduction via vector shifts and the reduction operator.
3905                  Also requires scalar extract.  */
3906               epilogue_cost += add_stmt_cost (target_cost_data,
3907                                               exact_log2 (nelements) * 2,
3908                                               vector_stmt, stmt_info, 0,
3909                                               vect_epilogue);
3910               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3911                                               vec_to_scalar, stmt_info, 0,
3912                                               vect_epilogue);
3913             }
3914           else
3915             /* Use extracts and reduction op for final reduction.  For N
3916                elements, we have N extracts and N-1 reduction ops.  */
3917             epilogue_cost += add_stmt_cost (target_cost_data,
3918                                             nelements + nelements - 1,
3919                                             vector_stmt, stmt_info, 0,
3920                                             vect_epilogue);
3921         }
3922     }
3923
3924   if (dump_enabled_p ())
3925     dump_printf (MSG_NOTE,
3926                  "vect_model_reduction_cost: inside_cost = %d, "
3927                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3928                  prologue_cost, epilogue_cost);
3929 }
3930
3931
3932 /* Function vect_model_induction_cost.
3933
3934    Models cost for induction operations.

        STMT_INFO is the stmt_vec_info of the induction statement; costs are
        recorded in the target cost data of its loop_vec_info.
        NCOPIES is the number of vector statements charged to the loop body.

        Nothing is recorded if STMT_INFO is a pure SLP statement.  The values
        returned by add_stmt_cost are used only for the dump message.  */
3935
3936 static void
3937 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3938 {
3939   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3940   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3941   unsigned inside_cost, prologue_cost;
3942
       /* Pure SLP statements are not costed by this function.  */
3943   if (PURE_SLP_STMT (stmt_info))
3944     return;
3945
3946   /* Loop body cost for vec_loop: NCOPIES vector statements.  */
3947   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3948                                stmt_info, 0, vect_body);
3949
3950   /* Prologue cost: two scalar_to_vec operations, one for vec_init and one
          for vec_step.  */
3951   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3952                                  stmt_info, 0, vect_prologue);
3953
3954   if (dump_enabled_p ())
3955     dump_printf_loc (MSG_NOTE, vect_location,
3956                      "vect_model_induction_cost: inside_cost = %d, "
3957                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
3958 }
3959
3960
3961
3962 /* Function get_initial_def_for_reduction
3963
3964    Input:
3965    STMT - a stmt that performs a reduction operation in the loop.
3966    INIT_VAL - the initial value of the reduction variable
3967
3968    Output:
3969    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3970         of the reduction (used for adjusting the epilog - see below).
             May be NULL.  When it is not NULL, *ADJUSTMENT_DEF is always
             written, and is set to NULL when no epilog adjustment is needed.
3971    Return a vector variable, initialized according to the operation that STMT
3972         performs. This vector will be used as the initial value of the
3973         vector of partial results.
3974
3975    Option1 (adjust in epilog): Initialize the vector as follows:
3976      add/bit or/xor:    [0,0,...,0,0]
3977      mult:              [1,1,...,1,1]
          bit and:           [-1,-1,...,-1,-1]  (all bits set)
3978      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3979    and when necessary (e.g. add/mult case) let the caller know
3980    that it needs to adjust the result by init_val.
3981
3982    Option2: Initialize the vector as follows:
3983      add/bit or/xor:    [init_val,0,0,...,0]
3984      mult:              [init_val,1,1,...,1]
          bit and:           [init_val,-1,-1,...,-1]
3985      min/max/cond_expr: [init_val,init_val,...,init_val]
3986    and no adjustments are needed.
3987
3988    For example, for the following code:
3989
3990    s = init_val;
3991    for (i=0;i<n;i++)
3992      s = s + a[i];
3993
3994    STMT is 's = s + a[i]', and the reduction variable is 's'.
3995    For a vector of 4 units, we want to return either [init_val,0,0,0],
3996    or [0,0,0,0] and let the caller know that it needs to adjust
3997    the result at the end by 'init_val'.
3998
3999    FORNOW, we are using the 'adjust in epilog' scheme (Option1) if
4000    ADJUSTMENT_DEF is not NULL, because this way the initialization vector
4001    is simpler (same element in all entries), and Option2 otherwise.

        When STMT is nested in the loop being vectorized and ADJUSTMENT_DEF
        is not NULL, neither option is used and *ADJUSTMENT_DEF is set to
        NULL: for a double reduction only a vector destination variable is
        returned, and for any other nested reduction the vector def of
        INIT_VAL is returned.

        Statements that the switch below builds for the initial vector are
        inserted on the preheader edge of the loop being vectorized.
4002
4003    A cost model should help decide between these two schemes.  */
4004
4005 tree
4006 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4007                                tree *adjustment_def)
4008 {
4009   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4010   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4011   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4012   tree scalar_type = TREE_TYPE (init_val);
4013   tree vectype = get_vectype_for_scalar_type (scalar_type);
4014   enum tree_code code = gimple_assign_rhs_code (stmt);
4015   tree def_for_init;
4016   tree init_def;
4017   bool nested_in_vect_loop = false;
       /* Scalar neutral values.  The defaults (0.0 and 0) serve the additive,
          IOR and XOR codes; MULT_EXPR and BIT_AND_EXPR override them below.  */
4018   REAL_VALUE_TYPE real_init_val = dconst0;
4019   int int_init_val = 0;
4020   gimple *def_stmt = NULL;
       /* Statements created while building the initial vector.  */
4021   gimple_seq stmts = NULL;
4022
4023   gcc_assert (vectype);
4024
4025   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4026               || SCALAR_FLOAT_TYPE_P (scalar_type));
4027
       /* If STMT is not nested in LOOP, it must belong to LOOP itself.  */
4028   if (nested_in_vect_loop_p (loop, stmt))
4029     nested_in_vect_loop = true;
4030   else
4031     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4032
4033   /* In case of double reduction we only create a vector variable to be put
4034      in the reduction phi node.  The actual statement creation is done in
4035      vect_create_epilog_for_reduction.  Note that DEF_STMT is assigned
          inside the condition.  */
4036   if (adjustment_def && nested_in_vect_loop
4037       && TREE_CODE (init_val) == SSA_NAME
4038       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4039       && gimple_code (def_stmt) == GIMPLE_PHI
4040       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4041       && vinfo_for_stmt (def_stmt)
4042       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4043           == vect_double_reduction_def)
4044     {
4045       *adjustment_def = NULL;
4046       return vect_create_destination_var (init_val, vectype);
4047     }
4048
4049   /* In case of a nested reduction do not use an adjustment def as
4050      that case is not supported by the epilogue generation correctly
4051      if ncopies is not one.  */
4052   if (adjustment_def && nested_in_vect_loop)
4053     {
4054       *adjustment_def = NULL;
4055       return vect_get_vec_def_for_operand (init_val, stmt);
4056     }
4057
4058   switch (code)
4059     {
4060     case WIDEN_SUM_EXPR:
4061     case DOT_PROD_EXPR:
4062     case SAD_EXPR:
4063     case PLUS_EXPR:
4064     case MINUS_EXPR:
4065     case BIT_IOR_EXPR:
4066     case BIT_XOR_EXPR:
4067     case MULT_EXPR:
4068     case BIT_AND_EXPR:
4069       {
4070         /* ADJUSTMENT_DEF is NULL when called from
4071            vect_create_epilog_for_reduction to vectorize double reduction.
                Otherwise the caller is told to adjust the result by
                INIT_VAL.  */
4072         if (adjustment_def)
4073           *adjustment_def = init_val;
4074
             /* The neutral value for multiplication is one.  */
4075         if (code == MULT_EXPR)
4076           {
4077             real_init_val = dconst1;
4078             int_init_val = 1;
4079           }
4080
             /* The neutral value for bitwise AND is -1, i.e. all bits set.  */
4081         if (code == BIT_AND_EXPR)
4082           int_init_val = -1;
4083
4084         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4085           def_for_init = build_real (scalar_type, real_init_val);
4086         else
4087           def_for_init = build_int_cst (scalar_type, int_init_val);
4088
4089         if (adjustment_def)
4090           /* Option1: the first element is '0' or '1' as well.  */
4091           init_def = gimple_build_vector_from_val (&stmts, vectype,
4092                                                    def_for_init);
4093         else
4094           {
4095             /* Option2: the first element is INIT_VAL.  The builder is
                    given one pattern of two elements, INIT_VAL followed by
                    DEF_FOR_INIT; presumably the second element is repeated
                    for the remaining lanes - confirm against
                    tree_vector_builder.  */
4096             tree_vector_builder elts (vectype, 1, 2);
4097             elts.quick_push (init_val);
4098             elts.quick_push (def_for_init);
4099             init_def = gimple_build_vector (&stmts, &elts);
4100           }
4101       }
4102       break;
4103
4104     case MIN_EXPR:
4105     case MAX_EXPR:
4106     case COND_EXPR:
4107       {
             /* No epilog adjustment is needed for these codes.  Unless this
                is a COND_REDUCTION, the vector def of INIT_VAL is used
                directly when ADJUSTMENT_DEF was supplied.  */
4108         if (adjustment_def)
4109           {
4110             *adjustment_def = NULL_TREE;
4111             if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo) != COND_REDUCTION)
4112               {
4113                 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4114                 break;
4115               }
4116           }
             /* Otherwise convert INIT_VAL to the element type of VECTYPE and
                build a vector with that value in every element.  */
4117         init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4118         init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4119       }
4120       break;
4121
4122     default:
4123       gcc_unreachable ();
4124     }
4125
       /* Emit whatever statements were created above before the loop.  */
4126   if (stmts)
4127     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4128   return init_def;
4129 }
4130
4131 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4132    NUMBER_OF_VECTORS is the number of vector defs to create.
        The vector defs are pushed onto VEC_OPRNDS with quick_push, so
        presumably the caller must already have reserved room for them.
        CODE is the reduction operation; it selects the neutral operand, if
        any, that is used to fill the additional copies.
        REDUC_CHAIN is true for a reduction chain, which has only one initial
        value: the scalar stmt at index 0 contributes its initial value in
        the last copy and every other position gets the neutral operand.
        Statements needed to build the vectors are inserted on the preheader
        edge of the loop that contains the first scalar stmt.  */
4133
4134 static void
4135 get_initial_defs_for_reduction (slp_tree slp_node,
4136                                 vec<tree> *vec_oprnds,
4137                                 unsigned int number_of_vectors,
4138                                 enum tree_code code, bool reduc_chain)
4139 {
4140   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4141   gimple *stmt = stmts[0];
4142   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4143   unsigned nunits;
4144   unsigned j, number_of_places_left_in_vector;
4145   tree vector_type, scalar_type;
4146   tree vop;
4147   int group_size = stmts.length ();
4148   unsigned int vec_num, i;
4149   unsigned number_of_copies = 1;
       /* Vectors are collected here in creation order and copied to
          VEC_OPRNDS in reverse further down.  */
4150   vec<tree> voprnds;
4151   voprnds.create (number_of_vectors);
4152   tree neutral_op = NULL;
4153   struct loop *loop;
4154
4155   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4156   scalar_type = TREE_TYPE (vector_type);
4157   /* vectorizable_reduction has already rejected SLP reductions on
4158      variable-length vectors.  */
4159   nunits = TYPE_VECTOR_SUBPARTS (vector_type);
4160
4161   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4162
4163   loop = (gimple_bb (stmt))->loop_father;
4164   gcc_assert (loop);
4165   edge pe = loop_preheader_edge (loop);
4166
4167   /* op is the reduction operand of the first stmt already.
          NOTE(review): no OP is in scope at this point; this sentence looks
          stale - confirm.  */
4168   /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
4169      we need either neutral operands or the original operands.  See
4170      get_initial_def_for_reduction() for details.  */
4171   switch (code)
4172     {
4173     case WIDEN_SUM_EXPR:
4174     case DOT_PROD_EXPR:
4175     case SAD_EXPR:
4176     case PLUS_EXPR:
4177     case MINUS_EXPR:
4178     case BIT_IOR_EXPR:
4179     case BIT_XOR_EXPR:
4180       neutral_op = build_zero_cst (scalar_type);
4181       break;
4182
4183     case MULT_EXPR:
4184       neutral_op = build_one_cst (scalar_type);
4185       break;
4186
4187     case BIT_AND_EXPR:
4188       neutral_op = build_all_ones_cst (scalar_type);
4189       break;
4190
4191     /* For MIN/MAX we don't have an easy neutral operand but
4192        the initial values can be used fine here.  Only for
4193        a reduction chain we have to force a neutral element.
            In that case the initial value of the first stmt's PHI is
            used as the neutral element.  */
4194     case MAX_EXPR:
4195     case MIN_EXPR:
4196       if (! reduc_chain)
4197         neutral_op = NULL;
4198       else
4199         neutral_op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4200       break;
4201
         /* Any other code has no neutral operand and must not be part of a
            reduction chain.  */
4202     default:
4203       gcc_assert (! reduc_chain);
4204       neutral_op = NULL;
4205     }
4206
4207   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4208      created vectors. It is greater than 1 if unrolling is performed.
4209
4210      For example, we have two scalar operands, s1 and s2 (e.g., group of
4211      strided accesses of size two), while NUNITS is four (i.e., four scalars
4212      of this type can be packed in a vector).  The output vector will contain
4213      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4214      will be 2).
4215
4216      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4217      containing the operands.
4218
4219      For example, NUNITS is four as before, and the group size is 8
4220      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4221      {s5, s6, s7, s8}.  */
4222
4223   number_of_copies = nunits * number_of_vectors / group_size;
4224
4225   number_of_places_left_in_vector = nunits;
4226   tree_vector_builder elts (vector_type, nunits, 1);
4227   elts.quick_grow (nunits);
       /* Each vector is filled from its last element towards its first while
          the scalar stmts are walked from the last one to the first.  */
4228   for (j = 0; j < number_of_copies; j++)
4229     {
           /* I is unsigned, so decrementing it past zero wraps around;
              presumably stmts.iterate then fails for the out-of-range index
              and ends the loop - confirm against vec.h.  STMT is overwritten
              with the current scalar stmt on each iteration.  */
4230       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4231         {
4232           tree op;
4233           /* Get the def before the loop.  In reduction chain we have only
4234              one initial value.  When a neutral operand exists it is used
                  for every copy but the last and, in a reduction chain, for
                  every stmt but the one at index 0.  */
4235           if ((j != (number_of_copies - 1)
4236                || (reduc_chain && i != 0))
4237               && neutral_op)
4238             op = neutral_op;
4239           else
4240             op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4241
4242           /* Create 'vect_ = {op0,op1,...,opn}'.  */
4243           number_of_places_left_in_vector--;
4244           elts[number_of_places_left_in_vector] = op;
4245
               /* The vector is full: build it, emit any statements this
                  needed on the preheader edge, and start a new vector.  */
4246           if (number_of_places_left_in_vector == 0)
4247             {
4248               gimple_seq ctor_seq = NULL;
4249               tree init = gimple_build_vector (&ctor_seq, &elts);
4250               if (ctor_seq != NULL)
4251                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4252               voprnds.quick_push (init);
4253
4254               number_of_places_left_in_vector = nunits;
4255               elts.new_vector (vector_type, nunits, 1);
4256               elts.quick_grow (nunits);
4257             }
4258         }
4259     }
4260
4261   /* Since the vectors are created in the reverse order, we should invert
4262      them.  */
4263   vec_num = voprnds.length ();
4264   for (j = vec_num; j != 0; j--)
4265     {
4266       vop = voprnds[j - 1];
4267       vec_oprnds->quick_push (vop);
4268     }
4269
4270   voprnds.release ();
4271
4272   /* In case that VF is greater than the unrolling factor needed for the SLP
4273      group of stmts, NUMBER_OF_VECTORS to be created is greater than
4274      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4275      to replicate the vectors.  */
4276   tree neutral_vec = NULL;
4277   while (number_of_vectors > vec_oprnds->length ())
4278     {
           /* With a neutral operand, pad with a vector of neutral elements;
              that vector is built once and reused.  */
4279       if (neutral_op)
4280         {
4281           if (!neutral_vec)
4282             {
4283               gimple_seq ctor_seq = NULL;
4284               neutral_vec = gimple_build_vector_from_val
4285                 (&ctor_seq, vector_type, neutral_op);
4286               if (ctor_seq != NULL)
4287                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4288             }
4289           vec_oprnds->quick_push (neutral_vec);
4290         }
4291       else
4292         {
               /* Without a neutral operand, append another copy of the first
                  VEC_NUM vectors.
                  NOTE(review): this adds VEC_NUM vectors at a time, so it
                  presumably relies on NUMBER_OF_VECTORS being a multiple of
                  VEC_NUM and on VEC_NUM being nonzero - confirm.  */
4293           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4294             vec_oprnds->quick_push (vop);
4295         }
4296     }
4297 }
4298
4299
4300 /* Function vect_create_epilog_for_reduction
4301
4302    Create code at the loop-epilog to finalize the result of a reduction
4303    computation.
4304
4305    VECT_DEFS is a list of vectors of partial results, i.e., the lhs's of
4306      vector reduction statements.
4307    STMT is the scalar reduction stmt that is being vectorized.
4308    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4309      number of elements that we can fit in a vectype (nunits).  In this case
4310      we have to generate more than one vector stmt - i.e - we need to "unroll"
4311      the vector stmt by a factor VF/nunits.  For more details see documentation
4312      in vectorizable_operation.
4313    REDUC_FN is the internal function for the epilog reduction.
4314    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4315      computation.
4316    REDUC_INDEX is the index of the operand in the right hand side of the
4317      statement that is defined by REDUCTION_PHI.
4318    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4319    SLP_NODE is an SLP node containing a group of reduction statements. The
4320      first one in this group is STMT.
4321    INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4322      when the COND_EXPR is never true in the loop.  For MAX_EXPR, it needs to
4323      be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4324      any value of the IV in the loop.
4325    INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4326
4327    This function:
4328    1. Creates the reduction def-use cycles: sets the arguments for
4329       REDUCTION_PHIS:
4330       The loop-entry argument is the vectorized initial-value of the reduction.
4331       The loop-latch argument is taken from VECT_DEFS - the vector of partial
4332       sums.
4333    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4334       by calling the function specified by REDUC_FN if available, or by
4335       other means (whole-vector shifts or a scalar loop).
4336       The function also creates a new phi node at the loop exit to preserve
4337       loop-closed form, as illustrated below.
4338
4339      The flow at the entry to this function:
4340
4341         loop:
4342           vec_def = phi <null, null>            # REDUCTION_PHI
4343           VECT_DEF = vector_stmt                # vectorized form of STMT
4344           s_loop = scalar_stmt                  # (scalar) STMT
4345         loop_exit:
4346           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4347           use <s_out0>
4348           use <s_out0>
4349
4350      The above is transformed by this function into:
4351
4352         loop:
4353           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4354           VECT_DEF = vector_stmt                # vectorized form of STMT
4355           s_loop = scalar_stmt                  # (scalar) STMT
4356         loop_exit:
4357           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4358           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4359           v_out2 = reduce <v_out1>
4360           s_out3 = extract_field <v_out2, 0>
4361           s_out4 = adjust_result <s_out3>
4362           use <s_out4>
4363           use <s_out4>
4364 */
4365
4366 static void
4367 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4368                                   gimple *reduc_def_stmt,
4369                                   int ncopies, internal_fn reduc_fn,
4370                                   vec<gimple *> reduction_phis,
4371                                   bool double_reduc,
4372                                   slp_tree slp_node,
4373                                   slp_instance slp_node_instance,
4374                                   tree induc_val, enum tree_code induc_code)
4375 {
4376   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4377   stmt_vec_info prev_phi_info;
4378   tree vectype;
4379   machine_mode mode;
4380   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4381   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4382   basic_block exit_bb;
4383   tree scalar_dest;
4384   tree scalar_type;
4385   gimple *new_phi = NULL, *phi;
4386   gimple_stmt_iterator exit_gsi;
4387   tree vec_dest;
4388   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4389   gimple *epilog_stmt = NULL;
4390   enum tree_code code = gimple_assign_rhs_code (stmt);
4391   gimple *exit_phi;
4392   tree bitsize;
4393   tree adjustment_def = NULL;
4394   tree vec_initial_def = NULL;
4395   tree expr, def, initial_def = NULL;
4396   tree orig_name, scalar_result;
4397   imm_use_iterator imm_iter, phi_imm_iter;
4398   use_operand_p use_p, phi_use_p;
4399   gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4400   bool nested_in_vect_loop = false;
4401   auto_vec<gimple *> new_phis;
4402   auto_vec<gimple *> inner_phis;
4403   enum vect_def_type dt = vect_unknown_def_type;
4404   int j, i;
4405   auto_vec<tree> scalar_results;
4406   unsigned int group_size = 1, k, ratio;
4407   auto_vec<tree> vec_initial_defs;
4408   auto_vec<gimple *> phis;
4409   bool slp_reduc = false;
4410   tree new_phi_result;
4411   gimple *inner_phi = NULL;
4412   tree induction_index = NULL_TREE;
4413
4414   if (slp_node)
4415     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4416
4417   if (nested_in_vect_loop_p (loop, stmt))
4418     {
4419       outer_loop = loop;
4420       loop = loop->inner;
4421       nested_in_vect_loop = true;
4422       gcc_assert (!slp_node);
4423     }
4424
4425   vectype = STMT_VINFO_VECTYPE (stmt_info);
4426   gcc_assert (vectype);
4427   mode = TYPE_MODE (vectype);
4428
4429   /* 1. Create the reduction def-use cycle:
4430      Set the arguments of REDUCTION_PHIS, i.e., transform
4431
4432         loop:
4433           vec_def = phi <null, null>            # REDUCTION_PHI
4434           VECT_DEF = vector_stmt                # vectorized form of STMT
4435           ...
4436
4437      into:
4438
4439         loop:
4440           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4441           VECT_DEF = vector_stmt                # vectorized form of STMT
4442           ...
4443
4444      (in case of SLP, do it for all the phis). */
4445
4446   /* Get the loop-entry arguments.  */
4447   enum vect_def_type initial_def_dt = vect_unknown_def_type;
4448   if (slp_node)
4449     {
4450       unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4451       vec_initial_defs.reserve (vec_num);
4452       get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4453                                       &vec_initial_defs, vec_num, code,
4454                                       GROUP_FIRST_ELEMENT (stmt_info));
4455     }
4456   else
4457     {
4458       /* Get at the scalar def before the loop, that defines the initial value
4459          of the reduction variable.  */
4460       gimple *def_stmt;
4461       initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4462                                            loop_preheader_edge (loop));
4463       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4464          and we can't use zero for induc_val, use initial_def.  Similarly
4465          for REDUC_MIN and initial_def larger than the base.  */
4466       if (TREE_CODE (initial_def) == INTEGER_CST
4467           && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4468               == INTEGER_INDUC_COND_REDUCTION)
4469           && !integer_zerop (induc_val)
4470           && ((induc_code == MAX_EXPR
4471                && tree_int_cst_lt (initial_def, induc_val))
4472               || (induc_code == MIN_EXPR
4473                   && tree_int_cst_lt (induc_val, initial_def))))
4474         induc_val = initial_def;
4475       vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4476       vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4477                                                        &adjustment_def);
4478       vec_initial_defs.create (1);
4479       vec_initial_defs.quick_push (vec_initial_def);
4480     }
4481
4482   /* Set phi nodes arguments.  */
4483   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4484     {
4485       tree vec_init_def = vec_initial_defs[i];
4486       tree def = vect_defs[i];
4487       for (j = 0; j < ncopies; j++)
4488         {
4489           if (j != 0)
4490             {
4491               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4492               if (nested_in_vect_loop)
4493                 vec_init_def
4494                   = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4495                                                     vec_init_def);
4496             }
4497
4498           /* Set the loop-entry arg of the reduction-phi.  */
4499
4500           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4501               == INTEGER_INDUC_COND_REDUCTION)
4502             {
4503               /* Initialise the reduction phi to zero.  This prevents non-zero
4504                  initial values from interfering with the reduction op.  */
4505               gcc_assert (ncopies == 1);
4506               gcc_assert (i == 0);
4507
4508               tree vec_init_def_type = TREE_TYPE (vec_init_def);
4509               tree induc_val_vec
4510                 = build_vector_from_val (vec_init_def_type, induc_val);
4511
4512               add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4513                            loop_preheader_edge (loop), UNKNOWN_LOCATION);
4514             }
4515           else
4516             add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4517                          loop_preheader_edge (loop), UNKNOWN_LOCATION);
4518
4519           /* Set the loop-latch arg for the reduction-phi.  */
4520           if (j > 0)
4521             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4522
4523           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4524                        UNKNOWN_LOCATION);
4525
4526           if (dump_enabled_p ())
4527             {
4528               dump_printf_loc (MSG_NOTE, vect_location,
4529                                "transform reduction: created def-use cycle: ");
4530               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4531               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4532             }
4533         }
4534     }
4535
4536   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4537      which is updated with the current index of the loop for every match of
4538      the original loop's cond_expr (VEC_STMT).  This results in a vector
4539      containing the last time the condition passed for that vector lane.
4540      The first match will be a 1 to allow 0 to be used for non-matching
4541      indexes.  If there are no matches at all then the vector will be all
4542      zeroes.  */
4543   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4544     {
4545       tree indx_before_incr, indx_after_incr;
4546       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4547
4548       gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4549       gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4550
4551       int scalar_precision
4552         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4553       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4554       tree cr_index_vector_type = build_vector_type
4555         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4556
4557       /* First we create a simple vector induction variable which starts
4558          with the values {1,2,3,...} (SERIES_VECT) and increments by the
4559          vector size (STEP).  */
4560
4561       /* Create a {1,2,3,...} vector.  */
4562       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4563
4564       /* Create a vector of the step value.  */
4565       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4566       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4567
4568       /* Create an induction variable.  */
4569       gimple_stmt_iterator incr_gsi;
4570       bool insert_after;
4571       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4572       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4573                  insert_after, &indx_before_incr, &indx_after_incr);
4574
4575       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4576          filled with zeros (VEC_ZERO).  */
4577
4578       /* Create a vector of 0s.  */
4579       tree zero = build_zero_cst (cr_index_scalar_type);
4580       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4581
4582       /* Create a vector phi node.  */
4583       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4584       new_phi = create_phi_node (new_phi_tree, loop->header);
4585       set_vinfo_for_stmt (new_phi,
4586                           new_stmt_vec_info (new_phi, loop_vinfo));
4587       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4588                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
4589
4590       /* Now take the condition from the loops original cond_expr
4591          (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4592          every match uses values from the induction variable
4593          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4594          (NEW_PHI_TREE).
4595          Finally, we update the phi (NEW_PHI_TREE) to take the value of
4596          the new cond_expr (INDEX_COND_EXPR).  */
4597
4598       /* Duplicate the condition from vec_stmt.  */
4599       tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4600
4601       /* Create a conditional, where the condition is taken from vec_stmt
4602          (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4603          else is the phi (NEW_PHI_TREE).  */
4604       tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4605                                      ccompare, indx_before_incr,
4606                                      new_phi_tree);
4607       induction_index = make_ssa_name (cr_index_vector_type);
4608       gimple *index_condition = gimple_build_assign (induction_index,
4609                                                      index_cond_expr);
4610       gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4611       stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4612                                                         loop_vinfo);
4613       STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4614       set_vinfo_for_stmt (index_condition, index_vec_info);
4615
4616       /* Update the phi with the vec cond.  */
4617       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4618                    loop_latch_edge (loop), UNKNOWN_LOCATION);
4619     }
4620
4621   /* 2. Create epilog code.
4622         The reduction epilog code operates across the elements of the vector
4623         of partial results computed by the vectorized loop.
4624         The reduction epilog code consists of:
4625
4626         step 1: compute the scalar result in a vector (v_out2)
4627         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4628         step 3: adjust the scalar result (s_out3) if needed.
4629
4630         Step 1 can be accomplished using one of the following three schemes:
4631           (scheme 1) using reduc_fn, if available.
4632           (scheme 2) using whole-vector shifts, if available.
4633           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4634                      combined.
4635
4636           The overall epilog code looks like this:
4637
4638           s_out0 = phi <s_loop>         # original EXIT_PHI
4639           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4640           v_out2 = reduce <v_out1>              # step 1
4641           s_out3 = extract_field <v_out2, 0>    # step 2
4642           s_out4 = adjust_result <s_out3>       # step 3
4643
4644           (step 3 is optional, and steps 1 and 2 may be combined).
4645           Lastly, the uses of s_out0 are replaced by s_out4.  */
4646
4647
4648   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4649          v_out1 = phi <VECT_DEF>
4650          Store them in NEW_PHIS.  */
4651
4652   exit_bb = single_exit (loop)->dest;
4653   prev_phi_info = NULL;
4654   new_phis.create (vect_defs.length ());
4655   FOR_EACH_VEC_ELT (vect_defs, i, def)
4656     {
4657       for (j = 0; j < ncopies; j++)
4658         {
4659           tree new_def = copy_ssa_name (def);
4660           phi = create_phi_node (new_def, exit_bb);
4661           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4662           if (j == 0)
4663             new_phis.quick_push (phi);
4664           else
4665             {
4666               def = vect_get_vec_def_for_stmt_copy (dt, def);
4667               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4668             }
4669
4670           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4671           prev_phi_info = vinfo_for_stmt (phi);
4672         }
4673     }
4674
4675   /* The epilogue is created for the outer-loop, i.e., for the loop being
4676      vectorized.  Create exit phis for the outer loop.  */
4677   if (double_reduc)
4678     {
4679       loop = outer_loop;
4680       exit_bb = single_exit (loop)->dest;
4681       inner_phis.create (vect_defs.length ());
4682       FOR_EACH_VEC_ELT (new_phis, i, phi)
4683         {
4684           tree new_result = copy_ssa_name (PHI_RESULT (phi));
4685           gphi *outer_phi = create_phi_node (new_result, exit_bb);
4686           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4687                            PHI_RESULT (phi));
4688           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4689                                                             loop_vinfo));
4690           inner_phis.quick_push (phi);
4691           new_phis[i] = outer_phi;
4692           prev_phi_info = vinfo_for_stmt (outer_phi);
4693           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4694             {
4695               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4696               new_result = copy_ssa_name (PHI_RESULT (phi));
4697               outer_phi = create_phi_node (new_result, exit_bb);
4698               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4699                                PHI_RESULT (phi));
4700               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4701                                                                 loop_vinfo));
4702               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4703               prev_phi_info = vinfo_for_stmt (outer_phi);
4704             }
4705         }
4706     }
4707
4708   exit_gsi = gsi_after_labels (exit_bb);
4709
4710   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4711          (i.e. when reduc_fn is not available) and in the final adjustment
4712          code (if needed).  Also get the original scalar reduction variable as
4713          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4714          represents a reduction pattern), the tree-code and scalar-def are
4715          taken from the original stmt that the pattern-stmt (STMT) replaces.
4716          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4717          are taken from STMT.  */
4718
4719   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4720   if (!orig_stmt)
4721     {
4722       /* Regular reduction  */
4723       orig_stmt = stmt;
4724     }
4725   else
4726     {
4727       /* Reduction pattern  */
4728       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4729       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4730       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4731     }
4732
4733   code = gimple_assign_rhs_code (orig_stmt);
4734   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4735      partial results are added and not subtracted.  */
4736   if (code == MINUS_EXPR)
4737     code = PLUS_EXPR;
4738
4739   scalar_dest = gimple_assign_lhs (orig_stmt);
4740   scalar_type = TREE_TYPE (scalar_dest);
4741   scalar_results.create (group_size);
4742   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4743   bitsize = TYPE_SIZE (scalar_type);
4744
4745   /* In case this is a reduction in an inner-loop while vectorizing an outer
4746      loop - we don't need to extract a single scalar result at the end of the
4747      inner-loop (unless it is double reduction, i.e., the use of reduction is
4748      outside the outer-loop).  The final vector of partial results will be used
4749      in the vectorized outer-loop, or reduced to a scalar result at the end of
4750      the outer-loop.  */
4751   if (nested_in_vect_loop && !double_reduc)
4752     goto vect_finalize_reduction;
4753
4754   /* SLP reduction without reduction chain, e.g.,
4755      # a1 = phi <a2, a0>
4756      # b1 = phi <b2, b0>
4757      a2 = operation (a1)
4758      b2 = operation (b1)  */
4759   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4760
4761   /* In case of reduction chain, e.g.,
4762      # a1 = phi <a3, a0>
4763      a2 = operation (a1)
4764      a3 = operation (a2),
4765
4766      we may end up with more than one vector result.  Here we reduce them to
4767      one vector.  */
4768   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4769     {
4770       tree first_vect = PHI_RESULT (new_phis[0]);
4771       gassign *new_vec_stmt = NULL;
4772       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4773       for (k = 1; k < new_phis.length (); k++)
4774         {
4775           gimple *next_phi = new_phis[k];
4776           tree second_vect = PHI_RESULT (next_phi);
4777           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4778           new_vec_stmt = gimple_build_assign (tem, code,
4779                                               first_vect, second_vect);
4780           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4781           first_vect = tem;
4782         }
4783
4784       new_phi_result = first_vect;
4785       if (new_vec_stmt)
4786         {
4787           new_phis.truncate (0);
4788           new_phis.safe_push (new_vec_stmt);
4789         }
4790     }
4791   /* Likewise if we couldn't use a single defuse cycle.  */
4792   else if (ncopies > 1)
4793     {
4794       gcc_assert (new_phis.length () == 1);
4795       tree first_vect = PHI_RESULT (new_phis[0]);
4796       gassign *new_vec_stmt = NULL;
4797       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4798       gimple *next_phi = new_phis[0];
4799       for (int k = 1; k < ncopies; ++k)
4800         {
4801           next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
4802           tree second_vect = PHI_RESULT (next_phi);
4803           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4804           new_vec_stmt = gimple_build_assign (tem, code,
4805                                               first_vect, second_vect);
4806           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4807           first_vect = tem;
4808         }
4809       new_phi_result = first_vect;
4810       new_phis.truncate (0);
4811       new_phis.safe_push (new_vec_stmt);
4812     }
4813   else
4814     new_phi_result = PHI_RESULT (new_phis[0]);
4815
4816   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4817       && reduc_fn != IFN_LAST)
4818     {
4819       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4820          various data values where the condition matched and another vector
4821          (INDUCTION_INDEX) containing all the indexes of those matches.  We
4822          need to extract the last matching index (which will be the index with
4823          highest value) and use this to index into the data vector.
4824          For the case where there were no matches, the data vector will contain
4825          all default values and the index vector will be all zeros.  */
4826
4827       /* Get various versions of the type of the vector of indexes.  */
4828       tree index_vec_type = TREE_TYPE (induction_index);
4829       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4830       tree index_scalar_type = TREE_TYPE (index_vec_type);
4831       tree index_vec_cmp_type = build_same_sized_truth_vector_type
4832         (index_vec_type);
4833
4834       /* Get an unsigned integer version of the type of the data vector.  */
4835       int scalar_precision
4836         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4837       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4838       tree vectype_unsigned = build_vector_type
4839         (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4840
4841       /* First we need to create a vector (ZERO_VEC) of zeros and another
4842          vector (MAX_INDEX_VEC) filled with the last matching index, which we
4843          can create using a MAX reduction and then expanding.
4844          In the case where the loop never made any matches, the max index will
4845          be zero.  */
4846
4847       /* Vector of {0, 0, 0,...}.  */
4848       tree zero_vec = make_ssa_name (vectype);
4849       tree zero_vec_rhs = build_zero_cst (vectype);
4850       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4851       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4852
4853       /* Find maximum value from the vector of found indexes.  */
4854       tree max_index = make_ssa_name (index_scalar_type);
4855       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4856                                                           1, induction_index);
4857       gimple_call_set_lhs (max_index_stmt, max_index);
4858       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4859
4860       /* Vector of {max_index, max_index, max_index,...}.  */
4861       tree max_index_vec = make_ssa_name (index_vec_type);
4862       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4863                                                       max_index);
4864       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4865                                                         max_index_vec_rhs);
4866       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4867
4868       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4869          with the vector (INDUCTION_INDEX) of found indexes, choosing values
4870          from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4871          otherwise.  Only one value should match, resulting in a vector
4872          (VEC_COND) with one data value and the rest zeros.
4873          In the case where the loop never made any matches, every index will
4874          match, resulting in a vector with all data values (which will all be
4875          the default value).  */
4876
4877       /* Compare the max index vector to the vector of found indexes to find
4878          the position of the max value.  */
4879       tree vec_compare = make_ssa_name (index_vec_cmp_type);
4880       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4881                                                       induction_index,
4882                                                       max_index_vec);
4883       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4884
4885       /* Use the compare to choose either values from the data vector or
4886          zero.  */
4887       tree vec_cond = make_ssa_name (vectype);
4888       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4889                                                    vec_compare, new_phi_result,
4890                                                    zero_vec);
4891       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4892
4893       /* Finally we need to extract the data value from the vector (VEC_COND)
4894          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
4895          reduction, but because this doesn't exist, we can use a MAX reduction
4896          instead.  The data value might be signed or a float so we need to cast
4897          it first.
4898          In the case where the loop never made any matches, the data values are
4899          all identical, and so will reduce down correctly.  */
4900
4901       /* Make the matched data values unsigned.  */
4902       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4903       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4904                                        vec_cond);
4905       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4906                                                         VIEW_CONVERT_EXPR,
4907                                                         vec_cond_cast_rhs);
4908       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4909
4910       /* Reduce down to a scalar value.  */
4911       tree data_reduc = make_ssa_name (scalar_type_unsigned);
4912       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4913                                                            1, vec_cond_cast);
4914       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4915       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4916
4917       /* Convert the reduced value back to the result type and set as the
4918          result.  */
4919       gimple_seq stmts = NULL;
4920       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4921                                data_reduc);
4922       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4923       scalar_results.safe_push (new_temp);
4924     }
4925   else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4926            && reduc_fn == IFN_LAST)
4927     {
4928       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
4929          idx = 0;
4930          idx_val = induction_index[0];
4931          val = data_reduc[0];
4932          for (idx = 0, val = init, i = 0; i < nelts; ++i)
4933            if (induction_index[i] > idx_val)
4934              val = data_reduc[i], idx_val = induction_index[i];
4935          return val;  */
4936
4937       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4938       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4939       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4940       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4941       /* Enforced by vectorizable_reduction, which ensures we have target
4942          support before allowing a conditional reduction on variable-length
4943          vectors.  */
4944       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4945       tree idx_val = NULL_TREE, val = NULL_TREE;
4946       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4947         {
4948           tree old_idx_val = idx_val;
4949           tree old_val = val;
4950           idx_val = make_ssa_name (idx_eltype);
4951           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4952                                              build3 (BIT_FIELD_REF, idx_eltype,
4953                                                      induction_index,
4954                                                      bitsize_int (el_size),
4955                                                      bitsize_int (off)));
4956           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4957           val = make_ssa_name (data_eltype);
4958           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4959                                              build3 (BIT_FIELD_REF,
4960                                                      data_eltype,
4961                                                      new_phi_result,
4962                                                      bitsize_int (el_size),
4963                                                      bitsize_int (off)));
4964           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4965           if (off != 0)
4966             {
4967               tree new_idx_val = idx_val;
4968               tree new_val = val;
4969               if (off != v_size - el_size)
4970                 {
4971                   new_idx_val = make_ssa_name (idx_eltype);
4972                   epilog_stmt = gimple_build_assign (new_idx_val,
4973                                                      MAX_EXPR, idx_val,
4974                                                      old_idx_val);
4975                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4976                 }
4977               new_val = make_ssa_name (data_eltype);
4978               epilog_stmt = gimple_build_assign (new_val,
4979                                                  COND_EXPR,
4980                                                  build2 (GT_EXPR,
4981                                                          boolean_type_node,
4982                                                          idx_val,
4983                                                          old_idx_val),
4984                                                  val, old_val);
4985               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4986               idx_val = new_idx_val;
4987               val = new_val;
4988             }
4989         }
4990       /* Convert the reduced value back to the result type and set as the
4991          result.  */
4992       gimple_seq stmts = NULL;
4993       val = gimple_convert (&stmts, scalar_type, val);
4994       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4995       scalar_results.safe_push (val);
4996     }
4997
4998   /* 2.3 Create the reduction code, using one of the three schemes described
4999          above. In SLP we simply need to extract all the elements from the
5000          vector (without reducing them), so we use scalar shifts.  */
5001   else if (reduc_fn != IFN_LAST && !slp_reduc)
5002     {
5003       tree tmp;
5004       tree vec_elem_type;
5005
5006       /* Case 1:  Create:
5007          v_out2 = reduc_expr <v_out1>  */
5008
5009       if (dump_enabled_p ())
5010         dump_printf_loc (MSG_NOTE, vect_location,
5011                          "Reduce using direct vector reduction.\n");
5012
5013       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5014       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5015         {
5016           tree tmp_dest
5017             = vect_create_destination_var (scalar_dest, vec_elem_type);
5018           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5019                                                     new_phi_result);
5020           gimple_set_lhs (epilog_stmt, tmp_dest);
5021           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5022           gimple_set_lhs (epilog_stmt, new_temp);
5023           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5024
5025           epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5026                                              new_temp);
5027         }
5028       else
5029         {
5030           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5031                                                     new_phi_result);
5032           gimple_set_lhs (epilog_stmt, new_scalar_dest);
5033         }
5034
5035       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5036       gimple_set_lhs (epilog_stmt, new_temp);
5037       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5038
5039       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5040            == INTEGER_INDUC_COND_REDUCTION)
5041           && !operand_equal_p (initial_def, induc_val, 0))
5042         {
5043           /* Earlier we set the initial value to be a vector of induc_val
5044              values.  Check the result and if it is induc_val then replace
5045              with the original initial value, unless induc_val is
5046              the same as initial_def already.  */
5047           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5048                                   induc_val);
5049
5050           tmp = make_ssa_name (new_scalar_dest);
5051           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5052                                              initial_def, new_temp);
5053           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5054           new_temp = tmp;
5055         }
5056
5057       scalar_results.safe_push (new_temp);
5058     }
5059   else
5060     {
5061       bool reduce_with_shift = have_whole_vector_shift (mode);
5062       int element_bitsize = tree_to_uhwi (bitsize);
5063       /* Enforced by vectorizable_reduction, which disallows SLP reductions
5064          for variable-length vectors and also requires direct target support
5065          for loop reductions.  */
5066       int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5067       tree vec_temp;
5068
5069       /* COND reductions all do the final reduction with MAX_EXPR
5070          or MIN_EXPR.  */
5071       if (code == COND_EXPR)
5072         {
5073           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5074               == INTEGER_INDUC_COND_REDUCTION)
5075             code = induc_code;
5076           else
5077             code = MAX_EXPR;
5078         }
5079
5080       /* Regardless of whether we have a whole vector shift, if we're
5081          emulating the operation via tree-vect-generic, we don't want
5082          to use it.  Only the first round of the reduction is likely
5083          to still be profitable via emulation.  */
5084       /* ??? It might be better to emit a reduction tree code here, so that
5085          tree-vect-generic can expand the first round via bit tricks.  */
5086       if (!VECTOR_MODE_P (mode))
5087         reduce_with_shift = false;
5088       else
5089         {
5090           optab optab = optab_for_tree_code (code, vectype, optab_default);
5091           if (optab_handler (optab, mode) == CODE_FOR_nothing)
5092             reduce_with_shift = false;
5093         }
5094
5095       if (reduce_with_shift && !slp_reduc)
5096         {
5097           int nelements = vec_size_in_bits / element_bitsize;
5098           vec_perm_builder sel;
5099           vec_perm_indices indices;
5100
5101           int elt_offset;
5102
5103           tree zero_vec = build_zero_cst (vectype);
5104           /* Case 2: Create:
5105              for (offset = nelements/2; offset >= 1; offset/=2)
5106                 {
5107                   Create:  va' = vec_shift <va, offset>
5108                   Create:  va = vop <va, va'>
5109                 }  */
5110
5111           tree rhs;
5112
5113           if (dump_enabled_p ())
5114             dump_printf_loc (MSG_NOTE, vect_location,
5115                              "Reduce using vector shifts\n");
5116
5117           vec_dest = vect_create_destination_var (scalar_dest, vectype);
5118           new_temp = new_phi_result;
5119           for (elt_offset = nelements / 2;
5120                elt_offset >= 1;
5121                elt_offset /= 2)
5122             {
5123               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5124               indices.new_vector (sel, 2, nelements);
5125               tree mask = vect_gen_perm_mask_any (vectype, indices);
5126               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5127                                                  new_temp, zero_vec, mask);
5128               new_name = make_ssa_name (vec_dest, epilog_stmt);
5129               gimple_assign_set_lhs (epilog_stmt, new_name);
5130               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5131
5132               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5133                                                  new_temp);
5134               new_temp = make_ssa_name (vec_dest, epilog_stmt);
5135               gimple_assign_set_lhs (epilog_stmt, new_temp);
5136               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5137             }
5138
5139           /* 2.4  Extract the final scalar result.  Create:
5140              s_out3 = extract_field <v_out2, bitpos>  */
5141
5142           if (dump_enabled_p ())
5143             dump_printf_loc (MSG_NOTE, vect_location,
5144                              "extract scalar result\n");
5145
5146           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5147                         bitsize, bitsize_zero_node);
5148           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5149           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5150           gimple_assign_set_lhs (epilog_stmt, new_temp);
5151           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5152           scalar_results.safe_push (new_temp);
5153         }
5154       else
5155         {
5156           /* Case 3: Create:
5157              s = extract_field <v_out2, 0>
5158              for (offset = element_size;
5159                   offset < vector_size;
5160                   offset += element_size;)
5161                {
5162                  Create:  s' = extract_field <v_out2, offset>
5163                  Create:  s = op <s, s'>  // For non SLP cases
5164                }  */
5165
5166           if (dump_enabled_p ())
5167             dump_printf_loc (MSG_NOTE, vect_location,
5168                              "Reduce using scalar code.\n");
5169
5170           vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5171           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5172             {
5173               int bit_offset;
5174               if (gimple_code (new_phi) == GIMPLE_PHI)
5175                 vec_temp = PHI_RESULT (new_phi);
5176               else
5177                 vec_temp = gimple_assign_lhs (new_phi);
5178               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5179                                  bitsize_zero_node);
5180               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5181               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5182               gimple_assign_set_lhs (epilog_stmt, new_temp);
5183               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5184
5185               /* In SLP we don't need to apply reduction operation, so we just
5186                  collect s' values in SCALAR_RESULTS.  */
5187               if (slp_reduc)
5188                 scalar_results.safe_push (new_temp);
5189
5190               for (bit_offset = element_bitsize;
5191                    bit_offset < vec_size_in_bits;
5192                    bit_offset += element_bitsize)
5193                 {
5194                   tree bitpos = bitsize_int (bit_offset);
5195                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5196                                      bitsize, bitpos);
5197
5198                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5199                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5200                   gimple_assign_set_lhs (epilog_stmt, new_name);
5201                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5202
5203                   if (slp_reduc)
5204                     {
5205                       /* In SLP we don't need to apply reduction operation, so
5206                          we just collect s' values in SCALAR_RESULTS.  */
5207                       new_temp = new_name;
5208                       scalar_results.safe_push (new_name);
5209                     }
5210                   else
5211                     {
5212                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5213                                                          new_name, new_temp);
5214                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5215                       gimple_assign_set_lhs (epilog_stmt, new_temp);
5216                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5217                     }
5218                 }
5219             }
5220
5221           /* The only case where we need to reduce scalar results in SLP, is
5222              unrolling.  If the size of SCALAR_RESULTS is greater than
5223              GROUP_SIZE, we reduce them combining elements modulo
5224              GROUP_SIZE.  */
5225           if (slp_reduc)
5226             {
5227               tree res, first_res, new_res;
5228               gimple *new_stmt;
5229
5230               /* Reduce multiple scalar results in case of SLP unrolling.  */
5231               for (j = group_size; scalar_results.iterate (j, &res);
5232                    j++)
5233                 {
5234                   first_res = scalar_results[j % group_size];
5235                   new_stmt = gimple_build_assign (new_scalar_dest, code,
5236                                                   first_res, res);
5237                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
5238                   gimple_assign_set_lhs (new_stmt, new_res);
5239                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5240                   scalar_results[j % group_size] = new_res;
5241                 }
5242             }
5243           else
5244             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5245             scalar_results.safe_push (new_temp);
5246         }
5247
5248       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5249            == INTEGER_INDUC_COND_REDUCTION)
5250           && !operand_equal_p (initial_def, induc_val, 0))
5251         {
5252           /* Earlier we set the initial value to be a vector of induc_val
5253              values.  Check the result and if it is induc_val then replace
5254              with the original initial value, unless induc_val is
5255              the same as initial_def already.  */
5256           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5257                                   induc_val);
5258
5259           tree tmp = make_ssa_name (new_scalar_dest);
5260           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5261                                              initial_def, new_temp);
5262           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5263           scalar_results[0] = tmp;
5264         }
5265     }
5266
5267 vect_finalize_reduction:
5268
5269   if (double_reduc)
5270     loop = loop->inner;
5271
5272   /* 2.5 Adjust the final result by the initial value of the reduction
5273          variable. (When such adjustment is not needed, then
5274          'adjustment_def' is zero).  For example, if code is PLUS we create:
5275          new_temp = loop_exit_def + adjustment_def  */
5276
5277   if (adjustment_def)
5278     {
5279       gcc_assert (!slp_reduc);
5280       if (nested_in_vect_loop)
5281         {
5282           new_phi = new_phis[0];
5283           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5284           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5285           new_dest = vect_create_destination_var (scalar_dest, vectype);
5286         }
5287       else
5288         {
5289           new_temp = scalar_results[0];
5290           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5291           expr = build2 (code, scalar_type, new_temp, adjustment_def);
5292           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5293         }
5294
5295       epilog_stmt = gimple_build_assign (new_dest, expr);
5296       new_temp = make_ssa_name (new_dest, epilog_stmt);
5297       gimple_assign_set_lhs (epilog_stmt, new_temp);
5298       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5299       if (nested_in_vect_loop)
5300         {
5301           set_vinfo_for_stmt (epilog_stmt,
5302                               new_stmt_vec_info (epilog_stmt, loop_vinfo));
5303           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5304                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5305
5306           if (!double_reduc)
5307             scalar_results.quick_push (new_temp);
5308           else
5309             scalar_results[0] = new_temp;
5310         }
5311       else
5312         scalar_results[0] = new_temp;
5313
5314       new_phis[0] = epilog_stmt;
5315     }
5316
5317   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5318           phis with new adjusted scalar results, i.e., replace use <s_out0>
5319           with use <s_out4>.
5320
5321      Transform:
5322         loop_exit:
5323           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5324           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5325           v_out2 = reduce <v_out1>
5326           s_out3 = extract_field <v_out2, 0>
5327           s_out4 = adjust_result <s_out3>
5328           use <s_out0>
5329           use <s_out0>
5330
5331      into:
5332
5333         loop_exit:
5334           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5335           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5336           v_out2 = reduce <v_out1>
5337           s_out3 = extract_field <v_out2, 0>
5338           s_out4 = adjust_result <s_out3>
5339           use <s_out4>
5340           use <s_out4> */
5341
5342
5343   /* In SLP reduction chain we reduce vector results into one vector if
5344      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
5345      the last stmt in the reduction chain, since we are looking for the loop
5346      exit phi node.  */
5347   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5348     {
5349       gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5350       /* Handle reduction patterns.  */
5351       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5352         dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5353
5354       scalar_dest = gimple_assign_lhs (dest_stmt);
5355       group_size = 1;
5356     }
5357
5358   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5359      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
5360      need to match SCALAR_RESULTS with corresponding statements.  The first
5361      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5362      the first vector stmt, etc.
5363      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
5364   if (group_size > new_phis.length ())
5365     {
5366       ratio = group_size / new_phis.length ();
5367       gcc_assert (!(group_size % new_phis.length ()));
5368     }
5369   else
5370     ratio = 1;
5371
5372   for (k = 0; k < group_size; k++)
5373     {
5374       if (k % ratio == 0)
5375         {
5376           epilog_stmt = new_phis[k / ratio];
5377           reduction_phi = reduction_phis[k / ratio];
5378           if (double_reduc)
5379             inner_phi = inner_phis[k / ratio];
5380         }
5381
5382       if (slp_reduc)
5383         {
5384           gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5385
5386           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5387           /* SLP statements can't participate in patterns.  */
5388           gcc_assert (!orig_stmt);
5389           scalar_dest = gimple_assign_lhs (current_stmt);
5390         }
5391
5392       phis.create (3);
5393       /* Find the loop-closed-use at the loop exit of the original scalar
5394          result.  (The reduction result is expected to have two immediate uses -
5395          one at the latch block, and one at the loop exit).  */
5396       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5397         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5398             && !is_gimple_debug (USE_STMT (use_p)))
5399           phis.safe_push (USE_STMT (use_p));
5400
5401       /* While we expect to have found an exit_phi because of loop-closed-ssa
5402          form we can end up without one if the scalar cycle is dead.  */
5403
5404       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5405         {
5406           if (outer_loop)
5407             {
5408               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5409               gphi *vect_phi;
5410
5411               /* FORNOW. Currently not supporting the case that an inner-loop
5412                  reduction is not used in the outer-loop (but only outside the
5413                  outer-loop), unless it is double reduction.  */
5414               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5415                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5416                           || double_reduc);
5417
5418               if (double_reduc)
5419                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5420               else
5421                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5422               if (!double_reduc
5423                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5424                       != vect_double_reduction_def)
5425                 continue;
5426
5427               /* Handle double reduction:
5428
5429                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
5430                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5431                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
5432                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
5433
5434                  At that point the regular reduction (stmt2 and stmt3) is
5435                  already vectorized, as well as the exit phi node, stmt4.
5436                  Here we vectorize the phi node of double reduction, stmt1, and
5437                  update all relevant statements.  */
5438
5439               /* Go through all the uses of s2 to find double reduction phi
5440                  node, i.e., stmt1 above.  */
5441               orig_name = PHI_RESULT (exit_phi);
5442               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5443                 {
5444                   stmt_vec_info use_stmt_vinfo;
5445                   stmt_vec_info new_phi_vinfo;
5446                   tree vect_phi_init, preheader_arg, vect_phi_res;
5447                   basic_block bb = gimple_bb (use_stmt);
5448                   gimple *use;
5449
5450                   /* Check that USE_STMT is really double reduction phi
5451                      node.  */
5452                   if (gimple_code (use_stmt) != GIMPLE_PHI
5453                       || gimple_phi_num_args (use_stmt) != 2
5454                       || bb->loop_father != outer_loop)
5455                     continue;
5456                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5457                   if (!use_stmt_vinfo
5458                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5459                           != vect_double_reduction_def)
5460                     continue;
5461
5462                   /* Create vector phi node for double reduction:
5463                      vs1 = phi <vs0, vs2>
5464                      vs1 was created previously in this function by a call to
5465                        vect_get_vec_def_for_operand and is stored in
5466                        vec_initial_def;
5467                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5468                      vs0 is created here.  */
5469
5470                   /* Create vector phi node.  */
5471                   vect_phi = create_phi_node (vec_initial_def, bb);
5472                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
5473                                     loop_vec_info_for_loop (outer_loop));
5474                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5475
5476                   /* Create vs0 - initial def of the double reduction phi.  */
5477                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5478                                              loop_preheader_edge (outer_loop));
5479                   vect_phi_init = get_initial_def_for_reduction
5480                     (stmt, preheader_arg, NULL);
5481
5482                   /* Update phi node arguments with vs0 and vs2.  */
5483                   add_phi_arg (vect_phi, vect_phi_init,
5484                                loop_preheader_edge (outer_loop),
5485                                UNKNOWN_LOCATION);
5486                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5487                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5488                   if (dump_enabled_p ())
5489                     {
5490                       dump_printf_loc (MSG_NOTE, vect_location,
5491                                        "created double reduction phi node: ");
5492                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5493                     }
5494
5495                   vect_phi_res = PHI_RESULT (vect_phi);
5496
5497                   /* Replace the use, i.e., set the correct vs1 in the regular
5498                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
5499                      loop is redundant.  */
5500                   use = reduction_phi;
5501                   for (j = 0; j < ncopies; j++)
5502                     {
5503                       edge pr_edge = loop_preheader_edge (loop);
5504                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5505                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5506                     }
5507                 }
5508             }
5509         }
5510
5511       phis.release ();
5512       if (nested_in_vect_loop)
5513         {
5514           if (double_reduc)
5515             loop = outer_loop;
5516           else
5517             continue;
5518         }
5519
5520       phis.create (3);
5521       /* Find the loop-closed-use at the loop exit of the original scalar
5522          result.  (The reduction result is expected to have two immediate uses,
5523          one at the latch block, and one at the loop exit).  For double
5524          reductions we are looking for exit phis of the outer loop.  */
5525       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5526         {
5527           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5528             {
5529               if (!is_gimple_debug (USE_STMT (use_p)))
5530                 phis.safe_push (USE_STMT (use_p));
5531             }
5532           else
5533             {
5534               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5535                 {
5536                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
5537
5538                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5539                     {
5540                       if (!flow_bb_inside_loop_p (loop,
5541                                              gimple_bb (USE_STMT (phi_use_p)))
5542                           && !is_gimple_debug (USE_STMT (phi_use_p)))
5543                         phis.safe_push (USE_STMT (phi_use_p));
5544                     }
5545                 }
5546             }
5547         }
5548
5549       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5550         {
5551           /* Replace the uses:  */
5552           orig_name = PHI_RESULT (exit_phi);
5553           scalar_result = scalar_results[k];
5554           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5555             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5556               SET_USE (use_p, scalar_result);
5557         }
5558
5559       phis.release ();
5560     }
5561 }
5562
5563
5564 /* Function is_nonwrapping_integer_induction.
5565
5566    Check if STMT (which is part of loop LOOP) both increments and
5567    does not cause overflow.  */
5568
5569 static bool
5570 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5571 {
5572   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5573   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5574   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5575   tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5576   widest_int ni, max_loop_value, lhs_max;
5577   bool overflow = false;
5578
5579   /* Make sure the loop is integer based.  */
5580   if (TREE_CODE (base) != INTEGER_CST
5581       || TREE_CODE (step) != INTEGER_CST)
5582     return false;
5583
5584   /* Check that the max size of the loop will not wrap.  */
5585
5586   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5587     return true;
5588
5589   if (! max_stmt_executions (loop, &ni))
5590     return false;
5591
5592   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5593                             &overflow);
5594   if (overflow)
5595     return false;
5596
5597   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5598                             TYPE_SIGN (lhs_type), &overflow);
5599   if (overflow)
5600     return false;
5601
5602   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5603           <= TYPE_PRECISION (lhs_type));
5604 }
5605
5606 /* Function vectorizable_reduction.
5607
5608    Check if STMT performs a reduction operation that can be vectorized.
5609    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5610    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5611    Return FALSE if not a vectorizable STMT, TRUE otherwise.
5612
5613    This function also handles reduction idioms (patterns) that have been
5614    recognized in advance during vect_pattern_recog.  In this case, STMT may be
5615    of this form:
5616      X = pattern_expr (arg0, arg1, ..., X)
5617    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5618    sequence that had been detected and replaced by the pattern-stmt (STMT).
5619
5620    This function also handles reduction of condition expressions, for example:
5621      for (int i = 0; i < N; i++)
5622        if (a[i] < value)
5623          last = a[i];
5624    This is handled by vectorising the loop and creating an additional vector
5625    containing the loop indexes for which "a[i] < value" was true.  In the
5626    function epilogue this is reduced to a single max value and then used to
5627    index into the vector of results.
5628
5629    In some cases of reduction patterns, the type of the reduction variable X is
5630    different than the type of the other arguments of STMT.
5631    In such cases, the vectype that is used when transforming STMT into a vector
5632    stmt is different than the vectype that is used to determine the
5633    vectorization factor, because it consists of a different number of elements
5634    than the actual number of elements that are being operated upon in parallel.
5635
5636    For example, consider an accumulation of shorts into an int accumulator.
5637    On some targets it's possible to vectorize this pattern operating on 8
5638    shorts at a time (hence, the vectype for purposes of determining the
5639    vectorization factor should be V8HI); on the other hand, the vectype that
5640    is used to create the vector form is actually V4SI (the type of the result).
5641
5642    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5643    indicates what is the actual level of parallelism (V8HI in the example), so
5644    that the right vectorization factor would be derived.  This vectype
5645    corresponds to the type of arguments to the reduction stmt, and should *NOT*
5646    be used to create the vectorized stmt.  The right vectype for the vectorized
5647    stmt is obtained from the type of the result X:
5648         get_vectype_for_scalar_type (TREE_TYPE (X))
5649
5650    This means that, contrary to "regular" reductions (or "regular" stmts in
5651    general), the following equation:
5652       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5653    does *NOT* necessarily hold for reduction patterns.  */
5654
5655 bool
5656 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5657                         gimple **vec_stmt, slp_tree slp_node,
5658                         slp_instance slp_node_instance)
5659 {
5660   tree vec_dest;
5661   tree scalar_dest;
5662   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5663   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5664   tree vectype_in = NULL_TREE;
5665   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5666   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5667   enum tree_code code, orig_code;
5668   internal_fn reduc_fn;
5669   machine_mode vec_mode;
5670   int op_type;
5671   optab optab;
5672   tree new_temp = NULL_TREE;
5673   gimple *def_stmt;
5674   enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
5675   gimple *cond_reduc_def_stmt = NULL;
5676   enum tree_code cond_reduc_op_code = ERROR_MARK;
5677   tree scalar_type;
5678   bool is_simple_use;
5679   gimple *orig_stmt;
5680   stmt_vec_info orig_stmt_info = NULL;
5681   int i;
5682   int ncopies;
5683   int epilog_copies;
5684   stmt_vec_info prev_stmt_info, prev_phi_info;
5685   bool single_defuse_cycle = false;
5686   gimple *new_stmt = NULL;
5687   int j;
5688   tree ops[3];
5689   enum vect_def_type dts[3];
5690   bool nested_cycle = false, found_nested_cycle_def = false;
5691   bool double_reduc = false;
5692   basic_block def_bb;
5693   struct loop * def_stmt_loop, *outer_loop = NULL;
5694   tree def_arg;
5695   gimple *def_arg_stmt;
5696   auto_vec<tree> vec_oprnds0;
5697   auto_vec<tree> vec_oprnds1;
5698   auto_vec<tree> vec_oprnds2;
5699   auto_vec<tree> vect_defs;
5700   auto_vec<gimple *> phis;
5701   int vec_num;
5702   tree def0, tem;
5703   bool first_p = true;
5704   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5705   tree cond_reduc_val = NULL_TREE;
5706
5707   /* Make sure it was already recognized as a reduction computation.  */
5708   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
5709       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
5710     return false;
5711
5712   if (nested_in_vect_loop_p (loop, stmt))
5713     {
5714       outer_loop = loop;
5715       loop = loop->inner;
5716       nested_cycle = true;
5717     }
5718
5719   /* In case of reduction chain we switch to the first stmt in the chain, but
5720      we don't update STMT_INFO, since only the last stmt is marked as reduction
5721      and has reduction properties.  */
5722   if (GROUP_FIRST_ELEMENT (stmt_info)
5723       && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
5724     {
5725       stmt = GROUP_FIRST_ELEMENT (stmt_info);
5726       first_p = false;
5727     }
5728
5729   if (gimple_code (stmt) == GIMPLE_PHI)
5730     {
5731       /* Analysis is fully done on the reduction stmt invocation.  */
5732       if (! vec_stmt)
5733         {
5734           if (slp_node)
5735             slp_node_instance->reduc_phis = slp_node;
5736
5737           STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5738           return true;
5739         }
5740
5741       gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
5742       if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
5743         reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
5744
5745       gcc_assert (is_gimple_assign (reduc_stmt));
5746       for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
5747         {
5748           tree op = gimple_op (reduc_stmt, k);
5749           if (op == gimple_phi_result (stmt))
5750             continue;
5751           if (k == 1
5752               && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
5753             continue;
5754           if (!vectype_in
5755               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
5756                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
5757             vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
5758           break;
5759         }
5760       gcc_assert (vectype_in);
5761
5762       if (slp_node)
5763         ncopies = 1;
5764       else
5765         ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5766
5767       use_operand_p use_p;
5768       gimple *use_stmt;
5769       if (ncopies > 1
5770           && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
5771               <= vect_used_only_live)
5772           && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
5773           && (use_stmt == reduc_stmt
5774               || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
5775                   == reduc_stmt)))
5776         single_defuse_cycle = true;
5777
5778       /* Create the destination vector  */
5779       scalar_dest = gimple_assign_lhs (reduc_stmt);
5780       vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5781
5782       if (slp_node)
5783         /* The size vect_schedule_slp_instance computes is off for us.  */
5784         vec_num = vect_get_num_vectors
5785           (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5786            * SLP_TREE_SCALAR_STMTS (slp_node).length (),
5787            vectype_in);
5788       else
5789         vec_num = 1;
5790
5791       /* Generate the reduction PHIs upfront.  */
5792       prev_phi_info = NULL;
5793       for (j = 0; j < ncopies; j++)
5794         {
5795           if (j == 0 || !single_defuse_cycle)
5796             {
5797               for (i = 0; i < vec_num; i++)
5798                 {
5799                   /* Create the reduction-phi that defines the reduction
5800                      operand.  */
5801                   gimple *new_phi = create_phi_node (vec_dest, loop->header);
5802                   set_vinfo_for_stmt (new_phi,
5803                                       new_stmt_vec_info (new_phi, loop_vinfo));
5804
5805                   if (slp_node)
5806                     SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
5807                   else
5808                     {
5809                       if (j == 0)
5810                         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
5811                       else
5812                         STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5813                       prev_phi_info = vinfo_for_stmt (new_phi);
5814                     }
5815                 }
5816             }
5817         }
5818
5819       return true;
5820     }
5821
5822   /* 1. Is vectorizable reduction?  */
5823   /* Not supportable if the reduction variable is used in the loop, unless
5824      it's a reduction chain.  */
5825   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5826       && !GROUP_FIRST_ELEMENT (stmt_info))
5827     return false;
5828
5829   /* Reductions that are not used even in an enclosing outer-loop,
5830      are expected to be "live" (used out of the loop).  */
5831   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5832       && !STMT_VINFO_LIVE_P (stmt_info))
5833     return false;
5834
5835   /* 2. Has this been recognized as a reduction pattern?
5836
5837      Check if STMT represents a pattern that has been recognized
5838      in earlier analysis stages.  For stmts that represent a pattern,
5839      the STMT_VINFO_RELATED_STMT field records the last stmt in
5840      the original sequence that constitutes the pattern.  */
5841
5842   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
5843   if (orig_stmt)
5844     {
5845       orig_stmt_info = vinfo_for_stmt (orig_stmt);
5846       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5847       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5848     }
5849
5850   /* 3. Check the operands of the operation.  The first operands are defined
5851         inside the loop body. The last operand is the reduction variable,
5852         which is defined by the loop-header-phi.  */
5853
5854   gcc_assert (is_gimple_assign (stmt));
5855
5856   /* Flatten RHS.  */
5857   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
5858     {
5859     case GIMPLE_BINARY_RHS:
5860       code = gimple_assign_rhs_code (stmt);
5861       op_type = TREE_CODE_LENGTH (code);
5862       gcc_assert (op_type == binary_op);
5863       ops[0] = gimple_assign_rhs1 (stmt);
5864       ops[1] = gimple_assign_rhs2 (stmt);
5865       break;
5866
5867     case GIMPLE_TERNARY_RHS:
5868       code = gimple_assign_rhs_code (stmt);
5869       op_type = TREE_CODE_LENGTH (code);
5870       gcc_assert (op_type == ternary_op);
5871       ops[0] = gimple_assign_rhs1 (stmt);
5872       ops[1] = gimple_assign_rhs2 (stmt);
5873       ops[2] = gimple_assign_rhs3 (stmt);
5874       break;
5875
5876     case GIMPLE_UNARY_RHS:
5877       return false;
5878
5879     default:
5880       gcc_unreachable ();
5881     }
5882
5883   if (code == COND_EXPR && slp_node)
5884     return false;
5885
5886   scalar_dest = gimple_assign_lhs (stmt);
5887   scalar_type = TREE_TYPE (scalar_dest);
5888   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5889       && !SCALAR_FLOAT_TYPE_P (scalar_type))
5890     return false;
5891
5892   /* Do not try to vectorize bit-precision reductions.  */
5893   if (!type_has_mode_precision_p (scalar_type))
5894     return false;
5895
5896   /* All uses but the last are expected to be defined in the loop.
5897      The last use is the reduction variable.  In case of nested cycle this
5898      assumption is not true: we use reduc_index to record the index of the
5899      reduction variable.  */
5900   gimple *reduc_def_stmt = NULL;
5901   int reduc_index = -1;
5902   for (i = 0; i < op_type; i++)
5903     {
5904       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
5905       if (i == 0 && code == COND_EXPR)
5906         continue;
5907
5908       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
5909                                           &def_stmt, &dts[i], &tem);
5910       dt = dts[i];
5911       gcc_assert (is_simple_use);
5912       if (dt == vect_reduction_def)
5913         {
5914           reduc_def_stmt = def_stmt;
5915           reduc_index = i;
5916           continue;
5917         }
5918       else if (tem)
5919         {
5920           /* To properly compute ncopies we are interested in the widest
5921              input type in case we're looking at a widening accumulation.  */
5922           if (!vectype_in
5923               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
5924                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
5925             vectype_in = tem;
5926         }
5927
5928       if (dt != vect_internal_def
5929           && dt != vect_external_def
5930           && dt != vect_constant_def
5931           && dt != vect_induction_def
5932           && !(dt == vect_nested_cycle && nested_cycle))
5933         return false;
5934
5935       if (dt == vect_nested_cycle)
5936         {
5937           found_nested_cycle_def = true;
5938           reduc_def_stmt = def_stmt;
5939           reduc_index = i;
5940         }
5941
5942       if (i == 1 && code == COND_EXPR)
5943         {
5944           /* Record how value of COND_EXPR is defined.  */
5945           if (dt == vect_constant_def)
5946             {
5947               cond_reduc_dt = dt;
5948               cond_reduc_val = ops[i];
5949             }
5950           if (dt == vect_induction_def
5951               && def_stmt != NULL
5952               && is_nonwrapping_integer_induction (def_stmt, loop))
5953             {
5954               cond_reduc_dt = dt;
5955               cond_reduc_def_stmt = def_stmt;
5956             }
5957         }
5958     }
5959
5960   if (!vectype_in)
5961     vectype_in = vectype_out;
5962
5963   /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
5964      directly used in stmt.  */
5965   if (reduc_index == -1)
5966     {
5967       if (orig_stmt)
5968         reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
5969       else
5970         reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
5971     }
5972
5973   if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5974     return false;
5975
5976   if (!(reduc_index == -1
5977         || dts[reduc_index] == vect_reduction_def
5978         || dts[reduc_index] == vect_nested_cycle
5979         || ((dts[reduc_index] == vect_internal_def
5980              || dts[reduc_index] == vect_external_def
5981              || dts[reduc_index] == vect_constant_def
5982              || dts[reduc_index] == vect_induction_def)
5983             && nested_cycle && found_nested_cycle_def)))
5984     {
5985       /* For pattern recognized stmts, orig_stmt might be a reduction,
5986          but some helper statements for the pattern might not, or
5987          might be COND_EXPRs with reduction uses in the condition.  */
5988       gcc_assert (orig_stmt);
5989       return false;
5990     }
5991
5992   stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
5993   enum vect_reduction_type v_reduc_type
5994     = STMT_VINFO_REDUC_TYPE (reduc_def_info);
5995   gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
5996
5997   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
5998   /* If we have a condition reduction, see if we can simplify it further.  */
5999   if (v_reduc_type == COND_REDUCTION)
6000     {
6001       if (cond_reduc_dt == vect_induction_def)
6002         {
6003           stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6004           tree base
6005             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6006           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6007
6008           gcc_assert (TREE_CODE (base) == INTEGER_CST
6009                       && TREE_CODE (step) == INTEGER_CST);
6010           cond_reduc_val = NULL_TREE;
6011           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6012              above base; punt if base is the minimum value of the type for
6013              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
6014           if (tree_int_cst_sgn (step) == -1)
6015             {
6016               cond_reduc_op_code = MIN_EXPR;
6017               if (tree_int_cst_sgn (base) == -1)
6018                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6019               else if (tree_int_cst_lt (base,
6020                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
6021                 cond_reduc_val
6022                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
6023             }
6024           else
6025             {
6026               cond_reduc_op_code = MAX_EXPR;
6027               if (tree_int_cst_sgn (base) == 1)
6028                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6029               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6030                                         base))
6031                 cond_reduc_val
6032                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
6033             }
6034           if (cond_reduc_val)
6035             {
6036               if (dump_enabled_p ())
6037                 dump_printf_loc (MSG_NOTE, vect_location,
6038                                  "condition expression based on "
6039                                  "integer induction.\n");
6040               STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6041                 = INTEGER_INDUC_COND_REDUCTION;
6042             }
6043         }
6044
6045       /* Loop peeling modifies initial value of reduction PHI, which
6046          makes the reduction stmt to be transformed different to the
6047          original stmt analyzed.  We need to record reduction code for
6048          CONST_COND_REDUCTION type reduction at analyzing stage, thus
6049          it can be used directly at transform stage.  */
6050       if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6051           || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6052         {
6053           /* Also set the reduction type to CONST_COND_REDUCTION.  */
6054           gcc_assert (cond_reduc_dt == vect_constant_def);
6055           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6056         }
6057       else if (cond_reduc_dt == vect_constant_def)
6058         {
6059           enum vect_def_type cond_initial_dt;
6060           gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6061           tree cond_initial_val
6062             = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6063
6064           gcc_assert (cond_reduc_val != NULL_TREE);
6065           vect_is_simple_use (cond_initial_val, loop_vinfo,
6066                               &def_stmt, &cond_initial_dt);
6067           if (cond_initial_dt == vect_constant_def
6068               && types_compatible_p (TREE_TYPE (cond_initial_val),
6069                                      TREE_TYPE (cond_reduc_val)))
6070             {
6071               tree e = fold_binary (LE_EXPR, boolean_type_node,
6072                                     cond_initial_val, cond_reduc_val);
6073               if (e && (integer_onep (e) || integer_zerop (e)))
6074                 {
6075                   if (dump_enabled_p ())
6076                     dump_printf_loc (MSG_NOTE, vect_location,
6077                                      "condition expression based on "
6078                                      "compile time constant.\n");
6079                   /* Record reduction code at analysis stage.  */
6080                   STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6081                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6082                   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6083                     = CONST_COND_REDUCTION;
6084                 }
6085             }
6086         }
6087     }
6088
6089   if (orig_stmt)
6090     gcc_assert (tmp == orig_stmt
6091                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
6092   else
6093     /* We changed STMT to be the first stmt in reduction chain, hence we
6094        check that in this case the first element in the chain is STMT.  */
6095     gcc_assert (stmt == tmp
6096                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6097
6098   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6099     return false;
6100
6101   if (slp_node)
6102     ncopies = 1;
6103   else
6104     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6105
6106   gcc_assert (ncopies >= 1);
6107
6108   vec_mode = TYPE_MODE (vectype_in);
6109   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6110
6111   if (code == COND_EXPR)
6112     {
6113       /* Only call during the analysis stage, otherwise we'll lose
6114          STMT_VINFO_TYPE.  */
6115       if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6116                                                 ops[reduc_index], 0, NULL))
6117         {
6118           if (dump_enabled_p ())
6119             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6120                              "unsupported condition in reduction\n");
6121           return false;
6122         }
6123     }
6124   else
6125     {
6126       /* 4. Supportable by target?  */
6127
6128       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6129           || code == LROTATE_EXPR || code == RROTATE_EXPR)
6130         {
6131           /* Shifts and rotates are only supported by vectorizable_shifts,
6132              not vectorizable_reduction.  */
6133           if (dump_enabled_p ())
6134             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6135                              "unsupported shift or rotation.\n");
6136           return false;
6137         }
6138
6139       /* 4.1. check support for the operation in the loop  */
6140       optab = optab_for_tree_code (code, vectype_in, optab_default);
6141       if (!optab)
6142         {
6143           if (dump_enabled_p ())
6144             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6145                              "no optab.\n");
6146
6147           return false;
6148         }
6149
6150       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6151         {
6152           if (dump_enabled_p ())
6153             dump_printf (MSG_NOTE, "op not supported by target.\n");
6154
6155           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
6156               || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6157             return false;
6158
6159           if (dump_enabled_p ())
6160             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6161         }
6162
6163       /* Worthwhile without SIMD support?  */
6164       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6165           && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6166         {
6167           if (dump_enabled_p ())
6168             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6169                              "not worthwhile without SIMD support.\n");
6170
6171           return false;
6172         }
6173     }
6174
6175   /* 4.2. Check support for the epilog operation.
6176
6177           If STMT represents a reduction pattern, then the type of the
6178           reduction variable may be different than the type of the rest
6179           of the arguments.  For example, consider the case of accumulation
6180           of shorts into an int accumulator; The original code:
6181                         S1: int_a = (int) short_a;
6182           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
6183
6184           was replaced with:
6185                         STMT: int_acc = widen_sum <short_a, int_acc>
6186
6187           This means that:
6188           1. The tree-code that is used to create the vector operation in the
6189              epilog code (that reduces the partial results) is not the
6190              tree-code of STMT, but is rather the tree-code of the original
6191              stmt from the pattern that STMT is replacing.  I.e, in the example
6192              above we want to use 'widen_sum' in the loop, but 'plus' in the
6193              epilog.
6194           2. The type (mode) we use to check available target support
6195              for the vector operation to be created in the *epilog*, is
6196              determined by the type of the reduction variable (in the example
6197              above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6198              However the type (mode) we use to check available target support
6199              for the vector operation to be created *inside the loop*, is
6200              determined by the type of the other arguments to STMT (in the
6201              example we'd check this: optab_handler (widen_sum_optab,
6202              vect_short_mode)).
6203
6204           This is contrary to "regular" reductions, in which the types of all
6205           the arguments are the same as the type of the reduction variable.
6206           For "regular" reductions we can therefore use the same vector type
6207           (and also the same tree-code) when generating the epilog code and
6208           when generating the code inside the loop.  */
6209
6210   if (orig_stmt)
6211     {
6212       /* This is a reduction pattern: get the vectype from the type of the
6213          reduction variable, and get the tree-code from orig_stmt.  */
6214       gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6215                   == TREE_CODE_REDUCTION);
6216       orig_code = gimple_assign_rhs_code (orig_stmt);
6217       gcc_assert (vectype_out);
6218       vec_mode = TYPE_MODE (vectype_out);
6219     }
6220   else
6221     {
6222       /* Regular reduction: the same vectype and tree-code as used for
6223          the vector code inside the loop can be used for the epilog code.  */
6224       orig_code = code;
6225
6226       if (code == MINUS_EXPR)
6227         orig_code = PLUS_EXPR;
6228
6229       /* For simple condition reductions, replace with the actual expression
6230          we want to base our reduction around.  */
6231       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == CONST_COND_REDUCTION)
6232         {
6233           orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6234           gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6235         }
6236       else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6237                == INTEGER_INDUC_COND_REDUCTION)
6238         orig_code = cond_reduc_op_code;
6239     }
6240
6241   if (nested_cycle)
6242     {
6243       def_bb = gimple_bb (reduc_def_stmt);
6244       def_stmt_loop = def_bb->loop_father;
6245       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
6246                                        loop_preheader_edge (def_stmt_loop));
6247       if (TREE_CODE (def_arg) == SSA_NAME
6248           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
6249           && gimple_code (def_arg_stmt) == GIMPLE_PHI
6250           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
6251           && vinfo_for_stmt (def_arg_stmt)
6252           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
6253               == vect_double_reduction_def)
6254         double_reduc = true;
6255     }
6256
6257   reduc_fn = IFN_LAST;
6258
6259   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != COND_REDUCTION)
6260     {
6261       if (reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6262         {
6263           if (reduc_fn != IFN_LAST
6264               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6265                                                   OPTIMIZE_FOR_SPEED))
6266             {
6267               if (dump_enabled_p ())
6268                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6269                                  "reduc op not supported by target.\n");
6270
6271               reduc_fn = IFN_LAST;
6272             }
6273         }
6274       else
6275         {
6276           if (!nested_cycle || double_reduc)
6277             {
6278               if (dump_enabled_p ())
6279                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6280                                  "no reduc code for scalar code.\n");
6281
6282               return false;
6283             }
6284         }
6285     }
6286   else
6287     {
6288       int scalar_precision
6289         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6290       cr_index_scalar_type = make_unsigned_type (scalar_precision);
6291       cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6292                                                 nunits_out);
6293
6294       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6295                                           OPTIMIZE_FOR_SPEED))
6296         reduc_fn = IFN_REDUC_MAX;
6297     }
6298
6299   if (reduc_fn == IFN_LAST && !nunits_out.is_constant ())
6300     {
6301       if (dump_enabled_p ())
6302         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6303                          "missing target support for reduction on"
6304                          " variable-length vectors.\n");
6305       return false;
6306     }
6307
6308   if ((double_reduc
6309        || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != TREE_CODE_REDUCTION)
6310       && ncopies > 1)
6311     {
6312       if (dump_enabled_p ())
6313         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6314                          "multiple types in double reduction or condition "
6315                          "reduction.\n");
6316       return false;
6317     }
6318
6319   if (double_reduc && !nunits_out.is_constant ())
6320     {
6321       /* The current double-reduction code creates the initial value
6322          element-by-element.  */
6323       if (dump_enabled_p ())
6324         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6325                          "double reduction not supported for variable-length"
6326                          " vectors.\n");
6327       return false;
6328     }
6329
6330   if (slp_node && !nunits_out.is_constant ())
6331     {
6332       /* The current SLP code creates the initial value element-by-element.  */
6333       if (dump_enabled_p ())
6334         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6335                          "SLP reduction not supported for variable-length"
6336                          " vectors.\n");
6337       return false;
6338     }
6339
6340   /* In case of widening multiplication by a constant, we update the type
6341      of the constant to be the type of the other operand.  We check that the
6342      constant fits the type in the pattern recognition pass.  */
6343   if (code == DOT_PROD_EXPR
6344       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6345     {
6346       if (TREE_CODE (ops[0]) == INTEGER_CST)
6347         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6348       else if (TREE_CODE (ops[1]) == INTEGER_CST)
6349         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6350       else
6351         {
6352           if (dump_enabled_p ())
6353             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6354                              "invalid types in dot-prod\n");
6355
6356           return false;
6357         }
6358     }
6359
6360   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
6361     {
6362       widest_int ni;
6363
6364       if (! max_loop_iterations (loop, &ni))
6365         {
6366           if (dump_enabled_p ())
6367             dump_printf_loc (MSG_NOTE, vect_location,
6368                              "loop count not known, cannot create cond "
6369                              "reduction.\n");
6370           return false;
6371         }
6372       /* Convert backedges to iterations.  */
6373       ni += 1;
6374
6375       /* The additional index will be the same type as the condition.  Check
6376          that the loop can fit into this less one (because we'll use up the
6377          zero slot for when there are no matches).  */
6378       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6379       if (wi::geu_p (ni, wi::to_widest (max_index)))
6380         {
6381           if (dump_enabled_p ())
6382             dump_printf_loc (MSG_NOTE, vect_location,
6383                              "loop size is greater than data size.\n");
6384           return false;
6385         }
6386     }
6387
6388   /* In case the vectorization factor (VF) is bigger than the number
6389      of elements that we can fit in a vectype (nunits), we have to generate
6390      more than one vector stmt - i.e - we need to "unroll" the
6391      vector stmt by a factor VF/nunits.  For more details see documentation
6392      in vectorizable_operation.  */
6393
6394   /* If the reduction is used in an outer loop we need to generate
6395      VF intermediate results, like so (e.g. for ncopies=2):
6396         r0 = phi (init, r0)
6397         r1 = phi (init, r1)
6398         r0 = x0 + r0;
6399         r1 = x1 + r1;
6400     (i.e. we generate VF results in 2 registers).
6401     In this case we have a separate def-use cycle for each copy, and therefore
6402     for each copy we get the vector def for the reduction variable from the
6403     respective phi node created for this copy.
6404
6405     Otherwise (the reduction is unused in the loop nest), we can combine
6406     together intermediate results, like so (e.g. for ncopies=2):
6407         r = phi (init, r)
6408         r = x0 + r;
6409         r = x1 + r;
6410    (i.e. we generate VF/2 results in a single register).
6411    In this case for each copy we get the vector def for the reduction variable
6412    from the vectorized reduction operation generated in the previous iteration.
6413
6414    This only works when we see both the reduction PHI and its only consumer
6415    in vectorizable_reduction and there are no intermediate stmts
6416    participating.  */
6417   use_operand_p use_p;
6418   gimple *use_stmt;
6419   if (ncopies > 1
6420       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6421       && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
6422       && (use_stmt == stmt
6423           || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
6424     {
6425       single_defuse_cycle = true;
6426       epilog_copies = 1;
6427     }
6428   else
6429     epilog_copies = ncopies;
6430
6431   /* If the reduction stmt is one of the patterns that have lane
6432      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
6433   if ((ncopies > 1
6434        && ! single_defuse_cycle)
6435       && (code == DOT_PROD_EXPR
6436           || code == WIDEN_SUM_EXPR
6437           || code == SAD_EXPR))
6438     {
6439       if (dump_enabled_p ())
6440         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6441                          "multi def-use cycle not possible for lane-reducing "
6442                          "reduction operation\n");
6443       return false;
6444     }
6445
6446   if (!vec_stmt) /* transformation not required.  */
6447     {
6448       if (first_p)
6449         vect_model_reduction_cost (stmt_info, reduc_fn, ncopies);
6450       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6451       return true;
6452     }
6453
6454   /* Transform.  */
6455
6456   if (dump_enabled_p ())
6457     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6458
6459   /* FORNOW: Multiple types are not supported for condition.  */
6460   if (code == COND_EXPR)
6461     gcc_assert (ncopies == 1);
6462
6463   /* Create the destination vector  */
6464   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6465
6466   prev_stmt_info = NULL;
6467   prev_phi_info = NULL;
6468   if (slp_node)
6469     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6470   else
6471     {
6472       vec_num = 1;
6473       vec_oprnds0.create (1);
6474       vec_oprnds1.create (1);
6475       if (op_type == ternary_op)
6476         vec_oprnds2.create (1);
6477     }
6478
6479   phis.create (vec_num);
6480   vect_defs.create (vec_num);
6481   if (!slp_node)
6482     vect_defs.quick_push (NULL_TREE);
6483
6484   if (slp_node)
6485     phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
6486   else
6487     phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
6488
6489   for (j = 0; j < ncopies; j++)
6490     {
6491       if (code == COND_EXPR)
6492         {
6493           gcc_assert (!slp_node);
6494           vectorizable_condition (stmt, gsi, vec_stmt,
6495                                   PHI_RESULT (phis[0]),
6496                                   reduc_index, NULL);
6497           /* Multiple types are not supported for condition.  */
6498           break;
6499         }
6500
6501       /* Handle uses.  */
6502       if (j == 0)
6503         {
6504           if (slp_node)
6505             {
6506               /* Get vec defs for all the operands except the reduction index,
6507                  ensuring the ordering of the ops in the vector is kept.  */
6508               auto_vec<tree, 3> slp_ops;
6509               auto_vec<vec<tree>, 3> vec_defs;
6510
6511               slp_ops.quick_push (ops[0]);
6512               slp_ops.quick_push (ops[1]);
6513               if (op_type == ternary_op)
6514                 slp_ops.quick_push (ops[2]);
6515
6516               vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
6517
6518               vec_oprnds0.safe_splice (vec_defs[0]);
6519               vec_defs[0].release ();
6520               vec_oprnds1.safe_splice (vec_defs[1]);
6521               vec_defs[1].release ();
6522               if (op_type == ternary_op)
6523                 {
6524                   vec_oprnds2.safe_splice (vec_defs[2]);
6525                   vec_defs[2].release ();
6526                 }
6527             }
6528           else
6529             {
6530               vec_oprnds0.quick_push
6531                 (vect_get_vec_def_for_operand (ops[0], stmt));
6532               vec_oprnds1.quick_push
6533                 (vect_get_vec_def_for_operand (ops[1], stmt));
6534               if (op_type == ternary_op)
6535                 vec_oprnds2.quick_push
6536                   (vect_get_vec_def_for_operand (ops[2], stmt));
6537             }
6538         }
6539       else
6540         {
6541           if (!slp_node)
6542             {
6543               gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
6544
6545               if (single_defuse_cycle && reduc_index == 0)
6546                 vec_oprnds0[0] = gimple_assign_lhs (new_stmt);
6547               else
6548                 vec_oprnds0[0]
6549                   = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
6550               if (single_defuse_cycle && reduc_index == 1)
6551                 vec_oprnds1[0] = gimple_assign_lhs (new_stmt);
6552               else
6553                 vec_oprnds1[0]
6554                   = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
6555               if (op_type == ternary_op)
6556                 {
6557                   if (single_defuse_cycle && reduc_index == 2)
6558                     vec_oprnds2[0] = gimple_assign_lhs (new_stmt);
6559                   else
6560                     vec_oprnds2[0]
6561                       = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
6562                 }
6563             }
6564         }
6565
6566       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6567         {
6568           tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
6569           if (op_type == ternary_op)
6570             vop[2] = vec_oprnds2[i];
6571
6572           new_temp = make_ssa_name (vec_dest, new_stmt);
6573           new_stmt = gimple_build_assign (new_temp, code,
6574                                           vop[0], vop[1], vop[2]);
6575           vect_finish_stmt_generation (stmt, new_stmt, gsi);
6576
6577           if (slp_node)
6578             {
6579               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6580               vect_defs.quick_push (new_temp);
6581             }
6582           else
6583             vect_defs[0] = new_temp;
6584         }
6585
6586       if (slp_node)
6587         continue;
6588
6589       if (j == 0)
6590         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6591       else
6592         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6593
6594       prev_stmt_info = vinfo_for_stmt (new_stmt);
6595     }
6596
6597   /* Finalize the reduction-phi (set its arguments) and create the
6598      epilog reduction code.  */
6599   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
6600     vect_defs[0] = gimple_assign_lhs (*vec_stmt);
6601
6602   vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
6603                                     epilog_copies, reduc_fn, phis,
6604                                     double_reduc, slp_node, slp_node_instance,
6605                                     cond_reduc_val, cond_reduc_op_code);
6606
6607   return true;
6608 }
6609
6610 /* Function vect_min_worthwhile_factor.
6611
6612    For a loop where we could vectorize the operation indicated by CODE,
6613    return the minimum vectorization factor that makes it worthwhile
6614    to use generic vectors.  */
6615 static unsigned int
6616 vect_min_worthwhile_factor (enum tree_code code)
6617 {
6618   switch (code)
6619     {
6620     case PLUS_EXPR:
6621     case MINUS_EXPR:
6622     case NEGATE_EXPR:
6623       return 4;
6624
6625     case BIT_AND_EXPR:
6626     case BIT_IOR_EXPR:
6627     case BIT_XOR_EXPR:
6628     case BIT_NOT_EXPR:
6629       return 2;
6630
6631     default:
6632       return INT_MAX;
6633     }
6634 }
6635
6636 /* Return true if VINFO indicates we are doing loop vectorization and if
6637    it is worth decomposing CODE operations into scalar operations for
6638    that loop's vectorization factor.  */
6639
6640 bool
6641 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
6642 {
6643   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6644   unsigned HOST_WIDE_INT value;
6645   return (loop_vinfo
6646           && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
6647           && value >= vect_min_worthwhile_factor (code));
6648 }
6649
6650 /* Function vectorizable_induction
6651
6652    Check if PHI performs an induction computation that can be vectorized.
6653    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
6654    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
6655    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
6656
6657 bool
6658 vectorizable_induction (gimple *phi,
6659                         gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6660                         gimple **vec_stmt, slp_tree slp_node)
6661 {
6662   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
6663   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6664   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6665   unsigned ncopies;
6666   bool nested_in_vect_loop = false;
6667   struct loop *iv_loop;
6668   tree vec_def;
6669   edge pe = loop_preheader_edge (loop);
6670   basic_block new_bb;
6671   tree new_vec, vec_init, vec_step, t;
6672   tree new_name;
6673   gimple *new_stmt;
6674   gphi *induction_phi;
6675   tree induc_def, vec_dest;
6676   tree init_expr, step_expr;
6677   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6678   unsigned i;
6679   tree expr;
6680   gimple_seq stmts;
6681   imm_use_iterator imm_iter;
6682   use_operand_p use_p;
6683   gimple *exit_phi;
6684   edge latch_e;
6685   tree loop_arg;
6686   gimple_stmt_iterator si;
6687   basic_block bb = gimple_bb (phi);
6688
6689   if (gimple_code (phi) != GIMPLE_PHI)
6690     return false;
6691
6692   if (!STMT_VINFO_RELEVANT_P (stmt_info))
6693     return false;
6694
6695   /* Make sure it was recognized as induction computation.  */
6696   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
6697     return false;
6698
6699   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6700   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
6701
6702   if (slp_node)
6703     ncopies = 1;
6704   else
6705     ncopies = vect_get_num_copies (loop_vinfo, vectype);
6706   gcc_assert (ncopies >= 1);
6707
6708   /* FORNOW. These restrictions should be relaxed.  */
6709   if (nested_in_vect_loop_p (loop, phi))
6710     {
6711       imm_use_iterator imm_iter;
6712       use_operand_p use_p;
6713       gimple *exit_phi;
6714       edge latch_e;
6715       tree loop_arg;
6716
6717       if (ncopies > 1)
6718         {
6719           if (dump_enabled_p ())
6720             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6721                              "multiple types in nested loop.\n");
6722           return false;
6723         }
6724
6725       /* FORNOW: outer loop induction with SLP not supported.  */
6726       if (STMT_SLP_TYPE (stmt_info))
6727         return false;
6728
6729       exit_phi = NULL;
6730       latch_e = loop_latch_edge (loop->inner);
6731       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6732       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6733         {
6734           gimple *use_stmt = USE_STMT (use_p);
6735           if (is_gimple_debug (use_stmt))
6736             continue;
6737
6738           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
6739             {
6740               exit_phi = use_stmt;
6741               break;
6742             }
6743         }
6744       if (exit_phi)
6745         {
6746           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
6747           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
6748                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
6749             {
6750               if (dump_enabled_p ())
6751                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6752                                  "inner-loop induction only used outside "
6753                                  "of the outer vectorized loop.\n");
6754               return false;
6755             }
6756         }
6757
6758       nested_in_vect_loop = true;
6759       iv_loop = loop->inner;
6760     }
6761   else
6762     iv_loop = loop;
6763   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
6764
6765   if (slp_node && !nunits.is_constant ())
6766     {
6767       /* The current SLP code creates the initial value element-by-element.  */
6768       if (dump_enabled_p ())
6769         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6770                          "SLP induction not supported for variable-length"
6771                          " vectors.\n");
6772       return false;
6773     }
6774
6775   if (!vec_stmt) /* transformation not required.  */
6776     {
6777       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
6778       if (dump_enabled_p ())
6779         dump_printf_loc (MSG_NOTE, vect_location,
6780                          "=== vectorizable_induction ===\n");
6781       vect_model_induction_cost (stmt_info, ncopies);
6782       return true;
6783     }
6784
6785   /* Transform.  */
6786
6787   /* Compute a vector variable, initialized with the first VF values of
6788      the induction variable.  E.g., for an iv with IV_PHI='X' and
6789      evolution S, for a vector of 4 units, we want to compute:
6790      [X, X + S, X + 2*S, X + 3*S].  */
6791
6792   if (dump_enabled_p ())
6793     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
6794
6795   latch_e = loop_latch_edge (iv_loop);
6796   loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6797
6798   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
6799   gcc_assert (step_expr != NULL_TREE);
6800
6801   pe = loop_preheader_edge (iv_loop);
6802   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
6803                                      loop_preheader_edge (iv_loop));
6804
6805   /* Convert the step to the desired type.  */
6806   stmts = NULL;
6807   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
6808   if (stmts)
6809     {
6810       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6811       gcc_assert (!new_bb);
6812     }
6813
6814   /* Find the first insertion point in the BB.  */
6815   si = gsi_after_labels (bb);
6816
6817   /* For SLP induction we have to generate several IVs as for example
6818      with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
6819      [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
6820      [VF*S, VF*S, VF*S, VF*S] for all.  */
6821   if (slp_node)
6822     {
6823       /* Enforced above.  */
6824       unsigned int const_nunits = nunits.to_constant ();
6825
6826       /* Convert the init to the desired type.  */
6827       stmts = NULL;
6828       init_expr = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
6829       if (stmts)
6830         {
6831           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6832           gcc_assert (!new_bb);
6833         }
6834
6835       /* Generate [VF*S, VF*S, ... ].  */
6836       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6837         {
6838           expr = build_int_cst (integer_type_node, vf);
6839           expr = fold_convert (TREE_TYPE (step_expr), expr);
6840         }
6841       else
6842         expr = build_int_cst (TREE_TYPE (step_expr), vf);
6843       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6844                               expr, step_expr);
6845       if (! CONSTANT_CLASS_P (new_name))
6846         new_name = vect_init_vector (phi, new_name,
6847                                      TREE_TYPE (step_expr), NULL);
6848       new_vec = build_vector_from_val (vectype, new_name);
6849       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6850
6851       /* Now generate the IVs.  */
6852       unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6853       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6854       unsigned elts = const_nunits * nvects;
6855       unsigned nivs = least_common_multiple (group_size,
6856                                              const_nunits) / const_nunits;
6857       gcc_assert (elts % group_size == 0);
6858       tree elt = init_expr;
6859       unsigned ivn;
6860       for (ivn = 0; ivn < nivs; ++ivn)
6861         {
6862           tree_vector_builder elts (vectype, const_nunits, 1);
6863           stmts = NULL;
6864           for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
6865             {
6866               if (ivn*const_nunits + eltn >= group_size
6867                   && (ivn * const_nunits + eltn) % group_size == 0)
6868                 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
6869                                     elt, step_expr);
6870               elts.quick_push (elt);
6871             }
6872           vec_init = gimple_build_vector (&stmts, &elts);
6873           if (stmts)
6874             {
6875               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6876               gcc_assert (!new_bb);
6877             }
6878
6879           /* Create the induction-phi that defines the induction-operand.  */
6880           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
6881           induction_phi = create_phi_node (vec_dest, iv_loop->header);
6882           set_vinfo_for_stmt (induction_phi,
6883                               new_stmt_vec_info (induction_phi, loop_vinfo));
6884           induc_def = PHI_RESULT (induction_phi);
6885
6886           /* Create the iv update inside the loop  */
6887           vec_def = make_ssa_name (vec_dest);
6888           new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
6889           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6890           set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
6891
6892           /* Set the arguments of the phi node:  */
6893           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
6894           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
6895                        UNKNOWN_LOCATION);
6896
6897           SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
6898         }
6899
6900       /* Re-use IVs when we can.  */
6901       if (ivn < nvects)
6902         {
6903           unsigned vfp
6904             = least_common_multiple (group_size, const_nunits) / group_size;
6905           /* Generate [VF'*S, VF'*S, ... ].  */
6906           if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6907             {
6908               expr = build_int_cst (integer_type_node, vfp);
6909               expr = fold_convert (TREE_TYPE (step_expr), expr);
6910             }
6911           else
6912             expr = build_int_cst (TREE_TYPE (step_expr), vfp);
6913           new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6914                                   expr, step_expr);
6915           if (! CONSTANT_CLASS_P (new_name))
6916             new_name = vect_init_vector (phi, new_name,
6917                                          TREE_TYPE (step_expr), NULL);
6918           new_vec = build_vector_from_val (vectype, new_name);
6919           vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6920           for (; ivn < nvects; ++ivn)
6921             {
6922               gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
6923               tree def;
6924               if (gimple_code (iv) == GIMPLE_PHI)
6925                 def = gimple_phi_result (iv);
6926               else
6927                 def = gimple_assign_lhs (iv);
6928               new_stmt = gimple_build_assign (make_ssa_name (vectype),
6929                                               PLUS_EXPR,
6930                                               def, vec_step);
6931               if (gimple_code (iv) == GIMPLE_PHI)
6932                 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6933               else
6934                 {
6935                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
6936                   gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
6937                 }
6938               set_vinfo_for_stmt (new_stmt,
6939                                   new_stmt_vec_info (new_stmt, loop_vinfo));
6940               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6941             }
6942         }
6943
6944       return true;
6945     }
6946
6947   /* Create the vector that holds the initial_value of the induction.  */
6948   if (nested_in_vect_loop)
6949     {
6950       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
6951          been created during vectorization of previous stmts.  We obtain it
6952          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
6953       vec_init = vect_get_vec_def_for_operand (init_expr, phi);
6954       /* If the initial value is not of proper type, convert it.  */
6955       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
6956         {
6957           new_stmt
6958             = gimple_build_assign (vect_get_new_ssa_name (vectype,
6959                                                           vect_simple_var,
6960                                                           "vec_iv_"),
6961                                    VIEW_CONVERT_EXPR,
6962                                    build1 (VIEW_CONVERT_EXPR, vectype,
6963                                            vec_init));
6964           vec_init = gimple_assign_lhs (new_stmt);
6965           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
6966                                                  new_stmt);
6967           gcc_assert (!new_bb);
6968           set_vinfo_for_stmt (new_stmt,
6969                               new_stmt_vec_info (new_stmt, loop_vinfo));
6970         }
6971     }
6972   else
6973     {
6974       /* iv_loop is the loop to be vectorized. Create:
6975          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
6976       stmts = NULL;
6977       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
6978
6979       unsigned HOST_WIDE_INT const_nunits;
6980       if (nunits.is_constant (&const_nunits))
6981         {
6982           tree_vector_builder elts (vectype, const_nunits, 1);
6983           elts.quick_push (new_name);
6984           for (i = 1; i < const_nunits; i++)
6985             {
6986               /* Create: new_name_i = new_name + step_expr  */
6987               new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
6988                                        new_name, step_expr);
6989               elts.quick_push (new_name);
6990             }
6991           /* Create a vector from [new_name_0, new_name_1, ...,
6992              new_name_nunits-1]  */
6993           vec_init = gimple_build_vector (&stmts, &elts);
6994         }
6995       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
6996         /* Build the initial value directly from a VEC_SERIES_EXPR.  */
6997         vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
6998                                  new_name, step_expr);
6999       else
7000         {
7001           /* Build:
7002                 [base, base, base, ...]
7003                 + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
7004           gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7005           gcc_assert (flag_associative_math);
7006           tree index = build_index_vector (vectype, 0, 1);
7007           tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7008                                                         new_name);
7009           tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7010                                                         step_expr);
7011           vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7012           vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7013                                    vec_init, step_vec);
7014           vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7015                                    vec_init, base_vec);
7016         }
7017
7018       if (stmts)
7019         {
7020           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7021           gcc_assert (!new_bb);
7022         }
7023     }
7024
7025
7026   /* Create the vector that holds the step of the induction.  */
7027   if (nested_in_vect_loop)
7028     /* iv_loop is nested in the loop to be vectorized. Generate:
7029        vec_step = [S, S, S, S]  */
7030     new_name = step_expr;
7031   else
7032     {
7033       /* iv_loop is the loop to be vectorized. Generate:
7034           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
7035       gimple_seq seq = NULL;
7036       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7037         {
7038           expr = build_int_cst (integer_type_node, vf);
7039           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7040         }
7041       else
7042         expr = build_int_cst (TREE_TYPE (step_expr), vf);
7043       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7044                                expr, step_expr);
7045       if (seq)
7046         {
7047           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7048           gcc_assert (!new_bb);
7049         }
7050     }
7051
7052   t = unshare_expr (new_name);
7053   gcc_assert (CONSTANT_CLASS_P (new_name)
7054               || TREE_CODE (new_name) == SSA_NAME);
7055   new_vec = build_vector_from_val (vectype, t);
7056   vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7057
7058
7059   /* Create the following def-use cycle:
7060      loop prolog:
7061          vec_init = ...
7062          vec_step = ...
7063      loop:
7064          vec_iv = PHI <vec_init, vec_loop>
7065          ...
7066          STMT
7067          ...
7068          vec_loop = vec_iv + vec_step;  */
7069
7070   /* Create the induction-phi that defines the induction-operand.  */
7071   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7072   induction_phi = create_phi_node (vec_dest, iv_loop->header);
7073   set_vinfo_for_stmt (induction_phi,
7074                       new_stmt_vec_info (induction_phi, loop_vinfo));
7075   induc_def = PHI_RESULT (induction_phi);
7076
7077   /* Create the iv update inside the loop  */
7078   vec_def = make_ssa_name (vec_dest);
7079   new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7080   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7081   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7082
7083   /* Set the arguments of the phi node:  */
7084   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7085   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7086                UNKNOWN_LOCATION);
7087
7088   STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
7089
7090   /* In case that vectorization factor (VF) is bigger than the number
7091      of elements that we can fit in a vectype (nunits), we have to generate
7092      more than one vector stmt - i.e - we need to "unroll" the
7093      vector stmt by a factor VF/nunits.  For more details see documentation
7094      in vectorizable_operation.  */
7095
7096   if (ncopies > 1)
7097     {
7098       gimple_seq seq = NULL;
7099       stmt_vec_info prev_stmt_vinfo;
7100       /* FORNOW. This restriction should be relaxed.  */
7101       gcc_assert (!nested_in_vect_loop);
7102
7103       /* Create the vector that holds the step of the induction.  */
7104       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7105         {
7106           expr = build_int_cst (integer_type_node, nunits);
7107           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7108         }
7109       else
7110         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7111       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7112                                expr, step_expr);
7113       if (seq)
7114         {
7115           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7116           gcc_assert (!new_bb);
7117         }
7118
7119       t = unshare_expr (new_name);
7120       gcc_assert (CONSTANT_CLASS_P (new_name)
7121                   || TREE_CODE (new_name) == SSA_NAME);
7122       new_vec = build_vector_from_val (vectype, t);
7123       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7124
7125       vec_def = induc_def;
7126       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
7127       for (i = 1; i < ncopies; i++)
7128         {
7129           /* vec_i = vec_prev + vec_step  */
7130           new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7131                                           vec_def, vec_step);
7132           vec_def = make_ssa_name (vec_dest, new_stmt);
7133           gimple_assign_set_lhs (new_stmt, vec_def);
7134
7135           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7136           set_vinfo_for_stmt (new_stmt,
7137                               new_stmt_vec_info (new_stmt, loop_vinfo));
7138           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
7139           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
7140         }
7141     }
7142
7143   if (nested_in_vect_loop)
7144     {
7145       /* Find the loop-closed exit-phi of the induction, and record
7146          the final vector of induction results:  */
7147       exit_phi = NULL;
7148       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7149         {
7150           gimple *use_stmt = USE_STMT (use_p);
7151           if (is_gimple_debug (use_stmt))
7152             continue;
7153
7154           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7155             {
7156               exit_phi = use_stmt;
7157               break;
7158             }
7159         }
7160       if (exit_phi)
7161         {
7162           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
7163           /* FORNOW. Currently not supporting the case that an inner-loop induction
7164              is not used in the outer-loop (i.e. only outside the outer-loop).  */
7165           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7166                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
7167
7168           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
7169           if (dump_enabled_p ())
7170             {
7171               dump_printf_loc (MSG_NOTE, vect_location,
7172                                "vector of inductions after inner-loop:");
7173               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
7174             }
7175         }
7176     }
7177
7178
7179   if (dump_enabled_p ())
7180     {
7181       dump_printf_loc (MSG_NOTE, vect_location,
7182                        "transform induction: created def-use cycle: ");
7183       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7184       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7185                         SSA_NAME_DEF_STMT (vec_def), 0);
7186     }
7187
7188   return true;
7189 }
7190
7191 /* Function vectorizable_live_operation.
7192
7193    STMT computes a value that is used outside the loop.  Check if
7194    it can be supported.  */
7195
7196 bool
7197 vectorizable_live_operation (gimple *stmt,
7198                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7199                              slp_tree slp_node, int slp_index,
7200                              gimple **vec_stmt)
7201 {
7202   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7203   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7204   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7205   imm_use_iterator imm_iter;
7206   tree lhs, lhs_type, bitsize, vec_bitsize;
7207   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7208   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7209   int ncopies;
7210   gimple *use_stmt;
7211   auto_vec<tree> vec_oprnds;
7212   int vec_entry = 0;
7213   poly_uint64 vec_index = 0;
7214
7215   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7216
7217   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7218     return false;
7219
7220   /* FORNOW.  CHECKME.  */
7221   if (nested_in_vect_loop_p (loop, stmt))
7222     return false;
7223
7224   /* If STMT is not relevant and it is a simple assignment and its inputs are
7225      invariant then it can remain in place, unvectorized.  The original last
7226      scalar value that it computes will be used.  */
7227   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7228     {
7229       gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7230       if (dump_enabled_p ())
7231         dump_printf_loc (MSG_NOTE, vect_location,
7232                          "statement is simple and uses invariant.  Leaving in "
7233                          "place.\n");
7234       return true;
7235     }
7236
7237   if (slp_node)
7238     ncopies = 1;
7239   else
7240     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7241
7242   if (slp_node)
7243     {
7244       gcc_assert (slp_index >= 0);
7245
7246       int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7247       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7248
7249       /* Get the last occurrence of the scalar index from the concatenation of
7250          all the slp vectors. Calculate which slp vector it is and the index
7251          within.  */
7252       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7253
7254       /* Calculate which vector contains the result, and which lane of
7255          that vector we need.  */
7256       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7257         {
7258           if (dump_enabled_p ())
7259             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7260                              "Cannot determine which vector holds the"
7261                              " final result.\n");
7262           return false;
7263         }
7264     }
7265
7266   if (!vec_stmt)
7267     /* No transformation required.  */
7268     return true;
7269
7270   /* If stmt has a related stmt, then use that for getting the lhs.  */
7271   if (is_pattern_stmt_p (stmt_info))
7272     stmt = STMT_VINFO_RELATED_STMT (stmt_info);
7273
7274   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7275         : gimple_get_lhs (stmt);
7276   lhs_type = TREE_TYPE (lhs);
7277
7278   bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7279              ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7280              : TYPE_SIZE (TREE_TYPE (vectype)));
7281   vec_bitsize = TYPE_SIZE (vectype);
7282
7283   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
7284   tree vec_lhs, bitstart;
7285   if (slp_node)
7286     {
7287       /* Get the correct slp vectorized stmt.  */
7288       vec_lhs = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[vec_entry]);
7289
7290       /* Get entry to use.  */
7291       bitstart = bitsize_int (vec_index);
7292       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7293     }
7294   else
7295     {
7296       enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7297       vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
7298
7299       /* For multiple copies, get the last copy.  */
7300       for (int i = 1; i < ncopies; ++i)
7301         vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
7302                                                   vec_lhs);
7303
7304       /* Get the last lane in the vector.  */
7305       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7306     }
7307
7308   /* Create a new vectorized stmt for the uses of STMT and insert outside the
7309      loop.  */
7310   gimple_seq stmts = NULL;
7311   tree bftype = TREE_TYPE (vectype);
7312   if (VECTOR_BOOLEAN_TYPE_P (vectype))
7313     bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7314   tree new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7315   new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), &stmts,
7316                                    true, NULL_TREE);
7317   if (stmts)
7318     gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7319
7320   /* Replace use of lhs with newly computed result.  If the use stmt is a
7321      single arg PHI, just replace all uses of PHI result.  It's necessary
7322      because lcssa PHI defining lhs may be before newly inserted stmt.  */
7323   use_operand_p use_p;
7324   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7325     if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7326         && !is_gimple_debug (use_stmt))
7327     {
7328       if (gimple_code (use_stmt) == GIMPLE_PHI
7329           && gimple_phi_num_args (use_stmt) == 1)
7330         {
7331           replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7332         }
7333       else
7334         {
7335           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7336             SET_USE (use_p, new_tree);
7337         }
7338       update_stmt (use_stmt);
7339     }
7340
7341   return true;
7342 }
7343
7344 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
7345
7346 static void
7347 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
7348 {
7349   ssa_op_iter op_iter;
7350   imm_use_iterator imm_iter;
7351   def_operand_p def_p;
7352   gimple *ustmt;
7353
7354   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
7355     {
7356       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
7357         {
7358           basic_block bb;
7359
7360           if (!is_gimple_debug (ustmt))
7361             continue;
7362
7363           bb = gimple_bb (ustmt);
7364
7365           if (!flow_bb_inside_loop_p (loop, bb))
7366             {
7367               if (gimple_debug_bind_p (ustmt))
7368                 {
7369                   if (dump_enabled_p ())
7370                     dump_printf_loc (MSG_NOTE, vect_location,
7371                                      "killing debug use\n");
7372
7373                   gimple_debug_bind_reset_value (ustmt);
7374                   update_stmt (ustmt);
7375                 }
7376               else
7377                 gcc_unreachable ();
7378             }
7379         }
7380     }
7381 }
7382
7383 /* Given loop represented by LOOP_VINFO, return true if computation of
7384    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
7385    otherwise.  */
7386
7387 static bool
7388 loop_niters_no_overflow (loop_vec_info loop_vinfo)
7389 {
7390   /* Constant case.  */
7391   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7392     {
7393       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
7394       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
7395
7396       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
7397       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
7398       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
7399         return true;
7400     }
7401
7402   widest_int max;
7403   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7404   /* Check the upper bound of loop niters.  */
7405   if (get_max_loop_iterations (loop, &max))
7406     {
7407       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
7408       signop sgn = TYPE_SIGN (type);
7409       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
7410       if (max < type_max)
7411         return true;
7412     }
7413   return false;
7414 }
7415
7416 /* Scale profiling counters by estimation for LOOP which is vectorized
7417    by factor VF.  */
7418
7419 static void
7420 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
7421 {
7422   edge preheader = loop_preheader_edge (loop);
7423   /* Reduce loop iterations by the vectorization factor.  */
7424   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
7425   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
7426
7427   if (freq_h.nonzero_p ())
7428     {
7429       profile_probability p;
7430
7431       /* Avoid dropping loop body profile counter to 0 because of zero count
7432          in loop's preheader.  */
7433       if (!(freq_e == profile_count::zero ()))
7434         freq_e = freq_e.force_nonzero ();
7435       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
7436       scale_loop_frequencies (loop, p);
7437     }
7438
7439   edge exit_e = single_exit (loop);
7440   exit_e->probability = profile_probability::always ()
7441                                  .apply_scale (1, new_est_niter + 1);
7442
7443   edge exit_l = single_pred_edge (loop->latch);
7444   profile_probability prob = exit_l->probability;
7445   exit_l->probability = exit_e->probability.invert ();
7446   if (prob.initialized_p () && exit_l->probability.initialized_p ())
7447     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
7448 }
7449
7450 /* Function vect_transform_loop.
7451
7452    The analysis phase has determined that the loop is vectorizable.
7453    Vectorize the loop - create vectorized stmts to replace the scalar
7454    stmts in the loop, and update the loop exit condition.
7455    Returns scalar epilogue loop if any.  */
7456
7457 struct loop *
7458 vect_transform_loop (loop_vec_info loop_vinfo)
7459 {
7460   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7461   struct loop *epilogue = NULL;
7462   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
7463   int nbbs = loop->num_nodes;
7464   int i;
7465   tree niters_vector = NULL_TREE;
7466   tree step_vector = NULL_TREE;
7467   tree niters_vector_mult_vf = NULL_TREE;
7468   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7469   unsigned int lowest_vf = constant_lower_bound (vf);
7470   bool grouped_store;
7471   bool slp_scheduled = false;
7472   gimple *stmt, *pattern_stmt;
7473   gimple_seq pattern_def_seq = NULL;
7474   gimple_stmt_iterator pattern_def_si = gsi_none ();
7475   bool transform_pattern_stmt = false;
7476   bool check_profitability = false;
7477   unsigned int th;
7478
7479   if (dump_enabled_p ())
7480     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
7481
7482   /* Use the more conservative vectorization threshold.  If the number
7483      of iterations is constant assume the cost check has been performed
7484      by our caller.  If the threshold makes all loops profitable that
7485      run at least the (estimated) vectorization factor number of times
7486      checking is pointless, too.  */
7487   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
7488   if (th >= vect_vf_for_cost (loop_vinfo)
7489       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7490     {
7491       if (dump_enabled_p ())
7492         dump_printf_loc (MSG_NOTE, vect_location,
7493                          "Profitability threshold is %d loop iterations.\n",
7494                          th);
7495       check_profitability = true;
7496     }
7497
7498   /* Make sure there exists a single-predecessor exit bb.  Do this before
7499      versioning.   */
7500   edge e = single_exit (loop);
7501   if (! single_pred_p (e->dest))
7502     {
7503       split_loop_exit_edge (e);
7504       if (dump_enabled_p ())
7505         dump_printf (MSG_NOTE, "split exit edge\n");
7506     }
7507
7508   /* Version the loop first, if required, so the profitability check
7509      comes first.  */
7510
7511   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
7512     {
7513       poly_uint64 versioning_threshold
7514         = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
7515       if (check_profitability
7516           && ordered_p (poly_uint64 (th), versioning_threshold))
7517         {
7518           versioning_threshold = ordered_max (poly_uint64 (th),
7519                                               versioning_threshold);
7520           check_profitability = false;
7521         }
7522       vect_loop_versioning (loop_vinfo, th, check_profitability,
7523                             versioning_threshold);
7524       check_profitability = false;
7525     }
7526
7527   /* Make sure there exists a single-predecessor exit bb also on the
7528      scalar loop copy.  Do this after versioning but before peeling
7529      so CFG structure is fine for both scalar and if-converted loop
7530      to make slpeel_duplicate_current_defs_from_edges face matched
7531      loop closed PHI nodes on the exit.  */
7532   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
7533     {
7534       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
7535       if (! single_pred_p (e->dest))
7536         {
7537           split_loop_exit_edge (e);
7538           if (dump_enabled_p ())
7539             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
7540         }
7541     }
7542
7543   tree niters = vect_build_loop_niters (loop_vinfo);
7544   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
7545   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
7546   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
7547   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
7548                               &step_vector, &niters_vector_mult_vf, th,
7549                               check_profitability, niters_no_overflow);
7550   if (niters_vector == NULL_TREE)
7551     {
7552       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && known_eq (lowest_vf, vf))
7553         {
7554           niters_vector
7555             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
7556                              LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
7557           step_vector = build_one_cst (TREE_TYPE (niters));
7558         }
7559       else
7560         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
7561                                      &step_vector, niters_no_overflow);
7562     }
7563
7564   /* 1) Make sure the loop header has exactly two entries
7565      2) Make sure we have a preheader basic block.  */
7566
7567   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
7568
7569   split_edge (loop_preheader_edge (loop));
7570
  /* FORNOW: the vectorizer supports only loops whose body consists
     of one basic block (header + empty latch).  When the vectorizer
     supports more involved loop forms, the order in which the BBs are
     traversed needs to be reconsidered.  */
7575
7576   for (i = 0; i < nbbs; i++)
7577     {
7578       basic_block bb = bbs[i];
7579       stmt_vec_info stmt_info;
7580
7581       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
7582            gsi_next (&si))
7583         {
7584           gphi *phi = si.phi ();
7585           if (dump_enabled_p ())
7586             {
7587               dump_printf_loc (MSG_NOTE, vect_location,
7588                                "------>vectorizing phi: ");
7589               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
7590             }
7591           stmt_info = vinfo_for_stmt (phi);
7592           if (!stmt_info)
7593             continue;
7594
7595           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7596             vect_loop_kill_debug_uses (loop, phi);
7597
7598           if (!STMT_VINFO_RELEVANT_P (stmt_info)
7599               && !STMT_VINFO_LIVE_P (stmt_info))
7600             continue;
7601
7602           if (STMT_VINFO_VECTYPE (stmt_info)
7603               && (maybe_ne
7604                   (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
7605               && dump_enabled_p ())
7606             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7607
7608           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
7609                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7610                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7611               && ! PURE_SLP_STMT (stmt_info))
7612             {
7613               if (dump_enabled_p ())
7614                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
7615               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
7616             }
7617         }
7618
7619       pattern_stmt = NULL;
7620       for (gimple_stmt_iterator si = gsi_start_bb (bb);
7621            !gsi_end_p (si) || transform_pattern_stmt;)
7622         {
7623           bool is_store;
7624
7625           if (transform_pattern_stmt)
7626             stmt = pattern_stmt;
7627           else
7628             {
7629               stmt = gsi_stmt (si);
7630               /* During vectorization remove existing clobber stmts.  */
7631               if (gimple_clobber_p (stmt))
7632                 {
7633                   unlink_stmt_vdef (stmt);
7634                   gsi_remove (&si, true);
7635                   release_defs (stmt);
7636                   continue;
7637                 }
7638             }
7639
7640           if (dump_enabled_p ())
7641             {
7642               dump_printf_loc (MSG_NOTE, vect_location,
7643                                "------>vectorizing statement: ");
7644               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
7645             }
7646
7647           stmt_info = vinfo_for_stmt (stmt);
7648
7649           /* vector stmts created in the outer-loop during vectorization of
7650              stmts in an inner-loop may not have a stmt_info, and do not
7651              need to be vectorized.  */
7652           if (!stmt_info)
7653             {
7654               gsi_next (&si);
7655               continue;
7656             }
7657
7658           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7659             vect_loop_kill_debug_uses (loop, stmt);
7660
7661           if (!STMT_VINFO_RELEVANT_P (stmt_info)
7662               && !STMT_VINFO_LIVE_P (stmt_info))
7663             {
7664               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
7665                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
7666                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
7667                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
7668                 {
7669                   stmt = pattern_stmt;
7670                   stmt_info = vinfo_for_stmt (stmt);
7671                 }
7672               else
7673                 {
7674                   gsi_next (&si);
7675                   continue;
7676                 }
7677             }
7678           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
7679                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
7680                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
7681                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
7682             transform_pattern_stmt = true;
7683
7684           /* If pattern statement has def stmts, vectorize them too.  */
7685           if (is_pattern_stmt_p (stmt_info))
7686             {
7687               if (pattern_def_seq == NULL)
7688                 {
7689                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
7690                   pattern_def_si = gsi_start (pattern_def_seq);
7691                 }
7692               else if (!gsi_end_p (pattern_def_si))
7693                 gsi_next (&pattern_def_si);
7694               if (pattern_def_seq != NULL)
7695                 {
7696                   gimple *pattern_def_stmt = NULL;
7697                   stmt_vec_info pattern_def_stmt_info = NULL;
7698
7699                   while (!gsi_end_p (pattern_def_si))
7700                     {
7701                       pattern_def_stmt = gsi_stmt (pattern_def_si);
7702                       pattern_def_stmt_info
7703                         = vinfo_for_stmt (pattern_def_stmt);
7704                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
7705                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
7706                         break;
7707                       gsi_next (&pattern_def_si);
7708                     }
7709
7710                   if (!gsi_end_p (pattern_def_si))
7711                     {
7712                       if (dump_enabled_p ())
7713                         {
7714                           dump_printf_loc (MSG_NOTE, vect_location,
7715                                            "==> vectorizing pattern def "
7716                                            "stmt: ");
7717                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7718                                             pattern_def_stmt, 0);
7719                         }
7720
7721                       stmt = pattern_def_stmt;
7722                       stmt_info = pattern_def_stmt_info;
7723                     }
7724                   else
7725                     {
7726                       pattern_def_si = gsi_none ();
7727                       transform_pattern_stmt = false;
7728                     }
7729                 }
7730               else
7731                 transform_pattern_stmt = false;
7732             }
7733
7734           if (STMT_VINFO_VECTYPE (stmt_info))
7735             {
7736               unsigned int nunits
7737                 = (unsigned int)
7738                   TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
7739               if (!STMT_SLP_TYPE (stmt_info)
7740                   && maybe_ne (nunits, vf)
7741                   && dump_enabled_p ())
7742                   /* For SLP VF is set according to unrolling factor, and not
7743                      to vector size, hence for SLP this print is not valid.  */
7744                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7745             }
7746
7747           /* SLP. Schedule all the SLP instances when the first SLP stmt is
7748              reached.  */
7749           if (STMT_SLP_TYPE (stmt_info))
7750             {
7751               if (!slp_scheduled)
7752                 {
7753                   slp_scheduled = true;
7754
7755                   if (dump_enabled_p ())
7756                     dump_printf_loc (MSG_NOTE, vect_location,
7757                                      "=== scheduling SLP instances ===\n");
7758
7759                   vect_schedule_slp (loop_vinfo);
7760                 }
7761
7762               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
7763               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
7764                 {
7765                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7766                     {
7767                       pattern_def_seq = NULL;
7768                       gsi_next (&si);
7769                     }
7770                   continue;
7771                 }
7772             }
7773
7774           /* -------- vectorize statement ------------ */
7775           if (dump_enabled_p ())
7776             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
7777
7778           grouped_store = false;
7779           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
7780           if (is_store)
7781             {
7782               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
7783                 {
7784                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
7785                      interleaving chain was completed - free all the stores in
7786                      the chain.  */
7787                   gsi_next (&si);
7788                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
7789                 }
7790               else
7791                 {
7792                   /* Free the attached stmt_vec_info and remove the stmt.  */
7793                   gimple *store = gsi_stmt (si);
7794                   free_stmt_vec_info (store);
7795                   unlink_stmt_vdef (store);
7796                   gsi_remove (&si, true);
7797                   release_defs (store);
7798                 }
7799
7800               /* Stores can only appear at the end of pattern statements.  */
7801               gcc_assert (!transform_pattern_stmt);
7802               pattern_def_seq = NULL;
7803             }
7804           else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7805             {
7806               pattern_def_seq = NULL;
7807               gsi_next (&si);
7808             }
7809         }                       /* stmts in BB */
7810     }                           /* BBs in loop */
7811
  /* The vectorization factor is always > 1, so if we use an IV increment
     of 1, a zero NITERS becomes a nonzero NITERS_VECTOR.  */
7814   if (integer_onep (step_vector))
7815     niters_no_overflow = true;
7816   slpeel_make_loop_iterate_ntimes (loop, niters_vector, step_vector,
7817                                    niters_vector_mult_vf,
7818                                    !niters_no_overflow);
7819
7820   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
7821   scale_profile_for_vect_loop (loop, assumed_vf);
7822
7823   /* The minimum number of iterations performed by the epilogue.  This
7824      is 1 when peeling for gaps because we always need a final scalar
7825      iteration.  */
7826   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
7827   /* +1 to convert latch counts to loop iteration counts,
7828      -min_epilogue_iters to remove iterations that cannot be performed
7829        by the vector code.  */
7830   int bias = 1 - min_epilogue_iters;
7831   /* In these calculations the "- 1" converts loop iteration counts
7832      back to latch counts.  */
7833   if (loop->any_upper_bound)
7834     loop->nb_iterations_upper_bound
7835       = wi::udiv_floor (loop->nb_iterations_upper_bound + bias,
7836                         lowest_vf) - 1;
7837   if (loop->any_likely_upper_bound)
7838     loop->nb_iterations_likely_upper_bound
7839       = wi::udiv_floor (loop->nb_iterations_likely_upper_bound + bias,
7840                         lowest_vf) - 1;
7841   if (loop->any_estimate)
7842     loop->nb_iterations_estimate
7843       = wi::udiv_floor (loop->nb_iterations_estimate + bias,
7844                         assumed_vf) - 1;
7845
7846   if (dump_enabled_p ())
7847     {
7848       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
7849         {
7850           dump_printf_loc (MSG_NOTE, vect_location,
7851                            "LOOP VECTORIZED\n");
7852           if (loop->inner)
7853             dump_printf_loc (MSG_NOTE, vect_location,
7854                              "OUTER LOOP VECTORIZED\n");
7855           dump_printf (MSG_NOTE, "\n");
7856         }
7857       else
7858         {
7859           dump_printf_loc (MSG_NOTE, vect_location,
7860                            "LOOP EPILOGUE VECTORIZED (VS=");
7861           dump_dec (MSG_NOTE, current_vector_size);
7862           dump_printf (MSG_NOTE, ")\n");
7863         }
7864     }
7865
7866   /* Free SLP instances here because otherwise stmt reference counting
7867      won't work.  */
7868   slp_instance instance;
7869   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
7870     vect_free_slp_instance (instance);
7871   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
7872   /* Clear-up safelen field since its value is invalid after vectorization
7873      since vectorized loop can have loop-carried dependencies.  */
7874   loop->safelen = 0;
7875
7876   /* Don't vectorize epilogue for epilogue.  */
7877   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
7878     epilogue = NULL;
7879
7880   if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
7881     epilogue = NULL;
7882
7883   if (epilogue)
7884     {
7885       auto_vector_sizes vector_sizes;
7886       targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
7887       unsigned int next_size = 0;
7888
7889       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7890           && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
7891           && known_eq (vf, lowest_vf))
7892         {
7893           unsigned int eiters
7894             = (LOOP_VINFO_INT_NITERS (loop_vinfo)
7895                - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
7896           eiters = eiters % lowest_vf;
7897           epilogue->nb_iterations_upper_bound = eiters - 1;
7898
7899           unsigned int ratio;
7900           while (next_size < vector_sizes.length ()
7901                  && !(constant_multiple_p (current_vector_size,
7902                                            vector_sizes[next_size], &ratio)
7903                       && eiters >= lowest_vf / ratio))
7904             next_size += 1;
7905         }
7906       else
7907         while (next_size < vector_sizes.length ()
7908                && maybe_lt (current_vector_size, vector_sizes[next_size]))
7909           next_size += 1;
7910
7911       if (next_size == vector_sizes.length ())
7912         epilogue = NULL;
7913     }
7914
7915   if (epilogue)
7916     {
7917       epilogue->force_vectorize = loop->force_vectorize;
7918       epilogue->safelen = loop->safelen;
7919       epilogue->dont_vectorize = false;
7920
7921       /* We may need to if-convert epilogue to vectorize it.  */
7922       if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
7923         tree_if_conversion (epilogue);
7924     }
7925
7926   return epilogue;
7927 }
7928
/* The code below performs a simple optimization: it reverts
   if-conversion for masked stores, i.e. if the mask of a store is all
   zeros, the store and, where possible, the statements producing the
   stored value are not executed.
7932    For example,
7933      for (i=0; i<n; i++)
7934        if (c[i])
7935         {
7936           p1[i] += 1;
7937           p2[i] = p3[i] +2;
7938         }
7939    this transformation will produce the following semi-hammock:
7940
7941    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
7942      {
7943        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
7944        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
7945        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
7946        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
7947        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
7948        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
7949      }
7950 */
7951
7952 void
7953 optimize_mask_stores (struct loop *loop)
7954 {
7955   basic_block *bbs = get_loop_body (loop);
7956   unsigned nbbs = loop->num_nodes;
7957   unsigned i;
7958   basic_block bb;
7959   struct loop *bb_loop;
7960   gimple_stmt_iterator gsi;
7961   gimple *stmt;
7962   auto_vec<gimple *> worklist;
7963
7964   vect_location = find_loop_location (loop);
7965   /* Pick up all masked stores in loop if any.  */
7966   for (i = 0; i < nbbs; i++)
7967     {
7968       bb = bbs[i];
7969       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
7970            gsi_next (&gsi))
7971         {
7972           stmt = gsi_stmt (gsi);
7973           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
7974             worklist.safe_push (stmt);
7975         }
7976     }
7977
7978   free (bbs);
7979   if (worklist.is_empty ())
7980     return;
7981
7982   /* Loop has masked stores.  */
7983   while (!worklist.is_empty ())
7984     {
7985       gimple *last, *last_store;
7986       edge e, efalse;
7987       tree mask;
7988       basic_block store_bb, join_bb;
7989       gimple_stmt_iterator gsi_to;
7990       tree vdef, new_vdef;
7991       gphi *phi;
7992       tree vectype;
7993       tree zero;
7994
7995       last = worklist.pop ();
7996       mask = gimple_call_arg (last, 2);
7997       bb = gimple_bb (last);
7998       /* Create then_bb and if-then structure in CFG, then_bb belongs to
7999          the same loop as if_bb.  It could be different to LOOP when two
8000          level loop-nest is vectorized and mask_store belongs to the inner
8001          one.  */
8002       e = split_block (bb, last);
8003       bb_loop = bb->loop_father;
8004       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8005       join_bb = e->dest;
8006       store_bb = create_empty_bb (bb);
8007       add_bb_to_loop (store_bb, bb_loop);
8008       e->flags = EDGE_TRUE_VALUE;
8009       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8010       /* Put STORE_BB to likely part.  */
8011       efalse->probability = profile_probability::unlikely ();
8012       store_bb->count = efalse->count ();
8013       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8014       if (dom_info_available_p (CDI_DOMINATORS))
8015         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8016       if (dump_enabled_p ())
8017         dump_printf_loc (MSG_NOTE, vect_location,
8018                          "Create new block %d to sink mask stores.",
8019                          store_bb->index);
8020       /* Create vector comparison with boolean result.  */
8021       vectype = TREE_TYPE (mask);
8022       zero = build_zero_cst (vectype);
8023       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8024       gsi = gsi_last_bb (bb);
8025       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8026       /* Create new PHI node for vdef of the last masked store:
8027          .MEM_2 = VDEF <.MEM_1>
8028          will be converted to
8029          .MEM.3 = VDEF <.MEM_1>
8030          and new PHI node will be created in join bb
8031          .MEM_2 = PHI <.MEM_1, .MEM_3>
8032       */
8033       vdef = gimple_vdef (last);
8034       new_vdef = make_ssa_name (gimple_vop (cfun), last);
8035       gimple_set_vdef (last, new_vdef);
8036       phi = create_phi_node (vdef, join_bb);
8037       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8038
8039       /* Put all masked stores with the same mask to STORE_BB if possible.  */
8040       while (true)
8041         {
8042           gimple_stmt_iterator gsi_from;
8043           gimple *stmt1 = NULL;
8044
8045           /* Move masked store to STORE_BB.  */
8046           last_store = last;
8047           gsi = gsi_for_stmt (last);
8048           gsi_from = gsi;
8049           /* Shift GSI to the previous stmt for further traversal.  */
8050           gsi_prev (&gsi);
8051           gsi_to = gsi_start_bb (store_bb);
8052           gsi_move_before (&gsi_from, &gsi_to);
8053           /* Setup GSI_TO to the non-empty block start.  */
8054           gsi_to = gsi_start_bb (store_bb);
8055           if (dump_enabled_p ())
8056             {
8057               dump_printf_loc (MSG_NOTE, vect_location,
8058                                "Move stmt to created bb\n");
8059               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
8060             }
8061           /* Move all stored value producers if possible.  */
8062           while (!gsi_end_p (gsi))
8063             {
8064               tree lhs;
8065               imm_use_iterator imm_iter;
8066               use_operand_p use_p;
8067               bool res;
8068
8069               /* Skip debug statements.  */
8070               if (is_gimple_debug (gsi_stmt (gsi)))
8071                 {
8072                   gsi_prev (&gsi);
8073                   continue;
8074                 }
8075               stmt1 = gsi_stmt (gsi);
8076               /* Do not consider statements writing to memory or having
8077                  volatile operand.  */
8078               if (gimple_vdef (stmt1)
8079                   || gimple_has_volatile_ops (stmt1))
8080                 break;
8081               gsi_from = gsi;
8082               gsi_prev (&gsi);
8083               lhs = gimple_get_lhs (stmt1);
8084               if (!lhs)
8085                 break;
8086
8087               /* LHS of vectorized stmt must be SSA_NAME.  */
8088               if (TREE_CODE (lhs) != SSA_NAME)
8089                 break;
8090
8091               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8092                 {
8093                   /* Remove dead scalar statement.  */
8094                   if (has_zero_uses (lhs))
8095                     {
8096                       gsi_remove (&gsi_from, true);
8097                       continue;
8098                     }
8099                 }
8100
8101               /* Check that LHS does not have uses outside of STORE_BB.  */
8102               res = true;
8103               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8104                 {
8105                   gimple *use_stmt;
8106                   use_stmt = USE_STMT (use_p);
8107                   if (is_gimple_debug (use_stmt))
8108                     continue;
8109                   if (gimple_bb (use_stmt) != store_bb)
8110                     {
8111                       res = false;
8112                       break;
8113                     }
8114                 }
8115               if (!res)
8116                 break;
8117
8118               if (gimple_vuse (stmt1)
8119                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
8120                 break;
8121
8122               /* Can move STMT1 to STORE_BB.  */
8123               if (dump_enabled_p ())
8124                 {
8125                   dump_printf_loc (MSG_NOTE, vect_location,
8126                                    "Move stmt to created bb\n");
8127                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
8128                 }
8129               gsi_move_before (&gsi_from, &gsi_to);
8130               /* Shift GSI_TO for further insertion.  */
8131               gsi_prev (&gsi_to);
8132             }
8133           /* Put other masked stores with the same mask to STORE_BB.  */
8134           if (worklist.is_empty ()
8135               || gimple_call_arg (worklist.last (), 2) != mask
8136               || worklist.last () != stmt1)
8137             break;
8138           last = worklist.pop ();
8139         }
8140       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
8141     }
8142 }