gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2013 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "dumpfile.h"
  26 #include "tm.h"
  27 #include "ggc.h"
  28 #include "tree.h"
  29 #include "basic-block.h"
  30 #include "gimple-pretty-print.h"
  31 #include "tree-flow.h"
  32 #include "tree-pass.h"
  33 #include "cfgloop.h"
  34 #include "expr.h"
  35 #include "recog.h"
  36 #include "optabs.h"
  37 #include "params.h"
  38 #include "diagnostic-core.h"
  39 #include "tree-chrec.h"
  40 #include "tree-scalar-evolution.h"
  41 #include "tree-vectorizer.h"
  42 #include "target.h"
  43
  44 /* Loop Vectorization Pass.
  45
  46    This pass tries to vectorize loops.
  47
  48    For example, the vectorizer transforms the following simple loop:
  49
  50         short a[N]; short b[N]; short c[N]; int i;
  51
  52         for (i=0; i<N; i++){
  53           a[i] = b[i] + c[i];
  54         }
  55
  56    as if it was manually vectorized by rewriting the source code into:
  57
  58         typedef int __attribute__((mode(V8HI))) v8hi;
  59         short a[N];  short b[N]; short c[N];   int i;
  60         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  61         v8hi va, vb, vc;
  62
  63         for (i=0; i<N/8; i++){
  64           vb = pb[i];
  65           vc = pc[i];
  66           va = vb + vc;
  67           pa[i] = va;
  68         }
  69
  70         The main entry to this pass is vectorize_loops(), in which
  71    the vectorizer applies a set of analyses on a given set of loops,
  72    followed by the actual vectorization transformation for the loops that
  73    had successfully passed the analysis phase.
  74         Throughout this pass we make a distinction between two types of
  75    data: scalars (which are represented by SSA_NAMES), and memory references
  76    ("data-refs").  These two types of data require different handling both
  77    during analysis and transformation. The types of data-refs that the
  78    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  79    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  80    accesses are required to have a simple (consecutive) access pattern.
  81
  82    Analysis phase:
  83    ===============
  84         The driver for the analysis phase is vect_analyze_loop().
  85    It applies a set of analyses, some of which rely on the scalar evolution
  86    analyzer (scev) developed by Sebastian Pop.
  87
  88         During the analysis phase the vectorizer records some information
  89    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
  90    loop, as well as general information about the loop as a whole, which is
  91    recorded in a "loop_vec_info" struct attached to each loop.
  92
  93    Transformation phase:
  94    =====================
  95         The loop transformation phase scans all the stmts in the loop, and
  96    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
  97    the loop that needs to be vectorized.  It inserts the vector code sequence
  98    just before the scalar stmt S, and records a pointer to the vector code
  99    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 100    attached to S).  This pointer will be used for the vectorization of following
 101    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 102    otherwise, we rely on dead code elimination for removing it.
 103
 104         For example, say stmt S1 was vectorized into stmt VS1:
 105
 106    VS1: vb = px[i];
 107    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 108    S2:  a = b;
 109
 110    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 111    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 112    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 113    resulting sequence would be:
 114
 115    VS1: vb = px[i];
 116    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 117    VS2: va = vb;
 118    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 119
 120         Operands that are not SSA_NAMEs, are data-refs that appear in
 121    load/store operations (like 'x[i]' in S1), and are handled differently.
 122
 123    Target modeling:
 124    =================
 125         Currently the only target specific information that is used is the
 126    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 127    Targets that can support different sizes of vectors, for now will need
 128    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 129    flexibility will be added in the future.
 130
 131         Since we only vectorize operations which vector form can be
 132    expressed using existing tree codes, to verify that an operation is
 133    supported, the vectorizer checks the relevant optab at the relevant
 134    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 135    the value found is CODE_FOR_nothing, then there's no target support, and
 136    we can't vectorize the stmt.
 137
 138    For additional information on this project see:
 139    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 140 */
 141
 142 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 143
 144 /* Function vect_determine_vectorization_factor
 145
 146    Determine the vectorization factor (VF).  VF is the number of data elements
 147    that are operated upon in parallel in a single iteration of the vectorized
 148    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 149    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 150    elements can fit in a single vector register.
 151
 152    We currently support vectorization of loops in which all types operated upon
 153    are of the same size.  Therefore this function currently sets VF according to
 154    the size of the types operated upon, and fails if there are multiple sizes
 155    in the loop.
 156
 157    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 158    original loop:
 159         for (i=0; i<N; i++){
 160           a[i] = b[i] + c[i];
 161         }
 162
 163    vectorized loop:
 164         for (i=0; i<N; i+=VF){
 165           a[i:VF] = b[i:VF] + c[i:VF];
 166         }
 167 */
 168
 169 static bool
 170 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 171 {
 172   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 173   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 174   int nbbs = loop->num_nodes;
 175   gimple_stmt_iterator si;
 176   unsigned int vectorization_factor = 0;
 177   tree scalar_type;
 178   gimple phi;
 179   tree vectype;
 180   unsigned int nunits;
 181   stmt_vec_info stmt_info;
 182   int i;
 183   HOST_WIDE_INT dummy;
 184   gimple stmt, pattern_stmt = NULL;
 185   gimple_seq pattern_def_seq = NULL;
 186   gimple_stmt_iterator pattern_def_si = gsi_none ();
 187   bool analyze_pattern_stmt = false;
 188
 189   if (dump_enabled_p ())
 190     dump_printf_loc (MSG_NOTE, vect_location,
 191                      "=== vect_determine_vectorization_factor ===");
 192
 193   for (i = 0; i < nbbs; i++)
 194     {
 195       basic_block bb = bbs[i];
 196
 197       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 198         {
 199           phi = gsi_stmt (si);
 200           stmt_info = vinfo_for_stmt (phi);
 201           if (dump_enabled_p ())
 202             {
 203               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 204               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 205             }
 206
 207           gcc_assert (stmt_info);
 208
 209           if (STMT_VINFO_RELEVANT_P (stmt_info))
 210             {
 211               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 212               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 213
 214               if (dump_enabled_p ())
 215                 {
 216                   dump_printf_loc (MSG_NOTE, vect_location,
 217                                    "get vectype for scalar type:  ");
 218                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 219                 }
 220
 221               vectype = get_vectype_for_scalar_type (scalar_type);
 222               if (!vectype)
 223                 {
 224                   if (dump_enabled_p ())
 225                     {
 226                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 227                                        "not vectorized: unsupported "
 228                                        "data-type ");
 229                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 230                                          scalar_type);
 231                     }
 232                   return false;
 233                 }
 234               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 235
 236               if (dump_enabled_p ())
 237                 {
 238                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 239                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 240                 }
 241
 242               nunits = TYPE_VECTOR_SUBPARTS (vectype);
 243               if (dump_enabled_p ())
 244                 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d", nunits);
 245
 246               if (!vectorization_factor
 247                   || (nunits > vectorization_factor))
 248                 vectorization_factor = nunits;
 249             }
 250         }
 251
 252       for (si = gsi_start_bb (bb); !gsi_end_p (si) || analyze_pattern_stmt;)
 253         {
 254           tree vf_vectype;
 255
 256           if (analyze_pattern_stmt)
 257             stmt = pattern_stmt;
 258           else
 259             stmt = gsi_stmt (si);
 260
 261           stmt_info = vinfo_for_stmt (stmt);
 262
 263           if (dump_enabled_p ())
 264             {
 265               dump_printf_loc (MSG_NOTE, vect_location,
 266                                "==> examining statement: ");
 267               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 268             }
 269
 270           gcc_assert (stmt_info);
 271
 272           /* Skip stmts which do not need to be vectorized.  */
 273           if (!STMT_VINFO_RELEVANT_P (stmt_info)
 274               && !STMT_VINFO_LIVE_P (stmt_info))
 275             {
 276               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 277                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 278                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 279                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 280                 {
 281                   stmt = pattern_stmt;
 282                   stmt_info = vinfo_for_stmt (pattern_stmt);
 283                   if (dump_enabled_p ())
 284                     {
 285                       dump_printf_loc (MSG_NOTE, vect_location,
 286                                        "==> examining pattern statement: ");
 287                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 288                     }
 289                 }
 290               else
 291                 {
 292                   if (dump_enabled_p ())
 293                     dump_printf_loc (MSG_NOTE, vect_location, "skip.");
 294                   gsi_next (&si);
 295                   continue;
 296                 }
 297             }
 298           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 299                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 300                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 301                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 302             analyze_pattern_stmt = true;
 303
 304           /* If a pattern statement has def stmts, analyze them too.  */
 305           if (is_pattern_stmt_p (stmt_info))
 306             {
 307               if (pattern_def_seq == NULL)
 308                 {
 309                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 310                   pattern_def_si = gsi_start (pattern_def_seq);
 311                 }
 312               else if (!gsi_end_p (pattern_def_si))
 313                 gsi_next (&pattern_def_si);
 314               if (pattern_def_seq != NULL)
 315                 {
 316                   gimple pattern_def_stmt = NULL;
 317                   stmt_vec_info pattern_def_stmt_info = NULL;
 318
 319                   while (!gsi_end_p (pattern_def_si))
 320                     {
 321                       pattern_def_stmt = gsi_stmt (pattern_def_si);
 322                       pattern_def_stmt_info
 323                         = vinfo_for_stmt (pattern_def_stmt);
 324                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
 325                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
 326                         break;
 327                       gsi_next (&pattern_def_si);
 328                     }
 329
 330                   if (!gsi_end_p (pattern_def_si))
 331                     {
 332                       if (dump_enabled_p ())
 333                         {
 334                           dump_printf_loc (MSG_NOTE, vect_location,
 335                                            "==> examining pattern def stmt: ");
 336                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 337                                             pattern_def_stmt, 0);
 338                         }
 339
 340                       stmt = pattern_def_stmt;
 341                       stmt_info = pattern_def_stmt_info;
 342                     }
 343                   else
 344                     {
 345                       pattern_def_si = gsi_none ();
 346                       analyze_pattern_stmt = false;
 347                     }
 348                 }
 349               else
 350                 analyze_pattern_stmt = false;
 351             }
 352
 353           if (gimple_get_lhs (stmt) == NULL_TREE)
 354             {
 355               if (dump_enabled_p ())
 356                 {
 357                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 358                                    "not vectorized: irregular stmt.");
 359                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 360                                     0);
 361                 }
 362               return false;
 363             }
 364
 365           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 366             {
 367               if (dump_enabled_p ())
 368                 {
 369                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 370                                    "not vectorized: vector stmt in loop:");
 371                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 372                 }
 373               return false;
 374             }
 375
 376           if (STMT_VINFO_VECTYPE (stmt_info))
 377             {
 378               /* The only case when a vectype had been already set is for stmts
 379                  that contain a dataref, or for "pattern-stmts" (stmts
 380                  generated by the vectorizer to represent/replace a certain
 381                  idiom).  */
 382               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 383                           || is_pattern_stmt_p (stmt_info)
 384                           || !gsi_end_p (pattern_def_si));
 385               vectype = STMT_VINFO_VECTYPE (stmt_info);
 386             }
 387           else
 388             {
 389               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
 390               scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 391               if (dump_enabled_p ())
 392                 {
 393                   dump_printf_loc (MSG_NOTE, vect_location,
 394                                    "get vectype for scalar type:  ");
 395                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 396                 }
 397               vectype = get_vectype_for_scalar_type (scalar_type);
 398               if (!vectype)
 399                 {
 400                   if (dump_enabled_p ())
 401                     {
 402                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 403                                        "not vectorized: unsupported "
 404                                        "data-type ");
 405                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 406                                          scalar_type);
 407                     }
 408                   return false;
 409                 }
 410
 411               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 412             }
 413
 414           /* The vectorization factor is according to the smallest
 415              scalar type (or the largest vector size, but we only
 416              support one vector size per loop).  */
 417           scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 418                                                        &dummy);
 419           if (dump_enabled_p ())
 420             {
 421               dump_printf_loc (MSG_NOTE, vect_location,
 422                                "get vectype for scalar type:  ");
 423               dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 424             }
 425           vf_vectype = get_vectype_for_scalar_type (scalar_type);
 426           if (!vf_vectype)
 427             {
 428               if (dump_enabled_p ())
 429                 {
 430                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 431                                    "not vectorized: unsupported data-type ");
 432                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 433                                      scalar_type);
 434                 }
 435               return false;
 436             }
 437
 438           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
 439                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 440             {
 441               if (dump_enabled_p ())
 442                 {
 443                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 444                                    "not vectorized: different sized vector "
 445                                    "types in statement, ");
 446                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 447                                      vectype);
 448                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 449                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 450                                      vf_vectype);
 451                 }
 452               return false;
 453             }
 454
 455           if (dump_enabled_p ())
 456             {
 457               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 458               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
 459             }
 460
 461           nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
 462           if (dump_enabled_p ())
 463             dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d", nunits);
 464           if (!vectorization_factor
 465               || (nunits > vectorization_factor))
 466             vectorization_factor = nunits;
 467
 468           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 469             {
 470               pattern_def_seq = NULL;
 471               gsi_next (&si);
 472             }
 473         }
 474     }
 475
 476   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 477   if (dump_enabled_p ())
 478     dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d",
 479                      vectorization_factor);
 480   if (vectorization_factor <= 1)
 481     {
 482       if (dump_enabled_p ())
 483         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 484                          "not vectorized: unsupported data-type");
 485       return false;
 486     }
 487   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 488
 489   return true;
 490 }
 491
 492
 493 /* Function vect_is_simple_iv_evolution.
 494
 495    FORNOW: A simple evolution of an induction variables in the loop is
 496    considered a polynomial evolution with constant step.  */
 497
 498 static bool
 499 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 500                              tree * step)
 501 {
 502   tree init_expr;
 503   tree step_expr;
 504   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 505
 506   /* When there is no evolution in this loop, the evolution function
 507      is not "simple".  */
 508   if (evolution_part == NULL_TREE)
 509     return false;
 510
 511   /* When the evolution is a polynomial of degree >= 2
 512      the evolution function is not "simple".  */
 513   if (tree_is_chrec (evolution_part))
 514     return false;
 515
 516   step_expr = evolution_part;
 517   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 518
 519   if (dump_enabled_p ())
 520     {
 521       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 522       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 523       dump_printf (MSG_NOTE, ",  init: ");
 524       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 525     }
 526
 527   *init = init_expr;
 528   *step = step_expr;
 529
 530   if (TREE_CODE (step_expr) != INTEGER_CST)
 531     {
 532       if (dump_enabled_p ())
 533         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 534                          "step unknown.");
 535       return false;
 536     }
 537
 538   return true;
 539 }
 540
 541 /* Function vect_analyze_scalar_cycles_1.
 542
 543    Examine the cross iteration def-use cycles of scalar variables
 544    in LOOP.  LOOP_VINFO represents the loop that is now being
 545    considered for vectorization (can be LOOP, or an outer-loop
 546    enclosing LOOP).  */
 547
 548 static void
 549 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 550 {
 551   basic_block bb = loop->header;
 552   tree dumy;
 553   vec<gimple> worklist;
 554   worklist.create (64);
 555   gimple_stmt_iterator gsi;
 556   bool double_reduc;
 557
 558   if (dump_enabled_p ())
 559     dump_printf_loc (MSG_NOTE, vect_location,
 560                      "=== vect_analyze_scalar_cycles ===");
 561
 562   /* First - identify all inductions.  Reduction detection assumes that all the
 563      inductions have been identified, therefore, this order must not be
 564      changed.  */
 565   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 566     {
 567       gimple phi = gsi_stmt (gsi);
 568       tree access_fn = NULL;
 569       tree def = PHI_RESULT (phi);
 570       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 571
 572       if (dump_enabled_p ())
 573         {
 574           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 575           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 576         }
 577
 578       /* Skip virtual phi's.  The data dependences that are associated with
 579          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 580       if (virtual_operand_p (def))
 581         continue;
 582
 583       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 584
 585       /* Analyze the evolution function.  */
 586       access_fn = analyze_scalar_evolution (loop, def);
 587       if (access_fn)
 588         {
 589           STRIP_NOPS (access_fn);
 590           if (dump_enabled_p ())
 591             {
 592               dump_printf_loc (MSG_NOTE, vect_location,
 593                                "Access function of PHI: ");
 594               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 595             }
 596           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 597             = evolution_part_in_loop_num (access_fn, loop->num);
 598         }
 599
 600       if (!access_fn
 601           || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
 602         {
 603           worklist.safe_push (phi);
 604           continue;
 605         }
 606
 607       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 608
 609       if (dump_enabled_p ())
 610         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.");
 611       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 612     }
 613
 614
 615   /* Second - identify all reductions and nested cycles.  */
 616   while (worklist.length () > 0)
 617     {
 618       gimple phi = worklist.pop ();
 619       tree def = PHI_RESULT (phi);
 620       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 621       gimple reduc_stmt;
 622       bool nested_cycle;
 623
 624       if (dump_enabled_p ())
 625         {
 626           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 627           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 628         }
 629
 630       gcc_assert (!virtual_operand_p (def)
 631                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 632
 633       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
 634       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
 635                                                 &double_reduc);
 636       if (reduc_stmt)
 637         {
 638           if (double_reduc)
 639             {
 640               if (dump_enabled_p ())
 641                 dump_printf_loc (MSG_NOTE, vect_location,
 642                                  "Detected double reduction.");
 643
 644               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 645               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 646                                                     vect_double_reduction_def;
 647             }
 648           else
 649             {
 650               if (nested_cycle)
 651                 {
 652                   if (dump_enabled_p ())
 653                     dump_printf_loc (MSG_NOTE, vect_location,
 654                                      "Detected vectorizable nested cycle.");
 655
 656                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 657                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 658                                                              vect_nested_cycle;
 659                 }
 660               else
 661                 {
 662                   if (dump_enabled_p ())
 663                     dump_printf_loc (MSG_NOTE, vect_location,
 664                                      "Detected reduction.");
 665
 666                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 667                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 668                                                            vect_reduction_def;
 669                   /* Store the reduction cycles for possible vectorization in
 670                      loop-aware SLP.  */
 671                   LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 672                 }
 673             }
 674         }
 675       else
 676         if (dump_enabled_p ())
 677           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 678                            "Unknown def-use cycle pattern.");
 679     }
 680
 681   worklist.release ();
 682 }
 683
 684
 685 /* Function vect_analyze_scalar_cycles.
 686
 687    Examine the cross iteration def-use cycles of scalar variables, by
 688    analyzing the loop-header PHIs of scalar variables.  Classify each
 689    cycle as one of the following: invariant, induction, reduction, unknown.
 690    We do that for the loop represented by LOOP_VINFO, and also to its
 691    inner-loop, if exists.
 692    Examples for scalar cycles:
 693
 694    Example1: reduction:
 695
 696               loop1:
 697               for (i=0; i<N; i++)
 698                  sum += a[i];
 699
 700    Example2: induction:
 701
 702               loop2:
 703               for (i=0; i<N; i++)
 704                  a[i] = i;  */
 705
 706 static void
 707 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 708 {
 709   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 710
 711   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 712
 713   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 714      Reductions in such inner-loop therefore have different properties than
 715      the reductions in the nest that gets vectorized:
 716      1. When vectorized, they are executed in the same order as in the original
 717         scalar loop, so we can't change the order of computation when
 718         vectorizing them.
 719      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 720         current checks are too strict.  */
 721
 722   if (loop->inner)
 723     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 724 }
 725
 726 /* Function vect_get_loop_niters.
 727
 728    Determine how many iterations the loop is executed.
 729    If an expression that represents the number of iterations
 730    can be constructed, place it in NUMBER_OF_ITERATIONS.
 731    Return the loop exit condition.  */
 732
 733 static gimple
 734 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
 735 {
 736   tree niters;
 737
 738   if (dump_enabled_p ())
 739     dump_printf_loc (MSG_NOTE, vect_location,
 740                      "=== get_loop_niters ===");
 741   niters = number_of_exit_cond_executions (loop);
 742
 743   if (niters != NULL_TREE
 744       && niters != chrec_dont_know)
 745     {
 746       *number_of_iterations = niters;
 747
 748       if (dump_enabled_p ())
 749         {
 750           dump_printf_loc (MSG_NOTE, vect_location, "==> get_loop_niters:");
 751           dump_generic_expr (MSG_NOTE, TDF_SLIM, *number_of_iterations);
 752         }
 753     }
 754
 755   return get_loop_exit_condition (loop);
 756 }
 757
 758
 759 /* Function bb_in_loop_p
 760
 761    Used as predicate for dfs order traversal of the loop bbs.  */
 762
 763 static bool
 764 bb_in_loop_p (const_basic_block bb, const void *data)
 765 {
 766   const struct loop *const loop = (const struct loop *)data;
 767   if (flow_bb_inside_loop_p (loop, bb))
 768     return true;
 769   return false;
 770 }
 771
 772
 773 /* Function new_loop_vec_info.
 774
 775    Create and initialize a new loop_vec_info struct for LOOP, as well as
 776    stmt_vec_info structs for all the stmts in LOOP.  */
 777
 778 static loop_vec_info
 779 new_loop_vec_info (struct loop *loop)
 780 {
 781   loop_vec_info res;
 782   basic_block *bbs;
 783   gimple_stmt_iterator si;
 784   unsigned int i, nbbs;
 785
 786   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
 787   LOOP_VINFO_LOOP (res) = loop;
 788
 789   bbs = get_loop_body (loop);
 790
 791   /* Create/Update stmt_info for all stmts in the loop.  */
 792   for (i = 0; i < loop->num_nodes; i++)
 793     {
 794       basic_block bb = bbs[i];
 795
 796       /* BBs in a nested inner-loop will have been already processed (because
 797          we will have called vect_analyze_loop_form for any nested inner-loop).
 798          Therefore, for stmts in an inner-loop we just want to update the
 799          STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
 800          loop_info of the outer-loop we are currently considering to vectorize
 801          (instead of the loop_info of the inner-loop).
 802          For stmts in other BBs we need to create a stmt_info from scratch.  */
 803       if (bb->loop_father != loop)
 804         {
 805           /* Inner-loop bb.  */
 806           gcc_assert (loop->inner && bb->loop_father == loop->inner);
 807           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 808             {
 809               gimple phi = gsi_stmt (si);
 810               stmt_vec_info stmt_info = vinfo_for_stmt (phi);
 811               loop_vec_info inner_loop_vinfo =
 812                 STMT_VINFO_LOOP_VINFO (stmt_info);
 813               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 814               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 815             }
 816           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 817            {
 818               gimple stmt = gsi_stmt (si);
 819               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 820               loop_vec_info inner_loop_vinfo =
 821                  STMT_VINFO_LOOP_VINFO (stmt_info);
 822               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 823               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 824            }
 825         }
 826       else
 827         {
 828           /* bb in current nest.  */
 829           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 830             {
 831               gimple phi = gsi_stmt (si);
 832               gimple_set_uid (phi, 0);
 833               set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
 834             }
 835
 836           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 837             {
 838               gimple stmt = gsi_stmt (si);
 839               gimple_set_uid (stmt, 0);
 840               set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
 841             }
 842         }
 843     }
 844
 845   /* CHECKME: We want to visit all BBs before their successors (except for
 846      latch blocks, for which this assertion wouldn't hold).  In the simple
 847      case of the loop forms we allow, a dfs order of the BBs would the same
 848      as reversed postorder traversal, so we are safe.  */
 849
 850    free (bbs);
 851    bbs = XCNEWVEC (basic_block, loop->num_nodes);
 852    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 853                               bbs, loop->num_nodes, loop);
 854    gcc_assert (nbbs == loop->num_nodes);
 855
 856   LOOP_VINFO_BBS (res) = bbs;
 857   LOOP_VINFO_NITERS (res) = NULL;
 858   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
 859   LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
 860   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
 861   LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
 862   LOOP_VINFO_VECT_FACTOR (res) = 0;
 863   LOOP_VINFO_LOOP_NEST (res).create (3);
 864   LOOP_VINFO_DATAREFS (res).create (10);
 865   LOOP_VINFO_DDRS (res).create (10 * 10);
 866   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
 867   LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
 868              PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
 869   LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
 870              PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
 871   LOOP_VINFO_GROUPED_STORES (res).create (10);
 872   LOOP_VINFO_REDUCTIONS (res).create (10);
 873   LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
 874   LOOP_VINFO_SLP_INSTANCES (res).create (10);
 875   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
 876   LOOP_VINFO_PEELING_HTAB (res) = NULL;
 877   LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
 878   LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
 879   LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
 880
 881   return res;
 882 }
 883
 884
 885 /* Function destroy_loop_vec_info.
 886
 887    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
 888    stmts in the loop.  */
 889
 890 void
 891 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
 892 {
 893   struct loop *loop;
 894   basic_block *bbs;
 895   int nbbs;
 896   gimple_stmt_iterator si;
 897   int j;
 898   vec<slp_instance> slp_instances;
 899   slp_instance instance;
 900   bool swapped;
 901
 902   if (!loop_vinfo)
 903     return;
 904
 905   loop = LOOP_VINFO_LOOP (loop_vinfo);
 906
 907   bbs = LOOP_VINFO_BBS (loop_vinfo);
 908   nbbs = clean_stmts ? loop->num_nodes : 0;
 909   swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
 910
 911   for (j = 0; j < nbbs; j++)
 912     {
 913       basic_block bb = bbs[j];
 914       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 915         free_stmt_vec_info (gsi_stmt (si));
 916
 917       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
 918         {
 919           gimple stmt = gsi_stmt (si);
 920
 921           /* We may have broken canonical form by moving a constant
 922              into RHS1 of a commutative op.  Fix such occurrences.  */
 923           if (swapped && is_gimple_assign (stmt))
 924             {
 925               enum tree_code code = gimple_assign_rhs_code (stmt);
 926
 927               if ((code == PLUS_EXPR
 928                    || code == POINTER_PLUS_EXPR
 929                    || code == MULT_EXPR)
 930                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
 931                 swap_tree_operands (stmt,
 932                                     gimple_assign_rhs1_ptr (stmt),
 933                                     gimple_assign_rhs2_ptr (stmt));
 934             }
 935
 936           /* Free stmt_vec_info.  */
 937           free_stmt_vec_info (stmt);
 938           gsi_next (&si);
 939         }
 940     }
 941
 942   free (LOOP_VINFO_BBS (loop_vinfo));
 943   free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
 944   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
 945   LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
 946   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
 947   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
 948   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
 949   FOR_EACH_VEC_ELT (slp_instances, j, instance)
 950     vect_free_slp_instance (instance);
 951
 952   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
 953   LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
 954   LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
 955   LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
 956
 957   if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
 958     htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));
 959
 960   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
 961
 962   free (loop_vinfo);
 963   loop->aux = NULL;
 964 }
 965
 966
 967 /* Function vect_analyze_loop_1.
 968
 969    Apply a set of analyses on LOOP, and create a loop_vec_info struct
 970    for it. The different analyses will record information in the
 971    loop_vec_info struct.  This is a subset of the analyses applied in
 972    vect_analyze_loop, to be applied on an inner-loop nested in the loop
 973    that is now considered for (outer-loop) vectorization.  */
 974
 975 static loop_vec_info
 976 vect_analyze_loop_1 (struct loop *loop)
 977 {
 978   loop_vec_info loop_vinfo;
 979
 980   if (dump_enabled_p ())
 981     dump_printf_loc (MSG_NOTE, vect_location,
 982                      "===== analyze_loop_nest_1 =====");
 983
 984   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.  */
 985
 986   loop_vinfo = vect_analyze_loop_form (loop);
 987   if (!loop_vinfo)
 988     {
 989       if (dump_enabled_p ())
 990         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 991                          "bad inner-loop form.");
 992       return NULL;
 993     }
 994
 995   return loop_vinfo;
 996 }
 997
 998
 999 /* Function vect_analyze_loop_form.
1000
1001    Verify that certain CFG restrictions hold, including:
1002    - the loop has a pre-header
1003    - the loop has a single entry and exit
1004    - the loop exit condition is simple enough, and the number of iterations
1005      can be analyzed (a countable loop).  */
1006
1007 loop_vec_info
1008 vect_analyze_loop_form (struct loop *loop)
1009 {
1010   loop_vec_info loop_vinfo;
1011   gimple loop_cond;
1012   tree number_of_iterations = NULL;
1013   loop_vec_info inner_loop_vinfo = NULL;
1014
1015   if (dump_enabled_p ())
1016     dump_printf_loc (MSG_NOTE, vect_location,
1017                      "=== vect_analyze_loop_form ===");
1018
1019   /* Different restrictions apply when we are considering an inner-most loop,
1020      vs. an outer (nested) loop.
1021      (FORNOW. May want to relax some of these restrictions in the future).  */
1022
1023   if (!loop->inner)
1024     {
1025       /* Inner-most loop.  We currently require that the number of BBs is
1026          exactly 2 (the header and latch).  Vectorizable inner-most loops
1027          look like this:
1028
1029                         (pre-header)
1030                            |
1031                           header <--------+
1032                            | |            |
1033                            | +--> latch --+
1034                            |
1035                         (exit-bb)  */
1036
1037       if (loop->num_nodes != 2)
1038         {
1039           if (dump_enabled_p ())
1040             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1041                              "not vectorized: control flow in loop.");
1042           return NULL;
1043         }
1044
1045       if (empty_block_p (loop->header))
1046     {
1047           if (dump_enabled_p ())
1048             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1049                              "not vectorized: empty loop.");
1050       return NULL;
1051     }
1052     }
1053   else
1054     {
1055       struct loop *innerloop = loop->inner;
1056       edge entryedge;
1057
1058       /* Nested loop. We currently require that the loop is doubly-nested,
1059          contains a single inner loop, and the number of BBs is exactly 5.
1060          Vectorizable outer-loops look like this:
1061
1062                         (pre-header)
1063                            |
1064                           header <---+
1065                            |         |
1066                           inner-loop |
1067                            |         |
1068                           tail ------+
1069                            |
1070                         (exit-bb)
1071
1072          The inner-loop has the properties expected of inner-most loops
1073          as described above.  */
1074
1075       if ((loop->inner)->inner || (loop->inner)->next)
1076         {
1077           if (dump_enabled_p ())
1078             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1079                              "not vectorized: multiple nested loops.");
1080           return NULL;
1081         }
1082
1083       /* Analyze the inner-loop.  */
1084       inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1085       if (!inner_loop_vinfo)
1086         {
1087           if (dump_enabled_p ())
1088             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1089                              "not vectorized: Bad inner loop.");
1090           return NULL;
1091         }
1092
1093       if (!expr_invariant_in_loop_p (loop,
1094                                         LOOP_VINFO_NITERS (inner_loop_vinfo)))
1095         {
1096           if (dump_enabled_p ())
1097             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1098                              "not vectorized: inner-loop count not invariant.");
1099           destroy_loop_vec_info (inner_loop_vinfo, true);
1100           return NULL;
1101         }
1102
1103       if (loop->num_nodes != 5)
1104         {
1105           if (dump_enabled_p ())
1106             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1107                              "not vectorized: control flow in loop.");
1108           destroy_loop_vec_info (inner_loop_vinfo, true);
1109           return NULL;
1110         }
1111
1112       gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1113       entryedge = EDGE_PRED (innerloop->header, 0);
1114       if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1115         entryedge = EDGE_PRED (innerloop->header, 1);
1116
1117       if (entryedge->src != loop->header
1118           || !single_exit (innerloop)
1119           || single_exit (innerloop)->dest !=  EDGE_PRED (loop->latch, 0)->src)
1120         {
1121           if (dump_enabled_p ())
1122             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1123                              "not vectorized: unsupported outerloop form.");
1124           destroy_loop_vec_info (inner_loop_vinfo, true);
1125           return NULL;
1126         }
1127
1128       if (dump_enabled_p ())
1129         dump_printf_loc (MSG_NOTE, vect_location,
1130                          "Considering outer-loop vectorization.");
1131     }
1132
1133   if (!single_exit (loop)
1134       || EDGE_COUNT (loop->header->preds) != 2)
1135     {
1136       if (dump_enabled_p ())
1137         {
1138           if (!single_exit (loop))
1139             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1140                              "not vectorized: multiple exits.");
1141           else if (EDGE_COUNT (loop->header->preds) != 2)
1142             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1143                              "not vectorized: too many incoming edges.");
1144         }
1145       if (inner_loop_vinfo)
1146         destroy_loop_vec_info (inner_loop_vinfo, true);
1147       return NULL;
1148     }
1149
1150   /* We assume that the loop exit condition is at the end of the loop. i.e,
1151      that the loop is represented as a do-while (with a proper if-guard
1152      before the loop if needed), where the loop header contains all the
1153      executable statements, and the latch is empty.  */
1154   if (!empty_block_p (loop->latch)
1155       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1156     {
1157       if (dump_enabled_p ())
1158         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1159                          "not vectorized: latch block not empty.");
1160       if (inner_loop_vinfo)
1161         destroy_loop_vec_info (inner_loop_vinfo, true);
1162       return NULL;
1163     }
1164
1165   /* Make sure there exists a single-predecessor exit bb:  */
1166   if (!single_pred_p (single_exit (loop)->dest))
1167     {
1168       edge e = single_exit (loop);
1169       if (!(e->flags & EDGE_ABNORMAL))
1170         {
1171           split_loop_exit_edge (e);
1172           if (dump_enabled_p ())
1173             dump_printf (MSG_NOTE, "split exit edge.");
1174         }
1175       else
1176         {
1177           if (dump_enabled_p ())
1178             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1179                              "not vectorized: abnormal loop exit edge.");
1180           if (inner_loop_vinfo)
1181             destroy_loop_vec_info (inner_loop_vinfo, true);
1182           return NULL;
1183         }
1184     }
1185
1186   loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1187   if (!loop_cond)
1188     {
1189       if (dump_enabled_p ())
1190         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1191                          "not vectorized: complicated exit condition.");
1192       if (inner_loop_vinfo)
1193         destroy_loop_vec_info (inner_loop_vinfo, true);
1194       return NULL;
1195     }
1196
1197   if (!number_of_iterations)
1198     {
1199       if (dump_enabled_p ())
1200         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1201                          "not vectorized: number of iterations cannot be "
1202                          "computed.");
1203       if (inner_loop_vinfo)
1204         destroy_loop_vec_info (inner_loop_vinfo, true);
1205       return NULL;
1206     }
1207
1208   if (chrec_contains_undetermined (number_of_iterations))
1209     {
1210       if (dump_enabled_p ())
1211             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1212                              "Infinite number of iterations.");
1213       if (inner_loop_vinfo)
1214         destroy_loop_vec_info (inner_loop_vinfo, true);
1215       return NULL;
1216     }
1217
1218   if (!NITERS_KNOWN_P (number_of_iterations))
1219     {
1220       if (dump_enabled_p ())
1221         {
1222           dump_printf_loc (MSG_NOTE, vect_location,
1223                            "Symbolic number of iterations is ");
1224           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1225         }
1226     }
1227   else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1228     {
1229       if (dump_enabled_p ())
1230         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1231                          "not vectorized: number of iterations = 0.");
1232       if (inner_loop_vinfo)
1233         destroy_loop_vec_info (inner_loop_vinfo, true);
1234       return NULL;
1235     }
1236
1237   loop_vinfo = new_loop_vec_info (loop);
1238   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1239   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1240
1241   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1242
1243   /* CHECKME: May want to keep it around it in the future.  */
1244   if (inner_loop_vinfo)
1245     destroy_loop_vec_info (inner_loop_vinfo, false);
1246
1247   gcc_assert (!loop->aux);
1248   loop->aux = loop_vinfo;
1249   return loop_vinfo;
1250 }
1251
1252
1253 /* Function vect_analyze_loop_operations.
1254
1255    Scan the loop stmts and make sure they are all vectorizable.  */
1256
1257 static bool
1258 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1259 {
1260   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1261   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1262   int nbbs = loop->num_nodes;
1263   gimple_stmt_iterator si;
1264   unsigned int vectorization_factor = 0;
1265   int i;
1266   gimple phi;
1267   stmt_vec_info stmt_info;
1268   bool need_to_vectorize = false;
1269   int min_profitable_iters;
1270   int min_scalar_loop_bound;
1271   unsigned int th;
1272   bool only_slp_in_loop = true, ok;
1273   HOST_WIDE_INT max_niter;
1274   HOST_WIDE_INT estimated_niter;
1275   int min_profitable_estimate;
1276
1277   if (dump_enabled_p ())
1278     dump_printf_loc (MSG_NOTE, vect_location,
1279                      "=== vect_analyze_loop_operations ===");
1280
1281   gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1282   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1283   if (slp)
1284     {
1285       /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1286          vectorization factor of the loop is the unrolling factor required by
1287          the SLP instances.  If that unrolling factor is 1, we say, that we
1288          perform pure SLP on loop - cross iteration parallelism is not
1289          exploited.  */
1290       for (i = 0; i < nbbs; i++)
1291         {
1292           basic_block bb = bbs[i];
1293           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1294             {
1295               gimple stmt = gsi_stmt (si);
1296               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1297               gcc_assert (stmt_info);
1298               if ((STMT_VINFO_RELEVANT_P (stmt_info)
1299                    || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1300                   && !PURE_SLP_STMT (stmt_info))
1301                 /* STMT needs both SLP and loop-based vectorization.  */
1302                 only_slp_in_loop = false;
1303             }
1304         }
1305
1306       if (only_slp_in_loop)
1307         vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1308       else
1309         vectorization_factor = least_common_multiple (vectorization_factor,
1310                                 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1311
1312       LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1313       if (dump_enabled_p ())
1314         dump_printf_loc (MSG_NOTE, vect_location,
1315                          "Updating vectorization factor to %d ",
1316                          vectorization_factor);
1317     }
1318
1319   for (i = 0; i < nbbs; i++)
1320     {
1321       basic_block bb = bbs[i];
1322
1323       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1324         {
1325           phi = gsi_stmt (si);
1326           ok = true;
1327
1328           stmt_info = vinfo_for_stmt (phi);
1329           if (dump_enabled_p ())
1330             {
1331               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1332               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1333             }
1334
1335           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1336              (i.e., a phi in the tail of the outer-loop).  */
1337           if (! is_loop_header_bb_p (bb))
1338             {
1339               /* FORNOW: we currently don't support the case that these phis
1340                  are not used in the outerloop (unless it is double reduction,
1341                  i.e., this phi is vect_reduction_def), cause this case
1342                  requires to actually do something here.  */
1343               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1344                    || STMT_VINFO_LIVE_P (stmt_info))
1345                   && STMT_VINFO_DEF_TYPE (stmt_info)
1346                      != vect_double_reduction_def)
1347                 {
1348                   if (dump_enabled_p ())
1349                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1350                                      "Unsupported loop-closed phi in "
1351                                      "outer-loop.");
1352                   return false;
1353                 }
1354
1355               /* If PHI is used in the outer loop, we check that its operand
1356                  is defined in the inner loop.  */
1357               if (STMT_VINFO_RELEVANT_P (stmt_info))
1358                 {
1359                   tree phi_op;
1360                   gimple op_def_stmt;
1361
1362                   if (gimple_phi_num_args (phi) != 1)
1363                     return false;
1364
1365                   phi_op = PHI_ARG_DEF (phi, 0);
1366                   if (TREE_CODE (phi_op) != SSA_NAME)
1367                     return false;
1368
1369                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1370                   if (!op_def_stmt
1371                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1372                       || !vinfo_for_stmt (op_def_stmt))
1373                     return false;
1374
1375                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1376                         != vect_used_in_outer
1377                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1378                            != vect_used_in_outer_by_reduction)
1379                     return false;
1380                 }
1381
1382               continue;
1383             }
1384
1385           gcc_assert (stmt_info);
1386
1387           if (STMT_VINFO_LIVE_P (stmt_info))
1388             {
1389               /* FORNOW: not yet supported.  */
1390               if (dump_enabled_p ())
1391                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1392                                  "not vectorized: value used after loop.");
1393               return false;
1394             }
1395
1396           if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1397               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1398             {
1399               /* A scalar-dependence cycle that we don't support.  */
1400               if (dump_enabled_p ())
1401                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1402                                  "not vectorized: scalar dependence cycle.");
1403               return false;
1404             }
1405
1406           if (STMT_VINFO_RELEVANT_P (stmt_info))
1407             {
1408               need_to_vectorize = true;
1409               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1410                 ok = vectorizable_induction (phi, NULL, NULL);
1411             }
1412
1413           if (!ok)
1414             {
1415               if (dump_enabled_p ())
1416                 {
1417                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1418                                    "not vectorized: relevant phi not "
1419                                    "supported: ");
1420                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1421                 }
1422               return false;
1423             }
1424         }
1425
1426       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1427         {
1428           gimple stmt = gsi_stmt (si);
1429           if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1430             return false;
1431         }
1432     } /* bbs */
1433
1434   /* All operations in the loop are either irrelevant (deal with loop
1435      control, or dead), or only used outside the loop and can be moved
1436      out of the loop (e.g. invariants, inductions).  The loop can be
1437      optimized away by scalar optimizations.  We're better off not
1438      touching this loop.  */
1439   if (!need_to_vectorize)
1440     {
1441       if (dump_enabled_p ())
1442         dump_printf_loc (MSG_NOTE, vect_location,
1443                          "All the computation can be taken out of the loop.");
1444       if (dump_enabled_p ())
1445         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1446                          "not vectorized: redundant loop. no profit to "
1447                          "vectorize.");
1448       return false;
1449     }
1450
1451   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1452     dump_printf_loc (MSG_NOTE, vect_location,
1453                      "vectorization_factor = %d, niters = "
1454                      HOST_WIDE_INT_PRINT_DEC, vectorization_factor,
1455                      LOOP_VINFO_INT_NITERS (loop_vinfo));
1456
1457   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1458        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1459       || ((max_niter = max_stmt_executions_int (loop)) != -1
1460           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1461     {
1462       if (dump_enabled_p ())
1463         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1464                          "not vectorized: iteration count too small.");
1465       if (dump_enabled_p ())
1466         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1467                          "not vectorized: iteration count smaller than "
1468                          "vectorization factor.");
1469       return false;
1470     }
1471
1472   /* Analyze cost.  Decide if worth while to vectorize.  */
1473
1474   /* Once VF is set, SLP costs should be updated since the number of created
1475      vector stmts depends on VF.  */
1476   vect_update_slp_costs_according_to_vf (loop_vinfo);
1477
1478   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1479                                       &min_profitable_estimate);
1480   LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1481
1482   if (min_profitable_iters < 0)
1483     {
1484       if (dump_enabled_p ())
1485         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1486                          "not vectorized: vectorization not profitable.");
1487       if (dump_enabled_p ())
1488         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1489                          "not vectorized: vector version will never be "
1490                          "profitable.");
1491       return false;
1492     }
1493
1494   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1495                             * vectorization_factor) - 1);
1496
1497
1498   /* Use the cost model only if it is more conservative than user specified
1499      threshold.  */
1500
1501   th = (unsigned) min_scalar_loop_bound;
1502   if (min_profitable_iters
1503       && (!min_scalar_loop_bound
1504           || min_profitable_iters > min_scalar_loop_bound))
1505     th = (unsigned) min_profitable_iters;
1506
1507   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1508       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1509     {
1510       if (dump_enabled_p ())
1511         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1512                          "not vectorized: vectorization not profitable.");
1513       if (dump_enabled_p ())
1514         dump_printf_loc (MSG_NOTE, vect_location,
1515                          "not vectorized: iteration count smaller than user "
1516                          "specified loop bound parameter or minimum profitable "
1517                          "iterations (whichever is more conservative).");
1518       return false;
1519     }
1520
1521   if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1522       && ((unsigned HOST_WIDE_INT) estimated_niter
1523           <= MAX (th, (unsigned)min_profitable_estimate)))
1524     {
1525       if (dump_enabled_p ())
1526         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1527                          "not vectorized: estimated iteration count too "
1528                          "small.");
1529       if (dump_enabled_p ())
1530         dump_printf_loc (MSG_NOTE, vect_location,
1531                          "not vectorized: estimated iteration count smaller "
1532                          "than specified loop bound parameter or minimum "
1533                          "profitable iterations (whichever is more "
1534                          "conservative).");
1535       return false;
1536     }
1537
1538   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1539       || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1540       || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1541     {
1542       if (dump_enabled_p ())
1543         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required.");
1544       if (!vect_can_advance_ivs_p (loop_vinfo))
1545         {
1546           if (dump_enabled_p ())
1547             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1548                              "not vectorized: can't create epilog loop 1.");
1549           return false;
1550         }
1551       if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1552         {
1553           if (dump_enabled_p ())
1554             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1555                              "not vectorized: can't create epilog loop 2.");
1556           return false;
1557         }
1558     }
1559
1560   return true;
1561 }
1562
1563
1564 /* Function vect_analyze_loop_2.
1565
1566    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1567    for it.  The different analyses will record information in the
1568    loop_vec_info struct.  */
1569 static bool
1570 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1571 {
1572   bool ok, slp = false;
1573   int max_vf = MAX_VECTORIZATION_FACTOR;
1574   int min_vf = 2;
1575
1576   /* Find all data references in the loop (which correspond to vdefs/vuses)
1577      and analyze their evolution in the loop.  Also adjust the minimal
1578      vectorization factor according to the loads and stores.
1579
1580      FORNOW: Handle only simple, array references, which
1581      alignment can be forced, and aligned pointer-references.  */
1582
1583   ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1584   if (!ok)
1585     {
1586       if (dump_enabled_p ())
1587         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1588                          "bad data references.");
1589       return false;
1590     }
1591
1592   /* Classify all cross-iteration scalar data-flow cycles.
1593      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1594
1595   vect_analyze_scalar_cycles (loop_vinfo);
1596
1597   vect_pattern_recog (loop_vinfo, NULL);
1598
1599   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1600
1601   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1602   if (!ok)
1603     {
1604       if (dump_enabled_p ())
1605         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1606                          "unexpected pattern.");
1607       return false;
1608     }
1609
1610   /* Analyze data dependences between the data-refs in the loop
1611      and adjust the maximum vectorization factor according to
1612      the dependences.
1613      FORNOW: fail at the first data dependence that we encounter.  */
1614
1615   ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf);
1616   if (!ok
1617       || max_vf < min_vf)
1618     {
1619       if (dump_enabled_p ())
1620             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1621                              "bad data dependence.");
1622       return false;
1623     }
1624
1625   ok = vect_determine_vectorization_factor (loop_vinfo);
1626   if (!ok)
1627     {
1628       if (dump_enabled_p ())
1629         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1630                          "can't determine vectorization factor.");
1631       return false;
1632     }
1633   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1634     {
1635       if (dump_enabled_p ())
1636         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1637                          "bad data dependence.");
1638       return false;
1639     }
1640
1641   /* Analyze the alignment of the data-refs in the loop.
1642      Fail if a data reference is found that cannot be vectorized.  */
1643
1644   ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1645   if (!ok)
1646     {
1647       if (dump_enabled_p ())
1648         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1649                          "bad data alignment.");
1650       return false;
1651     }
1652
1653   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1654      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1655
1656   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1657   if (!ok)
1658     {
1659       if (dump_enabled_p ())
1660         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1661                          "bad data access.");
1662       return false;
1663     }
1664
1665   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1666      It is important to call pruning after vect_analyze_data_ref_accesses,
1667      since we use grouping information gathered by interleaving analysis.  */
1668   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1669   if (!ok)
1670     {
1671       if (dump_enabled_p ())
1672         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1673                          "too long list of versioning for alias "
1674                          "run-time tests.");
1675       return false;
1676     }
1677
1678   /* This pass will decide on using loop versioning and/or loop peeling in
1679      order to enhance the alignment of data references in the loop.  */
1680
1681   ok = vect_enhance_data_refs_alignment (loop_vinfo);
1682   if (!ok)
1683     {
1684       if (dump_enabled_p ())
1685         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1686                          "bad data alignment.");
1687       return false;
1688     }
1689
1690   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1691   ok = vect_analyze_slp (loop_vinfo, NULL);
1692   if (ok)
1693     {
1694       /* Decide which possible SLP instances to SLP.  */
1695       slp = vect_make_slp_decision (loop_vinfo);
1696
1697       /* Find stmts that need to be both vectorized and SLPed.  */
1698       vect_detect_hybrid_slp (loop_vinfo);
1699     }
1700   else
1701     return false;
1702
1703   /* Scan all the operations in the loop and make sure they are
1704      vectorizable.  */
1705
1706   ok = vect_analyze_loop_operations (loop_vinfo, slp);
1707   if (!ok)
1708     {
1709       if (dump_enabled_p ())
1710         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1711                          "bad operation or unsupported loop bound.");
1712       return false;
1713     }
1714
1715   return true;
1716 }
1717
1718 /* Function vect_analyze_loop.
1719
1720    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1721    for it.  The different analyses will record information in the
1722    loop_vec_info struct.  */
1723 loop_vec_info
1724 vect_analyze_loop (struct loop *loop)
1725 {
1726   loop_vec_info loop_vinfo;
1727   unsigned int vector_sizes;
1728
1729   /* Autodetect first vector size we try.  */
1730   current_vector_size = 0;
1731   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1732
1733   if (dump_enabled_p ())
1734     dump_printf_loc (MSG_NOTE, vect_location,
1735                      "===== analyze_loop_nest =====");
1736
1737   if (loop_outer (loop)
1738       && loop_vec_info_for_loop (loop_outer (loop))
1739       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1740     {
1741       if (dump_enabled_p ())
1742         dump_printf_loc (MSG_NOTE, vect_location,
1743                          "outer-loop already vectorized.");
1744       return NULL;
1745     }
1746
1747   while (1)
1748     {
1749       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
1750       loop_vinfo = vect_analyze_loop_form (loop);
1751       if (!loop_vinfo)
1752         {
1753           if (dump_enabled_p ())
1754             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1755                              "bad loop form.");
1756           return NULL;
1757         }
1758
1759       if (vect_analyze_loop_2 (loop_vinfo))
1760         {
1761           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1762
1763           return loop_vinfo;
1764         }
1765
1766       destroy_loop_vec_info (loop_vinfo, true);
1767
1768       vector_sizes &= ~current_vector_size;
1769       if (vector_sizes == 0
1770           || current_vector_size == 0)
1771         return NULL;
1772
1773       /* Try the next biggest vector size.  */
1774       current_vector_size = 1 << floor_log2 (vector_sizes);
1775       if (dump_enabled_p ())
1776         dump_printf_loc (MSG_NOTE, vect_location,
1777                          "***** Re-trying analysis with "
1778                          "vector size %d\n", current_vector_size);
1779     }
1780 }
1781
1782
1783 /* Function reduction_code_for_scalar_code
1784
1785    Input:
1786    CODE - tree_code of a reduction operations.
1787
1788    Output:
1789    REDUC_CODE - the corresponding tree-code to be used to reduce the
1790       vector of partial results into a single scalar result (which
1791       will also reside in a vector) or ERROR_MARK if the operation is
1792       a supported reduction operation, but does not have such tree-code.
1793
1794    Return FALSE if CODE currently cannot be vectorized as reduction.  */
1795
1796 static bool
1797 reduction_code_for_scalar_code (enum tree_code code,
1798                                 enum tree_code *reduc_code)
1799 {
1800   switch (code)
1801     {
1802       case MAX_EXPR:
1803         *reduc_code = REDUC_MAX_EXPR;
1804         return true;
1805
1806       case MIN_EXPR:
1807         *reduc_code = REDUC_MIN_EXPR;
1808         return true;
1809
1810       case PLUS_EXPR:
1811         *reduc_code = REDUC_PLUS_EXPR;
1812         return true;
1813
1814       case MULT_EXPR:
1815       case MINUS_EXPR:
1816       case BIT_IOR_EXPR:
1817       case BIT_XOR_EXPR:
1818       case BIT_AND_EXPR:
1819         *reduc_code = ERROR_MARK;
1820         return true;
1821
1822       default:
1823        return false;
1824     }
1825 }
1826
1827
1828 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
1829    STMT is printed with a message MSG. */
1830
1831 static void
1832 report_vect_op (int msg_type, gimple stmt, const char *msg)
1833 {
1834   dump_printf_loc (msg_type, vect_location, "%s", msg);
1835   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
1836 }
1837
1838
1839 /* Detect SLP reduction of the form:
1840
1841    #a1 = phi <a5, a0>
1842    a2 = operation (a1)
1843    a3 = operation (a2)
1844    a4 = operation (a3)
1845    a5 = operation (a4)
1846
1847    #a = phi <a5>
1848
1849    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1850    FIRST_STMT is the first reduction stmt in the chain
1851    (a2 = operation (a1)).
1852
1853    Return TRUE if a reduction chain was detected.  */
1854
1855 static bool
1856 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1857 {
1858   struct loop *loop = (gimple_bb (phi))->loop_father;
1859   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1860   enum tree_code code;
1861   gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
1862   stmt_vec_info use_stmt_info, current_stmt_info;
1863   tree lhs;
1864   imm_use_iterator imm_iter;
1865   use_operand_p use_p;
1866   int nloop_uses, size = 0, n_out_of_loop_uses;
1867   bool found = false;
1868
1869   if (loop != vect_loop)
1870     return false;
1871
1872   lhs = PHI_RESULT (phi);
1873   code = gimple_assign_rhs_code (first_stmt);
1874   while (1)
1875     {
1876       nloop_uses = 0;
1877       n_out_of_loop_uses = 0;
1878       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
1879         {
1880           gimple use_stmt = USE_STMT (use_p);
1881           if (is_gimple_debug (use_stmt))
1882             continue;
1883
1884           use_stmt = USE_STMT (use_p);
1885
1886           /* Check if we got back to the reduction phi.  */
1887           if (use_stmt == phi)
1888             {
1889               loop_use_stmt = use_stmt;
1890               found = true;
1891               break;
1892             }
1893
1894           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1895             {
1896               if (vinfo_for_stmt (use_stmt)
1897                   && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
1898                 {
1899                   loop_use_stmt = use_stmt;
1900                   nloop_uses++;
1901                 }
1902             }
1903            else
1904              n_out_of_loop_uses++;
1905
1906            /* There are can be either a single use in the loop or two uses in
1907               phi nodes.  */
1908            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
1909              return false;
1910         }
1911
1912       if (found)
1913         break;
1914
1915       /* We reached a statement with no loop uses.  */
1916       if (nloop_uses == 0)
1917         return false;
1918
1919       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
1920       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
1921         return false;
1922
1923       if (!is_gimple_assign (loop_use_stmt)
1924           || code != gimple_assign_rhs_code (loop_use_stmt)
1925           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
1926         return false;
1927
1928       /* Insert USE_STMT into reduction chain.  */
1929       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
1930       if (current_stmt)
1931         {
1932           current_stmt_info = vinfo_for_stmt (current_stmt);
1933           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
1934           GROUP_FIRST_ELEMENT (use_stmt_info)
1935             = GROUP_FIRST_ELEMENT (current_stmt_info);
1936         }
1937       else
1938         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
1939
1940       lhs = gimple_assign_lhs (loop_use_stmt);
1941       current_stmt = loop_use_stmt;
1942       size++;
1943    }
1944
1945   if (!found || loop_use_stmt != phi || size < 2)
1946     return false;
1947
1948   /* Swap the operands, if needed, to make the reduction operand be the second
1949      operand.  */
1950   lhs = PHI_RESULT (phi);
1951   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
1952   while (next_stmt)
1953     {
1954       if (gimple_assign_rhs2 (next_stmt) == lhs)
1955         {
1956           tree op = gimple_assign_rhs1 (next_stmt);
1957           gimple def_stmt = NULL;
1958
1959           if (TREE_CODE (op) == SSA_NAME)
1960             def_stmt = SSA_NAME_DEF_STMT (op);
1961
1962           /* Check that the other def is either defined in the loop
1963              ("vect_internal_def"), or it's an induction (defined by a
1964              loop-header phi-node).  */
1965           if (def_stmt
1966               && gimple_bb (def_stmt)
1967               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1968               && (is_gimple_assign (def_stmt)
1969                   || is_gimple_call (def_stmt)
1970                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1971                            == vect_induction_def
1972                   || (gimple_code (def_stmt) == GIMPLE_PHI
1973                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1974                                   == vect_internal_def
1975                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
1976             {
1977               lhs = gimple_assign_lhs (next_stmt);
1978               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
1979               continue;
1980             }
1981
1982           return false;
1983         }
1984       else
1985         {
1986           tree op = gimple_assign_rhs2 (next_stmt);
1987           gimple def_stmt = NULL;
1988
1989           if (TREE_CODE (op) == SSA_NAME)
1990             def_stmt = SSA_NAME_DEF_STMT (op);
1991
1992           /* Check that the other def is either defined in the loop
1993             ("vect_internal_def"), or it's an induction (defined by a
1994             loop-header phi-node).  */
1995           if (def_stmt
1996               && gimple_bb (def_stmt)
1997               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1998               && (is_gimple_assign (def_stmt)
1999                   || is_gimple_call (def_stmt)
2000                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2001                               == vect_induction_def
2002                   || (gimple_code (def_stmt) == GIMPLE_PHI
2003                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2004                                   == vect_internal_def
2005                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2006             {
2007               if (dump_enabled_p ())
2008                 {
2009                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2010                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2011                 }
2012
2013               swap_tree_operands (next_stmt,
2014                                   gimple_assign_rhs1_ptr (next_stmt),
2015                                   gimple_assign_rhs2_ptr (next_stmt));
2016               update_stmt (next_stmt);
2017
2018               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2019                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2020             }
2021           else
2022             return false;
2023         }
2024
2025       lhs = gimple_assign_lhs (next_stmt);
2026       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2027     }
2028
2029   /* Save the chain for further analysis in SLP detection.  */
2030   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2031   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2032   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2033
2034   return true;
2035 }
2036
2037
2038 /* Function vect_is_simple_reduction_1
2039
2040    (1) Detect a cross-iteration def-use cycle that represents a simple
2041    reduction computation.  We look for the following pattern:
2042
2043    loop_header:
2044      a1 = phi < a0, a2 >
2045      a3 = ...
2046      a2 = operation (a3, a1)
2047
2048    such that:
2049    1. operation is commutative and associative and it is safe to
2050       change the order of the computation (if CHECK_REDUCTION is true)
2051    2. no uses for a2 in the loop (a2 is used out of the loop)
2052    3. no uses of a1 in the loop besides the reduction operation
2053    4. no uses of a1 outside the loop.
2054
2055    Conditions 1,4 are tested here.
2056    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2057
2058    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2059    nested cycles, if CHECK_REDUCTION is false.
2060
2061    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2062    reductions:
2063
2064      a1 = phi < a0, a2 >
2065      inner loop (def of a3)
2066      a2 = phi < a3 >
2067
2068    If MODIFY is true it tries also to rework the code in-place to enable
2069    detection of more reduction patterns.  For the time being we rewrite
2070    "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
2071 */
2072
2073 static gimple
2074 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2075                             bool check_reduction, bool *double_reduc,
2076                             bool modify)
2077 {
2078   struct loop *loop = (gimple_bb (phi))->loop_father;
2079   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2080   edge latch_e = loop_latch_edge (loop);
2081   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2082   gimple def_stmt, def1 = NULL, def2 = NULL;
2083   enum tree_code orig_code, code;
2084   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2085   tree type;
2086   int nloop_uses;
2087   tree name;
2088   imm_use_iterator imm_iter;
2089   use_operand_p use_p;
2090   bool phi_def;
2091
2092   *double_reduc = false;
2093
2094   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2095      otherwise, we assume outer loop vectorization.  */
2096   gcc_assert ((check_reduction && loop == vect_loop)
2097               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2098
2099   name = PHI_RESULT (phi);
2100   nloop_uses = 0;
2101   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2102     {
2103       gimple use_stmt = USE_STMT (use_p);
2104       if (is_gimple_debug (use_stmt))
2105         continue;
2106
2107       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2108         {
2109           if (dump_enabled_p ())
2110             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2111                              "intermediate value used outside loop.");
2112
2113           return NULL;
2114         }
2115
2116       if (vinfo_for_stmt (use_stmt)
2117           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2118         nloop_uses++;
2119       if (nloop_uses > 1)
2120         {
2121           if (dump_enabled_p ())
2122             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2123                              "reduction used in loop.");
2124           return NULL;
2125         }
2126     }
2127
2128   if (TREE_CODE (loop_arg) != SSA_NAME)
2129     {
2130       if (dump_enabled_p ())
2131         {
2132           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2133                            "reduction: not ssa_name: ");
2134           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2135         }
2136       return NULL;
2137     }
2138
2139   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2140   if (!def_stmt)
2141     {
2142       if (dump_enabled_p ())
2143         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2144                          "reduction: no def_stmt.");
2145       return NULL;
2146     }
2147
2148   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2149     {
2150       if (dump_enabled_p ())
2151         dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2152       return NULL;
2153     }
2154
2155   if (is_gimple_assign (def_stmt))
2156     {
2157       name = gimple_assign_lhs (def_stmt);
2158       phi_def = false;
2159     }
2160   else
2161     {
2162       name = PHI_RESULT (def_stmt);
2163       phi_def = true;
2164     }
2165
2166   nloop_uses = 0;
2167   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2168     {
2169       gimple use_stmt = USE_STMT (use_p);
2170       if (is_gimple_debug (use_stmt))
2171         continue;
2172       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2173           && vinfo_for_stmt (use_stmt)
2174           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2175         nloop_uses++;
2176       if (nloop_uses > 1)
2177         {
2178           if (dump_enabled_p ())
2179             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2180                              "reduction used in loop.");
2181           return NULL;
2182         }
2183     }
2184
2185   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2186      defined in the inner loop.  */
2187   if (phi_def)
2188     {
2189       op1 = PHI_ARG_DEF (def_stmt, 0);
2190
2191       if (gimple_phi_num_args (def_stmt) != 1
2192           || TREE_CODE (op1) != SSA_NAME)
2193         {
2194           if (dump_enabled_p ())
2195             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2196                              "unsupported phi node definition.");
2197
2198           return NULL;
2199         }
2200
2201       def1 = SSA_NAME_DEF_STMT (op1);
2202       if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2203           && loop->inner
2204           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2205           && is_gimple_assign (def1))
2206         {
2207           if (dump_enabled_p ())
2208             report_vect_op (MSG_NOTE, def_stmt,
2209                             "detected double reduction: ");
2210
2211           *double_reduc = true;
2212           return def_stmt;
2213         }
2214
2215       return NULL;
2216     }
2217
2218   code = orig_code = gimple_assign_rhs_code (def_stmt);
2219
2220   /* We can handle "res -= x[i]", which is non-associative by
2221      simply rewriting this into "res += -x[i]".  Avoid changing
2222      gimple instruction for the first simple tests and only do this
2223      if we're allowed to change code at all.  */
2224   if (code == MINUS_EXPR
2225       && modify
2226       && (op1 = gimple_assign_rhs1 (def_stmt))
2227       && TREE_CODE (op1) == SSA_NAME
2228       && SSA_NAME_DEF_STMT (op1) == phi)
2229     code = PLUS_EXPR;
2230
2231   if (check_reduction
2232       && (!commutative_tree_code (code) || !associative_tree_code (code)))
2233     {
2234       if (dump_enabled_p ())
2235         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2236                         "reduction: not commutative/associative: ");
2237       return NULL;
2238     }
2239
2240   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2241     {
2242       if (code != COND_EXPR)
2243         {
2244           if (dump_enabled_p ())
2245             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2246                             "reduction: not binary operation: ");
2247
2248           return NULL;
2249         }
2250
2251       op3 = gimple_assign_rhs1 (def_stmt);
2252       if (COMPARISON_CLASS_P (op3))
2253         {
2254           op4 = TREE_OPERAND (op3, 1);
2255           op3 = TREE_OPERAND (op3, 0);
2256         }
2257
2258       op1 = gimple_assign_rhs2 (def_stmt);
2259       op2 = gimple_assign_rhs3 (def_stmt);
2260
2261       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2262         {
2263           if (dump_enabled_p ())
2264             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2265                             "reduction: uses not ssa_names: ");
2266
2267           return NULL;
2268         }
2269     }
2270   else
2271     {
2272       op1 = gimple_assign_rhs1 (def_stmt);
2273       op2 = gimple_assign_rhs2 (def_stmt);
2274
2275       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2276         {
2277           if (dump_enabled_p ())
2278             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2279                             "reduction: uses not ssa_names: ");
2280
2281           return NULL;
2282         }
2283    }
2284
2285   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2286   if ((TREE_CODE (op1) == SSA_NAME
2287        && !types_compatible_p (type,TREE_TYPE (op1)))
2288       || (TREE_CODE (op2) == SSA_NAME
2289           && !types_compatible_p (type, TREE_TYPE (op2)))
2290       || (op3 && TREE_CODE (op3) == SSA_NAME
2291           && !types_compatible_p (type, TREE_TYPE (op3)))
2292       || (op4 && TREE_CODE (op4) == SSA_NAME
2293           && !types_compatible_p (type, TREE_TYPE (op4))))
2294     {
2295       if (dump_enabled_p ())
2296         {
2297           dump_printf_loc (MSG_NOTE, vect_location,
2298                            "reduction: multiple types: operation type: ");
2299           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2300           dump_printf (MSG_NOTE, ", operands types: ");
2301           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2302                              TREE_TYPE (op1));
2303           dump_printf (MSG_NOTE, ",");
2304           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2305                              TREE_TYPE (op2));
2306           if (op3)
2307             {
2308               dump_printf (MSG_NOTE, ",");
2309               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2310                                  TREE_TYPE (op3));
2311             }
2312
2313           if (op4)
2314             {
2315               dump_printf (MSG_NOTE, ",");
2316               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2317                                  TREE_TYPE (op4));
2318             }
2319         }
2320
2321       return NULL;
2322     }
2323
2324   /* Check that it's ok to change the order of the computation.
2325      Generally, when vectorizing a reduction we change the order of the
2326      computation.  This may change the behavior of the program in some
2327      cases, so we need to check that this is ok.  One exception is when
2328      vectorizing an outer-loop: the inner-loop is executed sequentially,
2329      and therefore vectorizing reductions in the inner-loop during
2330      outer-loop vectorization is safe.  */
2331
2332   /* CHECKME: check for !flag_finite_math_only too?  */
2333   if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2334       && check_reduction)
2335     {
2336       /* Changing the order of operations changes the semantics.  */
2337       if (dump_enabled_p ())
2338         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2339                         "reduction: unsafe fp math optimization: ");
2340       return NULL;
2341     }
2342   else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2343            && check_reduction)
2344     {
2345       /* Changing the order of operations changes the semantics.  */
2346       if (dump_enabled_p ())
2347         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2348                         "reduction: unsafe int math optimization: ");
2349       return NULL;
2350     }
2351   else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2352     {
2353       /* Changing the order of operations changes the semantics.  */
2354       if (dump_enabled_p ())
2355         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2356                         "reduction: unsafe fixed-point math optimization: ");
2357       return NULL;
2358     }
2359
2360   /* If we detected "res -= x[i]" earlier, rewrite it into
2361      "res += -x[i]" now.  If this turns out to be useless reassoc
2362      will clean it up again.  */
2363   if (orig_code == MINUS_EXPR)
2364     {
2365       tree rhs = gimple_assign_rhs2 (def_stmt);
2366       tree negrhs = make_ssa_name (TREE_TYPE (rhs), NULL);
2367       gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2368                                                          rhs, NULL);
2369       gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2370       set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2371                                                           loop_info, NULL));
2372       gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2373       gimple_assign_set_rhs2 (def_stmt, negrhs);
2374       gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2375       update_stmt (def_stmt);
2376     }
2377
2378   /* Reduction is safe. We're dealing with one of the following:
2379      1) integer arithmetic and no trapv
2380      2) floating point arithmetic, and special flags permit this optimization
2381      3) nested cycle (i.e., outer loop vectorization).  */
2382   if (TREE_CODE (op1) == SSA_NAME)
2383     def1 = SSA_NAME_DEF_STMT (op1);
2384
2385   if (TREE_CODE (op2) == SSA_NAME)
2386     def2 = SSA_NAME_DEF_STMT (op2);
2387
2388   if (code != COND_EXPR
2389       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2390     {
2391       if (dump_enabled_p ())
2392         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2393       return NULL;
2394     }
2395
2396   /* Check that one def is the reduction def, defined by PHI,
2397      the other def is either defined in the loop ("vect_internal_def"),
2398      or it's an induction (defined by a loop-header phi-node).  */
2399
2400   if (def2 && def2 == phi
2401       && (code == COND_EXPR
2402           || !def1 || gimple_nop_p (def1)
2403           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2404               && (is_gimple_assign (def1)
2405                   || is_gimple_call (def1)
2406                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2407                       == vect_induction_def
2408                   || (gimple_code (def1) == GIMPLE_PHI
2409                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2410                           == vect_internal_def
2411                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
2412     {
2413       if (dump_enabled_p ())
2414         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2415       return def_stmt;
2416     }
2417
2418   if (def1 && def1 == phi
2419       && (code == COND_EXPR
2420           || !def2 || gimple_nop_p (def2)
2421           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2422               && (is_gimple_assign (def2)
2423                   || is_gimple_call (def2)
2424                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2425                       == vect_induction_def
2426                   || (gimple_code (def2) == GIMPLE_PHI
2427                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2428                           == vect_internal_def
2429                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
2430     {
2431       if (check_reduction)
2432         {
2433           /* Swap operands (just for simplicity - so that the rest of the code
2434              can assume that the reduction variable is always the last (second)
2435              argument).  */
2436           if (dump_enabled_p ())
2437             report_vect_op (MSG_NOTE, def_stmt,
2438                             "detected reduction: need to swap operands: ");
2439
2440           swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2441                               gimple_assign_rhs2_ptr (def_stmt));
2442
2443           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2444             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2445         }
2446       else
2447         {
2448           if (dump_enabled_p ())
2449             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2450         }
2451
2452       return def_stmt;
2453     }
2454
2455   /* Try to find SLP reduction chain.  */
2456   if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2457     {
2458       if (dump_enabled_p ())
2459         report_vect_op (MSG_NOTE, def_stmt,
2460                         "reduction: detected reduction chain: ");
2461
2462       return def_stmt;
2463     }
2464
2465   if (dump_enabled_p ())
2466     report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2467                     "reduction: unknown pattern: ");
2468
2469   return NULL;
2470 }
2471
2472 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2473    in-place.  Arguments as there.  */
2474
2475 static gimple
2476 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2477                           bool check_reduction, bool *double_reduc)
2478 {
2479   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2480                                      double_reduc, false);
2481 }
2482
2483 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2484    in-place if it enables detection of more reductions.  Arguments
2485    as there.  */
2486
2487 gimple
2488 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2489                           bool check_reduction, bool *double_reduc)
2490 {
2491   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2492                                      double_reduc, true);
2493 }
2494
2495 /* Calculate the cost of one scalar iteration of the loop.  */
2496 int
2497 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
2498 {
2499   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2500   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2501   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2502   int innerloop_iters, i, stmt_cost;
2503
2504   /* Count statements in scalar loop.  Using this as scalar cost for a single
2505      iteration for now.
2506
2507      TODO: Add outer loop support.
2508
2509      TODO: Consider assigning different costs to different scalar
2510      statements.  */
2511
2512   /* FORNOW.  */
2513   innerloop_iters = 1;
2514   if (loop->inner)
2515     innerloop_iters = 50; /* FIXME */
2516
2517   for (i = 0; i < nbbs; i++)
2518     {
2519       gimple_stmt_iterator si;
2520       basic_block bb = bbs[i];
2521
2522       if (bb->loop_father == loop->inner)
2523         factor = innerloop_iters;
2524       else
2525         factor = 1;
2526
2527       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2528         {
2529           gimple stmt = gsi_stmt (si);
2530           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2531
2532           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2533             continue;
2534
2535           /* Skip stmts that are not vectorized inside the loop.  */
2536           if (stmt_info
2537               && !STMT_VINFO_RELEVANT_P (stmt_info)
2538               && (!STMT_VINFO_LIVE_P (stmt_info)
2539                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2540               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2541             continue;
2542
2543           if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2544             {
2545               if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2546                stmt_cost = vect_get_stmt_cost (scalar_load);
2547              else
2548                stmt_cost = vect_get_stmt_cost (scalar_store);
2549             }
2550           else
2551             stmt_cost = vect_get_stmt_cost (scalar_stmt);
2552
2553           scalar_single_iter_cost += stmt_cost * factor;
2554         }
2555     }
2556   return scalar_single_iter_cost;
2557 }
2558
2559 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
2560 int
2561 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2562                              int *peel_iters_epilogue,
2563                              int scalar_single_iter_cost,
2564                              stmt_vector_for_cost *prologue_cost_vec,
2565                              stmt_vector_for_cost *epilogue_cost_vec)
2566 {
2567   int retval = 0;
2568   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2569
2570   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2571     {
2572       *peel_iters_epilogue = vf/2;
2573       if (dump_enabled_p ())
2574         dump_printf_loc (MSG_NOTE, vect_location,
2575                          "cost model: epilogue peel iters set to vf/2 "
2576                          "because loop iterations are unknown .");
2577
2578       /* If peeled iterations are known but number of scalar loop
2579          iterations are unknown, count a taken branch per peeled loop.  */
2580       retval = record_stmt_cost (prologue_cost_vec, 2, cond_branch_taken,
2581                                  NULL, 0, vect_prologue);
2582     }
2583   else
2584     {
2585       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2586       peel_iters_prologue = niters < peel_iters_prologue ?
2587                             niters : peel_iters_prologue;
2588       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2589       /* If we need to peel for gaps, but no peeling is required, we have to
2590          peel VF iterations.  */
2591       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2592         *peel_iters_epilogue = vf;
2593     }
2594
2595   if (peel_iters_prologue)
2596     retval += record_stmt_cost (prologue_cost_vec,
2597                                 peel_iters_prologue * scalar_single_iter_cost,
2598                                 scalar_stmt, NULL, 0, vect_prologue);
2599   if (*peel_iters_epilogue)
2600     retval += record_stmt_cost (epilogue_cost_vec,
2601                                 *peel_iters_epilogue * scalar_single_iter_cost,
2602                                 scalar_stmt, NULL, 0, vect_epilogue);
2603   return retval;
2604 }
2605
2606 /* Function vect_estimate_min_profitable_iters
2607
2608    Return the number of iterations required for the vector version of the
2609    loop to be profitable relative to the cost of the scalar version of the
2610    loop.  */
2611
2612 static void
2613 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2614                                     int *ret_min_profitable_niters,
2615                                     int *ret_min_profitable_estimate)
2616 {
2617   int min_profitable_iters;
2618   int min_profitable_estimate;
2619   int peel_iters_prologue;
2620   int peel_iters_epilogue;
2621   unsigned vec_inside_cost = 0;
2622   int vec_outside_cost = 0;
2623   unsigned vec_prologue_cost = 0;
2624   unsigned vec_epilogue_cost = 0;
2625   int scalar_single_iter_cost = 0;
2626   int scalar_outside_cost = 0;
2627   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2628   int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2629   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2630
2631   /* Cost model disabled.  */
2632   if (!flag_vect_cost_model)
2633     {
2634       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.");
2635       *ret_min_profitable_niters = 0;
2636       *ret_min_profitable_estimate = 0;
2637       return;
2638     }
2639
2640   /* Requires loop versioning tests to handle misalignment.  */
2641   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2642     {
2643       /*  FIXME: Make cost depend on complexity of individual check.  */
2644       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2645       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2646                             vect_prologue);
2647       dump_printf (MSG_NOTE,
2648                    "cost model: Adding cost of checks for loop "
2649                    "versioning to treat misalignment.\n");
2650     }
2651
2652   /* Requires loop versioning with alias checks.  */
2653   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2654     {
2655       /*  FIXME: Make cost depend on complexity of individual check.  */
2656       unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2657       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2658                             vect_prologue);
2659       dump_printf (MSG_NOTE,
2660                    "cost model: Adding cost of checks for loop "
2661                    "versioning aliasing.\n");
2662     }
2663
2664   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2665       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2666     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2667                           vect_prologue);
2668
2669   /* Count statements in scalar loop.  Using this as scalar cost for a single
2670      iteration for now.
2671
2672      TODO: Add outer loop support.
2673
2674      TODO: Consider assigning different costs to different scalar
2675      statements.  */
2676
2677   scalar_single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
2678
2679   /* Add additional cost for the peeled instructions in prologue and epilogue
2680      loop.
2681
2682      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2683      at compile-time - we assume it's vf/2 (the worst would be vf-1).
2684
2685      TODO: Build an expression that represents peel_iters for prologue and
2686      epilogue to be used in a run-time test.  */
2687
2688   if (npeel  < 0)
2689     {
2690       peel_iters_prologue = vf/2;
2691       dump_printf (MSG_NOTE, "cost model: "
2692                    "prologue peel iters set to vf/2.");
2693
2694       /* If peeling for alignment is unknown, loop bound of main loop becomes
2695          unknown.  */
2696       peel_iters_epilogue = vf/2;
2697       dump_printf (MSG_NOTE, "cost model: "
2698                    "epilogue peel iters set to vf/2 because "
2699                    "peeling for alignment is unknown.");
2700
2701       /* If peeled iterations are unknown, count a taken branch and a not taken
2702          branch per peeled loop. Even if scalar loop iterations are known,
2703          vector iterations are not known since peeled prologue iterations are
2704          not known. Hence guards remain the same.  */
2705       (void) add_stmt_cost (target_cost_data, 2, cond_branch_taken,
2706                             NULL, 0, vect_prologue);
2707       (void) add_stmt_cost (target_cost_data, 2, cond_branch_not_taken,
2708                             NULL, 0, vect_prologue);
2709       /* FORNOW: Don't attempt to pass individual scalar instructions to
2710          the model; just assume linear cost for scalar iterations.  */
2711       (void) add_stmt_cost (target_cost_data,
2712                             peel_iters_prologue * scalar_single_iter_cost,
2713                             scalar_stmt, NULL, 0, vect_prologue);
2714       (void) add_stmt_cost (target_cost_data,
2715                             peel_iters_epilogue * scalar_single_iter_cost,
2716                             scalar_stmt, NULL, 0, vect_epilogue);
2717     }
2718   else
2719     {
2720       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2721       stmt_info_for_cost *si;
2722       int j;
2723       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2724
2725       prologue_cost_vec.create (2);
2726       epilogue_cost_vec.create (2);
2727       peel_iters_prologue = npeel;
2728
2729       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2730                                           &peel_iters_epilogue,
2731                                           scalar_single_iter_cost,
2732                                           &prologue_cost_vec,
2733                                           &epilogue_cost_vec);
2734
2735       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2736         {
2737           struct _stmt_vec_info *stmt_info
2738             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2739           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2740                                 si->misalign, vect_prologue);
2741         }
2742
2743       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2744         {
2745           struct _stmt_vec_info *stmt_info
2746             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2747           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2748                                 si->misalign, vect_epilogue);
2749         }
2750
2751       prologue_cost_vec.release ();
2752       epilogue_cost_vec.release ();
2753     }
2754
2755   /* FORNOW: The scalar outside cost is incremented in one of the
2756      following ways:
2757
2758      1. The vectorizer checks for alignment and aliasing and generates
2759      a condition that allows dynamic vectorization.  A cost model
2760      check is ANDED with the versioning condition.  Hence scalar code
2761      path now has the added cost of the versioning check.
2762
2763        if (cost > th & versioning_check)
2764          jmp to vector code
2765
2766      Hence run-time scalar is incremented by not-taken branch cost.
2767
2768      2. The vectorizer then checks if a prologue is required.  If the
2769      cost model check was not done before during versioning, it has to
2770      be done before the prologue check.
2771
2772        if (cost <= th)
2773          prologue = scalar_iters
2774        if (prologue == 0)
2775          jmp to vector code
2776        else
2777          execute prologue
2778        if (prologue == num_iters)
2779          go to exit
2780
2781      Hence the run-time scalar cost is incremented by a taken branch,
2782      plus a not-taken branch, plus a taken branch cost.
2783
2784      3. The vectorizer then checks if an epilogue is required.  If the
2785      cost model check was not done before during prologue check, it
2786      has to be done with the epilogue check.
2787
2788        if (prologue == 0)
2789          jmp to vector code
2790        else
2791          execute prologue
2792        if (prologue == num_iters)
2793          go to exit
2794        vector code:
2795          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2796            jmp to epilogue
2797
2798      Hence the run-time scalar cost should be incremented by 2 taken
2799      branches.
2800
2801      TODO: The back end may reorder the BBS's differently and reverse
2802      conditions/branch directions.  Change the estimates below to
2803      something more reasonable.  */
2804
2805   /* If the number of iterations is known and we do not do versioning, we can
2806      decide whether to vectorize at compile time.  Hence the scalar version
2807      do not carry cost model guard costs.  */
2808   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2809       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2810       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2811     {
2812       /* Cost model check occurs at versioning.  */
2813       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2814           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2815         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
2816       else
2817         {
2818           /* Cost model check occurs at prologue generation.  */
2819           if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2820             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
2821               + vect_get_stmt_cost (cond_branch_not_taken);
2822           /* Cost model check occurs at epilogue generation.  */
2823           else
2824             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
2825         }
2826     }
2827
2828   /* Complete the target-specific cost calculations.  */
2829   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
2830                &vec_inside_cost, &vec_epilogue_cost);
2831
2832   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
2833
2834   /* Calculate number of iterations required to make the vector version
2835      profitable, relative to the loop bodies only.  The following condition
2836      must hold true:
2837      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2838      where
2839      SIC = scalar iteration cost, VIC = vector iteration cost,
2840      VOC = vector outside cost, VF = vectorization factor,
2841      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2842      SOC = scalar outside cost for run time cost model check.  */
2843
2844   if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
2845     {
2846       if (vec_outside_cost <= 0)
2847         min_profitable_iters = 1;
2848       else
2849         {
2850           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2851                                   - vec_inside_cost * peel_iters_prologue
2852                                   - vec_inside_cost * peel_iters_epilogue)
2853                                  / ((scalar_single_iter_cost * vf)
2854                                     - vec_inside_cost);
2855
2856           if ((scalar_single_iter_cost * vf * min_profitable_iters)
2857               <= (((int) vec_inside_cost * min_profitable_iters)
2858                   + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
2859             min_profitable_iters++;
2860         }
2861     }
2862   /* vector version will never be profitable.  */
2863   else
2864     {
2865       if (dump_enabled_p ())
2866         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2867                          "cost model: the vector iteration cost = %d "
2868                          "divided by the scalar iteration cost = %d "
2869                          "is greater or equal to the vectorization factor = %d.",
2870                          vec_inside_cost, scalar_single_iter_cost, vf);
2871       *ret_min_profitable_niters = -1;
2872       *ret_min_profitable_estimate = -1;
2873       return;
2874     }
2875
2876   if (dump_enabled_p ())
2877     {
2878       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2879       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
2880                    vec_inside_cost);
2881       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
2882                    vec_prologue_cost);
2883       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
2884                    vec_epilogue_cost);
2885       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
2886                    scalar_single_iter_cost);
2887       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
2888                    scalar_outside_cost);
2889       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
2890                    vec_outside_cost);
2891       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
2892                    peel_iters_prologue);
2893       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
2894                    peel_iters_epilogue);
2895       dump_printf (MSG_NOTE,
2896                    "  Calculated minimum iters for profitability: %d\n",
2897                    min_profitable_iters);
2898     }
2899
2900   min_profitable_iters =
2901         min_profitable_iters < vf ? vf : min_profitable_iters;
2902
2903   /* Because the condition we create is:
2904      if (niters <= min_profitable_iters)
2905        then skip the vectorized loop.  */
2906   min_profitable_iters--;
2907
2908   if (dump_enabled_p ())
2909     dump_printf_loc (MSG_NOTE, vect_location,
2910                      "  Runtime profitability threshold = %d\n", min_profitable_iters);
2911
2912   *ret_min_profitable_niters = min_profitable_iters;
2913
2914   /* Calculate number of iterations required to make the vector version
2915      profitable, relative to the loop bodies only.
2916
2917      Non-vectorized variant is SIC * niters and it must win over vector
2918      variant on the expected loop trip count.  The following condition must hold true:
2919      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
2920
2921   if (vec_outside_cost <= 0)
2922     min_profitable_estimate = 1;
2923   else
2924     {
2925       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
2926                                  - vec_inside_cost * peel_iters_prologue
2927                                  - vec_inside_cost * peel_iters_epilogue)
2928                                  / ((scalar_single_iter_cost * vf)
2929                                    - vec_inside_cost);
2930     }
2931   min_profitable_estimate --;
2932   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
2933   if (dump_enabled_p ())
2934     dump_printf_loc (MSG_NOTE, vect_location,
2935                      "  Static estimate profitability threshold = %d\n",
2936                       min_profitable_iters);
2937
2938   *ret_min_profitable_estimate = min_profitable_estimate;
2939 }
2940
2941
2942 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2943    functions. Design better to avoid maintenance issues.  */
2944
2945 /* Function vect_model_reduction_cost.
2946
2947    Models cost for a reduction operation, including the vector ops
2948    generated within the strip-mine loop, the initial definition before
2949    the loop, and the epilogue code that must be generated.  */
2950
2951 static bool
2952 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2953                            int ncopies)
2954 {
2955   int prologue_cost = 0, epilogue_cost = 0;
2956   enum tree_code code;
2957   optab optab;
2958   tree vectype;
2959   gimple stmt, orig_stmt;
2960   tree reduction_op;
2961   enum machine_mode mode;
2962   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2963   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2964   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2965
2966   /* Cost of reduction op inside loop.  */
2967   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
2968                                         stmt_info, 0, vect_body);
2969   stmt = STMT_VINFO_STMT (stmt_info);
2970
2971   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2972     {
2973     case GIMPLE_SINGLE_RHS:
2974       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2975       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2976       break;
2977     case GIMPLE_UNARY_RHS:
2978       reduction_op = gimple_assign_rhs1 (stmt);
2979       break;
2980     case GIMPLE_BINARY_RHS:
2981       reduction_op = gimple_assign_rhs2 (stmt);
2982       break;
2983     case GIMPLE_TERNARY_RHS:
2984       reduction_op = gimple_assign_rhs3 (stmt);
2985       break;
2986     default:
2987       gcc_unreachable ();
2988     }
2989
2990   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2991   if (!vectype)
2992     {
2993       if (dump_enabled_p ())
2994         {
2995           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2996                            "unsupported data-type ");
2997           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2998                              TREE_TYPE (reduction_op));
2999         }
3000       return false;
3001    }
3002
3003   mode = TYPE_MODE (vectype);
3004   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3005
3006   if (!orig_stmt)
3007     orig_stmt = STMT_VINFO_STMT (stmt_info);
3008
3009   code = gimple_assign_rhs_code (orig_stmt);
3010
3011   /* Add in cost for initial definition.  */
3012   prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3013                                   stmt_info, 0, vect_prologue);
3014
3015   /* Determine cost of epilogue code.
3016
3017      We have a reduction operator that will reduce the vector in one statement.
3018      Also requires scalar extract.  */
3019
3020   if (!nested_in_vect_loop_p (loop, orig_stmt))
3021     {
3022       if (reduc_code != ERROR_MARK)
3023         {
3024           epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3025                                           stmt_info, 0, vect_epilogue);
3026           epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3027                                           stmt_info, 0, vect_epilogue);
3028         }
3029       else
3030         {
3031           int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3032           tree bitsize =
3033             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3034           int element_bitsize = tree_low_cst (bitsize, 1);
3035           int nelements = vec_size_in_bits / element_bitsize;
3036
3037           optab = optab_for_tree_code (code, vectype, optab_default);
3038
3039           /* We have a whole vector shift available.  */
3040           if (VECTOR_MODE_P (mode)
3041               && optab_handler (optab, mode) != CODE_FOR_nothing
3042               && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3043             {
3044               /* Final reduction via vector shifts and the reduction operator.
3045                  Also requires scalar extract.  */
3046               epilogue_cost += add_stmt_cost (target_cost_data,
3047                                               exact_log2 (nelements) * 2,
3048                                               vector_stmt, stmt_info, 0,
3049                                               vect_epilogue);
3050               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3051                                               vec_to_scalar, stmt_info, 0,
3052                                               vect_epilogue);
3053             }
3054           else
3055             /* Use extracts and reduction op for final reduction.  For N
3056                elements, we have N extracts and N-1 reduction ops.  */
3057             epilogue_cost += add_stmt_cost (target_cost_data,
3058                                             nelements + nelements - 1,
3059                                             vector_stmt, stmt_info, 0,
3060                                             vect_epilogue);
3061         }
3062     }
3063
3064   if (dump_enabled_p ())
3065     dump_printf (MSG_NOTE,
3066                  "vect_model_reduction_cost: inside_cost = %d, "
3067                  "prologue_cost = %d, epilogue_cost = %d .", inside_cost,
3068                  prologue_cost, epilogue_cost);
3069
3070   return true;
3071 }
3072
3073
3074 /* Function vect_model_induction_cost.
3075
3076    Models cost for induction operations.  */
3077
3078 static void
3079 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3080 {
3081   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3082   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3083   unsigned inside_cost, prologue_cost;
3084
3085   /* loop cost for vec_loop.  */
3086   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3087                                stmt_info, 0, vect_body);
3088
3089   /* prologue cost for vec_init and vec_step.  */
3090   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3091                                  stmt_info, 0, vect_prologue);
3092
3093   if (dump_enabled_p ())
3094     dump_printf_loc (MSG_NOTE, vect_location,
3095                      "vect_model_induction_cost: inside_cost = %d, "
3096                      "prologue_cost = %d .", inside_cost, prologue_cost);
3097 }
3098
3099
3100 /* Function get_initial_def_for_induction
3101
3102    Input:
3103    STMT - a stmt that performs an induction operation in the loop.
3104    IV_PHI - the initial value of the induction variable
3105
3106    Output:
3107    Return a vector variable, initialized with the first VF values of
3108    the induction variable.  E.g., for an iv with IV_PHI='X' and
3109    evolution S, for a vector of 4 units, we want to return:
3110    [X, X + S, X + 2*S, X + 3*S].  */
3111
3112 static tree
3113 get_initial_def_for_induction (gimple iv_phi)
3114 {
3115   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3116   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3117   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3118   tree scalar_type;
3119   tree vectype;
3120   int nunits;
3121   edge pe = loop_preheader_edge (loop);
3122   struct loop *iv_loop;
3123   basic_block new_bb;
3124   tree new_vec, vec_init, vec_step, t;
3125   tree access_fn;
3126   tree new_var;
3127   tree new_name;
3128   gimple init_stmt, induction_phi, new_stmt;
3129   tree induc_def, vec_def, vec_dest;
3130   tree init_expr, step_expr;
3131   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3132   int i;
3133   bool ok;
3134   int ncopies;
3135   tree expr;
3136   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3137   bool nested_in_vect_loop = false;
3138   gimple_seq stmts = NULL;
3139   imm_use_iterator imm_iter;
3140   use_operand_p use_p;
3141   gimple exit_phi;
3142   edge latch_e;
3143   tree loop_arg;
3144   gimple_stmt_iterator si;
3145   basic_block bb = gimple_bb (iv_phi);
3146   tree stepvectype;
3147   tree resvectype;
3148
3149   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
3150   if (nested_in_vect_loop_p (loop, iv_phi))
3151     {
3152       nested_in_vect_loop = true;
3153       iv_loop = loop->inner;
3154     }
3155   else
3156     iv_loop = loop;
3157   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3158
3159   latch_e = loop_latch_edge (iv_loop);
3160   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3161
3162   access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
3163   gcc_assert (access_fn);
3164   STRIP_NOPS (access_fn);
3165   ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
3166                                     &init_expr, &step_expr);
3167   gcc_assert (ok);
3168   pe = loop_preheader_edge (iv_loop);
3169
3170   scalar_type = TREE_TYPE (init_expr);
3171   vectype = get_vectype_for_scalar_type (scalar_type);
3172   resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3173   gcc_assert (vectype);
3174   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3175   ncopies = vf / nunits;
3176
3177   gcc_assert (phi_info);
3178   gcc_assert (ncopies >= 1);
3179
3180   /* Find the first insertion point in the BB.  */
3181   si = gsi_after_labels (bb);
3182
3183   /* Create the vector that holds the initial_value of the induction.  */
3184   if (nested_in_vect_loop)
3185     {
3186       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
3187          been created during vectorization of previous stmts.  We obtain it
3188          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
3189       tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3190                                            loop_preheader_edge (iv_loop));
3191       vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
3192       /* If the initial value is not of proper type, convert it.  */
3193       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3194         {
3195           new_stmt = gimple_build_assign_with_ops
3196               (VIEW_CONVERT_EXPR,
3197                vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"),
3198                build1 (VIEW_CONVERT_EXPR, vectype, vec_init), NULL_TREE);
3199           vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3200           gimple_assign_set_lhs (new_stmt, vec_init);
3201           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3202                                                  new_stmt);
3203           gcc_assert (!new_bb);
3204           set_vinfo_for_stmt (new_stmt,
3205                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3206         }
3207     }
3208   else
3209     {
3210       vec<constructor_elt, va_gc> *v;
3211
3212       /* iv_loop is the loop to be vectorized. Create:
3213          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
3214       new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
3215       new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
3216       if (stmts)
3217         {
3218           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3219           gcc_assert (!new_bb);
3220         }
3221
3222       vec_alloc (v, nunits);
3223       CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3224       for (i = 1; i < nunits; i++)
3225         {
3226           /* Create: new_name_i = new_name + step_expr  */
3227           enum tree_code code = POINTER_TYPE_P (scalar_type)
3228                                 ? POINTER_PLUS_EXPR : PLUS_EXPR;
3229           init_stmt = gimple_build_assign_with_ops (code, new_var,
3230                                                     new_name, step_expr);
3231           new_name = make_ssa_name (new_var, init_stmt);
3232           gimple_assign_set_lhs (init_stmt, new_name);
3233
3234           new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3235           gcc_assert (!new_bb);
3236
3237           if (dump_enabled_p ())
3238             {
3239               dump_printf_loc (MSG_NOTE, vect_location,
3240                                "created new init_stmt: ");
3241               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3242             }
3243           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3244         }
3245       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
3246       new_vec = build_constructor (vectype, v);
3247       vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3248     }
3249
3250
3251   /* Create the vector that holds the step of the induction.  */
3252   if (nested_in_vect_loop)
3253     /* iv_loop is nested in the loop to be vectorized. Generate:
3254        vec_step = [S, S, S, S]  */
3255     new_name = step_expr;
3256   else
3257     {
3258       /* iv_loop is the loop to be vectorized. Generate:
3259           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
3260       expr = build_int_cst (TREE_TYPE (step_expr), vf);
3261       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3262                               expr, step_expr);
3263     }
3264
3265   t = unshare_expr (new_name);
3266   gcc_assert (CONSTANT_CLASS_P (new_name));
3267   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3268   gcc_assert (stepvectype);
3269   new_vec = build_vector_from_val (stepvectype, t);
3270   vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3271
3272
3273   /* Create the following def-use cycle:
3274      loop prolog:
3275          vec_init = ...
3276          vec_step = ...
3277      loop:
3278          vec_iv = PHI <vec_init, vec_loop>
3279          ...
3280          STMT
3281          ...
3282          vec_loop = vec_iv + vec_step;  */
3283
3284   /* Create the induction-phi that defines the induction-operand.  */
3285   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3286   induction_phi = create_phi_node (vec_dest, iv_loop->header);
3287   set_vinfo_for_stmt (induction_phi,
3288                       new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3289   induc_def = PHI_RESULT (induction_phi);
3290
3291   /* Create the iv update inside the loop  */
3292   new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3293                                            induc_def, vec_step);
3294   vec_def = make_ssa_name (vec_dest, new_stmt);
3295   gimple_assign_set_lhs (new_stmt, vec_def);
3296   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3297   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3298                                                    NULL));
3299
3300   /* Set the arguments of the phi node:  */
3301   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3302   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3303                UNKNOWN_LOCATION);
3304
3305
3306   /* In case that vectorization factor (VF) is bigger than the number
3307      of elements that we can fit in a vectype (nunits), we have to generate
3308      more than one vector stmt - i.e - we need to "unroll" the
3309      vector stmt by a factor VF/nunits.  For more details see documentation
3310      in vectorizable_operation.  */
3311
3312   if (ncopies > 1)
3313     {
3314       stmt_vec_info prev_stmt_vinfo;
3315       /* FORNOW. This restriction should be relaxed.  */
3316       gcc_assert (!nested_in_vect_loop);
3317
3318       /* Create the vector that holds the step of the induction.  */
3319       expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3320       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3321                               expr, step_expr);
3322       t = unshare_expr (new_name);
3323       gcc_assert (CONSTANT_CLASS_P (new_name));
3324       new_vec = build_vector_from_val (stepvectype, t);
3325       vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3326
3327       vec_def = induc_def;
3328       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3329       for (i = 1; i < ncopies; i++)
3330         {
3331           /* vec_i = vec_prev + vec_step  */
3332           new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3333                                                    vec_def, vec_step);
3334           vec_def = make_ssa_name (vec_dest, new_stmt);
3335           gimple_assign_set_lhs (new_stmt, vec_def);
3336
3337           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3338           if (!useless_type_conversion_p (resvectype, vectype))
3339             {
3340               new_stmt = gimple_build_assign_with_ops
3341                   (VIEW_CONVERT_EXPR,
3342                    vect_get_new_vect_var (resvectype, vect_simple_var,
3343                                           "vec_iv_"),
3344                    build1 (VIEW_CONVERT_EXPR, resvectype,
3345                            gimple_assign_lhs (new_stmt)), NULL_TREE);
3346               gimple_assign_set_lhs (new_stmt,
3347                                      make_ssa_name
3348                                        (gimple_assign_lhs (new_stmt), new_stmt));
3349               gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3350             }
3351           set_vinfo_for_stmt (new_stmt,
3352                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3353           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3354           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3355         }
3356     }
3357
3358   if (nested_in_vect_loop)
3359     {
3360       /* Find the loop-closed exit-phi of the induction, and record
3361          the final vector of induction results:  */
3362       exit_phi = NULL;
3363       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3364         {
3365           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
3366             {
3367               exit_phi = USE_STMT (use_p);
3368               break;
3369             }
3370         }
3371       if (exit_phi)
3372         {
3373           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3374           /* FORNOW. Currently not supporting the case that an inner-loop induction
3375              is not used in the outer-loop (i.e. only outside the outer-loop).  */
3376           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3377                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
3378
3379           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3380           if (dump_enabled_p ())
3381             {
3382               dump_printf_loc (MSG_NOTE, vect_location,
3383                                "vector of inductions after inner-loop:");
3384               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3385             }
3386         }
3387     }
3388
3389
3390   if (dump_enabled_p ())
3391     {
3392       dump_printf_loc (MSG_NOTE, vect_location,
3393                        "transform induction: created def-use cycle: ");
3394       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3395       dump_printf (MSG_NOTE, "\n");
3396       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3397                         SSA_NAME_DEF_STMT (vec_def), 0);
3398     }
3399
3400   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3401   if (!useless_type_conversion_p (resvectype, vectype))
3402     {
3403       new_stmt = gimple_build_assign_with_ops
3404          (VIEW_CONVERT_EXPR,
3405           vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3406           build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3407       induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3408       gimple_assign_set_lhs (new_stmt, induc_def);
3409       si = gsi_after_labels (bb);
3410       gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3411       set_vinfo_for_stmt (new_stmt,
3412                           new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3413       STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3414         = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3415     }
3416
3417   return induc_def;
3418 }
3419
3420
3421 /* Function get_initial_def_for_reduction
3422
3423    Input:
3424    STMT - a stmt that performs a reduction operation in the loop.
3425    INIT_VAL - the initial value of the reduction variable
3426
3427    Output:
3428    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3429         of the reduction (used for adjusting the epilog - see below).
3430    Return a vector variable, initialized according to the operation that STMT
3431         performs. This vector will be used as the initial value of the
3432         vector of partial results.
3433
3434    Option1 (adjust in epilog): Initialize the vector as follows:
3435      add/bit or/xor:    [0,0,...,0,0]
3436      mult/bit and:      [1,1,...,1,1]
3437      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3438    and when necessary (e.g. add/mult case) let the caller know
3439    that it needs to adjust the result by init_val.
3440
3441    Option2: Initialize the vector as follows:
3442      add/bit or/xor:    [init_val,0,0,...,0]
3443      mult/bit and:      [init_val,1,1,...,1]
3444      min/max/cond_expr: [init_val,init_val,...,init_val]
3445    and no adjustments are needed.
3446
3447    For example, for the following code:
3448
3449    s = init_val;
3450    for (i=0;i<n;i++)
3451      s = s + a[i];
3452
3453    STMT is 's = s + a[i]', and the reduction variable is 's'.
3454    For a vector of 4 units, we want to return either [0,0,0,init_val],
3455    or [0,0,0,0] and let the caller know that it needs to adjust
3456    the result at the end by 'init_val'.
3457
3458    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3459    initialization vector is simpler (same element in all entries), if
3460    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3461
3462    A cost model should help decide between these two schemes.  */
3463
3464 tree
3465 get_initial_def_for_reduction (gimple stmt, tree init_val,
3466                                tree *adjustment_def)
3467 {
3468   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3469   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3470   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3471   tree scalar_type = TREE_TYPE (init_val);
3472   tree vectype = get_vectype_for_scalar_type (scalar_type);
3473   int nunits;
3474   enum tree_code code = gimple_assign_rhs_code (stmt);
3475   tree def_for_init;
3476   tree init_def;
3477   tree *elts;
3478   int i;
3479   bool nested_in_vect_loop = false;
3480   tree init_value;
3481   REAL_VALUE_TYPE real_init_val = dconst0;
3482   int int_init_val = 0;
3483   gimple def_stmt = NULL;
3484
3485   gcc_assert (vectype);
3486   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3487
3488   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3489               || SCALAR_FLOAT_TYPE_P (scalar_type));
3490
3491   if (nested_in_vect_loop_p (loop, stmt))
3492     nested_in_vect_loop = true;
3493   else
3494     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3495
3496   /* In case of double reduction we only create a vector variable to be put
3497      in the reduction phi node.  The actual statement creation is done in
3498      vect_create_epilog_for_reduction.  */
3499   if (adjustment_def && nested_in_vect_loop
3500       && TREE_CODE (init_val) == SSA_NAME
3501       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3502       && gimple_code (def_stmt) == GIMPLE_PHI
3503       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3504       && vinfo_for_stmt (def_stmt)
3505       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3506           == vect_double_reduction_def)
3507     {
3508       *adjustment_def = NULL;
3509       return vect_create_destination_var (init_val, vectype);
3510     }
3511
3512   if (TREE_CONSTANT (init_val))
3513     {
3514       if (SCALAR_FLOAT_TYPE_P (scalar_type))
3515         init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3516       else
3517         init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3518     }
3519   else
3520     init_value = init_val;
3521
3522   switch (code)
3523     {
3524       case WIDEN_SUM_EXPR:
3525       case DOT_PROD_EXPR:
3526       case PLUS_EXPR:
3527       case MINUS_EXPR:
3528       case BIT_IOR_EXPR:
3529       case BIT_XOR_EXPR:
3530       case MULT_EXPR:
3531       case BIT_AND_EXPR:
3532         /* ADJUSMENT_DEF is NULL when called from
3533            vect_create_epilog_for_reduction to vectorize double reduction.  */
3534         if (adjustment_def)
3535           {
3536             if (nested_in_vect_loop)
3537               *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3538                                                               NULL);
3539             else
3540               *adjustment_def = init_val;
3541           }
3542
3543         if (code == MULT_EXPR)
3544           {
3545             real_init_val = dconst1;
3546             int_init_val = 1;
3547           }
3548
3549         if (code == BIT_AND_EXPR)
3550           int_init_val = -1;
3551
3552         if (SCALAR_FLOAT_TYPE_P (scalar_type))
3553           def_for_init = build_real (scalar_type, real_init_val);
3554         else
3555           def_for_init = build_int_cst (scalar_type, int_init_val);
3556
3557         /* Create a vector of '0' or '1' except the first element.  */
3558         elts = XALLOCAVEC (tree, nunits);
3559         for (i = nunits - 2; i >= 0; --i)
3560           elts[i + 1] = def_for_init;
3561
3562         /* Option1: the first element is '0' or '1' as well.  */
3563         if (adjustment_def)
3564           {
3565             elts[0] = def_for_init;
3566             init_def = build_vector (vectype, elts);
3567             break;
3568           }
3569
3570         /* Option2: the first element is INIT_VAL.  */
3571         elts[0] = init_val;
3572         if (TREE_CONSTANT (init_val))
3573           init_def = build_vector (vectype, elts);
3574         else
3575           {
3576             vec<constructor_elt, va_gc> *v;
3577             vec_alloc (v, nunits);
3578             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3579             for (i = 1; i < nunits; ++i)
3580               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3581             init_def = build_constructor (vectype, v);
3582           }
3583
3584         break;
3585
3586       case MIN_EXPR:
3587       case MAX_EXPR:
3588       case COND_EXPR:
3589         if (adjustment_def)
3590           {
3591             *adjustment_def = NULL_TREE;
3592             init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3593             break;
3594           }
3595
3596         init_def = build_vector_from_val (vectype, init_value);
3597         break;
3598
3599       default:
3600         gcc_unreachable ();
3601     }
3602
3603   return init_def;
3604 }
3605
3606
3607 /* Function vect_create_epilog_for_reduction
3608
3609    Create code at the loop-epilog to finalize the result of a reduction
3610    computation.
3611
3612    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3613      reduction statements.
3614    STMT is the scalar reduction stmt that is being vectorized.
3615    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3616      number of elements that we can fit in a vectype (nunits).  In this case
3617      we have to generate more than one vector stmt - i.e - we need to "unroll"
3618      the vector stmt by a factor VF/nunits.  For more details see documentation
3619      in vectorizable_operation.
3620    REDUC_CODE is the tree-code for the epilog reduction.
3621    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3622      computation.
3623    REDUC_INDEX is the index of the operand in the right hand side of the
3624      statement that is defined by REDUCTION_PHI.
3625    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3626    SLP_NODE is an SLP node containing a group of reduction statements. The
3627      first one in this group is STMT.
3628
3629    This function:
3630    1. Creates the reduction def-use cycles: sets the arguments for
3631       REDUCTION_PHIS:
3632       The loop-entry argument is the vectorized initial-value of the reduction.
3633       The loop-latch argument is taken from VECT_DEFS - the vector of partial
3634       sums.
3635    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3636       by applying the operation specified by REDUC_CODE if available, or by
3637       other means (whole-vector shifts or a scalar loop).
3638       The function also creates a new phi node at the loop exit to preserve
3639       loop-closed form, as illustrated below.
3640
3641      The flow at the entry to this function:
3642
3643         loop:
3644           vec_def = phi <null, null>            # REDUCTION_PHI
3645           VECT_DEF = vector_stmt                # vectorized form of STMT
3646           s_loop = scalar_stmt                  # (scalar) STMT
3647         loop_exit:
3648           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3649           use <s_out0>
3650           use <s_out0>
3651
3652      The above is transformed by this function into:
3653
3654         loop:
3655           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3656           VECT_DEF = vector_stmt                # vectorized form of STMT
3657           s_loop = scalar_stmt                  # (scalar) STMT
3658         loop_exit:
3659           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3660           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
3661           v_out2 = reduce <v_out1>
3662           s_out3 = extract_field <v_out2, 0>
3663           s_out4 = adjust_result <s_out3>
3664           use <s_out4>
3665           use <s_out4>
3666 */
3667
3668 static void
3669 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3670                                   int ncopies, enum tree_code reduc_code,
3671                                   vec<gimple> reduction_phis,
3672                                   int reduc_index, bool double_reduc,
3673                                   slp_tree slp_node)
3674 {
3675   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3676   stmt_vec_info prev_phi_info;
3677   tree vectype;
3678   enum machine_mode mode;
3679   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3680   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3681   basic_block exit_bb;
3682   tree scalar_dest;
3683   tree scalar_type;
3684   gimple new_phi = NULL, phi;
3685   gimple_stmt_iterator exit_gsi;
3686   tree vec_dest;
3687   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3688   gimple epilog_stmt = NULL;
3689   enum tree_code code = gimple_assign_rhs_code (stmt);
3690   gimple exit_phi;
3691   tree bitsize, bitpos;
3692   tree adjustment_def = NULL;
3693   tree vec_initial_def = NULL;
3694   tree reduction_op, expr, def;
3695   tree orig_name, scalar_result;
3696   imm_use_iterator imm_iter, phi_imm_iter;
3697   use_operand_p use_p, phi_use_p;
3698   bool extract_scalar_result = false;
3699   gimple use_stmt, orig_stmt, reduction_phi = NULL;
3700   bool nested_in_vect_loop = false;
3701   vec<gimple> new_phis = vNULL;
3702   vec<gimple> inner_phis = vNULL;
3703   enum vect_def_type dt = vect_unknown_def_type;
3704   int j, i;
3705   vec<tree> scalar_results = vNULL;
3706   unsigned int group_size = 1, k, ratio;
3707   vec<tree> vec_initial_defs = vNULL;
3708   vec<gimple> phis;
3709   bool slp_reduc = false;
3710   tree new_phi_result;
3711   gimple inner_phi = NULL;
3712
3713   if (slp_node)
3714     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
3715
3716   if (nested_in_vect_loop_p (loop, stmt))
3717     {
3718       outer_loop = loop;
3719       loop = loop->inner;
3720       nested_in_vect_loop = true;
3721       gcc_assert (!slp_node);
3722     }
3723
3724   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3725     {
3726     case GIMPLE_SINGLE_RHS:
3727       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3728                   == ternary_op);
3729       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3730       break;
3731     case GIMPLE_UNARY_RHS:
3732       reduction_op = gimple_assign_rhs1 (stmt);
3733       break;
3734     case GIMPLE_BINARY_RHS:
3735       reduction_op = reduc_index ?
3736                      gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3737       break;
3738     case GIMPLE_TERNARY_RHS:
3739       reduction_op = gimple_op (stmt, reduc_index + 1);
3740       break;
3741     default:
3742       gcc_unreachable ();
3743     }
3744
3745   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3746   gcc_assert (vectype);
3747   mode = TYPE_MODE (vectype);
3748
3749   /* 1. Create the reduction def-use cycle:
3750      Set the arguments of REDUCTION_PHIS, i.e., transform
3751
3752         loop:
3753           vec_def = phi <null, null>            # REDUCTION_PHI
3754           VECT_DEF = vector_stmt                # vectorized form of STMT
3755           ...
3756
3757      into:
3758
3759         loop:
3760           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3761           VECT_DEF = vector_stmt                # vectorized form of STMT
3762           ...
3763
3764      (in case of SLP, do it for all the phis). */
3765
3766   /* Get the loop-entry arguments.  */
3767   if (slp_node)
3768     vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3769                        NULL, slp_node, reduc_index);
3770   else
3771     {
3772       vec_initial_defs.create (1);
3773      /* For the case of reduction, vect_get_vec_def_for_operand returns
3774         the scalar def before the loop, that defines the initial value
3775         of the reduction variable.  */
3776       vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3777                                                       &adjustment_def);
3778       vec_initial_defs.quick_push (vec_initial_def);
3779     }
3780
3781   /* Set phi nodes arguments.  */
3782   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
3783     {
3784       tree vec_init_def = vec_initial_defs[i];
3785       tree def = vect_defs[i];
3786       for (j = 0; j < ncopies; j++)
3787         {
3788           /* Set the loop-entry arg of the reduction-phi.  */
3789           add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3790                        UNKNOWN_LOCATION);
3791
3792           /* Set the loop-latch arg for the reduction-phi.  */
3793           if (j > 0)
3794             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3795
3796           add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3797
3798           if (dump_enabled_p ())
3799             {
3800               dump_printf_loc (MSG_NOTE, vect_location,
3801                                "transform reduction: created def-use cycle: ");
3802               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
3803               dump_printf (MSG_NOTE, "\n");
3804               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
3805             }
3806
3807           phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3808         }
3809     }
3810
3811   vec_initial_defs.release ();
3812
3813   /* 2. Create epilog code.
3814         The reduction epilog code operates across the elements of the vector
3815         of partial results computed by the vectorized loop.
3816         The reduction epilog code consists of:
3817
3818         step 1: compute the scalar result in a vector (v_out2)
3819         step 2: extract the scalar result (s_out3) from the vector (v_out2)
3820         step 3: adjust the scalar result (s_out3) if needed.
3821
3822         Step 1 can be accomplished using one the following three schemes:
3823           (scheme 1) using reduc_code, if available.
3824           (scheme 2) using whole-vector shifts, if available.
3825           (scheme 3) using a scalar loop. In this case steps 1+2 above are
3826                      combined.
3827
3828           The overall epilog code looks like this:
3829
3830           s_out0 = phi <s_loop>         # original EXIT_PHI
3831           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
3832           v_out2 = reduce <v_out1>              # step 1
3833           s_out3 = extract_field <v_out2, 0>    # step 2
3834           s_out4 = adjust_result <s_out3>       # step 3
3835
3836           (step 3 is optional, and steps 1 and 2 may be combined).
3837           Lastly, the uses of s_out0 are replaced by s_out4.  */
3838
3839
3840   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3841          v_out1 = phi <VECT_DEF>
3842          Store them in NEW_PHIS.  */
3843
3844   exit_bb = single_exit (loop)->dest;
3845   prev_phi_info = NULL;
3846   new_phis.create (vect_defs.length ());
3847   FOR_EACH_VEC_ELT (vect_defs, i, def)
3848     {
3849       for (j = 0; j < ncopies; j++)
3850         {
3851           tree new_def = copy_ssa_name (def, NULL);
3852           phi = create_phi_node (new_def, exit_bb);
3853           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3854           if (j == 0)
3855             new_phis.quick_push (phi);
3856           else
3857             {
3858               def = vect_get_vec_def_for_stmt_copy (dt, def);
3859               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3860             }
3861
3862           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3863           prev_phi_info = vinfo_for_stmt (phi);
3864         }
3865     }
3866
3867   /* The epilogue is created for the outer-loop, i.e., for the loop being
3868      vectorized.  Create exit phis for the outer loop.  */
3869   if (double_reduc)
3870     {
3871       loop = outer_loop;
3872       exit_bb = single_exit (loop)->dest;
3873       inner_phis.create (vect_defs.length ());
3874       FOR_EACH_VEC_ELT (new_phis, i, phi)
3875         {
3876           tree new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3877           gimple outer_phi = create_phi_node (new_result, exit_bb);
3878           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3879                            PHI_RESULT (phi));
3880           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3881                                                             loop_vinfo, NULL));
3882           inner_phis.quick_push (phi);
3883           new_phis[i] = outer_phi;
3884           prev_phi_info = vinfo_for_stmt (outer_phi);
3885           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
3886             {
3887               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3888               new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3889               outer_phi = create_phi_node (new_result, exit_bb);
3890               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3891                                PHI_RESULT (phi));
3892               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3893                                                         loop_vinfo, NULL));
3894               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
3895               prev_phi_info = vinfo_for_stmt (outer_phi);
3896             }
3897         }
3898     }
3899
3900   exit_gsi = gsi_after_labels (exit_bb);
3901
3902   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3903          (i.e. when reduc_code is not available) and in the final adjustment
3904          code (if needed).  Also get the original scalar reduction variable as
3905          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
3906          represents a reduction pattern), the tree-code and scalar-def are
3907          taken from the original stmt that the pattern-stmt (STMT) replaces.
3908          Otherwise (it is a regular reduction) - the tree-code and scalar-def
3909          are taken from STMT.  */
3910
3911   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3912   if (!orig_stmt)
3913     {
3914       /* Regular reduction  */
3915       orig_stmt = stmt;
3916     }
3917   else
3918     {
3919       /* Reduction pattern  */
3920       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3921       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3922       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3923     }
3924
3925   code = gimple_assign_rhs_code (orig_stmt);
3926   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3927      partial results are added and not subtracted.  */
3928   if (code == MINUS_EXPR)
3929     code = PLUS_EXPR;
3930
3931   scalar_dest = gimple_assign_lhs (orig_stmt);
3932   scalar_type = TREE_TYPE (scalar_dest);
3933   scalar_results.create (group_size);
3934   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3935   bitsize = TYPE_SIZE (scalar_type);
3936
3937   /* In case this is a reduction in an inner-loop while vectorizing an outer
3938      loop - we don't need to extract a single scalar result at the end of the
3939      inner-loop (unless it is double reduction, i.e., the use of reduction is
3940      outside the outer-loop).  The final vector of partial results will be used
3941      in the vectorized outer-loop, or reduced to a scalar result at the end of
3942      the outer-loop.  */
3943   if (nested_in_vect_loop && !double_reduc)
3944     goto vect_finalize_reduction;
3945
3946   /* SLP reduction without reduction chain, e.g.,
3947      # a1 = phi <a2, a0>
3948      # b1 = phi <b2, b0>
3949      a2 = operation (a1)
3950      b2 = operation (b1)  */
3951   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
3952
3953   /* In case of reduction chain, e.g.,
3954      # a1 = phi <a3, a0>
3955      a2 = operation (a1)
3956      a3 = operation (a2),
3957
3958      we may end up with more than one vector result.  Here we reduce them to
3959      one vector.  */
3960   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
3961     {
3962       tree first_vect = PHI_RESULT (new_phis[0]);
3963       tree tmp;
3964       gimple new_vec_stmt = NULL;
3965
3966       vec_dest = vect_create_destination_var (scalar_dest, vectype);
3967       for (k = 1; k < new_phis.length (); k++)
3968         {
3969           gimple next_phi = new_phis[k];
3970           tree second_vect = PHI_RESULT (next_phi);
3971
3972           tmp = build2 (code, vectype,  first_vect, second_vect);
3973           new_vec_stmt = gimple_build_assign (vec_dest, tmp);
3974           first_vect = make_ssa_name (vec_dest, new_vec_stmt);
3975           gimple_assign_set_lhs (new_vec_stmt, first_vect);
3976           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
3977         }
3978
3979       new_phi_result = first_vect;
3980       if (new_vec_stmt)
3981         {
3982           new_phis.truncate (0);
3983           new_phis.safe_push (new_vec_stmt);
3984         }
3985     }
3986   else
3987     new_phi_result = PHI_RESULT (new_phis[0]);
3988
3989   /* 2.3 Create the reduction code, using one of the three schemes described
3990          above. In SLP we simply need to extract all the elements from the
3991          vector (without reducing them), so we use scalar shifts.  */
3992   if (reduc_code != ERROR_MARK && !slp_reduc)
3993     {
3994       tree tmp;
3995
3996       /*** Case 1:  Create:
3997            v_out2 = reduc_expr <v_out1>  */
3998
3999       if (dump_enabled_p ())
4000         dump_printf_loc (MSG_NOTE, vect_location,
4001                          "Reduce using direct vector reduction.");
4002
4003       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4004       tmp = build1 (reduc_code, vectype, new_phi_result);
4005       epilog_stmt = gimple_build_assign (vec_dest, tmp);
4006       new_temp = make_ssa_name (vec_dest, epilog_stmt);
4007       gimple_assign_set_lhs (epilog_stmt, new_temp);
4008       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4009
4010       extract_scalar_result = true;
4011     }
4012   else
4013     {
4014       enum tree_code shift_code = ERROR_MARK;
4015       bool have_whole_vector_shift = true;
4016       int bit_offset;
4017       int element_bitsize = tree_low_cst (bitsize, 1);
4018       int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
4019       tree vec_temp;
4020
4021       if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4022         shift_code = VEC_RSHIFT_EXPR;
4023       else
4024         have_whole_vector_shift = false;
4025
4026       /* Regardless of whether we have a whole vector shift, if we're
4027          emulating the operation via tree-vect-generic, we don't want
4028          to use it.  Only the first round of the reduction is likely
4029          to still be profitable via emulation.  */
4030       /* ??? It might be better to emit a reduction tree code here, so that
4031          tree-vect-generic can expand the first round via bit tricks.  */
4032       if (!VECTOR_MODE_P (mode))
4033         have_whole_vector_shift = false;
4034       else
4035         {
4036           optab optab = optab_for_tree_code (code, vectype, optab_default);
4037           if (optab_handler (optab, mode) == CODE_FOR_nothing)
4038             have_whole_vector_shift = false;
4039         }
4040
4041       if (have_whole_vector_shift && !slp_reduc)
4042         {
4043           /*** Case 2: Create:
4044              for (offset = VS/2; offset >= element_size; offset/=2)
4045                 {
4046                   Create:  va' = vec_shift <va, offset>
4047                   Create:  va = vop <va, va'>
4048                 }  */
4049
4050           if (dump_enabled_p ())
4051             dump_printf_loc (MSG_NOTE, vect_location,
4052                              "Reduce using vector shifts");
4053
4054           vec_dest = vect_create_destination_var (scalar_dest, vectype);
4055           new_temp = new_phi_result;
4056           for (bit_offset = vec_size_in_bits/2;
4057                bit_offset >= element_bitsize;
4058                bit_offset /= 2)
4059             {
4060               tree bitpos = size_int (bit_offset);
4061
4062               epilog_stmt = gimple_build_assign_with_ops (shift_code,
4063                                                vec_dest, new_temp, bitpos);
4064               new_name = make_ssa_name (vec_dest, epilog_stmt);
4065               gimple_assign_set_lhs (epilog_stmt, new_name);
4066               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4067
4068               epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
4069                                                           new_name, new_temp);
4070               new_temp = make_ssa_name (vec_dest, epilog_stmt);
4071               gimple_assign_set_lhs (epilog_stmt, new_temp);
4072               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4073             }
4074
4075           extract_scalar_result = true;
4076         }
4077       else
4078         {
4079           tree rhs;
4080
4081           /*** Case 3: Create:
4082              s = extract_field <v_out2, 0>
4083              for (offset = element_size;
4084                   offset < vector_size;
4085                   offset += element_size;)
4086                {
4087                  Create:  s' = extract_field <v_out2, offset>
4088                  Create:  s = op <s, s'>  // For non SLP cases
4089                }  */
4090
4091           if (dump_enabled_p ())
4092             dump_printf_loc (MSG_NOTE, vect_location,
4093                              "Reduce using scalar code. ");
4094
4095           vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
4096           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4097             {
4098               if (gimple_code (new_phi) == GIMPLE_PHI)
4099                 vec_temp = PHI_RESULT (new_phi);
4100               else
4101                 vec_temp = gimple_assign_lhs (new_phi);
4102               rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4103                             bitsize_zero_node);
4104               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4105               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4106               gimple_assign_set_lhs (epilog_stmt, new_temp);
4107               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4108
4109               /* In SLP we don't need to apply reduction operation, so we just
4110                  collect s' values in SCALAR_RESULTS.  */
4111               if (slp_reduc)
4112                 scalar_results.safe_push (new_temp);
4113
4114               for (bit_offset = element_bitsize;
4115                    bit_offset < vec_size_in_bits;
4116                    bit_offset += element_bitsize)
4117                 {
4118                   tree bitpos = bitsize_int (bit_offset);
4119                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4120                                      bitsize, bitpos);
4121
4122                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4123                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4124                   gimple_assign_set_lhs (epilog_stmt, new_name);
4125                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4126
4127                   if (slp_reduc)
4128                     {
4129                       /* In SLP we don't need to apply reduction operation, so
4130                          we just collect s' values in SCALAR_RESULTS.  */
4131                       new_temp = new_name;
4132                       scalar_results.safe_push (new_name);
4133                     }
4134                   else
4135                     {
4136                       epilog_stmt = gimple_build_assign_with_ops (code,
4137                                           new_scalar_dest, new_name, new_temp);
4138                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4139                       gimple_assign_set_lhs (epilog_stmt, new_temp);
4140                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4141                     }
4142                 }
4143             }
4144
4145           /* The only case where we need to reduce scalar results in SLP, is
4146              unrolling.  If the size of SCALAR_RESULTS is greater than
4147              GROUP_SIZE, we reduce them combining elements modulo
4148              GROUP_SIZE.  */
4149           if (slp_reduc)
4150             {
4151               tree res, first_res, new_res;
4152               gimple new_stmt;
4153
4154               /* Reduce multiple scalar results in case of SLP unrolling.  */
4155               for (j = group_size; scalar_results.iterate (j, &res);
4156                    j++)
4157                 {
4158                   first_res = scalar_results[j % group_size];
4159                   new_stmt = gimple_build_assign_with_ops (code,
4160                                               new_scalar_dest, first_res, res);
4161                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
4162                   gimple_assign_set_lhs (new_stmt, new_res);
4163                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4164                   scalar_results[j % group_size] = new_res;
4165                 }
4166             }
4167           else
4168             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
4169             scalar_results.safe_push (new_temp);
4170
4171           extract_scalar_result = false;
4172         }
4173     }
4174
4175   /* 2.4  Extract the final scalar result.  Create:
4176           s_out3 = extract_field <v_out2, bitpos>  */
4177
4178   if (extract_scalar_result)
4179     {
4180       tree rhs;
4181
4182       if (dump_enabled_p ())
4183         dump_printf_loc (MSG_NOTE, vect_location,
4184                          "extract scalar result");
4185
4186       if (BYTES_BIG_ENDIAN)
4187         bitpos = size_binop (MULT_EXPR,
4188                              bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
4189                              TYPE_SIZE (scalar_type));
4190       else
4191         bitpos = bitsize_zero_node;
4192
4193       rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
4194       epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4195       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4196       gimple_assign_set_lhs (epilog_stmt, new_temp);
4197       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4198       scalar_results.safe_push (new_temp);
4199     }
4200
4201 vect_finalize_reduction:
4202
4203   if (double_reduc)
4204     loop = loop->inner;
4205
4206   /* 2.5 Adjust the final result by the initial value of the reduction
4207          variable. (When such adjustment is not needed, then
4208          'adjustment_def' is zero).  For example, if code is PLUS we create:
4209          new_temp = loop_exit_def + adjustment_def  */
4210
4211   if (adjustment_def)
4212     {
4213       gcc_assert (!slp_reduc);
4214       if (nested_in_vect_loop)
4215         {
4216           new_phi = new_phis[0];
4217           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4218           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4219           new_dest = vect_create_destination_var (scalar_dest, vectype);
4220         }
4221       else
4222         {
4223           new_temp = scalar_results[0];
4224           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4225           expr = build2 (code, scalar_type, new_temp, adjustment_def);
4226           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4227         }
4228
4229       epilog_stmt = gimple_build_assign (new_dest, expr);
4230       new_temp = make_ssa_name (new_dest, epilog_stmt);
4231       gimple_assign_set_lhs (epilog_stmt, new_temp);
4232       SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
4233       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4234       if (nested_in_vect_loop)
4235         {
4236           set_vinfo_for_stmt (epilog_stmt,
4237                               new_stmt_vec_info (epilog_stmt, loop_vinfo,
4238                                                  NULL));
4239           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4240                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4241
4242           if (!double_reduc)
4243             scalar_results.quick_push (new_temp);
4244           else
4245             scalar_results[0] = new_temp;
4246         }
4247       else
4248         scalar_results[0] = new_temp;
4249
4250       new_phis[0] = epilog_stmt;
4251     }
4252
4253   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
4254           phis with new adjusted scalar results, i.e., replace use <s_out0>
4255           with use <s_out4>.
4256
4257      Transform:
4258         loop_exit:
4259           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4260           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4261           v_out2 = reduce <v_out1>
4262           s_out3 = extract_field <v_out2, 0>
4263           s_out4 = adjust_result <s_out3>
4264           use <s_out0>
4265           use <s_out0>
4266
4267      into:
4268
4269         loop_exit:
4270           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4271           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4272           v_out2 = reduce <v_out1>
4273           s_out3 = extract_field <v_out2, 0>
4274           s_out4 = adjust_result <s_out3>
4275           use <s_out4>
4276           use <s_out4> */
4277
4278
4279   /* In SLP reduction chain we reduce vector results into one vector if
4280      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
4281      the last stmt in the reduction chain, since we are looking for the loop
4282      exit phi node.  */
4283   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4284     {
4285       scalar_dest = gimple_assign_lhs (
4286                         SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4287       group_size = 1;
4288     }
4289
4290   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4291      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
4292      need to match SCALAR_RESULTS with corresponding statements.  The first
4293      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4294      the first vector stmt, etc.
4295      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
4296   if (group_size > new_phis.length ())
4297     {
4298       ratio = group_size / new_phis.length ();
4299       gcc_assert (!(group_size % new_phis.length ()));
4300     }
4301   else
4302     ratio = 1;
4303
4304   for (k = 0; k < group_size; k++)
4305     {
4306       if (k % ratio == 0)
4307         {
4308           epilog_stmt = new_phis[k / ratio];
4309           reduction_phi = reduction_phis[k / ratio];
4310           if (double_reduc)
4311             inner_phi = inner_phis[k / ratio];
4312         }
4313
4314       if (slp_reduc)
4315         {
4316           gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4317
4318           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4319           /* SLP statements can't participate in patterns.  */
4320           gcc_assert (!orig_stmt);
4321           scalar_dest = gimple_assign_lhs (current_stmt);
4322         }
4323
4324       phis.create (3);
4325       /* Find the loop-closed-use at the loop exit of the original scalar
4326          result.  (The reduction result is expected to have two immediate uses -
4327          one at the latch block, and one at the loop exit).  */
4328       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4329         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4330             && !is_gimple_debug (USE_STMT (use_p)))
4331           phis.safe_push (USE_STMT (use_p));
4332
4333       /* While we expect to have found an exit_phi because of loop-closed-ssa
4334          form we can end up without one if the scalar cycle is dead.  */
4335
4336       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4337         {
4338           if (outer_loop)
4339             {
4340               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4341               gimple vect_phi;
4342
4343               /* FORNOW. Currently not supporting the case that an inner-loop
4344                  reduction is not used in the outer-loop (but only outside the
4345                  outer-loop), unless it is double reduction.  */
4346               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4347                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4348                           || double_reduc);
4349
4350               STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4351               if (!double_reduc
4352                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4353                       != vect_double_reduction_def)
4354                 continue;
4355
4356               /* Handle double reduction:
4357
4358                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
4359                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4360                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
4361                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
4362
4363                  At that point the regular reduction (stmt2 and stmt3) is
4364                  already vectorized, as well as the exit phi node, stmt4.
4365                  Here we vectorize the phi node of double reduction, stmt1, and
4366                  update all relevant statements.  */
4367
4368               /* Go through all the uses of s2 to find double reduction phi
4369                  node, i.e., stmt1 above.  */
4370               orig_name = PHI_RESULT (exit_phi);
4371               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4372                 {
4373                   stmt_vec_info use_stmt_vinfo;
4374                   stmt_vec_info new_phi_vinfo;
4375                   tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4376                   basic_block bb = gimple_bb (use_stmt);
4377                   gimple use;
4378
4379                   /* Check that USE_STMT is really double reduction phi
4380                      node.  */
4381                   if (gimple_code (use_stmt) != GIMPLE_PHI
4382                       || gimple_phi_num_args (use_stmt) != 2
4383                       || bb->loop_father != outer_loop)
4384                     continue;
4385                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4386                   if (!use_stmt_vinfo
4387                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4388                           != vect_double_reduction_def)
4389                     continue;
4390
4391                   /* Create vector phi node for double reduction:
4392                      vs1 = phi <vs0, vs2>
4393                      vs1 was created previously in this function by a call to
4394                        vect_get_vec_def_for_operand and is stored in
4395                        vec_initial_def;
4396                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4397                      vs0 is created here.  */
4398
4399                   /* Create vector phi node.  */
4400                   vect_phi = create_phi_node (vec_initial_def, bb);
4401                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
4402                                     loop_vec_info_for_loop (outer_loop), NULL);
4403                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4404
4405                   /* Create vs0 - initial def of the double reduction phi.  */
4406                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4407                                              loop_preheader_edge (outer_loop));
4408                   init_def = get_initial_def_for_reduction (stmt,
4409                                                           preheader_arg, NULL);
4410                   vect_phi_init = vect_init_vector (use_stmt, init_def,
4411                                                     vectype, NULL);
4412
4413                   /* Update phi node arguments with vs0 and vs2.  */
4414                   add_phi_arg (vect_phi, vect_phi_init,
4415                                loop_preheader_edge (outer_loop),
4416                                UNKNOWN_LOCATION);
4417                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4418                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4419                   if (dump_enabled_p ())
4420                     {
4421                       dump_printf_loc (MSG_NOTE, vect_location,
4422                                        "created double reduction phi node: ");
4423                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4424                     }
4425
4426                   vect_phi_res = PHI_RESULT (vect_phi);
4427
4428                   /* Replace the use, i.e., set the correct vs1 in the regular
4429                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
4430                      loop is redundant.  */
4431                   use = reduction_phi;
4432                   for (j = 0; j < ncopies; j++)
4433                     {
4434                       edge pr_edge = loop_preheader_edge (loop);
4435                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4436                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4437                     }
4438                 }
4439             }
4440         }
4441
4442       phis.release ();
4443       if (nested_in_vect_loop)
4444         {
4445           if (double_reduc)
4446             loop = outer_loop;
4447           else
4448             continue;
4449         }
4450
4451       phis.create (3);
4452       /* Find the loop-closed-use at the loop exit of the original scalar
4453          result.  (The reduction result is expected to have two immediate uses,
4454          one at the latch block, and one at the loop exit).  For double
4455          reductions we are looking for exit phis of the outer loop.  */
4456       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4457         {
4458           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4459             {
4460               if (!is_gimple_debug (USE_STMT (use_p)))
4461                 phis.safe_push (USE_STMT (use_p));
4462             }
4463           else
4464             {
4465               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4466                 {
4467                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
4468
4469                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4470                     {
4471                       if (!flow_bb_inside_loop_p (loop,
4472                                              gimple_bb (USE_STMT (phi_use_p)))
4473                           && !is_gimple_debug (USE_STMT (phi_use_p)))
4474                         phis.safe_push (USE_STMT (phi_use_p));
4475                     }
4476                 }
4477             }
4478         }
4479
4480       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4481         {
4482           /* Replace the uses:  */
4483           orig_name = PHI_RESULT (exit_phi);
4484           scalar_result = scalar_results[k];
4485           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4486             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4487               SET_USE (use_p, scalar_result);
4488         }
4489
4490       phis.release ();
4491     }
4492
4493   scalar_results.release ();
4494   inner_phis.release ();
4495   new_phis.release ();
4496 }
4497
4498
4499 /* Function vectorizable_reduction.
4500
4501    Check if STMT performs a reduction operation that can be vectorized.
4502    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4503    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4504    Return FALSE if not a vectorizable STMT, TRUE otherwise.
4505
4506    This function also handles reduction idioms (patterns) that have been
4507    recognized in advance during vect_pattern_recog.  In this case, STMT may be
4508    of this form:
4509      X = pattern_expr (arg0, arg1, ..., X)
4510    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
4511    sequence that had been detected and replaced by the pattern-stmt (STMT).
4512
4513    In some cases of reduction patterns, the type of the reduction variable X is
4514    different than the type of the other arguments of STMT.
4515    In such cases, the vectype that is used when transforming STMT into a vector
4516    stmt is different than the vectype that is used to determine the
4517    vectorization factor, because it consists of a different number of elements
4518    than the actual number of elements that are being operated upon in parallel.
4519
4520    For example, consider an accumulation of shorts into an int accumulator.
4521    On some targets it's possible to vectorize this pattern operating on 8
4522    shorts at a time (hence, the vectype for purposes of determining the
4523    vectorization factor should be V8HI); on the other hand, the vectype that
4524    is used to create the vector form is actually V4SI (the type of the result).
4525
4526    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4527    indicates what is the actual level of parallelism (V8HI in the example), so
4528    that the right vectorization factor would be derived.  This vectype
4529    corresponds to the type of arguments to the reduction stmt, and should *NOT*
4530    be used to create the vectorized stmt.  The right vectype for the vectorized
4531    stmt is obtained from the type of the result X:
4532         get_vectype_for_scalar_type (TREE_TYPE (X))
4533
4534    This means that, contrary to "regular" reductions (or "regular" stmts in
4535    general), the following equation:
4536       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4537    does *NOT* necessarily hold for reduction patterns.  */
4538
4539 bool
4540 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4541                         gimple *vec_stmt, slp_tree slp_node)
4542 {
4543   tree vec_dest;
4544   tree scalar_dest;
4545   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4546   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4547   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4548   tree vectype_in = NULL_TREE;
4549   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4550   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4551   enum tree_code code, orig_code, epilog_reduc_code;
4552   enum machine_mode vec_mode;
4553   int op_type;
4554   optab optab, reduc_optab;
4555   tree new_temp = NULL_TREE;
4556   tree def;
4557   gimple def_stmt;
4558   enum vect_def_type dt;
4559   gimple new_phi = NULL;
4560   tree scalar_type;
4561   bool is_simple_use;
4562   gimple orig_stmt;
4563   stmt_vec_info orig_stmt_info;
4564   tree expr = NULL_TREE;
4565   int i;
4566   int ncopies;
4567   int epilog_copies;
4568   stmt_vec_info prev_stmt_info, prev_phi_info;
4569   bool single_defuse_cycle = false;
4570   tree reduc_def = NULL_TREE;
4571   gimple new_stmt = NULL;
4572   int j;
4573   tree ops[3];
4574   bool nested_cycle = false, found_nested_cycle_def = false;
4575   gimple reduc_def_stmt = NULL;
4576   /* The default is that the reduction variable is the last in statement.  */
4577   int reduc_index = 2;
4578   bool double_reduc = false, dummy;
4579   basic_block def_bb;
4580   struct loop * def_stmt_loop, *outer_loop = NULL;
4581   tree def_arg;
4582   gimple def_arg_stmt;
4583   vec<tree> vec_oprnds0 = vNULL;
4584   vec<tree> vec_oprnds1 = vNULL;
4585   vec<tree> vect_defs = vNULL;
4586   vec<gimple> phis = vNULL;
4587   int vec_num;
4588   tree def0, def1, tem, op0, op1 = NULL_TREE;
4589
4590   /* In case of reduction chain we switch to the first stmt in the chain, but
4591      we don't update STMT_INFO, since only the last stmt is marked as reduction
4592      and has reduction properties.  */
4593   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4594     stmt = GROUP_FIRST_ELEMENT (stmt_info);
4595
4596   if (nested_in_vect_loop_p (loop, stmt))
4597     {
4598       outer_loop = loop;
4599       loop = loop->inner;
4600       nested_cycle = true;
4601     }
4602
4603   /* 1. Is vectorizable reduction?  */
4604   /* Not supportable if the reduction variable is used in the loop, unless
4605      it's a reduction chain.  */
4606   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4607       && !GROUP_FIRST_ELEMENT (stmt_info))
4608     return false;
4609
4610   /* Reductions that are not used even in an enclosing outer-loop,
4611      are expected to be "live" (used out of the loop).  */
4612   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4613       && !STMT_VINFO_LIVE_P (stmt_info))
4614     return false;
4615
4616   /* Make sure it was already recognized as a reduction computation.  */
4617   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4618       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4619     return false;
4620
4621   /* 2. Has this been recognized as a reduction pattern?
4622
4623      Check if STMT represents a pattern that has been recognized
4624      in earlier analysis stages.  For stmts that represent a pattern,
4625      the STMT_VINFO_RELATED_STMT field records the last stmt in
4626      the original sequence that constitutes the pattern.  */
4627
4628   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4629   if (orig_stmt)
4630     {
4631       orig_stmt_info = vinfo_for_stmt (orig_stmt);
4632       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4633       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4634     }
4635
4636   /* 3. Check the operands of the operation.  The first operands are defined
4637         inside the loop body. The last operand is the reduction variable,
4638         which is defined by the loop-header-phi.  */
4639
4640   gcc_assert (is_gimple_assign (stmt));
4641
4642   /* Flatten RHS.  */
4643   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4644     {
4645     case GIMPLE_SINGLE_RHS:
4646       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4647       if (op_type == ternary_op)
4648         {
4649           tree rhs = gimple_assign_rhs1 (stmt);
4650           ops[0] = TREE_OPERAND (rhs, 0);
4651           ops[1] = TREE_OPERAND (rhs, 1);
4652           ops[2] = TREE_OPERAND (rhs, 2);
4653           code = TREE_CODE (rhs);
4654         }
4655       else
4656         return false;
4657       break;
4658
4659     case GIMPLE_BINARY_RHS:
4660       code = gimple_assign_rhs_code (stmt);
4661       op_type = TREE_CODE_LENGTH (code);
4662       gcc_assert (op_type == binary_op);
4663       ops[0] = gimple_assign_rhs1 (stmt);
4664       ops[1] = gimple_assign_rhs2 (stmt);
4665       break;
4666
4667     case GIMPLE_TERNARY_RHS:
4668       code = gimple_assign_rhs_code (stmt);
4669       op_type = TREE_CODE_LENGTH (code);
4670       gcc_assert (op_type == ternary_op);
4671       ops[0] = gimple_assign_rhs1 (stmt);
4672       ops[1] = gimple_assign_rhs2 (stmt);
4673       ops[2] = gimple_assign_rhs3 (stmt);
4674       break;
4675
4676     case GIMPLE_UNARY_RHS:
4677       return false;
4678
4679     default:
4680       gcc_unreachable ();
4681     }
4682
4683   if (code == COND_EXPR && slp_node)
4684     return false;
4685
4686   scalar_dest = gimple_assign_lhs (stmt);
4687   scalar_type = TREE_TYPE (scalar_dest);
4688   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4689       && !SCALAR_FLOAT_TYPE_P (scalar_type))
4690     return false;
4691
4692   /* Do not try to vectorize bit-precision reductions.  */
4693   if ((TYPE_PRECISION (scalar_type)
4694        != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4695     return false;
4696
4697   /* All uses but the last are expected to be defined in the loop.
4698      The last use is the reduction variable.  In case of nested cycle this
4699      assumption is not true: we use reduc_index to record the index of the
4700      reduction variable.  */
4701   for (i = 0; i < op_type - 1; i++)
4702     {
4703       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
4704       if (i == 0 && code == COND_EXPR)
4705         continue;
4706
4707       is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4708                                             &def_stmt, &def, &dt, &tem);
4709       if (!vectype_in)
4710         vectype_in = tem;
4711       gcc_assert (is_simple_use);
4712
4713       if (dt != vect_internal_def
4714           && dt != vect_external_def
4715           && dt != vect_constant_def
4716           && dt != vect_induction_def
4717           && !(dt == vect_nested_cycle && nested_cycle))
4718         return false;
4719
4720       if (dt == vect_nested_cycle)
4721         {
4722           found_nested_cycle_def = true;
4723           reduc_def_stmt = def_stmt;
4724           reduc_index = i;
4725         }
4726     }
4727
4728   is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4729                                         &def_stmt, &def, &dt, &tem);
4730   if (!vectype_in)
4731     vectype_in = tem;
4732   gcc_assert (is_simple_use);
4733   if (!(dt == vect_reduction_def
4734         || dt == vect_nested_cycle
4735         || ((dt == vect_internal_def || dt == vect_external_def
4736              || dt == vect_constant_def || dt == vect_induction_def)
4737             && nested_cycle && found_nested_cycle_def)))
4738     {
4739       /* For pattern recognized stmts, orig_stmt might be a reduction,
4740          but some helper statements for the pattern might not, or
4741          might be COND_EXPRs with reduction uses in the condition.  */
4742       gcc_assert (orig_stmt);
4743       return false;
4744     }
4745   if (!found_nested_cycle_def)
4746     reduc_def_stmt = def_stmt;
4747
4748   gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4749   if (orig_stmt)
4750     gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4751                                                        reduc_def_stmt,
4752                                                        !nested_cycle,
4753                                                        &dummy));
4754   else
4755     {
4756       gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4757                                              !nested_cycle, &dummy);
4758       /* We changed STMT to be the first stmt in reduction chain, hence we
4759          check that in this case the first element in the chain is STMT.  */
4760       gcc_assert (stmt == tmp
4761                   || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4762     }
4763
4764   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4765     return false;
4766
4767   if (slp_node || PURE_SLP_STMT (stmt_info))
4768     ncopies = 1;
4769   else
4770     ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4771                / TYPE_VECTOR_SUBPARTS (vectype_in));
4772
4773   gcc_assert (ncopies >= 1);
4774
4775   vec_mode = TYPE_MODE (vectype_in);
4776
4777   if (code == COND_EXPR)
4778     {
4779       if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
4780         {
4781           if (dump_enabled_p ())
4782             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4783                              "unsupported condition in reduction");
4784
4785             return false;
4786         }
4787     }
4788   else
4789     {
4790       /* 4. Supportable by target?  */
4791
4792       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
4793           || code == LROTATE_EXPR || code == RROTATE_EXPR)
4794         {
4795           /* Shifts and rotates are only supported by vectorizable_shifts,
4796              not vectorizable_reduction.  */
4797           if (dump_enabled_p ())
4798             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4799                              "unsupported shift or rotation.");
4800           return false;
4801         }
4802
4803       /* 4.1. check support for the operation in the loop  */
4804       optab = optab_for_tree_code (code, vectype_in, optab_default);
4805       if (!optab)
4806         {
4807           if (dump_enabled_p ())
4808             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4809                              "no optab.");
4810
4811           return false;
4812         }
4813
4814       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4815         {
4816           if (dump_enabled_p ())
4817             dump_printf (MSG_NOTE, "op not supported by target.");
4818
4819           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4820               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4821                   < vect_min_worthwhile_factor (code))
4822             return false;
4823
4824           if (dump_enabled_p ())
4825             dump_printf (MSG_NOTE, "proceeding using word mode.");
4826         }
4827
4828       /* Worthwhile without SIMD support?  */
4829       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4830           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4831              < vect_min_worthwhile_factor (code))
4832         {
4833           if (dump_enabled_p ())
4834             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4835                              "not worthwhile without SIMD support.");
4836
4837           return false;
4838         }
4839     }
4840
4841   /* 4.2. Check support for the epilog operation.
4842
4843           If STMT represents a reduction pattern, then the type of the
4844           reduction variable may be different than the type of the rest
4845           of the arguments.  For example, consider the case of accumulation
4846           of shorts into an int accumulator; The original code:
4847                         S1: int_a = (int) short_a;
4848           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
4849
4850           was replaced with:
4851                         STMT: int_acc = widen_sum <short_a, int_acc>
4852
4853           This means that:
4854           1. The tree-code that is used to create the vector operation in the
4855              epilog code (that reduces the partial results) is not the
4856              tree-code of STMT, but is rather the tree-code of the original
4857              stmt from the pattern that STMT is replacing.  I.e, in the example
4858              above we want to use 'widen_sum' in the loop, but 'plus' in the
4859              epilog.
4860           2. The type (mode) we use to check available target support
4861              for the vector operation to be created in the *epilog*, is
4862              determined by the type of the reduction variable (in the example
4863              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
4864              However the type (mode) we use to check available target support
4865              for the vector operation to be created *inside the loop*, is
4866              determined by the type of the other arguments to STMT (in the
4867              example we'd check this: optab_handler (widen_sum_optab,
4868              vect_short_mode)).
4869
4870           This is contrary to "regular" reductions, in which the types of all
4871           the arguments are the same as the type of the reduction variable.
4872           For "regular" reductions we can therefore use the same vector type
4873           (and also the same tree-code) when generating the epilog code and
4874           when generating the code inside the loop.  */
4875
4876   if (orig_stmt)
4877     {
4878       /* This is a reduction pattern: get the vectype from the type of the
4879          reduction variable, and get the tree-code from orig_stmt.  */
4880       orig_code = gimple_assign_rhs_code (orig_stmt);
4881       gcc_assert (vectype_out);
4882       vec_mode = TYPE_MODE (vectype_out);
4883     }
4884   else
4885     {
4886       /* Regular reduction: use the same vectype and tree-code as used for
4887          the vector code inside the loop can be used for the epilog code. */
4888       orig_code = code;
4889     }
4890
4891   if (nested_cycle)
4892     {
4893       def_bb = gimple_bb (reduc_def_stmt);
4894       def_stmt_loop = def_bb->loop_father;
4895       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4896                                        loop_preheader_edge (def_stmt_loop));
4897       if (TREE_CODE (def_arg) == SSA_NAME
4898           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4899           && gimple_code (def_arg_stmt) == GIMPLE_PHI
4900           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4901           && vinfo_for_stmt (def_arg_stmt)
4902           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4903               == vect_double_reduction_def)
4904         double_reduc = true;
4905     }
4906
4907   epilog_reduc_code = ERROR_MARK;
4908   if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4909     {
4910       reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4911                                          optab_default);
4912       if (!reduc_optab)
4913         {
4914           if (dump_enabled_p ())
4915             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4916                              "no optab for reduction.");
4917
4918           epilog_reduc_code = ERROR_MARK;
4919         }
4920
4921       if (reduc_optab
4922           && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4923         {
4924           if (dump_enabled_p ())
4925             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4926                              "reduc op not supported by target.");
4927
4928           epilog_reduc_code = ERROR_MARK;
4929         }
4930     }
4931   else
4932     {
4933       if (!nested_cycle || double_reduc)
4934         {
4935           if (dump_enabled_p ())
4936             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4937                              "no reduc code for scalar code.");
4938
4939           return false;
4940         }
4941     }
4942
4943   if (double_reduc && ncopies > 1)
4944     {
4945       if (dump_enabled_p ())
4946         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4947                          "multiple types in double reduction");
4948
4949       return false;
4950     }
4951
4952   /* In case of widenning multiplication by a constant, we update the type
4953      of the constant to be the type of the other operand.  We check that the
4954      constant fits the type in the pattern recognition pass.  */
4955   if (code == DOT_PROD_EXPR
4956       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
4957     {
4958       if (TREE_CODE (ops[0]) == INTEGER_CST)
4959         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
4960       else if (TREE_CODE (ops[1]) == INTEGER_CST)
4961         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
4962       else
4963         {
4964           if (dump_enabled_p ())
4965             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4966                              "invalid types in dot-prod");
4967
4968           return false;
4969         }
4970     }
4971
4972   if (!vec_stmt) /* transformation not required.  */
4973     {
4974       if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
4975         return false;
4976       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4977       return true;
4978     }
4979
4980   /** Transform.  **/
4981
4982   if (dump_enabled_p ())
4983     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.");
4984
4985   /* FORNOW: Multiple types are not supported for condition.  */
4986   if (code == COND_EXPR)
4987     gcc_assert (ncopies == 1);
4988
4989   /* Create the destination vector  */
4990   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4991
4992   /* In case the vectorization factor (VF) is bigger than the number
4993      of elements that we can fit in a vectype (nunits), we have to generate
4994      more than one vector stmt - i.e - we need to "unroll" the
4995      vector stmt by a factor VF/nunits.  For more details see documentation
4996      in vectorizable_operation.  */
4997
4998   /* If the reduction is used in an outer loop we need to generate
4999      VF intermediate results, like so (e.g. for ncopies=2):
5000         r0 = phi (init, r0)
5001         r1 = phi (init, r1)
5002         r0 = x0 + r0;
5003         r1 = x1 + r1;
5004     (i.e. we generate VF results in 2 registers).
5005     In this case we have a separate def-use cycle for each copy, and therefore
5006     for each copy we get the vector def for the reduction variable from the
5007     respective phi node created for this copy.
5008
5009     Otherwise (the reduction is unused in the loop nest), we can combine
5010     together intermediate results, like so (e.g. for ncopies=2):
5011         r = phi (init, r)
5012         r = x0 + r;
5013         r = x1 + r;
5014    (i.e. we generate VF/2 results in a single register).
5015    In this case for each copy we get the vector def for the reduction variable
5016    from the vectorized reduction operation generated in the previous iteration.
5017   */
5018
5019   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5020     {
5021       single_defuse_cycle = true;
5022       epilog_copies = 1;
5023     }
5024   else
5025     epilog_copies = ncopies;
5026
5027   prev_stmt_info = NULL;
5028   prev_phi_info = NULL;
5029   if (slp_node)
5030     {
5031       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5032       gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5033                   == TYPE_VECTOR_SUBPARTS (vectype_in));
5034     }
5035   else
5036     {
5037       vec_num = 1;
5038       vec_oprnds0.create (1);
5039       if (op_type == ternary_op)
5040         vec_oprnds1.create (1);
5041     }
5042
5043   phis.create (vec_num);
5044   vect_defs.create (vec_num);
5045   if (!slp_node)
5046     vect_defs.quick_push (NULL_TREE);
5047
5048   for (j = 0; j < ncopies; j++)
5049     {
5050       if (j == 0 || !single_defuse_cycle)
5051         {
5052           for (i = 0; i < vec_num; i++)
5053             {
5054               /* Create the reduction-phi that defines the reduction
5055                  operand.  */
5056               new_phi = create_phi_node (vec_dest, loop->header);
5057               set_vinfo_for_stmt (new_phi,
5058                                   new_stmt_vec_info (new_phi, loop_vinfo,
5059                                                      NULL));
5060                if (j == 0 || slp_node)
5061                  phis.quick_push (new_phi);
5062             }
5063         }
5064
5065       if (code == COND_EXPR)
5066         {
5067           gcc_assert (!slp_node);
5068           vectorizable_condition (stmt, gsi, vec_stmt,
5069                                   PHI_RESULT (phis[0]),
5070                                   reduc_index, NULL);
5071           /* Multiple types are not supported for condition.  */
5072           break;
5073         }
5074
5075       /* Handle uses.  */
5076       if (j == 0)
5077         {
5078           op0 = ops[!reduc_index];
5079           if (op_type == ternary_op)
5080             {
5081               if (reduc_index == 0)
5082                 op1 = ops[2];
5083               else
5084                 op1 = ops[1];
5085             }
5086
5087           if (slp_node)
5088             vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5089                                slp_node, -1);
5090           else
5091             {
5092               loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5093                                                             stmt, NULL);
5094               vec_oprnds0.quick_push (loop_vec_def0);
5095               if (op_type == ternary_op)
5096                {
5097                  loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5098                                                                NULL);
5099                  vec_oprnds1.quick_push (loop_vec_def1);
5100                }
5101             }
5102         }
5103       else
5104         {
5105           if (!slp_node)
5106             {
5107               enum vect_def_type dt;
5108               gimple dummy_stmt;
5109               tree dummy;
5110
5111               vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5112                                   &dummy_stmt, &dummy, &dt);
5113               loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5114                                                               loop_vec_def0);
5115               vec_oprnds0[0] = loop_vec_def0;
5116               if (op_type == ternary_op)
5117                 {
5118                   vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5119                                       &dummy, &dt);
5120                   loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5121                                                                 loop_vec_def1);
5122                   vec_oprnds1[0] = loop_vec_def1;
5123                 }
5124             }
5125
5126           if (single_defuse_cycle)
5127             reduc_def = gimple_assign_lhs (new_stmt);
5128
5129           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5130         }
5131
5132       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5133         {
5134           if (slp_node)
5135             reduc_def = PHI_RESULT (phis[i]);
5136           else
5137             {
5138               if (!single_defuse_cycle || j == 0)
5139                 reduc_def = PHI_RESULT (new_phi);
5140             }
5141
5142           def1 = ((op_type == ternary_op)
5143                   ? vec_oprnds1[i] : NULL);
5144           if (op_type == binary_op)
5145             {
5146               if (reduc_index == 0)
5147                 expr = build2 (code, vectype_out, reduc_def, def0);
5148               else
5149                 expr = build2 (code, vectype_out, def0, reduc_def);
5150             }
5151           else
5152             {
5153               if (reduc_index == 0)
5154                 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5155               else
5156                 {
5157                   if (reduc_index == 1)
5158                     expr = build3 (code, vectype_out, def0, reduc_def, def1);
5159                   else
5160                     expr = build3 (code, vectype_out, def0, def1, reduc_def);
5161                 }
5162             }
5163
5164           new_stmt = gimple_build_assign (vec_dest, expr);
5165           new_temp = make_ssa_name (vec_dest, new_stmt);
5166           gimple_assign_set_lhs (new_stmt, new_temp);
5167           vect_finish_stmt_generation (stmt, new_stmt, gsi);
5168
5169           if (slp_node)
5170             {
5171               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5172               vect_defs.quick_push (new_temp);
5173             }
5174           else
5175             vect_defs[0] = new_temp;
5176         }
5177
5178       if (slp_node)
5179         continue;
5180
5181       if (j == 0)
5182         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5183       else
5184         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5185
5186       prev_stmt_info = vinfo_for_stmt (new_stmt);
5187       prev_phi_info = vinfo_for_stmt (new_phi);
5188     }
5189
5190   /* Finalize the reduction-phi (set its arguments) and create the
5191      epilog reduction code.  */
5192   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5193     {
5194       new_temp = gimple_assign_lhs (*vec_stmt);
5195       vect_defs[0] = new_temp;
5196     }
5197
5198   vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5199                                     epilog_reduc_code, phis, reduc_index,
5200                                     double_reduc, slp_node);
5201
5202   phis.release ();
5203   vect_defs.release ();
5204   vec_oprnds0.release ();
5205   vec_oprnds1.release ();
5206
5207   return true;
5208 }
5209
5210 /* Function vect_min_worthwhile_factor.
5211
5212    For a loop where we could vectorize the operation indicated by CODE,
5213    return the minimum vectorization factor that makes it worthwhile
5214    to use generic vectors.  */
5215 int
5216 vect_min_worthwhile_factor (enum tree_code code)
5217 {
5218   switch (code)
5219     {
5220     case PLUS_EXPR:
5221     case MINUS_EXPR:
5222     case NEGATE_EXPR:
5223       return 4;
5224
5225     case BIT_AND_EXPR:
5226     case BIT_IOR_EXPR:
5227     case BIT_XOR_EXPR:
5228     case BIT_NOT_EXPR:
5229       return 2;
5230
5231     default:
5232       return INT_MAX;
5233     }
5234 }
5235
5236
5237 /* Function vectorizable_induction
5238
5239    Check if PHI performs an induction computation that can be vectorized.
5240    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5241    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5242    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
5243
5244 bool
5245 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5246                         gimple *vec_stmt)
5247 {
5248   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5249   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5250   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5251   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5252   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5253   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5254   tree vec_def;
5255
5256   gcc_assert (ncopies >= 1);
5257   /* FORNOW. These restrictions should be relaxed.  */
5258   if (nested_in_vect_loop_p (loop, phi))
5259     {
5260       imm_use_iterator imm_iter;
5261       use_operand_p use_p;
5262       gimple exit_phi;
5263       edge latch_e;
5264       tree loop_arg;
5265
5266       if (ncopies > 1)
5267         {
5268           if (dump_enabled_p ())
5269             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5270                              "multiple types in nested loop.");
5271           return false;
5272         }
5273
5274       exit_phi = NULL;
5275       latch_e = loop_latch_edge (loop->inner);
5276       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5277       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5278         {
5279           if (!flow_bb_inside_loop_p (loop->inner,
5280                                       gimple_bb (USE_STMT (use_p))))
5281             {
5282               exit_phi = USE_STMT (use_p);
5283               break;
5284             }
5285         }
5286       if (exit_phi)
5287         {
5288           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
5289           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5290                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5291             {
5292               if (dump_enabled_p ())
5293                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5294                                  "inner-loop induction only used outside "
5295                                  "of the outer vectorized loop.");
5296               return false;
5297             }
5298         }
5299     }
5300
5301   if (!STMT_VINFO_RELEVANT_P (stmt_info))
5302     return false;
5303
5304   /* FORNOW: SLP not supported.  */
5305   if (STMT_SLP_TYPE (stmt_info))
5306     return false;
5307
5308   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5309
5310   if (gimple_code (phi) != GIMPLE_PHI)
5311     return false;
5312
5313   if (!vec_stmt) /* transformation not required.  */
5314     {
5315       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5316       if (dump_enabled_p ())
5317         dump_printf_loc (MSG_NOTE, vect_location,
5318                          "=== vectorizable_induction ===");
5319       vect_model_induction_cost (stmt_info, ncopies);
5320       return true;
5321     }
5322
5323   /** Transform.  **/
5324
5325   if (dump_enabled_p ())
5326     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.");
5327
5328   vec_def = get_initial_def_for_induction (phi);
5329   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5330   return true;
5331 }
5332
5333 /* Function vectorizable_live_operation.
5334
5335    STMT computes a value that is used outside the loop.  Check if
5336    it can be supported.  */
5337
5338 bool
5339 vectorizable_live_operation (gimple stmt,
5340                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5341                              gimple *vec_stmt ATTRIBUTE_UNUSED)
5342 {
5343   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5344   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5345   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5346   int i;
5347   int op_type;
5348   tree op;
5349   tree def;
5350   gimple def_stmt;
5351   enum vect_def_type dt;
5352   enum tree_code code;
5353   enum gimple_rhs_class rhs_class;
5354
5355   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5356
5357   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5358     return false;
5359
5360   if (!is_gimple_assign (stmt))
5361     return false;
5362
5363   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5364     return false;
5365
5366   /* FORNOW. CHECKME. */
5367   if (nested_in_vect_loop_p (loop, stmt))
5368     return false;
5369
5370   code = gimple_assign_rhs_code (stmt);
5371   op_type = TREE_CODE_LENGTH (code);
5372   rhs_class = get_gimple_rhs_class (code);
5373   gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5374   gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5375
5376   /* FORNOW: support only if all uses are invariant.  This means
5377      that the scalar operations can remain in place, unvectorized.
5378      The original last scalar value that they compute will be used.  */
5379
5380   for (i = 0; i < op_type; i++)
5381     {
5382       if (rhs_class == GIMPLE_SINGLE_RHS)
5383         op = TREE_OPERAND (gimple_op (stmt, 1), i);
5384       else
5385         op = gimple_op (stmt, i + 1);
5386       if (op
5387           && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5388                                   &dt))
5389         {
5390           if (dump_enabled_p ())
5391             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5392                              "use not simple.");
5393           return false;
5394         }
5395
5396       if (dt != vect_external_def && dt != vect_constant_def)
5397         return false;
5398     }
5399
5400   /* No transformation is required for the cases we currently support.  */
5401   return true;
5402 }
5403
5404 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
5405
5406 static void
5407 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5408 {
5409   ssa_op_iter op_iter;
5410   imm_use_iterator imm_iter;
5411   def_operand_p def_p;
5412   gimple ustmt;
5413
5414   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5415     {
5416       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5417         {
5418           basic_block bb;
5419
5420           if (!is_gimple_debug (ustmt))
5421             continue;
5422
5423           bb = gimple_bb (ustmt);
5424
5425           if (!flow_bb_inside_loop_p (loop, bb))
5426             {
5427               if (gimple_debug_bind_p (ustmt))
5428                 {
5429                   if (dump_enabled_p ())
5430                     dump_printf_loc (MSG_NOTE, vect_location,
5431                                      "killing debug use");
5432
5433                   gimple_debug_bind_reset_value (ustmt);
5434                   update_stmt (ustmt);
5435                 }
5436               else
5437                 gcc_unreachable ();
5438             }
5439         }
5440     }
5441 }
5442
5443 /* Function vect_transform_loop.
5444
5445    The analysis phase has determined that the loop is vectorizable.
5446    Vectorize the loop - created vectorized stmts to replace the scalar
5447    stmts in the loop, and update the loop exit condition.  */
5448
5449 void
5450 vect_transform_loop (loop_vec_info loop_vinfo)
5451 {
5452   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5453   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5454   int nbbs = loop->num_nodes;
5455   gimple_stmt_iterator si;
5456   int i;
5457   tree ratio = NULL;
5458   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5459   bool grouped_store;
5460   bool slp_scheduled = false;
5461   unsigned int nunits;
5462   gimple stmt, pattern_stmt;
5463   gimple_seq pattern_def_seq = NULL;
5464   gimple_stmt_iterator pattern_def_si = gsi_none ();
5465   bool transform_pattern_stmt = false;
5466   bool check_profitability = false;
5467   int th;
5468   /* Record number of iterations before we started tampering with the profile. */
5469   gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5470
5471   if (dump_enabled_p ())
5472     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===");
5473
5474   /* If profile is inprecise, we have chance to fix it up.  */
5475   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5476     expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5477
5478   /* Use the more conservative vectorization threshold.  If the number
5479      of iterations is constant assume the cost check has been performed
5480      by our caller.  If the threshold makes all loops profitable that
5481      run at least the vectorization factor number of times checking
5482      is pointless, too.  */
5483   th = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
5484          * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
5485   th = MAX (th, LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo));
5486   if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5487       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5488     {
5489       if (dump_enabled_p ())
5490         dump_printf_loc (MSG_NOTE, vect_location,
5491                          "Profitability threshold is %d loop iterations.", th);
5492       check_profitability = true;
5493     }
5494
5495   /* Peel the loop if there are data refs with unknown alignment.
5496      Only one data ref with unknown store is allowed.  */
5497
5498   if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
5499     {
5500       vect_do_peeling_for_alignment (loop_vinfo, th, check_profitability);
5501       check_profitability = false;
5502     }
5503
5504   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5505       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5506     {
5507       vect_loop_versioning (loop_vinfo, th, check_profitability);
5508       check_profitability = false;
5509     }
5510
5511   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5512      compile time constant), or it is a constant that doesn't divide by the
5513      vectorization factor, then an epilog loop needs to be created.
5514      We therefore duplicate the loop: the original loop will be vectorized,
5515      and will compute the first (n/VF) iterations.  The second copy of the loop
5516      will remain scalar and will compute the remaining (n%VF) iterations.
5517      (VF is the vectorization factor).  */
5518
5519   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5520        || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5521            && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)
5522        || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5523     vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
5524                                     th, check_profitability);
5525   else
5526     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5527                 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5528
5529   /* 1) Make sure the loop header has exactly two entries
5530      2) Make sure we have a preheader basic block.  */
5531
5532   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5533
5534   split_edge (loop_preheader_edge (loop));
5535
5536   /* FORNOW: the vectorizer supports only loops which body consist
5537      of one basic block (header + empty latch). When the vectorizer will
5538      support more involved loop forms, the order by which the BBs are
5539      traversed need to be reconsidered.  */
5540
5541   for (i = 0; i < nbbs; i++)
5542     {
5543       basic_block bb = bbs[i];
5544       stmt_vec_info stmt_info;
5545       gimple phi;
5546
5547       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5548         {
5549           phi = gsi_stmt (si);
5550           if (dump_enabled_p ())
5551             {
5552               dump_printf_loc (MSG_NOTE, vect_location,
5553                                "------>vectorizing phi: ");
5554               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5555             }
5556           stmt_info = vinfo_for_stmt (phi);
5557           if (!stmt_info)
5558             continue;
5559
5560           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5561             vect_loop_kill_debug_uses (loop, phi);
5562
5563           if (!STMT_VINFO_RELEVANT_P (stmt_info)
5564               && !STMT_VINFO_LIVE_P (stmt_info))
5565             continue;
5566
5567           if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5568                 != (unsigned HOST_WIDE_INT) vectorization_factor)
5569               && dump_enabled_p ())
5570             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.");
5571
5572           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5573             {
5574               if (dump_enabled_p ())
5575                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.");
5576               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5577             }
5578         }
5579
5580       pattern_stmt = NULL;
5581       for (si = gsi_start_bb (bb); !gsi_end_p (si) || transform_pattern_stmt;)
5582         {
5583           bool is_store;
5584
5585           if (transform_pattern_stmt)
5586             stmt = pattern_stmt;
5587           else
5588             stmt = gsi_stmt (si);
5589
5590           if (dump_enabled_p ())
5591             {
5592               dump_printf_loc (MSG_NOTE, vect_location,
5593                                "------>vectorizing statement: ");
5594               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
5595             }
5596
5597           stmt_info = vinfo_for_stmt (stmt);
5598
5599           /* vector stmts created in the outer-loop during vectorization of
5600              stmts in an inner-loop may not have a stmt_info, and do not
5601              need to be vectorized.  */
5602           if (!stmt_info)
5603             {
5604               gsi_next (&si);
5605               continue;
5606             }
5607
5608           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5609             vect_loop_kill_debug_uses (loop, stmt);
5610
5611           if (!STMT_VINFO_RELEVANT_P (stmt_info)
5612               && !STMT_VINFO_LIVE_P (stmt_info))
5613             {
5614               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5615                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5616                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5617                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5618                 {
5619                   stmt = pattern_stmt;
5620                   stmt_info = vinfo_for_stmt (stmt);
5621                 }
5622               else
5623                 {
5624                   gsi_next (&si);
5625                   continue;
5626                 }
5627             }
5628           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5629                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5630                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5631                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5632             transform_pattern_stmt = true;
5633
5634           /* If pattern statement has def stmts, vectorize them too.  */
5635           if (is_pattern_stmt_p (stmt_info))
5636             {
5637               if (pattern_def_seq == NULL)
5638                 {
5639                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
5640                   pattern_def_si = gsi_start (pattern_def_seq);
5641                 }
5642               else if (!gsi_end_p (pattern_def_si))
5643                 gsi_next (&pattern_def_si);
5644               if (pattern_def_seq != NULL)
5645                 {
5646                   gimple pattern_def_stmt = NULL;
5647                   stmt_vec_info pattern_def_stmt_info = NULL;
5648
5649                   while (!gsi_end_p (pattern_def_si))
5650                     {
5651                       pattern_def_stmt = gsi_stmt (pattern_def_si);
5652                       pattern_def_stmt_info
5653                         = vinfo_for_stmt (pattern_def_stmt);
5654                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
5655                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
5656                         break;
5657                       gsi_next (&pattern_def_si);
5658                     }
5659
5660                   if (!gsi_end_p (pattern_def_si))
5661                     {
5662                       if (dump_enabled_p ())
5663                         {
5664                           dump_printf_loc (MSG_NOTE, vect_location,
5665                                            "==> vectorizing pattern def "
5666                                            "stmt: ");
5667                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
5668                                             pattern_def_stmt, 0);
5669                         }
5670
5671                       stmt = pattern_def_stmt;
5672                       stmt_info = pattern_def_stmt_info;
5673                     }
5674                   else
5675                     {
5676                       pattern_def_si = gsi_none ();
5677                       transform_pattern_stmt = false;
5678                     }
5679                 }
5680               else
5681                 transform_pattern_stmt = false;
5682             }
5683
5684           gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
5685           nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (
5686                                                STMT_VINFO_VECTYPE (stmt_info));
5687           if (!STMT_SLP_TYPE (stmt_info)
5688               && nunits != (unsigned int) vectorization_factor
5689               && dump_enabled_p ())
5690             /* For SLP VF is set according to unrolling factor, and not to
5691                vector size, hence for SLP this print is not valid.  */
5692             dump_printf_loc (MSG_NOTE, vect_location,
5693                              "multiple-types.");
5694
5695           /* SLP. Schedule all the SLP instances when the first SLP stmt is
5696              reached.  */
5697           if (STMT_SLP_TYPE (stmt_info))
5698             {
5699               if (!slp_scheduled)
5700                 {
5701                   slp_scheduled = true;
5702
5703                   if (dump_enabled_p ())
5704                     dump_printf_loc (MSG_NOTE, vect_location,
5705                                      "=== scheduling SLP instances ===");
5706
5707                   vect_schedule_slp (loop_vinfo, NULL);
5708                 }
5709
5710               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
5711               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
5712                 {
5713                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5714                     {
5715                       pattern_def_seq = NULL;
5716                       gsi_next (&si);
5717                     }
5718                   continue;
5719                 }
5720             }
5721
5722           /* -------- vectorize statement ------------ */
5723           if (dump_enabled_p ())
5724             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.");
5725
5726           grouped_store = false;
5727           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
5728           if (is_store)
5729             {
5730               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
5731                 {
5732                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
5733                      interleaving chain was completed - free all the stores in
5734                      the chain.  */
5735                   gsi_next (&si);
5736                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
5737                   continue;
5738                 }
5739               else
5740                 {
5741                   /* Free the attached stmt_vec_info and remove the stmt.  */
5742                   gimple store = gsi_stmt (si);
5743                   free_stmt_vec_info (store);
5744                   unlink_stmt_vdef (store);
5745                   gsi_remove (&si, true);
5746                   release_defs (store);
5747                   continue;
5748                 }
5749             }
5750
5751           if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5752             {
5753               pattern_def_seq = NULL;
5754               gsi_next (&si);
5755             }
5756         }                       /* stmts in BB */
5757     }                           /* BBs in loop */
5758
5759   slpeel_make_loop_iterate_ntimes (loop, ratio);
5760
5761   /* Reduce loop iterations by the vectorization factor.  */
5762   scale_loop_profile (loop, RDIV (REG_BR_PROB_BASE , vectorization_factor),
5763                       expected_iterations / vectorization_factor);
5764   loop->nb_iterations_upper_bound
5765     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (vectorization_factor),
5766                                             FLOOR_DIV_EXPR);
5767   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
5768       && loop->nb_iterations_upper_bound != double_int_zero)
5769     loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - double_int_one;
5770   if (loop->any_estimate)
5771     {
5772       loop->nb_iterations_estimate
5773         = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (vectorization_factor),
5774                                              FLOOR_DIV_EXPR);
5775        if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
5776            && loop->nb_iterations_estimate != double_int_zero)
5777          loop->nb_iterations_estimate = loop->nb_iterations_estimate - double_int_one;
5778     }
5779
5780   /* The memory tags and pointers in vectorized statements need to
5781      have their SSA forms updated.  FIXME, why can't this be delayed
5782      until all the loops have been transformed?  */
5783   update_ssa (TODO_update_ssa);
5784
5785   if (dump_enabled_p ())
5786     dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, "LOOP VECTORIZED.");
5787   if (loop->inner && dump_enabled_p ())
5788     dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
5789                      "OUTER LOOP VECTORIZED.");
5790 }