/* Loop autoparallelization.
   Copyright (C) 2006-2013 Free Software Foundation, Inc.
   Contributed by Sebastian Pop <pop@cri.ensmp.fr>
   Zdenek Dvorak <dvorakz@suse.cz> and Razya Ladelsky <razya@il.ibm.com>.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
24 #include "coretypes.h"
25 #include "tree-flow.h"
27 #include "tree-data-ref.h"
28 #include "tree-scalar-evolution.h"
29 #include "gimple-pretty-print.h"
30 #include "tree-pass.h"
31 #include "langhooks.h"
32 #include "tree-vectorizer.h"
34 /* This pass tries to distribute iterations of loops into several threads.
35 The implementation is straightforward -- for each loop we test whether its
36 iterations are independent, and if it is the case (and some additional
37 conditions regarding profitability and correctness are satisfied), we
38 add GIMPLE_OMP_PARALLEL and GIMPLE_OMP_FOR codes and let omp expansion
41 The most of the complexity is in bringing the code into shape expected
43 -- for GIMPLE_OMP_FOR, ensuring that the loop has only one induction
44 variable and that the exit test is at the start of the loop body
45 -- for GIMPLE_OMP_PARALLEL, replacing the references to local addressable
46 variables by accesses through pointers, and breaking up ssa chains
47 by storing the values incoming to the parallelized loop to a structure
48 passed to the new function as an argument (something similar is done
49 in omp gimplification, unfortunately only a small part of the code
53 -- if there are several parallelizable loops in a function, it may be
54 possible to generate the threads just once (using synchronization to
55 ensure that cross-loop dependences are obeyed).
56 -- handling of common reduction patterns for outer loops.
58 More info can also be found at http://gcc.gnu.org/wiki/AutoParInGCC */
61 currently we use vect_force_simple_reduction() to detect reduction patterns.
62 The code transformation will be introduced by an example.
69 for (i = 0; i < N; i++)
79 # sum_29 = PHI <sum_11(5), 1(3)>
80 # i_28 = PHI <i_12(5), 0(3)>
83 sum_11 = D.1795_8 + sum_29;
91 # sum_21 = PHI <sum_11(4)>
92 printf (&"%d"[0], sum_21);
95 after reduction transformation (only relevant parts):
103 # Storing the initial value given by the user. #
105 .paral_data_store.32.sum.27 = 1;
107 #pragma omp parallel num_threads(4)
109 #pragma omp for schedule(static)
111 # The neutral element corresponding to the particular
112 reduction's operation, e.g. 0 for PLUS_EXPR,
113 1 for MULT_EXPR, etc. replaces the user's initial value. #
115 # sum.27_29 = PHI <sum.27_11, 0>
117 sum.27_11 = D.1827_8 + sum.27_29;
121 # Adding this reduction phi is done at create_phi_for_local_result() #
122 # sum.27_56 = PHI <sum.27_11, 0>
125 # Creating the atomic operation is done at
126 create_call_for_reduction_1() #
128 #pragma omp atomic_load
129 D.1839_59 = *&.paral_data_load.33_51->reduction.23;
130 D.1840_60 = sum.27_56 + D.1839_59;
131 #pragma omp atomic_store (D.1840_60);
135 # collecting the result after the join of the threads is done at
136 create_loads_for_reductions().
137 The value computed by the threads is loaded from the
141 .paral_data_load.33_52 = &.paral_data_store.32;
142 sum_37 = .paral_data_load.33_52->sum.27;
143 sum_43 = D.1795_41 + sum_37;
146 # sum_21 = PHI <sum_43, sum_26>
147 printf (&"%d"[0], sum_21);
/* Minimal number of iterations of a loop that should be executed in each
   thread.  */
#define MIN_PER_THREAD 100
159 /* Element of the hashtable, representing a
160 reduction in the current loop. */
161 struct reduction_info
163 gimple reduc_stmt; /* reduction statement. */
164 gimple reduc_phi; /* The phi node defining the reduction. */
165 enum tree_code reduction_code;/* code for the reduction operation. */
166 unsigned reduc_version; /* SSA_NAME_VERSION of original reduc_phi
168 gimple keep_res; /* The PHI_RESULT of this phi is the resulting value
169 of the reduction variable when existing the loop. */
170 tree initial_value; /* The initial value of the reduction var before entering the loop. */
171 tree field; /* the name of the field in the parloop data structure intended for reduction. */
172 tree init; /* reduction initialization value. */
173 gimple new_phi; /* (helper field) Newly created phi node whose result
174 will be passed to the atomic operation. Represents
175 the local result each thread computed for the reduction
179 /* Equality and hash functions for hashtab code. */
182 reduction_info_eq (const void *aa, const void *bb)
184 const struct reduction_info *a = (const struct reduction_info *) aa;
185 const struct reduction_info *b = (const struct reduction_info *) bb;
187 return (a->reduc_phi == b->reduc_phi);
191 reduction_info_hash (const void *aa)
193 const struct reduction_info *a = (const struct reduction_info *) aa;
195 return a->reduc_version;
198 static struct reduction_info *
199 reduction_phi (htab_t reduction_list, gimple phi)
201 struct reduction_info tmpred, *red;
203 if (htab_elements (reduction_list) == 0 || phi == NULL)
206 tmpred.reduc_phi = phi;
207 tmpred.reduc_version = gimple_uid (phi);
208 red = (struct reduction_info *) htab_find (reduction_list, &tmpred);
213 /* Element of hashtable of names to copy. */
215 struct name_to_copy_elt
217 unsigned version; /* The version of the name to copy. */
218 tree new_name; /* The new name used in the copy. */
219 tree field; /* The field of the structure used to pass the
223 /* Equality and hash functions for hashtab code. */
226 name_to_copy_elt_eq (const void *aa, const void *bb)
228 const struct name_to_copy_elt *a = (const struct name_to_copy_elt *) aa;
229 const struct name_to_copy_elt *b = (const struct name_to_copy_elt *) bb;
231 return a->version == b->version;
235 name_to_copy_elt_hash (const void *aa)
237 const struct name_to_copy_elt *a = (const struct name_to_copy_elt *) aa;
239 return (hashval_t) a->version;
242 /* A transformation matrix, which is a self-contained ROWSIZE x COLSIZE
243 matrix. Rather than use floats, we simply keep a single DENOMINATOR that
244 represents the denominator for every element in the matrix. */
245 typedef struct lambda_trans_matrix_s
247 lambda_matrix matrix;
251 } *lambda_trans_matrix;
252 #define LTM_MATRIX(T) ((T)->matrix)
253 #define LTM_ROWSIZE(T) ((T)->rowsize)
254 #define LTM_COLSIZE(T) ((T)->colsize)
255 #define LTM_DENOMINATOR(T) ((T)->denominator)
257 /* Allocate a new transformation matrix. */
259 static lambda_trans_matrix
260 lambda_trans_matrix_new (int colsize, int rowsize,
261 struct obstack * lambda_obstack)
263 lambda_trans_matrix ret;
265 ret = (lambda_trans_matrix)
266 obstack_alloc (lambda_obstack, sizeof (struct lambda_trans_matrix_s));
267 LTM_MATRIX (ret) = lambda_matrix_new (rowsize, colsize, lambda_obstack);
268 LTM_ROWSIZE (ret) = rowsize;
269 LTM_COLSIZE (ret) = colsize;
270 LTM_DENOMINATOR (ret) = 1;
274 /* Multiply a vector VEC by a matrix MAT.
275 MAT is an M*N matrix, and VEC is a vector with length N. The result
276 is stored in DEST which must be a vector of length M. */
279 lambda_matrix_vector_mult (lambda_matrix matrix, int m, int n,
280 lambda_vector vec, lambda_vector dest)
284 lambda_vector_clear (dest, m);
285 for (i = 0; i < m; i++)
286 for (j = 0; j < n; j++)
287 dest[i] += matrix[i][j] * vec[j];
290 /* Return true if TRANS is a legal transformation matrix that respects
291 the dependence vectors in DISTS and DIRS. The conservative answer
294 "Wolfe proves that a unimodular transformation represented by the
295 matrix T is legal when applied to a loop nest with a set of
296 lexicographically non-negative distance vectors RDG if and only if
297 for each vector d in RDG, (T.d >= 0) is lexicographically positive.
298 i.e.: if and only if it transforms the lexicographically positive
299 distance vectors to lexicographically positive vectors. Note that
300 a unimodular matrix must transform the zero vector (and only it) to
301 the zero vector." S.Muchnick. */
304 lambda_transform_legal_p (lambda_trans_matrix trans,
306 vec<ddr_p> dependence_relations)
309 lambda_vector distres;
310 struct data_dependence_relation *ddr;
312 gcc_assert (LTM_COLSIZE (trans) == nb_loops
313 && LTM_ROWSIZE (trans) == nb_loops);
315 /* When there are no dependences, the transformation is correct. */
316 if (dependence_relations.length () == 0)
319 ddr = dependence_relations[0];
323 /* When there is an unknown relation in the dependence_relations, we
324 know that it is no worth looking at this loop nest: give up. */
325 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
328 distres = lambda_vector_new (nb_loops);
330 /* For each distance vector in the dependence graph. */
331 FOR_EACH_VEC_ELT (dependence_relations, i, ddr)
333 /* Don't care about relations for which we know that there is no
334 dependence, nor about read-read (aka. output-dependences):
335 these data accesses can happen in any order. */
336 if (DDR_ARE_DEPENDENT (ddr) == chrec_known
337 || (DR_IS_READ (DDR_A (ddr)) && DR_IS_READ (DDR_B (ddr))))
340 /* Conservatively answer: "this transformation is not valid". */
341 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
344 /* If the dependence could not be captured by a distance vector,
345 conservatively answer that the transform is not valid. */
346 if (DDR_NUM_DIST_VECTS (ddr) == 0)
349 /* Compute trans.dist_vect */
350 for (j = 0; j < DDR_NUM_DIST_VECTS (ddr); j++)
352 lambda_matrix_vector_mult (LTM_MATRIX (trans), nb_loops, nb_loops,
353 DDR_DIST_VECT (ddr, j), distres);
355 if (!lambda_vector_lexico_pos (distres, nb_loops))
362 /* Data dependency analysis. Returns true if the iterations of LOOP
363 are independent on each other (that is, if we can execute them
367 loop_parallel_p (struct loop *loop, struct obstack * parloop_obstack)
369 vec<loop_p> loop_nest;
370 vec<ddr_p> dependence_relations;
371 vec<data_reference_p> datarefs;
372 lambda_trans_matrix trans;
375 if (dump_file && (dump_flags & TDF_DETAILS))
377 fprintf (dump_file, "Considering loop %d\n", loop->num);
379 fprintf (dump_file, "loop is innermost\n");
381 fprintf (dump_file, "loop NOT innermost\n");
384 /* Check for problems with dependences. If the loop can be reversed,
385 the iterations are independent. */
386 datarefs.create (10);
387 dependence_relations.create (10 * 10);
388 loop_nest.create (3);
389 if (! compute_data_dependences_for_loop (loop, true, &loop_nest, &datarefs,
390 &dependence_relations))
392 if (dump_file && (dump_flags & TDF_DETAILS))
393 fprintf (dump_file, " FAILED: cannot analyze data dependencies\n");
397 if (dump_file && (dump_flags & TDF_DETAILS))
398 dump_data_dependence_relations (dump_file, dependence_relations);
400 trans = lambda_trans_matrix_new (1, 1, parloop_obstack);
401 LTM_MATRIX (trans)[0][0] = -1;
403 if (lambda_transform_legal_p (trans, 1, dependence_relations))
406 if (dump_file && (dump_flags & TDF_DETAILS))
407 fprintf (dump_file, " SUCCESS: may be parallelized\n");
409 else if (dump_file && (dump_flags & TDF_DETAILS))
411 " FAILED: data dependencies exist across iterations\n");
414 loop_nest.release ();
415 free_dependence_relations (dependence_relations);
416 free_data_refs (datarefs);
421 /* Return true when LOOP contains basic blocks marked with the
422 BB_IRREDUCIBLE_LOOP flag. */
425 loop_has_blocks_with_irreducible_flag (struct loop *loop)
428 basic_block *bbs = get_loop_body_in_dom_order (loop);
431 for (i = 0; i < loop->num_nodes; i++)
432 if (bbs[i]->flags & BB_IRREDUCIBLE_LOOP)
441 /* Assigns the address of OBJ in TYPE to an ssa name, and returns this name.
442 The assignment statement is placed on edge ENTRY. DECL_ADDRESS maps decls
443 to their addresses that can be reused. The address of OBJ is known to
444 be invariant in the whole function. Other needed statements are placed
448 take_address_of (tree obj, tree type, edge entry, htab_t decl_address,
449 gimple_stmt_iterator *gsi)
453 struct int_tree_map ielt, *nielt;
454 tree *var_p, name, addr;
458 /* Since the address of OBJ is invariant, the trees may be shared.
459 Avoid rewriting unrelated parts of the code. */
460 obj = unshare_expr (obj);
462 handled_component_p (*var_p);
463 var_p = &TREE_OPERAND (*var_p, 0))
466 /* Canonicalize the access to base on a MEM_REF. */
468 *var_p = build_simple_mem_ref (build_fold_addr_expr (*var_p));
470 /* Assign a canonical SSA name to the address of the base decl used
471 in the address and share it for all accesses and addresses based
473 uid = DECL_UID (TREE_OPERAND (TREE_OPERAND (*var_p, 0), 0));
475 dslot = htab_find_slot_with_hash (decl_address, &ielt, uid, INSERT);
480 addr = TREE_OPERAND (*var_p, 0);
482 = get_name (TREE_OPERAND (TREE_OPERAND (*var_p, 0), 0));
484 name = make_temp_ssa_name (TREE_TYPE (addr), NULL, obj_name);
486 name = make_ssa_name (TREE_TYPE (addr), NULL);
487 stmt = gimple_build_assign (name, addr);
488 gsi_insert_on_edge_immediate (entry, stmt);
490 nielt = XNEW (struct int_tree_map);
496 name = ((struct int_tree_map *) *dslot)->to;
498 /* Express the address in terms of the canonical SSA name. */
499 TREE_OPERAND (*var_p, 0) = name;
501 return build_fold_addr_expr_with_type (obj, type);
503 name = force_gimple_operand (build_addr (obj, current_function_decl),
504 &stmts, true, NULL_TREE);
505 if (!gimple_seq_empty_p (stmts))
506 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
508 if (!useless_type_conversion_p (type, TREE_TYPE (name)))
510 name = force_gimple_operand (fold_convert (type, name), &stmts, true,
512 if (!gimple_seq_empty_p (stmts))
513 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
519 /* Callback for htab_traverse. Create the initialization statement
520 for reduction described in SLOT, and place it at the preheader of
521 the loop described in DATA. */
524 initialize_reductions (void **slot, void *data)
527 tree bvar, type, arg;
530 struct reduction_info *const reduc = (struct reduction_info *) *slot;
531 struct loop *loop = (struct loop *) data;
533 /* Create initialization in preheader:
534 reduction_variable = initialization value of reduction. */
536 /* In the phi node at the header, replace the argument coming
537 from the preheader with the reduction initialization value. */
539 /* Create a new variable to initialize the reduction. */
540 type = TREE_TYPE (PHI_RESULT (reduc->reduc_phi));
541 bvar = create_tmp_var (type, "reduction");
543 c = build_omp_clause (gimple_location (reduc->reduc_stmt),
544 OMP_CLAUSE_REDUCTION);
545 OMP_CLAUSE_REDUCTION_CODE (c) = reduc->reduction_code;
546 OMP_CLAUSE_DECL (c) = SSA_NAME_VAR (gimple_assign_lhs (reduc->reduc_stmt));
548 init = omp_reduction_init (c, TREE_TYPE (bvar));
551 /* Replace the argument representing the initialization value
552 with the initialization value for the reduction (neutral
553 element for the particular operation, e.g. 0 for PLUS_EXPR,
554 1 for MULT_EXPR, etc).
555 Keep the old value in a new variable "reduction_initial",
556 that will be taken in consideration after the parallel
557 computing is done. */
559 e = loop_preheader_edge (loop);
560 arg = PHI_ARG_DEF_FROM_EDGE (reduc->reduc_phi, e);
561 /* Create new variable to hold the initial value. */
563 SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE
564 (reduc->reduc_phi, loop_preheader_edge (loop)), init);
565 reduc->initial_value = arg;
571 struct walk_stmt_info info;
574 gimple_stmt_iterator *gsi;
579 /* Eliminates references to local variables in *TP out of the single
580 entry single exit region starting at DTA->ENTRY.
581 DECL_ADDRESS contains addresses of the references that had their
582 address taken already. If the expression is changed, CHANGED is
583 set to true. Callback for walk_tree. */
586 eliminate_local_variables_1 (tree *tp, int *walk_subtrees, void *data)
588 struct elv_data *const dta = (struct elv_data *) data;
589 tree t = *tp, var, addr, addr_type, type, obj;
595 if (!SSA_VAR_P (t) || DECL_EXTERNAL (t))
598 type = TREE_TYPE (t);
599 addr_type = build_pointer_type (type);
600 addr = take_address_of (t, addr_type, dta->entry, dta->decl_address,
602 if (dta->gsi == NULL && addr == NULL_TREE)
608 *tp = build_simple_mem_ref (addr);
614 if (TREE_CODE (t) == ADDR_EXPR)
616 /* ADDR_EXPR may appear in two contexts:
617 -- as a gimple operand, when the address taken is a function invariant
618 -- as gimple rhs, when the resulting address in not a function
620 We do not need to do anything special in the latter case (the base of
621 the memory reference whose address is taken may be replaced in the
622 DECL_P case). The former case is more complicated, as we need to
623 ensure that the new address is still a gimple operand. Thus, it
624 is not sufficient to replace just the base of the memory reference --
625 we need to move the whole computation of the address out of the
627 if (!is_gimple_val (t))
631 obj = TREE_OPERAND (t, 0);
632 var = get_base_address (obj);
633 if (!var || !SSA_VAR_P (var) || DECL_EXTERNAL (var))
636 addr_type = TREE_TYPE (t);
637 addr = take_address_of (obj, addr_type, dta->entry, dta->decl_address,
639 if (dta->gsi == NULL && addr == NULL_TREE)
656 /* Moves the references to local variables in STMT at *GSI out of the single
657 entry single exit region starting at ENTRY. DECL_ADDRESS contains
658 addresses of the references that had their address taken
662 eliminate_local_variables_stmt (edge entry, gimple_stmt_iterator *gsi,
666 gimple stmt = gsi_stmt (*gsi);
668 memset (&dta.info, '\0', sizeof (dta.info));
670 dta.decl_address = decl_address;
674 if (gimple_debug_bind_p (stmt))
677 walk_tree (gimple_debug_bind_get_value_ptr (stmt),
678 eliminate_local_variables_1, &dta.info, NULL);
681 gimple_debug_bind_reset_value (stmt);
685 else if (gimple_clobber_p (stmt))
687 stmt = gimple_build_nop ();
688 gsi_replace (gsi, stmt, false);
694 walk_gimple_op (stmt, eliminate_local_variables_1, &dta.info);
701 /* Eliminates the references to local variables from the single entry
702 single exit region between the ENTRY and EXIT edges.
705 1) Taking address of a local variable -- these are moved out of the
706 region (and temporary variable is created to hold the address if
709 2) Dereferencing a local variable -- these are replaced with indirect
713 eliminate_local_variables (edge entry, edge exit)
716 vec<basic_block> body;
719 gimple_stmt_iterator gsi;
720 bool has_debug_stmt = false;
721 htab_t decl_address = htab_create (10, int_tree_map_hash, int_tree_map_eq,
723 basic_block entry_bb = entry->src;
724 basic_block exit_bb = exit->dest;
726 gather_blocks_in_sese_region (entry_bb, exit_bb, &body);
728 FOR_EACH_VEC_ELT (body, i, bb)
729 if (bb != entry_bb && bb != exit_bb)
730 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
731 if (is_gimple_debug (gsi_stmt (gsi)))
733 if (gimple_debug_bind_p (gsi_stmt (gsi)))
734 has_debug_stmt = true;
737 eliminate_local_variables_stmt (entry, &gsi, decl_address);
740 FOR_EACH_VEC_ELT (body, i, bb)
741 if (bb != entry_bb && bb != exit_bb)
742 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
743 if (gimple_debug_bind_p (gsi_stmt (gsi)))
744 eliminate_local_variables_stmt (entry, &gsi, decl_address);
746 htab_delete (decl_address);
750 /* Returns true if expression EXPR is not defined between ENTRY and
751 EXIT, i.e. if all its operands are defined outside of the region. */
754 expr_invariant_in_region_p (edge entry, edge exit, tree expr)
756 basic_block entry_bb = entry->src;
757 basic_block exit_bb = exit->dest;
760 if (is_gimple_min_invariant (expr))
763 if (TREE_CODE (expr) == SSA_NAME)
765 def_bb = gimple_bb (SSA_NAME_DEF_STMT (expr));
767 && dominated_by_p (CDI_DOMINATORS, def_bb, entry_bb)
768 && !dominated_by_p (CDI_DOMINATORS, def_bb, exit_bb))
777 /* If COPY_NAME_P is true, creates and returns a duplicate of NAME.
778 The copies are stored to NAME_COPIES, if NAME was already duplicated,
779 its duplicate stored in NAME_COPIES is returned.
781 Regardless of COPY_NAME_P, the decl used as a base of the ssa name is also
782 duplicated, storing the copies in DECL_COPIES. */
785 separate_decls_in_region_name (tree name,
786 htab_t name_copies, htab_t decl_copies,
789 tree copy, var, var_copy;
790 unsigned idx, uid, nuid;
791 struct int_tree_map ielt, *nielt;
792 struct name_to_copy_elt elt, *nelt;
793 void **slot, **dslot;
795 if (TREE_CODE (name) != SSA_NAME)
798 idx = SSA_NAME_VERSION (name);
800 slot = htab_find_slot_with_hash (name_copies, &elt, idx,
801 copy_name_p ? INSERT : NO_INSERT);
803 return ((struct name_to_copy_elt *) *slot)->new_name;
807 copy = duplicate_ssa_name (name, NULL);
808 nelt = XNEW (struct name_to_copy_elt);
810 nelt->new_name = copy;
811 nelt->field = NULL_TREE;
820 var = SSA_NAME_VAR (name);
824 uid = DECL_UID (var);
826 dslot = htab_find_slot_with_hash (decl_copies, &ielt, uid, INSERT);
829 var_copy = create_tmp_var (TREE_TYPE (var), get_name (var));
830 DECL_GIMPLE_REG_P (var_copy) = DECL_GIMPLE_REG_P (var);
831 nielt = XNEW (struct int_tree_map);
833 nielt->to = var_copy;
836 /* Ensure that when we meet this decl next time, we won't duplicate
838 nuid = DECL_UID (var_copy);
840 dslot = htab_find_slot_with_hash (decl_copies, &ielt, nuid, INSERT);
841 gcc_assert (!*dslot);
842 nielt = XNEW (struct int_tree_map);
844 nielt->to = var_copy;
848 var_copy = ((struct int_tree_map *) *dslot)->to;
850 replace_ssa_name_symbol (copy, var_copy);
854 /* Finds the ssa names used in STMT that are defined outside the
855 region between ENTRY and EXIT and replaces such ssa names with
856 their duplicates. The duplicates are stored to NAME_COPIES. Base
857 decls of all ssa names used in STMT (including those defined in
858 LOOP) are replaced with the new temporary variables; the
859 replacement decls are stored in DECL_COPIES. */
862 separate_decls_in_region_stmt (edge entry, edge exit, gimple stmt,
863 htab_t name_copies, htab_t decl_copies)
871 FOR_EACH_PHI_OR_STMT_DEF (def, stmt, oi, SSA_OP_DEF)
873 name = DEF_FROM_PTR (def);
874 gcc_assert (TREE_CODE (name) == SSA_NAME);
875 copy = separate_decls_in_region_name (name, name_copies, decl_copies,
877 gcc_assert (copy == name);
880 FOR_EACH_PHI_OR_STMT_USE (use, stmt, oi, SSA_OP_USE)
882 name = USE_FROM_PTR (use);
883 if (TREE_CODE (name) != SSA_NAME)
886 copy_name_p = expr_invariant_in_region_p (entry, exit, name);
887 copy = separate_decls_in_region_name (name, name_copies, decl_copies,
893 /* Finds the ssa names used in STMT that are defined outside the
894 region between ENTRY and EXIT and replaces such ssa names with
895 their duplicates. The duplicates are stored to NAME_COPIES. Base
896 decls of all ssa names used in STMT (including those defined in
897 LOOP) are replaced with the new temporary variables; the
898 replacement decls are stored in DECL_COPIES. */
901 separate_decls_in_region_debug (gimple stmt, htab_t name_copies,
907 struct int_tree_map ielt;
908 struct name_to_copy_elt elt;
909 void **slot, **dslot;
911 if (gimple_debug_bind_p (stmt))
912 var = gimple_debug_bind_get_var (stmt);
913 else if (gimple_debug_source_bind_p (stmt))
914 var = gimple_debug_source_bind_get_var (stmt);
917 if (TREE_CODE (var) == DEBUG_EXPR_DECL || TREE_CODE (var) == LABEL_DECL)
919 gcc_assert (DECL_P (var) && SSA_VAR_P (var));
920 ielt.uid = DECL_UID (var);
921 dslot = htab_find_slot_with_hash (decl_copies, &ielt, ielt.uid, NO_INSERT);
924 if (gimple_debug_bind_p (stmt))
925 gimple_debug_bind_set_var (stmt, ((struct int_tree_map *) *dslot)->to);
926 else if (gimple_debug_source_bind_p (stmt))
927 gimple_debug_source_bind_set_var (stmt, ((struct int_tree_map *) *dslot)->to);
929 FOR_EACH_PHI_OR_STMT_USE (use, stmt, oi, SSA_OP_USE)
931 name = USE_FROM_PTR (use);
932 if (TREE_CODE (name) != SSA_NAME)
935 elt.version = SSA_NAME_VERSION (name);
936 slot = htab_find_slot_with_hash (name_copies, &elt, elt.version, NO_INSERT);
939 gimple_debug_bind_reset_value (stmt);
944 SET_USE (use, ((struct name_to_copy_elt *) *slot)->new_name);
950 /* Callback for htab_traverse. Adds a field corresponding to the reduction
951 specified in SLOT. The type is passed in DATA. */
954 add_field_for_reduction (void **slot, void *data)
957 struct reduction_info *const red = (struct reduction_info *) *slot;
958 tree const type = (tree) data;
959 tree var = gimple_assign_lhs (red->reduc_stmt);
960 tree field = build_decl (gimple_location (red->reduc_stmt), FIELD_DECL,
961 SSA_NAME_IDENTIFIER (var), TREE_TYPE (var));
963 insert_field_into_struct (type, field);
970 /* Callback for htab_traverse. Adds a field corresponding to a ssa name
971 described in SLOT. The type is passed in DATA. */
974 add_field_for_name (void **slot, void *data)
976 struct name_to_copy_elt *const elt = (struct name_to_copy_elt *) *slot;
977 tree type = (tree) data;
978 tree name = ssa_name (elt->version);
979 tree field = build_decl (UNKNOWN_LOCATION,
980 FIELD_DECL, SSA_NAME_IDENTIFIER (name),
983 insert_field_into_struct (type, field);
989 /* Callback for htab_traverse. A local result is the intermediate result
991 thread, or the initial value in case no iteration was executed.
992 This function creates a phi node reflecting these values.
993 The phi's result will be stored in NEW_PHI field of the
994 reduction's data structure. */
997 create_phi_for_local_result (void **slot, void *data)
999 struct reduction_info *const reduc = (struct reduction_info *) *slot;
1000 const struct loop *const loop = (const struct loop *) data;
1003 basic_block store_bb;
1005 source_location locus;
1007 /* STORE_BB is the block where the phi
1008 should be stored. It is the destination of the loop exit.
1009 (Find the fallthru edge from GIMPLE_OMP_CONTINUE). */
1010 store_bb = FALLTHRU_EDGE (loop->latch)->dest;
1012 /* STORE_BB has two predecessors. One coming from the loop
1013 (the reduction's result is computed at the loop),
1014 and another coming from a block preceding the loop,
1016 are executed (the initial value should be taken). */
1017 if (EDGE_PRED (store_bb, 0) == FALLTHRU_EDGE (loop->latch))
1018 e = EDGE_PRED (store_bb, 1);
1020 e = EDGE_PRED (store_bb, 0);
1021 local_res = copy_ssa_name (gimple_assign_lhs (reduc->reduc_stmt), NULL);
1022 locus = gimple_location (reduc->reduc_stmt);
1023 new_phi = create_phi_node (local_res, store_bb);
1024 add_phi_arg (new_phi, reduc->init, e, locus);
1025 add_phi_arg (new_phi, gimple_assign_lhs (reduc->reduc_stmt),
1026 FALLTHRU_EDGE (loop->latch), locus);
1027 reduc->new_phi = new_phi;
1037 basic_block store_bb;
1038 basic_block load_bb;
1041 /* Callback for htab_traverse. Create an atomic instruction for the
1042 reduction described in SLOT.
1043 DATA annotates the place in memory the atomic operation relates to,
1044 and the basic block it needs to be generated in. */
1047 create_call_for_reduction_1 (void **slot, void *data)
1049 struct reduction_info *const reduc = (struct reduction_info *) *slot;
1050 struct clsn_data *const clsn_data = (struct clsn_data *) data;
1051 gimple_stmt_iterator gsi;
1052 tree type = TREE_TYPE (PHI_RESULT (reduc->reduc_phi));
1057 tree t, addr, ref, x;
1058 tree tmp_load, name;
1061 load_struct = build_simple_mem_ref (clsn_data->load);
1062 t = build3 (COMPONENT_REF, type, load_struct, reduc->field, NULL_TREE);
1064 addr = build_addr (t, current_function_decl);
1066 /* Create phi node. */
1067 bb = clsn_data->load_bb;
1069 e = split_block (bb, t);
1072 tmp_load = create_tmp_var (TREE_TYPE (TREE_TYPE (addr)), NULL);
1073 tmp_load = make_ssa_name (tmp_load, NULL);
1074 load = gimple_build_omp_atomic_load (tmp_load, addr);
1075 SSA_NAME_DEF_STMT (tmp_load) = load;
1076 gsi = gsi_start_bb (new_bb);
1077 gsi_insert_after (&gsi, load, GSI_NEW_STMT);
1079 e = split_block (new_bb, load);
1081 gsi = gsi_start_bb (new_bb);
1083 x = fold_build2 (reduc->reduction_code,
1084 TREE_TYPE (PHI_RESULT (reduc->new_phi)), ref,
1085 PHI_RESULT (reduc->new_phi));
1087 name = force_gimple_operand_gsi (&gsi, x, true, NULL_TREE, true,
1088 GSI_CONTINUE_LINKING);
1090 gsi_insert_after (&gsi, gimple_build_omp_atomic_store (name), GSI_NEW_STMT);
1094 /* Create the atomic operation at the join point of the threads.
1095 REDUCTION_LIST describes the reductions in the LOOP.
1096 LD_ST_DATA describes the shared data structure where
1097 shared data is stored in and loaded from. */
1099 create_call_for_reduction (struct loop *loop, htab_t reduction_list,
1100 struct clsn_data *ld_st_data)
1102 htab_traverse (reduction_list, create_phi_for_local_result, loop);
1103 /* Find the fallthru edge from GIMPLE_OMP_CONTINUE. */
1104 ld_st_data->load_bb = FALLTHRU_EDGE (loop->latch)->dest;
1105 htab_traverse (reduction_list, create_call_for_reduction_1, ld_st_data);
1108 /* Callback for htab_traverse. Loads the final reduction value at the
1109 join point of all threads, and inserts it in the right place. */
1112 create_loads_for_reductions (void **slot, void *data)
1114 struct reduction_info *const red = (struct reduction_info *) *slot;
1115 struct clsn_data *const clsn_data = (struct clsn_data *) data;
1117 gimple_stmt_iterator gsi;
1118 tree type = TREE_TYPE (gimple_assign_lhs (red->reduc_stmt));
1123 gsi = gsi_after_labels (clsn_data->load_bb);
1124 load_struct = build_simple_mem_ref (clsn_data->load);
1125 load_struct = build3 (COMPONENT_REF, type, load_struct, red->field,
1129 name = PHI_RESULT (red->keep_res);
1130 stmt = gimple_build_assign (name, x);
1131 SSA_NAME_DEF_STMT (name) = stmt;
1133 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1135 for (gsi = gsi_start_phis (gimple_bb (red->keep_res));
1136 !gsi_end_p (gsi); gsi_next (&gsi))
1137 if (gsi_stmt (gsi) == red->keep_res)
1139 remove_phi_node (&gsi, false);
1145 /* Load the reduction result that was stored in LD_ST_DATA.
1146 REDUCTION_LIST describes the list of reductions that the
1147 loads should be generated for. */
1149 create_final_loads_for_reduction (htab_t reduction_list,
1150 struct clsn_data *ld_st_data)
1152 gimple_stmt_iterator gsi;
1156 gsi = gsi_after_labels (ld_st_data->load_bb);
1157 t = build_fold_addr_expr (ld_st_data->store);
1158 stmt = gimple_build_assign (ld_st_data->load, t);
1160 gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
1161 SSA_NAME_DEF_STMT (ld_st_data->load) = stmt;
1163 htab_traverse (reduction_list, create_loads_for_reductions, ld_st_data);
1167 /* Callback for htab_traverse. Store the neutral value for the
1168 particular reduction's operation, e.g. 0 for PLUS_EXPR,
1169 1 for MULT_EXPR, etc. into the reduction field.
1170 The reduction is specified in SLOT. The store information is
1174 create_stores_for_reduction (void **slot, void *data)
1176 struct reduction_info *const red = (struct reduction_info *) *slot;
1177 struct clsn_data *const clsn_data = (struct clsn_data *) data;
1180 gimple_stmt_iterator gsi;
1181 tree type = TREE_TYPE (gimple_assign_lhs (red->reduc_stmt));
1183 gsi = gsi_last_bb (clsn_data->store_bb);
1184 t = build3 (COMPONENT_REF, type, clsn_data->store, red->field, NULL_TREE);
1185 stmt = gimple_build_assign (t, red->initial_value);
1186 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1191 /* Callback for htab_traverse. Creates loads to a field of LOAD in LOAD_BB and
1192 store to a field of STORE in STORE_BB for the ssa name and its duplicate
1193 specified in SLOT. */
/* NOTE(review): the return type, braces, and declarations of T, STMT and
   LOAD_STRUCT fall in lines omitted from this excerpt.  */
1196 create_loads_and_stores_for_name (void **slot, void *data)
1198 struct name_to_copy_elt *const elt = (struct name_to_copy_elt *) *slot;
1199 struct clsn_data *const clsn_data = (struct clsn_data *) data;
1202 gimple_stmt_iterator gsi;
1203 tree type = TREE_TYPE (elt->new_name);
/* In the store block (before the parallel region): STORE.field = version.  */
1206 gsi = gsi_last_bb (clsn_data->store_bb);
1207 t = build3 (COMPONENT_REF, type, clsn_data->store, elt->field, NULL_TREE);
1208 stmt = gimple_build_assign (t, ssa_name (elt->version));
1209 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
/* In the load block (inside the outlined region): new_name = (*LOAD).field.  */
1211 gsi = gsi_last_bb (clsn_data->load_bb);
1212 load_struct = build_simple_mem_ref (clsn_data->load);
1213 t = build3 (COMPONENT_REF, type, load_struct, elt->field, NULL_TREE);
1214 stmt = gimple_build_assign (elt->new_name, t);
1215 SSA_NAME_DEF_STMT (elt->new_name) = stmt;
1216 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1221 /* Moves all the variables used in LOOP and defined outside of it (including
1222 the initial values of loop phi nodes, and *PER_THREAD if it is a ssa
1223 name) to a structure created for this purpose. The code
1231 is transformed this way:
1246 `old' is stored to *ARG_STRUCT and `new' is stored to NEW_ARG_STRUCT. The
1247 pointer `new' is intentionally not initialized (the loop will be split to a
1248 separate function later, and `new' will be initialized from its arguments).
1249 LD_ST_DATA holds information about the shared data structure used to pass
1250 information among the threads. It is initialized here, and
1251 gen_parallel_loop will pass it to create_call_for_reduction that
1252 needs this information. REDUCTION_LIST describes the reductions
/* NOTE(review): this excerpt omits several original lines (return type,
   braces, some declarations and early returns) — gaps in the numbering.  */
1256 separate_decls_in_region (edge entry, edge exit, htab_t reduction_list,
1257 tree *arg_struct, tree *new_arg_struct,
1258 struct clsn_data *ld_st_data)
/* Split the entry edge: BB0 (before the split) will receive the stores to
   the shared structure, BB1 (after it) the loads from it.  */
1261 basic_block bb1 = split_edge (entry);
1262 basic_block bb0 = single_pred (bb1);
1263 htab_t name_copies = htab_create (10, name_to_copy_elt_hash,
1264 name_to_copy_elt_eq, free);
1265 htab_t decl_copies = htab_create (10, int_tree_map_hash, int_tree_map_eq,
1268 tree type, type_name, nvar;
1269 gimple_stmt_iterator gsi;
1270 struct clsn_data clsn_data;
1271 vec<basic_block> body;
1274 basic_block entry_bb = bb1;
1275 basic_block exit_bb = exit->dest;
1276 bool has_debug_stmt = false;
1278 entry = single_succ_edge (entry_bb);
1279 gather_blocks_in_sese_region (entry_bb, exit_bb, &body);
/* First pass: record every name/decl used in the region that is defined
   outside it, for both phi nodes and ordinary statements.  */
1281 FOR_EACH_VEC_ELT (body, i, bb)
1283 if (bb != entry_bb && bb != exit_bb)
1285 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1286 separate_decls_in_region_stmt (entry, exit, gsi_stmt (gsi),
1287 name_copies, decl_copies)
1289 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1291 gimple stmt = gsi_stmt (gsi);
1293 if (is_gimple_debug (stmt))
1294 has_debug_stmt = true;
1296 separate_decls_in_region_stmt (entry, exit, stmt,
1297 name_copies, decl_copies);
1302 /* Now process debug bind stmts. We must not create decls while
1303 processing debug stmts, so we defer their processing so as to
1304 make sure we will have debug info for as many variables as
1305 possible (all of those that were dealt with in the loop above),
1306 and discard those for which we know there's nothing we can
1309 FOR_EACH_VEC_ELT (body, i, bb)
1310 if (bb != entry_bb && bb != exit_bb)
1312 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
1314 gimple stmt = gsi_stmt (gsi);
1316 if (is_gimple_debug (stmt))
/* Debug binds that cannot be remapped are removed rather than kept stale.  */
1318 if (separate_decls_in_region_debug (stmt, name_copies,
1321 gsi_remove (&gsi, true);
1332 if (htab_elements (name_copies) == 0 && htab_elements (reduction_list) == 0)
1334 /* It may happen that there is nothing to copy (if there are only
1335 loop carried and external variables in the loop). */
1337 *new_arg_struct = NULL;
1341 /* Create the type for the structure to store the ssa names to. */
1342 type = lang_hooks.types.make_type (RECORD_TYPE);
1343 type_name = build_decl (UNKNOWN_LOCATION,
1344 TYPE_DECL, create_tmp_var_name (".paral_data"),
1346 TYPE_NAME (type) = type_name;
/* One field per copied ssa name, plus one per reduction.  */
1348 htab_traverse (name_copies, add_field_for_name, type);
1349 if (reduction_list && htab_elements (reduction_list) > 0)
1351 /* Create the fields for reductions. */
1352 htab_traverse (reduction_list, add_field_for_reduction,
1357 /* Create the loads and stores. */
1358 *arg_struct = create_tmp_var (type, ".paral_data_store");
1359 nvar = create_tmp_var (build_pointer_type (type), ".paral_data_load");
1360 *new_arg_struct = make_ssa_name (nvar, NULL);
1362 ld_st_data->store = *arg_struct;
1363 ld_st_data->load = *new_arg_struct;
1364 ld_st_data->store_bb = bb0;
1365 ld_st_data->load_bb = bb1;
1367 htab_traverse (name_copies, create_loads_and_stores_for_name,
1370 /* Load the calculation from memory (after the join of the threads). */
1372 if (reduction_list && htab_elements (reduction_list) > 0)
1374 htab_traverse (reduction_list, create_stores_for_reduction,
/* A second clsn_data is used for the post-join loads: same store structure,
   but the load block is the exit destination, after the threads join.  */
1376 clsn_data.load = make_ssa_name (nvar, NULL);
1377 clsn_data.load_bb = exit->dest;
1378 clsn_data.store = ld_st_data->store;
1379 create_final_loads_for_reduction (reduction_list, &clsn_data);
1383 htab_delete (decl_copies);
1384 htab_delete (name_copies);
1387 /* Bitmap containing uids of functions created by parallelization. We cannot
1388 allocate it from the default obstack, as it must live across compilation
1389 of several functions; we make it gc allocated instead. */
1391 static GTY(()) bitmap parallelized_functions;
1393 /* Returns true if FN was created by create_loop_fn. */
/* NOTE(review): the return type and the `return false;' for the guard
   below are in lines omitted from this excerpt.  */
1396 parallelized_function_p (tree fn)
/* Cheap negative checks first: no bitmap yet, or FN not artificial.  */
1398 if (!parallelized_functions || !DECL_ARTIFICIAL (fn))
1401 return bitmap_bit_p (parallelized_functions, DECL_UID (fn));
1404 /* Creates and returns an empty function that will receive the body of
1405 a parallelized loop. */
/* NOTE(review): return type, braces, and the declarations of BUF and TNAME
   (the name-formatting buffers) are in lines omitted from this excerpt.  */
1408 create_loop_fn (location_t loc)
1412 tree decl, type, name, t;
1413 struct function *act_cfun = cfun;
/* Static counter makes each generated loop function name unique.  */
1414 static unsigned loopfn_num;
1416 loc = LOCATION_LOCUS (loc);
1417 snprintf (buf, 100, "%s.$loopfn", current_function_name ());
1418 ASM_FORMAT_PRIVATE_NAME (tname, buf, loopfn_num++);
1419 clean_symbol_name (tname);
1420 name = get_identifier (tname);
/* The outlined body has signature `void fn (void *)' — it receives the
   address of the shared .paral_data structure.  */
1421 type = build_function_type_list (void_type_node, ptr_type_node, NULL_TREE);
1423 decl = build_decl (loc, FUNCTION_DECL, name, type);
1424 if (!parallelized_functions)
1425 parallelized_functions = BITMAP_GGC_ALLOC ();
/* Record the uid so parallelized_function_p can later skip this function.  */
1426 bitmap_set_bit (parallelized_functions, DECL_UID (decl));
1428 TREE_STATIC (decl) = 1;
1429 TREE_USED (decl) = 1;
1430 DECL_ARTIFICIAL (decl) = 1;
1431 DECL_IGNORED_P (decl) = 0;
1432 TREE_PUBLIC (decl) = 0;
1433 DECL_UNINLINABLE (decl) = 1;
1434 DECL_EXTERNAL (decl) = 0;
1435 DECL_CONTEXT (decl) = NULL_TREE;
1436 DECL_INITIAL (decl) = make_node (BLOCK);
/* void result.  */
1438 t = build_decl (loc, RESULT_DECL, NULL_TREE, void_type_node);
1439 DECL_ARTIFICIAL (t) = 1;
1440 DECL_IGNORED_P (t) = 1;
1441 DECL_RESULT (decl) = t;
/* Single parameter: the pointer to the shared data structure.  */
1443 t = build_decl (loc, PARM_DECL, get_identifier (".paral_data_param"),
1445 DECL_ARTIFICIAL (t) = 1;
1446 DECL_ARG_TYPE (t) = ptr_type_node;
1447 DECL_CONTEXT (t) = decl;
1449 DECL_ARGUMENTS (decl) = t;
1451 allocate_struct_function (decl, false);
1453 /* The call to allocate_struct_function clobbers CFUN, so we need to restore
1455 set_cfun (act_cfun);
1460 /* Moves the exit condition of LOOP to the beginning of its header, and
1461 duplicates the part of the last iteration that gets disabled to the
1462 exit of the loop. NIT is the number of iterations of the loop
1463 (used to initialize the variables in the duplicated part).
1465 TODO: the common case is that latch of the loop is empty and immediately
1466 follows the loop exit. In this case, it would be better not to copy the
1467 body of the loop, but only move the entry of the loop directly before the
1468 exit check and increase the number of iterations of the loop by one.
1469 This may need some additional preconditioning in case NIT = ~0.
1470 REDUCTION_LIST describes the reductions in LOOP. */
/* NOTE(review): gaps in the numbering hide the return type, braces, some
   declarations (e.g. n, ok, nit_1) and several control-flow lines
   (continues/asserts) of this function.  */
1473 transform_to_exit_first_loop (struct loop *loop, htab_t reduction_list, tree nit)
1475 basic_block *bbs, *nbbs, ex_bb, orig_header;
1478 edge exit = single_dom_exit (loop), hpred;
1479 tree control, control_name, res, t;
1480 gimple phi, nphi, cond_stmt, stmt, cond_nit;
1481 gimple_stmt_iterator gsi;
/* Detach the header's real contents into a new block so the exit test can
   become the first thing the header does.  */
1484 split_block_after_labels (loop->header);
1485 orig_header = single_succ (loop->header);
1486 hpred = single_succ_edge (loop->header);
1488 cond_stmt = last_stmt (exit->src);
1489 control = gimple_cond_lhs (cond_stmt);
1490 gcc_assert (gimple_cond_rhs (cond_stmt) == nit);
1492 /* Make sure that we have phi nodes on exit for all loop header phis
1493 (create_parallel_loop requires that). */
1494 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1496 phi = gsi_stmt (gsi);
1497 res = PHI_RESULT (phi);
/* Rename the header phi's result and move the old name to a fresh phi in
   ORIG_HEADER, fed by the renamed value along HPRED.  */
1498 t = copy_ssa_name (res, phi);
1499 SET_PHI_RESULT (phi, t);
1500 nphi = create_phi_node (res, orig_header);
1501 add_phi_arg (nphi, t, hpred, UNKNOWN_LOCATION);
/* Keep the exit condition referring to the renamed control variable —
   NOTE(review): the guard selecting the control phi is not visible here.  */
1505 gimple_cond_set_lhs (cond_stmt, t);
1506 update_stmt (cond_stmt);
1511 bbs = get_loop_body_in_dom_order (loop);
/* N = number of body blocks dominated before the exit source.  */
1513 for (n = 0; bbs[n] != exit->src; n++)
1515 nbbs = XNEWVEC (basic_block, n);
1516 ok = gimple_duplicate_sese_tail (single_succ_edge (loop->header), exit,
1523 /* Other than reductions, the only gimple reg that should be copied
1524 out of the loop is the control variable. */
1525 exit = single_dom_exit (loop);
1526 control_name = NULL_TREE;
1527 for (gsi = gsi_start_phis (ex_bb); !gsi_end_p (gsi); )
1529 phi = gsi_stmt (gsi);
1530 res = PHI_RESULT (phi);
1531 if (virtual_operand_p (res))
1537 /* Check if it is a part of reduction. If it is,
1538 keep the phi at the reduction's keep_res field. The
1539 PHI_RESULT of this phi is the resulting value of the reduction
1540 variable when exiting the loop. */
1542 if (htab_elements (reduction_list) > 0)
1544 struct reduction_info *red;
1546 tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);
1547 red = reduction_phi (reduction_list, SSA_NAME_DEF_STMT (val));
1550 red->keep_res = phi;
/* Anything else must be the (single) control variable; remember its name
   and delete the phi.  */
1555 gcc_assert (control_name == NULL_TREE
1556 && SSA_NAME_VAR (res) == SSA_NAME_VAR (control));
1558 remove_phi_node (&gsi, false);
1560 gcc_assert (control_name != NULL_TREE);
1562 /* Initialize the control variable to number of iterations
1563 according to the rhs of the exit condition. */
1564 gsi = gsi_after_labels (ex_bb);
1565 cond_nit = last_stmt (exit->src);
1566 nit_1 = gimple_cond_rhs (cond_nit);
1567 nit_1 = force_gimple_operand_gsi (&gsi,
1568 fold_convert (TREE_TYPE (control_name), nit_1),
1569 false, NULL_TREE, false, GSI_SAME_STMT);
1570 stmt = gimple_build_assign (control_name, nit_1);
1571 gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
1572 SSA_NAME_DEF_STMT (control_name) = stmt;
1575 /* Create the parallel constructs for LOOP as described in gen_parallel_loop.
1576 LOOP_FN and DATA are the arguments of GIMPLE_OMP_PARALLEL.
1577 NEW_DATA is the variable that should be initialized from the argument
1578 of LOOP_FN. N_THREADS is the requested number of threads. Returns the
1579 basic block containing GIMPLE_OMP_PARALLEL tree. */
/* NOTE(review): the return type, braces, and declarations of several locals
   (t, param, def, the final return statement) lie in lines omitted from
   this excerpt.  */
1582 create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
1583 tree new_data, unsigned n_threads, location_t loc)
1585 gimple_stmt_iterator gsi;
1586 basic_block bb, paral_bb, for_bb, ex_bb;
1588 gimple stmt, for_stmt, phi, cond_stmt;
1589 tree cvar, cvar_init, initvar, cvar_next, cvar_base, type;
1590 edge exit, nexit, guard, end, e;
1592 /* Prepare the GIMPLE_OMP_PARALLEL statement. */
1593 bb = loop_preheader_edge (loop)->src;
1594 paral_bb = single_pred (bb);
1595 gsi = gsi_last_bb (paral_bb);
/* num_threads(N_THREADS) clause on the parallel region.  */
1597 t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
1598 OMP_CLAUSE_NUM_THREADS_EXPR (t)
1599 = build_int_cst (integer_type_node, n_threads);
1600 stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
1601 gimple_set_location (stmt, loc);
1603 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1605 /* Initialize NEW_DATA. */
1608 gsi = gsi_after_labels (bb);
/* NEW_DATA = (cast) &DATA, via the outlined function's parameter.  */
1610 param = make_ssa_name (DECL_ARGUMENTS (loop_fn), NULL);
1611 stmt = gimple_build_assign (param, build_fold_addr_expr (data));
1612 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1613 SSA_NAME_DEF_STMT (param) = stmt;
1615 stmt = gimple_build_assign (new_data,
1616 fold_convert (TREE_TYPE (new_data), param));
1617 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1618 SSA_NAME_DEF_STMT (new_data) = stmt;
1621 /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_PARALLEL. */
1622 bb = split_loop_exit_edge (single_dom_exit (loop));
1623 gsi = gsi_last_bb (bb);
1624 stmt = gimple_build_omp_return (false);
1625 gimple_set_location (stmt, loc);
1626 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1628 /* Extract data for GIMPLE_OMP_FOR. */
1629 gcc_assert (loop->header == single_dom_exit (loop)->src);
1630 cond_stmt = last_stmt (loop->header);
/* The single canonical IV: its phi gives the initial and step values.  */
1632 cvar = gimple_cond_lhs (cond_stmt);
1633 cvar_base = SSA_NAME_VAR (cvar);
1634 phi = SSA_NAME_DEF_STMT (cvar);
1635 cvar_init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1636 initvar = copy_ssa_name (cvar, NULL);
1637 SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE (phi, loop_preheader_edge (loop)),
1639 cvar_next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
/* The IV increment in the latch is replaced by GIMPLE_OMP_CONTINUE below.  */
1641 gsi = gsi_last_nondebug_bb (loop->latch);
1642 gcc_assert (gsi_stmt (gsi) == SSA_NAME_DEF_STMT (cvar_next));
1643 gsi_remove (&gsi, true);
/* Rewire the CFG: FOR_BB holds the OMP_FOR, EX_BB collects both the guard
   (zero-trip) edge and the loop-end edge.  */
1646 for_bb = split_edge (loop_preheader_edge (loop));
1647 ex_bb = split_loop_exit_edge (single_dom_exit (loop));
1648 extract_true_false_edges_from_block (loop->header, &nexit, &exit);
1649 gcc_assert (exit == single_dom_exit (loop));
1651 guard = make_edge (for_bb, ex_bb, 0);
1652 single_succ_edge (loop->latch)->flags = 0;
1653 end = make_edge (loop->latch, ex_bb, EDGE_FALLTHRU);
1654 for (gsi = gsi_start_phis (ex_bb); !gsi_end_p (gsi); gsi_next (&gsi))
1656 source_location locus;
1658 phi = gsi_stmt (gsi);
1659 stmt = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, exit));
/* Feed the exit phi from the header phi's preheader value along the guard
   edge, and from its latch value along the loop-end edge.  */
1661 def = PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
1662 locus = gimple_phi_arg_location_from_edge (stmt,
1663 loop_preheader_edge (loop));
1664 add_phi_arg (phi, def, guard, locus);
1666 def = PHI_ARG_DEF_FROM_EDGE (stmt, loop_latch_edge (loop));
1667 locus = gimple_phi_arg_location_from_edge (stmt, loop_latch_edge (loop));
1668 add_phi_arg (phi, def, end, locus);
1670 e = redirect_edge_and_branch (exit, nexit->dest);
1671 PENDING_STMT (e) = NULL;
1673 /* Emit GIMPLE_OMP_FOR. */
1674 gimple_cond_set_lhs (cond_stmt, cvar_base);
1675 type = TREE_TYPE (cvar);
1676 t = build_omp_clause (loc, OMP_CLAUSE_SCHEDULE);
1677 OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_STATIC;
1679 for_stmt = gimple_build_omp_for (NULL, t, 1, NULL);
1680 gimple_set_location (for_stmt, loc);
1681 gimple_omp_for_set_index (for_stmt, 0, initvar);
1682 gimple_omp_for_set_initial (for_stmt, 0, cvar_init);
1683 gimple_omp_for_set_final (for_stmt, 0, gimple_cond_rhs (cond_stmt));
1684 gimple_omp_for_set_cond (for_stmt, 0, gimple_cond_code (cond_stmt));
1685 gimple_omp_for_set_incr (for_stmt, 0, build2 (PLUS_EXPR, type,
1687 build_int_cst (type, 1)));
1689 gsi = gsi_last_bb (for_bb);
1690 gsi_insert_after (&gsi, for_stmt, GSI_NEW_STMT);
1691 SSA_NAME_DEF_STMT (initvar) = for_stmt;
1693 /* Emit GIMPLE_OMP_CONTINUE. */
1694 gsi = gsi_last_bb (loop->latch);
1695 stmt = gimple_build_omp_continue (cvar_next, cvar);
1696 gimple_set_location (stmt, loc);
1697 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1698 SSA_NAME_DEF_STMT (cvar_next) = stmt;
1700 /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_FOR. */
1701 gsi = gsi_last_bb (ex_bb);
1702 stmt = gimple_build_omp_return (true);
1703 gimple_set_location (stmt, loc);
1704 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1706 /* After the above dom info is hosed. Re-compute it. */
1707 free_dominance_info (CDI_DOMINATORS);
1708 calculate_dominance_info (CDI_DOMINATORS);
1713 /* Generates code to execute the iterations of LOOP in N_THREADS
1714 threads in parallel.
1716 NITER describes number of iterations of LOOP.
1717 REDUCTION_LIST describes the reductions existent in the LOOP. */
/* NOTE(review): return type, braces, and several declarations/statements
   (loc, cond_stmt, stmts, entry, exit, prob, li, and the profitability
   branch around m_p_thread) fall in lines omitted from this excerpt.  */
1720 gen_parallel_loop (struct loop *loop, htab_t reduction_list,
1721 unsigned n_threads, struct tree_niter_desc *niter)
1724 tree many_iterations_cond, type, nit;
1725 tree arg_struct, new_arg_struct;
1727 basic_block parallel_head;
1729 struct clsn_data clsn_data;
/* Minimum iterations per thread used by the profitability test below.  */
1733 unsigned int m_p_thread=2;
1737 ---------------------------------------------------------------------
1740 IV = phi (INIT, IV + STEP)
1746 ---------------------------------------------------------------------
1748 with # of iterations NITER (possibly with MAY_BE_ZERO assumption),
1749 we generate the following code:
1751 ---------------------------------------------------------------------
1754 || NITER < MIN_PER_THREAD * N_THREADS)
1758 store all local loop-invariant variables used in body of the loop to DATA.
1759 GIMPLE_OMP_PARALLEL (OMP_CLAUSE_NUM_THREADS (N_THREADS), LOOPFN, DATA);
1760 load the variables from DATA.
1761 GIMPLE_OMP_FOR (IV = INIT; COND; IV += STEP) (OMP_CLAUSE_SCHEDULE (static))
1764 GIMPLE_OMP_CONTINUE;
1765 GIMPLE_OMP_RETURN -- GIMPLE_OMP_FOR
1766 GIMPLE_OMP_RETURN -- GIMPLE_OMP_PARALLEL
1772 IV = phi (INIT, IV + STEP)
1783 /* Create two versions of the loop -- in the old one, we know that the
1784 number of iterations is large enough, and we will transform it into the
1785 loop that will be split to loop_fn, the new one will be used for the
1786 remaining iterations. */
1788 /* We should compute a better number-of-iterations value for outer loops.
1791 for (i = 0; i < n; ++i)
1792 for (j = 0; j < m; ++j)
1795 we should compute nit = n * m, not nit = n.
1796 Also may_be_zero handling would need to be adjusted. */
1798 type = TREE_TYPE (niter->niter);
1799 nit = force_gimple_operand (unshare_expr (niter->niter), &stmts, true,
1802 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
/* NOTE(review): the branch choosing between 2 and MIN_PER_THREAD is not
   fully visible here.  */
1807 m_p_thread=MIN_PER_THREAD;
/* Guard: run in parallel only when NITER >= m_p_thread * N_THREADS and
   the may_be_zero assumption does not hold.  */
1809 many_iterations_cond =
1810 fold_build2 (GE_EXPR, boolean_type_node,
1811 nit, build_int_cst (type, m_p_thread * n_threads));
1813 many_iterations_cond
1814 = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1815 invert_truthvalue (unshare_expr (niter->may_be_zero)),
1816 many_iterations_cond);
1817 many_iterations_cond
1818 = force_gimple_operand (many_iterations_cond, &stmts, false, NULL_TREE);
1820 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
1821 if (!is_gimple_condexpr (many_iterations_cond))
1823 many_iterations_cond
1824 = force_gimple_operand (many_iterations_cond, &stmts,
1827 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
1830 initialize_original_copy_tables ();
1832 /* We assume that the loop usually iterates a lot. */
1833 prob = 4 * REG_BR_PROB_BASE / 5;
1834 loop_version (loop, many_iterations_cond, NULL,
1835 prob, prob, REG_BR_PROB_BASE - prob, true);
1836 update_ssa (TODO_update_ssa);
1837 free_original_copy_tables ();
1839 /* Base all the induction variables in LOOP on a single control one. */
1840 canonicalize_loop_ivs (loop, &nit, true);
1842 /* Ensure that the exit condition is the first statement in the loop. */
1843 transform_to_exit_first_loop (loop, reduction_list, nit);
1845 /* Generate initializations for reductions. */
1846 if (htab_elements (reduction_list) > 0)
1847 htab_traverse (reduction_list, initialize_reductions, loop);
1849 /* Eliminate the references to local variables from the loop. */
1850 gcc_assert (single_exit (loop));
1851 entry = loop_preheader_edge (loop);
1852 exit = single_dom_exit (loop);
1854 eliminate_local_variables (entry, exit);
1855 /* In the old loop, move all variables non-local to the loop to a structure
1856 and back, and create separate decls for the variables used in loop. */
1857 separate_decls_in_region (entry, exit, reduction_list, &arg_struct,
1858 &new_arg_struct, &clsn_data);
1860 /* Create the parallel constructs. */
1861 loc = UNKNOWN_LOCATION;
1862 cond_stmt = last_stmt (loop->header);
1864 loc = gimple_location (cond_stmt);
1865 parallel_head = create_parallel_loop (loop, create_loop_fn (loc), arg_struct,
1866 new_arg_struct, n_threads, loc);
1867 if (htab_elements (reduction_list) > 0)
1868 create_call_for_reduction (loop, reduction_list, &clsn_data);
1872 /* Cancel the loop (it is simpler to do it here rather than to teach the
1873 expander to do it). */
1874 cancel_loop_tree (loop);
1876 /* Free loop bound estimations that could contain references to
1877 removed statements. */
1878 FOR_EACH_LOOP (li, loop, 0)
1879 free_numbers_of_iterations_estimates_loop (loop);
1881 /* Expand the parallel constructs. We do it directly here instead of running
1882 a separate expand_omp pass, since it is more efficient, and less likely to
1883 cause troubles with further analyses not being able to deal with the
1886 omp_expand_local (parallel_head);
1889 /* Returns true when LOOP contains vector phi nodes. */
/* NOTE(review): the return type, declaration of `i', the `res = true'/
   `res = false' bookkeeping, the free of BBS, and the return statement
   are in lines omitted from this excerpt.  */
1892 loop_has_vector_phi_nodes (struct loop *loop ATTRIBUTE_UNUSED)
1895 basic_block *bbs = get_loop_body_in_dom_order (loop);
1896 gimple_stmt_iterator gsi;
/* Scan every phi in every loop block for a vector-typed result.  */
1899 for (i = 0; i < loop->num_nodes; i++)
1900 for (gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
1901 if (TREE_CODE (TREE_TYPE (PHI_RESULT (gsi_stmt (gsi)))) == VECTOR_TYPE)
1910 /* Create a reduction_info struct, initialize it with REDUC_STMT
1911 and PHI, insert it to the REDUCTION_LIST. */
/* NOTE(review): return type, braces and the declaration of SLOT (PTR_P)
   are in lines omitted from this excerpt.  */
1914 build_new_reduction (htab_t reduction_list, gimple reduc_stmt, gimple phi)
1917 struct reduction_info *new_reduction;
1919 gcc_assert (reduc_stmt);
1921 if (dump_file && (dump_flags & TDF_DETAILS))
1924 "Detected reduction. reduction stmt is: \n");
1925 print_gimple_stmt (dump_file, reduc_stmt, 0, 0);
1926 fprintf (dump_file, "\n");
/* Zero-initialized; fields not set here keep their null defaults.  */
1929 new_reduction = XCNEW (struct reduction_info);
1931 new_reduction->reduc_stmt = reduc_stmt;
1932 new_reduction->reduc_phi = phi;
/* The phi result's SSA version doubles as the hash key / uid.  */
1933 new_reduction->reduc_version = SSA_NAME_VERSION (gimple_phi_result (phi));
1934 new_reduction->reduction_code = gimple_assign_rhs_code (reduc_stmt);
1935 slot = htab_find_slot (reduction_list, new_reduction, INSERT);
1936 *slot = new_reduction;
1939 /* Callback for htab_traverse. Sets gimple_uid of reduc_phi stmts. */
/* NOTE(review): return type, braces and the `return 1;' continuing the
   traversal are in lines omitted from this excerpt.  */
1942 set_reduc_phi_uids (void **slot, void *data ATTRIBUTE_UNUSED)
1944 struct reduction_info *const red = (struct reduction_info *) *slot;
/* Tag the phi with its reduction's SSA version so reduction_phi () can
   find the record again after the vectorizer data is destroyed.  */
1945 gimple_set_uid (red->reduc_phi, red->reduc_version);
1949 /* Detect all reductions in the LOOP, insert them into REDUCTION_LIST. */
/* NOTE(review): return type, braces, and the declarations of IV and
   DOUBLE_REDUC are in lines omitted from this excerpt.  */
1952 gather_scalar_reductions (loop_p loop, htab_t reduction_list)
1954 gimple_stmt_iterator gsi;
/* Reuse the vectorizer's loop-form analysis to recognize reductions.  */
1955 loop_vec_info simple_loop_info;
1957 simple_loop_info = vect_analyze_loop_form (loop);
1959 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1961 gimple phi = gsi_stmt (gsi);
1963 tree res = PHI_RESULT (phi);
1966 if (virtual_operand_p (res))
/* A header phi that is not a simple IV may be a reduction cycle.  */
1969 if (!simple_iv (loop, loop, res, &iv, true)
1970 && simple_loop_info)
1972 gimple reduc_stmt = vect_force_simple_reduction (simple_loop_info,
/* Double (nested) reductions are not handled by this pass.  */
1975 if (reduc_stmt && !double_reduc)
1976 build_new_reduction (reduction_list, reduc_stmt, phi);
1979 destroy_loop_vec_info (simple_loop_info, true);
1981 /* As gimple_uid is used by the vectorizer in between vect_analyze_loop_form
1982 and destroy_loop_vec_info, we can set gimple_uid of reduc_phi stmts
1984 htab_traverse (reduction_list, set_reduc_phi_uids, NULL);
1987 /* Try to initialize NITER for code generation part. */
/* NOTE(review): return type, braces, the `return false;' after the dump,
   and the final `return true;' are in lines omitted from this excerpt.  */
1990 try_get_loop_niter (loop_p loop, struct tree_niter_desc *niter)
1992 edge exit = single_dom_exit (loop);
1996 /* We need to know # of iterations, and there should be no uses of values
1997 defined inside loop outside of it, unless the values are invariants of
1999 if (!number_of_iterations_exit (loop, exit, niter, false))
2001 if (dump_file && (dump_flags & TDF_DETAILS))
2002 fprintf (dump_file, " FAILED: number of iterations not known\n");
2009 /* Try to initialize REDUCTION_LIST for code generation part.
2010 REDUCTION_LIST describes the reductions. */
/* NOTE(review): return type, braces, the declaration of REDUC_PHI, several
   `continue'/`return false' statements and the final `return true;' fall
   in lines omitted from this excerpt.  */
2013 try_create_reduction_list (loop_p loop, htab_t reduction_list)
2015 edge exit = single_dom_exit (loop);
2016 gimple_stmt_iterator gsi;
2020 gather_scalar_reductions (loop, reduction_list);
/* Every value live past the exit must belong to a detected reduction;
   otherwise there is a scalar dependence we cannot parallelize.  */
2023 for (gsi = gsi_start_phis (exit->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2025 gimple phi = gsi_stmt (gsi);
2026 struct reduction_info *red;
2027 imm_use_iterator imm_iter;
2028 use_operand_p use_p;
2030 tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);
2032 if (!virtual_operand_p (val))
2034 if (dump_file && (dump_flags & TDF_DETAILS))
2036 fprintf (dump_file, "phi is ");
2037 print_gimple_stmt (dump_file, phi, 0, 0);
2038 fprintf (dump_file, "arg of phi to exit: value ");
2039 print_generic_expr (dump_file, val, 0);
2040 fprintf (dump_file, " used outside loop\n");
2042 " checking if it a part of reduction pattern: \n");
2044 if (htab_elements (reduction_list) == 0)
2046 if (dump_file && (dump_flags & TDF_DETAILS))
2048 " FAILED: it is not a part of reduction.\n");
/* Find the in-loop phi use of VAL — that should be the reduction phi.  */
2052 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, val)
2054 if (!gimple_debug_bind_p (USE_STMT (use_p))
2055 && flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
2057 reduc_phi = USE_STMT (use_p);
2061 red = reduction_phi (reduction_list, reduc_phi);
2064 if (dump_file && (dump_flags & TDF_DETAILS))
2066 " FAILED: it is not a part of reduction.\n");
2069 if (dump_file && (dump_flags & TDF_DETAILS))
2071 fprintf (dump_file, "reduction phi is ");
2072 print_gimple_stmt (dump_file, red->reduc_phi, 0, 0);
2073 fprintf (dump_file, "reduction stmt is ");
2074 print_gimple_stmt (dump_file, red->reduc_stmt, 0, 0);
2079 /* The iterations of the loop may communicate only through bivs whose
2080 iteration space can be distributed efficiently. */
2081 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
2083 gimple phi = gsi_stmt (gsi);
2084 tree def = PHI_RESULT (phi);
/* Header phis that are neither simple IVs nor reductions block us.  */
2087 if (!virtual_operand_p (def) && !simple_iv (loop, loop, def, &iv, true))
2089 struct reduction_info *red;
2091 red = reduction_phi (reduction_list, phi);
2094 if (dump_file && (dump_flags & TDF_DETAILS))
2096 " FAILED: scalar dependency between iterations\n");
2106 /* Detect parallel loops and generate parallel code using libgomp
2107 primitives. Returns true if some loop was parallelized, false
/* NOTE(review): return type, braces, the declarations of LI, LOOP and
   LOOP_LOC, several `continue'/`return false' lines, the `changed = true'
   assignment, and the final `return changed;' are in lines omitted from
   this excerpt.  */
2111 parallelize_loops (void)
2113 unsigned n_threads = flag_tree_parallelize_loops;
2114 bool changed = false;
2116 struct tree_niter_desc niter_desc;
2118 htab_t reduction_list;
2119 struct obstack parloop_obstack;
2120 HOST_WIDE_INT estimated;
2123 /* Do not parallelize loops in the functions created by parallelization. */
2124 if (parallelized_function_p (cfun->decl))
/* Outlining would not preserve nonlocal gotos into this function.  */
2126 if (cfun->has_nonlocal_label)
2129 gcc_obstack_init (&parloop_obstack);
2130 reduction_list = htab_create (10, reduction_info_hash,
2131 reduction_info_eq, free);
2132 init_stmt_vec_info_vec ();
2134 FOR_EACH_LOOP (li, loop, 0)
/* The reduction table is reused across candidate loops.  */
2136 htab_empty (reduction_list);
2137 if (dump_file && (dump_flags & TDF_DETAILS))
2139 fprintf (dump_file, "Trying loop %d as candidate\n",loop->num);
2141 fprintf (dump_file, "loop %d is not innermost\n",loop->num);
2143 fprintf (dump_file, "loop %d is innermost\n",loop->num);
2146 /* If we use autopar in graphite pass, we use its marked dependency
2147 checking results. */
2148 if (flag_loop_parallelize_all && !loop->can_be_parallel)
2150 if (dump_file && (dump_flags & TDF_DETAILS))
2151 fprintf (dump_file, "loop is not parallel according to graphite\n");
2155 if (!single_dom_exit (loop))
2158 if (dump_file && (dump_flags & TDF_DETAILS))
2159 fprintf (dump_file, "loop is !single_dom_exit\n");
2164 if (/* And of course, the loop must be parallelizable. */
2165 !can_duplicate_loop_p (loop)
2166 || loop_has_blocks_with_irreducible_flag (loop)
2167 || (loop_preheader_edge (loop)->src->flags & BB_IRREDUCIBLE_LOOP)
2168 /* FIXME: the check for vector phi nodes could be removed. */
2169 || loop_has_vector_phi_nodes (loop))
/* Profitability: skip short-running or cold loops (unless graphite
   already decided the loop should be parallelized).  */
2172 estimated = estimated_stmt_executions_int (loop);
2173 if (estimated == -1)
2174 estimated = max_stmt_executions_int (loop);
2175 /* FIXME: Bypass this check as graphite doesn't update the
2176 count and frequency correctly now. */
2177 if (!flag_loop_parallelize_all
2178 && ((estimated != -1
2179 && estimated <= (HOST_WIDE_INT) n_threads * MIN_PER_THREAD)
2180 /* Do not bother with loops in cold areas. */
2181 || optimize_loop_nest_for_size_p (loop)))
2184 if (!try_get_loop_niter (loop, &niter_desc))
2187 if (!try_create_reduction_list (loop, reduction_list))
2190 if (!flag_loop_parallelize_all
2191 && !loop_parallel_p (loop, &parloop_obstack))
2195 if (dump_file && (dump_flags & TDF_DETAILS))
2198 fprintf (dump_file, "parallelizing outer loop %d\n",loop->header->index);
2200 fprintf (dump_file, "parallelizing inner loop %d\n",loop->header->index);
2201 loop_loc = find_loop_location (loop);
2202 if (loop_loc != UNKNOWN_LOC)
2203 fprintf (dump_file, "\nloop at %s:%d: ",
2204 LOC_FILE (loop_loc), LOC_LINE (loop_loc));
2206 gen_parallel_loop (loop, reduction_list,
2207 n_threads, &niter_desc);
2208 #ifdef ENABLE_CHECKING
2209 verify_flow_info ();
2210 verify_loop_structure ();
2211 verify_loop_closed_ssa (true);
2215 free_stmt_vec_info_vec ();
2216 htab_delete (reduction_list);
2217 obstack_free (&parloop_obstack, NULL);
2219 /* Parallelization will cause new function calls to be inserted through
2220 which local variables will escape. Reset the points-to solution
2223 pt_solution_reset (&cfun->gimple_df->escaped);
2228 #include "gt-tree-parloops.h"