1 /* OpenACC worker partitioning via middle end neutering/broadcasting scheme
3 Copyright (C) 2015-2022 Free Software Foundation, Inc.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
23 #include "coretypes.h"
28 #include "tree-pass.h"
31 #include "pretty-print.h"
32 #include "fold-const.h"
34 #include "gimple-iterator.h"
35 #include "gimple-walk.h"
36 #include "tree-inline.h"
37 #include "langhooks.h"
38 #include "omp-general.h"
40 #include "gimple-pretty-print.h"
42 #include "insn-config.h"
44 #include "internal-fn.h"
46 #include "tree-nested.h"
47 #include "stor-layout.h"
48 #include "tree-ssa-threadupdate.h"
49 #include "tree-into-ssa.h"
50 #include "splay-tree.h"
54 #include "omp-offload.h"
56 #include "targhooks.h"
57 #include "diagnostic-core.h"
/* NOTE(review): this is a lossy dump -- the embedded upstream line numbers
   skip, so interior lines (including the 'struct parallel_g' header and
   several members) are missing.  Compare against upstream
   gcc/omp-oacc-neuter-broadcast.cc before editing.  */
/* Fragment of the parallel-region tree node: each node describes one
   OpenACC partitioned region (gang/worker/vector) of the function.  */
59 /* Loop structure of the function. The entire function is described as
61 /* Adapted from 'gcc/config/nvptx/nvptx.cc:struct parallel'. */
65 /* Parent parallel. */
68 /* Next sibling parallel. */
71 /* First child parallel. */
74 /* Partitioning mask of the parallel. */
77 /* Partitioning used within inner parallels. */
80 /* Location of parallel forked and join. The forked is the first
81 block in the parallel and the join is the first block after of
83 basic_block forked_block;
84 basic_block join_block;
92 /* Basic blocks in this parallel, but not in child parallels. The
93 FORKED and JOINING blocks are in the partition. The FORK and JOIN
95 auto_vec<basic_block> blocks;
/* Constructor: initialize all bookkeeping members and link the new
   parallel into its parent's chain of children.
   NOTE(review): lines are missing from this dump (upstream numbering
   skips); in particular the guard around the parent-link statements
   (upstream has 'if (parent)' before the 'next = parent->inner' pair)
   is not visible here -- confirm against upstream before relying on
   a NULL parent being accepted.  */
102 parallel_g (parallel_g *parent, unsigned mode);
106 /* Constructor links the new parallel into it's parent's chain of
109 parallel_g::parallel_g (parallel_g *parent_, unsigned mask_)
110 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
112 forked_block = join_block = 0;
113 forked_stmt = join_stmt = NULL;
114 fork_stmt = joining_stmt = NULL;
116 record_type = NULL_TREE;
117 sender_decl = NULL_TREE;
118 receiver_decl = NULL_TREE;
/* Push this node onto the front of the parent's child list.  */
122 next = parent->inner;
123 parent->inner = this;
/* Destructor.  NOTE(review): the body is missing from this dump
   (upstream recursively deletes 'inner' and 'next' children).  */
127 parallel_g::~parallel_g ()
/* Return true if DECL is (or is based on) a function-local variable:
   a decl that is not a global, or a reference whose base operand is
   itself local (checked recursively via TREE_OPERAND (decl, 0)).
   NOTE(review): the switch's case labels and default are missing from
   this dump -- the exact set of handled tree codes must be confirmed
   against upstream.  */
134 local_var_based_p (tree decl)
136 switch (TREE_CODE (decl))
139 return !is_global_var (decl);
144 return local_var_based_p (TREE_OPERAND (decl, 0));
151 /* Map of basic blocks to gimple stmts. */
152 typedef hash_map<basic_block, gimple *> bb_stmt_map_t;
154 /* Calls to OpenACC routines are made by all workers/wavefronts/warps, since
155 the routine likely contains partitioned loops (else will do its own
156 neutering and variable propagation). Return TRUE if a function call CALL
157 should be made in (worker) single mode instead, rather than redundant
/* NOTE(review): lossy dump -- the early-return guards for a NULL FNDECL
   or missing 'oacc function' attribute (present upstream between these
   lines) are not visible here.  */
161 omp_sese_active_worker_call (gcall *call)
163 #define GOMP_DIM_SEQ GOMP_DIM_MAX
164 tree fndecl = gimple_call_fndecl (call);
169 tree attrs = oacc_get_fn_attrib (fndecl);
174 int level = oacc_fn_attrib_level (attrs);
176 /* Neither regular functions nor "seq" routines should be run by all threads
177 in worker-single mode. */
/* level == -1 means "not an OpenACC routine at any partitioning level".  */
178 return level == -1 || level == GOMP_DIM_SEQ;
182 /* Split basic blocks such that each forked and join unspecs are at
183 the start of their basic blocks. Thus afterwards each block will
184 have a single partitioning mode. We also do the same for return
185 insns, as they are executed by every thread. Return the
186 partitioning mode of the function as a whole. Populate MAP with
187 head and tail blocks. We also clear the BB visited flag, which is
188 used when finding partitions. */
189 /* Adapted from 'gcc/config/nvptx/nvptx.cc:nvptx_split_blocks'. */
/* NOTE(review): lossy dump -- upstream line numbers skip throughout this
   function; loop conditions, braces and several statements are missing.
   The visible logic is: (1) scan all blocks collecting "interesting"
   stmts into WORKLIST, then (2) split blocks around each worklist stmt
   and record the head stmt in MAP.  */
192 omp_sese_split_blocks (bb_stmt_map_t *map)
194 auto_vec<gimple *> worklist;
197 /* Locate all the reorg instructions of interest. */
198 FOR_ALL_BB_FN (block, cfun)
200 /* Clear visited flag, for use by parallel locator */
201 block->flags &= ~BB_VISITED;
203 for (gimple_stmt_iterator gsi = gsi_start_bb (block);
207 gimple *stmt = gsi_stmt (gsi);
209 if (gimple_call_internal_p (stmt, IFN_UNIQUE))
211 enum ifn_unique_kind k = ((enum ifn_unique_kind)
212 TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
214 if (k == IFN_UNIQUE_OACC_JOIN)
215 worklist.safe_push (stmt);
216 else if (k == IFN_UNIQUE_OACC_FORK)
/* FORK must be the last stmt of its block; mark the start of the
   successor ("forked") block with a placeholder NOP.  */
218 gcc_assert (gsi_one_before_end_p (gsi));
219 basic_block forked_block = single_succ (block);
220 gimple_stmt_iterator gsi2 = gsi_start_bb (forked_block);
222 /* We push a NOP as a placeholder for the "forked" stmt.
223 This is then recognized in omp_sese_find_par. */
224 gimple *nop = gimple_build_nop ();
225 gsi_insert_before (&gsi2, nop, GSI_SAME_STMT);
227 worklist.safe_push (nop);
/* Returns, conditions, switches and non-internal calls (unless they
   must run on the active worker only) are executed by every thread.  */
230 else if (gimple_code (stmt) == GIMPLE_RETURN
231 || gimple_code (stmt) == GIMPLE_COND
232 || gimple_code (stmt) == GIMPLE_SWITCH
233 || (gimple_code (stmt) == GIMPLE_CALL
234 && !gimple_call_internal_p (stmt)
235 && !omp_sese_active_worker_call (as_a <gcall *> (stmt))))
236 worklist.safe_push (stmt);
237 else if (is_gimple_assign (stmt))
239 tree lhs = gimple_assign_lhs (stmt);
241 /* Force assignments to components/fields/elements of local
242 aggregates into fully-partitioned (redundant) mode. This
243 avoids having to broadcast the whole aggregate. The RHS of
244 the assignment will be propagated using the normal
/* NOTE(review): the case labels of this switch (presumably the
   component/array-ref tree codes) are missing from the dump.  */
247 switch (TREE_CODE (lhs))
253 tree aggr = TREE_OPERAND (lhs, 0);
255 if (local_var_based_p (aggr))
256 worklist.safe_push (stmt);
267 /* Split blocks on the worklist. */
271 for (ix = 0; worklist.iterate (ix, &stmt); ix++)
273 basic_block block = gimple_bb (stmt);
275 if (gimple_code (stmt) == GIMPLE_COND)
/* Rewrite "if (a OP b)" as "pred = a OP b; if (pred != false)" so the
   condition's computation can be split into its own block.  */
277 gcond *orig_cond = as_a <gcond *> (stmt);
278 tree_code code = gimple_expr_code (orig_cond);
279 tree pred = make_ssa_name (boolean_type_node);
280 gimple *asgn = gimple_build_assign (pred, code,
281 gimple_cond_lhs (orig_cond),
282 gimple_cond_rhs (orig_cond));
284 = gimple_build_cond (NE_EXPR, pred, boolean_false_node,
285 gimple_cond_true_label (orig_cond),
286 gimple_cond_false_label (orig_cond));
288 gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
289 gsi_insert_before (&gsi, asgn, GSI_SAME_STMT);
290 gsi_replace (&gsi, new_cond, true);
292 edge e = split_block (block, asgn);
294 map->get_or_insert (block) = new_cond;
296 else if ((gimple_code (stmt) == GIMPLE_CALL
297 && !gimple_call_internal_p (stmt))
298 || is_gimple_assign (stmt))
/* Isolate the call/assignment into its own single-stmt block.  */
300 gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
303 edge call = split_block (block, gsi_stmt (gsi));
305 gimple *call_stmt = gsi_stmt (gsi_start_bb (call->dest));
307 edge call_to_ret = split_block (call->dest, call_stmt);
309 map->get_or_insert (call_to_ret->src) = call_stmt;
313 gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
317 map->get_or_insert (block) = stmt;
320 /* Split block before insn. The insn is in the new block. */
321 edge e = split_block (block, gsi_stmt (gsi));
324 map->get_or_insert (block) = stmt;
/* Return a human-readable name for partitioning MASK, where bit 0 is
   gang, bit 1 is worker and bit 2 is vector (hence 7 = all three =
   "fully partitioned").  Used only for dump output.
   NOTE(review): the return-type line and the 'switch (mask)' header are
   missing from this dump.  */
331 mask_name (unsigned mask)
335 case 0: return "gang redundant";
336 case 1: return "gang partitioned";
337 case 2: return "worker partitioned";
338 case 3: return "gang+worker partitioned";
339 case 4: return "vector partitioned";
340 case 5: return "gang+vector partitioned";
341 case 6: return "worker+vector partitioned";
342 case 7: return "fully partitioned";
343 default: return "<illegal>";
347 /* Dump this parallel and all its inner parallels. */
348 /* Adapted from 'gcc/config/nvptx/nvptx.cc:nvptx_dump_pars'. */
/* Recursive dump: prints DEPTH, the mask (numeric and symbolic), the
   head/tail block indices (-1 if absent) and the member blocks, then
   recurses into children (depth + 1) and siblings (same depth).
   NOTE(review): guard lines (e.g. 'if (par->inner)' / 'if (par->next)')
   are missing from this dump.  */
351 omp_sese_dump_pars (parallel_g *par, unsigned depth)
353 fprintf (dump_file, "%u: mask %d (%s) head=%d, tail=%d\n",
354 depth, par->mask, mask_name (par->mask),
355 par->forked_block ? par->forked_block->index : -1,
356 par->join_block ? par->join_block->index : -1);
358 fprintf (dump_file, " blocks:");
361 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
362 fprintf (dump_file, " %d", block->index);
363 fprintf (dump_file, "\n");
365 omp_sese_dump_pars (par->inner, depth + 1);
368 omp_sese_dump_pars (par->next, depth);
371 /* If BLOCK contains a fork/join marker, process it to create or
372 terminate a loop structure. Add this block to the current loop,
373 and then walk successor blocks. */
374 /* Adapted from 'gcc/config/nvptx/nvptx.cc:nvptx_find_par'. */
/* NOTE(review): lossy dump -- upstream numbering skips; braces, an early
   return for already-visited blocks, and some statements are missing.  */
377 omp_sese_find_par (bb_stmt_map_t *map, parallel_g *par, basic_block block)
379 if (block->flags & BB_VISITED)
381 block->flags |= BB_VISITED;
383 if (gimple **stmtp = map->get (block))
385 gimple *stmt = *stmtp;
/* Stmts executed by every thread force a singleton fully-partitioned
   parallel around this one block.  */
387 if (gimple_code (stmt) == GIMPLE_COND
388 || gimple_code (stmt) == GIMPLE_SWITCH
389 || gimple_code (stmt) == GIMPLE_RETURN
390 || (gimple_code (stmt) == GIMPLE_CALL
391 && !gimple_call_internal_p (stmt))
392 || is_gimple_assign (stmt))
394 /* A single block that is forced to be at the maximum partition
395 level. Make a singleton par for it. */
396 par = new parallel_g (par, GOMP_DIM_MASK (GOMP_DIM_GANG)
397 | GOMP_DIM_MASK (GOMP_DIM_WORKER)
398 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
399 par->forked_block = block;
400 par->forked_stmt = stmt;
401 par->blocks.safe_push (block);
403 goto walk_successors;
/* A NOP is the placeholder pushed by omp_sese_split_blocks right after
   a FORK; the real FORK call is the last stmt of the predecessor.  */
405 else if (gimple_nop_p (stmt))
407 basic_block pred = single_pred (block);
409 gimple_stmt_iterator gsi = gsi_last_bb (pred);
410 gimple *final_stmt = gsi_stmt (gsi);
412 if (gimple_call_internal_p (final_stmt, IFN_UNIQUE))
414 gcall *call = as_a <gcall *> (final_stmt);
415 enum ifn_unique_kind k = ((enum ifn_unique_kind)
416 TREE_INT_CST_LOW (gimple_call_arg (call, 0)));
418 if (k == IFN_UNIQUE_OACC_FORK)
/* Open a new parallel for the forked dimension (arg 2).  */
421 = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
422 unsigned mask = (dim >= 0) ? GOMP_DIM_MASK (dim) : 0;
424 par = new parallel_g (par, mask);
425 par->forked_block = block;
426 par->forked_stmt = final_stmt;
427 par->fork_stmt = stmt;
435 else if (gimple_call_internal_p (stmt, IFN_UNIQUE))
437 gcall *call = as_a <gcall *> (stmt);
438 enum ifn_unique_kind k = ((enum ifn_unique_kind)
439 TREE_INT_CST_LOW (gimple_call_arg (call, 0)));
440 if (k == IFN_UNIQUE_OACC_JOIN)
/* Close the current parallel; its mask must match the join's dim.  */
442 HOST_WIDE_INT dim = TREE_INT_CST_LOW (gimple_call_arg (stmt, 2));
443 unsigned mask = (dim >= 0) ? GOMP_DIM_MASK (dim) : 0;
445 gcc_assert (par->mask == mask);
446 par->join_block = block;
447 par->join_stmt = stmt;
458 /* Add this block onto the current loop's list of blocks. */
459 par->blocks.safe_push (block);
461 /* This must be the entry block. Create a NULL parallel. */
462 par = new parallel_g (0, 0);
465 /* Walk successor blocks. */
469 FOR_EACH_EDGE (e, ei, block->succs)
470 omp_sese_find_par (map, par, e->dest);
475 /* DFS walk the CFG looking for fork & join markers. Construct
476 loop structures as we go. MAP is a mapping of basic blocks
477 to head & tail markers, discovered when splitting blocks. This
478 speeds up the discovery. We rely on the BB visited flag having
479 been cleared when splitting blocks. */
480 /* Adapted from 'gcc/config/nvptx/nvptx.cc:nvptx_discover_pars'. */
/* NOTE(review): lossy dump -- the return statement and the dump_file
   guard around the debug output are missing here.  */
483 omp_sese_discover_pars (bb_stmt_map_t *map)
487 /* Mark exit blocks as visited. */
488 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
489 block->flags |= BB_VISITED;
491 /* And entry block as not. */
492 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
493 block->flags &= ~BB_VISITED;
495 parallel_g *par = omp_sese_find_par (map, 0, block);
499 fprintf (dump_file, "\nLoops\n");
500 omp_sese_dump_pars (par, 0);
501 fprintf (dump_file, "\n");
/* For every block of PAR (recursively, including children and siblings),
   set the block's bit in WORKER_SINGLE if the accumulated mask has no
   worker bit, and in VECTOR_SINGLE if it has no vector bit.  These
   bitmaps mark where only one worker / one vector lane executes.
   NOTE(review): lossy dump -- the trailing parameter on the first line
   and the recursion guards are missing here.  */
508 populate_single_mode_bitmaps (parallel_g *par, bitmap worker_single,
509 bitmap vector_single, unsigned outer_mask,
512 unsigned mask = outer_mask | par->mask;
516 for (unsigned i = 0; par->blocks.iterate (i, &block); i++)
518 if ((mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0)
519 bitmap_set_bit (worker_single, block->index);
521 if ((mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)) == 0)
522 bitmap_set_bit (vector_single, block->index);
526 populate_single_mode_bitmaps (par->inner, worker_single, vector_single,
529 populate_single_mode_bitmaps (par->next, worker_single, vector_single,
533 /* A map from SSA names or var decls to record fields. */
535 typedef hash_map<tree, tree> field_map_t;
537 /* For each propagation record type, this is a map from SSA names or var decls
538 to propagate, to the field in the record type that should be used for
539 transmission and reception. */
541 typedef hash_map<tree, field_map_t> record_field_map_t;
/* Create a FIELD_DECL for VAR in RECORD_TYPE (the broadcast buffer
   layout) and record the VAR -> field mapping in FIELDS.  The field is
   named after the SSA name / decl, falling back to a version- or
   UID-based synthetic name.
   NOTE(review): lossy dump -- the 'char tmp[...]' declarations and the
   branches guarding the sprintf fallbacks are missing here.  */
544 install_var_field (tree var, tree record_type, field_map_t *fields)
549 if (TREE_CODE (var) == SSA_NAME)
551 name = SSA_NAME_IDENTIFIER (var);
554 sprintf (tmp, "_%u", (unsigned) SSA_NAME_VERSION (var));
555 name = get_identifier (tmp);
558 else if (TREE_CODE (var) == VAR_DECL)
560 name = DECL_NAME (var);
563 sprintf (tmp, "D_%u", (unsigned) DECL_UID (var));
564 name = get_identifier (tmp);
/* Each variable gets at most one field.  */
570 gcc_assert (!fields->get (var));
572 tree type = TREE_TYPE (var);
/* Strip 'restrict': the broadcast copy aliases the original.  */
574 if (POINTER_TYPE_P (type)
575 && TYPE_RESTRICT (type))
576 type = build_qualified_type (type, TYPE_QUALS (type) & ~TYPE_QUAL_RESTRICT);
578 tree field = build_decl (BUILTINS_LOCATION, FIELD_DECL, name, type);
580 if (TREE_CODE (var) == VAR_DECL && type == TREE_TYPE (var))
582 SET_DECL_ALIGN (field, DECL_ALIGN (var));
583 DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var);
584 TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (var);
587 SET_DECL_ALIGN (field, TYPE_ALIGN (type));
589 fields->put (var, field);
591 insert_field_into_struct (record_type, field);
594 /* Sets of SSA_NAMES or VAR_DECLs to propagate. */
595 typedef hash_set<tree> propagation_set;
/* Walk the parallel tree and, for each block executed in worker-
   partitioned mode, find SSA names used there (in PHIs and ordinary
   stmts) that are defined in a worker-single block: those definitions
   must be broadcast.  The names are collected per defining block in
   PROP_SET (indexed by block index), allocating each propagation_set
   lazily.
   NOTE(review): lossy dump -- the 'add to ws_prop' statements after each
   ws_prop lookup, and various braces/guards, are missing here.  */
598 find_ssa_names_to_propagate (parallel_g *par, unsigned outer_mask,
599 bitmap worker_single, bitmap vector_single,
600 vec<propagation_set *> *prop_set)
602 unsigned mask = outer_mask | par->mask;
605 find_ssa_names_to_propagate (par->inner, mask, worker_single,
606 vector_single, prop_set);
608 find_ssa_names_to_propagate (par->next, outer_mask, worker_single,
609 vector_single, prop_set);
611 if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
616 for (ix = 0; par->blocks.iterate (ix, &block); ix++)
618 for (gphi_iterator psi = gsi_start_phis (block);
619 !gsi_end_p (psi); gsi_next (&psi))
621 gphi *phi = psi.phi ();
625 FOR_EACH_PHI_ARG (use, phi, iter, SSA_OP_USE)
627 tree var = USE_FROM_PTR (use);
629 if (TREE_CODE (var) != SSA_NAME)
632 gimple *def_stmt = SSA_NAME_DEF_STMT (var);
/* Names without a real definition need no propagation.  */
634 if (gimple_nop_p (def_stmt))
637 basic_block def_bb = gimple_bb (def_stmt);
639 if (bitmap_bit_p (worker_single, def_bb->index))
641 if (!(*prop_set)[def_bb->index])
642 (*prop_set)[def_bb->index] = new propagation_set;
644 propagation_set *ws_prop = (*prop_set)[def_bb->index];
/* Same scan for uses in non-PHI statements.  */
651 for (gimple_stmt_iterator gsi = gsi_start_bb (block);
652 !gsi_end_p (gsi); gsi_next (&gsi))
656 gimple *stmt = gsi_stmt (gsi);
658 FOR_EACH_SSA_USE_OPERAND (use, stmt, iter, SSA_OP_USE)
660 tree var = USE_FROM_PTR (use);
662 gimple *def_stmt = SSA_NAME_DEF_STMT (var);
664 if (gimple_nop_p (def_stmt))
667 basic_block def_bb = gimple_bb (def_stmt);
669 if (bitmap_bit_p (worker_single, def_bb->index))
671 if (!(*prop_set)[def_bb->index])
672 (*prop_set)[def_bb->index] = new propagation_set;
674 propagation_set *ws_prop = (*prop_set)[def_bb->index];
684 /* Callback for walk_gimple_stmt to find RHS VAR_DECLs (uses) in a
/* Records every non-LHS VAR_DECL into the hash_set passed via WI->info.
   NOTE(review): the return statement (NULL_TREE upstream) is missing
   from this dump.  */
688 find_partitioned_var_uses_1 (tree *node, int *, void *data)
690 walk_stmt_info *wi = (walk_stmt_info *) data;
691 hash_set<tree> *partitioned_var_uses = (hash_set<tree> *) wi->info;
693 if (!wi->is_lhs && VAR_P (*node))
694 partitioned_var_uses->add (*node);
/* Recursively walk the parallel tree and, for blocks executed in
   worker-partitioned mode, collect every VAR_DECL used (read) by any
   stmt into PARTITIONED_VAR_USES, via find_partitioned_var_uses_1.  */
700 find_partitioned_var_uses (parallel_g *par, unsigned outer_mask,
701 hash_set<tree> *partitioned_var_uses)
703 unsigned mask = outer_mask | par->mask;
706 find_partitioned_var_uses (par->inner, mask, partitioned_var_uses);
708 find_partitioned_var_uses (par->next, outer_mask, partitioned_var_uses);
710 if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
715 for (ix = 0; par->blocks.iterate (ix, &block); ix++)
716 for (gimple_stmt_iterator gsi = gsi_start_bb (block);
717 !gsi_end_p (gsi); gsi_next (&gsi))
/* walk_stmt_info must be zero-initialized before each walk.  */
720 memset (&wi, 0, sizeof (wi));
721 wi.info = (void *) partitioned_var_uses;
722 walk_gimple_stmt (&gsi, NULL, find_partitioned_var_uses_1, &wi);
727 /* Gang-private variables (typically placed in a GPU's shared memory) do not
728 need to be processed by the worker-propagation mechanism. Populate the
729 GANG_PRIVATE_VARS set with any such variables found in the current
/* Scans all blocks for IFN_UNIQUE (OACC_PRIVATE) markers at gang level
   and records the address-taken decls (args 3..) they name.
   NOTE(review): the 'level != GOMP_DIM_GANG -> continue/skip' body is cut
   by the lossy dump; as shown it reads inverted -- confirm against
   upstream that non-gang levels are skipped.  */
733 find_gang_private_vars (hash_set<tree> *gang_private_vars)
737 FOR_EACH_BB_FN (block, cfun)
739 for (gimple_stmt_iterator gsi = gsi_start_bb (block);
743 gimple *stmt = gsi_stmt (gsi);
745 if (gimple_call_internal_p (stmt, IFN_UNIQUE))
747 enum ifn_unique_kind k = ((enum ifn_unique_kind)
748 TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
749 if (k == IFN_UNIQUE_OACC_PRIVATE)
752 = TREE_INT_CST_LOW (gimple_call_arg (stmt, 2));
753 if (level != GOMP_DIM_GANG)
755 for (unsigned i = 3; i < gimple_call_num_args (stmt); i++)
757 tree arg = gimple_call_arg (stmt, i);
758 gcc_assert (TREE_CODE (arg) == ADDR_EXPR);
759 tree decl = TREE_OPERAND (arg, 0);
760 gang_private_vars->add (decl);
/* For worker-single blocks, find local (non-global, non-aggregate)
   VAR_DECLs that a stmt in the block may clobber and that are also used
   in worker-partitioned code: those must be broadcast, so add them to
   the block's propagation_set in PROP_SET.  Writes to gang-private
   variables instead just request an end-of-block barrier via
   WRITES_GANG_PRIVATE.
   NOTE(review): lossy dump -- early 'continue's after the dump/
   gang-private branches and the ws_prop->add call are missing here.  */
769 find_local_vars_to_propagate (parallel_g *par, unsigned outer_mask,
770 hash_set<tree> *partitioned_var_uses,
771 hash_set<tree> *gang_private_vars,
772 bitmap writes_gang_private,
773 vec<propagation_set *> *prop_set)
775 unsigned mask = outer_mask | par->mask;
778 find_local_vars_to_propagate (par->inner, mask, partitioned_var_uses,
779 gang_private_vars, writes_gang_private,
782 find_local_vars_to_propagate (par->next, outer_mask, partitioned_var_uses,
783 gang_private_vars, writes_gang_private,
/* Only worker-single regions need this analysis.  */
786 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
791 for (ix = 0; par->blocks.iterate (ix, &block); ix++)
793 for (gimple_stmt_iterator gsi = gsi_start_bb (block);
794 !gsi_end_p (gsi); gsi_next (&gsi))
796 gimple *stmt = gsi_stmt (gsi);
800 FOR_EACH_LOCAL_DECL (cfun, i, var)
803 || is_global_var (var)
804 || AGGREGATE_TYPE_P (TREE_TYPE (var))
805 || !partitioned_var_uses->contains (var))
808 if (stmt_may_clobber_ref_p (stmt, var))
812 fprintf (dump_file, "bb %u: local variable may be "
813 "clobbered in %s mode: ", block->index,
815 print_generic_expr (dump_file, var, TDF_SLIM);
816 fprintf (dump_file, "\n");
819 if (gang_private_vars->contains (var))
821 /* If we write a gang-private variable, we want a
822 barrier at the end of the block. */
823 bitmap_set_bit (writes_gang_private, block->index);
827 if (!(*prop_set)[block->index])
828 (*prop_set)[block->index] = new propagation_set;
830 propagation_set *ws_prop
831 = (*prop_set)[block->index];
841 /* Transform basic blocks FROM, TO (which may be the same block) into:
842 if (GOACC_single_start ())
847 | | (new) predicate block
850 +----+ +----+ +----+ |
851 | | | | ===> | | | f (old) from block
852 +----+ +----+ +----+ |
855 (split (split before | | skip block
856 at end) condition) +----+
/* Neuter the simple (no data to broadcast) case: only the worker for
   which GOACC_single_start returns true executes FROM..TO; the others
   jump to the skip block.  Escaping SSA defs get join PHIs merging the
   real value with a dummy zero from the neutered path.
   NOTE(review): lossy dump -- declarations ('tree decl', 'gimple *call',
   braces) and the non-branching split case are missing here.  */
861 worker_single_simple (basic_block from, basic_block to,
862 hash_set<tree> *def_escapes_block)
866 basic_block skip_block;
868 gimple_stmt_iterator gsi = gsi_last_bb (to);
869 if (EDGE_COUNT (to->succs) > 1)
871 gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_COND);
874 edge e = split_block (to, gsi_stmt (gsi));
875 skip_block = e->dest;
877 gimple_stmt_iterator start = gsi_after_labels (from);
/* lhs = GOACC_single_start ();  */
879 decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_START);
880 lhs = create_tmp_var (TREE_TYPE (TREE_TYPE (decl)));
881 call = gimple_build_call (decl, 0);
882 gimple_call_set_lhs (call, lhs);
883 gsi_insert_before (&start, call, GSI_NEW_STMT);
886 cond = gimple_build_cond (EQ_EXPR, lhs,
887 fold_convert_loc (UNKNOWN_LOCATION,
890 NULL_TREE, NULL_TREE);
891 gsi_insert_after (&start, cond, GSI_NEW_STMT);
894 edge et = split_block (from, cond);
895 et->flags &= ~EDGE_FALLTHRU;
896 et->flags |= EDGE_TRUE_VALUE;
897 /* Make the active worker the more probable path so we prefer fallthrough
898 (letting the idle workers jump around more). */
899 et->probability = profile_probability::likely ();
901 edge ef = make_edge (from, skip_block, EDGE_FALSE_VALUE);
902 ef->probability = et->probability.invert ();
904 basic_block neutered = split_edge (ef);
905 gimple_stmt_iterator neut_gsi = gsi_last_bb (neutered);
/* For every escaping def in the neutered range, build the join PHI and
   the dummy zero definition on the neutered path.  */
907 for (gsi = gsi_start_bb (et->dest); !gsi_end_p (gsi); gsi_next (&gsi))
909 gimple *stmt = gsi_stmt (gsi);
913 FOR_EACH_SSA_TREE_OPERAND (var, stmt, iter, SSA_OP_DEF)
915 if (def_escapes_block->contains (var))
917 gphi *join_phi = create_phi_node (NULL_TREE, skip_block);
918 create_new_def_for (var, join_phi,
919 gimple_phi_result_ptr (join_phi));
920 add_phi_arg (join_phi, var, e, UNKNOWN_LOCATION);
922 tree neutered_def = copy_ssa_name (var, NULL);
923 /* We really want "don't care" or some value representing
924 undefined here, but optimizers will probably get rid of the
925 zero-assignments anyway. */
926 gassign *zero = gimple_build_assign (neutered_def,
927 build_zero_cst (TREE_TYPE (neutered_def)));
929 gsi_insert_after (&neut_gsi, zero, GSI_CONTINUE_LINKING);
932 add_phi_arg (join_phi, neutered_def, single_succ_edge (neutered),
934 update_stmt (join_phi);
940 /* Build COMPONENT_REF and set TREE_THIS_VOLATILE and TREE_READONLY on it
942 /* Adapted from 'gcc/omp-low.cc:omp_build_component_ref'. */
/* Builds OBJ.FIELD, propagating the object's (non-generic) address
   space into the field type and copying volatility/readonly from the
   field decl.  NOTE(review): the return statement is missing from this
   dump (upstream returns RET).  */
945 oacc_build_component_ref (tree obj, tree field)
947 tree field_type = TREE_TYPE (field);
948 tree obj_type = TREE_TYPE (obj);
949 if (!ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (obj_type)))
950 field_type = build_qualified_type
952 KEEP_QUAL_ADDR_SPACE (TYPE_QUALS (obj_type)));
954 tree ret = build3 (COMPONENT_REF, field_type, obj, field, NULL);
955 if (TREE_THIS_VOLATILE (field))
956 TREE_THIS_VOLATILE (ret) |= 1;
957 if (TREE_READONLY (field))
958 TREE_READONLY (ret) |= 1;
/* Build a reference to VAR's field inside the broadcast record pointed
   to by RECEIVER_DECL, i.e. (*receiver_decl).field.
   NOTE(review): the return statement is missing from this dump.  */
963 build_receiver_ref (tree var, tree receiver_decl, field_map_t *fields)
965 tree x = build_simple_mem_ref (receiver_decl);
966 tree field = *fields->get (var);
/* The receiver pointer is known valid, so the deref cannot trap.  */
967 TREE_THIS_NOTRAP (x) = 1;
968 x = oacc_build_component_ref (x, field);
/* Build a reference to VAR's field inside the broadcast record
   SENDER_DECL, dereferencing first if SENDER_DECL is a pointer.  */
973 build_sender_ref (tree var, tree sender_decl, field_map_t *fields)
975 if (POINTER_TYPE_P (TREE_TYPE (sender_decl)))
976 sender_decl = build_simple_mem_ref (sender_decl);
977 tree field = *fields->get (var);
978 return oacc_build_component_ref (sender_decl, field);
/* qsort comparator giving a stable order over mixed SSA names and
   decls: SSA names first (by version), then decls (by UID).
   NOTE(review): the 'return -1;' / 'return 1;' bodies of the two middle
   branches are missing from this dump.  */
982 sort_by_ssa_version_or_uid (const void *p1, const void *p2)
984 const tree t1 = *(const tree *)p1;
985 const tree t2 = *(const tree *)p2;
987 if (TREE_CODE (t1) == SSA_NAME && TREE_CODE (t2) == SSA_NAME)
988 return SSA_NAME_VERSION (t1) - SSA_NAME_VERSION (t2);
989 else if (TREE_CODE (t1) == SSA_NAME && TREE_CODE (t2) != SSA_NAME)
991 else if (TREE_CODE (t1) != SSA_NAME && TREE_CODE (t2) == SSA_NAME)
994 return DECL_UID (t1) - DECL_UID (t2);
/* qsort comparator: primary key is the type size of the element (the
   size-compare branches are missing from this dump), with
   sort_by_ssa_version_or_uid as the tiebreaker.  */
998 sort_by_size_then_ssa_version_or_uid (const void *p1, const void *p2)
1000 const tree t1 = *(const tree *)p1;
1001 const tree t2 = *(const tree *)p2;
1002 unsigned HOST_WIDE_INT s1 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (t1)));
1003 unsigned HOST_WIDE_INT s2 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (t2)));
1007 return sort_by_ssa_version_or_uid (p1, p2);
/* Neuter FROM..TO in the "copy" (broadcast) variant: one worker runs
   the block and writes escaping values into a target-provided broadcast
   record; the other workers wait at a barrier and then copy the values
   out.  Escaping SSA names get PHIs merging the active-worker value
   (barrier path) with the copied-out value (copyout path).
   NOTE(review): lossy dump -- upstream line numbering skips throughout;
   declarations, braces and several statements are missing.  Do not edit
   without consulting the complete upstream function.  */
1011 worker_single_copy (basic_block from, basic_block to,
1012 hash_set<tree> *def_escapes_block,
1013 hash_set<tree> *worker_partitioned_uses,
1014 tree record_type, record_field_map_t *record_field_map,
1015 unsigned HOST_WIDE_INT placement,
1016 bool isolate_broadcasts, bool has_gang_private_write)
1018 /* If we only have virtual defs, we'll have no record type, but we still want
1019 to emit single_copy_start and (particularly) single_copy_end to act as
1020 a vdef source on the neutered edge representing memory writes on the
1021 non-neutered edge. */
1023 record_type = char_type_node;
/* Target hook supplies the sender/receiver broadcast buffer decls.  */
1026 = targetm.goacc.create_worker_broadcast_record (record_type, true,
1030 = targetm.goacc.create_worker_broadcast_record (record_type, false,
1034 gimple_stmt_iterator gsi = gsi_last_bb (to);
1035 if (EDGE_COUNT (to->succs) > 1)
1037 edge e = split_block (to, gsi_stmt (gsi));
1038 basic_block barrier_block = e->dest;
1040 gimple_stmt_iterator start = gsi_after_labels (from);
/* lhs = GOACC_single_copy_start (&sender);  NULL lhs on the active
   worker, the sender buffer address on the others.  */
1042 tree decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_COPY_START);
1044 tree lhs = create_tmp_var (TREE_TYPE (TREE_TYPE (decl)));
1047 = gimple_build_call (decl, 1,
1048 POINTER_TYPE_P (TREE_TYPE (sender_decl))
1049 ? sender_decl : build_fold_addr_expr (sender_decl));
1050 gimple_call_set_lhs (call, lhs);
1051 gsi_insert_before (&start, call, GSI_NEW_STMT);
1054 /* The shared-memory range for this block overflowed. Add a barrier before
1055 the GOACC_single_copy_start call. */
1056 if (isolate_broadcasts)
1058 decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
1059 gimple *acc_bar = gimple_build_call (decl, 0);
1060 gsi_insert_before (&start, acc_bar, GSI_SAME_STMT);
1063 tree conv_tmp = make_ssa_name (TREE_TYPE (receiver_decl));
1065 gimple *conv = gimple_build_assign (conv_tmp,
1066 fold_convert (TREE_TYPE (receiver_decl),
1069 gsi_insert_after (&start, conv, GSI_NEW_STMT);
1070 gimple *asgn = gimple_build_assign (receiver_decl, conv_tmp);
1071 gsi_insert_after (&start, asgn, GSI_NEW_STMT);
/* Branch on receiver == NULL: NULL means "I am the active worker".  */
1074 tree zero_ptr = build_int_cst (TREE_TYPE (receiver_decl), 0);
1076 tree recv_tmp = make_ssa_name (TREE_TYPE (receiver_decl));
1077 asgn = gimple_build_assign (recv_tmp, receiver_decl);
1078 gsi_insert_after (&start, asgn, GSI_NEW_STMT);
1081 gimple *cond = gimple_build_cond (EQ_EXPR, recv_tmp, zero_ptr, NULL_TREE,
1085 gsi_insert_after (&start, cond, GSI_NEW_STMT);
1087 edge et = split_block (from, cond);
1088 et->flags &= ~EDGE_FALLTHRU;
1089 et->flags |= EDGE_TRUE_VALUE;
1090 /* Make the active worker the more probable path so we prefer fallthrough
1091 (letting the idle workers jump around more). */
1092 et->probability = profile_probability::likely ();
1094 basic_block body = et->dest;
1096 edge ef = make_edge (from, barrier_block, EDGE_FALSE_VALUE);
1097 ef->probability = et->probability.invert ();
1099 gimple_stmt_iterator bar_gsi = gsi_start_bb (barrier_block);
1100 cond = gimple_build_cond (NE_EXPR, recv_tmp, zero_ptr, NULL_TREE, NULL_TREE);
/* A real broadcast (or a gang-private write) needs all workers to
   synchronize before the copy-out test.  */
1102 if (record_type != char_type_node || has_gang_private_write)
1104 decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
1105 gimple *acc_bar = gimple_build_call (decl, 0);
1107 gsi_insert_before (&bar_gsi, acc_bar, GSI_NEW_STMT);
1108 gsi_insert_after (&bar_gsi, cond, GSI_NEW_STMT);
1111 gsi_insert_before (&bar_gsi, cond, GSI_NEW_STMT);
1113 edge et2 = split_block (barrier_block, cond);
1114 et2->flags &= ~EDGE_FALLTHRU;
1115 et2->flags |= EDGE_TRUE_VALUE;
1116 et2->probability = profile_probability::unlikely ();
1118 basic_block exit_block = et2->dest;
1120 basic_block copyout_block = split_edge (et2);
1121 edge ef2 = make_edge (barrier_block, exit_block, EDGE_FALSE_VALUE);
1122 ef2->probability = et2->probability.invert ();
1124 gimple_stmt_iterator copyout_gsi = gsi_start_bb (copyout_block);
1126 edge copyout_to_exit = single_succ_edge (copyout_block);
1128 gimple_seq sender_seq = NULL;
1130 /* Make sure we iterate over definitions in a stable order. */
1131 auto_vec<tree> escape_vec (def_escapes_block->elements ());
1132 for (hash_set<tree>::iterator it = def_escapes_block->begin ();
1133 it != def_escapes_block->end (); ++it)
1134 escape_vec.quick_push (*it);
1135 escape_vec.qsort (sort_by_ssa_version_or_uid);
1137 for (unsigned i = 0; i < escape_vec.length (); i++)
1139 tree var = escape_vec[i];
/* Virtual operands are handled by the vdef effect of single_copy_end.  */
1141 if (TREE_CODE (var) == SSA_NAME && SSA_NAME_IS_VIRTUAL_OPERAND (var))
1144 tree barrier_def = 0;
1146 if (TREE_CODE (var) == SSA_NAME)
1148 gimple *def_stmt = SSA_NAME_DEF_STMT (var);
1150 if (gimple_nop_p (def_stmt))
1153 /* The barrier phi takes one result from the actual work of the
1154 block we're neutering, and the other result is constant zero of
1157 gphi *barrier_phi = create_phi_node (NULL_TREE, barrier_block);
1158 barrier_def = create_new_def_for (var, barrier_phi,
1159 gimple_phi_result_ptr (barrier_phi));
1161 add_phi_arg (barrier_phi, var, e, UNKNOWN_LOCATION);
1162 add_phi_arg (barrier_phi, build_zero_cst (TREE_TYPE (var)), ef,
1165 update_stmt (barrier_phi);
1168 gcc_assert (TREE_CODE (var) == VAR_DECL);
1170 /* If we had no record type, we will have no fields map. */
1171 field_map_t *fields = record_field_map->get (record_type);
1173 if (worker_partitioned_uses->contains (var)
1175 && fields->get (var))
1177 tree neutered_def = make_ssa_name (TREE_TYPE (var));
1179 /* Receive definition from shared memory block. */
1181 tree receiver_ref = build_receiver_ref (var, receiver_decl, fields);
1182 gassign *recv = gimple_build_assign (neutered_def,
1184 gsi_insert_after (&copyout_gsi, recv, GSI_CONTINUE_LINKING);
1187 if (TREE_CODE (var) == VAR_DECL)
1189 /* If it's a VAR_DECL, we only copied to an SSA temporary. Copy
1190 to the final location now. */
1191 gassign *asgn = gimple_build_assign (var, neutered_def);
1192 gsi_insert_after (&copyout_gsi, asgn, GSI_CONTINUE_LINKING);
1197 /* If it's an SSA name, create a new phi at the join node to
1198 represent either the output from the active worker (the
1199 barrier) or the inactive workers (the copyout block). */
1200 gphi *join_phi = create_phi_node (NULL_TREE, exit_block);
1201 create_new_def_for (barrier_def, join_phi,
1202 gimple_phi_result_ptr (join_phi));
1203 add_phi_arg (join_phi, barrier_def, ef2, UNKNOWN_LOCATION);
1204 add_phi_arg (join_phi, neutered_def, copyout_to_exit,
1206 update_stmt (join_phi);
1209 /* Send definition to shared memory block. */
1211 tree sender_ref = build_sender_ref (var, sender_decl, fields);
1213 if (TREE_CODE (var) == SSA_NAME)
1215 gassign *send = gimple_build_assign (sender_ref, var);
1216 gimple_seq_add_stmt (&sender_seq, send);
1219 else if (TREE_CODE (var) == VAR_DECL)
/* Aggregate-free VAR_DECLs go through an SSA temporary so the store
   into the record is a simple scalar assignment.  */
1221 tree tmp = make_ssa_name (TREE_TYPE (var));
1222 gassign *send = gimple_build_assign (tmp, var);
1223 gimple_seq_add_stmt (&sender_seq, send);
1225 send = gimple_build_assign (sender_ref, tmp);
1226 gimple_seq_add_stmt (&sender_seq, send);
1234 /* The shared-memory range for this block overflowed. Add a barrier at the
1236 if (isolate_broadcasts)
1238 gsi = gsi_start_bb (exit_block);
1239 decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
1240 gimple *acc_bar = gimple_build_call (decl, 0);
1241 gsi_insert_before (&gsi, acc_bar, GSI_SAME_STMT);
1244 /* It's possible for the ET->DEST block (the work done by the active thread)
1245 to finish with a control-flow insn, e.g. a UNIQUE function call. Split
1246 the block and add SENDER_SEQ in the latter part to avoid having control
1247 flow in the middle of a BB. */
1249 decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_COPY_END);
1250 call = gimple_build_call (decl, 1,
1251 POINTER_TYPE_P (TREE_TYPE (sender_decl))
1253 : build_fold_addr_expr (sender_decl));
1254 gimple_seq_add_stmt (&sender_seq, call);
1256 gsi = gsi_last_bb (body);
1257 gimple *last = gsi_stmt (gsi);
1258 basic_block sender_block = split_block (body, last)->dest;
1259 gsi = gsi_last_bb (sender_block);
1260 gsi_insert_seq_after (&gsi, sender_seq, GSI_CONTINUE_LINKING);
1263 typedef hash_map<basic_block, std::pair<unsigned HOST_WIDE_INT, bool> >
/* Neuter (predicate for worker zero) the blocks of the parallel region PAR
   that execute in worker-single mode, and recurse over PAR's children and
   siblings.  OUTER_MASK is the partitioning already active outside PAR;
   WORKER_SINGLE/VECTOR_SINGLE are bitmaps of block indices executing in
   worker-/vector-single mode.  PROP_SET maps block index to the set of
   variables to broadcast (entries are consumed and zeroed here).
   PARTITIONED_VAR_USES, RECORD_FIELD_MAP, BLK_OFFSET_MAP and
   WRITES_GANG_PRIVATE were computed by earlier phases of the pass.  */
1267 neuter_worker_single (parallel_g *par, unsigned outer_mask,
1268 bitmap worker_single, bitmap vector_single,
1269 vec<propagation_set *> *prop_set,
1270 hash_set<tree> *partitioned_var_uses,
1271 record_field_map_t *record_field_map,
1272 blk_offset_map_t *blk_offset_map,
1273 bitmap writes_gang_private)
1275 unsigned mask = outer_mask | par->mask;
/* Only blocks at a depth where the worker dimension is not (yet)
   partitioned need neutering.  */
1277 if ((mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0)
1281 for (unsigned i = 0; par->blocks.iterate (i, &block); i++)
1283 bool has_defs = false;
/* SSA names defined in BLOCK whose value is needed after the block
   (and must therefore be broadcast from worker zero).  */
1284 hash_set<tree> def_escapes_block;
/* Names also used in worker-partitioned blocks.  */
1285 hash_set<tree> worker_partitioned_uses;
/* Scan every SSA name defined in this block and classify its uses.  */
1289 FOR_EACH_SSA_NAME (j, var, cfun)
1291 if (SSA_NAME_IS_VIRTUAL_OPERAND (var))
1297 gimple *def_stmt = SSA_NAME_DEF_STMT (var);
1299 if (gimple_nop_p (def_stmt))
1302 if (gimple_bb (def_stmt)->index != block->index)
1306 imm_use_iterator use_iter;
1307 bool uses_outside_block = false;
1308 bool worker_partitioned_use = false;
1310 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, var)
1312 int blocknum = gimple_bb (use_stmt)->index;
1314 /* Don't propagate SSA names that are only used in the
1315 current block, unless the usage is in a phi node: that
1316 means the name left the block, then came back in at the
1318 if (blocknum != block->index
1319 || gimple_code (use_stmt) == GIMPLE_PHI)
1320 uses_outside_block = true;
1321 if (!bitmap_bit_p (worker_single, blocknum))
1322 worker_partitioned_use = true;
1325 if (uses_outside_block)
1326 def_escapes_block.add (var);
1328 if (worker_partitioned_use)
1330 worker_partitioned_uses.add (var);
/* Add the address-taken/local variables recorded for this block by
   find_local_vars_to_propagate to the broadcast sets as well.  */
1335 propagation_set *ws_prop = (*prop_set)[block->index];
1339 for (propagation_set::iterator it = ws_prop->begin ();
1340 it != ws_prop->end ();
1344 if (TREE_CODE (var) == VAR_DECL)
1346 def_escapes_block.add (var);
1347 if (partitioned_var_uses->contains (var))
1349 worker_partitioned_uses.add (var);
/* The propagation set entry is consumed; clear the slot so the
   caller's final sanity check (all entries deleted) passes.  */
1356 (*prop_set)[block->index] = 0;
/* Decide whether the block contains only marker-style internal calls
   that the later oaccdevlow pass removes anyway.  */
1359 bool only_marker_fns = true;
1360 bool join_block = false;
1362 for (gimple_stmt_iterator gsi = gsi_start_bb (block);
1366 gimple *stmt = gsi_stmt (gsi);
1367 if (gimple_code (stmt) == GIMPLE_CALL
1368 && gimple_call_internal_p (stmt, IFN_UNIQUE))
1370 enum ifn_unique_kind k = ((enum ifn_unique_kind)
1371 TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
1372 if (k != IFN_UNIQUE_OACC_PRIVATE
1373 && k != IFN_UNIQUE_OACC_JOIN
1374 && k != IFN_UNIQUE_OACC_FORK
1375 && k != IFN_UNIQUE_OACC_HEAD_MARK
1376 && k != IFN_UNIQUE_OACC_TAIL_MARK)
1377 only_marker_fns = false;
1378 else if (k == IFN_UNIQUE_OACC_JOIN)
1379 /* The JOIN marker is special in that it *cannot* be
1380 predicated for worker zero, because it may be lowered
1381 to a barrier instruction and all workers must typically
1382 execute that barrier. We shouldn't be doing any
1383 broadcasts from the join block anyway. */
1386 else if (gimple_code (stmt) == GIMPLE_CALL
1387 && gimple_call_internal_p (stmt, IFN_GOACC_LOOP))
1389 else if (gimple_nop_p (stmt))
1392 only_marker_fns = false;
1395 /* We can skip predicating this block for worker zero if the only
1396 thing it contains is marker functions that will be removed in the
1397 oaccdevlow pass anyway.
1398 Don't do this if the block has (any) phi nodes, because those
1399 might define SSA names that need broadcasting.
1400 TODO: We might be able to skip transforming blocks that only
1401 contain some other trivial statements too. */
1402 if (only_marker_fns && !phi_nodes (block))
1405 gcc_assert (!join_block);
/* If there is anything to broadcast, the block's record type was
   stored in BLOCK->aux by oacc_do_neutering, and the shared-memory
   offset (or overflow marker) in BLK_OFFSET_MAP.  */
1409 tree record_type = (tree) block->aux;
1410 std::pair<unsigned HOST_WIDE_INT, bool> *off_rngalloc
1411 = blk_offset_map->get (block);
1412 gcc_assert (!record_type || off_rngalloc);
1413 unsigned HOST_WIDE_INT offset
1414 = off_rngalloc ? off_rngalloc->first : 0;
/* Second member FALSE means first-fit allocation failed and
   broadcasts from this block must be isolated with barriers.  */
1415 bool range_allocated
1416 = off_rngalloc ? off_rngalloc->second : true;
1417 bool has_gang_private_write
1418 = bitmap_bit_p (writes_gang_private, block->index);
1419 worker_single_copy (block, block, &def_escapes_block,
1420 &worker_partitioned_uses, record_type,
1422 offset, !range_allocated,
1423 has_gang_private_write);
/* No values escape: plain neutering without broadcast suffices.  */
1426 worker_single_simple (block, block, &def_escapes_block);
/* When the enclosing context is itself worker-single, bracket OpenACC
   routine calls with barriers (see comment below).  */
1430 if ((outer_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0)
1434 for (unsigned i = 0; par->blocks.iterate (i, &block); i++)
1435 for (gimple_stmt_iterator gsi = gsi_start_bb (block);
1439 gimple *stmt = gsi_stmt (gsi);
1441 if (gimple_code (stmt) == GIMPLE_CALL
1442 && !gimple_call_internal_p (stmt)
1443 && !omp_sese_active_worker_call (as_a <gcall *> (stmt)))
1445 /* If we have an OpenACC routine call in worker-single mode,
1446 place barriers before and afterwards to prevent
1447 clobbering re-used shared memory regions (as are used
1448 for AMDGCN at present, for example). */
1449 tree decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
1450 gsi_insert_before (&gsi, gimple_build_call (decl, 0),
1452 gsi_insert_after (&gsi, gimple_build_call (decl, 0),
/* Recurse: children see the accumulated MASK, siblings only the
   original OUTER_MASK.  NOTE(review): the guards on these recursive
   calls (non-null par->inner/par->next) sit on elided lines — confirm.  */
1459 neuter_worker_single (par->inner, mask, worker_single, vector_single,
1460 prop_set, partitioned_var_uses, record_field_map,
1461 blk_offset_map, writes_gang_private);
1463 neuter_worker_single (par->next, outer_mask, worker_single, vector_single,
1464 prop_set, partitioned_var_uses, record_field_map,
1465 blk_offset_map, writes_gang_private);
/* Depth-first walk of the CFG from BB, recording in REACHABLE the indices
   of blocks found along successor edges.  Uses the BB_VISITED flag for
   cycle termination; callers clear that flag on all blocks beforehand
   (see oacc_do_neutering).  NOTE(review): the branch structure inside the
   edge loop (when to set the bit vs. when to recurse) sits on elided
   lines — confirm against the unmangled source.  */
1469 dfs_broadcast_reachable_1 (basic_block bb, sbitmap reachable)
1471 if (bb->flags & BB_VISITED)
1474 bb->flags |= BB_VISITED;
1480 FOR_EACH_EDGE (e, ei, bb->succs)
1482 basic_block dest = e->dest;
1484 bitmap_set_bit (reachable, dest->index);
1486 dfs_broadcast_reachable_1 (dest, reachable);
/* (basic-block index, broadcast record type) pair; the second member's
   TYPE_SIZE_UNIT drives the size-descending allocation order.  */
1491 typedef std::pair<int, tree> idx_decl_pair_t;
/* Per-basic-block splay trees of address ranges already allocated in the
   shared-memory buffer.  */
1493 typedef auto_vec<splay_tree> used_range_vec_t;
1496 sort_size_descending (const void *a, const void *b)
1498 const idx_decl_pair_t *pa = (const idx_decl_pair_t *) a;
1499 const idx_decl_pair_t *pb = (const idx_decl_pair_t *) b;
1500 unsigned HOST_WIDE_INT asize = tree_to_uhwi (TYPE_SIZE_UNIT (pa->second));
1501 unsigned HOST_WIDE_INT bsize = tree_to_uhwi (TYPE_SIZE_UNIT (pb->second));
1502 return bsize - asize;
/* A byte range [LO, HI) in the shared-memory buffer; the comparator below
   treats AR->hi <= BR->lo as "wholly before", so HI is exclusive.  */
1508 addr_range (unsigned HOST_WIDE_INT addr_lo, unsigned HOST_WIDE_INT addr_hi)
1509 : lo (addr_lo), hi (addr_hi)
1511 addr_range (const addr_range &ar) : lo (ar.lo), hi (ar.hi)
/* Default-constructed range is the "invalid" sentinel (0, 0), used by
   first_fit_range to signal allocation failure.  */
1513 addr_range () : lo (0), hi (0)
1516 bool invalid () { return lo == 0 && hi == 0; }
1518 unsigned HOST_WIDE_INT lo;
1519 unsigned HOST_WIDE_INT hi;
/* splay_tree comparison callback for addr_range keys.  Ranges compare as
   ordered only when they are disjoint; overlapping ranges compare equal,
   so a lookup finds any conflicting allocation (relied on by
   merge_ranges_1 and the conflict check in oacc_do_neutering).
   NOTE(review): the actual return statements sit on elided lines.  */
1523 splay_tree_compare_addr_range (splay_tree_key a, splay_tree_key b)
1525 addr_range *ar = (addr_range *) a;
1526 addr_range *br = (addr_range *) b;
1527 if (ar->lo == br->lo && ar->hi == br->hi)
1529 if (ar->hi <= br->lo)
1531 else if (ar->lo >= br->hi)
1537 splay_tree_free_key (splay_tree_key k)
1539 addr_range *ar = (addr_range *) k;
/* First-fit allocator: find the lowest ALIGN-aligned range of SIZE bytes
   within *BOUNDS that does not collide with any range recorded in the
   splay tree S.  Gaps between consecutive occupied ranges are tried in
   ascending address order, then the space after the last occupied range.
   Returns the chosen range, or the invalid (0, 0) range if nothing fits.
   NOTE(review): the branch taken when S is empty (the trailing lines
   using BOUNDS->lo directly) is separated from the non-empty case by
   elided lines — confirm the exact nesting.  */
1544 first_fit_range (splay_tree s, unsigned HOST_WIDE_INT size,
1545 unsigned HOST_WIDE_INT align, addr_range *bounds)
1547 splay_tree_node min = splay_tree_min (s);
1550 splay_tree_node next;
/* Walk consecutive occupied ranges; the gap is [min->hi, next->lo).  */
1551 while ((next = splay_tree_successor (s, min->key)))
1553 unsigned HOST_WIDE_INT lo = ((addr_range *) min->key)->hi;
1554 unsigned HOST_WIDE_INT hi = ((addr_range *) next->key)->lo;
/* Round the gap's start up to the requested alignment.  */
1555 unsigned HOST_WIDE_INT base = (lo + align - 1) & ~(align - 1);
1556 if (base + size <= hi)
1557 return addr_range (base, base + size);
/* No interior gap fit: try the space after the last occupied range.  */
1561 unsigned HOST_WIDE_INT base = ((addr_range *)min->key)->hi;
1562 base = (base + align - 1) & ~(align - 1);
1563 if (base + size <= bounds->hi)
1564 return addr_range (base, base + size);
1566 return addr_range ();
/* Empty tree: allocate at the (aligned) start of the bounds.  */
1570 unsigned HOST_WIDE_INT lo = bounds->lo;
1571 lo = (lo + align - 1) & ~(align - 1);
1572 if (lo + size <= bounds->hi)
1573 return addr_range (lo, lo + size);
1575 return addr_range ();
/* splay_tree_foreach callback: fold the range at node N into the
   accumulator tree passed via PTR.  Because the comparator treats
   overlapping ranges as equal, a lookup hit means overlap; the old entry
   is removed and replaced by the union of the two ranges.  */
1580 merge_ranges_1 (splay_tree_node n, void *ptr)
1582 splay_tree accum = (splay_tree) ptr;
1583 addr_range ar = *(addr_range *) n->key;
1585 splay_tree_node old = splay_tree_lookup (accum, n->key);
1587 /* We might have an overlap. Create a new range covering the
1588 overlapping parts. */
1591 addr_range *old_ar = (addr_range *) old->key;
1592 ar.lo = MIN (old_ar->lo, ar.lo);
1593 ar.hi = MAX (old_ar->hi, ar.hi);
/* The removed key is freed by the tree's splay_tree_free_key hook.  */
1594 splay_tree_remove (accum, old->key);
/* Keys are owned by the tree; allocate a fresh one for insertion.  */
1597 addr_range *new_ar = new addr_range (ar);
1599 splay_tree_insert (accum, (splay_tree_key) new_ar, n->value);
/* Merge all address ranges recorded in SP into ACCUM, unioning any
   entries that overlap (see merge_ranges_1).  */
1605 merge_ranges (splay_tree accum, splay_tree sp)
1607 splay_tree_foreach (sp, merge_ranges_1, (void *) accum);
/* Top-level driver for worker neutering/broadcasting in the current
   function.  [BOUNDS_LO, BOUNDS_HI) is the shared-memory address window
   the target makes available for broadcast buffers.  The phases are:
   split blocks at mode-change points, discover the parallel-region tree,
   compute broadcast sets per block, build a record type per broadcasting
   block, lay those records out in shared memory with a first-fit
   allocator (reusing space between blocks that cannot execute
   concurrently), and finally neuter the worker-single blocks.  */
1611 oacc_do_neutering (unsigned HOST_WIDE_INT bounds_lo,
1612 unsigned HOST_WIDE_INT bounds_hi)
1614 bb_stmt_map_t bb_stmt_map;
1615 auto_bitmap worker_single, vector_single;
1617 omp_sese_split_blocks (&bb_stmt_map);
1621 fprintf (dump_file, "\n\nAfter splitting:\n\n");
1622 dump_function_to_file (current_function_decl, dump_file, dump_flags);
1627 /* If this is a routine, calculate MASK as if the outer levels are already
1630 tree attr = oacc_get_fn_attrib (current_function_decl);
1631 tree dims = TREE_VALUE (attr);
/* A dimension whose TREE_PURPOSE is zero is fixed outside this routine;
   treat it as already partitioned.  */
1633 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
1635 tree allowed = TREE_PURPOSE (dims);
1636 if (allowed && integer_zerop (allowed))
1637 mask |= GOMP_DIM_MASK (ix);
1641 parallel_g *par = omp_sese_discover_pars (&bb_stmt_map);
1642 populate_single_mode_bitmaps (par, worker_single, vector_single, mask, 0);
1645 FOR_ALL_BB_FN (bb, cfun)
/* Per-block sets of variables that must be broadcast; slots are handed
   over to (and cleared by) neuter_worker_single.  */
1648 vec<propagation_set *> prop_set (vNULL);
1649 prop_set.safe_grow_cleared (last_basic_block_for_fn (cfun), true);
1651 find_ssa_names_to_propagate (par, mask, worker_single, vector_single,
1654 hash_set<tree> partitioned_var_uses;
1655 hash_set<tree> gang_private_vars;
1656 auto_bitmap writes_gang_private;
1658 find_gang_private_vars (&gang_private_vars);
1659 find_partitioned_var_uses (par, mask, &partitioned_var_uses);
1660 find_local_vars_to_propagate (par, mask, &partitioned_var_uses,
1661 &gang_private_vars, writes_gang_private,
/* Build one RECORD_TYPE per broadcasting block, with a field for each
   propagated variable, and stash it in the block's AUX slot.  */
1664 record_field_map_t record_field_map;
1666 FOR_ALL_BB_FN (bb, cfun)
1668 propagation_set *ws_prop = prop_set[bb->index];
1671 tree record_type = lang_hooks.types.make_type (RECORD_TYPE);
1672 tree name = create_tmp_var_name (".oacc_ws_data_s");
1673 name = build_decl (UNKNOWN_LOCATION, TYPE_DECL, name, record_type);
1674 DECL_ARTIFICIAL (name) = 1;
1675 DECL_NAMELESS (name) = 1;
1676 TYPE_NAME (record_type) = name;
1677 TYPE_ARTIFICIAL (record_type) = 1;
1679 auto_vec<tree> field_vec (ws_prop->elements ());
1680 for (hash_set<tree>::iterator it = ws_prop->begin ();
1681 it != ws_prop->end (); ++it)
1682 field_vec.quick_push (*it);
/* Deterministic field order: hash_set iteration order is not stable.  */
1684 field_vec.qsort (sort_by_size_then_ssa_version_or_uid);
1688 = &record_field_map.get_or_insert (record_type, &existed);
1689 gcc_checking_assert (!existed);
1691 /* Insert var fields in reverse order, so the last inserted element
1692 is the first in the structure. */
1693 for (int i = field_vec.length () - 1; i >= 0; i--)
1694 install_var_field (field_vec[i], record_type, fields);
1696 layout_type (record_type);
1698 bb->aux = (tree) record_type;
/* REACHABLE[i] will describe which blocks' buffers may be live at the
   same time as block i's, i.e. whose ranges conflict with it.  */
1703 = sbitmap_vector_alloc (last_basic_block_for_fn (cfun),
1704 last_basic_block_for_fn (cfun));
1706 bitmap_vector_clear (reachable, last_basic_block_for_fn (cfun));
1708 auto_vec<std::pair<int, tree> > priority;
1710 FOR_ALL_BB_FN (bb, cfun)
1714 tree record_type = (tree) bb->aux;
/* Clear BB_VISITED before each DFS (dfs_broadcast_reachable_1 uses it
   as its termination marker).  */
1717 FOR_ALL_BB_FN (bb2, cfun)
1718 bb2->flags &= ~BB_VISITED;
1720 priority.safe_push (std::make_pair (bb->index, record_type));
1721 dfs_broadcast_reachable_1 (bb, reachable[bb->index]);
/* Symmetrize the reachability relation: build the transpose, then OR it
   in, so "conflicts" is mutual.  */
1726 = sbitmap_vector_alloc (last_basic_block_for_fn (cfun),
1727 last_basic_block_for_fn (cfun));
1729 bitmap_vector_clear (inverted, last_basic_block_for_fn (cfun));
1731 for (int i = 0; i < last_basic_block_for_fn (cfun); i++)
1733 sbitmap_iterator bi;
1735 EXECUTE_IF_SET_IN_BITMAP (reachable[i], 0, j, bi)
1736 bitmap_set_bit (inverted[j], i);
1739 for (int i = 0; i < last_basic_block_for_fn (cfun); i++)
1740 bitmap_ior (reachable[i], reachable[i], inverted[i]);
1742 sbitmap_vector_free (inverted);
1744 used_range_vec_t used_ranges;
1746 used_ranges.safe_grow_cleared (last_basic_block_for_fn (cfun));
1748 blk_offset_map_t blk_offset_map;
1750 addr_range worker_shm_bounds (bounds_lo, bounds_hi);
/* Allocate shared memory for the biggest records first.  */
1752 priority.qsort (sort_size_descending);
1753 for (unsigned int i = 0; i < priority.length (); i++)
1755 idx_decl_pair_t p = priority[i];
1756 int blkno = p.first;
1757 tree record_type = p.second;
1758 HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (record_type));
1759 HOST_WIDE_INT align = TYPE_ALIGN_UNIT (record_type);
/* Gather every range already used by this block or any block whose
   buffer could be live concurrently (the REACHABLE set).  */
1761 splay_tree conflicts = splay_tree_new (splay_tree_compare_addr_range,
1762 splay_tree_free_key, NULL);
1764 if (!used_ranges[blkno])
1765 used_ranges[blkno] = splay_tree_new (splay_tree_compare_addr_range,
1766 splay_tree_free_key, NULL);
1768 merge_ranges (conflicts, used_ranges[blkno]);
1770 sbitmap_iterator bi;
1772 EXECUTE_IF_SET_IN_BITMAP (reachable[blkno], 0, j, bi)
1774 merge_ranges (conflicts, used_ranges[j]);
1777 = first_fit_range (conflicts, size, align, &worker_shm_bounds);
1779 splay_tree_delete (conflicts);
/* First-fit failed: fall back to the start of the window and record
   the allocation as "not range-allocated" so the consumer isolates
   the broadcast with barriers; diagnose if even that overflows.  */
1783 unsigned HOST_WIDE_INT base
1784 = (bounds_lo + align - 1) & ~(align - 1);
1785 if (base + size > bounds_hi)
1786 error_at (UNKNOWN_LOCATION, "shared-memory region overflow");
1787 std::pair<unsigned HOST_WIDE_INT, bool> base_inrng
1788 = std::make_pair (base, false);
1789 blk_offset_map.put (BASIC_BLOCK_FOR_FN (cfun, blkno), base_inrng);
/* Sanity check: the chosen range must not already be mapped.  */
1793 splay_tree_node old = splay_tree_lookup (used_ranges[blkno],
1794 (splay_tree_key) &ar);
1797 fprintf (stderr, "trying to map [%d..%d] but [%d..%d] is "
1798 "already mapped in block %d\n", (int) ar.lo,
1799 (int) ar.hi, (int) ((addr_range *) old->key)->lo,
1800 (int) ((addr_range *) old->key)->hi, blkno);
/* Record the successful allocation; the key is owned (and later freed)
   by the splay tree via splay_tree_free_key.  */
1804 addr_range *arp = new addr_range (ar);
1805 splay_tree_insert (used_ranges[blkno], (splay_tree_key) arp,
1806 (splay_tree_value) blkno);
1807 std::pair<unsigned HOST_WIDE_INT, bool> base_inrng
1808 = std::make_pair (ar.lo, true);
1809 blk_offset_map.put (BASIC_BLOCK_FOR_FN (cfun, blkno), base_inrng);
1813 sbitmap_vector_free (reachable);
1815 neuter_worker_single (par, mask, worker_single, vector_single, &prop_set,
1816 &partitioned_var_uses, &record_field_map,
1817 &blk_offset_map, writes_gang_private);
1819 record_field_map.empty ();
1821 /* These are supposed to have been 'delete'd by 'neuter_worker_single'. */
1822 for (auto it : prop_set)
1823 gcc_checking_assert (!it);
1824 prop_set.release ();
1828 /* This doesn't seem to make a difference. */
1829 loops_state_clear (LOOP_CLOSED_SSA);
1831 /* Neutering worker-single neutered blocks will invalidate dominance info.
1832 It may be possible to incrementally update just the affected blocks, but
1833 obliterate everything for now. */
1834 free_dominance_info (CDI_DOMINATORS);
1835 free_dominance_info (CDI_POST_DOMINATORS);
1839 fprintf (dump_file, "\n\nAfter neutering:\n\n");
1840 dump_function_to_file (current_function_decl, dump_file, dump_flags);
/* Pass entry point.  Scan the offloaded function for IFN_GOACC_REDUCTION
   and IFN_UNIQUE(OACC_PRIVATE) internal calls to compute, per partitioning
   level, how much shared memory reductions and gang-private variables
   consume; ask the target for the remaining shared-memory window; then run
   the neutering transformation unless the function is known to run with a
   single worker.  */
1845 execute_omp_oacc_neuter_broadcast ()
1847 unsigned HOST_WIDE_INT reduction_size[GOMP_DIM_MAX];
1848 unsigned HOST_WIDE_INT private_size[GOMP_DIM_MAX];
1850 for (unsigned i = 0; i < GOMP_DIM_MAX; i++)
1852 reduction_size[i] = 0;
1853 private_size[i] = 0;
1856 /* Calculate shared memory size required for reduction variables and
1857 gang-private memory for this offloaded function. */
1859 FOR_ALL_BB_FN (bb, cfun)
1861 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
1865 gimple *stmt = gsi_stmt (gsi);
1866 if (!is_gimple_call (stmt))
1868 gcall *call = as_a <gcall *> (stmt);
1869 if (!gimple_call_internal_p (call))
1871 enum internal_fn ifn_code = gimple_call_internal_fn (call);
1875 case IFN_GOACC_REDUCTION:
/* Argument 3 of -1 means the level is not yet known; skip.  */
1876 if (integer_minus_onep (gimple_call_arg (call, 3)))
1880 unsigned code = TREE_INT_CST_LOW (gimple_call_arg (call, 0));
1881 /* Only count reduction variables once: the choice to pick
1882 the setup call is fairly arbitrary. */
1883 if (code == IFN_GOACC_REDUCTION_SETUP)
1885 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
1886 tree var = gimple_call_arg (call, 2);
1887 tree offset = gimple_call_arg (call, 5);
1888 tree var_type = TREE_TYPE (var);
/* The reduction buffer for LEVEL must reach at least to the end
   of this variable (its offset plus its size).  */
1889 unsigned HOST_WIDE_INT limit
1890 = (tree_to_uhwi (offset)
1891 + tree_to_uhwi (TYPE_SIZE_UNIT (var_type)))
1892 reduction_size[level]
1893 = MAX (reduction_size[level], limit);
1899 enum ifn_unique_kind kind
1900 = ((enum ifn_unique_kind)
1901 TREE_INT_CST_LOW (gimple_call_arg (call, 0)));
1903 if (kind == IFN_UNIQUE_OACC_PRIVATE)
1906 = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
/* Arguments 3.. are the addresses of the privatized decls; pack
   each, suitably aligned, into the per-level private size.  */
1909 for (unsigned i = 3;
1910 i < gimple_call_num_args (call);
1913 tree arg = gimple_call_arg (call, i);
1914 gcc_assert (TREE_CODE (arg) == ADDR_EXPR);
1915 tree decl = TREE_OPERAND (arg, 0);
1916 unsigned HOST_WIDE_INT align = DECL_ALIGN_UNIT (decl);
1917 private_size[level] = ((private_size[level] + align - 1)
1919 unsigned HOST_WIDE_INT decl_size
1920 = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (decl)));
1921 private_size[level] += decl_size;
1930 int dims[GOMP_DIM_MAX];
1931 for (unsigned i = 0; i < GOMP_DIM_MAX; i++)
1932 dims[i] = oacc_get_fn_dim_size (current_function_decl, i);
1934 /* Find bounds of shared-memory buffer space we can use. */
1935 unsigned HOST_WIDE_INT bounds_lo = 0, bounds_hi = 0;
1936 if (targetm.goacc.shared_mem_layout)
1937 targetm.goacc.shared_mem_layout (&bounds_lo, &bounds_hi, dims,
1938 private_size, reduction_size);
1940 /* Perform worker partitioning unless we know 'num_workers(1)'. */
1941 if (dims[GOMP_DIM_WORKER] != 1)
1942 oacc_do_neutering (bounds_lo, bounds_hi);
/* Pass descriptor for the omp_oacc_neuter_broadcast GIMPLE pass.  */
1949 const pass_data pass_data_omp_oacc_neuter_broadcast =
1951 GIMPLE_PASS, /* type */
1952 "omp_oacc_neuter_broadcast", /* name */
1953 OPTGROUP_OMP, /* optinfo_flags */
1954 TV_NONE, /* tv_id */
1955 PROP_cfg, /* properties_required */
1956 0, /* properties_provided */
1957 0, /* properties_destroyed */
1958 0, /* todo_flags_start */
/* The transformation invalidates SSA form and leaves dead CFG edges;
   request both repairs after the pass runs.  */
1959 TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
/* Pass wrapper: gates on targets that implement the worker-broadcast
   record hook and on OpenACC offloaded functions, then dispatches to
   execute_omp_oacc_neuter_broadcast.  */
1962 class pass_omp_oacc_neuter_broadcast : public gimple_opt_pass
1965 pass_omp_oacc_neuter_broadcast (gcc::context *ctxt)
1966 : gimple_opt_pass (pass_data_omp_oacc_neuter_broadcast, ctxt)
1969 /* opt_pass methods: */
1970 virtual bool gate (function *fun)
/* Targets without the broadcast-record hook use a different (back-end)
   neutering scheme; skip this middle-end pass for them.  */
1975 if (!targetm.goacc.create_worker_broadcast_record)
1978 /* Only relevant for OpenACC offloaded functions. */
1979 tree attr = oacc_get_fn_attrib (fun->decl);
1986 virtual unsigned int execute (function *)
1988 return execute_omp_oacc_neuter_broadcast ();
1991 }; // class pass_omp_oacc_neuter_broadcast
/* Factory function used by the pass manager (declared in tree-pass.h)
   to instantiate the pass.  */
1996 make_pass_omp_oacc_neuter_broadcast (gcc::context *ctxt)
1998 return new pass_omp_oacc_neuter_broadcast (ctxt);