1 /* OpenACC worker partitioning via middle end neutering/broadcasting scheme
3 Copyright (C) 2015-2021 Free Software Foundation, Inc.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
23 #include "coretypes.h"
28 #include "tree-pass.h"
31 #include "pretty-print.h"
32 #include "fold-const.h"
34 #include "gimple-iterator.h"
35 #include "gimple-walk.h"
36 #include "tree-inline.h"
37 #include "langhooks.h"
38 #include "omp-general.h"
40 #include "gimple-pretty-print.h"
42 #include "insn-config.h"
44 #include "internal-fn.h"
46 #include "tree-nested.h"
47 #include "stor-layout.h"
48 #include "tree-ssa-threadupdate.h"
49 #include "tree-into-ssa.h"
50 #include "splay-tree.h"
54 #include "omp-offload.h"
57 /* Loop structure of the function. The entire function is described as
59 /* Adapted from 'gcc/config/nvptx/nvptx.c:struct parallel'. */
/* NOTE(review): the 'struct parallel_g' declaration line itself is elided
   from this listing; the members below belong to that struct, which forms
   a tree of nested OpenACC partitioned regions ("parallels").  */
63 /* Parent parallel. */
66 /* Next sibling parallel. */
69 /* First child parallel. */
72 /* Partitioning mask of the parallel. */
75 /* Partitioning used within inner parallels. */
78 /* Location of parallel forked and join. The forked is the first
79 block in the parallel and the join is the first block after of
81 basic_block forked_block;
82 basic_block join_block;
90 /* Basic blocks in this parallel, but not in child parallels. The
91 FORKED and JOINING blocks are in the partition. The FORK and JOIN
93 auto_vec<basic_block> blocks;
/* Constructor: links the new parallel under PARENT with partitioning
   mask MODE (see the out-of-line definition below).  */
100 parallel_g (parallel_g *parent, unsigned mode);
104 /* Constructor links the new parallel into it's parent's chain of
107 parallel_g::parallel_g (parallel_g *parent_, unsigned mask_)
108 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
/* Clear the fork/join block and statement markers and the broadcast
   record fields; they are filled in later (omp_sese_find_par and the
   propagation machinery).  */
110 forked_block = join_block = 0;
111 forked_stmt = join_stmt = NULL;
112 fork_stmt = joining_stmt = NULL;
114 record_type = NULL_TREE;
115 sender_decl = NULL_TREE;
116 receiver_decl = NULL_TREE;
/* Push this parallel at the head of the parent's chain of children.
   NOTE(review): the 'if (parent)' guard line is elided from this
   listing — confirm against the original source.  */
120 next = parent->inner;
121 parent->inner = this;
/* Destructor.  Body elided from this listing; presumably releases the
   chains of inner and sibling parallels — TODO confirm against the
   original source.  */
125 parallel_g::~parallel_g ()
/* Return TRUE if DECL is, or is based on, a function-local (non-global)
   variable.  NOTE(review): the switch's case labels and default are
   elided from this listing.  */
132 local_var_based_p (tree decl)
134 switch (TREE_CODE (decl))
/* Plain decls: local iff not a global variable.  */
137 return !is_global_var (decl);
/* Component/reference nodes: recurse on the base object.  */
142 return local_var_based_p (TREE_OPERAND (decl, 0));
149 /* Map of basic blocks to gimple stmts. */
/* Populated by omp_sese_split_blocks with each split block's
   "interesting" head/tail statement; consumed by omp_sese_find_par.  */
150 typedef hash_map<basic_block, gimple *> bb_stmt_map_t;
152 /* Calls to OpenACC routines are made by all workers/wavefronts/warps, since
153 the routine likely contains partitioned loops (else will do its own
154 neutering and variable propagation). Return TRUE if a function call CALL
155 should be made in (worker) single mode instead, rather than redundant
/* Return TRUE if the call CALL should be executed in (worker) single
   mode rather than redundantly by all workers: true for regular
   functions (no OpenACC level attribute, level == -1) and for "seq"
   routines.  */
159 omp_sese_active_worker_call (gcall *call)
161 #define GOMP_DIM_SEQ GOMP_DIM_MAX
162 tree fndecl = gimple_call_fndecl (call);
/* NOTE(review): early-return paths (e.g. for indirect calls or calls
   without an OpenACC attribute) are elided from this listing.  */
167 tree attrs = oacc_get_fn_attrib (fndecl);
172 int level = oacc_fn_attrib_level (attrs);
174 /* Neither regular functions nor "seq" routines should be run by all threads
175 in worker-single mode. */
176 return level == -1 || level == GOMP_DIM_SEQ;
180 /* Split basic blocks such that each forked and join unspecs are at
181 the start of their basic blocks. Thus afterwards each block will
182 have a single partitioning mode. We also do the same for return
183 insns, as they are executed by every thread. Return the
184 partitioning mode of the function as a whole. Populate MAP with
185 head and tail blocks. We also clear the BB visited flag, which is
186 used when finding partitions. */
187 /* Adapted from 'gcc/config/nvptx/nvptx.c:nvptx_split_blocks'. */
/* See the block comment above: split basic blocks so each fork/join
   marker, return, condition, switch and (non-internal) call starts its
   own block, record the interesting statement of each block in MAP, and
   clear BB_VISITED ready for parallel discovery.  Phase 1 collects the
   statements of interest into a worklist; phase 2 performs the splits.  */
190 omp_sese_split_blocks (bb_stmt_map_t *map)
192 auto_vec<gimple *> worklist;
195 /* Locate all the reorg instructions of interest. */
196 FOR_ALL_BB_FN (block, cfun)
198 /* Clear visited flag, for use by parallel locator */
199 block->flags &= ~BB_VISITED;
201 for (gimple_stmt_iterator gsi = gsi_start_bb (block);
205 gimple *stmt = gsi_stmt (gsi);
/* IFN_UNIQUE calls carry the OpenACC fork/join markers; argument 0
   encodes the kind.  */
207 if (gimple_call_internal_p (stmt, IFN_UNIQUE))
209 enum ifn_unique_kind k = ((enum ifn_unique_kind)
210 TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
212 if (k == IFN_UNIQUE_OACC_JOIN)
213 worklist.safe_push (stmt);
214 else if (k == IFN_UNIQUE_OACC_FORK)
216 gcc_assert (gsi_one_before_end_p (gsi));
217 basic_block forked_block = single_succ (block);
218 gimple_stmt_iterator gsi2 = gsi_start_bb (forked_block);
220 /* We push a NOP as a placeholder for the "forked" stmt.
221 This is then recognized in omp_sese_find_par. */
222 gimple *nop = gimple_build_nop ();
223 gsi_insert_before (&gsi2, nop, GSI_SAME_STMT);
225 worklist.safe_push (nop);
/* Returns, control-flow statements and user calls are executed by
   every thread, so they also force a block boundary.  */
228 else if (gimple_code (stmt) == GIMPLE_RETURN
229 || gimple_code (stmt) == GIMPLE_COND
230 || gimple_code (stmt) == GIMPLE_SWITCH
231 || (gimple_code (stmt) == GIMPLE_CALL
232 && !gimple_call_internal_p (stmt)
233 && !omp_sese_active_worker_call (as_a <gcall *> (stmt))))
234 worklist.safe_push (stmt);
235 else if (is_gimple_assign (stmt))
237 tree lhs = gimple_assign_lhs (stmt);
239 /* Force assignments to components/fields/elements of local
240 aggregates into fully-partitioned (redundant) mode. This
241 avoids having to broadcast the whole aggregate. The RHS of
242 the assignment will be propagated using the normal
245 switch (TREE_CODE (lhs))
251 tree aggr = TREE_OPERAND (lhs, 0);
253 if (local_var_based_p (aggr))
254 worklist.safe_push (stmt);
265 /* Split blocks on the worklist. */
269 for (ix = 0; worklist.iterate (ix, &stmt); ix++)
271 basic_block block = gimple_bb (stmt);
/* For conditions: compute the predicate into an SSA temporary in the
   old block, and leave only the branch (on that temporary) in the
   block recorded in MAP.  */
273 if (gimple_code (stmt) == GIMPLE_COND)
275 gcond *orig_cond = as_a <gcond *> (stmt);
276 tree_code code = gimple_expr_code (orig_cond);
277 tree pred = make_ssa_name (boolean_type_node);
278 gimple *asgn = gimple_build_assign (pred, code,
279 gimple_cond_lhs (orig_cond),
280 gimple_cond_rhs (orig_cond));
282 = gimple_build_cond (NE_EXPR, pred, boolean_false_node,
283 gimple_cond_true_label (orig_cond),
284 gimple_cond_false_label (orig_cond));
286 gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
287 gsi_insert_before (&gsi, asgn, GSI_SAME_STMT);
288 gsi_replace (&gsi, new_cond, true);
290 edge e = split_block (block, asgn);
292 map->get_or_insert (block) = new_cond;
/* For user calls and forced assignments: isolate the statement into
   a block of its own (split before and after).  */
294 else if ((gimple_code (stmt) == GIMPLE_CALL
295 && !gimple_call_internal_p (stmt))
296 || is_gimple_assign (stmt))
298 gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
301 edge call = split_block (block, gsi_stmt (gsi));
303 gimple *call_stmt = gsi_stmt (gsi_start_bb (call->dest));
305 edge call_to_ret = split_block (call->dest, call_stmt);
307 map->get_or_insert (call_to_ret->src) = call_stmt;
/* Remaining worklist entries (fork/join markers, returns).
   NOTE(review): the branch structure here is partially elided from
   this listing.  */
311 gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
315 map->get_or_insert (block) = stmt;
318 /* Split block before insn. The insn is in the new block. */
319 edge e = split_block (block, gsi_stmt (gsi));
322 map->get_or_insert (block) = stmt;
/* Return a human-readable description of partitioning MASK for dump
   files.  The mask bit layout matches GOMP_DIM_MASK: bit 0 = gang,
   bit 1 = worker, bit 2 = vector.  */
329 mask_name (unsigned mask)
333 case 0: return "gang redundant";
334 case 1: return "gang partitioned";
335 case 2: return "worker partitioned";
336 case 3: return "gang+worker partitioned";
337 case 4: return "vector partitioned";
338 case 5: return "gang+vector partitioned";
339 case 6: return "worker+vector partitioned";
340 case 7: return "fully partitioned";
341 default: return "<illegal>";
345 /* Dump this parallel and all its inner parallels. */
346 /* Adapted from 'gcc/config/nvptx/nvptx.c:nvptx_dump_pars'. */
/* Dump PAR (mask, fork/join block indices and member blocks) to
   dump_file at indentation DEPTH, then recurse into children at
   DEPTH + 1 and siblings at the same DEPTH.  */
349 omp_sese_dump_pars (parallel_g *par, unsigned depth)
351 fprintf (dump_file, "%u: mask %d (%s) head=%d, tail=%d\n",
352 depth, par->mask, mask_name (par->mask),
353 par->forked_block ? par->forked_block->index : -1,
354 par->join_block ? par->join_block->index : -1);
356 fprintf (dump_file, " blocks:");
359 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
360 fprintf (dump_file, " %d", block->index);
361 fprintf (dump_file, "\n");
363 omp_sese_dump_pars (par->inner, depth + 1);
366 omp_sese_dump_pars (par->next, depth);
369 /* If BLOCK contains a fork/join marker, process it to create or
370 terminate a loop structure. Add this block to the current loop,
371 and then walk successor blocks. */
372 /* Adapted from 'gcc/config/nvptx/nvptx.c:nvptx_find_par'. */
/* DFS from BLOCK building the parallel_g tree rooted at PAR, using the
   head/tail statements recorded in MAP by omp_sese_split_blocks.  Fork
   markers open a child parallel; join markers close the current one;
   forced statements (conditions, switches, returns, user calls, forced
   assignments) get a fully-partitioned singleton parallel.  Relies on
   BB_VISITED having been cleared beforehand.  */
375 omp_sese_find_par (bb_stmt_map_t *map, parallel_g *par, basic_block block)
377 if (block->flags & BB_VISITED)
379 block->flags |= BB_VISITED;
381 if (gimple **stmtp = map->get (block))
383 gimple *stmt = *stmtp;
385 if (gimple_code (stmt) == GIMPLE_COND
386 || gimple_code (stmt) == GIMPLE_SWITCH
387 || gimple_code (stmt) == GIMPLE_RETURN
388 || (gimple_code (stmt) == GIMPLE_CALL
389 && !gimple_call_internal_p (stmt))
390 || is_gimple_assign (stmt))
392 /* A single block that is forced to be at the maximum partition
393 level. Make a singleton par for it. */
394 par = new parallel_g (par, GOMP_DIM_MASK (GOMP_DIM_GANG)
395 | GOMP_DIM_MASK (GOMP_DIM_WORKER)
396 | GOMP_DIM_MASK (GOMP_DIM_VECTOR))
397 par->forked_block = block;
398 par->forked_stmt = stmt;
399 par->blocks.safe_push (block);
401 goto walk_successors;
/* A NOP here is the placeholder omp_sese_split_blocks inserted just
   after a fork marker; the real fork call is the last statement of
   the single predecessor block.  */
403 else if (gimple_nop_p (stmt))
405 basic_block pred = single_pred (block);
407 gimple_stmt_iterator gsi = gsi_last_bb (pred);
408 gimple *final_stmt = gsi_stmt (gsi);
410 if (gimple_call_internal_p (final_stmt, IFN_UNIQUE))
412 gcall *call = as_a <gcall *> (final_stmt);
413 enum ifn_unique_kind k = ((enum ifn_unique_kind)
414 TREE_INT_CST_LOW (gimple_call_arg (call, 0)));
416 if (k == IFN_UNIQUE_OACC_FORK)
/* Argument 2 of the fork marker is the partitioned dimension
   (-1 means no specific dimension → empty mask).  */
419 = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
420 unsigned mask = (dim >= 0) ? GOMP_DIM_MASK (dim) : 0;
422 par = new parallel_g (par, mask);
423 par->forked_block = block;
424 par->forked_stmt = final_stmt;
425 par->fork_stmt = stmt;
/* A join marker closes the innermost open parallel; its mask must
   match the one the corresponding fork opened.  */
433 else if (gimple_call_internal_p (stmt, IFN_UNIQUE))
435 gcall *call = as_a <gcall *> (stmt);
436 enum ifn_unique_kind k = ((enum ifn_unique_kind)
437 TREE_INT_CST_LOW (gimple_call_arg (call, 0)));
438 if (k == IFN_UNIQUE_OACC_JOIN)
440 HOST_WIDE_INT dim = TREE_INT_CST_LOW (gimple_call_arg (stmt, 2));
441 unsigned mask = (dim >= 0) ? GOMP_DIM_MASK (dim) : 0;
443 gcc_assert (par->mask == mask);
444 par->join_block = block;
445 par->join_stmt = stmt;
456 /* Add this block onto the current loop's list of blocks. */
457 par->blocks.safe_push (block);
459 /* This must be the entry block. Create a NULL parallel. */
460 par = new parallel_g (0, 0);
463 /* Walk successor blocks. */
467 FOR_EACH_EDGE (e, ei, block->succs)
468 omp_sese_find_par (map, par, e->dest);
473 /* DFS walk the CFG looking for fork & join markers. Construct
474 loop structures as we go. MAP is a mapping of basic blocks
475 to head & tail markers, discovered when splitting blocks. This
476 speeds up the discovery. We rely on the BB visited flag having
477 been cleared when splitting blocks. */
478 /* Adapted from 'gcc/config/nvptx/nvptx.c:nvptx_discover_pars'. */
/* Entry point for parallel discovery: seed the DFS at the entry block
   (exit marked visited so the walk stops there), build the parallel
   tree via omp_sese_find_par, optionally dump it, and return the root.  */
481 omp_sese_discover_pars (bb_stmt_map_t *map)
485 /* Mark exit blocks as visited. */
486 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
487 block->flags |= BB_VISITED;
489 /* And entry block as not. */
490 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
491 block->flags &= ~BB_VISITED;
493 parallel_g *par = omp_sese_find_par (map, 0, block);
497 fprintf (dump_file, "\nLoops\n");
498 omp_sese_dump_pars (par, 0);
499 fprintf (dump_file, "\n");
/* Walk the parallel tree PAR accumulating partitioning into
   OUTER_MASK.  For every member block, set its index in WORKER_SINGLE
   (resp. VECTOR_SINGLE) when the worker (resp. vector) dimension is
   still unpartitioned there.  Children see the accumulated mask;
   siblings see the caller's OUTER_MASK.  */
506 populate_single_mode_bitmaps (parallel_g *par, bitmap worker_single,
507 bitmap vector_single, unsigned outer_mask,
510 unsigned mask = outer_mask | par->mask;
514 for (unsigned i = 0; par->blocks.iterate (i, &block); i++)
516 if ((mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0)
517 bitmap_set_bit (worker_single, block->index);
519 if ((mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)) == 0)
520 bitmap_set_bit (vector_single, block->index);
524 populate_single_mode_bitmaps (par->inner, worker_single, vector_single,
527 populate_single_mode_bitmaps (par->next, worker_single, vector_single,
531 /* A map from SSA names or var decls to record fields. */
533 typedef hash_map<tree, tree> field_map_t;
535 /* For each propagation record type, this is a map from SSA names or var decls
536 to propagate, to the field in the record type that should be used for
537 transmission and reception. */
539 typedef hash_map<tree, field_map_t *> record_field_map_t;
/* GC-rooted (GTY) so the trees held in the map survive garbage
   collection for the duration of the pass.  */
541 static GTY(()) record_field_map_t *field_map;
/* Create a FIELD_DECL for VAR in the broadcast RECORD_TYPE and record
   the var→field association in FIELD_MAP.  The field is named after the
   SSA name/decl identifier, or synthesized from the SSA version /
   DECL_UID when the variable is anonymous.  */
544 install_var_field (tree var, tree record_type)
546 field_map_t *fields = *field_map->get (record_type);
550 if (TREE_CODE (var) == SSA_NAME)
552 name = SSA_NAME_IDENTIFIER (var);
/* Anonymous SSA name: synthesize "_<version>".  */
555 sprintf (tmp, "_%u", (unsigned) SSA_NAME_VERSION (var));
556 name = get_identifier (tmp);
559 else if (TREE_CODE (var) == VAR_DECL)
561 name = DECL_NAME (var);
/* Anonymous decl: synthesize "D_<uid>".  */
564 sprintf (tmp, "D_%u", (unsigned) DECL_UID (var));
565 name = get_identifier (tmp);
/* Each variable must be installed at most once per record.  */
571 gcc_assert (!fields->get (var));
573 tree type = TREE_TYPE (var);
/* Strip 'restrict': the field aliases the original variable by design.  */
575 if (POINTER_TYPE_P (type)
576 && TYPE_RESTRICT (type))
577 type = build_qualified_type (type, TYPE_QUALS (type) & ~TYPE_QUAL_RESTRICT);
579 tree field = build_decl (BUILTINS_LOCATION, FIELD_DECL, name, type);
/* Preserve the decl's alignment/volatility when the type is unchanged;
   otherwise fall back to the natural alignment of the (stripped) type.  */
581 if (TREE_CODE (var) == VAR_DECL && type == TREE_TYPE (var))
583 SET_DECL_ALIGN (field, DECL_ALIGN (var));
584 DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var);
585 TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (var);
588 SET_DECL_ALIGN (field, TYPE_ALIGN (type));
590 fields->put (var, field);
592 insert_field_into_struct (record_type, field);
595 /* Sets of SSA_NAMES or VAR_DECLs to propagate. */
/* One set per basic block (indexed by block index in a vec elsewhere).  */
596 typedef hash_set<tree> propagation_set;
/* Walk the parallel tree PAR and, for blocks executing worker-
   partitioned, find SSA names used there (in PHIs and statements) whose
   definition lies in a worker-single block: those definitions must be
   broadcast.  Record each such name in PROP_SET under the defining
   block's index, allocating the per-block set on first use.  */
599 find_ssa_names_to_propagate (parallel_g *par, unsigned outer_mask,
600 bitmap worker_single, bitmap vector_single,
601 vec<propagation_set *> *prop_set)
603 unsigned mask = outer_mask | par->mask;
606 find_ssa_names_to_propagate (par->inner, mask, worker_single,
607 vector_single, prop_set);
609 find_ssa_names_to_propagate (par->next, outer_mask, worker_single,
610 vector_single, prop_set);
612 if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
617 for (ix = 0; par->blocks.iterate (ix, &block); ix++)
/* PHI arguments first.  */
619 for (gphi_iterator psi = gsi_start_phis (block);
620 !gsi_end_p (psi); gsi_next (&psi))
622 gphi *phi = psi.phi ();
626 FOR_EACH_PHI_ARG (use, phi, iter, SSA_OP_USE)
628 tree var = USE_FROM_PTR (use);
630 if (TREE_CODE (var) != SSA_NAME)
633 gimple *def_stmt = SSA_NAME_DEF_STMT (var);
/* Default definitions (nop def stmt) have no defining block.  */
635 if (gimple_nop_p (def_stmt))
638 basic_block def_bb = gimple_bb (def_stmt);
640 if (bitmap_bit_p (worker_single, def_bb->index))
642 if (!(*prop_set)[def_bb->index])
643 (*prop_set)[def_bb->index] = new propagation_set;
645 propagation_set *ws_prop = (*prop_set)[def_bb->index];
/* Then ordinary statement uses.  */
652 for (gimple_stmt_iterator gsi = gsi_start_bb (block);
653 !gsi_end_p (gsi); gsi_next (&gsi))
657 gimple *stmt = gsi_stmt (gsi);
659 FOR_EACH_SSA_USE_OPERAND (use, stmt, iter, SSA_OP_USE)
661 tree var = USE_FROM_PTR (use);
663 gimple *def_stmt = SSA_NAME_DEF_STMT (var);
665 if (gimple_nop_p (def_stmt))
668 basic_block def_bb = gimple_bb (def_stmt);
670 if (bitmap_bit_p (worker_single, def_bb->index))
672 if (!(*prop_set)[def_bb->index])
673 (*prop_set)[def_bb->index] = new propagation_set;
675 propagation_set *ws_prop = (*prop_set)[def_bb->index];
685 /* Callback for walk_gimple_stmt to find RHS VAR_DECLs (uses) in a
/* walk_gimple_stmt callback: record each RHS (use, not LHS definition)
   VAR_DECL into the hash_set passed through WI->info.  */
689 find_partitioned_var_uses_1 (tree *node, int *, void *data)
691 walk_stmt_info *wi = (walk_stmt_info *) data;
692 hash_set<tree> *partitioned_var_uses = (hash_set<tree> *) wi->info;
694 if (!wi->is_lhs && VAR_P (*node))
695 partitioned_var_uses->add (*node);
/* Walk the parallel tree PAR and collect, into PARTITIONED_VAR_USES,
   every VAR_DECL that is read inside a worker-partitioned block
   (via the find_partitioned_var_uses_1 walker).  */
701 find_partitioned_var_uses (parallel_g *par, unsigned outer_mask,
702 hash_set<tree> *partitioned_var_uses)
704 unsigned mask = outer_mask | par->mask;
707 find_partitioned_var_uses (par->inner, mask, partitioned_var_uses);
709 find_partitioned_var_uses (par->next, outer_mask, partitioned_var_uses);
711 if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
716 for (ix = 0; par->blocks.iterate (ix, &block); ix++)
717 for (gimple_stmt_iterator gsi = gsi_start_bb (block);
718 !gsi_end_p (gsi); gsi_next (&gsi))
721 memset (&wi, 0, sizeof (wi));
722 wi.info = (void *) partitioned_var_uses;
723 walk_gimple_stmt (&gsi, NULL, find_partitioned_var_uses_1, &wi);
728 /* Gang-private variables (typically placed in a GPU's shared memory) do not
729 need to be processed by the worker-propagation mechanism. Populate the
730 GANG_PRIVATE_VARS set with any such variables found in the current
/* Scan the whole function for IFN_UNIQUE OACC_PRIVATE markers and add
   the variables they name (ADDR_EXPR operands from argument 3 onwards)
   to GANG_PRIVATE_VARS, for any privatization level other than gang.
   NOTE(review): despite the function name, the visible guard is
   'level != GOMP_DIM_GANG' — confirm intent against the original.  */
734 find_gang_private_vars (hash_set<tree> *gang_private_vars)
738 FOR_EACH_BB_FN (block, cfun)
740 for (gimple_stmt_iterator gsi = gsi_start_bb (block);
744 gimple *stmt = gsi_stmt (gsi);
746 if (gimple_call_internal_p (stmt, IFN_UNIQUE))
748 enum ifn_unique_kind k = ((enum ifn_unique_kind)
749 TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
750 if (k == IFN_UNIQUE_OACC_PRIVATE)
/* Argument 2 is the privatization level (a GOMP_DIM_* value).  */
753 = TREE_INT_CST_LOW (gimple_call_arg (stmt, 2));
754 if (level != GOMP_DIM_GANG)
756 for (unsigned i = 3; i < gimple_call_num_args (stmt); i++)
758 tree arg = gimple_call_arg (stmt, i);
759 gcc_assert (TREE_CODE (arg) == ADDR_EXPR);
760 tree decl = TREE_OPERAND (arg, 0);
761 gang_private_vars->add (decl);
/* Walk the parallel tree PAR and, for each worker-single block, add to
   PROP_SET (under the block's index) every local non-aggregate,
   non-global, non-gang-private variable that is used in a partitioned
   region (PARTITIONED_VAR_USES) and may be clobbered by a statement of
   the block: such variables must be broadcast out of the block.  */
770 find_local_vars_to_propagate (parallel_g *par, unsigned outer_mask,
771 hash_set<tree> *partitioned_var_uses,
772 hash_set<tree> *gang_private_vars,
773 vec<propagation_set *> *prop_set)
775 unsigned mask = outer_mask | par->mask;
778 find_local_vars_to_propagate (par->inner, mask, partitioned_var_uses,
779 gang_private_vars, prop_set);
781 find_local_vars_to_propagate (par->next, outer_mask, partitioned_var_uses,
782 gang_private_vars, prop_set);
/* Only worker-single regions need this treatment.  */
784 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
789 for (ix = 0; par->blocks.iterate (ix, &block); ix++)
791 for (gimple_stmt_iterator gsi = gsi_start_bb (block);
792 !gsi_end_p (gsi); gsi_next (&gsi))
794 gimple *stmt = gsi_stmt (gsi);
798 FOR_EACH_LOCAL_DECL (cfun, i, var)
/* Skip variables that cannot or need not be propagated.
   NOTE(review): the first condition of this filter is elided
   from this listing.  */
801 || is_global_var (var)
802 || AGGREGATE_TYPE_P (TREE_TYPE (var))
803 || !partitioned_var_uses->contains (var)
804 || gang_private_vars->contains (var))
807 if (stmt_may_clobber_ref_p (stmt, var))
811 fprintf (dump_file, "bb %u: local variable may be "
812 "clobbered in %s mode: ", block->index,
814 print_generic_expr (dump_file, var, TDF_SLIM);
815 fprintf (dump_file, "\n");
818 if (!(*prop_set)[block->index])
819 (*prop_set)[block->index] = new propagation_set;
821 propagation_set *ws_prop
822 = (*prop_set)[block->index];
832 /* Transform basic blocks FROM, TO (which may be the same block) into:
833 if (GOACC_single_start ())
838 | | (new) predicate block
841 +----+ +----+ +----+ |
842 | | | | ===> | | | f (old) from block
843 +----+ +----+ +----+ |
846 (split (split before | | skip block
847 at end) condition) +----+
/* Neuter blocks FROM..TO using the simple scheme (see diagram above):
   guard the work with GOACC_single_start so only one worker executes it,
   route the idle workers through a "neutered" path that assigns dummy
   zero values, merge escaping SSA defs with PHIs in the skip block, and
   finish with a GOACC_barrier.  */
852 worker_single_simple (basic_block from, basic_block to,
853 hash_set<tree> *def_escapes_block)
857 basic_block skip_block;
859 gimple_stmt_iterator gsi = gsi_last_bb (to);
860 if (EDGE_COUNT (to->succs) > 1)
862 gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_COND);
/* Split after TO's last statement; the new block is where idle and
   active workers re-join.  */
865 edge e = split_block (to, gsi_stmt (gsi));
866 skip_block = e->dest;
868 gimple_stmt_iterator start = gsi_after_labels (from);
/* lhs = GOACC_single_start () — returns nonzero only in the single
   active worker.  */
870 decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_START);
871 lhs = create_tmp_var (TREE_TYPE (TREE_TYPE (decl)));
872 call = gimple_build_call (decl, 0);
873 gimple_call_set_lhs (call, lhs);
874 gsi_insert_before (&start, call, GSI_NEW_STMT);
877 cond = gimple_build_cond (EQ_EXPR, lhs,
878 fold_convert_loc (UNKNOWN_LOCATION,
881 NULL_TREE, NULL_TREE);
882 gsi_insert_after (&start, cond, GSI_NEW_STMT);
885 edge et = split_block (from, cond);
886 et->flags &= ~EDGE_FALLTHRU;
887 et->flags |= EDGE_TRUE_VALUE;
888 /* Make the active worker the more probable path so we prefer fallthrough
889 (letting the idle workers jump around more). */
890 et->probability = profile_probability::likely ();
892 edge ef = make_edge (from, skip_block, EDGE_FALSE_VALUE);
893 ef->probability = et->probability.invert ();
/* Idle workers take a fresh "neutered" block on the false edge.  */
895 basic_block neutered = split_edge (ef);
896 gimple_stmt_iterator neut_gsi = gsi_last_bb (neutered);
/* For every SSA def in the active path that escapes the block, build a
   PHI in SKIP_BLOCK merging the real value with a dummy zero from the
   neutered path.  */
898 for (gsi = gsi_start_bb (et->dest); !gsi_end_p (gsi); gsi_next (&gsi))
900 gimple *stmt = gsi_stmt (gsi);
904 FOR_EACH_SSA_TREE_OPERAND (var, stmt, iter, SSA_OP_DEF)
906 if (def_escapes_block->contains (var))
908 gphi *join_phi = create_phi_node (NULL_TREE, skip_block);
909 create_new_def_for (var, join_phi,
910 gimple_phi_result_ptr (join_phi));
911 add_phi_arg (join_phi, var, e, UNKNOWN_LOCATION);
913 tree neutered_def = copy_ssa_name (var, NULL);
914 /* We really want "don't care" or some value representing
915 undefined here, but optimizers will probably get rid of the
916 zero-assignments anyway. */
917 gassign *zero = gimple_build_assign (neutered_def,
918 build_zero_cst (TREE_TYPE (neutered_def)));
920 gsi_insert_after (&neut_gsi, zero, GSI_CONTINUE_LINKING);
923 add_phi_arg (join_phi, neutered_def, single_succ_edge (neutered),
925 update_stmt (join_phi);
/* All workers synchronize at the head of the skip block.  */
930 gsi = gsi_start_bb (skip_block);
932 decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
933 gimple *acc_bar = gimple_build_call (decl, 0);
935 gsi_insert_before (&gsi, acc_bar, GSI_SAME_STMT);
936 update_stmt (acc_bar);
939 /* Build COMPONENT_REF and set TREE_THIS_VOLATILE and TREE_READONLY on it
941 /* Adapted from 'gcc/omp-low.c:omp_build_component_ref'. */
/* Build OBJ.FIELD, qualifying the field type with OBJ's (non-generic)
   address space and copying the field's volatility/readonly-ness onto
   the resulting COMPONENT_REF.  */
944 oacc_build_component_ref (tree obj, tree field)
946 tree field_type = TREE_TYPE (field);
947 tree obj_type = TREE_TYPE (obj);
948 if (!ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (obj_type)))
949 field_type = build_qualified_type
951 KEEP_QUAL_ADDR_SPACE (TYPE_QUALS (obj_type)));
953 tree ret = build3 (COMPONENT_REF, field_type, obj, field, NULL);
954 if (TREE_THIS_VOLATILE (field))
955 TREE_THIS_VOLATILE (ret) |= 1;
956 if (TREE_READONLY (field))
957 TREE_READONLY (ret) |= 1;
/* Build a reference for reading VAR's broadcast field out of the
   receive buffer: (*RECEIVER_DECL).field-for-VAR, where the field is
   looked up in FIELD_MAP under RECORD_TYPE.  */
962 build_receiver_ref (tree record_type, tree var, tree receiver_decl)
964 field_map_t *fields = *field_map->get (record_type);
965 tree x = build_simple_mem_ref (receiver_decl);
966 tree field = *fields->get (var);
/* The receive buffer is known valid, so the dereference cannot trap.  */
967 TREE_THIS_NOTRAP (x) = 1;
968 x = oacc_build_component_ref (x, field);
/* Build a reference for writing VAR's value into the send buffer:
   SENDER_DECL.field-for-VAR (field looked up under RECORD_TYPE).  */
973 build_sender_ref (tree record_type, tree var, tree sender_decl)
975 field_map_t *fields = *field_map->get (record_type);
976 tree field = *fields->get (var);
977 return oacc_build_component_ref (sender_decl, field);
/* qsort comparator giving a stable order over trees: SSA names by
   version, decls by DECL_UID, with SSA names ordering relative to decls
   (the return values of the mixed cases are elided from this listing).  */
981 sort_by_ssa_version_or_uid (const void *p1, const void *p2)
983 const tree t1 = *(const tree *)p1;
984 const tree t2 = *(const tree *)p2;
986 if (TREE_CODE (t1) == SSA_NAME && TREE_CODE (t2) == SSA_NAME)
987 return SSA_NAME_VERSION (t1) - SSA_NAME_VERSION (t2);
988 else if (TREE_CODE (t1) == SSA_NAME && TREE_CODE (t2) != SSA_NAME)
990 else if (TREE_CODE (t1) != SSA_NAME && TREE_CODE (t2) == SSA_NAME)
993 return DECL_UID (t1) - DECL_UID (t2);
/* qsort comparator: order trees by the size of their type first
   (comparison of S1/S2 elided from this listing), falling back to
   sort_by_ssa_version_or_uid for equal sizes.  */
997 sort_by_size_then_ssa_version_or_uid (const void *p1, const void *p2)
999 const tree t1 = *(const tree *)p1;
1000 const tree t2 = *(const tree *)p2;
1001 unsigned HOST_WIDE_INT s1 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (t1)));
1002 unsigned HOST_WIDE_INT s2 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (t2)));
1006 return sort_by_ssa_version_or_uid (p1, p2);
/* Neuter blocks FROM..TO using the broadcast scheme: one worker runs
   the block body and copies escaping values into a shared send buffer
   (GOACC_single_copy_start/_end); after a barrier the idle workers copy
   the values back out of the receive buffer.  Escaping SSA defs are
   merged via PHIs at the barrier and exit blocks.  */
1010 worker_single_copy (basic_block from, basic_block to,
1011 hash_set<tree> *def_escapes_block,
1012 hash_set<tree> *worker_partitioned_uses,
1015 /* If we only have virtual defs, we'll have no record type, but we still want
1016 to emit single_copy_start and (particularly) single_copy_end to act as
1017 a vdef source on the neutered edge representing memory writes on the
1018 non-neutered edge. */
1020 record_type = char_type_node;
/* Target hook supplies the send and receive buffer decls for the
   broadcast record.  */
1023 = targetm.goacc.create_worker_broadcast_record (record_type, true,
1026 = targetm.goacc.create_worker_broadcast_record (record_type, false,
1029 gimple_stmt_iterator gsi = gsi_last_bb (to);
1030 if (EDGE_COUNT (to->succs) > 1)
1032 edge e = split_block (to, gsi_stmt (gsi));
1033 basic_block barrier_block = e->dest;
1035 gimple_stmt_iterator start = gsi_after_labels (from);
/* lhs = GOACC_single_copy_start (&sender) — returns a null pointer in
   the single active worker, and the broadcast buffer address in the
   idle workers.  */
1037 tree decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_COPY_START);
1039 tree lhs = create_tmp_var (TREE_TYPE (TREE_TYPE (decl)));
1041 gimple *call = gimple_build_call (decl, 1,
1042 build_fold_addr_expr (sender_decl));
1043 gimple_call_set_lhs (call, lhs);
1044 gsi_insert_before (&start, call, GSI_NEW_STMT);
1047 tree conv_tmp = make_ssa_name (TREE_TYPE (receiver_decl));
1049 gimple *conv = gimple_build_assign (conv_tmp,
1050 fold_convert (TREE_TYPE (receiver_decl),
1053 gsi_insert_after (&start, conv, GSI_NEW_STMT);
1054 gimple *asgn = gimple_build_assign (receiver_decl, conv_tmp);
1055 gsi_insert_after (&start, asgn, GSI_NEW_STMT);
/* Branch on the returned pointer: null → this worker does the work.  */
1058 tree zero_ptr = build_int_cst (TREE_TYPE (receiver_decl), 0);
1060 tree recv_tmp = make_ssa_name (TREE_TYPE (receiver_decl));
1061 asgn = gimple_build_assign (recv_tmp, receiver_decl);
1062 gsi_insert_after (&start, asgn, GSI_NEW_STMT);
1065 gimple *cond = gimple_build_cond (EQ_EXPR, recv_tmp, zero_ptr, NULL_TREE,
1069 gsi_insert_after (&start, cond, GSI_NEW_STMT);
1071 edge et = split_block (from, cond);
1072 et->flags &= ~EDGE_FALLTHRU;
1073 et->flags |= EDGE_TRUE_VALUE;
1074 /* Make the active worker the more probable path so we prefer fallthrough
1075 (letting the idle workers jump around more). */
1076 et->probability = profile_probability::likely ();
1078 basic_block body = et->dest;
1080 edge ef = make_edge (from, barrier_block, EDGE_FALSE_VALUE);
1081 ef->probability = et->probability.invert ();
/* All workers meet at a barrier after the active one finishes.  */
1083 decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
1084 gimple *acc_bar = gimple_build_call (decl, 0);
1086 gimple_stmt_iterator bar_gsi = gsi_start_bb (barrier_block);
1087 gsi_insert_before (&bar_gsi, acc_bar, GSI_NEW_STMT);
/* After the barrier, the idle workers (non-null pointer) take the
   copyout path; the active worker skips straight to the exit.  */
1089 cond = gimple_build_cond (NE_EXPR, recv_tmp, zero_ptr, NULL_TREE, NULL_TREE);
1090 gsi_insert_after (&bar_gsi, cond, GSI_NEW_STMT);
1092 edge et2 = split_block (barrier_block, cond);
1093 et2->flags &= ~EDGE_FALLTHRU;
1094 et2->flags |= EDGE_TRUE_VALUE;
1095 et2->probability = profile_probability::unlikely ();
1097 basic_block exit_block = et2->dest;
1099 basic_block copyout_block = split_edge (et2);
1100 edge ef2 = make_edge (barrier_block, exit_block, EDGE_FALSE_VALUE);
1101 ef2->probability = et2->probability.invert ();
1103 gimple_stmt_iterator copyout_gsi = gsi_start_bb (copyout_block);
1105 edge copyout_to_exit = single_succ_edge (copyout_block);
/* SENDER_SEQ accumulates the stores into the send buffer; it is
   spliced in after the active worker's body at the end.  */
1107 gimple_seq sender_seq = NULL;
1109 /* Make sure we iterate over definitions in a stable order. */
1110 auto_vec<tree> escape_vec (def_escapes_block->elements ());
1111 for (hash_set<tree>::iterator it = def_escapes_block->begin ();
1112 it != def_escapes_block->end (); ++it)
1113 escape_vec.quick_push (*it);
1114 escape_vec.qsort (sort_by_ssa_version_or_uid);
1116 for (unsigned i = 0; i < escape_vec.length (); i++)
1118 tree var = escape_vec[i];
/* Virtual operands need no data movement.  */
1120 if (TREE_CODE (var) == SSA_NAME && SSA_NAME_IS_VIRTUAL_OPERAND (var))
1123 tree barrier_def = 0;
1125 if (TREE_CODE (var) == SSA_NAME)
1127 gimple *def_stmt = SSA_NAME_DEF_STMT (var);
1129 if (gimple_nop_p (def_stmt))
1132 /* The barrier phi takes one result from the actual work of the
1133 block we're neutering, and the other result is constant zero of
1136 gphi *barrier_phi = create_phi_node (NULL_TREE, barrier_block);
1137 barrier_def = create_new_def_for (var, barrier_phi,
1138 gimple_phi_result_ptr (barrier_phi));
1140 add_phi_arg (barrier_phi, var, e, UNKNOWN_LOCATION);
1141 add_phi_arg (barrier_phi, build_zero_cst (TREE_TYPE (var)), ef,
1144 update_stmt (barrier_phi);
1147 gcc_assert (TREE_CODE (var) == VAR_DECL);
1149 /* If we had no record type, we will have no fields map. */
1150 field_map_t **fields_p = field_map->get (record_type);
1151 field_map_t *fields = fields_p ? *fields_p : NULL;
1153 if (worker_partitioned_uses->contains (var)
1155 && fields->get (var))
1157 tree neutered_def = make_ssa_name (TREE_TYPE (var));
1159 /* Receive definition from shared memory block. */
1161 tree receiver_ref = build_receiver_ref (record_type, var,
1163 gassign *recv = gimple_build_assign (neutered_def,
1165 gsi_insert_after (&copyout_gsi, recv, GSI_CONTINUE_LINKING);
1168 if (TREE_CODE (var) == VAR_DECL)
1170 /* If it's a VAR_DECL, we only copied to an SSA temporary. Copy
1171 to the final location now. */
1172 gassign *asgn = gimple_build_assign (var, neutered_def);
1173 gsi_insert_after (&copyout_gsi, asgn, GSI_CONTINUE_LINKING);
1178 /* If it's an SSA name, create a new phi at the join node to
1179 represent either the output from the active worker (the
1180 barrier) or the inactive workers (the copyout block). */
1181 gphi *join_phi = create_phi_node (NULL_TREE, exit_block);
1182 create_new_def_for (barrier_def, join_phi,
1183 gimple_phi_result_ptr (join_phi));
1184 add_phi_arg (join_phi, barrier_def, ef2, UNKNOWN_LOCATION);
1185 add_phi_arg (join_phi, neutered_def, copyout_to_exit,
1187 update_stmt (join_phi);
1190 /* Send definition to shared memory block. */
1192 tree sender_ref = build_sender_ref (record_type, var, sender_decl);
1194 if (TREE_CODE (var) == SSA_NAME)
1196 gassign *send = gimple_build_assign (sender_ref, var);
1197 gimple_seq_add_stmt (&sender_seq, send);
1200 else if (TREE_CODE (var) == VAR_DECL)
/* VAR_DECL stores go through an SSA temporary so the store into
   the record is a simple assignment.  */
1202 tree tmp = make_ssa_name (TREE_TYPE (var));
1203 gassign *send = gimple_build_assign (tmp, var);
1204 gimple_seq_add_stmt (&sender_seq, send);
1206 send = gimple_build_assign (sender_ref, tmp);
1207 gimple_seq_add_stmt (&sender_seq, send);
1215 /* It's possible for the ET->DEST block (the work done by the active thread)
1216 to finish with a control-flow insn, e.g. a UNIQUE function call. Split
1217 the block and add SENDER_SEQ in the latter part to avoid having control
1218 flow in the middle of a BB. */
1220 decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_COPY_END);
1221 call = gimple_build_call (decl, 1, build_fold_addr_expr (sender_decl));
1222 gimple_seq_add_stmt (&sender_seq, call);
1224 gsi = gsi_last_bb (body);
1225 gimple *last = gsi_stmt (gsi);
1226 basic_block sender_block = split_block (body, last)->dest;
1227 gsi = gsi_last_bb (sender_block);
1228 gsi_insert_seq_after (&gsi, sender_seq, GSI_CONTINUE_LINKING);
/* Walk the parallel tree PAR and neuter every worker-single block:
   compute which SSA names escape each block and which are used in
   worker-partitioned code, merge in the per-block propagation sets, and
   then apply either the broadcast scheme (worker_single_copy, when the
   block has a broadcast record in block->aux) or the simple scheme
   (worker_single_simple).  Also brackets OpenACC routine calls in
   worker-single mode with barriers.  */
1232 neuter_worker_single (parallel_g *par, unsigned outer_mask,
1233 bitmap worker_single, bitmap vector_single,
1234 vec<propagation_set *> *prop_set,
1235 hash_set<tree> *partitioned_var_uses)
1237 unsigned mask = outer_mask | par->mask;
1239 if ((mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0)
1243 for (unsigned i = 0; par->blocks.iterate (i, &block); i++)
1245 bool has_defs = false;
1246 hash_set<tree> def_escapes_block;
1247 hash_set<tree> worker_partitioned_uses;
/* Classify every SSA name defined in this block by where its uses
   live.  */
1251 FOR_EACH_SSA_NAME (j, var, cfun)
1253 if (SSA_NAME_IS_VIRTUAL_OPERAND (var))
1259 gimple *def_stmt = SSA_NAME_DEF_STMT (var);
1261 if (gimple_nop_p (def_stmt))
1264 if (gimple_bb (def_stmt)->index != block->index)
1268 imm_use_iterator use_iter;
1269 bool uses_outside_block = false;
1270 bool worker_partitioned_use = false;
1272 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, var)
1274 int blocknum = gimple_bb (use_stmt)->index;
1276 /* Don't propagate SSA names that are only used in the
1277 current block, unless the usage is in a phi node: that
1278 means the name left the block, then came back in at the
1280 if (blocknum != block->index
1281 || gimple_code (use_stmt) == GIMPLE_PHI)
1282 uses_outside_block = true;
1283 if (!bitmap_bit_p (worker_single, blocknum))
1284 worker_partitioned_use = true;
1287 if (uses_outside_block)
1288 def_escapes_block.add (var);
1290 if (worker_partitioned_use)
1292 worker_partitioned_uses.add (var);
/* Fold in the variables earlier passes queued for propagation
   out of this block.  */
1297 propagation_set *ws_prop = (*prop_set)[block->index];
1301 for (propagation_set::iterator it = ws_prop->begin ();
1302 it != ws_prop->end ();
1306 if (TREE_CODE (var) == VAR_DECL)
1308 def_escapes_block.add (var);
1309 if (partitioned_var_uses->contains (var))
1311 worker_partitioned_uses.add (var);
/* The per-block set has been consumed; clear the slot.  */
1318 (*prop_set)[block->index] = 0;
/* block->aux carries the broadcast record type built earlier; its
   presence selects the copy scheme over the simple scheme.
   NOTE(review): the selecting condition itself is elided from this
   listing.  */
1321 tree record_type = (tree) block->aux;
1324 worker_single_copy (block, block, &def_escapes_block,
1325 &worker_partitioned_uses, record_type);
1327 worker_single_simple (block, block, &def_escapes_block);
1331 if ((outer_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0)
1335 for (unsigned i = 0; par->blocks.iterate (i, &block); i++)
1336 for (gimple_stmt_iterator gsi = gsi_start_bb (block);
1340 gimple *stmt = gsi_stmt (gsi);
1342 if (gimple_code (stmt) == GIMPLE_CALL
1343 && !gimple_call_internal_p (stmt)
1344 && !omp_sese_active_worker_call (as_a <gcall *> (stmt)))
1346 /* If we have an OpenACC routine call in worker-single mode,
1347 place barriers before and afterwards to prevent
1348 clobbering re-used shared memory regions (as are used
1349 for AMDGCN at present, for example). */
1350 tree decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
1351 gsi_insert_before (&gsi, gimple_build_call (decl, 0),
1353 gsi_insert_after (&gsi, gimple_build_call (decl, 0),
/* Recurse: children with the accumulated mask, siblings with the
   caller's outer mask.  */
1360 neuter_worker_single (par->inner, mask, worker_single, vector_single,
1361 prop_set, partitioned_var_uses);
1363 neuter_worker_single (par->next, outer_mask, worker_single, vector_single,
1364 prop_set, partitioned_var_uses);
/* Main entry point of the pass: split blocks at partitioning boundaries,
   discover the parallel-region structure, compute which variables need
   broadcasting across workers, build a per-block record type for the
   broadcast payload, and finally neuter/broadcast worker-single code.
   NOTE(review): lines are elided in this view — declarations of MASK,
   IX and BB, several conditionals and the final return value are not
   shown here.  */
1368 execute_omp_oacc_neuter_broadcast ()
1370 bb_stmt_map_t bb_stmt_map;
1371 auto_bitmap worker_single, vector_single;
/* Split basic blocks so fork/join boundaries start their own block.  */
1373 omp_sese_split_blocks (&bb_stmt_map);
1377 fprintf (dump_file, "\n\nAfter splitting:\n\n");
1378 dump_function_to_file (current_function_decl, dump_file, dump_flags);
1383 /* If this is a routine, calculate MASK as if the outer levels are already
/* The "oacc function" attribute carries one (purpose, value) pair per
   partitioning dimension; a zero purpose marks a level the routine may
   not use, i.e. one already partitioned by the caller.  */
1385 tree attr = oacc_get_fn_attrib (current_function_decl);
1388 tree dims = TREE_VALUE (attr);
1390 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
1392 tree allowed = TREE_PURPOSE (dims);
1393 if (allowed && integer_zerop (allowed))
1394 mask |= GOMP_DIM_MASK (ix);
/* Build the parallel-region tree and mark single-mode blocks.  */
1398 parallel_g *par = omp_sese_discover_pars (&bb_stmt_map);
1399 populate_single_mode_bitmaps (par, worker_single, vector_single, mask, 0);
1402 FOR_ALL_BB_FN (bb, cfun)
/* GC-allocated map from broadcast record type to its field map.  */
1405 field_map = record_field_map_t::create_ggc (40);
/* One (possibly null) propagation set per basic block, indexed by
   block index.  */
1407 vec<propagation_set *> prop_set;
1408 prop_set.create (last_basic_block_for_fn (cfun));
1410 for (int i = 0; i < last_basic_block_for_fn (cfun); i++)
1411 prop_set.quick_push (0);
1413 find_ssa_names_to_propagate (par, mask, worker_single, vector_single,
1416 hash_set<tree> partitioned_var_uses;
1417 hash_set<tree> gang_private_vars;
/* Gang-private variables must not be broadcast across workers.  */
1419 find_gang_private_vars (&gang_private_vars);
1420 find_partitioned_var_uses (par, mask, &partitioned_var_uses);
1421 find_local_vars_to_propagate (par, mask, &partitioned_var_uses,
1422 &gang_private_vars, &prop_set);
/* For each block with a non-empty propagation set, build an artificial
   RECORD_TYPE holding one field per variable to broadcast.  */
1424 FOR_ALL_BB_FN (bb, cfun)
1426 propagation_set *ws_prop = prop_set[bb->index];
1429 tree record_type = lang_hooks.types.make_type (RECORD_TYPE);
1430 tree name = create_tmp_var_name (".oacc_ws_data_s");
1431 name = build_decl (UNKNOWN_LOCATION, TYPE_DECL, name, record_type);
1432 DECL_ARTIFICIAL (name) = 1;
1433 DECL_NAMELESS (name) = 1;
1434 TYPE_NAME (record_type) = name;
1435 TYPE_ARTIFICIAL (record_type) = 1;
/* Sort the fields so the layout is deterministic across runs (hash-set
   iteration order is not).  */
1437 auto_vec<tree> field_vec (ws_prop->elements ());
1438 for (hash_set<tree>::iterator it = ws_prop->begin ();
1439 it != ws_prop->end (); ++it)
1440 field_vec.quick_push (*it);
1442 field_vec.qsort (sort_by_size_then_ssa_version_or_uid);
1444 field_map->put (record_type, field_map_t::create_ggc (17));
1446 /* Insert var fields in reverse order, so the last inserted element
1447 is the first in the structure. */
1448 for (int i = field_vec.length () - 1; i >= 0; i--)
1449 install_var_field (field_vec[i], record_type);
1451 layout_type (record_type);
/* Stash the record type on the block; neuter_worker_single reads it
   back from BB->aux.  */
1453 bb->aux = (tree) record_type;
1457 neuter_worker_single (par, mask, worker_single, vector_single, &prop_set,
1458 &partitioned_var_uses);
1460 prop_set.release ();
1462 /* This doesn't seem to make a difference. */
1463 loops_state_clear (LOOP_CLOSED_SSA);
1465 /* Neutering worker-single neutered blocks will invalidate dominance info.
1466 It may be possible to incrementally update just the affected blocks, but
1467 obliterate everything for now. */
1468 free_dominance_info (CDI_DOMINATORS);
1469 free_dominance_info (CDI_POST_DOMINATORS);
1473 fprintf (dump_file, "\n\nAfter neutering:\n\n");
1474 dump_function_to_file (current_function_decl, dump_file, dump_flags);
/* Pass descriptor: a GIMPLE pass requiring a CFG; after neutering we
   request an SSA update and CFG cleanup via the finish TODO flags.  */
const pass_data pass_data_omp_oacc_neuter_broadcast =
1484 GIMPLE_PASS, /* type */
1485 "omp_oacc_neuter_broadcast", /* name */
1486 OPTGROUP_OMP, /* optinfo_flags */
1487 TV_NONE, /* tv_id */
1488 PROP_cfg, /* properties_required */
1489 0, /* properties_provided */
1490 0, /* properties_destroyed */
1491 0, /* todo_flags_start */
1492 TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
/* Pass wrapper class.  The pass only runs for OpenACC compilation on
   targets that provide the create_worker_broadcast_record hook (used to
   allocate the shared-memory record the broadcast writes through).  */
class pass_omp_oacc_neuter_broadcast : public gimple_opt_pass
1498 pass_omp_oacc_neuter_broadcast (gcc::context *ctxt)
1499 : gimple_opt_pass (pass_data_omp_oacc_neuter_broadcast, ctxt)
1502 /* opt_pass methods: */
1503 virtual bool gate (function *)
1505 return (flag_openacc
1506 && targetm.goacc.create_worker_broadcast_record);
/* Delegate to the free function that implements the pass.  */
1509 virtual unsigned int execute (function *)
1511 return execute_omp_oacc_neuter_broadcast ();
1514 }; // class pass_omp_oacc_neuter_broadcast
/* Factory function used by the pass manager to instantiate the pass.  */
1519 make_pass_omp_oacc_neuter_broadcast (gcc::context *ctxt)
1521 return new pass_omp_oacc_neuter_broadcast (ctxt);