From e50146711b7200e8f822c6d8239430c682b76e4f Mon Sep 17 00:00:00 2001 From: Nathan Sidwell Date: Wed, 4 Nov 2015 16:57:36 +0000 Subject: [PATCH] internal-fn.def (GOACC_REDUCTION): New. * internal-fn.def (GOACC_REDUCTION): New. * internal-fn.h (enum ifn_goacc_reduction_kind): New. * internal-fn.c (expand_GOACC_REDUCTION): New. * target.def (goacc.reduction): New OpenACC hook. * targhooks.h (default_goacc_reduction): Declare. * doc/tm.texi.in: Add TARGET_GOACC_REDUCTION. * doc/tm.texi: Rebuilt. * omp-low.c (oacc_get_reduction_array_id, oacc_max_threads, scan_sharing_clauses): Remove oacc reduction handling here. (lower_rec_input_clauses): Don't handle OpenACC reductions here. (oacc_lower_reduction_var_helper): Delete. (lower_oacc_reductions): New. (lower_reduction_clauses): Don't handle OpenACC reductions here. (lower_oacc_head_tail): Call lower_oacc_reductions. (oacc_gimple_assign, oacc_init_reduction_array, oacc_initialize_reduction_data, oacc_finalize_reduction_data, oacc_process_reduction_data): Delete. (lower_omp_target): Remove old OpenACC reduction handling. Insert dummy OpenACC gang reduction for reductions at outer level. (oacc_loop_xform_head_tail): Transform IFN_GOACC_REDUCTION. (default_goacc_reduction): New. (execute_oacc_device_lower): Handle IFN_GOACC_REDUCTION. From-SVN: r229767 --- gcc/ChangeLog | 28 +- gcc/doc/tm.texi | 9 + gcc/doc/tm.texi.in | 2 + gcc/internal-fn.c | 8 + gcc/internal-fn.def | 3 + gcc/internal-fn.h | 22 ++ gcc/omp-low.c | 818 ++++++++++++++++------------------------------------ gcc/target.def | 11 + gcc/targhooks.h | 1 + 9 files changed, 330 insertions(+), 572 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 6e7020c..e332df3 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,29 @@ +2015-11-04 Nathan Sidwell + Cesar Philippidis + + * internal-fn.def (GOACC_REDUCTION): New. + * internal-fn.h (enum ifn_goacc_reduction_kind): New. + * internal-fn.c (expand_GOACC_REDUCTION): New. 
+ * target.def (goacc.reduction): New OpenACC hook. + * targhooks.h (default_goacc_reduction): Declare. + * doc/tm.texi.in: Add TARGET_GOACC_REDUCTION. + * doc/tm.texi: Rebuilt. + * omp-low.c (oacc_get_reduction_array_id, oacc_max_threads, + scan_sharing_clauses): Remove oacc reduction handling here. + (lower_rec_input_clauses): Don't handle OpenACC reductions here. + (oacc_lower_reduction_var_helper): Delete. + (lower_oacc_reductions): New. + (lower_reduction_clauses): Don't handle OpenACC reductions here. + (lower_oacc_head_tail): Call lower_oacc_reductions. + (oacc_gimple_assign, oacc_init_reduction_array, + oacc_initialize_reduction_data, oacc_finalize_reduction_data, + oacc_process_reduction_data): Delete. + (lower_omp_target): Remove old OpenACC reduction handling. Insert + dummy OpenACC gang reduction for reductions at outer level. + (oacc_loop_xform_head_tail): Transform IFN_GOACC_REDUCTION. + (default_goacc_reduction): New. + (execute_oacc_device_lower): Handle IFN_GOACC_REDUCTION. + 2015-11-04 Martin Liska * cgraphunit.c (cgraph_node::expand_thunk): Call @@ -41,7 +67,7 @@ vi->only_restrict_pointers. (intra_create_variable_infos): Call create_variable_info_for_1 with extra arg. Remove restrict handling. Call make_param_constraints with - one less arg. + one fewer arg. 2015-11-04 Tom de Vries diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index 3b1e2dc..1e6baa0 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -5787,6 +5787,15 @@ gimple has been inserted before it, or there is no need for it). The default hook returns false, if there are no RTL expanders for them. @end deftypefn +@deftypefn {Target Hook} void TARGET_GOACC_REDUCTION (gcall *@var{call}) +This hook is used by the oacc_transform pass to expand calls to the +@var{GOACC_REDUCTION} internal function, into a sequence of gimple +instructions. @var{call} is gimple statement containing the call to +the function. This hook removes statement @var{call} after the +expanded sequence has been inserted. 
This hook is also responsible +for allocating any storage for reductions when necessary. +@end deftypefn + @node Anchored Addresses @section Anchored Addresses @cindex anchored addresses diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index afa00f7..0529011 100644 --- a/gcc/doc/tm.texi.in +++ b/gcc/doc/tm.texi.in @@ -4264,6 +4264,8 @@ address; but often a machine-dependent strategy can generate better code. @hook TARGET_GOACC_FORK_JOIN +@hook TARGET_GOACC_REDUCTION + @node Anchored Addresses @section Anchored Addresses @cindex anchored addresses diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c index a7da373..6fac752 100644 --- a/gcc/internal-fn.c +++ b/gcc/internal-fn.c @@ -2045,6 +2045,14 @@ expand_GOACC_LOOP (gcall *stmt ATTRIBUTE_UNUSED) gcc_unreachable (); } +/* This is expanded by oacc_device_lower pass. */ + +static void +expand_GOACC_REDUCTION (gcall *stmt ATTRIBUTE_UNUSED) +{ + gcc_unreachable (); +} + /* Routines to expand each internal function, indexed by function number. Each routine has the prototype: diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def index 78266d9..ca06b10 100644 --- a/gcc/internal-fn.def +++ b/gcc/internal-fn.def @@ -83,3 +83,6 @@ DEF_INTERNAL_FN (GOACC_DIM_POS, ECF_PURE | ECF_NOTHROW | ECF_LEAF, ".") /* OpenACC looping abstraction. See internal-fn.h for usage. */ DEF_INTERNAL_FN (GOACC_LOOP, ECF_PURE | ECF_NOTHROW, NULL) + +/* OpenACC reduction abstraction. See internal-fn.h for usage. */ +DEF_INTERNAL_FN (GOACC_REDUCTION, ECF_NOTHROW | ECF_LEAF, NULL) diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h index 2b675e8..20cbd13 100644 --- a/gcc/internal-fn.h +++ b/gcc/internal-fn.h @@ -66,6 +66,28 @@ enum ifn_goacc_loop_kind { IFN_GOACC_LOOP_BOUND /* Limit of iteration value. */ }; +/* The GOACC_REDUCTION function defines a generic interface to support + gang, worker and vector reductions. 
All calls are of the following + form: + + V = REDUCTION (CODE, REF_TO_RES, LOCAL_VAR, LEVEL, OP, OFFSET) + + REF_TO_RES - is a reference to the original reduction variable, may be NULL + LOCAL_VAR is the intermediate reduction variable + LEVEL corresponds to the GOMP_DIM of the reduction + OP is the tree code of the reduction operation + OFFSET may be used as an offset into a reduction array for the + reductions occurring at this level. + In general the return value is LOCAL_VAR, which creates a data + dependency between calls operating on the same reduction. */ + +enum ifn_goacc_reduction_kind { + IFN_GOACC_REDUCTION_SETUP, + IFN_GOACC_REDUCTION_INIT, + IFN_GOACC_REDUCTION_FINI, + IFN_GOACC_REDUCTION_TEARDOWN +}; + /* Initialize internal function tables. */ extern void init_internal_fns (); diff --git a/gcc/omp-low.c b/gcc/omp-low.c index ccf0b63..6a0915b 100644 --- a/gcc/omp-low.c +++ b/gcc/omp-low.c @@ -305,66 +305,6 @@ is_oacc_kernels (omp_context *ctx) == GF_OMP_TARGET_KIND_OACC_KERNELS)); } -/* Helper function to get the name of the array containing the partial - reductions for OpenACC reductions. */ -static const char * -oacc_get_reduction_array_id (tree node) -{ - const char *id = IDENTIFIER_POINTER (DECL_NAME (node)); - int len = strlen ("OACC") + strlen (id); - char *temp_name = XALLOCAVEC (char, len + 1); - snprintf (temp_name, len + 1, "OACC%s", id); - return IDENTIFIER_POINTER (get_identifier (temp_name)); -} - -/* Determine the number of threads OpenACC threads used to determine the - size of the array of partial reductions. Currently, this is num_gangs - * vector_length. This value may be different than GOACC_GET_NUM_THREADS, - because it is independed of the device used. */ - -static tree -oacc_max_threads (omp_context *ctx) -{ - tree nthreads, vector_length, gangs, clauses; - - gangs = fold_convert (sizetype, integer_one_node); - vector_length = gangs; - - /* The reduction clause may be nested inside a loop directive. 
- Scan for the innermost vector_length clause. */ - for (omp_context *oc = ctx; oc; oc = oc->outer) - { - if (gimple_code (oc->stmt) != GIMPLE_OMP_TARGET - || (gimple_omp_target_kind (oc->stmt) - != GF_OMP_TARGET_KIND_OACC_PARALLEL)) - continue; - - clauses = gimple_omp_target_clauses (oc->stmt); - - vector_length = find_omp_clause (clauses, OMP_CLAUSE_VECTOR_LENGTH); - if (vector_length) - vector_length = fold_convert_loc (OMP_CLAUSE_LOCATION (vector_length), - sizetype, - OMP_CLAUSE_VECTOR_LENGTH_EXPR - (vector_length)); - else - vector_length = fold_convert (sizetype, integer_one_node); - - gangs = find_omp_clause (clauses, OMP_CLAUSE_NUM_GANGS); - if (gangs) - gangs = fold_convert_loc (OMP_CLAUSE_LOCATION (gangs), sizetype, - OMP_CLAUSE_NUM_GANGS_EXPR (gangs)); - else - gangs = fold_convert (sizetype, integer_one_node); - - break; - } - - nthreads = fold_build2 (MULT_EXPR, sizetype, gangs, vector_length); - - return nthreads; -} - /* If DECL is the artificial dummy VAR_DECL created for non-static data member privatization, return the underlying "this" parameter, otherwise return NULL. */ @@ -2016,27 +1956,6 @@ scan_sharing_clauses (tree clauses, omp_context *ctx) install_var_field (decl, by_ref, 3, ctx); } install_var_local (decl, ctx); - if (is_gimple_omp_oacc (ctx->stmt) - && OMP_CLAUSE_CODE (c) == OMP_CLAUSE_REDUCTION) - { - /* Create a decl for the reduction array. */ - tree var = OMP_CLAUSE_DECL (c); - tree type = get_base_type (var); - tree ptype = build_pointer_type (type); - tree array = create_tmp_var (ptype, - oacc_get_reduction_array_id (var)); - omp_context *octx = (ctx->field_map ? ctx : ctx->outer); - install_var_field (array, true, 3, octx); - install_var_local (array, octx); - - /* Insert it into the current context. 
*/ - splay_tree_insert (ctx->reduction_map, (splay_tree_key) - oacc_get_reduction_array_id (var), - (splay_tree_value) array); - splay_tree_insert (ctx->reduction_map, - (splay_tree_key) array, - (splay_tree_value) array); - } break; case OMP_CLAUSE_USE_DEVICE_PTR: @@ -4938,6 +4857,10 @@ lower_rec_input_clauses (tree clauses, gimple_seq *ilist, gimple_seq *dlist, break; case OMP_CLAUSE_REDUCTION: + /* OpenACC reductions are initialized using the + GOACC_REDUCTION internal function. */ + if (is_gimple_omp_oacc (ctx->stmt)) + break; if (OMP_CLAUSE_REDUCTION_PLACEHOLDER (c)) { tree placeholder = OMP_CLAUSE_REDUCTION_PLACEHOLDER (c); @@ -5351,56 +5274,170 @@ lower_lastprivate_clauses (tree clauses, tree predicate, gimple_seq *stmt_list, gimple_seq_add_stmt (stmt_list, gimple_build_label (label)); } +/* Lower the OpenACC reductions of CLAUSES for compute axis LEVEL + (which might be a placeholder). INNER is true if this is an inner + axis of a multi-axis loop. FORK and JOIN are (optional) fork and + join markers. Generate the before-loop forking sequence in + FORK_SEQ and the after-loop joining sequence to JOIN_SEQ. The + general form of these sequences is + + GOACC_REDUCTION_SETUP + GOACC_FORK + GOACC_REDUCTION_INIT + ... + GOACC_REDUCTION_FINI + GOACC_JOIN + GOACC_REDUCTION_TEARDOWN. */ + static void -oacc_lower_reduction_var_helper (gimple_seq *stmt_seqp, omp_context *ctx, - tree tid, tree var, tree new_var) +lower_oacc_reductions (location_t loc, tree clauses, tree level, bool inner, + gcall *fork, gcall *join, gimple_seq *fork_seq, + gimple_seq *join_seq, omp_context *ctx) { - /* The atomic add at the end of the sum creates unnecessary - write contention on accelerators. To work around this, - create an array to store the partial reductions. Later, in - lower_omp_for (for openacc), the values of array will be - combined. 
*/ - - tree t = NULL_TREE, array, x; - tree type = get_base_type (var); - gimple *stmt; + gimple_seq before_fork = NULL; + gimple_seq after_fork = NULL; + gimple_seq before_join = NULL; + gimple_seq after_join = NULL; + tree init_code = NULL_TREE, fini_code = NULL_TREE, + setup_code = NULL_TREE, teardown_code = NULL_TREE; + unsigned offset = 0; + + for (tree c = clauses; c; c = OMP_CLAUSE_CHAIN (c)) + if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_REDUCTION) + { + tree orig = OMP_CLAUSE_DECL (c); + tree var = maybe_lookup_decl (orig, ctx); + tree ref_to_res = NULL_TREE; + tree incoming, outgoing; + + enum tree_code rcode = OMP_CLAUSE_REDUCTION_CODE (c); + if (rcode == MINUS_EXPR) + rcode = PLUS_EXPR; + else if (rcode == TRUTH_ANDIF_EXPR) + rcode = BIT_AND_EXPR; + else if (rcode == TRUTH_ORIF_EXPR) + rcode = BIT_IOR_EXPR; + tree op = build_int_cst (unsigned_type_node, rcode); + + if (!var) + var = orig; + gcc_assert (!is_reference (var)); + + incoming = outgoing = var; + + if (!inner) + { + /* See if an outer construct also reduces this variable. */ + omp_context *outer = ctx; - /* Now insert the partial reductions into the array. */ + while (omp_context *probe = outer->outer) + { + enum gimple_code type = gimple_code (probe->stmt); + tree cls; - /* Find the reduction array. 
*/ + switch (type) + { + case GIMPLE_OMP_FOR: + cls = gimple_omp_for_clauses (probe->stmt); + break; - tree ptype = build_pointer_type (type); + case GIMPLE_OMP_TARGET: + if (gimple_omp_target_kind (probe->stmt) + != GF_OMP_TARGET_KIND_OACC_PARALLEL) + goto do_lookup; - t = lookup_oacc_reduction (oacc_get_reduction_array_id (var), ctx); - t = build_receiver_ref (t, false, ctx->outer); + cls = gimple_omp_target_clauses (probe->stmt); + break; - array = create_tmp_var (ptype); - gimplify_assign (array, t, stmt_seqp); + default: + goto do_lookup; + } + + outer = probe; + for (; cls; cls = OMP_CLAUSE_CHAIN (cls)) + if (OMP_CLAUSE_CODE (cls) == OMP_CLAUSE_REDUCTION + && orig == OMP_CLAUSE_DECL (cls)) + goto has_outer_reduction; + } - tree ptr = create_tmp_var (TREE_TYPE (array)); + do_lookup: + /* This is the outermost construct with this reduction, + see if there's a mapping for it. */ + if (gimple_code (outer->stmt) == GIMPLE_OMP_TARGET + && maybe_lookup_field (orig, outer)) + { + ref_to_res = build_receiver_ref (orig, false, outer); + if (is_reference (orig)) + ref_to_res = build_simple_mem_ref (ref_to_res); - /* Find the reduction array. */ + outgoing = var; + incoming = omp_reduction_init_op (loc, rcode, TREE_TYPE (var)); + } + else + incoming = outgoing = orig; + + has_outer_reduction:; + } - /* testing a unary conversion. */ - tree offset = create_tmp_var (sizetype); - gimplify_assign (offset, TYPE_SIZE_UNIT (type), - stmt_seqp); - t = create_tmp_var (sizetype); - gimplify_assign (t, unshare_expr (fold_build1 (NOP_EXPR, sizetype, tid)), - stmt_seqp); - stmt = gimple_build_assign (offset, MULT_EXPR, offset, t); - gimple_seq_add_stmt (stmt_seqp, stmt); + if (!ref_to_res) + ref_to_res = integer_zero_node; - /* Offset expression. Does the POINTER_PLUS_EXPR take care - of adding sizeof(var) to the array? 
*/ - ptr = create_tmp_var (ptype); - stmt = gimple_build_assign (unshare_expr (ptr), POINTER_PLUS_EXPR, array, - offset); - gimple_seq_add_stmt (stmt_seqp, stmt); + /* Determine position in reduction buffer, which may be used + by target. */ + enum machine_mode mode = TYPE_MODE (TREE_TYPE (var)); + unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT; + offset = (offset + align - 1) & ~(align - 1); + tree off = build_int_cst (sizetype, offset); + offset += GET_MODE_SIZE (mode); - /* Move the local sum to gfc$sum[i]. */ - x = unshare_expr (build_simple_mem_ref (ptr)); - stmt = gimplify_assign (x, new_var, stmt_seqp); + if (!init_code) + { + init_code = build_int_cst (integer_type_node, + IFN_GOACC_REDUCTION_INIT); + fini_code = build_int_cst (integer_type_node, + IFN_GOACC_REDUCTION_FINI); + setup_code = build_int_cst (integer_type_node, + IFN_GOACC_REDUCTION_SETUP); + teardown_code = build_int_cst (integer_type_node, + IFN_GOACC_REDUCTION_TEARDOWN); + } + + tree setup_call + = build_call_expr_internal_loc (loc, IFN_GOACC_REDUCTION, + TREE_TYPE (var), 6, setup_code, + unshare_expr (ref_to_res), + incoming, level, op, off); + tree init_call + = build_call_expr_internal_loc (loc, IFN_GOACC_REDUCTION, + TREE_TYPE (var), 6, init_code, + unshare_expr (ref_to_res), + var, level, op, off); + tree fini_call + = build_call_expr_internal_loc (loc, IFN_GOACC_REDUCTION, + TREE_TYPE (var), 6, fini_code, + unshare_expr (ref_to_res), + var, level, op, off); + tree teardown_call + = build_call_expr_internal_loc (loc, IFN_GOACC_REDUCTION, + TREE_TYPE (var), 6, teardown_code, + ref_to_res, var, level, op, off); + + gimplify_assign (var, setup_call, &before_fork); + gimplify_assign (var, init_call, &after_fork); + gimplify_assign (var, fini_call, &before_join); + gimplify_assign (outgoing, teardown_call, &after_join); + } + + /* Now stitch things together. 
*/ + gimple_seq_add_seq (fork_seq, before_fork); + if (fork) + gimple_seq_add_stmt (fork_seq, fork); + gimple_seq_add_seq (fork_seq, after_fork); + + gimple_seq_add_seq (join_seq, before_join); + if (join) + gimple_seq_add_stmt (join_seq, join); + gimple_seq_add_seq (join_seq, after_join); } /* Generate code to implement the REDUCTION clauses. */ @@ -5413,6 +5450,10 @@ lower_reduction_clauses (tree clauses, gimple_seq *stmt_seqp, omp_context *ctx) tree x, c, tid = NULL_TREE; int count = 0; + /* OpenACC loop reductions are handled elsewhere. */ + if (is_gimple_omp_oacc (ctx->stmt)) + return; + /* SIMD reductions are handled in lower_rec_input_clauses. */ if (gimple_code (ctx->stmt) == GIMPLE_OMP_FOR && gimple_omp_for_kind (ctx->stmt) & GF_OMP_FOR_SIMD) @@ -5484,13 +5525,7 @@ lower_reduction_clauses (tree clauses, gimple_seq *stmt_seqp, omp_context *ctx) if (code == MINUS_EXPR) code = PLUS_EXPR; - if (is_gimple_omp_oacc (ctx->stmt)) - { - gcc_checking_assert (!OMP_CLAUSE_REDUCTION_PLACEHOLDER (c)); - - oacc_lower_reduction_var_helper (stmt_seqp, ctx, tid, var, new_var); - } - else if (count == 1) + if (count == 1) { tree addr = build_fold_addr_expr_loc (clause_loc, ref); @@ -6055,8 +6090,8 @@ lower_oacc_head_tail (location_t loc, tree clauses, build_int_cst (integer_type_node, done), &join_seq); - gimple_seq_add_stmt (&fork_seq, fork); - gimple_seq_add_stmt (&join_seq, join); + lower_oacc_reductions (loc, clauses, place, inner, + fork, join, &fork_seq, &join_seq, ctx); /* Append this level to head. */ gimple_seq_add_seq (head, fork_seq); @@ -12948,446 +12983,6 @@ make_pass_expand_omp_ssa (gcc::context *ctxt) /* Routines to lower OMP directives into OMP-GIMPLE. */ -/* Helper function to preform, potentially COMPLEX_TYPE, operation and - convert it to gimple. 
*/ -static void -oacc_gimple_assign (tree dest, tree_code op, tree src, gimple_seq *seq) -{ - gimple *stmt; - - if (TREE_CODE (TREE_TYPE (dest)) != COMPLEX_TYPE) - { - stmt = gimple_build_assign (dest, op, dest, src); - gimple_seq_add_stmt (seq, stmt); - return; - } - - tree t = create_tmp_var (TREE_TYPE (TREE_TYPE (dest))); - tree rdest = fold_build1 (REALPART_EXPR, TREE_TYPE (TREE_TYPE (dest)), dest); - gimplify_assign (t, rdest, seq); - rdest = t; - - t = create_tmp_var (TREE_TYPE (TREE_TYPE (dest))); - tree idest = fold_build1 (IMAGPART_EXPR, TREE_TYPE (TREE_TYPE (dest)), dest); - gimplify_assign (t, idest, seq); - idest = t; - - t = create_tmp_var (TREE_TYPE (TREE_TYPE (src))); - tree rsrc = fold_build1 (REALPART_EXPR, TREE_TYPE (TREE_TYPE (src)), src); - gimplify_assign (t, rsrc, seq); - rsrc = t; - - t = create_tmp_var (TREE_TYPE (TREE_TYPE (src))); - tree isrc = fold_build1 (IMAGPART_EXPR, TREE_TYPE (TREE_TYPE (src)), src); - gimplify_assign (t, isrc, seq); - isrc = t; - - tree r = create_tmp_var (TREE_TYPE (TREE_TYPE (dest))); - tree i = create_tmp_var (TREE_TYPE (TREE_TYPE (dest))); - tree result; - - if (op == PLUS_EXPR) - { - stmt = gimple_build_assign (r, op, rdest, rsrc); - gimple_seq_add_stmt (seq, stmt); - - stmt = gimple_build_assign (i, op, idest, isrc); - gimple_seq_add_stmt (seq, stmt); - } - else if (op == MULT_EXPR) - { - /* Let x = a + ib = dest, y = c + id = src. 
- x * y = (ac - bd) + i(ad + bc) */ - tree ac = create_tmp_var (TREE_TYPE (TREE_TYPE (dest))); - tree bd = create_tmp_var (TREE_TYPE (TREE_TYPE (dest))); - tree ad = create_tmp_var (TREE_TYPE (TREE_TYPE (dest))); - tree bc = create_tmp_var (TREE_TYPE (TREE_TYPE (dest))); - - stmt = gimple_build_assign (ac, MULT_EXPR, rdest, rsrc); - gimple_seq_add_stmt (seq, stmt); - - stmt = gimple_build_assign (bd, MULT_EXPR, idest, isrc); - gimple_seq_add_stmt (seq, stmt); - - stmt = gimple_build_assign (r, MINUS_EXPR, ac, bd); - gimple_seq_add_stmt (seq, stmt); - - stmt = gimple_build_assign (ad, MULT_EXPR, rdest, isrc); - gimple_seq_add_stmt (seq, stmt); - - stmt = gimple_build_assign (bd, MULT_EXPR, idest, rsrc); - gimple_seq_add_stmt (seq, stmt); - - stmt = gimple_build_assign (i, PLUS_EXPR, ad, bc); - gimple_seq_add_stmt (seq, stmt); - } - else - gcc_unreachable (); - - result = build2 (COMPLEX_EXPR, TREE_TYPE (dest), r, i); - gimplify_assign (dest, result, seq); -} - -/* Initialize the reduction array with default values. */ - -static void -oacc_init_reduction_array (tree array, tree init, tree nthreads, - gimple_seq *stmt_seqp) -{ - tree type = TREE_TYPE (TREE_TYPE (array)); - tree x, loop_header, loop_body, loop_exit; - gimple *stmt; - - /* Create for loop. - - let var = the original reduction variable - let array = reduction variable array - - for (i = 0; i < nthreads; i++) - var op= array[i] - */ - - loop_header = create_artificial_label (UNKNOWN_LOCATION); - loop_body = create_artificial_label (UNKNOWN_LOCATION); - loop_exit = create_artificial_label (UNKNOWN_LOCATION); - - /* Create and initialize an index variable. */ - tree ix = create_tmp_var (sizetype); - gimplify_assign (ix, fold_build1 (NOP_EXPR, sizetype, integer_zero_node), - stmt_seqp); - - /* Insert the loop header label here. */ - gimple_seq_add_stmt (stmt_seqp, gimple_build_label (loop_header)); - - /* Exit loop if ix >= nthreads. 
*/ - x = create_tmp_var (sizetype); - gimplify_assign (x, fold_build1 (NOP_EXPR, sizetype, nthreads), stmt_seqp); - stmt = gimple_build_cond (GE_EXPR, ix, x, loop_exit, loop_body); - gimple_seq_add_stmt (stmt_seqp, stmt); - - /* Insert the loop body label here. */ - gimple_seq_add_stmt (stmt_seqp, gimple_build_label (loop_body)); - - /* Calculate the array offset. */ - tree offset = create_tmp_var (sizetype); - gimplify_assign (offset, TYPE_SIZE_UNIT (type), stmt_seqp); - stmt = gimple_build_assign (offset, MULT_EXPR, offset, ix); - gimple_seq_add_stmt (stmt_seqp, stmt); - - tree ptr = create_tmp_var (TREE_TYPE (array)); - stmt = gimple_build_assign (ptr, POINTER_PLUS_EXPR, array, offset); - gimple_seq_add_stmt (stmt_seqp, stmt); - - /* Assign init. */ - gimplify_assign (build_simple_mem_ref (ptr), init, stmt_seqp); - - /* Increment the induction variable. */ - tree one = fold_build1 (NOP_EXPR, sizetype, integer_one_node); - stmt = gimple_build_assign (ix, PLUS_EXPR, ix, one); - gimple_seq_add_stmt (stmt_seqp, stmt); - - /* Go back to the top of the loop. */ - gimple_seq_add_stmt (stmt_seqp, gimple_build_goto (loop_header)); - - /* Place the loop exit label here. */ - gimple_seq_add_stmt (stmt_seqp, gimple_build_label (loop_exit)); -} - -/* Helper function to initialize local data for the reduction arrays. - The reduction arrays need to be placed inside the calling function - for accelerators, or else the host won't be able to preform the final - reduction. */ - -static void -oacc_initialize_reduction_data (tree clauses, tree nthreads, - gimple_seq *stmt_seqp, omp_context *ctx) -{ - tree c, t, oc; - gimple *stmt; - omp_context *octx; - - /* Find the innermost OpenACC parallel context. 
*/ - if (gimple_code (ctx->stmt) == GIMPLE_OMP_TARGET - && (gimple_omp_target_kind (ctx->stmt) - == GF_OMP_TARGET_KIND_OACC_PARALLEL)) - octx = ctx; - else - octx = ctx->outer; - gcc_checking_assert (gimple_code (octx->stmt) == GIMPLE_OMP_TARGET - && (gimple_omp_target_kind (octx->stmt) - == GF_OMP_TARGET_KIND_OACC_PARALLEL)); - - /* Extract the clauses. */ - oc = gimple_omp_target_clauses (octx->stmt); - - /* Find the last outer clause. */ - for (; oc && OMP_CLAUSE_CHAIN (oc); oc = OMP_CLAUSE_CHAIN (oc)) - ; - - /* Allocate arrays for each reduction variable. */ - for (c = clauses; c; c = OMP_CLAUSE_CHAIN (c)) - { - if (OMP_CLAUSE_CODE (c) != OMP_CLAUSE_REDUCTION) - continue; - - tree var = OMP_CLAUSE_DECL (c); - tree type = get_base_type (var); - tree array = lookup_oacc_reduction (oacc_get_reduction_array_id (var), - ctx); - tree size, call; - - /* Calculate size of the reduction array. */ - t = create_tmp_var (TREE_TYPE (nthreads)); - stmt = gimple_build_assign (t, MULT_EXPR, nthreads, - fold_convert (TREE_TYPE (nthreads), - TYPE_SIZE_UNIT (type))); - gimple_seq_add_stmt (stmt_seqp, stmt); - - size = create_tmp_var (sizetype); - gimplify_assign (size, fold_build1 (NOP_EXPR, sizetype, t), stmt_seqp); - - /* Now allocate memory for it. */ - call = unshare_expr (builtin_decl_explicit (BUILT_IN_ALLOCA)); - stmt = gimple_build_call (call, 1, size); - gimple_call_set_lhs (stmt, array); - gimple_seq_add_stmt (stmt_seqp, stmt); - - /* Initialize array. */ - tree init = omp_reduction_init_op (OMP_CLAUSE_LOCATION (c), - OMP_CLAUSE_REDUCTION_CODE (c), - type); - oacc_init_reduction_array (array, init, nthreads, stmt_seqp); - - /* Map this array into the accelerator. */ - - /* Add the reduction array to the list of clauses. 
*/ - tree x = array; - t = build_omp_clause (gimple_location (ctx->stmt), OMP_CLAUSE_MAP); - OMP_CLAUSE_SET_MAP_KIND (t, GOMP_MAP_FORCE_TOFROM); - OMP_CLAUSE_DECL (t) = x; - OMP_CLAUSE_CHAIN (t) = NULL; - if (oc) - OMP_CLAUSE_CHAIN (oc) = t; - else - gimple_omp_target_set_clauses (as_a <gomp_target *> (octx->stmt), t); - OMP_CLAUSE_SIZE (t) = size; - oc = t; - } -} - -/* Helper function to process the array of partial reductions. Nthreads - indicates the number of threads. Unfortunately, GOACC_GET_NUM_THREADS - cannot be used here, because nthreads on the host may be different than - on the accelerator. */ - -static void -oacc_finalize_reduction_data (tree clauses, tree nthreads, - gimple_seq *stmt_seqp, omp_context *ctx) -{ - tree c, x, var, array, loop_header, loop_body, loop_exit, type; - gimple *stmt; - - /* Create for loop. - - let var = the original reduction variable - let array = reduction variable array - - for (i = 0; i < nthreads; i++) - var op= array[i] - */ - - loop_header = create_artificial_label (UNKNOWN_LOCATION); - loop_body = create_artificial_label (UNKNOWN_LOCATION); - loop_exit = create_artificial_label (UNKNOWN_LOCATION); - - /* Create and initialize an index variable. */ - tree ix = create_tmp_var (sizetype); - gimplify_assign (ix, fold_build1 (NOP_EXPR, sizetype, integer_zero_node), - stmt_seqp); - - /* Insert the loop header label here. */ - gimple_seq_add_stmt (stmt_seqp, gimple_build_label (loop_header)); - - /* Exit loop if ix >= nthreads. */ - x = create_tmp_var (sizetype); - gimplify_assign (x, fold_build1 (NOP_EXPR, sizetype, nthreads), stmt_seqp); - stmt = gimple_build_cond (GE_EXPR, ix, x, loop_exit, loop_body); - gimple_seq_add_stmt (stmt_seqp, stmt); - - /* Insert the loop body label here. */ - gimple_seq_add_stmt (stmt_seqp, gimple_build_label (loop_body)); - - /* Collapse each reduction array, one element at a time. 
*/ - for (c = clauses; c; c = OMP_CLAUSE_CHAIN (c)) - { - if (OMP_CLAUSE_CODE (c) != OMP_CLAUSE_REDUCTION) - continue; - - tree_code reduction_code = OMP_CLAUSE_REDUCTION_CODE (c); - - /* reduction(-:var) sums up the partial results, so it acts - identically to reduction(+:var). */ - if (reduction_code == MINUS_EXPR) - reduction_code = PLUS_EXPR; - - /* Set up reduction variable var. */ - var = OMP_CLAUSE_DECL (c); - type = get_base_type (var); - array = lookup_oacc_reduction (oacc_get_reduction_array_id - (OMP_CLAUSE_DECL (c)), ctx); - - /* Calculate the array offset. */ - tree offset = create_tmp_var (sizetype); - gimplify_assign (offset, TYPE_SIZE_UNIT (type), stmt_seqp); - stmt = gimple_build_assign (offset, MULT_EXPR, offset, ix); - gimple_seq_add_stmt (stmt_seqp, stmt); - - tree ptr = create_tmp_var (TREE_TYPE (array)); - stmt = gimple_build_assign (ptr, POINTER_PLUS_EXPR, array, offset); - gimple_seq_add_stmt (stmt_seqp, stmt); - - /* Extract array[ix] into mem. */ - tree mem = create_tmp_var (type); - gimplify_assign (mem, build_simple_mem_ref (ptr), stmt_seqp); - - /* Find the original reduction variable. */ - if (is_reference (var)) - var = build_simple_mem_ref (var); - - tree t = create_tmp_var (type); - - x = lang_hooks.decls.omp_clause_assign_op (c, t, var); - gimplify_and_add (unshare_expr(x), stmt_seqp); - - /* var = var op mem */ - switch (OMP_CLAUSE_REDUCTION_CODE (c)) - { - case TRUTH_ANDIF_EXPR: - case TRUTH_ORIF_EXPR: - t = fold_build2 (OMP_CLAUSE_REDUCTION_CODE (c), integer_type_node, - t, mem); - gimplify_and_add (t, stmt_seqp); - break; - default: - /* The lhs isn't a gimple_reg when var is COMPLEX_TYPE. */ - oacc_gimple_assign (t, OMP_CLAUSE_REDUCTION_CODE (c), mem, - stmt_seqp); - } - - t = fold_build1 (NOP_EXPR, TREE_TYPE (var), t); - x = lang_hooks.decls.omp_clause_assign_op (c, var, t); - gimplify_and_add (unshare_expr(x), stmt_seqp); - } - - /* Increment the induction variable. 
*/ - tree one = fold_build1 (NOP_EXPR, sizetype, integer_one_node); - stmt = gimple_build_assign (ix, PLUS_EXPR, ix, one); - gimple_seq_add_stmt (stmt_seqp, stmt); - - /* Go back to the top of the loop. */ - gimple_seq_add_stmt (stmt_seqp, gimple_build_goto (loop_header)); - - /* Place the loop exit label here. */ - gimple_seq_add_stmt (stmt_seqp, gimple_build_label (loop_exit)); -} - -/* Scan through all of the gimple stmts searching for an OMP_FOR_EXPR, and - scan that for reductions. */ - -static void -oacc_process_reduction_data (gimple_seq *body, gimple_seq *in_stmt_seqp, - gimple_seq *out_stmt_seqp, omp_context *ctx) -{ - gimple_stmt_iterator gsi; - gimple_seq inner = NULL; - - /* A collapse clause may have inserted a new bind block. */ - gsi = gsi_start (*body); - while (!gsi_end_p (gsi)) - { - gimple *stmt = gsi_stmt (gsi); - if (gbind *bind_stmt = dyn_cast <gbind *> (stmt)) - { - inner = gimple_bind_body (bind_stmt); - body = &inner; - gsi = gsi_start (*body); - } - else if (dyn_cast <gomp_for *> (stmt)) - break; - else - gsi_next (&gsi); - } - - for (gsi = gsi_start (*body); !gsi_end_p (gsi); gsi_next (&gsi)) - { - tree clauses, nthreads, t, c, acc_device, acc_device_host, call, - enter, exit; - bool reduction_found = false; - - gimple *stmt = gsi_stmt (gsi); - - switch (gimple_code (stmt)) - { - case GIMPLE_OMP_FOR: - clauses = gimple_omp_for_clauses (stmt); - - /* Search for a reduction clause. */ - for (c = clauses; c; c = OMP_CLAUSE_CHAIN (c)) - if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_REDUCTION) - { - reduction_found = true; - break; - } - - if (!reduction_found) - break; - - ctx = maybe_lookup_ctx (stmt); - t = NULL_TREE; - - /* Extract the number of threads. */ - nthreads = create_tmp_var (sizetype); - t = oacc_max_threads (ctx); - gimplify_assign (nthreads, t, in_stmt_seqp); - - /* Determine if this is kernel will be executed on the host. 
*/ - call = builtin_decl_explicit (BUILT_IN_ACC_GET_DEVICE_TYPE); - acc_device = create_tmp_var (integer_type_node, ".acc_device_type"); - stmt = gimple_build_call (call, 0); - gimple_call_set_lhs (stmt, acc_device); - gimple_seq_add_stmt (in_stmt_seqp, stmt); - - /* Set nthreads = 1 for ACC_DEVICE_TYPE=host. */ - acc_device_host = create_tmp_var (integer_type_node, - ".acc_device_host"); - gimplify_assign (acc_device_host, - build_int_cst (integer_type_node, - GOMP_DEVICE_HOST), - in_stmt_seqp); - - enter = create_artificial_label (UNKNOWN_LOCATION); - exit = create_artificial_label (UNKNOWN_LOCATION); - - stmt = gimple_build_cond (EQ_EXPR, acc_device, acc_device_host, - enter, exit); - gimple_seq_add_stmt (in_stmt_seqp, stmt); - gimple_seq_add_stmt (in_stmt_seqp, gimple_build_label (enter)); - gimplify_assign (nthreads, fold_build1 (NOP_EXPR, sizetype, - integer_one_node), - in_stmt_seqp); - gimple_seq_add_stmt (in_stmt_seqp, gimple_build_label (exit)); - - oacc_initialize_reduction_data (clauses, nthreads, in_stmt_seqp, - ctx); - oacc_finalize_reduction_data (clauses, nthreads, out_stmt_seqp, ctx); - break; - default: - // Scan for other directives which support reduction here. 
- break; - } - } -} - /* If ctx is a worksharing context inside of a cancellable parallel region and it isn't nowait, add lhs to its GIMPLE_OMP_RETURN and conditional branch to parallel's cancel_label to handle @@ -15009,9 +14604,6 @@ lower_omp_target (gimple_stmt_iterator *gsi_p, omp_context *ctx) irlist = NULL; orlist = NULL; - if (offloaded - && is_gimple_omp_oacc (stmt)) - oacc_process_reduction_data (&tgt_body, &irlist, &orlist, ctx); for (c = clauses; c ; c = OMP_CLAUSE_CHAIN (c)) switch (OMP_CLAUSE_CODE (c)) @@ -15933,7 +15525,22 @@ lower_omp_target (gimple_stmt_iterator *gsi_p, omp_context *ctx) break; } + gimple_seq fork_seq = NULL; + gimple_seq join_seq = NULL; + + if (is_oacc_parallel (ctx)) + { + /* If there are reductions on the offloaded region itself, treat + them as a dummy GANG loop. */ + tree level = build_int_cst (integer_type_node, GOMP_DIM_GANG); + + lower_oacc_reductions (gimple_location (ctx->stmt), clauses, level, + false, NULL, NULL, &fork_seq, &join_seq, ctx); + } + + gimple_seq_add_seq (&new_body, fork_seq); gimple_seq_add_seq (&new_body, tgt_body); + gimple_seq_add_seq (&new_body, join_seq); if (offloaded) new_body = maybe_catch_exception (new_body); @@ -19022,6 +18629,11 @@ oacc_loop_xform_head_tail (gcall *from, int level) else if (k == kind && stmt != from) break; } + else if (is_gimple_call (stmt) + && gimple_call_internal_p (stmt) + && gimple_call_internal_fn (stmt) == IFN_GOACC_REDUCTION) + *gimple_call_arg_ptr (stmt, 3) = replacement; + gsi_next (&gsi); while (gsi_end_p (gsi)) gsi = gsi_start_bb (single_succ (gsi_bb (gsi))); @@ -19240,6 +18852,53 @@ default_goacc_fork_join (gcall *ARG_UNUSED (call), return targetm.have_oacc_join (); } +/* Default goacc.reduction early expander. 
+ + LHS-opt = IFN_REDUCTION (KIND, RES_PTR, VAR, LEVEL, OP, OFFSET) + If RES_PTR is not integer-zerop: + SETUP - emit 'LHS = *RES_PTR', LHS = NULL + TEARDOWN - emit '*RES_PTR = VAR' + If LHS is not NULL + emit 'LHS = VAR' */ + +void +default_goacc_reduction (gcall *call) +{ + unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0)); + gimple_stmt_iterator gsi = gsi_for_stmt (call); + tree lhs = gimple_call_lhs (call); + tree var = gimple_call_arg (call, 2); + gimple_seq seq = NULL; + + if (code == IFN_GOACC_REDUCTION_SETUP + || code == IFN_GOACC_REDUCTION_TEARDOWN) + { + /* Setup and Teardown need to copy from/to the receiver object, + if there is one. */ + tree ref_to_res = gimple_call_arg (call, 1); + + if (!integer_zerop (ref_to_res)) + { + tree dst = build_simple_mem_ref (ref_to_res); + tree src = var; + + if (code == IFN_GOACC_REDUCTION_SETUP) + { + src = dst; + dst = lhs; + lhs = NULL; + } + gimple_seq_add_stmt (&seq, gimple_build_assign (dst, src)); + } + } + + /* Copy VAR to LHS, if there is an LHS. */ + if (lhs) + gimple_seq_add_stmt (&seq, gimple_build_assign (lhs, var)); + + gsi_replace_with_seq (&gsi, seq, true); +} + /* Main entry point for oacc transformations which run on the device compiler after LTO, so we know what the target device is at this point (including the host fallback). */ @@ -19267,6 +18926,10 @@ execute_oacc_device_lower () fprintf (dump_file, "\n"); } + /* Offloaded targets may introduce new basic blocks, which require + dominance information to update SSA. */ + calculate_dominance_info (CDI_DOMINATORS); + /* Now lower internal loop functions to target-specific code sequences. */ basic_block bb; @@ -19301,6 +18964,19 @@ execute_oacc_device_lower () rescan = true; break; + case IFN_GOACC_REDUCTION: + /* Mark the function for SSA renaming. */ + mark_virtual_operands_for_renaming (cfun); + + /* If the level is -1, this ended up being an unused + axis. Handle as a default. 
*/ + if (integer_minus_onep (gimple_call_arg (call, 3))) + default_goacc_reduction (call); + else + targetm.goacc.reduction (call); + rescan = true; + break; + case IFN_UNIQUE: { enum ifn_unique_kind kind diff --git a/gcc/target.def b/gcc/target.def index 773b6ef..461af6b 100644 --- a/gcc/target.def +++ b/gcc/target.def @@ -1670,6 +1670,17 @@ The default hook returns false, if there are no RTL expanders for them.", bool, (gcall *call, const int *dims, bool is_fork), default_goacc_fork_join) +DEFHOOK +(reduction, +"This hook is used by the oacc_transform pass to expand calls to the\n\ +@var{GOACC_REDUCTION} internal function, into a sequence of gimple\n\ +instructions. @var{call} is gimple statement containing the call to\n\ +the function. This hook removes statement @var{call} after the\n\ +expanded sequence has been inserted. This hook is also responsible\n\ +for allocating any storage for reductions when necessary.", +void, (gcall *call), +default_goacc_reduction) + HOOK_VECTOR_END (goacc) /* Functions relating to vectorization. */ diff --git a/gcc/targhooks.h b/gcc/targhooks.h index 36715c3..c34e4ae 100644 --- a/gcc/targhooks.h +++ b/gcc/targhooks.h @@ -111,6 +111,7 @@ extern void default_destroy_cost_data (void *); /* OpenACC hooks. */ extern bool default_goacc_validate_dims (tree, int [], int); extern bool default_goacc_fork_join (gcall *, const int [], bool); +extern void default_goacc_reduction (gcall *); /* These are here, and not in hooks.[ch], because not all users of hooks.h include tm.h, and thus we don't have CUMULATIVE_ARGS. */ -- 2.7.4