Some applications require not only tilable bands, but also parallel
loops inside those bands. The default algorithm favors larger bands
over having parallel loops inside the band. It is always possible
to sacrifice one of the loops in the band to create parallelism by
applying a wavefront transformation. This typically leads to more
complicated schedules, however.
Instead, when enabled, we now force there to be at least one parallel
loop inside each tilable band.
Signed-off-by: Sven Verdoolaege <skimo@kotnet.org>
int convex;
int schedule_parametric;
int convex;
int schedule_parametric;
+ int schedule_outer_parallelism;
};
ISL_ARG_DECL(isl_options, struct isl_options, isl_options_arg)
};
ISL_ARG_DECL(isl_options, struct isl_options, isl_options_arg)
convex, ISL_CONVEX_HULL_WRAP, "convex hull algorithm to use")
ISL_ARG_BOOL(struct isl_options, schedule_parametric, 0,
"schedule-parametric", 1, "construct possibly parametric schedules")
convex, ISL_CONVEX_HULL_WRAP, "convex hull algorithm to use")
ISL_ARG_BOOL(struct isl_options, schedule_parametric, 0,
"schedule-parametric", 1, "construct possibly parametric schedules")
+ISL_ARG_BOOL(struct isl_options, schedule_outer_parallelism, 0,
+ "schedule-outer-parallelism", 0,
+ "try to construct schedules with outer parallelism")
ISL_ARG_VERSION(print_version)
ISL_ARG_END
};
ISL_ARG_VERSION(print_version)
ISL_ARG_END
};
*
* The constraints are those from the edges plus two or three equalities
* to express the sums.
*
* The constraints are those from the edges plus two or three equalities
* to express the sums.
+ *
+ * If force_parallel is set, then we add equalities to ensure that
+ * the sum of the m_n coefficients and m_0 are both zero.
-static int setup_lp(isl_ctx *ctx, struct isl_sched_graph *graph)
+static int setup_lp(isl_ctx *ctx, struct isl_sched_graph *graph,
+ int force_parallel)
dim = isl_dim_set_alloc(ctx, 0, total);
isl_basic_set_free(graph->lp);
dim = isl_dim_set_alloc(ctx, 0, total);
isl_basic_set_free(graph->lp);
- n_eq += 2 + parametric;
+ n_eq += 2 + parametric + force_parallel;
graph->lp = isl_basic_set_alloc_dim(dim, 0, n_eq, n_ineq);
k = isl_basic_set_alloc_equality(graph->lp);
if (k < 0)
return -1;
isl_seq_clr(graph->lp->eq[k], 1 + total);
graph->lp = isl_basic_set_alloc_dim(dim, 0, n_eq, n_ineq);
k = isl_basic_set_alloc_equality(graph->lp);
if (k < 0)
return -1;
isl_seq_clr(graph->lp->eq[k], 1 + total);
- isl_int_set_si(graph->lp->eq[k][1], -1);
+ if (!force_parallel)
+ isl_int_set_si(graph->lp->eq[k][1], -1);
for (i = 0; i < 2 * nparam; ++i)
isl_int_set_si(graph->lp->eq[k][1 + param_pos + i], 1);
for (i = 0; i < 2 * nparam; ++i)
isl_int_set_si(graph->lp->eq[k][1 + param_pos + i], 1);
+ if (force_parallel) {
+ k = isl_basic_set_alloc_equality(graph->lp);
+ if (k < 0)
+ return -1;
+ isl_seq_clr(graph->lp->eq[k], 1 + total);
+ isl_int_set_si(graph->lp->eq[k][2], -1);
+ }
+
if (parametric) {
k = isl_basic_set_alloc_equality(graph->lp);
if (k < 0)
if (parametric) {
k = isl_basic_set_alloc_equality(graph->lp);
if (k < 0)
*
* If we manage to complete the schedule, we finish off by topologically
* sorting the statements based on the remaining dependences.
*
* If we manage to complete the schedule, we finish off by topologically
* sorting the statements based on the remaining dependences.
+ *
+ * If ctx->opt->schedule_outer_parallelism is set, then we force the
+ * outermost dimension in the current band to be parallel. If this
+ * turns out to be impossible, we fall back on the general scheme above
+ * and try to carry as many dependences as possible.
*/
static int compute_schedule_wcc(isl_ctx *ctx, struct isl_sched_graph *graph)
{
*/
static int compute_schedule_wcc(isl_ctx *ctx, struct isl_sched_graph *graph)
{
+ int force_parallel = 0;
+
if (detect_sccs(graph) < 0)
return -1;
sort_sccs(graph);
if (detect_sccs(graph) < 0)
return -1;
sort_sccs(graph);
if (compute_maxvar(graph) < 0)
return -1;
if (compute_maxvar(graph) < 0)
return -1;
+ if (ctx->opt->schedule_outer_parallelism)
+ force_parallel = 1;
+
while (graph->n_row < graph->maxvar) {
isl_vec *sol;
graph->src_scc = -1;
graph->dst_scc = -1;
while (graph->n_row < graph->maxvar) {
isl_vec *sol;
graph->src_scc = -1;
graph->dst_scc = -1;
- if (setup_lp(ctx, graph) < 0)
+ if (setup_lp(ctx, graph, force_parallel) < 0)
return -1;
sol = solve_lp(graph);
if (!sol)
return -1;
sol = solve_lp(graph);
if (!sol)
}
if (update_schedule(graph, sol, 1, 1) < 0)
return -1;
}
if (update_schedule(graph, sol, 1, 1) < 0)
return -1;
}
if (graph->n_total_row > graph->band_start)
}
if (graph->n_total_row > graph->band_start)