If tess patches are wholly in one wave, "s_waitcnt lgkm(0)" is sufficient and no s_barrier is needed.
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16304>
break;
}
case nir_intrinsic_control_barrier:
+ /* If output patches are wholly in one wave, we don't need a barrier. */
+ if (ctx->stage == MESA_SHADER_TESS_CTRL &&
+ ctx->ac.wave_size % ctx->info->tess.tcs_vertices_out == 0)
+ break;
+
ac_build_s_barrier(&ctx->ac, ctx->stage);
break;
case nir_intrinsic_shared_atomic_add:
memset(key, 0, sizeof(*key));
key->tcs_epilog.wave32 = shader->wave_size == 32;
key->tcs_epilog.states = shader->key.ge.part.tcs.epilog;
+
+ /* If output patches are wholly in one wave, we don't need a barrier.
+ * The fixed-func TCS doesn't set tcs_vertices_out, but it won't use a barrier
+ * anyway because tess levels are always defined in all invocations there.
+ */
+ key->tcs_epilog.noop_s_barrier =
+ shader->selector->info.base.tess.tcs_vertices_out &&
+ shader->wave_size % shader->selector->info.base.tess.tcs_vertices_out == 0;
}
/**
struct {
struct si_tcs_epilog_bits states;
unsigned wave32 : 1;
+ unsigned noop_s_barrier : 1;
} tcs_epilog;
struct {
struct si_ps_prolog_bits states;
shader->selector->info.base.inputs_read &
~shader->selector->info.tcs_vgpr_only_inputs) {
ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
- ac_build_s_barrier(&ctx->ac, ctx->stage);
+
+ /* If both input and output patches are wholly in one wave, we don't need a barrier.
+ * That's true when both VS and TCS have the same number of patch vertices and
+ * the wave size is a multiple of the number of patch vertices.
+ *
+ * The fixed-func TCS doesn't set tcs_vertices_out, so the wave-size check is skipped when it is 0.
+ */
+ if (!shader->key.ge.opt.same_patch_vertices ||
+ (sel->info.base.tess.tcs_vertices_out &&
+ ctx->ac.wave_size % sel->info.base.tess.tcs_vertices_out != 0))
+ ac_build_s_barrier(&ctx->ac, ctx->stage);
}
} else if (ctx->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) {
/* gfx10_ngg_gs_emit_prologue inserts the barrier for NGG. */
}
}
-static void si_write_tess_factors(struct si_shader_context *ctx, LLVMValueRef rel_patch_id,
- LLVMValueRef invocation_id,
+static void si_write_tess_factors(struct si_shader_context *ctx, union si_shader_part_key *key,
+ LLVMValueRef rel_patch_id, LLVMValueRef invocation_id,
LLVMValueRef tcs_out_current_patch_data_offset,
LLVMValueRef invoc0_tf_outer[4], LLVMValueRef invoc0_tf_inner[2])
{
/* Add a barrier before loading tess factors from LDS. */
if (!shader->key.ge.part.tcs.epilog.invoc0_tess_factors_are_def) {
ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
- ac_build_s_barrier(&ctx->ac, ctx->stage);
+
+ if (!key->tcs_epilog.noop_s_barrier)
+ ac_build_s_barrier(&ctx->ac, ctx->stage);
}
/* Do this only for invocation 0, because the tess levels are per-patch,
for (unsigned i = 0; i < 6; i++)
invoc0_tess_factors[i] = ac_get_arg(&ctx->ac, tess_factors[i]);
- si_write_tess_factors(ctx, ac_get_arg(&ctx->ac, rel_patch_id),
+ si_write_tess_factors(ctx, key, ac_get_arg(&ctx->ac, rel_patch_id),
ac_get_arg(&ctx->ac, invocation_id),
ac_get_arg(&ctx->ac, tcs_out_current_patch_data_offset),
invoc0_tess_factors, invoc0_tess_factors + 4);