bool ls_vgpr_fix : 1;
bool prim_discard_cs_instancing : 1;
bool ngg : 1;
+ bool same_patch_vertices : 1;
uint8_t ngg_culling;
int last_index_size;
int last_base_vertex;
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
- declare_vb_descriptor_input_sgprs(ctx);
+ if (ctx->stage == MESA_SHADER_VERTEX)
+ declare_vb_descriptor_input_sgprs(ctx);
/* VGPRs (first TCS, then VS) */
ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);
fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
fprintf(f, " mono.u.ff_tcs_inputs_to_copy = 0x%" PRIx64 "\n",
key->mono.u.ff_tcs_inputs_to_copy);
+ fprintf(f, " opt.prefer_mono = %u\n", key->opt.prefer_mono);
+ fprintf(f, " opt.same_patch_vertices = %u\n", key->opt.same_patch_vertices);
break;
case MESA_SHADER_TESS_EVAL:
}
parts[num_parts++] = main_fn;
- si_build_wrapper_function(&ctx, parts, num_parts, has_prolog ? 1 : 0, 0);
+ si_build_wrapper_function(&ctx, parts, num_parts, has_prolog ? 1 : 0, 0, false);
if (ctx.shader->key.opt.vs_as_prim_discard_cs)
si_build_prim_discard_compute_shader(&ctx);
parts[0] = ngg_cull_main_fn;
parts[1] = ctx.main_fn;
- si_build_wrapper_function(&ctx, parts, 2, 0, 0);
+ si_build_wrapper_function(&ctx, parts, 2, 0, 0, false);
} else if (shader->is_monolithic && ctx.stage == MESA_SHADER_TESS_CTRL) {
if (sscreen->info.chip_class >= GFX9) {
struct si_shader_selector *ls = shader->key.part.tcs.ls;
ctx.stage = MESA_SHADER_TESS_CTRL;
si_build_wrapper_function(&ctx, parts + !vs_needs_prolog, 4 - !vs_needs_prolog,
- vs_needs_prolog, vs_needs_prolog ? 2 : 1);
+ vs_needs_prolog, vs_needs_prolog ? 2 : 1,
+ shader->key.opt.same_patch_vertices);
} else {
LLVMValueRef parts[2];
union si_shader_part_key epilog_key;
si_llvm_build_tcs_epilog(&ctx, &epilog_key);
parts[1] = ctx.main_fn;
- si_build_wrapper_function(&ctx, parts, 2, 0, 0);
+ si_build_wrapper_function(&ctx, parts, 2, 0, 0, false);
}
} else if (shader->is_monolithic && ctx.stage == MESA_SHADER_GEOMETRY) {
if (ctx.screen->info.chip_class >= GFX9) {
parts[next_first_part = num_parts++] = gs_prolog;
parts[num_parts++] = gs_main;
- si_build_wrapper_function(&ctx, parts, num_parts, main_part, next_first_part);
+ si_build_wrapper_function(&ctx, parts, num_parts, main_part, next_first_part, false);
} else {
LLVMValueRef parts[2];
union si_shader_part_key prolog_key;
si_llvm_build_gs_prolog(&ctx, &prolog_key);
parts[0] = ctx.main_fn;
- si_build_wrapper_function(&ctx, parts, 2, 1, 0);
+ si_build_wrapper_function(&ctx, parts, 2, 1, 0, false);
}
} else if (shader->is_monolithic && ctx.stage == MESA_SHADER_FRAGMENT) {
si_llvm_build_monolithic_ps(&ctx, shader);
unsigned cs_cull_back : 1;
unsigned cs_cull_z : 1;
unsigned cs_halfz_clip_space : 1;
+
+ /* VS and TCS have the same number of patch vertices. */
+ unsigned same_patch_vertices:1;
+
unsigned inline_uniforms:1;
uint32_t inlined_uniform_values[MAX_INLINABLE_UNIFORMS];
bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir);
void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *parts,
unsigned num_parts, unsigned main_part,
- unsigned next_shader_first_part);
+ unsigned next_shader_first_part, bool same_thread_count);
/* si_shader_llvm_gs.c */
LLVMValueRef si_is_es_thread(struct si_shader_context *ctx);
*/
void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *parts,
unsigned num_parts, unsigned main_part,
- unsigned next_shader_first_part)
+ unsigned next_shader_first_part, bool same_thread_count)
{
LLVMBuilderRef builder = ctx->ac.builder;
/* PS epilog has one arg per color component; gfx9 merged shader
si_llvm_create_func(ctx, "wrapper", returns, num_returns,
si_get_max_workgroup_size(ctx->shader));
- if (si_is_merged_shader(ctx->shader))
+ if (si_is_merged_shader(ctx->shader) && !same_thread_count)
ac_init_exec_full_mask(&ctx->ac);
/* Record the arguments of the function as if they were an output of
/* Merged shaders are executed conditionally depending
* on the number of enabled threads passed in the input SGPRs. */
if (si_is_multi_part_shader(ctx->shader) && part == 0) {
- LLVMValueRef ena, count = initial[3];
+ if (same_thread_count) {
+ struct ac_arg arg;
+ arg.arg_index = 3;
+ arg.used = true;
- count = LLVMBuildAnd(builder, count, LLVMConstInt(ctx->ac.i32, 0x7f, 0), "");
- ena = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), count, "");
- ac_build_ifcc(&ctx->ac, ena, 6506);
+ si_init_exec_from_input(ctx, arg, 0);
+ } else {
+ LLVMValueRef ena, count = initial[3];
+
+ count = LLVMBuildAnd(builder, count, LLVMConstInt(ctx->ac.i32, 0x7f, 0), "");
+ ena = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), count, "");
+ ac_build_ifcc(&ctx->ac, ena, 6506);
+ }
}
/* Derive arguments for the next part from outputs of the
ret = ac_build_call(&ctx->ac, parts[part], in, num_params);
- if (si_is_multi_part_shader(ctx->shader) && part + 1 == next_shader_first_part) {
+ if (!same_thread_count &&
+ si_is_multi_part_shader(ctx->shader) && part + 1 == next_shader_first_part) {
ac_build_endif(&ctx->ac, 6506);
/* The second half of the merged shader should use
}
/* Close the conditional wrapping the second shader. */
- if (ctx->stage == MESA_SHADER_TESS_CTRL && si_is_multi_part_shader(ctx->shader))
+ if (ctx->stage == MESA_SHADER_TESS_CTRL &&
+ !same_thread_count && si_is_multi_part_shader(ctx->shader))
ac_build_endif(&ctx->ac, 6507);
assert(LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind);
si_llvm_build_ps_epilog(ctx, &epilog_key);
parts[num_parts++] = ctx->main_fn;
- si_build_wrapper_function(ctx, parts, num_parts, main_index, 0);
+ si_build_wrapper_function(ctx, parts, num_parts, main_index, 0, false);
}
void si_llvm_init_ps_callbacks(struct si_shader_context *ctx)
sctx->do_update_shaders = true;
}
- if (sctx->tes_shader.cso && sctx->screen->info.has_ls_vgpr_init_bug) {
- /* Determine whether the LS VGPR fix should be applied.
- *
- * It is only required when num input CPs > num output CPs,
- * which cannot happen with the fixed function TCS. We should
- * also update this bit when switching from TCS to fixed
- * function TCS.
- */
+ if (sctx->tes_shader.cso) {
struct si_shader_selector *tcs = sctx->tcs_shader.cso;
- bool ls_vgpr_fix =
- tcs && info->vertices_per_patch > tcs->info.base.tess.tcs_vertices_out;
- if (ls_vgpr_fix != sctx->ls_vgpr_fix) {
- sctx->ls_vgpr_fix = ls_vgpr_fix;
+ /* The rarely occurring tcs == NULL case is not optimized. */
+ bool same_patch_vertices =
+ sctx->chip_class >= GFX9 &&
+ tcs && info->vertices_per_patch == tcs->info.base.tess.tcs_vertices_out;
+
+ if (sctx->same_patch_vertices != same_patch_vertices) {
+ sctx->same_patch_vertices = same_patch_vertices;
sctx->do_update_shaders = true;
}
+
+ if (sctx->screen->info.has_ls_vgpr_init_bug) {
+ /* Determine whether the LS VGPR fix should be applied.
+ *
+ * It is only required when num input CPs > num output CPs,
+ * which cannot happen with the fixed function TCS. We should
+ * also update this bit when switching from TCS to fixed
+ * function TCS.
+ */
+ bool ls_vgpr_fix =
+ tcs && info->vertices_per_patch > tcs->info.base.tess.tcs_vertices_out;
+
+ if (ls_vgpr_fix != sctx->ls_vgpr_fix) {
+ sctx->ls_vgpr_fix = ls_vgpr_fix;
+ sctx->do_update_shaders = true;
+ }
+ }
}
if (sctx->chip_class <= GFX9 && sctx->gs_shader.cso) {
* The LS VGPR fix prefers this too.
*/
key->opt.prefer_mono = 1;
+ key->opt.same_patch_vertices = sctx->same_patch_vertices;
}
key->part.tcs.epilog.prim_mode =