From 86832964cf26228427bbde3f1c60a5df040e312a Mon Sep 17 00:00:00 2001 From: Qiang Yu Date: Tue, 5 Jul 2022 15:15:16 +0800 Subject: [PATCH] radeonsi: vs load input re-calculate vertex index after culling MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit After culling, vertices are repacked, we overwrite the original vertex_id/instance_id, so vertex index for each thread need to be re-calculated. Reviewed-by: Marek Olšák Signed-off-by: Qiang Yu Part-of: --- src/gallium/drivers/radeonsi/si_shader_internal.h | 1 + src/gallium/drivers/radeonsi/si_shader_llvm.c | 9 ++ src/gallium/drivers/radeonsi/si_shader_llvm_vs.c | 100 +++++++++++++++------- 3 files changed, 77 insertions(+), 33 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h index 7bfc8c8..e75de69 100644 --- a/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -139,6 +139,7 @@ struct si_shader_context { LLVMValueRef esgs_ring; LLVMValueRef gsvs_ring[4]; LLVMValueRef tess_offchip_ring; + LLVMValueRef instance_divisor_constbuf; LLVMValueRef gs_next_vertex[4]; LLVMValueRef gs_curprim_verts[4]; diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c index e78f83c..21ff551 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c @@ -884,6 +884,15 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad switch (ctx->stage) { case MESA_SHADER_VERTEX: si_llvm_init_vs_callbacks(ctx, ngg_cull_shader); + + /* preload instance_divisor_constbuf to be used for input load after culling */ + if (ctx->shader->key.ge.opt.ngg_culling && + ctx->shader->key.ge.part.vs.prolog.instance_divisor_is_fetched) { + LLVMValueRef buf = ac_get_arg(&ctx->ac, ctx->internal_bindings); + ctx->instance_divisor_constbuf = + ac_build_load_to_sgpr( + &ctx->ac, buf, LLVMConstInt(ctx->ac.i32, SI_VS_CONST_INSTANCE_DIVISORS, 0)); + } break; case MESA_SHADER_TESS_CTRL: diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c index ac45d41..34140fa 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c @@ -39,6 +39,53 @@ static LLVMValueRef unpack_sint16(struct si_shader_context *ctx, LLVMValueRef i3 ctx->ac.i32, ""); } +static LLVMValueRef get_vertex_index(struct si_shader_context *ctx, + struct si_vs_prolog_bits *key, unsigned input_index, + LLVMValueRef instance_divisor_constbuf, + unsigned start_instance, unsigned base_vertex) +{ + LLVMValueRef instance_id = ctx->abi.instance_id_replaced ? + ctx->abi.instance_id_replaced : ctx->abi.instance_id; + LLVMValueRef vertex_id = ctx->abi.vertex_id_replaced ? + ctx->abi.vertex_id_replaced : ctx->abi.vertex_id; + + bool divisor_is_one = key->instance_divisor_is_one & (1u << input_index); + bool divisor_is_fetched =key->instance_divisor_is_fetched & (1u << input_index); + + LLVMValueRef index = NULL; + if (divisor_is_one) + index = instance_id; + else if (divisor_is_fetched) { + LLVMValueRef udiv_factors[4]; + + for (unsigned j = 0; j < 4; j++) { + udiv_factors[j] = si_buffer_load_const( + ctx, instance_divisor_constbuf, + LLVMConstInt(ctx->ac.i32, input_index * 16 + j * 4, 0)); + udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]); + } + + /* The faster NUW version doesn't work when InstanceID == UINT_MAX. + * Such InstanceID might not be achievable in a reasonable time though. + */ + index = ac_build_fast_udiv_nuw( + &ctx->ac, instance_id, udiv_factors[0], + udiv_factors[1], udiv_factors[2], udiv_factors[3]); + } + + if (divisor_is_one || divisor_is_fetched) { + /* Add StartInstance. */ + index = LLVMBuildAdd(ctx->ac.builder, index, + LLVMGetParam(ctx->main_fn, start_instance), ""); + } else { + /* VertexID + BaseVertex */ + index = LLVMBuildAdd(ctx->ac.builder, vertex_id, + LLVMGetParam(ctx->main_fn, base_vertex), ""); + } + + return index; +} + static void load_input_vs(struct si_shader_context *ctx, unsigned input_index, LLVMValueRef out[4]) { const struct si_shader_info *info = &ctx->shader->selector->info; @@ -114,7 +161,7 @@ static void load_input_vs(struct si_shader_context *ctx, unsigned input_index, L unsigned num_vbos_in_user_sgprs = ctx->shader->selector->info.num_vbos_in_user_sgprs; union si_vs_fix_fetch fix_fetch; LLVMValueRef vb_desc; - LLVMValueRef vertex_index; + LLVMValueRef vertex_index = NULL; LLVMValueRef tmp; if (input_index < num_vbos_in_user_sgprs) { @@ -125,7 +172,19 @@ static void load_input_vs(struct si_shader_context *ctx, unsigned input_index, L LLVMConstInt(ctx->ac.i32, index, 0)); } - vertex_index = LLVMGetParam(ctx->main_fn, ctx->vertex_index0.arg_index + input_index); + if (ctx->abi.vertex_id_replaced) { + /* Only ngg culling will replace vertex_id, and ngg culling is an optimization key + * field, so the shader must be monolithic. + */ + assert(ctx->shader->is_monolithic); + assert(ctx->abi.instance_id_replaced); + + vertex_index = get_vertex_index(ctx, &ctx->shader->key.ge.part.vs.prolog, + input_index, ctx->instance_divisor_constbuf, + ctx->args.start_instance.arg_index, + ctx->args.base_vertex.arg_index); + } else + vertex_index = LLVMGetParam(ctx->main_fn, ctx->vertex_index0.arg_index + input_index); /* Use the open-coded implementation for all loads of doubles and * of dword-sized data that needs fixups. We need to insert conversion @@ -947,6 +1006,8 @@ void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part ctx->abi.vertex_id = input_vgprs[vertex_id_vgpr]; ctx->abi.instance_id = input_vgprs[instance_id_vgpr]; + ctx->abi.vertex_id_replaced = NULL; + ctx->abi.instance_id_replaced = NULL; /* Copy inputs to outputs. This should be no-op, as the registers match, * but it will prevent the compiler from overwriting them unintentionally. @@ -978,37 +1039,10 @@ void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part } for (i = 0; i < key->vs_prolog.num_inputs; i++) { - bool divisor_is_one = key->vs_prolog.states.instance_divisor_is_one & (1u << i); - bool divisor_is_fetched = key->vs_prolog.states.instance_divisor_is_fetched & (1u << i); - LLVMValueRef index = NULL; - - if (divisor_is_one) { - index = ctx->abi.instance_id; - } else if (divisor_is_fetched) { - LLVMValueRef udiv_factors[4]; - - for (unsigned j = 0; j < 4; j++) { - udiv_factors[j] = si_buffer_load_const(ctx, instance_divisor_constbuf, - LLVMConstInt(ctx->ac.i32, i * 16 + j * 4, 0)); - udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]); - } - /* The faster NUW version doesn't work when InstanceID == UINT_MAX. - * Such InstanceID might not be achievable in a reasonable time though. - */ - index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id, udiv_factors[0], - udiv_factors[1], udiv_factors[2], udiv_factors[3]); - } - - if (divisor_is_one || divisor_is_fetched) { - /* Add StartInstance. */ - index = - LLVMBuildAdd(ctx->ac.builder, index, - LLVMGetParam(ctx->main_fn, user_sgpr_base + SI_SGPR_START_INSTANCE), ""); - } else { - /* VertexID + BaseVertex */ - index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.vertex_id, - LLVMGetParam(func, user_sgpr_base + SI_SGPR_BASE_VERTEX), ""); - } + LLVMValueRef index = get_vertex_index(ctx, &key->vs_prolog.states, i, + instance_divisor_constbuf, + user_sgpr_base + SI_SGPR_START_INSTANCE, + user_sgpr_base + SI_SGPR_BASE_VERTEX); index = ac_to_float(&ctx->ac, index); ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index, ctx->args.arg_count + i, ""); -- 2.7.4