From: Marek Olšák Date: Sat, 14 May 2022 11:23:13 +0000 (-0400) Subject: radeonsi/gfx11: optimize attribute stores X-Git-Tag: upstream/22.3.5~8755 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=f46cd73e29aa52163dd8facf0645c221f4ae04d1;p=platform%2Fupstream%2Fmesa.git radeonsi/gfx11: optimize attribute stores Reviewed-by: Mihai Preda Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c index 0185a81..5be0af0 100644 --- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c +++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c @@ -1660,7 +1660,7 @@ void gfx10_ngg_build_end(struct si_shader_context *ctx) i++; } - si_llvm_build_vs_exports(ctx, outputs, i); + si_llvm_build_vs_exports(ctx, NULL, outputs, i); } ac_build_endif(&ctx->ac, 6002); } @@ -2211,7 +2211,8 @@ void gfx10_ngg_gs_build_end(struct si_shader_context *ctx) ac_build_endif(&ctx->ac, 5140); /* Export position and parameter data */ - tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, vertlive_scan.result_reduce, ""); + LLVMValueRef num_export_threads = vertlive_scan.result_reduce; + tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_export_threads, ""); ac_build_ifcc(&ctx->ac, tmp, 5145); { struct si_shader_output_values outputs[PIPE_MAX_SHADER_OUTPUTS]; @@ -2233,7 +2234,7 @@ void gfx10_ngg_gs_build_end(struct si_shader_context *ctx) } } - si_llvm_build_vs_exports(ctx, outputs, info->num_outputs); + si_llvm_build_vs_exports(ctx, num_export_threads, outputs, info->num_outputs); } ac_build_endif(&ctx->ac, 5145); } diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h index b52187f..12ba6c3 100644 --- a/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -272,7 +272,7 @@ void si_llvm_streamout_store_output(struct si_shader_context *ctx, LLVMValueRef struct si_shader_output_values *shader_out); void si_llvm_emit_streamout(struct si_shader_context *ctx, struct si_shader_output_values *outputs, unsigned noutput, unsigned stream); -void si_llvm_build_vs_exports(struct si_shader_context *ctx, +void si_llvm_build_vs_exports(struct si_shader_context *ctx, LLVMValueRef num_export_threads, struct si_shader_output_values *outputs, unsigned noutput); void si_llvm_vs_build_end(struct si_shader_context *ctx); void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key); diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c index c5396c2..be2e574 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c @@ -606,7 +606,7 @@ struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen, } if (stream == 0) - si_llvm_build_vs_exports(&ctx, outputs, gsinfo->num_outputs); + si_llvm_build_vs_exports(&ctx, NULL, outputs, gsinfo->num_outputs); LLVMBuildBr(builder, end_bb); } diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c index bae4746..4408a03 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c @@ -504,10 +504,13 @@ static void si_vertex_color_clamping(struct si_shader_context *ctx, } } -/* Generate export instructions for hardware VS shader stage or NGG GS stage +/** + * Generate export instructions for hardware VS shader stage or NGG GS stage * (position and parameter data only). + * + * \param num_export_threads The number of threads that are active for exports. Only used by gfx11. */ -void si_llvm_build_vs_exports(struct si_shader_context *ctx, +void si_llvm_build_vs_exports(struct si_shader_context *ctx, LLVMValueRef num_export_threads, struct si_shader_output_values *outputs, unsigned noutput) { struct si_shader *shader = ctx->shader; @@ -722,6 +725,30 @@ void si_llvm_build_vs_exports(struct si_shader_context *ctx, } if (ctx->screen->info.gfx_level >= GFX11) { + /* Store primitive exports to alloca variables, so that we can read them outside this branch. */ + for (unsigned i = 0; i < shader->info.nr_param_exports; i++) { + for (unsigned chan = 0; chan < 4; chan++) { + param_exports[i].out[chan] = + ac_build_alloca_init(&ctx->ac, param_exports[i].out[chan], ""); + } + } + ac_build_endif(&ctx->ac, 0); + + if (!num_export_threads) + num_export_threads = si_unpack_param(ctx, ctx->args.merged_wave_info, 0, 8); + + /* We should always store full vec4s in groups of 8 lanes for the best performance even if + * some of them are garbage or have unused components, so align the number of export threads + * to 8. + */ + num_export_threads = LLVMBuildAdd(ctx->ac.builder, num_export_threads, + LLVMConstInt(ctx->ac.i32, 7, 0), ""); + num_export_threads = LLVMBuildAnd(ctx->ac.builder, num_export_threads, + LLVMConstInt(ctx->ac.i32, ~7, 0), ""); + ac_build_ifcc(&ctx->ac, + LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, + ac_get_thread_id(&ctx->ac), num_export_threads, ""), 0); + /* Get the attribute ring address and descriptor. */ LLVMValueRef attr_address; if (ctx->stage == MESA_SHADER_VERTEX && shader->selector->info.base.vs.blit_sgprs_amd) { @@ -765,6 +792,11 @@ void si_llvm_build_vs_exports(struct si_shader_context *ctx, /* Write attributes to the attribute ring buffer. */ for (unsigned i = 0; i < shader->info.nr_param_exports; i++) { + for (unsigned chan = 0; chan < 4; chan++) { + param_exports[i].out[chan] = + LLVMBuildLoad(ctx->ac.builder, param_exports[i].out[chan], ""); + } + LLVMValueRef vdata = ac_build_gather_values_extended(&ctx->ac, param_exports[i].out, 4, 1, false); @@ -812,7 +844,7 @@ void si_llvm_vs_build_end(struct si_shader_context *ctx) i++; } - si_llvm_build_vs_exports(ctx, outputs, i); + si_llvm_build_vs_exports(ctx, NULL, outputs, i); FREE(outputs); }