i++;
}
- si_llvm_build_vs_exports(ctx, outputs, i);
+ si_llvm_build_vs_exports(ctx, NULL, outputs, i);
}
ac_build_endif(&ctx->ac, 6002);
}
ac_build_endif(&ctx->ac, 5140);
/* Export position and parameter data */
- tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, vertlive_scan.result_reduce, "");
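+   /* The scan's reduction result is the total number of live vertices, i.e. how
+    * many threads will perform exports. */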
+ LLVMValueRef num_export_threads = vertlive_scan.result_reduce;
+ tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_export_threads, "");
ac_build_ifcc(&ctx->ac, tmp, 5145);
{
struct si_shader_output_values outputs[PIPE_MAX_SHADER_OUTPUTS];
}
}
- si_llvm_build_vs_exports(ctx, outputs, info->num_outputs);
+ si_llvm_build_vs_exports(ctx, num_export_threads, outputs, info->num_outputs);
}
ac_build_endif(&ctx->ac, 5145);
}
struct si_shader_output_values *shader_out);
void si_llvm_emit_streamout(struct si_shader_context *ctx, struct si_shader_output_values *outputs,
unsigned noutput, unsigned stream);
-void si_llvm_build_vs_exports(struct si_shader_context *ctx,
+void si_llvm_build_vs_exports(struct si_shader_context *ctx, LLVMValueRef num_export_threads,
struct si_shader_output_values *outputs, unsigned noutput);
void si_llvm_vs_build_end(struct si_shader_context *ctx);
void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key);
}
}
-/* Generate export instructions for hardware VS shader stage or NGG GS stage
+/**
+ * Generate export instructions for hardware VS shader stage or NGG GS stage
* (position and parameter data only).
+ *
+ * \param num_export_threads The number of threads that are active for exports. Only used on gfx11.
*/
-void si_llvm_build_vs_exports(struct si_shader_context *ctx,
+void si_llvm_build_vs_exports(struct si_shader_context *ctx, LLVMValueRef num_export_threads,
struct si_shader_output_values *outputs, unsigned noutput)
{
struct si_shader *shader = ctx->shader;
}
if (ctx->screen->info.gfx_level >= GFX11) {
+      /* Store parameter exports to alloca variables, so that we can read them outside this branch. */
+ for (unsigned i = 0; i < shader->info.nr_param_exports; i++) {
+ for (unsigned chan = 0; chan < 4; chan++) {
+ param_exports[i].out[chan] =
+ ac_build_alloca_init(&ctx->ac, param_exports[i].out[chan], "");
+ }
+ }
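+      /* With everything that is needed later spilled to allocas, the branch can
+       * be closed; the values are reloaded before the attribute ring stores below. */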
+ ac_build_endif(&ctx->ac, 0);
+
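+      /* If the caller didn't pass a thread count, get it from merged_wave_info;
+       * bits [0:7] hold the thread count of the first shader of the merged pair. */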
+ if (!num_export_threads)
+ num_export_threads = si_unpack_param(ctx, ctx->args.merged_wave_info, 0, 8);
+
+ /* We should always store full vec4s in groups of 8 lanes for the best performance even if
+ * some of them are garbage or have unused components, so align the number of export threads
+ * to 8.
+ */
+ num_export_threads = LLVMBuildAdd(ctx->ac.builder, num_export_threads,
+ LLVMConstInt(ctx->ac.i32, 7, 0), "");
+ num_export_threads = LLVMBuildAnd(ctx->ac.builder, num_export_threads,
+ LLVMConstInt(ctx->ac.i32, ~7, 0), "");
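+      /* E.g. a count of 13 becomes (13 + 7) & ~7 = 16, i.e. two full groups of
+       * 8 lanes; counts that are already multiples of 8 are unchanged. */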
+ ac_build_ifcc(&ctx->ac,
+ LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
+ ac_get_thread_id(&ctx->ac), num_export_threads, ""), 0);
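+      /* Lanes between the real export count and the aligned count store garbage
+       * vec4s, which is acceptable per the note above. */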
+
/* Get the attribute ring address and descriptor. */
LLVMValueRef attr_address;
if (ctx->stage == MESA_SHADER_VERTEX && shader->selector->info.base.vs.blit_sgprs_amd) {
/* Write attributes to the attribute ring buffer. */
for (unsigned i = 0; i < shader->info.nr_param_exports; i++) {
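+         /* Reload the channel values that were spilled to allocas above. */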
+ for (unsigned chan = 0; chan < 4; chan++) {
+ param_exports[i].out[chan] =
+ LLVMBuildLoad(ctx->ac.builder, param_exports[i].out[chan], "");
+ }
+
LLVMValueRef vdata = ac_build_gather_values_extended(&ctx->ac, param_exports[i].out,
4, 1, false);
i++;
}
- si_llvm_build_vs_exports(ctx, outputs, i);
+ si_llvm_build_vs_exports(ctx, NULL, outputs, i);
FREE(outputs);
}