radv, aco, ac/nir: Tweak position export scheduling for NGG culling.
author Timur Kristóf <timur.kristof@gmail.com>
Mon, 5 Jul 2021 13:26:18 +0000 (15:26 +0200)
committer Marge Bot <eric+marge@anholt.net>
Tue, 13 Jul 2021 23:56:33 +0000 (23:56 +0000)
The result is a gain of roughly 5 fps in Doom Eternal.

It turns out that the location of position exports matters more
than we thought, and it's actually better to keep them at the bottom
for culling shaders rather than schedule them up to the top.

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10525>

src/amd/common/ac_nir.h
src/amd/common/ac_nir_lower_ngg.c
src/amd/compiler/aco_scheduler.cpp
src/amd/vulkan/radv_shader.c
src/amd/vulkan/radv_shader.h

index 4f4076c..4707499 100644 (file)
@@ -96,6 +96,7 @@ typedef struct
    unsigned lds_bytes_if_culling_off;
    bool can_cull;
    bool passthrough;
+   bool early_prim_export;
    uint64_t nggc_inputs_read_by_pos;
    uint64_t nggc_inputs_read_by_others;
 } ac_nir_ngg_config;
index 34ff7e1..2d35d65 100644 (file)
@@ -1290,6 +1290,7 @@ ac_nir_lower_ngg_nogs(nir_shader *shader,
       .lds_bytes_if_culling_off = lds_bytes_if_culling_off,
       .can_cull = can_cull,
       .passthrough = passthrough,
+      .early_prim_export = state.early_prim_export,
       .nggc_inputs_read_by_pos = state.inputs_needed_by_pos,
       .nggc_inputs_read_by_others = state.inputs_needed_by_others,
    };
index 9a17a81..9b4c9ff 100644 (file)
@@ -126,6 +126,8 @@ struct sched_ctx {
    int16_t last_SMEM_stall;
    int last_SMEM_dep_idx;
    MoveState mv;
+   bool schedule_pos_exports = true;
+   unsigned schedule_pos_export_div = 1;
 };
 
 /* This scheduler is a simple bottom-up pass based on ideas from
@@ -928,8 +930,8 @@ schedule_position_export(sched_ctx& ctx, Block* block, std::vector<RegisterDeman
                          Instruction* current, int idx)
 {
    assert(idx != 0);
-   int window_size = POS_EXP_WINDOW_SIZE;
-   int max_moves = POS_EXP_MAX_MOVES;
+   int window_size = POS_EXP_WINDOW_SIZE / ctx.schedule_pos_export_div;
+   int max_moves = POS_EXP_MAX_MOVES / ctx.schedule_pos_export_div;
    int16_t k = 0;
 
    DownwardsCursor cursor = ctx.mv.downwards_init(idx, true, false);
@@ -982,7 +984,7 @@ schedule_block(sched_ctx& ctx, Program* program, Block* block, live& live_vars)
    for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
       Instruction* current = block->instructions[idx].get();
 
-      if (block->kind & block_kind_export_end && current->isEXP()) {
+      if (block->kind & block_kind_export_end && current->isEXP() && ctx.schedule_pos_exports) {
          unsigned target = current->exp().dest;
          if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PRIM) {
             ctx.mv.current = current;
@@ -1048,6 +1050,17 @@ schedule_program(Program* program, live& live_vars)
    ctx.mv.max_registers = {int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves * wave_fac) - 2),
                            int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves * wave_fac))};
 
+   /* NGG culling shaders are very sensitive to position export scheduling.
+    * Schedule less aggressively when early primitive export is used, and
+    * keep the position export at the very bottom when late primitive export is used.
+    */
+   if (program->info->has_ngg_culling && program->stage.num_sw_stages() == 1) {
+      if (!program->info->has_ngg_early_prim_export)
+         ctx.schedule_pos_exports = false;
+      else
+         ctx.schedule_pos_export_div = 4;
+   }
+
    for (Block& block : program->blocks)
       schedule_block(ctx, program, &block, live_vars);
 
index df8f47d..2fdfa3f 100644 (file)
@@ -1017,6 +1017,7 @@ void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
             key->vs.provoking_vtx_last);
 
       info->has_ngg_culling = out_conf.can_cull;
+      info->has_ngg_early_prim_export = out_conf.early_prim_export;
       info->num_lds_blocks_when_not_culling = DIV_ROUND_UP(out_conf.lds_bytes_if_culling_off, device->physical_device->rad_info.lds_encode_granularity);
       info->is_ngg_passthrough = out_conf.passthrough;
       key->vs_common_out.as_ngg_passthrough = out_conf.passthrough;
index ab3dcac..cabf684 100644 (file)
@@ -264,6 +264,7 @@ struct radv_shader_info {
    bool is_ngg;
    bool is_ngg_passthrough;
    bool has_ngg_culling;
+   bool has_ngg_early_prim_export;
    uint32_t num_lds_blocks_when_not_culling;
    uint32_t num_tess_patches;
    struct {