From e0f4b52559942bb5a1d1b65e5544a2ec2265d227 Mon Sep 17 00:00:00 2001 From: Vitaliy Triang3l Kuzmin Date: Thu, 6 Apr 2023 23:09:35 +0300 Subject: [PATCH] aco: Add Primitive Ordered Pixel Shading waitcnt rules MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit When letting the overlapping waves enter their ordered sections, there must be no pending memory accesses to resources that need primitive-ordered access, or there would be a race between the current wave and the overlapping waves. Reviewed-by: Timur Kristóf Signed-off-by: Vitaliy Triang3l Kuzmin Part-of: --- src/amd/compiler/aco_insert_waitcnt.cpp | 19 +++++++++++++++++++ src/amd/compiler/aco_instruction_selection.cpp | 2 ++ src/amd/compiler/aco_ir.h | 1 + src/amd/compiler/aco_lower_to_hw_instr.cpp | 15 +++++++++++++++ 4 files changed, 37 insertions(+) diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp index 64fdd87..a8bba34 100644 --- a/src/amd/compiler/aco_insert_waitcnt.cpp +++ b/src/amd/compiler/aco_insert_waitcnt.cpp @@ -509,6 +509,25 @@ kill(wait_imm& imm, alu_delay_info& delay, Instruction* instr, wait_ctx& ctx, force_waitcnt(ctx, imm); } + /* Make sure POPS coherent memory accesses have reached the L2 cache before letting the + * overlapping waves proceed into the ordered section. + */ + if (ctx.program->has_pops_overlapped_waves_wait && + (ctx.gfx_level >= GFX11 ? instr->isEXP() && instr->exp().done + : (instr->opcode == aco_opcode::s_sendmsg && + instr->sopp().imm == sendmsg_ordered_ps_done))) { + if (ctx.vm_cnt) + imm.vm = 0; + if (ctx.gfx_level >= GFX10 && ctx.vs_cnt) + imm.vs = 0; + /* Await SMEM loads too, as it's possible for an application to create them, like using a + * scalarization loop - pointless and unoptimal for an inherently divergent address of + * per-pixel data, but still can be done at least synthetically and must be handled correctly. 
+ */ + if (ctx.program->has_smem_buffer_or_global_loads && ctx.lgkm_cnt) + imm.lgkm = 0; + } + check_instr(ctx, imm, delay, instr); /* It's required to wait for scalar stores before "writing back" data. diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 9037480..b25939d 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -4398,6 +4398,8 @@ smem_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned { assert(align >= 4u); + bld.program->has_smem_buffer_or_global_loads = true; + bool buffer = info.resource.id() && info.resource.bytes() == 16; Temp addr = info.resource; if (!buffer && !addr.id()) { diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 1901ab7..212fbca 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -2118,6 +2118,7 @@ public: Stage stage; bool needs_exact = false; /* there exists an instruction with disable_wqm = true */ bool needs_wqm = false; /* there exists a p_wqm instruction */ + bool has_smem_buffer_or_global_loads = false; bool has_pops_overlapped_waves_wait = false; bool has_color_exports = false; diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index 419859b..ea43c3f 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -2462,6 +2462,21 @@ lower_to_hw_instr(Program* program) block = &program->blocks[block_idx]; bld.reset(discard_block); + if (program->has_pops_overlapped_waves_wait && + (program->gfx_level >= GFX11 || discard_sends_pops_done)) { + /* If this discard early exit potentially exits the POPS ordered section, do + * the waitcnt necessary before resuming overlapping waves as the normal + * waitcnt insertion doesn't work in a discard early exit block. 
+ */ + if (program->gfx_level >= GFX10) + bld.sopk(aco_opcode::s_waitcnt_vscnt, Definition(sgpr_null, s1), 0); + wait_imm pops_exit_wait_imm; + pops_exit_wait_imm.vm = 0; + if (program->has_smem_buffer_or_global_loads) + pops_exit_wait_imm.lgkm = 0; + bld.sopp(aco_opcode::s_waitcnt, -1, + pops_exit_wait_imm.pack(program->gfx_level)); + } if (discard_sends_pops_done) bld.sopp(aco_opcode::s_sendmsg, -1, sendmsg_ordered_ps_done); unsigned target = V_008DFC_SQ_EXP_NULL; -- 2.7.4