From 5ef7c5482975537ca838b51ca356f654ecf7a9ad Mon Sep 17 00:00:00 2001 From: Qiang Yu <yuq825@gmail.com> Date: Sat, 19 Aug 2023 15:36:00 +0800 Subject: [PATCH] aco: wait memory ops done before go to next shader part MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The next part doesn't know whether the p_end_with_regs args are loaded by memory ops or not, so we need to wait for them to finish here. Other memory loads need to be waited for too, like: a = load_mem() b = ... if (...) { wait_mem(a) store_mem(a) } p_end_with_regs(b) "a" still needs to be waited for, otherwise the next shader part's regs may be overwritten by unfinished memory loads. Memory stores are waited for too. When >=gfx10 and the last VGT has no parameter export, we need to wait for all memory stores to finish before the pos export (see ac_nir_export_position). So when a merged shader (ES+GS or VS+GS) is partially built, the first stage needs to wait for all memory stores to finish, otherwise the second stage doesn't know whether any memory stores are still pending. Reviewed-by: Daniel Schürmann <daniel@schuermann.dev> Signed-off-by: Qiang Yu <yuq825@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24973> --- src/amd/compiler/aco_insert_waitcnt.cpp | 11 ++++++----- src/amd/compiler/aco_lower_to_hw_instr.cpp | 6 ++++++ 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp index 71d073965f8..6e292a9bb72 100644 --- a/src/amd/compiler/aco_insert_waitcnt.cpp +++ b/src/amd/compiler/aco_insert_waitcnt.cpp @@ -1082,6 +1082,12 @@ handle_block(Program* program, Block& block, wait_ctx& ctx) } } + /* For last block of a program which has succeed shader part, wait all memory ops done * before go to next shader part. 
+ */ + if (block.kind & block_kind_end_with_regs) + force_waitcnt(ctx, queued_imm); + if (!queued_imm.empty()) emit_waitcnt(ctx, new_instructions, queued_imm); if (!queued_delay.empty()) @@ -1153,11 +1159,6 @@ insert_wait_states(Program* program) in_ctx[current.index] = ctx; } - if (current.instructions.empty()) { - out_ctx[current.index] = std::move(ctx); - continue; - } - loop_progress = std::max<unsigned>(loop_progress, current.loop_nest_depth); done[current.index] = true; diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index 480ea19dd58..edadca72bb9 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -3065,6 +3065,12 @@ lower_to_hw_instr(Program* program) Builder bld(program, end_with_regs_block); bld.sopp(aco_opcode::s_branch, exit_block->index); + + /* For insert waitcnt pass to add waitcnt in exit block, otherwise waitcnt will be added + * after the s_branch which won't be executed. + */ + end_with_regs_block->kind &= ~block_kind_end_with_regs; + exit_block->kind |= block_kind_end_with_regs; } } -- 2.34.1