From 3058ab6090725273e9109c13c8a9798e328923e8 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Timur=20Krist=C3=B3f?= Date: Thu, 2 Mar 2023 17:30:49 -0800 Subject: [PATCH] aco: Generalize vs_inputs to args_pending_vmem. MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Handle arguments that need a waitcnt without relying on RADV-specific VS input information. Signed-off-by: Timur Kristóf Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_insert_waitcnt.cpp | 8 +++----- src/amd/compiler/aco_instruction_selection.cpp | 23 ++++++++--------------- src/amd/compiler/aco_ir.h | 2 +- src/amd/compiler/aco_statistics.cpp | 12 +++++------- 4 files changed, 17 insertions(+), 28 deletions(-) diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp index 948799f..9b34fb0 100644 --- a/src/amd/compiler/aco_insert_waitcnt.cpp +++ b/src/amd/compiler/aco_insert_waitcnt.cpp @@ -1032,11 +1032,9 @@ insert_wait_states(Program* program) std::stack> loop_header_indices; unsigned loop_progress = 0; - if (program->stage.has(SWStage::VS) && program->info.vs.dynamic_inputs) { - for (Definition def : program->vs_inputs) { - update_counters(in_ctx[0], event_vmem); - insert_wait_entry(in_ctx[0], def, event_vmem); - } + for (Definition def : program->args_pending_vmem) { + update_counters(in_ctx[0], event_vmem); + insert_wait_entry(in_ctx[0], def, event_vmem); } for (unsigned i = 0; i < program->blocks.size();) { diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 73971c5..7e02cfd 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -11271,10 +11271,15 @@ add_startpgm(struct isel_context* ctx) ctx->arg_temps[i] = create_vec_from_array(ctx, elems, size, RegType::sgpr, 4); } else { Temp dst = ctx->program->allocateTmp(type); + Definition def(dst); + def.setFixed(PhysReg{file == AC_ARG_SGPR ? 
reg : reg + 256}); ctx->arg_temps[i] = dst; - startpgm->definitions[arg] = Definition(dst); - startpgm->definitions[arg].setFixed(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256}); - arg++; + startpgm->definitions[arg++] = def; + + if (ctx->args->ac.args[i].pending_vmem) { + assert(file == AC_ARG_VGPR); + ctx->program->args_pending_vmem.push_back(def); + } } } @@ -11294,18 +11299,6 @@ add_startpgm(struct isel_context* ctx) } } - if (ctx->stage.has(SWStage::VS) && ctx->program->info.vs.dynamic_inputs) { - unsigned num_attributes = util_last_bit(ctx->program->info.vs.input_slot_usage_mask); - for (unsigned i = 0; i < num_attributes; i++) { - Definition def(get_arg(ctx, ctx->args->vs_inputs[i])); - - unsigned idx = ctx->args->vs_inputs[i].arg_index; - def.setFixed(PhysReg(256 + ctx->args->ac.args[idx].offset)); - - ctx->program->vs_inputs.push_back(def); - } - } - return startpgm; } diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 6eda598..d512de5 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -2120,7 +2120,7 @@ public: unsigned next_divergent_if_logical_depth = 0; unsigned next_uniform_if_depth = 0; - std::vector vs_inputs; + std::vector args_pending_vmem; struct { FILE* output = stderr; diff --git a/src/amd/compiler/aco_statistics.cpp b/src/amd/compiler/aco_statistics.cpp index 5662bc7..3403760 100644 --- a/src/amd/compiler/aco_statistics.cpp +++ b/src/amd/compiler/aco_statistics.cpp @@ -544,13 +544,11 @@ collect_preasm_stats(Program* program) double usage[(int)BlockCycleEstimator::resource_count] = {0}; std::vector blocks(program->blocks.size(), program); - if (program->stage.has(SWStage::VS) && program->info.vs.has_prolog) { - unsigned vs_input_latency = 320; - for (Definition def : program->vs_inputs) { - blocks[0].vm.push_back(vs_input_latency); - for (unsigned i = 0; i < def.size(); i++) - blocks[0].reg_available[def.physReg().reg() + i] = vs_input_latency; - } + constexpr const unsigned vmem_latency = 320; + for 
(const Definition def : program->args_pending_vmem) { + blocks[0].vm.push_back(vmem_latency); + for (unsigned i = 0; i < def.size(); i++) + blocks[0].reg_available[def.physReg().reg() + i] = vmem_latency; } for (Block& block : program->blocks) { -- 2.7.4