From 947bf0bd67b7047f247fc10874ced2db8d4f6527 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Daniel=20Sch=C3=BCrmann?= Date: Fri, 5 Feb 2021 14:38:08 +0100 Subject: [PATCH] aco: don't decrease the vgpr_limit when encountering bpermute Instead we recalculate vgpr_limit on demand, depending on the number of needed shared VGPRs. Reviewed-by: Rhys Perry Part-of: --- src/amd/compiler/aco_instruction_selection.cpp | 12 +++++------- src/amd/compiler/aco_instruction_selection.h | 1 - src/amd/compiler/aco_live_var_analysis.cpp | 4 +++- src/amd/compiler/aco_scheduler.cpp | 1 + 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 89d1b0e..75c632e 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -192,14 +192,8 @@ static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(bld.lm), bld.def(bld.lm, vcc), index_op, input_data); } else if (ctx->options->chip_class >= GFX10 && ctx->program->wave_size == 64) { - /* GFX10 wave64 mode: emulate full-wave bpermute */ - if (!ctx->has_gfx10_wave64_bpermute) { - ctx->has_gfx10_wave64_bpermute = true; - /* Shared VGPRs are allocated in groups of 8/16 */ - ctx->program->config->num_shared_vgprs = ctx->program->chip_class >= GFX10_3 ? 16 : 8; - ctx->program->vgpr_limit -= ctx->program->chip_class >= GFX10_3 ? 
8 : 4; - } + /* GFX10 wave64 mode: emulate full-wave bpermute */ Temp index_is_lo = bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand(31u), index); Builder::Result index_is_lo_split = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo); Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc), index_is_lo_split.def(1).getTemp()); @@ -211,6 +205,10 @@ static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data input_data.setLateKill(true); same_half.setLateKill(true); + /* We need one pair of shared VGPRs: + * Note, that these have twice the allocation granularity of normal VGPRs */ + ctx->program->config->num_shared_vgprs = 2 * ctx->program->vgpr_alloc_granule; + return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc), index_x4, input_data, same_half); } else { /* GFX8-9 or GFX10 wave32: bpermute works normally */ diff --git a/src/amd/compiler/aco_instruction_selection.h b/src/amd/compiler/aco_instruction_selection.h index d520045..dd478c4 100644 --- a/src/amd/compiler/aco_instruction_selection.h +++ b/src/amd/compiler/aco_instruction_selection.h @@ -60,7 +60,6 @@ struct isel_context { uint32_t first_temp_id; std::unordered_map<unsigned, std::array<Temp,NIR_MAX_VEC_COMPONENTS>> allocated_vec; Stage stage; - bool has_gfx10_wave64_bpermute = false; struct { bool has_branch; uint16_t loop_nest_depth = 0; diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp index 56d88e0..06827c4 100644 --- a/src/amd/compiler/aco_live_var_analysis.cpp +++ b/src/amd/compiler/aco_live_var_analysis.cpp @@ -302,6 +302,7 @@ uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t waves) uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t waves) { uint16_t vgprs = program->physical_vgprs / waves & ~(program->vgpr_alloc_granule - 1); + vgprs -= program->config->num_shared_vgprs / 2; return std::min(vgprs, program->vgpr_limit); } @@ -342,7 +343,8 @@ void 
update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand) program->max_reg_demand = new_demand; } else { program->num_waves = program->physical_sgprs / get_sgpr_alloc(program, new_demand.sgpr); - program->num_waves = std::min(program->num_waves, program->physical_vgprs / get_vgpr_alloc(program, new_demand.vgpr)); + uint16_t vgpr_demand = get_vgpr_alloc(program, new_demand.vgpr) + program->config->num_shared_vgprs / 2; + program->num_waves = std::min(program->num_waves, program->physical_vgprs / vgpr_demand); program->max_waves = max_waves_per_simd; /* adjust max_waves for workgroup and LDS limits */ diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp index ad85259..f84544d 100644 --- a/src/amd/compiler/aco_scheduler.cpp +++ b/src/amd/compiler/aco_scheduler.cpp @@ -892,6 +892,7 @@ void schedule_program(Program *program, live& live_vars) RegisterDemand demand; for (Block& block : program->blocks) demand.update(block.register_demand); + demand.vgpr += program->config->num_shared_vgprs / 2; sched_ctx ctx; ctx.mv.depends_on.resize(program->peekAllocationId()); -- 2.7.4