From: Daniel Schürmann Date: Fri, 5 Feb 2021 13:36:39 +0000 (+0100) Subject: aco: refactor GPR limit calculation X-Git-Tag: upstream/21.2.3~8031 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=b98a4d4dd7eaf1f299a0e8cbf5c52d8f2a93da0d;p=platform%2Fupstream%2Fmesa.git aco: refactor GPR limit calculation This patch delays the calculation of GPR limits in order to precisely incorporate extra registers (VCC etc.) and shared VGPRs. Additionally, the allocation granularity is used to set the config. This has some effect on the reported SGPR stats. Totals (Navi10): SGPRs: 6971787 -> 17753642 (+154.65%) Reviewed-by: Rhys Perry Part-of: --- diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp index 16d2dfc..242966b 100644 --- a/src/amd/compiler/aco_instruction_selection_setup.cpp +++ b/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -1180,8 +1180,6 @@ setup_isel_context(Program* program, } calc_min_waves(program); - program->vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves); - program->sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves); unsigned scratch_size = 0; if (program->stage == gs_copy_vs) { diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp index a156d10..ef25b17 100644 --- a/src/amd/compiler/aco_ir.cpp +++ b/src/amd/compiler/aco_ir.cpp @@ -115,10 +115,8 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info, program->physical_sgprs = 800; program->sgpr_alloc_granule = 16; program->sgpr_limit = 102; - if (family == CHIP_TONGA || family == CHIP_ICELAND) { - program->sgpr_alloc_granule = 96; - program->sgpr_limit = 94; /* workaround hardware bug */ - } + if (family == CHIP_TONGA || family == CHIP_ICELAND) + program->sgpr_alloc_granule = 96; /* workaround hardware bug */ } else { program->physical_sgprs = 512; program->sgpr_alloc_granule = 8; diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp index 5f6c5b0..56d88e0 100644 --- a/src/amd/compiler/aco_live_var_analysis.cpp +++ b/src/amd/compiler/aco_live_var_analysis.cpp @@ -285,17 +285,23 @@ uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs) return align(std::max(addressable_vgprs, granule), granule); } -uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves) +unsigned round_down(unsigned a, unsigned b) { - uint16_t sgprs = (program->physical_sgprs / max_waves) - program->sgpr_alloc_granule + 1; - sgprs = get_sgpr_alloc(program, sgprs); + return a - (a % b); +} + +uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t waves) +{ + /* it's not possible to allocate more than 128 SGPRs */ + uint16_t sgprs = std::min(program->physical_sgprs / waves, 128); + sgprs = round_down(sgprs, program->sgpr_alloc_granule); sgprs -= get_extra_sgprs(program); return std::min(sgprs, program->sgpr_limit); } -uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves) +uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t waves) { - uint16_t vgprs = program->physical_vgprs / max_waves & ~(program->vgpr_alloc_granule - 1); + uint16_t vgprs = program->physical_vgprs / waves & ~(program->vgpr_alloc_granule - 1); return std::min(vgprs, program->vgpr_limit); } @@ -326,8 +332,12 @@ void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand) unsigned simd_per_cu_wgp = wgp ? simd_per_cu * 2 : simd_per_cu; unsigned lds_limit = wgp ? program->lds_limit * 2 : program->lds_limit; + assert(program->min_waves >= 1); + uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves); + uint16_t vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves); + /* this won't compile, register pressure reduction necessary */ - if (new_demand.vgpr > program->vgpr_limit || new_demand.sgpr > program->sgpr_limit) { + if (new_demand.vgpr > vgpr_limit || new_demand.sgpr > sgpr_limit) { program->num_waves = 0; program->max_reg_demand = new_demand; } else { diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp index 16c341a..5b151d9 100644 --- a/src/amd/compiler/aco_register_allocation.cpp +++ b/src/amd/compiler/aco_register_allocation.cpp @@ -73,8 +73,10 @@ struct ra_ctx { std::unordered_map vectors; std::unordered_map split_vectors; aco_ptr pseudo_dummy; - unsigned max_used_sgpr = 0; - unsigned max_used_vgpr = 0; + uint16_t max_used_sgpr = 0; + uint16_t max_used_vgpr = 0; + uint16_t sgpr_limit; + uint16_t vgpr_limit; std::bitset<64> defs_done; /* see MAX_ARGS in aco_instruction_selection_setup.cpp */ ra_test_policy policy; @@ -89,6 +91,8 @@ struct ra_ctx { policy(policy_) { pseudo_dummy.reset(create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, 0, 0)); + sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves); + vgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves); } }; @@ -650,14 +654,14 @@ void add_subdword_definition(Program *program, aco_ptr& instr, unsi void adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg) { - unsigned max_addressible_sgpr = ctx.program->sgpr_limit; + uint16_t max_addressible_sgpr = ctx.sgpr_limit; unsigned size = rc.size(); if (rc.type() == RegType::vgpr) { assert(reg >= 256); - unsigned hi = reg - 256 + size - 1; + uint16_t hi = reg - 256 + size - 1; ctx.max_used_vgpr = std::max(ctx.max_used_vgpr, hi); } else if (reg + rc.size() <= max_addressible_sgpr) { - unsigned hi = reg + size - 1; + uint16_t hi = reg + size - 1; ctx.max_used_sgpr = std::max(ctx.max_used_sgpr, std::min(hi, max_addressible_sgpr)); } } @@ -1241,11 +1245,9 @@ bool get_reg_specified(ra_ctx& ctx, } bool increase_register_file(ra_ctx& ctx, RegType type) { - uint16_t max_addressible_sgpr = ctx.program->sgpr_limit; - uint16_t max_addressible_vgpr = ctx.program->vgpr_limit; - if (type == RegType::vgpr && ctx.program->max_reg_demand.vgpr < max_addressible_vgpr) { + if (type == RegType::vgpr && ctx.program->max_reg_demand.vgpr < ctx.vgpr_limit) { update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1, ctx.program->max_reg_demand.sgpr)); - } else if (type == RegType::sgpr && ctx.program->max_reg_demand.sgpr < max_addressible_sgpr) { + } else if (type == RegType::sgpr && ctx.program->max_reg_demand.sgpr < ctx.sgpr_limit) { update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr, ctx.program->max_reg_demand.sgpr + 1)); } else { return false; @@ -2677,11 +2679,8 @@ void register_allocation(Program *program, std::vector& live_out_per_bloc } /* num_gpr = rnd_up(max_used_gpr + 1) */ - program->config->num_vgprs = align(ctx.max_used_vgpr + 1, 4); - if (program->family == CHIP_TONGA || program->family == CHIP_ICELAND) /* workaround hardware bug */ - program->config->num_sgprs = get_sgpr_alloc(program, program->sgpr_limit); - else - program->config->num_sgprs = align(ctx.max_used_sgpr + 1 + get_extra_sgprs(program), 8); + program->config->num_vgprs = get_vgpr_alloc(program, ctx.max_used_vgpr + 1); + program->config->num_sgprs = get_sgpr_alloc(program, ctx.max_used_sgpr + 1); } } diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp index 02e16c0..39c53ea 100644 --- a/src/amd/compiler/aco_spill.cpp +++ b/src/amd/compiler/aco_spill.cpp @@ -1774,14 +1774,16 @@ void spill(Program* program, live& live_vars) /* calculate target register demand */ RegisterDemand register_target = program->max_reg_demand; - if (register_target.sgpr > program->sgpr_limit) - register_target.vgpr += (register_target.sgpr - program->sgpr_limit + program->wave_size - 1 + 32) / program->wave_size; - register_target.sgpr = program->sgpr_limit; - - if (register_target.vgpr > program->vgpr_limit) - register_target.sgpr = program->sgpr_limit - 5; + uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves); + uint16_t vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves); + if (register_target.sgpr > sgpr_limit) + register_target.vgpr += (register_target.sgpr - sgpr_limit + program->wave_size - 1 + 32) / program->wave_size; + register_target.sgpr = sgpr_limit; + + if (register_target.vgpr > vgpr_limit) + register_target.sgpr = sgpr_limit - 5; int spills_to_vgpr = (program->max_reg_demand.sgpr - register_target.sgpr + program->wave_size - 1 + 32) / program->wave_size; - register_target.vgpr = program->vgpr_limit - spills_to_vgpr; + register_target.vgpr = vgpr_limit - spills_to_vgpr; /* initialize ctx */ spill_ctx ctx(register_target, program, live_vars.register_demand); diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp index 3b21741..72d8db1 100644 --- a/src/amd/compiler/aco_validate.cpp +++ b/src/amd/compiler/aco_validate.cpp @@ -679,6 +679,7 @@ bool validate_ra(Program *program) { bool err = false; aco::live live_vars = aco::live_var_analysis(program); std::vector> phi_sgpr_ops(program->blocks.size()); + uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->num_waves); std::map assignments; for (Block& block : program->blocks) { @@ -704,7 +705,7 @@ bool validate_ra(Program *program) { if (assignments.count(op.tempId()) && assignments[op.tempId()].reg != op.physReg()) err |= ra_fail(program, loc, assignments.at(op.tempId()).firstloc, "Operand %d has an inconsistent register assignment with instruction", i); if ((op.getTemp().type() == RegType::vgpr && op.physReg().reg_b + op.bytes() > (256 + program->config->num_vgprs) * 4) || - (op.getTemp().type() == RegType::sgpr && op.physReg() + op.size() > program->config->num_sgprs && op.physReg() < program->sgpr_limit)) + (op.getTemp().type() == RegType::sgpr && op.physReg() + op.size() > program->config->num_sgprs && op.physReg() < sgpr_limit)) err |= ra_fail(program, loc, assignments.at(op.tempId()).firstloc, "Operand %d has an out-of-bounds register assignment", i); if (op.physReg() == vcc && !program->needs_vcc) err |= ra_fail(program, loc, Location(), "Operand %d fixed to vcc but needs_vcc=false", i); @@ -725,7 +726,7 @@ bool validate_ra(Program *program) { if (assignments[def.tempId()].defloc.block) err |= ra_fail(program, loc, assignments.at(def.tempId()).defloc, "Temporary %%%d also defined by instruction", def.tempId()); if ((def.getTemp().type() == RegType::vgpr && def.physReg().reg_b + def.bytes() > (256 + program->config->num_vgprs) * 4) || - (def.getTemp().type() == RegType::sgpr && def.physReg() + def.size() > program->config->num_sgprs && def.physReg() < program->sgpr_limit)) + (def.getTemp().type() == RegType::sgpr && def.physReg() + def.size() > program->config->num_sgprs && def.physReg() < sgpr_limit)) err |= ra_fail(program, loc, assignments.at(def.tempId()).firstloc, "Definition %d has an out-of-bounds register assignment", i); if (def.physReg() == vcc && !program->needs_vcc) err |= ra_fail(program, loc, Location(), "Definition %d fixed to vcc but needs_vcc=false", i); diff --git a/src/amd/compiler/tests/helpers.cpp b/src/amd/compiler/tests/helpers.cpp index c7df8f2..bbb83ee 100644 --- a/src/amd/compiler/tests/helpers.cpp +++ b/src/amd/compiler/tests/helpers.cpp @@ -80,6 +80,8 @@ void create_program(enum chip_class chip_class, Stage stage, unsigned wave_size, program.reset(new Program); aco::init_program(program.get(), stage, &info, chip_class, family, &config); + program->workgroup_size = UINT_MAX; + calc_min_waves(program.get()); program->debug.func = nullptr; program->debug.private_data = nullptr;