From 70aea6b41aeea6e9ff3341b07d27c5e4ede9ae02 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Daniel=20Sch=C3=BCrmann?= Date: Fri, 18 Jun 2021 17:51:15 +0200 Subject: [PATCH] aco/ra: refactor collect_vars() to return a sorted vector The vector of IDs is sorted with decreasing sizes, and by increasing assigned registers. This decouples register assingment from ssa IDs. Totals from 12694 (9.41% of 134913) affected shaders: (GFX10.3) VGPRs: 757864 -> 757848 (-0.00%); split: -0.00%, +0.00% CodeSize: 72350540 -> 72348688 (-0.00%); split: -0.02%, +0.02% MaxWaves: 237018 -> 237020 (+0.00%); split: +0.00%, -0.00% Instrs: 13545494 -> 13544699 (-0.01%); split: -0.03%, +0.02% Latency: 148539203 -> 148533292 (-0.00%); split: -0.01%, +0.00% InvThroughput: 30319086 -> 30320382 (+0.00%); split: -0.01%, +0.01% VClause: 326875 -> 327028 (+0.05%); split: -0.05%, +0.09% SClause: 479833 -> 479837 (+0.00%); split: -0.00%, +0.00% Copies: 862152 -> 860914 (-0.14%); split: -0.43%, +0.28% Branches: 317775 -> 317777 (+0.00%) Reviewed-by: Rhys Perry Part-of: --- src/amd/compiler/aco_register_allocation.cpp | 46 ++++++++++++++-------------- src/amd/compiler/tests/test_regalloc.cpp | 4 +-- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp index 7b88559..a41c61f 100644 --- a/src/amd/compiler/aco_register_allocation.cpp +++ b/src/amd/compiler/aco_register_allocation.cpp @@ -1006,19 +1006,28 @@ find_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interva return vars; } -/* collect variables from a register area and clear reg_file */ -std::set> +/* collect variables from a register area and clear reg_file + * variables are sorted in decreasing size and + * increasing assigned register + */ +std::vector collect_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval) { std::vector ids = find_vars(ctx, reg_file, reg_interval); - std::set> vars; + std::sort(ids.begin(), ids.end(), + [&](unsigned a, unsigned b) + { + assignment& var_a = ctx.assignments[a]; + assignment& var_b = ctx.assignments[b]; + return var_a.rc.bytes() > var_b.rc.bytes() || + (var_a.rc.bytes() == var_b.rc.bytes() && var_a.reg < var_b.reg); + }); for (unsigned id : ids) { assignment& var = ctx.assignments[id]; reg_file.clear(var.reg, var.rc); - vars.emplace(var.rc.bytes(), id); } - return vars; + return ids; } std::pair @@ -1073,17 +1082,11 @@ get_reg_for_create_vector_copy(ra_ctx& ctx, RegisterFile& reg_file, bool get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file, std::vector>& parallelcopies, - const std::set>& vars, - const PhysRegInterval bounds, aco_ptr& instr, - const PhysRegInterval def_reg) + const std::vector& vars, const PhysRegInterval bounds, + aco_ptr& instr, const PhysRegInterval def_reg) { - /* variables are sorted from small sized to large */ - /* NOTE: variables are also sorted by ID. this only affects a very small number of shaders - * slightly though. */ - // TODO: sort by register instead of id - for (std::set>::const_reverse_iterator it = vars.rbegin(); - it != vars.rend(); ++it) { - unsigned id = it->second; + /* Variables are sorted from large to small and with increasing assigned register */ + for (unsigned id : vars) { assignment& var = ctx.assignments[id]; DefInfo info = DefInfo(ctx, ctx.pseudo_dummy, var.rc, -1); uint32_t size = info.size; @@ -1203,7 +1206,7 @@ get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file, PhysRegInterval reg_win{best_pos, size}; /* collect variables and block reg file */ - std::set> new_vars = collect_vars(ctx, reg_file, reg_win); + std::vector new_vars = collect_vars(ctx, reg_file, reg_win); /* mark the area as blocked */ reg_file.block(reg_win.lo(), var.rc); @@ -1351,7 +1354,7 @@ get_reg_impl(ra_ctx& ctx, RegisterFile& reg_file, } } - std::set> vars = collect_vars(ctx, tmp_file, best_win); + std::vector vars = collect_vars(ctx, tmp_file, best_win); /* re-enable killed operands */ if (!is_phi(instr) && instr->opcode != aco_opcode::p_create_vector) { @@ -1820,8 +1823,7 @@ get_reg_create_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, } /* collect variables to be moved */ - std::set> vars = - collect_vars(ctx, tmp_file, PhysRegInterval{best_pos, size}); + std::vector vars = collect_vars(ctx, tmp_file, PhysRegInterval{best_pos, size}); bool success = false; std::vector> pc; @@ -1952,8 +1954,7 @@ get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file, RegisterFile tmp_file(register_file); - std::set> blocking_vars = - collect_vars(ctx, tmp_file, target); + std::vector blocking_vars = collect_vars(ctx, tmp_file, target); tmp_file.clear(src, operand.regClass()); // TODO: try to avoid moving block vars to src tmp_file.block(operand.physReg(), operand.regClass()); @@ -2734,8 +2735,7 @@ register_allocation(Program* program, std::vector& live_out_per_block, ra const PhysRegInterval def_regs{definition.physReg(), definition.size()}; /* create parallelcopy pair to move blocking vars */ - std::set> vars = - collect_vars(ctx, register_file, def_regs); + std::vector vars = collect_vars(ctx, register_file, def_regs); RegisterFile tmp_file(register_file); /* re-enable the killed operands, so that we don't move the blocking vars there */ diff --git a/src/amd/compiler/tests/test_regalloc.cpp b/src/amd/compiler/tests/test_regalloc.cpp index 9edd2ee..a63ca2b 100644 --- a/src/amd/compiler/tests/test_regalloc.cpp +++ b/src/amd/compiler/tests/test_regalloc.cpp @@ -136,13 +136,13 @@ BEGIN_TEST(regalloc.precolor.vector.collect) if (!setup_cs("s2 s1 s1", GFX10)) return; - //! s1: %tmp2_2:s[0], s1: %tmp1_2:s[1], s2: %tmp0_2:s[2-3] = p_parallelcopy %tmp2:s[3], %tmp1:s[2], %tmp0:s[0-1] + //! s1: %tmp1_2:s[0], s1: %tmp2_2:s[1], s2: %tmp0_2:s[2-3] = p_parallelcopy %tmp1:s[2], %tmp2:s[3], %tmp0:s[0-1] //! p_unit_test %tmp0_2:s[2-3] Operand op(inputs[0]); op.setFixed(PhysReg(2)); bld.pseudo(aco_opcode::p_unit_test, op); - //! p_unit_test %tmp1_2:s[1], %tmp2_2:s[0] + //! p_unit_test %tmp1_2:s[0], %tmp2_2:s[1] bld.pseudo(aco_opcode::p_unit_test, inputs[1], inputs[2]); finish_ra_test(ra_test_policy()); -- 2.7.4