From 040142684cc2db47056c260edadbb3cea5f653b6 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Daniel=20Sch=C3=BCrmann?=
Date: Sat, 2 Sep 2023 10:30:02 +0200
Subject: [PATCH] aco: make p_wqm a marker instruction without Operands/Definitions

Totals from 28277 (36.93% of 76572) affected shaders: (GFX11)
MaxWaves: 833930 -> 833898 (-0.00%); split: +0.01%, -0.01%
Instrs: 21366950 -> 21353346 (-0.06%); split: -0.11%, +0.05%
CodeSize: 112855368 -> 112610508 (-0.22%); split: -0.24%, +0.03%
VGPRs: 1157748 -> 1158540 (+0.07%); split: -0.10%, +0.17%
SpillSGPRs: 2465 -> 2463 (-0.08%); split: -0.16%, +0.08%
Latency: 168339886 -> 168383646 (+0.03%); split: -0.10%, +0.12%
InvThroughput: 25164895 -> 25158376 (-0.03%); split: -0.08%, +0.06%
VClause: 347660 -> 346256 (-0.40%); split: -0.55%, +0.15%
SClause: 794460 -> 799521 (+0.64%); split: -0.33%, +0.97%
Copies: 1151908 -> 1148370 (-0.31%); split: -0.54%, +0.23%
Branches: 359447 -> 359437 (-0.00%); split: -0.01%, +0.00%

Part-of:
---
 src/amd/compiler/aco_instruction_selection.cpp | 18 ++++++++----------
 src/amd/compiler/aco_lower_to_hw_instr.cpp     |  3 +--
 src/amd/compiler/aco_optimizer.cpp             | 13 -------------
 src/amd/compiler/aco_register_allocation.cpp   |  4 +---
 src/amd/compiler/aco_validate.cpp              |  3 +--
 5 files changed, 11 insertions(+), 30 deletions(-)

diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 2e12d7d..246f3be 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -171,18 +171,16 @@ emit_mbcnt(isel_context* ctx, Temp dst, Operand mask = Operand(), Operand base =
 Temp
 emit_wqm(Builder& bld, Temp src, Temp dst = Temp(0, s1), bool program_needs_wqm = false)
 {
-   if (bld.program->stage != fragment_fs) {
-      if (!dst.id())
-         return src;
-      else
-         return bld.copy(Definition(dst), src);
-   } else if (!dst.id()) {
-      dst = bld.tmp(src.regClass());
+   if (dst.id())
+      bld.copy(Definition(dst), src);
+   else
+      dst = src;
+
+   if (bld.program->stage == fragment_fs) {
+      bld.pseudo(aco_opcode::p_wqm);
+      bld.program->needs_wqm |= program_needs_wqm;
    }
 
-   assert(src.bytes() == dst.bytes());
-   bld.pseudo(aco_opcode::p_wqm, Definition(dst), src);
-   bld.program->needs_wqm |= program_needs_wqm;
    return dst;
 }
 
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index ed22671..ba49e38 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -2395,8 +2395,7 @@ lower_to_hw_instr(Program* program)
                handle_operands(copy_operations, &ctx, program->gfx_level, pi);
                break;
             }
-            case aco_opcode::p_parallelcopy:
-            case aco_opcode::p_wqm: {
+            case aco_opcode::p_parallelcopy: {
                std::map<PhysReg, copy_operation> copy_operations;
                for (unsigned j = 0; j < instr->operands.size(); j++) {
                   assert(instr->definitions[j].bytes() == instr->operands[j].bytes());
diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index f8a2322..dfe7b5c 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -662,7 +662,6 @@ alu_can_accept_constant(const aco_ptr<Instruction>& instr, unsigned operand)
    case aco_opcode::v_cndmask_b32: return operand != 2;
    case aco_opcode::s_addk_i32:
    case aco_opcode::s_mulk_i32:
-   case aco_opcode::p_wqm:
    case aco_opcode::p_extract_vector:
    case aco_opcode::p_split_vector:
    case aco_opcode::v_readlane_b32:
@@ -2071,11 +2070,6 @@ label_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
          instr->operands[2].setTemp(ctx.info[instr->operands[2].tempId()].temp);
       }
       break;
-   case aco_opcode::p_wqm:
-      if (instr->operands[0].isTemp() && ctx.info[instr->operands[0].tempId()].is_scc_invert()) {
-         ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
-      }
-      break;
    case aco_opcode::s_mul_i32:
       /* Testing every uint32_t shows that 0x3f800000*n is never a denormal.
        * This pattern is created from a uniform nir_op_b2f. */
@@ -4787,13 +4781,6 @@ select_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
         instr->opcode == aco_opcode::s_cselect_b32) &&
        instr->operands[2].isTemp()) {
       ctx.info[instr->operands[2].tempId()].set_scc_needed();
-   } else if (instr->opcode == aco_opcode::p_wqm && instr->operands[0].isTemp() &&
-              ctx.info[instr->definitions[0].tempId()].is_scc_needed()) {
-      /* Propagate label so it is correctly detected by the uniform bool transform */
-      ctx.info[instr->operands[0].tempId()].set_scc_needed();
-
-      /* Fix definition to SCC, this will prevent RA from adding superfluous moves */
-      instr->definitions[0].setFixed(scc);
    }
 
    /* check for literals */
diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp
index 7c4535b..1acd899 100644
--- a/src/amd/compiler/aco_register_allocation.cpp
+++ b/src/amd/compiler/aco_register_allocation.cpp
@@ -1888,7 +1888,6 @@ handle_pseudo(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr)
    case aco_opcode::p_create_vector:
    case aco_opcode::p_split_vector:
    case aco_opcode::p_parallelcopy:
-   case aco_opcode::p_wqm:
    case aco_opcode::p_start_linear_vgpr: break;
    default: return;
    }
@@ -2942,8 +2941,7 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra
                   if (get_reg_specified(ctx, register_file, rc, instr, reg))
                      definition->setFixed(reg);
                }
-            } else if (instr->opcode == aco_opcode::p_wqm ||
-                       instr->opcode == aco_opcode::p_parallelcopy ||
+            } else if (instr->opcode == aco_opcode::p_parallelcopy ||
                        (instr->opcode == aco_opcode::p_start_linear_vgpr &&
                         !instr->operands.empty())) {
                PhysReg reg = instr->operands[i].physReg();
diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp
index 6b960a4..32c3d79 100644
--- a/src/amd/compiler/aco_validate.cpp
+++ b/src/amd/compiler/aco_validate.cpp
@@ -468,8 +468,7 @@ validate_ir(Program* program)
                      check(program->gfx_level >= GFX9 || !def.regClass().is_subdword(),
                            "Cannot split SGPR into subdword VGPRs before GFX9+", instr.get());
                }
-            } else if (instr->opcode == aco_opcode::p_parallelcopy ||
-                       instr->opcode == aco_opcode::p_wqm) {
+            } else if (instr->opcode == aco_opcode::p_parallelcopy) {
                check(instr->definitions.size() == instr->operands.size(),
                      "Number of Operands does not match number of Definitions", instr.get());
                for (unsigned i = 0; i < instr->operands.size(); i++) {
-- 
2.7.4