From b4383821e7ec10d2d6c3cfec6eb8fe54dddb0d38 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Tue, 7 Feb 2023 19:45:55 +0000 Subject: [PATCH] aco: don't modify exec in p_interp_gfx11 MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The RDNA3 ISA docs say that lds_param_load writes the entire quad regardless of exec, so this isn't needed. fossil-db (gfx1100): Totals from 5291 (3.93% of 134574) affected shaders: Instrs: 4891396 -> 4789628 (-2.08%) CodeSize: 25519032 -> 25111960 (-1.60%) Latency: 36122982 -> 36074300 (-0.13%); split: -0.14%, +0.00% InvThroughput: 4162436 -> 4161424 (-0.02%); split: -0.02%, +0.00% Copies: 263862 -> 263838 (-0.01%) PreSGPRs: 225012 -> 224179 (-0.37%) Signed-off-by: Rhys Perry Reviewed-by: Georg Lehmann Reviewed-by: Timur Kristóf Part-of: --- src/amd/compiler/aco_builder_h.py | 2 +- src/amd/compiler/aco_instruction_selection.cpp | 11 +++++------ src/amd/compiler/aco_lower_to_hw_instr.cpp | 8 -------- 3 files changed, 6 insertions(+), 15 deletions(-) diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py index f7552fd..816bc33 100644 --- a/src/amd/compiler/aco_builder_h.py +++ b/src/amd/compiler/aco_builder_h.py @@ -530,7 +530,7 @@ public: } <% import itertools -formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.product(range(5), range(6))) + [(8, 1), (1, 8), (2, 6), (3,6)]), +formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.product(range(5), range(6))) + [(8, 1), (1, 8), (2, 6), (3, 6), (1, 6)]), ("sop1", [Format.SOP1], 'SOP1_instruction', [(0, 1), (1, 0), (1, 1), (2, 1), (3, 2)]), ("sop2", [Format.SOP2], 'SOP2_instruction', itertools.product([1, 2], [2, 3])), ("sopk", [Format.SOPK], 'SOPK_instruction', itertools.product([0, 1, 2], [0, 1])), diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index d135e5d..b67b173 100644 --- 
a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -5335,9 +5335,8 @@ emit_interp_instr_gfx11(isel_context* ctx, unsigned idx, unsigned component, Tem prim_mask_op.setLateKill(true); /* we don't want the bld.lm definition to use m0 */ Operand coord2_op(coord2); coord2_op.setLateKill(true); /* we re-use the destination reg in the middle */ - bld.pseudo(aco_opcode::p_interp_gfx11, Definition(dst), bld.def(bld.lm), bld.def(s1, scc), - Operand(v1.as_linear()), Operand::c32(idx), Operand::c32(component), coord1, - coord2_op, prim_mask_op); + bld.pseudo(aco_opcode::p_interp_gfx11, Definition(dst), Operand(v1.as_linear()), + Operand::c32(idx), Operand::c32(component), coord1, coord2_op, prim_mask_op); return; } @@ -5416,9 +5415,9 @@ emit_interp_mov_instr(isel_context* ctx, unsigned idx, unsigned component, unsig if (in_exec_divergent_or_in_loop(ctx)) { Operand prim_mask_op = bld.m0(prim_mask); prim_mask_op.setLateKill(true); /* we don't want the bld.lm definition to use m0 */ - bld.pseudo(aco_opcode::p_interp_gfx11, Definition(dst), bld.def(bld.lm), bld.def(s1, scc), - Operand(v1.as_linear()), Operand::c32(idx), Operand::c32(component), - Operand::c32(dpp_ctrl), prim_mask_op); + bld.pseudo(aco_opcode::p_interp_gfx11, Definition(dst), Operand(v1.as_linear()), + Operand::c32(idx), Operand::c32(component), Operand::c32(dpp_ctrl), + prim_mask_op); } else { Temp p = bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx, component); diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index ac5f953..a31e991 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -2507,15 +2507,11 @@ lower_to_hw_instr(Program* program) case aco_opcode::p_interp_gfx11: { assert(instr->definitions[0].regClass() == v1 || instr->definitions[0].regClass() == v2b); - assert(instr->definitions[1].regClass() == bld.lm); - 
assert(instr->definitions[2].isFixed() && instr->definitions[2].physReg() == scc); assert(instr->operands[0].regClass() == v1.as_linear()); assert(instr->operands[1].isConstant()); assert(instr->operands[2].isConstant()); assert(instr->operands.back().physReg() == m0); Definition dst = instr->definitions[0]; - PhysReg exec_tmp = instr->definitions[1].physReg(); - Definition clobber_scc = instr->definitions[2]; PhysReg lin_vgpr = instr->operands[0].physReg(); unsigned attribute = instr->operands[1].constantValue(); unsigned component = instr->operands[2].constantValue(); @@ -2531,12 +2527,8 @@ lower_to_hw_instr(Program* program) dpp_ctrl = instr->operands[3].constantValue(); } - bld.sop1(Builder::s_mov, Definition(exec_tmp, bld.lm), Operand(exec, bld.lm)); - bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), clobber_scc, - Operand(exec, bld.lm)); bld.ldsdir(aco_opcode::lds_param_load, Definition(lin_vgpr, v1), Operand(m0, s1), attribute, component); - bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(exec_tmp, bld.lm)); Operand p(lin_vgpr, v1); Operand dst_op(dst.physReg(), v1); -- 2.7.4