From 16d2c7ad557b46104f91365ab3405f0a3ed7e36d Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Wed, 26 Oct 2022 21:13:15 +0100 Subject: [PATCH] aco/gfx11: perform FS input loads in WQM MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit fossil-db (gfx1100): Totals from 48184 (35.68% of 135032) affected shaders: MaxWaves: 1131876 -> 1131960 (+0.01%); split: +0.05%, -0.04% Instrs: 36755466 -> 36782290 (+0.07%); split: -0.04%, +0.11% CodeSize: 200812068 -> 200915348 (+0.05%); split: -0.04%, +0.09% VGPRs: 2163980 -> 2163828 (-0.01%); split: -0.15%, +0.14% Latency: 484174459 -> 484341018 (+0.03%); split: -0.06%, +0.09% InvThroughput: 87941284 -> 87944874 (+0.00%); split: -0.04%, +0.04% VClause: 652984 -> 653085 (+0.02%); split: -0.09%, +0.10% SClause: 1510995 -> 1528832 (+1.18%); split: -0.40%, +1.58% Copies: 1997689 -> 2001857 (+0.21%); split: -0.49%, +0.69% Branches: 676629 -> 676584 (-0.01%); split: -0.02%, +0.01% PreSGPRs: 2033070 -> 2036725 (+0.18%) PreVGPRs: 1903922 -> 1903897 (-0.00%) Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Fixes: 3730be9873d ("aco: mostly implement FS input loads on GFX11") Part-of: --- src/amd/compiler/aco_instruction_selection.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index d90c995..352f793 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -5319,14 +5319,17 @@ emit_interp_instr_gfx11(isel_context* ctx, unsigned idx, unsigned component, Tem Temp p = bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx, component); + Temp res; if (dst.regClass() == v2b) { Temp p10 = bld.vinterp_inreg(aco_opcode::v_interp_p10_f16_f32_inreg, bld.def(v1), p, coord1, p); - bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, Definition(dst), p, coord2, p10); + res = bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, bld.def(v1), p, coord2, p10); } else { Temp p10 = bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, bld.def(v1), p, coord1, p); - bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, Definition(dst), p, coord2, p10); + res = bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, bld.def(v1), p, coord2, p10); } + /* lds_param_load must be done in WQM, and the result kept valid for helper lanes. */ + emit_wqm(bld, res, dst, true); } void @@ -5385,7 +5388,10 @@ emit_interp_mov_instr(isel_context* ctx, unsigned idx, unsigned component, unsig //TODO: this doesn't work in quad-divergent control flow and ignores vertex_id Temp p = bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx, component); uint16_t dpp_ctrl = dpp_quad_perm(0, 0, 0, 0); - bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(dst), p, dpp_ctrl); + Temp res = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p, dpp_ctrl); + + /* lds_param_load must be done in WQM, and the result kept valid for helper lanes. */ + emit_wqm(bld, res, dst, true); } else { bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand::c32(vertex_id), bld.m0(prim_mask), idx, component); -- 2.7.4