From: Georg Lehmann Date: Fri, 24 Nov 2023 14:45:44 +0000 (+0100) Subject: aco: don't optimize DPP across more than one block X-Git-Tag: upstream/23.3.3~162 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=ae0873c0f81099d69ba4d75683788c745d9e6f43;p=platform%2Fupstream%2Fmesa.git aco: don't optimize DPP across more than one block Register write tracking doesn't work for inactive lanes, so this was unsafe. Foz-DB Navi31: Totals from 8 (0.01% of 78196) affected shaders: Instrs: 11513 -> 11515 (+0.02%) CodeSize: 61056 -> 61064 (+0.01%) Cc: mesa-stable Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/10197 Reviewed-by: Daniel Schürmann Part-of: (cherry picked from commit 576afa85405a9796c2eb9c88cdfa28b441033009) --- diff --git a/.pick_status.json b/.pick_status.json index c076dd0..457e734 100644 --- a/.pick_status.json +++ b/.pick_status.json @@ -1844,7 +1844,7 @@ "description": "aco: don't optimize DPP across more than one block", "nominated": true, "nomination_type": 0, - "resolution": 0, + "resolution": 1, "main_sha": null, "because_sha": null, "notes": null diff --git a/src/amd/compiler/aco_optimizer_postRA.cpp b/src/amd/compiler/aco_optimizer_postRA.cpp index 48ada19..5978e7c 100644 --- a/src/amd/compiler/aco_optimizer_postRA.cpp +++ b/src/amd/compiler/aco_optimizer_postRA.cpp @@ -492,6 +492,13 @@ try_combine_dpp(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr) if (!op_instr_idx.found()) continue; + /* is_overwritten_since only considers active lanes when the register could possibly + * have been overwritten from inactive lanes. Restrict this optimization to at most + * one block so that there is no possibility for clobbered inactive lanes. 
+ */ + if (ctx.current_block->index - op_instr_idx.block > 1) + continue; + const Instruction* mov = ctx.get(op_instr_idx); if (mov->opcode != aco_opcode::v_mov_b32 || !mov->isDPP()) continue; diff --git a/src/amd/compiler/tests/test_optimizer_postRA.cpp b/src/amd/compiler/tests/test_optimizer_postRA.cpp index 811e762..c0cb4fc 100644 --- a/src/amd/compiler/tests/test_optimizer_postRA.cpp +++ b/src/amd/compiler/tests/test_optimizer_postRA.cpp @@ -571,6 +571,12 @@ BEGIN_TEST(optimizer_postRA.dpp_across_cf) //! buffer_store_dword %c:v[2], 0, %d:v[3], 0 offen bld.mubuf(aco_opcode::buffer_store_dword, c, Operand::zero(), d, Operand::zero(), 0, true); + //! v1: %res10:v[12] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi + //! p_unit_test 10, %res10:v[12] + Temp result = + bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v12), Operand(dpp_tmp, reg_v12), b); + writeout(10, Operand(result, reg_v12)); + //! p_logical_end //! s2: %0:vcc = p_branch BB3 @@ -605,12 +611,6 @@ BEGIN_TEST(optimizer_postRA.dpp_across_cf) //! /* logical preds: BB1, BB4, / linear preds: BB4, BB5, / kind: uniform, top-level, merge, */ //! s2: %0:exec = p_parallelcopy %saved_exec:s[84-85] - //! v1: %res10:v[12] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi - //! p_unit_test 10, %res10:v[12] - Temp result = - bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v12), Operand(dpp_tmp, reg_v12), b); - writeout(10, Operand(result, reg_v12)); - finish_optimizer_postRA_test(); END_TEST