From: Georg Lehmann Date: Wed, 4 Oct 2023 09:51:21 +0000 (+0200) Subject: aco/gfx11: optimize dual source export X-Git-Tag: upstream/23.3.3~1271 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=34d8fa618515c85069ab68d082597168ac2cdc36;p=platform%2Fupstream%2Fmesa.git aco/gfx11: optimize dual source export We can combine dpp with the v_cndmask_b32. Foz-DB Navi31: Totals from 222 (0.28% of 79330) affected shaders: Instrs: 564392 -> 563373 (-0.18%); split: -0.19%, +0.01% CodeSize: 2867040 -> 2864728 (-0.08%); split: -0.09%, +0.01% Latency: 4278957 -> 4275199 (-0.09%); split: -0.09%, +0.00% InvThroughput: 586636 -> 585824 (-0.14%); split: -0.14%, +0.00% SClause: 20210 -> 20211 (+0.00%); split: -0.02%, +0.02% Copies: 39763 -> 39778 (+0.04%); split: -0.13%, +0.17% PreVGPRs: 13924 -> 13922 (-0.01%) Reviewed-by: Rhys Perry Part-of: --- diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index c398488..bf28ae1 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -8057,7 +8057,7 @@ create_fs_dual_src_export_gfx11(isel_context* ctx, const struct aco_export_mrt* RegClass type = RegClass(RegType::vgpr, util_bitcount(mrt0->enabled_channels)); exp->definitions[0] = bld.def(type); /* mrt0 */ exp->definitions[1] = bld.def(type); /* mrt1 */ - exp->definitions[2] = bld.def(v1); + exp->definitions[2] = bld.def(bld.lm); exp->definitions[3] = bld.def(bld.lm); exp->definitions[4] = bld.def(bld.lm, vcc); exp->definitions[5] = bld.def(s1, scc); diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index e26860d..5300708 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -2797,13 +2797,13 @@ lower_to_hw_instr(Program* program) case aco_opcode::p_dual_src_export_gfx11: { PhysReg dst0 = instr->definitions[0].physReg(); PhysReg dst1 = instr->definitions[1].physReg(); - Definition tmp = instr->definitions[2]; - Definition exec_tmp = instr->definitions[3]; + Definition exec_tmp = instr->definitions[2]; + Definition not_vcc_tmp = instr->definitions[3]; Definition clobber_vcc = instr->definitions[4]; Definition clobber_scc = instr->definitions[5]; - assert(tmp.regClass() == v1); assert(exec_tmp.regClass() == bld.lm); + assert(not_vcc_tmp.regClass() == bld.lm); assert(clobber_vcc.regClass() == bld.lm && clobber_vcc.physReg() == vcc); assert(clobber_scc.isFixed() && clobber_scc.physReg() == scc); @@ -2821,6 +2821,12 @@ lower_to_hw_instr(Program* program) bld.sop1(aco_opcode::s_mov_b32, Definition(clobber_vcc.physReg().advance(4), s1), Operand::c32(0x55555555)); + Operand src_even = Operand(clobber_vcc.physReg(), bld.lm); + + bld.sop1(Builder::s_not, not_vcc_tmp, clobber_scc, src_even); + + Operand src_odd = Operand(not_vcc_tmp.physReg(), bld.lm); + for (unsigned i = 0; i < 4; i++) { if (instr->operands[i].isUndefined() && instr->operands[i + 4].isUndefined()) { mrt0[i] = instr->operands[i]; @@ -2831,22 +2837,14 @@ lower_to_hw_instr(Program* program) Operand src0 = instr->operands[i]; Operand src1 = instr->operands[i + 4]; - uint32_t lane_sel_xor1 = 0; - for (unsigned j = 0; j < 8; j++) - lane_sel_xor1 |= (j ^ 1) << (j * 3); - - /* Swap odd, even lanes of mrt0. */ - bld.vop1_dpp8(aco_opcode::v_mov_b32, Definition(dst0, v1), src0, lane_sel_xor1); - - /* Swap even lanes between mrt0 and mrt1. */ - bld.vop2(aco_opcode::v_cndmask_b32, tmp, Operand(dst0, v1), src1, - Operand(clobber_vcc.physReg(), bld.lm)); - bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst1, v1), src1, Operand(dst0, v1), - Operand(clobber_vcc.physReg(), bld.lm)); - - /* Swap odd, even lanes of mrt0 again. */ - bld.vop1_dpp8(aco_opcode::v_mov_b32, Definition(dst0, v1), - Operand(tmp.physReg(), v1), lane_sel_xor1); + /* | even lanes | odd lanes + * mrt0 | src0 even | src1 even + * mrt1 | src0 odd | src1 odd + */ + bld.vop2_dpp(aco_opcode::v_cndmask_b32, Definition(dst0, v1), src1, src0, + src_even, dpp_row_xmask(1)); + bld.vop2_e64_dpp(aco_opcode::v_cndmask_b32, Definition(dst1, v1), src0, src1, + src_odd, dpp_row_xmask(1)); mrt0[i] = Operand(dst0, v1); mrt1[i] = Operand(dst1, v1); diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp index 0525849..4a43cdf 100644 --- a/src/amd/compiler/aco_validate.cpp +++ b/src/amd/compiler/aco_validate.cpp @@ -565,9 +565,9 @@ validate_ir(Program* program) } else if (instr->opcode == aco_opcode::p_dual_src_export_gfx11) { check(instr->definitions.size() == 6, "p_dual_src_export_gfx11 must have 6 definitions", instr.get()); - check(instr->definitions[2].regClass().type() == RegType::vgpr && - instr->definitions[2].regClass().size() == 1, - "Third definition of p_dual_src_export_gfx11 must be a v1", instr.get()); + check(instr->definitions[2].regClass() == program->lane_mask, + "Third definition of p_dual_src_export_gfx11 must be a lane mask", + instr.get()); check(instr->definitions[3].regClass() == program->lane_mask, "Fourth definition of p_dual_src_export_gfx11 must be a lane mask", instr.get());