From 8d50385bbdfedff73976ba3ecd4bc4dbbd690b83 Mon Sep 17 00:00:00 2001
From: Rhys Perry
Date: Wed, 16 Jun 2021 17:19:36 +0100
Subject: [PATCH] aco: implement linear vgpr copies
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Signed-off-by: Rhys Perry
Reviewed-by: Daniel Schürmann
Part-of:
---
 src/amd/compiler/aco_lower_to_hw_instr.cpp  | 54 ++++++++++++++++++++++++++++-
 src/amd/compiler/aco_validate.cpp           | 10 +++++-
 src/amd/compiler/tests/test_to_hw_instr.cpp | 54 +++++++++++++++++++++++++++++
 3 files changed, 116 insertions(+), 2 deletions(-)

diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index eecf3ac..a3f70da 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -1113,6 +1113,54 @@ copy_constant(lower_context* ctx, Builder& bld, Definition dst, Operand op)
    }
 }
 
+void
+copy_linear_vgpr(Builder& bld, Definition def, Operand op, bool preserve_scc, PhysReg scratch_sgpr)
+{ /* Emits the op -> def copy once per exec polarity; SCC is kept only if preserve_scc. */
+   if (preserve_scc) /* s_not below also defines SCC, so stash SCC in scratch_sgpr first. */
+      bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1), Operand(scc, s1));
+
+   for (unsigned i = 0; i < 2; i++) { /* Presumably the 2nd pass covers lanes disabled in exec. */
+      if (def.size() == 2) /* Size-2 def: a 64-bit right shift by zero serves as the move. */
+         bld.vop3(aco_opcode::v_lshrrev_b64, def, Operand::zero(), op);
+      else
+         bld.vop1(aco_opcode::v_mov_b32, def, op);
+
+      bld.sop1(Builder::s_not, Definition(exec, bld.lm), Definition(scc, s1), /* exec = ~exec */
+               Operand(exec, bld.lm));
+   } /* exec was inverted twice, so it is back to its value on entry. */
+
+   if (preserve_scc) /* Rebuild SCC from the stashed copy: scratch_sgpr compared against 0. */
+      bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(scratch_sgpr, s1),
+               Operand::zero());
+}
+
+void
+swap_linear_vgpr(Builder& bld, Definition def, Operand op, bool preserve_scc, PhysReg scratch_sgpr)
+{ /* Exchanges the registers of def and op once per exec polarity; SCC kept if preserve_scc. */
+   if (preserve_scc) /* s_not below also defines SCC, so stash SCC in scratch_sgpr first. */
+      bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1), Operand(scc, s1));
+
+   Operand def_as_op = Operand(def.physReg(), def.regClass()); /* def's register, readable */
+   Definition op_as_def = Definition(op.physReg(), op.regClass()); /* op's register, writable */
+
+   for (unsigned i = 0; i < 2; i++) { /* Presumably the 2nd pass covers lanes disabled in exec. */
+      if (bld.program->chip_class >= GFX9) { /* v_swap_b32 is only emitted for GFX9 and newer. */
+         bld.vop1(aco_opcode::v_swap_b32, def, op_as_def, op, def_as_op);
+      } else { /* Otherwise exchange the two registers with the three-XOR sequence. */
+         bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
+         bld.vop2(aco_opcode::v_xor_b32, def, op, def_as_op);
+         bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
+      }
+
+      bld.sop1(Builder::s_not, Definition(exec, bld.lm), Definition(scc, s1), /* exec = ~exec */
+               Operand(exec, bld.lm));
+   } /* exec was inverted twice, so it is back to its value on entry. */
+
+   if (preserve_scc) /* Rebuild SCC from the stashed copy: scratch_sgpr compared against 0. */
+      bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(scratch_sgpr, s1),
+               Operand::zero());
+}
+
 bool
 do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool* preserve_scc,
         PhysReg scratch_sgpr)
@@ -1133,6 +1181,8 @@ do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool* pres
          *preserve_scc = true;
       } else if (op.isConstant()) {
          copy_constant(ctx, bld, def, op);
+      } else if (def.regClass().is_linear_vgpr()) { /* Tested before the plain v1/v2 cases. */
+         copy_linear_vgpr(bld, def, op, *preserve_scc, scratch_sgpr);
       } else if (def.regClass() == v1) {
          bld.vop1(aco_opcode::v_mov_b32, def, op);
       } else if (def.regClass() == v2) {
@@ -1232,7 +1282,9 @@ do_swap(lower_context* ctx, Builder& bld, const copy_operation& copy, bool prese
       assert(op.regClass() == def.regClass());
       Operand def_as_op = Operand(def.physReg(), def.regClass());
       Definition op_as_def = Definition(op.physReg(), op.regClass());
-      if (ctx->program->chip_class >= GFX9 && def.regClass() == v1) {
+      if (def.regClass().is_linear_vgpr()) { /* Tested before the plain v1 swap paths. */
+         swap_linear_vgpr(bld, def, op, preserve_scc, pi->scratch_sgpr);
+      } else if (ctx->program->chip_class >= GFX9 && def.regClass() == v1) {
          bld.vop1(aco_opcode::v_swap_b32, def, op_as_def, op, def_as_op);
       } else if (def.regClass() == v1) {
          assert(def.physReg().byte() == 0 && op.physReg().byte() == 0);
diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp
index 3e1603e..dc25aba 100644
--- a/src/amd/compiler/aco_validate.cpp
+++ b/src/amd/compiler/aco_validate.cpp
@@ -422,12 +422,20 @@ validate_ir(Program* program)
                for (unsigned i = 0; i < instr->operands.size(); i++) {
                   check(instr->definitions[i].bytes() == instr->operands[i].bytes(),
                         "Operand and Definition size must match", instr.get());
-                  if (instr->operands[i].isTemp())
+                  if (instr->operands[i].isTemp()) { /* Braced: the branch now holds two checks. */
                      check((instr->definitions[i].getTemp().type() ==
                             instr->operands[i].regClass().type()) ||
                               (instr->definitions[i].getTemp().type() == RegType::vgpr &&
                                instr->operands[i].regClass().type() == RegType::sgpr),
                            "Operand and Definition types do not match", instr.get());
+                     check(instr->definitions[i].regClass().is_linear_vgpr() == /* both or neither */
+                              instr->operands[i].regClass().is_linear_vgpr(),
+                           "Operand and Definition types do not match", instr.get());
+                  } else { /* Non-temp operand: the definition must not be a linear VGPR. */
+                     check(!instr->definitions[i].regClass().is_linear_vgpr(),
+                           "Can only copy linear VGPRs into linear VGPRs, not constant/undef",
+                           instr.get());
+                  }
                }
             } else if (instr->opcode == aco_opcode::p_phi) {
                check(instr->operands.size() == block.logical_preds.size(),
diff --git a/src/amd/compiler/tests/test_to_hw_instr.cpp b/src/amd/compiler/tests/test_to_hw_instr.cpp
index 853d407..0914bdc 100644
--- a/src/amd/compiler/tests/test_to_hw_instr.cpp
+++ b/src/amd/compiler/tests/test_to_hw_instr.cpp
@@ -668,3 +668,57 @@ BEGIN_TEST(to_hw_instr.insert)
       //! s_endpgm
    }
 END_TEST
+
+BEGIN_TEST(to_hw_instr.copy_linear_vgpr_scc)
+   if (!setup_cs(NULL, GFX10))
+      return;
+
+   PhysReg reg_s0{0};
+   PhysReg reg_s1{1};
+   PhysReg v0_lo{256};
+   PhysReg v0_b3{256};
+   v0_b3.reg_b += 3; /* NOTE(review): reg_s1 and v0_b3 are not referenced again in this test. */
+   PhysReg v1_lo{257};
+
+   //>> p_unit_test 0
+   bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
+
+   /* It would be better if the scc=s0 copy was done later, but handle_operands() is complex
+    * enough
+    */
+
+   //! s1: %0:scc = s_cmp_lg_i32 %0:s[0], 0
+   //! s1: %0:m0 = s_mov_b32 %0:scc
+   //! lv1: %0:v[0] = v_mov_b32 %0:v[1]
+   //! s2: %0:exec, s1: %0:scc = s_not_b64 %0:exec
+   //! lv1: %0:v[0] = v_mov_b32 %0:v[1]
+   //! s2: %0:exec, s1: %0:scc = s_not_b64 %0:exec
+   //! s1: %0:scc = s_cmp_lg_i32 %0:m0, 0
+   Instruction *instr = bld.pseudo( /* One parallelcopy: scc <- s0 and linear v0 <- linear v1. */
+      aco_opcode::p_parallelcopy,
+      Definition(scc, s1), Definition(v0_lo, v1.as_linear()),
+      Operand(reg_s0, s1), Operand(v1_lo, v1.as_linear()));
+   instr->pseudo().scratch_sgpr = m0; /* The expected output above stashes SCC in m0. */
+
+   finish_to_hw_instr_test();
+END_TEST
+
+BEGIN_TEST(to_hw_instr.swap_linear_vgpr)
+   if (!setup_cs(NULL, GFX10))
+      return;
+
+   PhysReg reg_v0{256};
+   PhysReg reg_v1{257};
+   RegClass v1_linear = v1.as_linear(); /* Linear variant of v1, used for every def and operand. */
+
+   //>> p_unit_test 0
+   bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
+
+   Instruction *instr = bld.pseudo( /* v0 <- v1 and v1 <- v0 in one parallelcopy, i.e. a swap. */
+      aco_opcode::p_parallelcopy,
+      Definition(reg_v0, v1_linear), Definition(reg_v1, v1_linear),
+      Operand(reg_v1, v1_linear), Operand(reg_v0, v1_linear));
+   instr->pseudo().scratch_sgpr = m0; /* NOTE(review): no //! expectation lines in this test. */
+
+   finish_to_hw_instr_test();
+END_TEST
-- 
2.7.4