From 1a652244e4bdc0cefa907a91c81ab1efe1eafbd3 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Tue, 13 Oct 2020 13:32:38 +0100 Subject: [PATCH] aco: implement 16-bit literals MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit We can copy any value into a 16-bit subregister with a 3 dword v_pack_b32_f16 on GFX10 or a v_and_b32+v_or_b32 on GFX9. Because the generated code can depend on the register assignment and to improve constant propagation, Builder::copy creates a p_create_vector in the case of sub-dword literals. Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_builder_h.py | 2 +- src/amd/compiler/aco_lower_to_hw_instr.cpp | 41 ++++++++++ src/amd/compiler/aco_validate.cpp | 1 - src/amd/compiler/tests/test_to_hw_instr.cpp | 113 ++++++++++++++++++++++++++++ 4 files changed, 155 insertions(+), 2 deletions(-) diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py index a6c2113..9f708c4 100644 --- a/src/amd/compiler/aco_builder_h.py +++ b/src/amd/compiler/aco_builder_h.py @@ -415,7 +415,7 @@ public: return sop1(aco_opcode::s_mov_b64, dst, op); } else if (dst.regClass() == v1 || dst.regClass() == v1.as_linear()) { return vop1(aco_opcode::v_mov_b32, dst, op); - } else if (op.bytes() > 2) { + } else if (op.bytes() > 2 || (op.isLiteral() && dst.regClass().is_subdword())) { return pseudo(aco_opcode::p_create_vector, dst, op); } else if (op.bytes() == 1 && op.isConstant()) { uint8_t val = op.constantValue(); diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index d20a239..a68fa02 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -979,6 +979,26 @@ uint32_t get_intersection_mask(int a_start, int a_size, return u_bit_consecutive(intersection_start, intersection_end - intersection_start) & mask; } +void copy_16bit_literal(lower_context *ctx, Builder& bld, 
+                        Definition def, Operand op)
+{
+   if (ctx->program->chip_class < GFX10) {
+      unsigned offset = def.physReg().byte() * 8u;
+      def = Definition(PhysReg(def.physReg().reg()), v1);
+      Operand def_op(def.physReg(), v1);
+      bld.vop2(aco_opcode::v_and_b32, def, Operand(~(0xffffu << offset)), def_op);
+      bld.vop2(aco_opcode::v_or_b32, def, Operand(op.constantValue() << offset), def_op);
+   } else if (def.physReg().byte() == 2) {
+      Operand def_lo(def.physReg().advance(-2), v2b);
+      Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, def, def_lo, op);
+      static_cast<VOP3A_instruction*>(instr)->opsel = 0;
+   } else {
+      assert(def.physReg().byte() == 0);
+      Operand def_hi(def.physReg().advance(2), v2b);
+      Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, def, op, def_hi);
+      static_cast<VOP3A_instruction*>(instr)->opsel = 2;
+   }
+}
+
 bool do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool *preserve_scc, PhysReg scratch_sgpr)
 {
    bool did_copy = false;
@@ -1029,6 +1049,8 @@ bool do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool
       } else {
          bld.vop1(aco_opcode::v_mov_b32, def, op);
       }
+   } else if (def.regClass() == v2b && op.isLiteral()) {
+      copy_16bit_literal(ctx, bld, def, op);
    } else {
       bld.copy(def, op);
    }
@@ -1141,6 +1163,25 @@ void do_swap(lower_context *ctx, Builder& bld, const copy_operation& copy, bool

 void do_pack_2x16(lower_context *ctx, Builder& bld, Definition def, Operand lo, Operand hi)
 {
+   if (lo.isConstant() && hi.isConstant()) {
+      bld.copy(def, Operand(lo.constantValue() | (hi.constantValue() << 16)));
+      return;
+   } else if (lo.isLiteral() && ctx->program->chip_class < GFX10) {
+      if (def.physReg().reg() != hi.physReg().reg())
+         bld.copy(def, Operand(lo.constantValue()));
+      bld.copy(Definition(def.physReg().advance(2), v2b), hi);
+      if (def.physReg().reg() == hi.physReg().reg()) //TODO: create better code in this case with a v_lshlrev_b32+v_or_b32
+         copy_16bit_literal(ctx, bld, Definition(def.physReg(), v2b), lo);
+      return;
+   } else if (hi.isLiteral() &&
ctx->program->chip_class < GFX10) { + if (def.physReg().reg() != lo.physReg().reg()) + bld.copy(def, Operand(hi.constantValue() << 16)); + bld.copy(Definition(def.physReg(), v2b), lo); + if (def.physReg().reg() == lo.physReg().reg()) + copy_16bit_literal(ctx, bld, Definition(def.physReg().advance(2), v2b), hi); + return; + } + if (ctx->program->chip_class >= GFX9) { Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, def, lo, hi); /* opsel: 0 = select low half, 1 = select high half. [0] = src0, [1] = src1 */ diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp index ac4a766..d8886cd 100644 --- a/src/amd/compiler/aco_validate.cpp +++ b/src/amd/compiler/aco_validate.cpp @@ -349,7 +349,6 @@ bool validate_ir(Program* program) check(!is_subdword || !has_const_sgpr || program->chip_class >= GFX9, "Sub-dword pseudo instructions can only take constants or SGPRs on GFX9+", instr.get()); - check(!is_subdword || !has_literal, "Sub-dword pseudo instructions cannot take literals", instr.get()); } if (instr->opcode == aco_opcode::p_create_vector) { diff --git a/src/amd/compiler/tests/test_to_hw_instr.cpp b/src/amd/compiler/tests/test_to_hw_instr.cpp index 3047a3c..0fe8e16 100644 --- a/src/amd/compiler/tests/test_to_hw_instr.cpp +++ b/src/amd/compiler/tests/test_to_hw_instr.cpp @@ -384,3 +384,116 @@ BEGIN_TEST(to_hw_instr.swap_subdword) finish_to_hw_instr_test(); } END_TEST + +BEGIN_TEST(to_hw_instr.subdword_constant) + PhysReg v0_lo{256}; + PhysReg v0_hi{256}; + PhysReg v0_b1{256}; + PhysReg v1_hi{257}; + v0_hi.reg_b += 2; + v0_b1.reg_b += 1; + v1_hi.reg_b += 2; + + for (unsigned i = GFX9; i <= GFX10; i++) { + if (!setup_cs(NULL, (chip_class)i)) + continue; + + /* 16-bit pack */ + //>> p_unit_test 0 + //! 
v1: %_:v[0] = v_pack_b32_f16 0.5, hi(%_:v[1][16:32]) + bld.pseudo(aco_opcode::p_unit_test, Operand(0u)); + bld.pseudo(aco_opcode::p_parallelcopy, + Definition(v0_lo, v2b), Definition(v0_hi, v2b), + Operand((uint16_t)0x3800), Operand(v1_hi, v2b)); + + //! p_unit_test 1 + //~gfx9! v1: %_:v[0] = v_mov_b32 0x4205 + //~gfx9! v2b: %_:v[0][16:32] = v_mov_b32 %_:v[1][16:32] dst_preserve + //~gfx10! v1: %_:v[0] = v_pack_b32_f16 0x4205, hi(%_:v[1][16:32]) + bld.pseudo(aco_opcode::p_unit_test, Operand(1u)); + bld.pseudo(aco_opcode::p_parallelcopy, + Definition(v0_lo, v2b), Definition(v0_hi, v2b), + Operand((uint16_t)0x4205), Operand(v1_hi, v2b)); + + //TODO: optimize this with GFX10. do_pack_2x16() isn't used in this case + //! p_unit_test 2 + //~gfx9! v2b: %_:v[0][16:32] = v_mov_b32 %_:v[0][0:16] dst_preserve + //~gfx9! v1: %_:v[0] = v_and_b32 0xffff0000, %_:v[0] + //~gfx9! v1: %_:v[0] = v_or_b32 0x4205, %_:v[0] + //~gfx10! v2b: %_:v[0][16:32] = v_mov_b32 %_:v[0][0:16] dst_preserve + //~gfx10! v2b: %_:v[0][0:16] = v_pack_b32_f16 0x4205, hi(%_:v[0][16:32]) + bld.pseudo(aco_opcode::p_unit_test, Operand(2u)); + bld.pseudo(aco_opcode::p_parallelcopy, + Definition(v0_lo, v2b), Definition(v0_hi, v2b), + Operand((uint16_t)0x4205), Operand(v0_lo, v2b)); + + //! p_unit_test 3 + //! v1: %_:v[0] = v_mov_b32 0x3c003800 + bld.pseudo(aco_opcode::p_unit_test, Operand(3u)); + bld.pseudo(aco_opcode::p_parallelcopy, + Definition(v0_lo, v2b), Definition(v0_hi, v2b), + Operand((uint16_t)0x3800), Operand((uint16_t)0x3c00)); + + //! p_unit_test 4 + //! v1: %_:v[0] = v_mov_b32 0x43064205 + bld.pseudo(aco_opcode::p_unit_test, Operand(4u)); + bld.pseudo(aco_opcode::p_parallelcopy, + Definition(v0_lo, v2b), Definition(v0_hi, v2b), + Operand((uint16_t)0x4205), Operand((uint16_t)0x4306)); + + //! p_unit_test 5 + //! 
v1: %_:v[0] = v_mov_b32 0x38004205 + bld.pseudo(aco_opcode::p_unit_test, Operand(5u)); + bld.pseudo(aco_opcode::p_parallelcopy, + Definition(v0_lo, v2b), Definition(v0_hi, v2b), + Operand((uint16_t)0x4205), Operand((uint16_t)0x3800)); + + /* 16-bit copy */ + //! p_unit_test 6 + //! v2b: %_:v[0][0:16] = v_add_f16 0.5, 0 dst_preserve + bld.pseudo(aco_opcode::p_unit_test, Operand(6u)); + bld.pseudo(aco_opcode::p_parallelcopy, + Definition(v0_lo, v2b), Operand((uint16_t)0x3800)); + + //! p_unit_test 7 + //~gfx9! v1: %_:v[0] = v_and_b32 0xffff0000, %_:v[0] + //~gfx9! v1: %_:v[0] = v_or_b32 0x4205, %_:v[0] + //~gfx10! v2b: %_:v[0][0:16] = v_pack_b32_f16 0x4205, hi(%_:v[0][16:32]) + bld.pseudo(aco_opcode::p_unit_test, Operand(7u)); + bld.pseudo(aco_opcode::p_parallelcopy, + Definition(v0_lo, v2b), Operand((uint16_t)0x4205)); + + //! p_unit_test 8 + //~gfx9! v1: %_:v[0] = v_and_b32 0xffff, %_:v[0] + //~gfx9! v1: %_:v[0] = v_or_b32 0x42050000, %_:v[0] + //~gfx10! v2b: %_:v[0][16:32] = v_pack_b32_f16 %_:v[0][0:16], 0x4205 + bld.pseudo(aco_opcode::p_unit_test, Operand(8u)); + bld.pseudo(aco_opcode::p_parallelcopy, + Definition(v0_hi, v2b), Operand((uint16_t)0x4205)); + + //! p_unit_test 9 + //! v1b: %_:v[0][8:16] = v_mov_b32 0 dst_preserve + //! v1b: %_:v[0][16:24] = v_mov_b32 56 dst_preserve + bld.pseudo(aco_opcode::p_unit_test, Operand(9u)); + bld.pseudo(aco_opcode::p_parallelcopy, + Definition(v0_b1, v2b), Operand((uint16_t)0x3800)); + + //! p_unit_test 10 + //! v1b: %_:v[0][8:16] = v_mov_b32 5 dst_preserve + //! v1b: %_:v[0][16:24] = v_mul_u32_u24 2, 33 dst_preserve + bld.pseudo(aco_opcode::p_unit_test, Operand(10u)); + bld.pseudo(aco_opcode::p_parallelcopy, + Definition(v0_b1, v2b), Operand((uint16_t)0x4205)); + + /* 8-bit copy */ + //! p_unit_test 11 + //! v1b: %_:v[0][0:8] = v_mul_u32_u24 2, 33 dst_preserve + bld.pseudo(aco_opcode::p_unit_test, Operand(11u)); + bld.pseudo(aco_opcode::p_parallelcopy, + Definition(v0_lo, v1b), Operand((uint8_t)0x42)); + + //! 
s_endpgm + + finish_to_hw_instr_test(); + } +END_TEST -- 2.7.4