From 55cd74d468fe7834c3da63864bf3099cdf98c57f Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Fri, 17 Jun 2022 13:53:08 +0100 Subject: [PATCH] aco: add LDSDIR instruction format MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_assembler.cpp | 11 +++++++ src/amd/compiler/aco_builder_h.py | 1 + src/amd/compiler/aco_insert_waitcnt.cpp | 18 +++++++--- src/amd/compiler/aco_ir.cpp | 1 + src/amd/compiler/aco_ir.h | 49 +++++++++++++++++++++------- src/amd/compiler/aco_lower_to_hw_instr.cpp | 2 +- src/amd/compiler/aco_opcodes.py | 37 ++++++++++++++------- src/amd/compiler/aco_opt_value_numbering.cpp | 6 ++++ src/amd/compiler/aco_print_ir.cpp | 9 +++++ src/amd/compiler/aco_validate.cpp | 9 +++++ 10 files changed, 115 insertions(+), 28 deletions(-) diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp index d4e1c06..58c2471 100644 --- a/src/amd/compiler/aco_assembler.cpp +++ b/src/amd/compiler/aco_assembler.cpp @@ -398,6 +398,17 @@ emit_instruction(asm_context& ctx, std::vector& out, Instruction* inst out.push_back(encoding); break; } + case Format::LDSDIR: { + LDSDIR_instruction& dir = instr->ldsdir(); + uint32_t encoding = (0b11001110 << 24); + encoding |= opcode << 20; + encoding |= (uint32_t)dir.wait_vdst << 16; + encoding |= (uint32_t)dir.attr << 10; + encoding |= (uint32_t)dir.attr_chan << 8; + encoding |= reg(ctx, instr->definitions[0], 8); + out.push_back(encoding); + break; + } case Format::MUBUF: { MUBUF_instruction& mubuf = instr->mubuf(); uint32_t encoding = (0b111000 << 26); diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py index 05c4aab..b837fb8 100644 --- a/src/amd/compiler/aco_builder_h.py +++ b/src/amd/compiler/aco_builder_h.py @@ -515,6 +515,7 @@ formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.prod ("sopc", [Format.SOPC], 
'SOPC_instruction', [(1, 2)]), ("smem", [Format.SMEM], 'SMEM_instruction', [(0, 4), (0, 3), (1, 0), (1, 3), (1, 2), (0, 0)]), ("ds", [Format.DS], 'DS_instruction', [(1, 1), (1, 2), (0, 3), (0, 4)]), + ("ldsdir", [Format.LDSDIR], 'LDSDIR_instruction', [(1, 1)]), ("mubuf", [Format.MUBUF], 'MUBUF_instruction', [(0, 4), (1, 3)]), ("mtbuf", [Format.MTBUF], 'MTBUF_instruction', [(0, 4), (1, 3)]), ("mimg", [Format.MIMG], 'MIMG_instruction', itertools.product([0, 1], [3, 4, 5, 6, 7])), diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp index e435f8f..eafea3a 100644 --- a/src/amd/compiler/aco_insert_waitcnt.cpp +++ b/src/amd/compiler/aco_insert_waitcnt.cpp @@ -68,7 +68,8 @@ enum wait_event : uint16_t { event_gds_gpr_lock = 1 << 9, event_vmem_gpr_lock = 1 << 10, event_sendmsg = 1 << 11, - num_events = 12, + event_ldsdir = 1 << 12, + num_events = 13, }; enum counter_type : uint8_t { @@ -86,7 +87,8 @@ enum vmem_type : uint8_t { }; static const uint16_t exp_events = - event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock; + event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock | + event_ldsdir; static const uint16_t lgkm_events = event_smem | event_lds | event_gds | event_flat | event_sendmsg; static const uint16_t vm_events = event_vmem | event_flat; static const uint16_t vs_events = event_vmem_store; @@ -106,7 +108,8 @@ get_counters_for_event(wait_event ev) case event_exp_param: case event_exp_mrt_null: case event_gds_gpr_lock: - case event_vmem_gpr_lock: return counter_exp; + case event_vmem_gpr_lock: + case event_ldsdir: return counter_exp; default: return 0; } } @@ -154,8 +157,7 @@ struct wait_entry { if (counter == counter_exp) { imm.exp = wait_imm::unset_counter; - events &= ~(event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | - event_vmem_gpr_lock); + events &= ~exp_events; } if (counter == counter_vs) { @@ -681,6 
+683,12 @@ gen(Instruction* instr, wait_ctx& ctx) } break; } + case Format::LDSDIR: { + LDSDIR_instruction& ldsdir = instr->ldsdir(); + update_counters(ctx, event_ldsdir, ldsdir.sync); + insert_wait_entry(ctx, instr->definitions[0], event_ldsdir); + break; + } case Format::MUBUF: case Format::MTBUF: case Format::MIMG: diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp index f09524e..96f5a3a 100644 --- a/src/amd/compiler/aco_ir.cpp +++ b/src/amd/compiler/aco_ir.cpp @@ -195,6 +195,7 @@ get_sync_info(const Instruction* instr) case Format::GLOBAL: case Format::SCRATCH: return instr->flatlike().sync; case Format::DS: return instr->ds().sync; + case Format::LDSDIR: return instr->ldsdir().sync; default: return memory_sync_info(); } } diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index f320763..8ae5557 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -77,24 +77,25 @@ enum class Format : std::uint16_t { SMEM = 6, /* LDS/GDS Format */ DS = 8, + LDSDIR = 9, /* Vector Memory Buffer Formats */ - MTBUF = 9, - MUBUF = 10, + MTBUF = 10, + MUBUF = 11, /* Vector Memory Image Format */ - MIMG = 11, + MIMG = 12, /* Export Format */ - EXP = 12, + EXP = 13, /* Flat Formats */ - FLAT = 13, - GLOBAL = 14, - SCRATCH = 15, + FLAT = 14, + GLOBAL = 15, + SCRATCH = 16, - PSEUDO_BRANCH = 16, - PSEUDO_BARRIER = 17, - PSEUDO_REDUCTION = 18, + PSEUDO_BRANCH = 17, + PSEUDO_BARRIER = 18, + PSEUDO_REDUCTION = 19, /* Vector ALU Formats */ - VOP3P = 19, + VOP3P = 20, VOP1 = 1 << 8, VOP2 = 1 << 9, VOPC = 1 << 10, @@ -999,6 +1000,7 @@ struct SOPP_instruction; struct SOPC_instruction; struct SMEM_instruction; struct DS_instruction; +struct LDSDIR_instruction; struct MTBUF_instruction; struct MUBUF_instruction; struct MIMG_instruction; @@ -1124,6 +1126,17 @@ struct Instruction { return *(DS_instruction*)this; } constexpr bool isDS() const noexcept { return format == Format::DS; } + LDSDIR_instruction& ldsdir() noexcept + { + 
assert(isLDSDIR()); + return *(LDSDIR_instruction*)this; + } + const LDSDIR_instruction& ldsdir() const noexcept /* read-only overload of the checked downcast */ + { + assert(isLDSDIR()); /* the cast below is only meaningful when format == Format::LDSDIR */ + return *(LDSDIR_instruction*)this; /* the C-style cast also drops const from this; the const return type restores it */ + } + constexpr bool isLDSDIR() const noexcept { return format == Format::LDSDIR; } /* true only for Format::LDSDIR */ MTBUF_instruction& mtbuf() noexcept { assert(isMTBUF()); @@ -1551,6 +1564,20 @@ struct DS_instruction : public Instruction { static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); /** + * LDS Direct instructions + * Operand(0): M0 + * Definition(0): VDST - Destination VGPR + */ +struct LDSDIR_instruction : public Instruction { + memory_sync_info sync; /* the value get_sync_info() returns for this format */ + uint8_t attr : 6; /* the assembler shifts this to encoding bits 10..15 */ + uint8_t attr_chan : 2; /* encoding bits 8..9; the printer maps 0..3 to x/y/z/w */ + uint32_t wait_vdst : 4; /* encoding bits 16..19; the printer omits it when it equals 15 */ + uint32_t padding : 28; /* 4 + 28: fills out the 32-bit bit-field unit started by wait_vdst */ +}; +static_assert(sizeof(LDSDIR_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); /* sync plus the bit-fields must add exactly 8 bytes */ + +/** * Vector Memory Untyped-buffer Instructions * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant) * Operand(1): VADDR - Address source. 
Can carry an index and/or offset diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index baab6f4..d930b1a 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -2424,7 +2424,7 @@ lower_to_hw_instr(Program* program) } } } else if (inst->isVMEM() || inst->isFlatLike() || inst->isDS() || - inst->isEXP()) { + inst->isEXP() || inst->isLDSDIR()) { // TODO: GFX6-9 can use vskip can_remove = false; } else if (inst->isSMEM()) { diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py index ec2ccf6..cf1aaa4 100644 --- a/src/amd/compiler/aco_opcodes.py +++ b/src/amd/compiler/aco_opcodes.py @@ -58,17 +58,18 @@ class Format(Enum): SOPC = 5 SMEM = 6 DS = 8 - MTBUF = 9 - MUBUF = 10 - MIMG = 11 - EXP = 12 - FLAT = 13 - GLOBAL = 14 - SCRATCH = 15 - PSEUDO_BRANCH = 16 - PSEUDO_BARRIER = 17 - PSEUDO_REDUCTION = 18 - VOP3P = 19 + LDSDIR = 9 + MTBUF = 10 + MUBUF = 11 + MIMG = 12 + EXP = 13 + FLAT = 14 + GLOBAL = 15 + SCRATCH = 16 + PSEUDO_BRANCH = 17 + PSEUDO_BARRIER = 18 + PSEUDO_REDUCTION = 19 + VOP3P = 20 VOP1 = 1 << 8 VOP2 = 1 << 9 VOPC = 1 << 10 @@ -93,6 +94,11 @@ class Format(Enum): return [('uint16_t', 'offset0', '0'), ('uint8_t', 'offset1', '0'), ('bool', 'gds', 'false')] + elif self == Format.LDSDIR: + return [('uint8_t', 'attr', 0), + ('uint8_t', 'attr_chan', 0), + ('memory_sync_info', 'sync', 'memory_sync_info()'), + ('uint8_t', 'wait_vdst', 0)] elif self == Format.MTBUF: return [('unsigned', 'dfmt', None), ('unsigned', 'nfmt', None), @@ -1321,6 +1327,15 @@ DS = { for (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name) in DS: opcode(name, gfx7, gfx9, gfx10, gfx11, Format.DS, InstrClass.DS) + +# LDSDIR instructions: +LDSDIR = { + (0x00, "lds_param_load"), + (0x01, "lds_direct_load"), +} +for (code, name) in LDSDIR: + opcode(name, -1, -1, -1, code, Format.LDSDIR, InstrClass.DS) + # MUBUF instructions: MUBUF = { (0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
"buffer_load_format_x"), diff --git a/src/amd/compiler/aco_opt_value_numbering.cpp b/src/amd/compiler/aco_opt_value_numbering.cpp index 076d4f2..892bdca 100644 --- a/src/amd/compiler/aco_opt_value_numbering.cpp +++ b/src/amd/compiler/aco_opt_value_numbering.cpp @@ -249,6 +249,12 @@ struct InstrPred { return aD.sync == bD.sync && aD.pass_flags == bD.pass_flags && aD.gds == bD.gds && aD.offset0 == bD.offset0 && aD.offset1 == bD.offset1; } + case Format::LDSDIR: { + LDSDIR_instruction& aD = a->ldsdir(); + LDSDIR_instruction& bD = b->ldsdir(); + return aD.sync == bD.sync && aD.attr == bD.attr && aD.attr_chan == bD.attr_chan && + aD.wait_vdst == bD.wait_vdst; + } case Format::MTBUF: { MTBUF_instruction& aM = a->mtbuf(); MTBUF_instruction& bM = b->mtbuf(); diff --git a/src/amd/compiler/aco_print_ir.cpp b/src/amd/compiler/aco_print_ir.cpp index 044f815..76e6f02 100644 --- a/src/amd/compiler/aco_print_ir.cpp +++ b/src/amd/compiler/aco_print_ir.cpp @@ -363,6 +363,15 @@ print_instr_format_specific(const Instruction* instr, FILE* output) print_sync(ds.sync, output); break; } + case Format::LDSDIR: { + const LDSDIR_instruction& ldsdir = instr->ldsdir(); + if (instr->opcode == aco_opcode::lds_param_load) + fprintf(output, " attr%u.%c", ldsdir.attr, "xyzw"[ldsdir.attr_chan]); + if (ldsdir.wait_vdst != 15) + fprintf(output, " wait_vdst:%u", ldsdir.wait_vdst); + print_sync(ldsdir.sync, output); + break; + } case Format::MUBUF: { const MUBUF_instruction& mubuf = instr->mubuf(); if (mubuf.offset) diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp index 72a033a..52fbc4e 100644 --- a/src/amd/compiler/aco_validate.cpp +++ b/src/amd/compiler/aco_validate.cpp @@ -694,6 +694,15 @@ validate_ir(Program* program) "FLAT/GLOBAL/SCRATCH data must be vgpr", instr.get()); break; } + case Format::LDSDIR: { + check(instr->definitions.size() == 1 && instr->definitions[0].regClass() == v1, "LDSDIR must have an v1 definition", instr.get()); + 
check(instr->operands.size() == 1, "LDSDIR must have an operand", instr.get()); + if (!instr->operands.empty()) { + check(instr->operands[0].regClass() == s1, "LDSDIR must have an s1 operand", instr.get()); + check(instr->operands[0].isFixed() && instr->operands[0].physReg() == m0, "LDSDIR must have an operand fixed to m0", instr.get()); + } + break; + } default: break; } } -- 2.7.4