From 867323379e353a58e76a1340071751c43431f215 Mon Sep 17 00:00:00 2001
From: Rhys Perry
Date: Thu, 30 Jul 2020 19:44:28 +0100
Subject: [PATCH] aco: don't use SMEM for SSBO stores
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

fossil-db (Navi):
Totals from 70 (0.05% of 138791) affected shaders:
SGPRs: 2324 -> 2097 (-9.77%)
VGPRs: 1344 -> 1480 (+10.12%)
CodeSize: 157872 -> 154836 (-1.92%); split: -1.93%, +0.01%
MaxWaves: 1288 -> 1260 (-2.17%)
Instrs: 29730 -> 29108 (-2.09%); split: -2.13%, +0.04%
Cycles: 394944 -> 391280 (-0.93%); split: -0.94%, +0.01%
VMEM: 5288 -> 5695 (+7.70%); split: +11.97%, -4.27%
SMEM: 2680 -> 2444 (-8.81%); split: +1.34%, -10.15%
VClause: 291 -> 502 (+72.51%)
SClause: 1176 -> 918 (-21.94%)
Copies: 3549 -> 3517 (-0.90%); split: -1.80%, +0.90%
Branches: 1230 -> 1228 (-0.16%)
PreSGPRs: 1675 -> 1491 (-10.99%)
PreVGPRs: 1101 -> 1223 (+11.08%)

Totals from 70 (0.05% of 139517) affected shaders (RAVEN):
SGPRs: 2368 -> 2121 (-10.43%)
VGPRs: 1344 -> 1480 (+10.12%)
CodeSize: 156664 -> 153252 (-2.18%)
MaxWaves: 636 -> 622 (-2.20%)
Instrs: 29968 -> 29226 (-2.48%)
Cycles: 398284 -> 393492 (-1.20%)
VMEM: 5544 -> 5930 (+6.96%); split: +11.72%, -4.76%
SMEM: 2752 -> 2502 (-9.08%); split: +1.20%, -10.28%
VClause: 292 -> 504 (+72.60%)
SClause: 1236 -> 940 (-23.95%)
Copies: 3907 -> 3852 (-1.41%); split: -2.20%, +0.79%
Branches: 1230 -> 1228 (-0.16%)
PreSGPRs: 1671 -> 1487 (-11.01%)
PreVGPRs: 1102 -> 1225 (+11.16%)

Signed-off-by: Rhys Perry
Reviewed-by: Samuel Pitoiset
Reviewed-by: Daniel Schürmann
Part-of:
---
 src/amd/compiler/aco_insert_exec_mask.cpp          | 43 +---
 src/amd/compiler/aco_instruction_selection.cpp     | 92 ++++++----------------
 src/amd/compiler/aco_instruction_selection.h       | 16 ----
 .../compiler/aco_instruction_selection_setup.cpp   |  5 +-
 src/amd/compiler/aco_ir.h                          |  1 -
 src/amd/compiler/aco_lower_to_hw_instr.cpp         |  2 -
 src/amd/compiler/aco_opcodes.py                    |  2 -
 src/amd/compiler/tests/helpers.cpp                 |  5 +-
 src/amd/compiler/tests/test_isel.cpp               |  6 +-
 9 files changed, 28 insertions(+), 144 deletions(-)

diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp
index 638157c..4ccffcc 100644
--- a/src/amd/compiler/aco_insert_exec_mask.cpp
+++ b/src/amd/compiler/aco_insert_exec_mask.cpp
@@ -146,7 +146,7 @@ bool needs_exact(aco_ptr<Instruction>& instr) {
       FLAT_instruction *flat = static_cast<FLAT_instruction *>(instr.get());
       return flat->disable_wqm;
    } else {
-      return instr->format == Format::EXP || instr->opcode == aco_opcode::p_fs_buffer_store_smem;
+      return instr->format == Format::EXP;
    }
 }
 
@@ -648,43 +648,6 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
    return i;
 }
 
-void lower_fs_buffer_store_smem(Builder& bld, bool need_check, aco_ptr<Instruction>& instr, Temp cur_exec)
-{
-   Operand offset = instr->operands[1];
-   if (need_check) {
-      /* if exec is zero, then use UINT32_MAX as an offset and make this store a no-op */
-      Temp nonempty = bld.sopc(Builder::s_cmp_lg, bld.def(s1, scc), cur_exec, Operand(0u, bld.lm == s2));
-
-      if (offset.isLiteral())
-         offset = bld.copy(bld.def(s1), offset);
-
-      offset = bld.sop2(aco_opcode::s_cselect_b32, bld.hint_m0(bld.def(s1)),
-                        offset, Operand(UINT32_MAX), bld.scc(nonempty));
-   } else if (offset.isConstant() && offset.constantValue() > 0xFFFFF) {
-      offset = bld.copy(bld.hint_m0(bld.def(s1)), offset);
-   }
-   if (!offset.isConstant())
-      offset.setFixed(m0);
-
-   switch (instr->operands[2].size()) {
-   case 1:
-      instr->opcode = aco_opcode::s_buffer_store_dword;
-      break;
-   case 2:
-      instr->opcode = aco_opcode::s_buffer_store_dwordx2;
-      break;
-   case 4:
-      instr->opcode = aco_opcode::s_buffer_store_dwordx4;
-      break;
-   default:
-      unreachable("Invalid SMEM buffer store size");
-   }
-   instr->operands[1] = offset;
-   /* as_uniform() needs to be done here so it's done in exact mode and helper
-    * lanes don't contribute. */
-   instr->operands[2] = Operand(bld.as_uniform(instr->operands[2]));
-}
-
 void process_instructions(exec_ctx& ctx, Block* block,
                           std::vector<aco_ptr<Instruction>>& instructions,
                           unsigned idx)
@@ -831,10 +794,6 @@ void process_instructions(exec_ctx& ctx, Block* block,
          instr->operands[0] = bld.scc(exit_cond);
          state = Exact;
-      } else if (instr->opcode == aco_opcode::p_fs_buffer_store_smem) {
-         bool need_check = ctx.info[block->index].exec.size() != 1 &&
-                           !(ctx.info[block->index].exec[ctx.info[block->index].exec.size() - 2].second & Exact);
-         lower_fs_buffer_store_smem(bld, need_check, instr, ctx.info[block->index].exec.back().first);
       }
 
       bld.insert(std::move(instr));
 
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index ddf94bf..d07a721 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -3801,24 +3801,21 @@ unsigned calculate_lds_alignment(isel_context *ctx, unsigned const_offset)
 }
 
-aco_opcode get_buffer_store_op(bool smem, unsigned bytes)
+aco_opcode get_buffer_store_op(unsigned bytes)
 {
    switch (bytes) {
    case 1:
-      assert(!smem);
       return aco_opcode::buffer_store_byte;
    case 2:
-      assert(!smem);
       return aco_opcode::buffer_store_short;
    case 4:
-      return smem ? aco_opcode::s_buffer_store_dword : aco_opcode::buffer_store_dword;
+      return aco_opcode::buffer_store_dword;
    case 8:
-      return smem ? aco_opcode::s_buffer_store_dwordx2 : aco_opcode::buffer_store_dwordx2;
+      return aco_opcode::buffer_store_dwordx2;
    case 12:
-      assert(!smem);
       return aco_opcode::buffer_store_dwordx3;
    case 16:
-      return smem ? aco_opcode::s_buffer_store_dwordx4 : aco_opcode::buffer_store_dwordx4;
+      return aco_opcode::buffer_store_dwordx4;
    }
    unreachable("Unexpected store size");
    return aco_opcode::num_opcodes;
 }
@@ -3943,7 +3940,7 @@ void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, T
    assert(vdata.size() >= 1 && vdata.size() <= 4);
 
    Builder bld(ctx->program, ctx->block);
-   aco_opcode op = get_buffer_store_op(false, vdata.bytes());
+   aco_opcode op = get_buffer_store_op(vdata.bytes());
    const_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset);
 
    Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
@@ -6198,70 +6195,29 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
    memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
    bool glc = nir_intrinsic_access(instr) &
              (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
-   uint32_t flags = get_all_buffer_resource_flags(ctx, instr->src[1].ssa, nir_intrinsic_access(instr));
-   /* GLC bypasses VMEM/SMEM caches, so GLC SMEM loads/stores are coherent with GLC VMEM loads/stores
-    * TODO: this optimization is disabled for now because we still need to ensure correct ordering
-    */
-   bool allow_smem = !(flags & (0 && glc ? has_nonglc_vmem_loadstore : has_vmem_loadstore));
-
-   bool smem = !nir_src_is_divergent(instr->src[2]) &&
-               ctx->options->chip_class >= GFX8 &&
-               ctx->options->chip_class < GFX10_3 &&
-               (elem_size_bytes >= 4 || can_subdword_ssbo_store_use_smem(instr)) &&
-               allow_smem;
-   if (smem)
-      offset = bld.as_uniform(offset);
-   bool smem_nonfs = smem && ctx->stage != fragment_fs;
 
    unsigned write_count = 0;
    Temp write_datas[32];
    unsigned offsets[32];
-   split_buffer_store(ctx, instr, smem, smem_nonfs ? RegType::sgpr : (smem ? data.type() : RegType::vgpr),
+   split_buffer_store(ctx, instr, false, RegType::vgpr,
                       data, writemask, 16, &write_count, write_datas, offsets);
 
    for (unsigned i = 0; i < write_count; i++) {
-      aco_opcode op = get_buffer_store_op(smem, write_datas[i].bytes());
-      if (smem && ctx->stage == fragment_fs)
-         op = aco_opcode::p_fs_buffer_store_smem;
-
-      if (smem) {
-         aco_ptr<SMEM_instruction> store{create_instruction<SMEM_instruction>(op, Format::SMEM, 3, 0)};
-         store->operands[0] = Operand(rsrc);
-         if (offsets[i]) {
-            Temp off = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
-                                      offset, Operand(offsets[i]));
-            store->operands[1] = Operand(off);
-         } else {
-            store->operands[1] = Operand(offset);
-         }
-         if (op != aco_opcode::p_fs_buffer_store_smem)
-            store->operands[1].setFixed(m0);
-         store->operands[2] = Operand(write_datas[i]);
-         store->glc = glc;
-         store->dlc = false;
-         store->disable_wqm = true;
-         store->sync = sync;
-         ctx->block->instructions.emplace_back(std::move(store));
-         ctx->program->wb_smem_l1_on_end = true;
-         if (op == aco_opcode::p_fs_buffer_store_smem) {
-            ctx->block->kind |= block_kind_needs_lowering;
-            ctx->program->needs_exact = true;
-         }
-      } else {
-         aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
-         store->operands[0] = Operand(rsrc);
-         store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
-         store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
-         store->operands[3] = Operand(write_datas[i]);
-         store->offset = offsets[i];
-         store->offen = (offset.type() == RegType::vgpr);
-         store->glc = glc;
-         store->dlc = false;
-         store->disable_wqm = true;
-         store->sync = sync;
-         ctx->program->needs_exact = true;
-         ctx->block->instructions.emplace_back(std::move(store));
-      }
+      aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
+
+      aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
+      store->operands[0] = Operand(rsrc);
+      store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
+      store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
+      store->operands[3] = Operand(write_datas[i]);
+      store->offset = offsets[i];
+      store->offen = (offset.type() == RegType::vgpr);
+      store->glc = glc;
+      store->dlc = false;
+      store->disable_wqm = true;
+      store->sync = sync;
+      ctx->program->needs_exact = true;
+      ctx->block->instructions.emplace_back(std::move(store));
    }
 }
 
@@ -6469,7 +6425,7 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
       } else {
          assert(ctx->options->chip_class == GFX6);
 
-         aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
+         aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
 
         Temp rsrc = get_gfx6_global_rsrc(bld, addr);
 
@@ -6906,7 +6862,7 @@ void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
                       swizzle_component_size, &write_count, write_datas, offsets);
 
    for (unsigned i = 0; i < write_count; i++) {
-      aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
+      aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
       Instruction *instr = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true, true);
       static_cast<MUBUF_instruction *>(instr)->sync = memory_sync_info(storage_scratch, semantic_private);
    }
@@ -11863,8 +11819,6 @@ void select_program(Program *program,
    append_logical_end(ctx.block);
    ctx.block->kind |= block_kind_uniform;
    Builder bld(ctx.program, ctx.block);
-   if (ctx.program->wb_smem_l1_on_end)
-      bld.smem(aco_opcode::s_dcache_wb, memory_sync_info(storage_buffer, semantic_volatile));
    bld.sopp(aco_opcode::s_endpgm);
 
    cleanup_cfg(program);
diff --git a/src/amd/compiler/aco_instruction_selection.h b/src/amd/compiler/aco_instruction_selection.h
index e145c8f..49352c1 100644
--- a/src/amd/compiler/aco_instruction_selection.h
+++ b/src/amd/compiler/aco_instruction_selection.h
@@ -46,8 +46,6 @@ enum resource_flags {
    has_nonglc_vmem_store = 0x8,
 
    has_vmem_store = has_glc_vmem_store | has_nonglc_vmem_store,
-   has_vmem_loadstore = has_vmem_store | has_glc_vmem_load | has_nonglc_vmem_load,
-   has_nonglc_vmem_loadstore = has_nonglc_vmem_load | has_nonglc_vmem_store,
 
    buffer_is_restrict = 0x10,
 };
@@ -200,20 +198,6 @@ inline uint8_t get_all_buffer_resource_flags(isel_context *ctx, nir_ssa_def *def
    return res;
 }
 
-inline bool can_subdword_ssbo_store_use_smem(nir_intrinsic_instr *intrin)
-{
-   unsigned wrmask = nir_intrinsic_write_mask(intrin);
-   if (util_last_bit(wrmask) != util_bitcount(wrmask) ||
-       util_bitcount(wrmask) * intrin->src[0].ssa->bit_size % 32 ||
-       util_bitcount(wrmask) != intrin->src[0].ssa->num_components)
-      return false;
-
-   if (nir_intrinsic_align_mul(intrin) % 4 || nir_intrinsic_align_offset(intrin) % 4)
-      return false;
-
-   return true;
-}
-
 void init_context(isel_context *ctx, nir_shader *shader);
 void cleanup_context(isel_context *ctx);
 
diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp
index e2a0148..fb17618 100644
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@@ -189,10 +189,7 @@ void fill_desc_set_info(isel_context *ctx, nir_function_impl *impl)
          res = intrin->src[0].ssa;
         break;
      case nir_intrinsic_store_ssbo:
-        if (nir_src_is_divergent(intrin->src[2]) ||
-            ctx->program->chip_class < GFX8 || ctx->program->chip_class >= GFX10_3 ||
-            (intrin->src[0].ssa->bit_size < 32 && !can_subdword_ssbo_store_use_smem(intrin)))
-           flags |= glc ? has_glc_vmem_store : has_nonglc_vmem_store;
+        flags |= glc ? has_glc_vmem_store : has_nonglc_vmem_store;
         res = intrin->src[1].ssa;
         break;
      case nir_intrinsic_load_global:
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 6387c90..1eafcd5 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -1640,7 +1640,6 @@ public:
    Stage stage;
    bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
    bool needs_wqm = false; /* there exists a p_wqm instruction */
-   bool wb_smem_l1_on_end = false;
 
    std::vector<uint8_t> constant_data;
    Temp private_segment_buffer;
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index f7a4012..0a015ad 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -1875,8 +1875,6 @@ void lower_to_hw_instr(Program* program)
             bld.reset(discard_block);
             bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
                     0, V_008DFC_SQ_EXP_NULL, false, true, true);
-            if (program->wb_smem_l1_on_end)
-               bld.smem(aco_opcode::s_dcache_wb);
             bld.sopp(aco_opcode::s_endpgm);
 
             bld.reset(&ctx.instructions);
diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py
index 95fa9c9..4a839db 100644
--- a/src/amd/compiler/aco_opcodes.py
+++ b/src/amd/compiler/aco_opcodes.py
@@ -285,8 +285,6 @@
 opcode("p_demote_to_helper")
 opcode("p_is_helper")
 opcode("p_exit_early_if")
 
-opcode("p_fs_buffer_store_smem", format=Format.SMEM)
-
 # simulates proper bpermute behavior when it's unsupported, eg. GFX10 wave64
 opcode("p_bpermute")
diff --git a/src/amd/compiler/tests/helpers.cpp b/src/amd/compiler/tests/helpers.cpp
index 55843c1..ca698e7 100644
--- a/src/amd/compiler/tests/helpers.cpp
+++ b/src/amd/compiler/tests/helpers.cpp
@@ -136,10 +136,7 @@ void finish_program(Program *program)
    for (Block& block : program->blocks) {
       if (block.linear_succs.size() == 0) {
          block.kind |= block_kind_uniform;
-         Builder bld(program, &block);
-         if (program->wb_smem_l1_on_end)
-            bld.smem(aco_opcode::s_dcache_wb, false);
-         bld.sopp(aco_opcode::s_endpgm);
+         Builder(program, &block).sopp(aco_opcode::s_endpgm);
      }
   }
 }
diff --git a/src/amd/compiler/tests/test_isel.cpp b/src/amd/compiler/tests/test_isel.cpp
index 48425b7..a2212b3 100644
--- a/src/amd/compiler/tests/test_isel.cpp
+++ b/src/amd/compiler/tests/test_isel.cpp
@@ -67,10 +67,8 @@ BEGIN_TEST(isel.compute.simple)
       uint res;
    };
    void main() {
-      //~gfx7>> v1: %data = p_parallelcopy 42
-      //~gfx7>> buffer_store_dword %_, v1: undef, 0, %data disable_wqm storage:buffer semantics: scope:invocation
-      //~gfx8>> s1: %data = p_parallelcopy 42
-      //~gfx8>> s_buffer_store_dword %_, 0, %data storage:buffer semantics: scope:invocation
+      //>> v1: %data = p_parallelcopy 42
+      //>> buffer_store_dword %_, v1: undef, 0, %data disable_wqm storage:buffer semantics: scope:invocation
       res = 42;
    }
   );
-- 
2.7.4
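
The core of the change is visible in get_buffer_store_op(): with the `smem` flag removed, every SSBO store selects a MUBUF opcode keyed only on the size in bytes of the (already split) store. Below is a minimal, self-contained sketch of that mapping; the `buf_op` enum and the `main()` driver are illustrative stand-ins for this note, not definitions taken from Mesa.

#include <cassert>
#include <cstdio>

/* Illustrative stand-in for the relevant aco_opcode values (not Mesa's enum) --
 * only the MUBUF store opcodes that remain reachable after this patch. */
enum class buf_op {
   buffer_store_byte,
   buffer_store_short,
   buffer_store_dword,
   buffer_store_dwordx2,
   buffer_store_dwordx3,
   buffer_store_dwordx4,
};

/* Mirrors the post-patch get_buffer_store_op(): no smem parameter, so the
 * opcode depends only on how many bytes the store writes. */
static buf_op get_buffer_store_op(unsigned bytes)
{
   switch (bytes) {
   case 1:  return buf_op::buffer_store_byte;
   case 2:  return buf_op::buffer_store_short;
   case 4:  return buf_op::buffer_store_dword;
   case 8:  return buf_op::buffer_store_dwordx2;
   case 12: return buf_op::buffer_store_dwordx3;
   case 16: return buf_op::buffer_store_dwordx4;
   }
   assert(!"Unexpected store size");
   return buf_op::buffer_store_dword;
}

int main()
{
   /* Every size split_buffer_store() can produce now maps to a MUBUF opcode;
    * s_buffer_store_* is never selected. */
   const unsigned sizes[] = {1, 2, 4, 8, 12, 16};
   for (unsigned bytes : sizes)
      std::printf("%2u bytes -> buf_op %d\n", bytes,
                  static_cast<int>(get_buffer_store_op(bytes)));
   return 0;
}

Once no SMEM store can be selected, the rest of the patch falls out mechanically: the p_fs_buffer_store_smem pseudo-instruction and its exec-mask lowering disappear, and so do Program::wb_smem_l1_on_end and the s_dcache_wb scalar-cache writeback that previously had to be emitted before s_endpgm.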