From 867323379e353a58e76a1340071751c43431f215 Mon Sep 17 00:00:00 2001
From: Rhys Perry
Date: Thu, 30 Jul 2020 19:44:28 +0100
Subject: [PATCH] aco: don't use SMEM for SSBO stores
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

fossil-db (Navi):
Totals from 70 (0.05% of 138791) affected shaders:
SGPRs: 2324 -> 2097 (-9.77%)
VGPRs: 1344 -> 1480 (+10.12%)
CodeSize: 157872 -> 154836 (-1.92%); split: -1.93%, +0.01%
MaxWaves: 1288 -> 1260 (-2.17%)
Instrs: 29730 -> 29108 (-2.09%); split: -2.13%, +0.04%
Cycles: 394944 -> 391280 (-0.93%); split: -0.94%, +0.01%
VMEM: 5288 -> 5695 (+7.70%); split: +11.97%, -4.27%
SMEM: 2680 -> 2444 (-8.81%); split: +1.34%, -10.15%
VClause: 291 -> 502 (+72.51%)
SClause: 1176 -> 918 (-21.94%)
Copies: 3549 -> 3517 (-0.90%); split: -1.80%, +0.90%
Branches: 1230 -> 1228 (-0.16%)
PreSGPRs: 1675 -> 1491 (-10.99%)
PreVGPRs: 1101 -> 1223 (+11.08%)

Totals from 70 (0.05% of 139517) affected shaders (RAVEN):
SGPRs: 2368 -> 2121 (-10.43%)
VGPRs: 1344 -> 1480 (+10.12%)
CodeSize: 156664 -> 153252 (-2.18%)
MaxWaves: 636 -> 622 (-2.20%)
Instrs: 29968 -> 29226 (-2.48%)
Cycles: 398284 -> 393492 (-1.20%)
VMEM: 5544 -> 5930 (+6.96%); split: +11.72%, -4.76%
SMEM: 2752 -> 2502 (-9.08%); split: +1.20%, -10.28%
VClause: 292 -> 504 (+72.60%)
SClause: 1236 -> 940 (-23.95%)
Copies: 3907 -> 3852 (-1.41%); split: -2.20%, +0.79%
Branches: 1230 -> 1228 (-0.16%)
PreSGPRs: 1671 -> 1487 (-11.01%)
PreVGPRs: 1102 -> 1225 (+11.16%)

Signed-off-by: Rhys Perry
Reviewed-by: Samuel Pitoiset
Reviewed-by: Daniel Schürmann
Part-of:
---
 src/amd/compiler/aco_insert_exec_mask.cpp          | 43 +---
 src/amd/compiler/aco_instruction_selection.cpp     | 92 ++++++----------------
 src/amd/compiler/aco_instruction_selection.h       | 16 ----
 .../compiler/aco_instruction_selection_setup.cpp   |  5 +-
 src/amd/compiler/aco_ir.h                          |  1 -
 src/amd/compiler/aco_lower_to_hw_instr.cpp         |  2 -
 src/amd/compiler/aco_opcodes.py                    |  2 -
 src/amd/compiler/tests/helpers.cpp                 |  5 +-
 src/amd/compiler/tests/test_isel.cpp               |  6 +-
 9 files changed, 28 insertions(+), 144 deletions(-)

diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp
index 638157c..4ccffcc 100644
--- a/src/amd/compiler/aco_insert_exec_mask.cpp
+++ b/src/amd/compiler/aco_insert_exec_mask.cpp
@@ -146,7 +146,7 @@ bool needs_exact(aco_ptr<Instruction>& instr) {
       FLAT_instruction *flat = static_cast<FLAT_instruction *>(instr.get());
       return flat->disable_wqm;
    } else {
-      return instr->format == Format::EXP || instr->opcode == aco_opcode::p_fs_buffer_store_smem;
+      return instr->format == Format::EXP;
    }
 }
 
@@ -648,43 +648,6 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
    return i;
 }
 
-void lower_fs_buffer_store_smem(Builder& bld, bool need_check, aco_ptr<Instruction>& instr, Temp cur_exec)
-{
-   Operand offset = instr->operands[1];
-   if (need_check) {
-      /* if exec is zero, then use UINT32_MAX as an offset and make this store a no-op */
-      Temp nonempty = bld.sopc(Builder::s_cmp_lg, bld.def(s1, scc), cur_exec, Operand(0u, bld.lm == s2));
-
-      if (offset.isLiteral())
-         offset = bld.copy(bld.def(s1), offset);
-
-      offset = bld.sop2(aco_opcode::s_cselect_b32, bld.hint_m0(bld.def(s1)),
-                        offset, Operand(UINT32_MAX), bld.scc(nonempty));
-   } else if (offset.isConstant() && offset.constantValue() > 0xFFFFF) {
-      offset = bld.copy(bld.hint_m0(bld.def(s1)), offset);
-   }
-   if (!offset.isConstant())
-      offset.setFixed(m0);
-
-   switch (instr->operands[2].size()) {
-   case 1:
-      instr->opcode = aco_opcode::s_buffer_store_dword;
-      break;
-   case 2:
-      instr->opcode = aco_opcode::s_buffer_store_dwordx2;
-      break;
-   case 4:
-      instr->opcode = aco_opcode::s_buffer_store_dwordx4;
-      break;
-   default:
-      unreachable("Invalid SMEM buffer store size");
-   }
-   instr->operands[1] = offset;
-   /* as_uniform() needs to be done here so it's done in exact mode and helper
-    * lanes don't contribute. */
-   instr->operands[2] = Operand(bld.as_uniform(instr->operands[2]));
-}
-
 void process_instructions(exec_ctx& ctx, Block* block,
                           std::vector<aco_ptr<Instruction>>& instructions,
                           unsigned idx)
@@ -831,10 +794,6 @@ void process_instructions(exec_ctx& ctx, Block* block,
          instr->operands[0] = bld.scc(exit_cond);
          state = Exact;
-      } else if (instr->opcode == aco_opcode::p_fs_buffer_store_smem) {
-         bool need_check = ctx.info[block->index].exec.size() != 1 &&
-                           !(ctx.info[block->index].exec[ctx.info[block->index].exec.size() - 2].second & Exact);
-         lower_fs_buffer_store_smem(bld, need_check, instr, ctx.info[block->index].exec.back().first);
       }
 
       bld.insert(std::move(instr));
 
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index ddf94bf..d07a721 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -3801,24 +3801,21 @@ unsigned calculate_lds_alignment(isel_context *ctx, unsigned const_offset)
 }
 
-aco_opcode get_buffer_store_op(bool smem, unsigned bytes)
+aco_opcode get_buffer_store_op(unsigned bytes)
 {
    switch (bytes) {
    case 1:
-      assert(!smem);
       return aco_opcode::buffer_store_byte;
    case 2:
-      assert(!smem);
       return aco_opcode::buffer_store_short;
    case 4:
-      return smem ? aco_opcode::s_buffer_store_dword : aco_opcode::buffer_store_dword;
+      return aco_opcode::buffer_store_dword;
    case 8:
-      return smem ? aco_opcode::s_buffer_store_dwordx2 : aco_opcode::buffer_store_dwordx2;
+      return aco_opcode::buffer_store_dwordx2;
    case 12:
-      assert(!smem);
       return aco_opcode::buffer_store_dwordx3;
    case 16:
-      return smem ? aco_opcode::s_buffer_store_dwordx4 : aco_opcode::buffer_store_dwordx4;
+      return aco_opcode::buffer_store_dwordx4;
    }
    unreachable("Unexpected store size");
    return aco_opcode::num_opcodes;
 }
@@ -3943,7 +3940,7 @@ void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, T
    assert(vdata.size() >= 1 && vdata.size() <= 4);
 
    Builder bld(ctx->program, ctx->block);
-   aco_opcode op = get_buffer_store_op(false, vdata.bytes());
+   aco_opcode op = get_buffer_store_op(vdata.bytes());
    const_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset);
 
    Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
@@ -6198,70 +6195,29 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
    memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
    bool glc = nir_intrinsic_access(instr) &
              (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
-   uint32_t flags = get_all_buffer_resource_flags(ctx, instr->src[1].ssa, nir_intrinsic_access(instr));
-   /* GLC bypasses VMEM/SMEM caches, so GLC SMEM loads/stores are coherent with GLC VMEM loads/stores
-    * TODO: this optimization is disabled for now because we still need to ensure correct ordering
-    */
-   bool allow_smem = !(flags & (0 && glc ? has_nonglc_vmem_loadstore : has_vmem_loadstore));
-
-   bool smem = !nir_src_is_divergent(instr->src[2]) &&
-               ctx->options->chip_class >= GFX8 &&
-               ctx->options->chip_class < GFX10_3 &&
-               (elem_size_bytes >= 4 || can_subdword_ssbo_store_use_smem(instr)) &&
-               allow_smem;
-   if (smem)
-      offset = bld.as_uniform(offset);
-   bool smem_nonfs = smem && ctx->stage != fragment_fs;
 
    unsigned write_count = 0;
    Temp write_datas[32];
    unsigned offsets[32];
-   split_buffer_store(ctx, instr, smem, smem_nonfs ? RegType::sgpr : (smem ? data.type() : RegType::vgpr),
+   split_buffer_store(ctx, instr, false, RegType::vgpr,
                       data, writemask, 16, &write_count, write_datas, offsets);
 
    for (unsigned i = 0; i < write_count; i++) {
-      aco_opcode op = get_buffer_store_op(smem, write_datas[i].bytes());
-      if (smem && ctx->stage == fragment_fs)
-         op = aco_opcode::p_fs_buffer_store_smem;
-
-      if (smem) {
-         aco_ptr<SMEM_instruction> store{create_instruction<SMEM_instruction>(op, Format::SMEM, 3, 0)};
-         store->operands[0] = Operand(rsrc);
-         if (offsets[i]) {
-            Temp off = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
-                                      offset, Operand(offsets[i]));
-            store->operands[1] = Operand(off);
-         } else {
-            store->operands[1] = Operand(offset);
-         }
-         if (op != aco_opcode::p_fs_buffer_store_smem)
-            store->operands[1].setFixed(m0);
-         store->operands[2] = Operand(write_datas[i]);
-         store->glc = glc;
-         store->dlc = false;
-         store->disable_wqm = true;
-         store->sync = sync;
-         ctx->block->instructions.emplace_back(std::move(store));
-         ctx->program->wb_smem_l1_on_end = true;
-         if (op == aco_opcode::p_fs_buffer_store_smem) {
-            ctx->block->kind |= block_kind_needs_lowering;
-            ctx->program->needs_exact = true;
-         }
-      } else {
-         aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
-         store->operands[0] = Operand(rsrc);
-         store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
-         store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
-         store->operands[3] = Operand(write_datas[i]);
-         store->offset = offsets[i];
-         store->offen = (offset.type() == RegType::vgpr);
-         store->glc = glc;
-         store->dlc = false;
-         store->disable_wqm = true;
-         store->sync = sync;
-         ctx->program->needs_exact = true;
-         ctx->block->instructions.emplace_back(std::move(store));
-      }
+      aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
+
+      aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
+      store->operands[0] = Operand(rsrc);
+      store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
+      store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
+      store->operands[3] = Operand(write_datas[i]);
+      store->offset = offsets[i];
+      store->offen = (offset.type() == RegType::vgpr);
+      store->glc = glc;
+      store->dlc = false;
+      store->disable_wqm = true;
+      store->sync = sync;
+      ctx->program->needs_exact = true;
+      ctx->block->instructions.emplace_back(std::move(store));
    }
 }
 
@@ -6469,7 +6425,7 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
       } else {
          assert(ctx->options->chip_class == GFX6);
 
-         aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
+         aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
 
         Temp rsrc = get_gfx6_global_rsrc(bld, addr);
 
@@ -6906,7 +6862,7 @@ void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
                       swizzle_component_size, &write_count, write_datas, offsets);
 
    for (unsigned i = 0; i < write_count; i++) {
-      aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
+      aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
       Instruction *instr = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true, true);
       static_cast<MUBUF_instruction *>(instr)->sync = memory_sync_info(storage_scratch, semantic_private);
    }
@@ -11863,8 +11819,6 @@ void select_program(Program *program,
    append_logical_end(ctx.block);
    ctx.block->kind |= block_kind_uniform;
    Builder bld(ctx.program, ctx.block);
-   if (ctx.program->wb_smem_l1_on_end)
-      bld.smem(aco_opcode::s_dcache_wb, memory_sync_info(storage_buffer, semantic_volatile));
    bld.sopp(aco_opcode::s_endpgm);
 
    cleanup_cfg(program);
diff --git a/src/amd/compiler/aco_instruction_selection.h b/src/amd/compiler/aco_instruction_selection.h
index e145c8f..49352c1 100644
--- a/src/amd/compiler/aco_instruction_selection.h
+++ b/src/amd/compiler/aco_instruction_selection.h
@@ -46,8 +46,6 @@ enum resource_flags {
    has_nonglc_vmem_store = 0x8,
 
    has_vmem_store = has_glc_vmem_store | has_nonglc_vmem_store,
-   has_vmem_loadstore = has_vmem_store | has_glc_vmem_load | has_nonglc_vmem_load,
-   has_nonglc_vmem_loadstore = has_nonglc_vmem_load | has_nonglc_vmem_store,
 
    buffer_is_restrict = 0x10,
 };
@@ -200,20 +198,6 @@ inline uint8_t get_all_buffer_resource_flags(isel_context *ctx, nir_ssa_def *def
    return res;
 }
 
-inline bool can_subdword_ssbo_store_use_smem(nir_intrinsic_instr *intrin)
-{
-   unsigned wrmask = nir_intrinsic_write_mask(intrin);
-   if (util_last_bit(wrmask) != util_bitcount(wrmask) ||
-       util_bitcount(wrmask) * intrin->src[0].ssa->bit_size % 32 ||
-       util_bitcount(wrmask) != intrin->src[0].ssa->num_components)
-      return false;
-
-   if (nir_intrinsic_align_mul(intrin) % 4 || nir_intrinsic_align_offset(intrin) % 4)
-      return false;
-
-   return true;
-}
-
 void init_context(isel_context *ctx, nir_shader *shader);
 void cleanup_context(isel_context *ctx);
 
diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp
index e2a0148..fb17618 100644
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@@ -189,10 +189,7 @@ void fill_desc_set_info(isel_context *ctx, nir_function_impl *impl)
          res = intrin->src[0].ssa;
         break;
      case nir_intrinsic_store_ssbo:
-        if (nir_src_is_divergent(intrin->src[2]) ||
-            ctx->program->chip_class < GFX8 || ctx->program->chip_class >= GFX10_3 ||
-            (intrin->src[0].ssa->bit_size < 32 && !can_subdword_ssbo_store_use_smem(intrin)))
-           flags |= glc ? has_glc_vmem_store : has_nonglc_vmem_store;
+        flags |= glc ? has_glc_vmem_store : has_nonglc_vmem_store;
         res = intrin->src[1].ssa;
         break;
      case nir_intrinsic_load_global:
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 6387c90..1eafcd5 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -1640,7 +1640,6 @@ public:
    Stage stage;
    bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
    bool needs_wqm = false; /* there exists a p_wqm instruction */
-   bool wb_smem_l1_on_end = false;
 
    std::vector<uint8_t> constant_data;
    Temp private_segment_buffer;
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index f7a4012..0a015ad 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -1875,8 +1875,6 @@ void lower_to_hw_instr(Program* program)
             bld.reset(discard_block);
             bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
                     0, V_008DFC_SQ_EXP_NULL, false, true, true);
-            if (program->wb_smem_l1_on_end)
-               bld.smem(aco_opcode::s_dcache_wb);
             bld.sopp(aco_opcode::s_endpgm);
 
             bld.reset(&ctx.instructions);
diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py
index 95fa9c9..4a839db 100644
--- a/src/amd/compiler/aco_opcodes.py
+++ b/src/amd/compiler/aco_opcodes.py
@@ -285,8 +285,6 @@
 opcode("p_demote_to_helper")
 opcode("p_is_helper")
 opcode("p_exit_early_if")
 
-opcode("p_fs_buffer_store_smem", format=Format.SMEM)
-
 # simulates proper bpermute behavior when it's unsupported, eg. GFX10 wave64
 opcode("p_bpermute")
diff --git a/src/amd/compiler/tests/helpers.cpp b/src/amd/compiler/tests/helpers.cpp
index 55843c1..ca698e7 100644
--- a/src/amd/compiler/tests/helpers.cpp
+++ b/src/amd/compiler/tests/helpers.cpp
@@ -136,10 +136,7 @@ void finish_program(Program *program)
    for (Block& block : program->blocks) {
       if (block.linear_succs.size() == 0) {
          block.kind |= block_kind_uniform;
-         Builder bld(program, &block);
-         if (program->wb_smem_l1_on_end)
-            bld.smem(aco_opcode::s_dcache_wb, false);
-         bld.sopp(aco_opcode::s_endpgm);
+         Builder(program, &block).sopp(aco_opcode::s_endpgm);
      }
   }
 }
diff --git a/src/amd/compiler/tests/test_isel.cpp b/src/amd/compiler/tests/test_isel.cpp
index 48425b7..a2212b3 100644
--- a/src/amd/compiler/tests/test_isel.cpp
+++ b/src/amd/compiler/tests/test_isel.cpp
@@ -67,10 +67,8 @@ BEGIN_TEST(isel.compute.simple)
       uint res;
    };
    void main() {
-      //~gfx7>> v1: %data = p_parallelcopy 42
-      //~gfx7>> buffer_store_dword %_, v1: undef, 0, %data disable_wqm storage:buffer semantics: scope:invocation
-      //~gfx8>> s1: %data = p_parallelcopy 42
-      //~gfx8>> s_buffer_store_dword %_, 0, %data storage:buffer semantics: scope:invocation
+      //>> v1: %data = p_parallelcopy 42
+      //>> buffer_store_dword %_, v1: undef, 0, %data disable_wqm storage:buffer semantics: scope:invocation
       res = 42;
    }
   );
-- 
2.7.4
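
The core of the change is visible in get_buffer_store_op(): with the `smem` flag removed, every SSBO store selects a MUBUF opcode keyed only on the size in bytes of the (already split) store. Below is a minimal, self-contained sketch of that mapping; the `buf_op` enum and the `main()` driver are illustrative stand-ins for this note, not definitions taken from Mesa.

#include <cassert>
#include <cstdio>

/* Illustrative stand-in for the relevant aco_opcode values (not Mesa's enum) --
 * only the MUBUF store opcodes that remain reachable after this patch. */
enum class buf_op {
   buffer_store_byte,
   buffer_store_short,
   buffer_store_dword,
   buffer_store_dwordx2,
   buffer_store_dwordx3,
   buffer_store_dwordx4,
};

/* Mirrors the post-patch get_buffer_store_op(): no smem parameter, so the
 * opcode depends only on how many bytes the store writes. */
static buf_op get_buffer_store_op(unsigned bytes)
{
   switch (bytes) {
   case 1:  return buf_op::buffer_store_byte;
   case 2:  return buf_op::buffer_store_short;
   case 4:  return buf_op::buffer_store_dword;
   case 8:  return buf_op::buffer_store_dwordx2;
   case 12: return buf_op::buffer_store_dwordx3;
   case 16: return buf_op::buffer_store_dwordx4;
   }
   assert(!"Unexpected store size");
   return buf_op::buffer_store_dword;
}

int main()
{
   /* Every size split_buffer_store() can produce now maps to a MUBUF opcode;
    * s_buffer_store_* is never selected. */
   const unsigned sizes[] = {1, 2, 4, 8, 12, 16};
   for (unsigned bytes : sizes)
      std::printf("%2u bytes -> buf_op %d\n", bytes,
                  static_cast<int>(get_buffer_store_op(bytes)));
   return 0;
}

Once no SMEM store can be selected, the rest of the patch falls out mechanically: the p_fs_buffer_store_smem pseudo-instruction and its exec-mask lowering disappear, and so do Program::wb_smem_l1_on_end and the s_dcache_wb scalar-cache writeback that previously had to be emitted before s_endpgm.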