ac/nir/ngg: Slightly improve attribute ring offset calculation.
author Timur Kristóf <timur.kristof@gmail.com>
Thu, 30 Mar 2023 21:44:18 +0000 (23:44 +0200)
committer Marge Bot <emma+marge@anholt.net>
Fri, 31 Mar 2023 17:02:17 +0000 (17:02 +0000)
Inspired by Nicolai Hähnle's commit in LLPC.
Instead of using a SALU instruction to add to the scalar
offset, rely on the buffer swizzling and use a constant offset.
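
The pattern is roughly the following (a simplified sketch assuming a
nir_builder *b and the same operands as in the diff below; "value" is a
placeholder, and this is not a literal excerpt):

    /* Before: a SALU add folds the per-attribute offset into the
     * scalar offset of the swizzled buffer store. */
    nir_ssa_def *soffset = nir_iadd_imm(b, attr_offset, offset * 16 * 32);
    nir_store_buffer_amd(b, value, attr_rsrc, voffset, soffset, vindex,
                         .access = ACCESS_COHERENT | ACCESS_IS_SWIZZLED_AMD);

    /* After: keep attr_offset unchanged and express the per-attribute
     * offset as a constant offset (.base), relying on the swizzled
     * addressing, which saves the scalar add. */
    nir_store_buffer_amd(b, value, attr_rsrc, voffset, attr_offset, vindex,
                         .base = offset * 16,
                         .access = ACCESS_COHERENT | ACCESS_IS_SWIZZLED_AMD);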

Fossil DB stats on GFX1100:

Totals from 47910 (35.51% of 134913) affected shaders:
CodeSize: 87927612 -> 86968136 (-1.09%)
Instrs: 17584007 -> 17440094 (-0.82%)
Latency: 97232173 -> 97126311 (-0.11%)
InvThroughput: 9904586 -> 9905288 (+0.01%); split: -0.02%, +0.02%
VClause: 544430 -> 542566 (-0.34%)

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22227>

src/amd/common/ac_nir_lower_ngg.c

index b7deb08..c2ed883 100644
@@ -2229,12 +2229,11 @@ export_vertex_params_gfx11(nir_builder *b, nir_ssa_def *export_tid, nir_ssa_def
       if (exported_params & BITFIELD_BIT(offset))
          continue;
 
-      nir_ssa_def *soffset = nir_iadd_imm(b, attr_offset, offset * 16 * 32);
-
       nir_ssa_def *comp[4];
       for (unsigned j = 0; j < 4; j++)
          comp[j] = outputs[i].chan[j] ? outputs[i].chan[j] : undef;
-      nir_store_buffer_amd(b, nir_vec(b, comp, 4), attr_rsrc, voffset, soffset, vindex,
+      nir_store_buffer_amd(b, nir_vec(b, comp, 4), attr_rsrc, voffset, attr_offset, vindex,
+                           .base = offset * 16,
                            .memory_modes = nir_var_shader_out,
                            .access = ACCESS_COHERENT | ACCESS_IS_SWIZZLED_AMD);
       exported_params |= BITFIELD_BIT(offset);
@@ -3750,10 +3749,11 @@ ms_store_arrayed_output_intrin(nir_builder *b,
        * (Also much better than storing and reloading from the scratch ring.)
        */
       const nir_io_semantics io_sem = nir_intrinsic_io_semantics(intrin);
+      unsigned param_offset = s->vs_output_param_offset[io_sem.location];
       nir_ssa_def *ring = nir_load_ring_attr_amd(b);
       nir_ssa_def *soffset = nir_load_ring_attr_offset_amd(b);
-      soffset = nir_iadd_imm(b, soffset, s->vs_output_param_offset[io_sem.location] * 16 * 32);
-      nir_store_buffer_amd(b, store_val, ring, base_addr_off, soffset, arr_index, .base = const_off,
+      nir_store_buffer_amd(b, store_val, ring, base_addr_off, soffset, arr_index,
+                           .base = const_off + param_offset * 16,
                            .memory_modes = nir_var_shader_out,
                            .access = ACCESS_COHERENT | ACCESS_IS_SWIZZLED_AMD);
    } else if (out_mode == ms_out_mode_var) {