radv: move computing NGG info and GS info to radv_nir_shader_info_link()

author Samuel Pitoiset <samuel.pitoiset@gmail.com>

Fri, 26 Aug 2022 10:03:11 +0000 (12:03 +0200)

committer Marge Bot <emma+marge@anholt.net>

Thu, 1 Sep 2022 17:02:17 +0000 (17:02 +0000)
author Samuel Pitoiset <samuel.pitoiset@gmail.com>
Fri, 26 Aug 2022 10:03:11 +0000 (12:03 +0200)
committer Marge Bot <emma+marge@anholt.net>
Thu, 1 Sep 2022 17:02:17 +0000 (17:02 +0000)
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c

index b74a6ab..0452dec 100644 (file)
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -1894,142 +1894,6 @@ radv_pipeline_init_depth_stencil_state(struct radv_graphics_pipeline *pipeline,
  }
  
  static void
-gfx9_get_gs_info(const struct radv_device *device, struct radv_pipeline_stage *es_stage,
-                 struct radv_pipeline_stage *gs_stage)
-{
-   const enum amd_gfx_level gfx_level = device->physical_device->rad_info.gfx_level;
-   struct radv_shader_info *gs_info = &gs_stage->info;
-   struct radv_shader_info *es_info = &es_stage->info;
-   struct gfx9_gs_info *out = &gs_stage->info.gs_ring_info;
-
-   const unsigned gs_num_invocations = MAX2(gs_info->gs.invocations, 1);
-   const bool uses_adjacency = gs_info->gs.input_prim == SHADER_PRIM_LINES_ADJACENCY ||
-                               gs_info->gs.input_prim == SHADER_PRIM_TRIANGLES_ADJACENCY;
-
-   /* All these are in dwords: */
-   /* We can't allow using the whole LDS, because GS waves compete with
-    * other shader stages for LDS space. */
-   const unsigned max_lds_size = 8 * 1024;
-   const unsigned esgs_itemsize = es_info->esgs_itemsize / 4;
-   unsigned esgs_lds_size;
-
-   /* All these are per subgroup: */
-   const unsigned max_out_prims = 32 * 1024;
-   const unsigned max_es_verts = 255;
-   const unsigned ideal_gs_prims = 64;
-   unsigned max_gs_prims, gs_prims;
-   unsigned min_es_verts, es_verts, worst_case_es_verts;
-
-   if (uses_adjacency || gs_num_invocations > 1)
-      max_gs_prims = 127 / gs_num_invocations;
-   else
-      max_gs_prims = 255;
-
-   /* MAX_PRIMS_PER_SUBGROUP = gs_prims * max_vert_out * gs_invocations.
-    * Make sure we don't go over the maximum value.
-    */
-   if (gs_info->gs.vertices_out > 0) {
-      max_gs_prims =
-         MIN2(max_gs_prims, max_out_prims / (gs_info->gs.vertices_out * gs_num_invocations));
-   }
-   assert(max_gs_prims > 0);
-
-   /* If the primitive has adjacency, halve the number of vertices
-    * that will be reused in multiple primitives.
-    */
-   min_es_verts = gs_info->gs.vertices_in / (uses_adjacency ? 2 : 1);
-
-   gs_prims = MIN2(ideal_gs_prims, max_gs_prims);
-   worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);
-
-   /* Compute ESGS LDS size based on the worst case number of ES vertices
-    * needed to create the target number of GS prims per subgroup.
-    */
-   esgs_lds_size = esgs_itemsize * worst_case_es_verts;
-
-   /* If total LDS usage is too big, refactor partitions based on ratio
-    * of ESGS item sizes.
-    */
-   if (esgs_lds_size > max_lds_size) {
-      /* Our target GS Prims Per Subgroup was too large. Calculate
-       * the maximum number of GS Prims Per Subgroup that will fit
-       * into LDS, capped by the maximum that the hardware can support.
-       */
-      gs_prims = MIN2((max_lds_size / (esgs_itemsize * min_es_verts)), max_gs_prims);
-      assert(gs_prims > 0);
-      worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);
-
-      esgs_lds_size = esgs_itemsize * worst_case_es_verts;
-      assert(esgs_lds_size <= max_lds_size);
-   }
-
-   /* Now calculate remaining ESGS information. */
-   if (esgs_lds_size)
-      es_verts = MIN2(esgs_lds_size / esgs_itemsize, max_es_verts);
-   else
-      es_verts = max_es_verts;
-
-   /* Vertices for adjacency primitives are not always reused, so restore
-    * it for ES_VERTS_PER_SUBGRP.
-    */
-   min_es_verts = gs_info->gs.vertices_in;
-
-   /* For normal primitives, the VGT only checks if they are past the ES
-    * verts per subgroup after allocating a full GS primitive and if they
-    * are, kick off a new subgroup.  But if those additional ES verts are
-    * unique (e.g. not reused) we need to make sure there is enough LDS
-    * space to account for those ES verts beyond ES_VERTS_PER_SUBGRP.
-    */
-   es_verts -= min_es_verts - 1;
-
-   const uint32_t es_verts_per_subgroup = es_verts;
-   const uint32_t gs_prims_per_subgroup = gs_prims;
-   const uint32_t gs_inst_prims_in_subgroup = gs_prims * gs_num_invocations;
-   const uint32_t max_prims_per_subgroup = gs_inst_prims_in_subgroup * gs_info->gs.vertices_out;
-   out->lds_size = align(esgs_lds_size, 128) / 128;
-   out->vgt_gs_onchip_cntl = S_028A44_ES_VERTS_PER_SUBGRP(es_verts_per_subgroup) |
-                             S_028A44_GS_PRIMS_PER_SUBGRP(gs_prims_per_subgroup) |
-                             S_028A44_GS_INST_PRIMS_IN_SUBGRP(gs_inst_prims_in_subgroup);
-   out->vgt_gs_max_prims_per_subgroup = S_028A94_MAX_PRIMS_PER_SUBGROUP(max_prims_per_subgroup);
-   out->vgt_esgs_ring_itemsize = esgs_itemsize;
-   assert(max_prims_per_subgroup <= max_out_prims);
-
-   unsigned workgroup_size = ac_compute_esgs_workgroup_size(gfx_level, es_info->wave_size,
-      es_verts_per_subgroup, gs_inst_prims_in_subgroup);
-   es_info->workgroup_size = workgroup_size;
-   gs_info->workgroup_size = workgroup_size;
-}
-
-static void
-clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts, unsigned min_verts_per_prim,
-                         bool use_adjacency)
-{
-   unsigned max_reuse = max_esverts - min_verts_per_prim;
-   if (use_adjacency)
-      max_reuse /= 2;
-   *max_gsprims = MIN2(*max_gsprims, 1 + max_reuse);
-}
-
-static unsigned
-radv_get_num_input_vertices(const struct radv_pipeline_stage *es_stage,
-                            const struct radv_pipeline_stage *gs_stage)
-{
-   if (gs_stage) {
-      return gs_stage->nir->info.gs.vertices_in;
-   }
-
-   if (es_stage->stage == MESA_SHADER_TESS_EVAL) {
-      if (es_stage->nir->info.tess.point_mode)
-         return 1;
-      if (es_stage->nir->info.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES)
-         return 2;
-      return 3;
-   }
-
-   return 3;
-}
-
-static void
  gfx10_emit_ge_pc_alloc(struct radeon_cmdbuf *cs, enum amd_gfx_level gfx_level,
                         uint32_t oversub_pc_lines)
  {
@@ -2038,245 +1902,6 @@ gfx10_emit_ge_pc_alloc(struct radeon_cmdbuf *cs, enum amd_gfx_level gfx_level,
        S_030980_OVERSUB_EN(oversub_pc_lines > 0) | S_030980_NUM_PC_LINES(oversub_pc_lines - 1));
  }
  
-static unsigned
-radv_get_pre_rast_input_topology(const struct radv_pipeline_stage *es_stage,
-                                 const struct radv_pipeline_stage *gs_stage)
-{
-   if (gs_stage) {
-      return gs_stage->nir->info.gs.input_primitive;
-   }
-
-   if (es_stage->stage == MESA_SHADER_TESS_EVAL) {
-      if (es_stage->nir->info.tess.point_mode)
-         return SHADER_PRIM_POINTS;
-      if (es_stage->nir->info.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES)
-         return SHADER_PRIM_LINES;
-      return SHADER_PRIM_TRIANGLES;
-   }
-
-   return SHADER_PRIM_TRIANGLES;
-}
-
-static void
-gfx10_get_ngg_info(const struct radv_device *device, struct radv_pipeline_stage *es_stage,
-                   struct radv_pipeline_stage *gs_stage)
-{
-   const enum amd_gfx_level gfx_level = device->physical_device->rad_info.gfx_level;
-   struct radv_shader_info *gs_info = gs_stage ? &gs_stage->info : NULL;
-   struct radv_shader_info *es_info = &es_stage->info;
-   const unsigned max_verts_per_prim = radv_get_num_input_vertices(es_stage, gs_stage);
-   const unsigned min_verts_per_prim = gs_stage ? max_verts_per_prim : 1;
-   struct gfx10_ngg_info *out = gs_stage ? &gs_info->ngg_info : &es_info->ngg_info;
-
-   const unsigned gs_num_invocations = gs_stage ? MAX2(gs_info->gs.invocations, 1) : 1;
-
-   const unsigned input_prim = radv_get_pre_rast_input_topology(es_stage, gs_stage);
-   const bool uses_adjacency = input_prim == SHADER_PRIM_LINES_ADJACENCY ||
-                               input_prim == SHADER_PRIM_TRIANGLES_ADJACENCY;
-
-   /* All these are in dwords: */
-   /* We can't allow using the whole LDS, because GS waves compete with
-    * other shader stages for LDS space.
-    *
-    * TODO: We should really take the shader's internal LDS use into
-    *       account. The linker will fail if the size is greater than
-    *       8K dwords.
-    */
-   const unsigned max_lds_size = 8 * 1024 - 768;
-   const unsigned target_lds_size = max_lds_size;
-   unsigned esvert_lds_size = 0;
-   unsigned gsprim_lds_size = 0;
-
-   /* All these are per subgroup: */
-   const unsigned min_esverts = gfx_level >= GFX10_3 ? 29 : 24;
-   bool max_vert_out_per_gs_instance = false;
-   unsigned max_esverts_base = 128;
-   unsigned max_gsprims_base = 128; /* default prim group size clamp */
-
-   /* Hardware has the following non-natural restrictions on the value
-    * of GE_CNTL.VERT_GRP_SIZE based on based on the primitive type of
-    * the draw:
-    *  - at most 252 for any line input primitive type
-    *  - at most 251 for any quad input primitive type
-    *  - at most 251 for triangle strips with adjacency (this happens to
-    *    be the natural limit for triangle *lists* with adjacency)
-    */
-   max_esverts_base = MIN2(max_esverts_base, 251 + max_verts_per_prim - 1);
-
-   if (gs_stage) {
-      unsigned max_out_verts_per_gsprim = gs_info->gs.vertices_out * gs_num_invocations;
-
-      if (max_out_verts_per_gsprim <= 256) {
-         if (max_out_verts_per_gsprim) {
-            max_gsprims_base = MIN2(max_gsprims_base, 256 / max_out_verts_per_gsprim);
-         }
-      } else {
-         /* Use special multi-cycling mode in which each GS
-          * instance gets its own subgroup. Does not work with
-          * tessellation. */
-         max_vert_out_per_gs_instance = true;
-         max_gsprims_base = 1;
-         max_out_verts_per_gsprim = gs_info->gs.vertices_out;
-      }
-
-      esvert_lds_size = es_info->esgs_itemsize / 4;
-      gsprim_lds_size = (gs_info->gs.gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim;
-   } else {
-      /* VS and TES. */
-      /* LDS size for passing data from GS to ES. */
-      struct radv_streamout_info *so_info = &es_info->so;
-
-      if (so_info->num_outputs)
-         esvert_lds_size = 4 * so_info->num_outputs + 1;
-
-      /* GS stores Primitive IDs (one DWORD) into LDS at the address
-       * corresponding to the ES thread of the provoking vertex. All
-       * ES threads load and export PrimitiveID for their thread.
-       */
-      if (es_stage->stage == MESA_SHADER_VERTEX && es_stage->info.outinfo.export_prim_id)
-         esvert_lds_size = MAX2(esvert_lds_size, 1);
-   }
-
-   unsigned max_gsprims = max_gsprims_base;
-   unsigned max_esverts = max_esverts_base;
-
-   if (esvert_lds_size)
-      max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size);
-   if (gsprim_lds_size)
-      max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size);
-
-   max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
-   clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
-   assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
-
-   if (esvert_lds_size || gsprim_lds_size) {
-      /* Now that we have a rough proportionality between esverts
-       * and gsprims based on the primitive type, scale both of them
-       * down simultaneously based on required LDS space.
-       *
-       * We could be smarter about this if we knew how much vertex
-       * reuse to expect.
-       */
-      unsigned lds_total = max_esverts * esvert_lds_size + max_gsprims * gsprim_lds_size;
-      if (lds_total > target_lds_size) {
-         max_esverts = max_esverts * target_lds_size / lds_total;
-         max_gsprims = max_gsprims * target_lds_size / lds_total;
-
-         max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
-         clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
-         assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
-      }
-   }
-
-   /* Round up towards full wave sizes for better ALU utilization. */
-   if (!max_vert_out_per_gs_instance) {
-      unsigned orig_max_esverts;
-      unsigned orig_max_gsprims;
-      unsigned wavesize;
-
-      if (gs_stage) {
-         wavesize = gs_info->wave_size;
-      } else {
-         wavesize = es_info->wave_size;
-      }
-
-      do {
-         orig_max_esverts = max_esverts;
-         orig_max_gsprims = max_gsprims;
-
-         max_esverts = align(max_esverts, wavesize);
-         max_esverts = MIN2(max_esverts, max_esverts_base);
-         if (esvert_lds_size)
-            max_esverts =
-               MIN2(max_esverts, (max_lds_size - max_gsprims * gsprim_lds_size) / esvert_lds_size);
-         max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
-
-         /* Hardware restriction: minimum value of max_esverts */
-         if (gfx_level == GFX10)
-            max_esverts = MAX2(max_esverts, min_esverts - 1 + max_verts_per_prim);
-         else
-            max_esverts = MAX2(max_esverts, min_esverts);
-
-         max_gsprims = align(max_gsprims, wavesize);
-         max_gsprims = MIN2(max_gsprims, max_gsprims_base);
-         if (gsprim_lds_size) {
-            /* Don't count unusable vertices to the LDS
-             * size. Those are vertices above the maximum
-             * number of vertices that can occur in the
-             * workgroup, which is e.g. max_gsprims * 3
-             * for triangles.
-             */
-            unsigned usable_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
-            max_gsprims = MIN2(max_gsprims,
-                               (max_lds_size - usable_esverts * esvert_lds_size) / gsprim_lds_size);
-         }
-         clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
-         assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
-      } while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims);
-
-      /* Verify the restriction. */
-      if (gfx_level == GFX10)
-         assert(max_esverts >= min_esverts - 1 + max_verts_per_prim);
-      else
-         assert(max_esverts >= min_esverts);
-   } else {
-      /* Hardware restriction: minimum value of max_esverts */
-      if (gfx_level == GFX10)
-         max_esverts = MAX2(max_esverts, min_esverts - 1 + max_verts_per_prim);
-      else
-         max_esverts = MAX2(max_esverts, min_esverts);
-   }
-
-   unsigned max_out_vertices = max_vert_out_per_gs_instance ? gs_info->gs.vertices_out
-                               : gs_stage
-                                  ? max_gsprims * gs_num_invocations * gs_info->gs.vertices_out
-                                  : max_esverts;
-   assert(max_out_vertices <= 256);
-
-   unsigned prim_amp_factor = 1;
-   if (gs_stage) {
-      /* Number of output primitives per GS input primitive after
-       * GS instancing. */
-      prim_amp_factor = gs_info->gs.vertices_out;
-   }
-
-   /* On Gfx10, the GE only checks against the maximum number of ES verts
-    * after allocating a full GS primitive. So we need to ensure that
-    * whenever this check passes, there is enough space for a full
-    * primitive without vertex reuse.
-    */
-   if (gfx_level == GFX10)
-      out->hw_max_esverts = max_esverts - max_verts_per_prim + 1;
-   else
-      out->hw_max_esverts = max_esverts;
-
-   out->max_gsprims = max_gsprims;
-   out->max_out_verts = max_out_vertices;
-   out->prim_amp_factor = prim_amp_factor;
-   out->max_vert_out_per_gs_instance = max_vert_out_per_gs_instance;
-   out->ngg_emit_size = max_gsprims * gsprim_lds_size;
-   out->enable_vertex_grouping = true;
-
-   /* Don't count unusable vertices. */
-   out->esgs_ring_size = MIN2(max_esverts, max_gsprims * max_verts_per_prim) * esvert_lds_size * 4;
-
-   if (gs_stage) {
-      out->vgt_esgs_ring_itemsize = es_info->esgs_itemsize / 4;
-   } else {
-      out->vgt_esgs_ring_itemsize = 1;
-   }
-
-   assert(out->hw_max_esverts >= min_esverts); /* HW limitation */
-
-   unsigned workgroup_size =
-      ac_compute_ngg_workgroup_size(
-         max_esverts, max_gsprims * gs_num_invocations, max_out_vertices, prim_amp_factor);
-   if (gs_stage) {
-      gs_info->workgroup_size = workgroup_size;
-   }
-   es_info->workgroup_size = workgroup_size;
-}
-
  static void
  radv_pipeline_init_gs_ring_state(struct radv_graphics_pipeline *pipeline, const struct gfx9_gs_info *gs)
  {
@@ -3328,7 +2953,7 @@ radv_fill_shader_info(struct radv_pipeline *pipeline,
                                  &stages[i].info);
     }
  
-   radv_nir_shader_info_link(device, pipeline_key, stages, last_vgt_api_stage);
+   radv_nir_shader_info_link(device, pipeline_key, stages, pipeline_has_ngg, last_vgt_api_stage);
  
     if (stages[MESA_SHADER_TESS_CTRL].nir) {
        for (gl_shader_stage s = MESA_SHADER_VERTEX; s <= MESA_SHADER_TESS_CTRL; ++s) {
@@ -3359,21 +2984,7 @@ radv_fill_shader_info(struct radv_pipeline *pipeline,
              stages[MESA_SHADER_TASK].nir->info.workgroup_size, false, UINT32_MAX);
     }
  
-   if (pipeline_has_ngg) {
-      if (last_vgt_api_stage != MESA_SHADER_MESH) {
-         struct radv_pipeline_stage *es_stage =
-            stages[MESA_SHADER_TESS_EVAL].nir ? &stages[MESA_SHADER_TESS_EVAL] : &stages[MESA_SHADER_VERTEX];
-         struct radv_pipeline_stage *gs_stage =
-            stages[MESA_SHADER_GEOMETRY].nir ? &stages[MESA_SHADER_GEOMETRY] : NULL;
-
-         gfx10_get_ngg_info(device, es_stage, gs_stage);
-      }
-   } else if (stages[MESA_SHADER_GEOMETRY].nir) {
-      struct radv_pipeline_stage *es_stage =
-         stages[MESA_SHADER_TESS_EVAL].nir ? &stages[MESA_SHADER_TESS_EVAL] : &stages[MESA_SHADER_VERTEX];
-
-      gfx9_get_gs_info(device, es_stage, &stages[MESA_SHADER_GEOMETRY]);
-   } else {
+   if (!pipeline_has_ngg && !stages[MESA_SHADER_GEOMETRY].nir) {
        gl_shader_stage hw_vs_api_stage =
           stages[MESA_SHADER_TESS_EVAL].nir ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX;
        stages[hw_vs_api_stage].info.workgroup_size = stages[hw_vs_api_stage].info.wave_size;
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h

index 3c523fa..c14eb51 100644 (file)
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -2804,7 +2804,7 @@ void radv_nir_shader_info_init(struct radv_shader_info *info);
  
  void radv_nir_shader_info_link(struct radv_device *device,
                                 const struct radv_pipeline_key *pipeline_key,
-                               struct radv_pipeline_stage *stages,
+                               struct radv_pipeline_stage *stages, bool pipeline_has_ngg,
                                 gl_shader_stage last_vgt_api_stage);
  
  bool radv_thread_trace_init(struct radv_device *device);
diff --git a/src/amd/vulkan/radv_shader_info.c b/src/amd/vulkan/radv_shader_info.c

index ae192f5..c811df0 100644 (file)
--- a/src/amd/vulkan/radv_shader_info.c
+++ b/src/amd/vulkan/radv_shader_info.c
@@ -798,9 +798,385 @@ radv_nir_shader_info_pass(struct radv_device *device, const struct nir_shader *n
     info->ballot_bit_size = radv_get_ballot_bit_size(device, nir->info.stage, info);
  }
  
+static void
+gfx9_get_gs_info(const struct radv_device *device, struct radv_pipeline_stage *es_stage,
+                 struct radv_pipeline_stage *gs_stage)
+{
+   const enum amd_gfx_level gfx_level = device->physical_device->rad_info.gfx_level;
+   struct radv_shader_info *gs_info = &gs_stage->info;
+   struct radv_shader_info *es_info = &es_stage->info;
+   struct gfx9_gs_info *out = &gs_stage->info.gs_ring_info;
+
+   const unsigned gs_num_invocations = MAX2(gs_info->gs.invocations, 1);
+   const bool uses_adjacency = gs_info->gs.input_prim == SHADER_PRIM_LINES_ADJACENCY ||
+                               gs_info->gs.input_prim == SHADER_PRIM_TRIANGLES_ADJACENCY;
+
+   /* All these are in dwords: */
+   /* We can't allow using the whole LDS, because GS waves compete with
+    * other shader stages for LDS space. */
+   const unsigned max_lds_size = 8 * 1024;
+   const unsigned esgs_itemsize = es_info->esgs_itemsize / 4;
+   unsigned esgs_lds_size;
+
+   /* All these are per subgroup: */
+   const unsigned max_out_prims = 32 * 1024;
+   const unsigned max_es_verts = 255;
+   const unsigned ideal_gs_prims = 64;
+   unsigned max_gs_prims, gs_prims;
+   unsigned min_es_verts, es_verts, worst_case_es_verts;
+
+   if (uses_adjacency || gs_num_invocations > 1)
+      max_gs_prims = 127 / gs_num_invocations;
+   else
+      max_gs_prims = 255;
+
+   /* MAX_PRIMS_PER_SUBGROUP = gs_prims * max_vert_out * gs_invocations.
+    * Make sure we don't go over the maximum value.
+    */
+   if (gs_info->gs.vertices_out > 0) {
+      max_gs_prims =
+         MIN2(max_gs_prims, max_out_prims / (gs_info->gs.vertices_out * gs_num_invocations));
+   }
+   assert(max_gs_prims > 0);
+
+   /* If the primitive has adjacency, halve the number of vertices
+    * that will be reused in multiple primitives.
+    */
+   min_es_verts = gs_info->gs.vertices_in / (uses_adjacency ? 2 : 1);
+
+   gs_prims = MIN2(ideal_gs_prims, max_gs_prims);
+   worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);
+
+   /* Compute ESGS LDS size based on the worst case number of ES vertices
+    * needed to create the target number of GS prims per subgroup.
+    */
+   esgs_lds_size = esgs_itemsize * worst_case_es_verts;
+
+   /* If total LDS usage is too big, refactor partitions based on ratio
+    * of ESGS item sizes.
+    */
+   if (esgs_lds_size > max_lds_size) {
+      /* Our target GS Prims Per Subgroup was too large. Calculate
+       * the maximum number of GS Prims Per Subgroup that will fit
+       * into LDS, capped by the maximum that the hardware can support.
+       */
+      gs_prims = MIN2((max_lds_size / (esgs_itemsize * min_es_verts)), max_gs_prims);
+      assert(gs_prims > 0);
+      worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);
+
+      esgs_lds_size = esgs_itemsize * worst_case_es_verts;
+      assert(esgs_lds_size <= max_lds_size);
+   }
+
+   /* Now calculate remaining ESGS information. */
+   if (esgs_lds_size)
+      es_verts = MIN2(esgs_lds_size / esgs_itemsize, max_es_verts);
+   else
+      es_verts = max_es_verts;
+
+   /* Vertices for adjacency primitives are not always reused, so restore
+    * it for ES_VERTS_PER_SUBGRP.
+    */
+   min_es_verts = gs_info->gs.vertices_in;
+
+   /* For normal primitives, the VGT only checks if they are past the ES
+    * verts per subgroup after allocating a full GS primitive and if they
+    * are, kick off a new subgroup.  But if those additional ES verts are
+    * unique (e.g. not reused) we need to make sure there is enough LDS
+    * space to account for those ES verts beyond ES_VERTS_PER_SUBGRP.
+    */
+   es_verts -= min_es_verts - 1;
+
+   const uint32_t es_verts_per_subgroup = es_verts;
+   const uint32_t gs_prims_per_subgroup = gs_prims;
+   const uint32_t gs_inst_prims_in_subgroup = gs_prims * gs_num_invocations;
+   const uint32_t max_prims_per_subgroup = gs_inst_prims_in_subgroup * gs_info->gs.vertices_out;
+   out->lds_size = align(esgs_lds_size, 128) / 128;
+   out->vgt_gs_onchip_cntl = S_028A44_ES_VERTS_PER_SUBGRP(es_verts_per_subgroup) |
+                             S_028A44_GS_PRIMS_PER_SUBGRP(gs_prims_per_subgroup) |
+                             S_028A44_GS_INST_PRIMS_IN_SUBGRP(gs_inst_prims_in_subgroup);
+   out->vgt_gs_max_prims_per_subgroup = S_028A94_MAX_PRIMS_PER_SUBGROUP(max_prims_per_subgroup);
+   out->vgt_esgs_ring_itemsize = esgs_itemsize;
+   assert(max_prims_per_subgroup <= max_out_prims);
+
+   unsigned workgroup_size = ac_compute_esgs_workgroup_size(gfx_level, es_info->wave_size,
+      es_verts_per_subgroup, gs_inst_prims_in_subgroup);
+   es_info->workgroup_size = workgroup_size;
+   gs_info->workgroup_size = workgroup_size;
+}
+
+static void
+clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts, unsigned min_verts_per_prim,
+                         bool use_adjacency)
+{
+   unsigned max_reuse = max_esverts - min_verts_per_prim;
+   if (use_adjacency)
+      max_reuse /= 2;
+   *max_gsprims = MIN2(*max_gsprims, 1 + max_reuse);
+}
+
+static unsigned
+radv_get_num_input_vertices(const struct radv_pipeline_stage *es_stage,
+                            const struct radv_pipeline_stage *gs_stage)
+{
+   if (gs_stage) {
+      return gs_stage->nir->info.gs.vertices_in;
+   }
+
+   if (es_stage->stage == MESA_SHADER_TESS_EVAL) {
+      if (es_stage->nir->info.tess.point_mode)
+         return 1;
+      if (es_stage->nir->info.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES)
+         return 2;
+      return 3;
+   }
+
+   return 3;
+}
+
+static unsigned
+radv_get_pre_rast_input_topology(const struct radv_pipeline_stage *es_stage,
+                                 const struct radv_pipeline_stage *gs_stage)
+{
+   if (gs_stage) {
+      return gs_stage->nir->info.gs.input_primitive;
+   }
+
+   if (es_stage->stage == MESA_SHADER_TESS_EVAL) {
+      if (es_stage->nir->info.tess.point_mode)
+         return SHADER_PRIM_POINTS;
+      if (es_stage->nir->info.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES)
+         return SHADER_PRIM_LINES;
+      return SHADER_PRIM_TRIANGLES;
+   }
+
+   return SHADER_PRIM_TRIANGLES;
+}
+
+static void
+gfx10_get_ngg_info(const struct radv_device *device, struct radv_pipeline_stage *es_stage,
+                   struct radv_pipeline_stage *gs_stage)
+{
+   const enum amd_gfx_level gfx_level = device->physical_device->rad_info.gfx_level;
+   struct radv_shader_info *gs_info = gs_stage ? &gs_stage->info : NULL;
+   struct radv_shader_info *es_info = &es_stage->info;
+   const unsigned max_verts_per_prim = radv_get_num_input_vertices(es_stage, gs_stage);
+   const unsigned min_verts_per_prim = gs_stage ? max_verts_per_prim : 1;
+   struct gfx10_ngg_info *out = gs_stage ? &gs_info->ngg_info : &es_info->ngg_info;
+
+   const unsigned gs_num_invocations = gs_stage ? MAX2(gs_info->gs.invocations, 1) : 1;
+
+   const unsigned input_prim = radv_get_pre_rast_input_topology(es_stage, gs_stage);
+   const bool uses_adjacency = input_prim == SHADER_PRIM_LINES_ADJACENCY ||
+                               input_prim == SHADER_PRIM_TRIANGLES_ADJACENCY;
+
+   /* All these are in dwords: */
+   /* We can't allow using the whole LDS, because GS waves compete with
+    * other shader stages for LDS space.
+    *
+    * TODO: We should really take the shader's internal LDS use into
+    *       account. The linker will fail if the size is greater than
+    *       8K dwords.
+    */
+   const unsigned max_lds_size = 8 * 1024 - 768;
+   const unsigned target_lds_size = max_lds_size;
+   unsigned esvert_lds_size = 0;
+   unsigned gsprim_lds_size = 0;
+
+   /* All these are per subgroup: */
+   const unsigned min_esverts = gfx_level >= GFX10_3 ? 29 : 24;
+   bool max_vert_out_per_gs_instance = false;
+   unsigned max_esverts_base = 128;
+   unsigned max_gsprims_base = 128; /* default prim group size clamp */
+
+   /* Hardware has the following non-natural restrictions on the value
+    * of GE_CNTL.VERT_GRP_SIZE based on based on the primitive type of
+    * the draw:
+    *  - at most 252 for any line input primitive type
+    *  - at most 251 for any quad input primitive type
+    *  - at most 251 for triangle strips with adjacency (this happens to
+    *    be the natural limit for triangle *lists* with adjacency)
+    */
+   max_esverts_base = MIN2(max_esverts_base, 251 + max_verts_per_prim - 1);
+
+   if (gs_stage) {
+      unsigned max_out_verts_per_gsprim = gs_info->gs.vertices_out * gs_num_invocations;
+
+      if (max_out_verts_per_gsprim <= 256) {
+         if (max_out_verts_per_gsprim) {
+            max_gsprims_base = MIN2(max_gsprims_base, 256 / max_out_verts_per_gsprim);
+         }
+      } else {
+         /* Use special multi-cycling mode in which each GS
+          * instance gets its own subgroup. Does not work with
+          * tessellation. */
+         max_vert_out_per_gs_instance = true;
+         max_gsprims_base = 1;
+         max_out_verts_per_gsprim = gs_info->gs.vertices_out;
+      }
+
+      esvert_lds_size = es_info->esgs_itemsize / 4;
+      gsprim_lds_size = (gs_info->gs.gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim;
+   } else {
+      /* VS and TES. */
+      /* LDS size for passing data from GS to ES. */
+      struct radv_streamout_info *so_info = &es_info->so;
+
+      if (so_info->num_outputs)
+         esvert_lds_size = 4 * so_info->num_outputs + 1;
+
+      /* GS stores Primitive IDs (one DWORD) into LDS at the address
+       * corresponding to the ES thread of the provoking vertex. All
+       * ES threads load and export PrimitiveID for their thread.
+       */
+      if (es_stage->stage == MESA_SHADER_VERTEX && es_stage->info.outinfo.export_prim_id)
+         esvert_lds_size = MAX2(esvert_lds_size, 1);
+   }
+
+   unsigned max_gsprims = max_gsprims_base;
+   unsigned max_esverts = max_esverts_base;
+
+   if (esvert_lds_size)
+      max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size);
+   if (gsprim_lds_size)
+      max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size);
+
+   max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
+   clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
+   assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
+
+   if (esvert_lds_size || gsprim_lds_size) {
+      /* Now that we have a rough proportionality between esverts
+       * and gsprims based on the primitive type, scale both of them
+       * down simultaneously based on required LDS space.
+       *
+       * We could be smarter about this if we knew how much vertex
+       * reuse to expect.
+       */
+      unsigned lds_total = max_esverts * esvert_lds_size + max_gsprims * gsprim_lds_size;
+      if (lds_total > target_lds_size) {
+         max_esverts = max_esverts * target_lds_size / lds_total;
+         max_gsprims = max_gsprims * target_lds_size / lds_total;
+
+         max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
+         clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
+         assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
+      }
+   }
+
+   /* Round up towards full wave sizes for better ALU utilization. */
+   if (!max_vert_out_per_gs_instance) {
+      unsigned orig_max_esverts;
+      unsigned orig_max_gsprims;
+      unsigned wavesize;
+
+      if (gs_stage) {
+         wavesize = gs_info->wave_size;
+      } else {
+         wavesize = es_info->wave_size;
+      }
+
+      do {
+         orig_max_esverts = max_esverts;
+         orig_max_gsprims = max_gsprims;
+
+         max_esverts = align(max_esverts, wavesize);
+         max_esverts = MIN2(max_esverts, max_esverts_base);
+         if (esvert_lds_size)
+            max_esverts =
+               MIN2(max_esverts, (max_lds_size - max_gsprims * gsprim_lds_size) / esvert_lds_size);
+         max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
+
+         /* Hardware restriction: minimum value of max_esverts */
+         if (gfx_level == GFX10)
+            max_esverts = MAX2(max_esverts, min_esverts - 1 + max_verts_per_prim);
+         else
+            max_esverts = MAX2(max_esverts, min_esverts);
+
+         max_gsprims = align(max_gsprims, wavesize);
+         max_gsprims = MIN2(max_gsprims, max_gsprims_base);
+         if (gsprim_lds_size) {
+            /* Don't count unusable vertices to the LDS
+             * size. Those are vertices above the maximum
+             * number of vertices that can occur in the
+             * workgroup, which is e.g. max_gsprims * 3
+             * for triangles.
+             */
+            unsigned usable_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
+            max_gsprims = MIN2(max_gsprims,
+                               (max_lds_size - usable_esverts * esvert_lds_size) / gsprim_lds_size);
+         }
+         clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
+         assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
+      } while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims);
+
+      /* Verify the restriction. */
+      if (gfx_level == GFX10)
+         assert(max_esverts >= min_esverts - 1 + max_verts_per_prim);
+      else
+         assert(max_esverts >= min_esverts);
+   } else {
+      /* Hardware restriction: minimum value of max_esverts */
+      if (gfx_level == GFX10)
+         max_esverts = MAX2(max_esverts, min_esverts - 1 + max_verts_per_prim);
+      else
+         max_esverts = MAX2(max_esverts, min_esverts);
+   }
+
+   unsigned max_out_vertices = max_vert_out_per_gs_instance ? gs_info->gs.vertices_out
+                               : gs_stage
+                                  ? max_gsprims * gs_num_invocations * gs_info->gs.vertices_out
+                                  : max_esverts;
+   assert(max_out_vertices <= 256);
+
+   unsigned prim_amp_factor = 1;
+   if (gs_stage) {
+      /* Number of output primitives per GS input primitive after
+       * GS instancing. */
+      prim_amp_factor = gs_info->gs.vertices_out;
+   }
+
+   /* On Gfx10, the GE only checks against the maximum number of ES verts
+    * after allocating a full GS primitive. So we need to ensure that
+    * whenever this check passes, there is enough space for a full
+    * primitive without vertex reuse.
+    */
+   if (gfx_level == GFX10)
+      out->hw_max_esverts = max_esverts - max_verts_per_prim + 1;
+   else
+      out->hw_max_esverts = max_esverts;
+
+   out->max_gsprims = max_gsprims;
+   out->max_out_verts = max_out_vertices;
+   out->prim_amp_factor = prim_amp_factor;
+   out->max_vert_out_per_gs_instance = max_vert_out_per_gs_instance;
+   out->ngg_emit_size = max_gsprims * gsprim_lds_size;
+   out->enable_vertex_grouping = true;
+
+   /* Don't count unusable vertices. */
+   out->esgs_ring_size = MIN2(max_esverts, max_gsprims * max_verts_per_prim) * esvert_lds_size * 4;
+
+   if (gs_stage) {
+      out->vgt_esgs_ring_itemsize = es_info->esgs_itemsize / 4;
+   } else {
+      out->vgt_esgs_ring_itemsize = 1;
+   }
+
+   assert(out->hw_max_esverts >= min_esverts); /* HW limitation */
+
+   unsigned workgroup_size =
+      ac_compute_ngg_workgroup_size(
+         max_esverts, max_gsprims * gs_num_invocations, max_out_vertices, prim_amp_factor);
+   if (gs_stage) {
+      gs_info->workgroup_size = workgroup_size;
+   }
+   es_info->workgroup_size = workgroup_size;
+}
+
  void
  radv_nir_shader_info_link(struct radv_device *device, const struct radv_pipeline_key *pipeline_key,
-                          struct radv_pipeline_stage *stages, gl_shader_stage last_vgt_api_stage)
+                          struct radv_pipeline_stage *stages, bool pipeline_has_ngg,
+                          gl_shader_stage last_vgt_api_stage)
  {
     if (stages[MESA_SHADER_FRAGMENT].nir) {
        assert(last_vgt_api_stage != MESA_SHADER_NONE);
@@ -897,4 +1273,20 @@ radv_nir_shader_info_link(struct radv_device *device, const struct radv_pipeline
        /* Task/mesh I/O uses the task ring buffers. */
        stages[MESA_SHADER_MESH].info.ms.has_task = true;
     }
+
+   if (pipeline_has_ngg) {
+      if (last_vgt_api_stage != MESA_SHADER_MESH) {
+         struct radv_pipeline_stage *es_stage =
+            stages[MESA_SHADER_TESS_EVAL].nir ? &stages[MESA_SHADER_TESS_EVAL] : &stages[MESA_SHADER_VERTEX];
+         struct radv_pipeline_stage *gs_stage =
+            stages[MESA_SHADER_GEOMETRY].nir ? &stages[MESA_SHADER_GEOMETRY] : NULL;
+
+         gfx10_get_ngg_info(device, es_stage, gs_stage);
+      }
+   } else if (stages[MESA_SHADER_GEOMETRY].nir) {
+      struct radv_pipeline_stage *es_stage =
+         stages[MESA_SHADER_TESS_EVAL].nir ? &stages[MESA_SHADER_TESS_EVAL] : &stages[MESA_SHADER_VERTEX];
+
+      gfx9_get_gs_info(device, es_stage, &stages[MESA_SHADER_GEOMETRY]);
+   }
  }
author	Samuel Pitoiset <samuel.pitoiset@gmail.com>
	Fri, 26 Aug 2022 10:03:11 +0000 (12:03 +0200)
committer	Marge Bot <emma+marge@anholt.net>
	Thu, 1 Sep 2022 17:02:17 +0000 (17:02 +0000)
src/amd/vulkan/radv_pipeline.c		patch \| blob \| history
src/amd/vulkan/radv_private.h		patch \| blob \| history
src/amd/vulkan/radv_shader_info.c		patch \| blob \| history