}
static void
-gfx9_get_gs_info(const struct radv_device *device, struct radv_pipeline_stage *es_stage,
- struct radv_pipeline_stage *gs_stage)
-{
- const enum amd_gfx_level gfx_level = device->physical_device->rad_info.gfx_level;
- struct radv_shader_info *gs_info = &gs_stage->info;
- struct radv_shader_info *es_info = &es_stage->info;
- struct gfx9_gs_info *out = &gs_stage->info.gs_ring_info;
-
- const unsigned gs_num_invocations = MAX2(gs_info->gs.invocations, 1);
- const bool uses_adjacency = gs_info->gs.input_prim == SHADER_PRIM_LINES_ADJACENCY ||
- gs_info->gs.input_prim == SHADER_PRIM_TRIANGLES_ADJACENCY;
-
- /* All these are in dwords: */
- /* We can't allow using the whole LDS, because GS waves compete with
- * other shader stages for LDS space. */
- const unsigned max_lds_size = 8 * 1024;
- const unsigned esgs_itemsize = es_info->esgs_itemsize / 4;
- unsigned esgs_lds_size;
-
- /* All these are per subgroup: */
- const unsigned max_out_prims = 32 * 1024;
- const unsigned max_es_verts = 255;
- const unsigned ideal_gs_prims = 64;
- unsigned max_gs_prims, gs_prims;
- unsigned min_es_verts, es_verts, worst_case_es_verts;
-
- if (uses_adjacency || gs_num_invocations > 1)
- max_gs_prims = 127 / gs_num_invocations;
- else
- max_gs_prims = 255;
-
- /* MAX_PRIMS_PER_SUBGROUP = gs_prims * max_vert_out * gs_invocations.
- * Make sure we don't go over the maximum value.
- */
- if (gs_info->gs.vertices_out > 0) {
- max_gs_prims =
- MIN2(max_gs_prims, max_out_prims / (gs_info->gs.vertices_out * gs_num_invocations));
- }
- assert(max_gs_prims > 0);
-
- /* If the primitive has adjacency, halve the number of vertices
- * that will be reused in multiple primitives.
- */
- min_es_verts = gs_info->gs.vertices_in / (uses_adjacency ? 2 : 1);
-
- gs_prims = MIN2(ideal_gs_prims, max_gs_prims);
- worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);
-
- /* Compute ESGS LDS size based on the worst case number of ES vertices
- * needed to create the target number of GS prims per subgroup.
- */
- esgs_lds_size = esgs_itemsize * worst_case_es_verts;
-
- /* If total LDS usage is too big, refactor partitions based on ratio
- * of ESGS item sizes.
- */
- if (esgs_lds_size > max_lds_size) {
- /* Our target GS Prims Per Subgroup was too large. Calculate
- * the maximum number of GS Prims Per Subgroup that will fit
- * into LDS, capped by the maximum that the hardware can support.
- */
- gs_prims = MIN2((max_lds_size / (esgs_itemsize * min_es_verts)), max_gs_prims);
- assert(gs_prims > 0);
- worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);
-
- esgs_lds_size = esgs_itemsize * worst_case_es_verts;
- assert(esgs_lds_size <= max_lds_size);
- }
-
- /* Now calculate remaining ESGS information. */
- if (esgs_lds_size)
- es_verts = MIN2(esgs_lds_size / esgs_itemsize, max_es_verts);
- else
- es_verts = max_es_verts;
-
- /* Vertices for adjacency primitives are not always reused, so restore
- * it for ES_VERTS_PER_SUBGRP.
- */
- min_es_verts = gs_info->gs.vertices_in;
-
- /* For normal primitives, the VGT only checks if they are past the ES
- * verts per subgroup after allocating a full GS primitive and if they
- * are, kick off a new subgroup. But if those additional ES verts are
- * unique (e.g. not reused) we need to make sure there is enough LDS
- * space to account for those ES verts beyond ES_VERTS_PER_SUBGRP.
- */
- es_verts -= min_es_verts - 1;
-
- const uint32_t es_verts_per_subgroup = es_verts;
- const uint32_t gs_prims_per_subgroup = gs_prims;
- const uint32_t gs_inst_prims_in_subgroup = gs_prims * gs_num_invocations;
- const uint32_t max_prims_per_subgroup = gs_inst_prims_in_subgroup * gs_info->gs.vertices_out;
- out->lds_size = align(esgs_lds_size, 128) / 128;
- out->vgt_gs_onchip_cntl = S_028A44_ES_VERTS_PER_SUBGRP(es_verts_per_subgroup) |
- S_028A44_GS_PRIMS_PER_SUBGRP(gs_prims_per_subgroup) |
- S_028A44_GS_INST_PRIMS_IN_SUBGRP(gs_inst_prims_in_subgroup);
- out->vgt_gs_max_prims_per_subgroup = S_028A94_MAX_PRIMS_PER_SUBGROUP(max_prims_per_subgroup);
- out->vgt_esgs_ring_itemsize = esgs_itemsize;
- assert(max_prims_per_subgroup <= max_out_prims);
-
- unsigned workgroup_size = ac_compute_esgs_workgroup_size(gfx_level, es_info->wave_size,
- es_verts_per_subgroup, gs_inst_prims_in_subgroup);
- es_info->workgroup_size = workgroup_size;
- gs_info->workgroup_size = workgroup_size;
-}
-
-static void
-clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts, unsigned min_verts_per_prim,
- bool use_adjacency)
-{
- unsigned max_reuse = max_esverts - min_verts_per_prim;
- if (use_adjacency)
- max_reuse /= 2;
- *max_gsprims = MIN2(*max_gsprims, 1 + max_reuse);
-}
-
-static unsigned
-radv_get_num_input_vertices(const struct radv_pipeline_stage *es_stage,
- const struct radv_pipeline_stage *gs_stage)
-{
- if (gs_stage) {
- return gs_stage->nir->info.gs.vertices_in;
- }
-
- if (es_stage->stage == MESA_SHADER_TESS_EVAL) {
- if (es_stage->nir->info.tess.point_mode)
- return 1;
- if (es_stage->nir->info.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES)
- return 2;
- return 3;
- }
-
- return 3;
-}
-
-static void
gfx10_emit_ge_pc_alloc(struct radeon_cmdbuf *cs, enum amd_gfx_level gfx_level,
uint32_t oversub_pc_lines)
{
S_030980_OVERSUB_EN(oversub_pc_lines > 0) | S_030980_NUM_PC_LINES(oversub_pc_lines - 1));
}
-static unsigned
-radv_get_pre_rast_input_topology(const struct radv_pipeline_stage *es_stage,
- const struct radv_pipeline_stage *gs_stage)
-{
- if (gs_stage) {
- return gs_stage->nir->info.gs.input_primitive;
- }
-
- if (es_stage->stage == MESA_SHADER_TESS_EVAL) {
- if (es_stage->nir->info.tess.point_mode)
- return SHADER_PRIM_POINTS;
- if (es_stage->nir->info.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES)
- return SHADER_PRIM_LINES;
- return SHADER_PRIM_TRIANGLES;
- }
-
- return SHADER_PRIM_TRIANGLES;
-}
-
-static void
-gfx10_get_ngg_info(const struct radv_device *device, struct radv_pipeline_stage *es_stage,
- struct radv_pipeline_stage *gs_stage)
-{
- const enum amd_gfx_level gfx_level = device->physical_device->rad_info.gfx_level;
- struct radv_shader_info *gs_info = gs_stage ? &gs_stage->info : NULL;
- struct radv_shader_info *es_info = &es_stage->info;
- const unsigned max_verts_per_prim = radv_get_num_input_vertices(es_stage, gs_stage);
- const unsigned min_verts_per_prim = gs_stage ? max_verts_per_prim : 1;
- struct gfx10_ngg_info *out = gs_stage ? &gs_info->ngg_info : &es_info->ngg_info;
-
- const unsigned gs_num_invocations = gs_stage ? MAX2(gs_info->gs.invocations, 1) : 1;
-
- const unsigned input_prim = radv_get_pre_rast_input_topology(es_stage, gs_stage);
- const bool uses_adjacency = input_prim == SHADER_PRIM_LINES_ADJACENCY ||
- input_prim == SHADER_PRIM_TRIANGLES_ADJACENCY;
-
- /* All these are in dwords: */
- /* We can't allow using the whole LDS, because GS waves compete with
- * other shader stages for LDS space.
- *
- * TODO: We should really take the shader's internal LDS use into
- * account. The linker will fail if the size is greater than
- * 8K dwords.
- */
- const unsigned max_lds_size = 8 * 1024 - 768;
- const unsigned target_lds_size = max_lds_size;
- unsigned esvert_lds_size = 0;
- unsigned gsprim_lds_size = 0;
-
- /* All these are per subgroup: */
- const unsigned min_esverts = gfx_level >= GFX10_3 ? 29 : 24;
- bool max_vert_out_per_gs_instance = false;
- unsigned max_esverts_base = 128;
- unsigned max_gsprims_base = 128; /* default prim group size clamp */
-
- /* Hardware has the following non-natural restrictions on the value
- * of GE_CNTL.VERT_GRP_SIZE based on based on the primitive type of
- * the draw:
- * - at most 252 for any line input primitive type
- * - at most 251 for any quad input primitive type
- * - at most 251 for triangle strips with adjacency (this happens to
- * be the natural limit for triangle *lists* with adjacency)
- */
- max_esverts_base = MIN2(max_esverts_base, 251 + max_verts_per_prim - 1);
-
- if (gs_stage) {
- unsigned max_out_verts_per_gsprim = gs_info->gs.vertices_out * gs_num_invocations;
-
- if (max_out_verts_per_gsprim <= 256) {
- if (max_out_verts_per_gsprim) {
- max_gsprims_base = MIN2(max_gsprims_base, 256 / max_out_verts_per_gsprim);
- }
- } else {
- /* Use special multi-cycling mode in which each GS
- * instance gets its own subgroup. Does not work with
- * tessellation. */
- max_vert_out_per_gs_instance = true;
- max_gsprims_base = 1;
- max_out_verts_per_gsprim = gs_info->gs.vertices_out;
- }
-
- esvert_lds_size = es_info->esgs_itemsize / 4;
- gsprim_lds_size = (gs_info->gs.gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim;
- } else {
- /* VS and TES. */
- /* LDS size for passing data from GS to ES. */
- struct radv_streamout_info *so_info = &es_info->so;
-
- if (so_info->num_outputs)
- esvert_lds_size = 4 * so_info->num_outputs + 1;
-
- /* GS stores Primitive IDs (one DWORD) into LDS at the address
- * corresponding to the ES thread of the provoking vertex. All
- * ES threads load and export PrimitiveID for their thread.
- */
- if (es_stage->stage == MESA_SHADER_VERTEX && es_stage->info.outinfo.export_prim_id)
- esvert_lds_size = MAX2(esvert_lds_size, 1);
- }
-
- unsigned max_gsprims = max_gsprims_base;
- unsigned max_esverts = max_esverts_base;
-
- if (esvert_lds_size)
- max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size);
- if (gsprim_lds_size)
- max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size);
-
- max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
- clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
- assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
-
- if (esvert_lds_size || gsprim_lds_size) {
- /* Now that we have a rough proportionality between esverts
- * and gsprims based on the primitive type, scale both of them
- * down simultaneously based on required LDS space.
- *
- * We could be smarter about this if we knew how much vertex
- * reuse to expect.
- */
- unsigned lds_total = max_esverts * esvert_lds_size + max_gsprims * gsprim_lds_size;
- if (lds_total > target_lds_size) {
- max_esverts = max_esverts * target_lds_size / lds_total;
- max_gsprims = max_gsprims * target_lds_size / lds_total;
-
- max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
- clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
- assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
- }
- }
-
- /* Round up towards full wave sizes for better ALU utilization. */
- if (!max_vert_out_per_gs_instance) {
- unsigned orig_max_esverts;
- unsigned orig_max_gsprims;
- unsigned wavesize;
-
- if (gs_stage) {
- wavesize = gs_info->wave_size;
- } else {
- wavesize = es_info->wave_size;
- }
-
- do {
- orig_max_esverts = max_esverts;
- orig_max_gsprims = max_gsprims;
-
- max_esverts = align(max_esverts, wavesize);
- max_esverts = MIN2(max_esverts, max_esverts_base);
- if (esvert_lds_size)
- max_esverts =
- MIN2(max_esverts, (max_lds_size - max_gsprims * gsprim_lds_size) / esvert_lds_size);
- max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
-
- /* Hardware restriction: minimum value of max_esverts */
- if (gfx_level == GFX10)
- max_esverts = MAX2(max_esverts, min_esverts - 1 + max_verts_per_prim);
- else
- max_esverts = MAX2(max_esverts, min_esverts);
-
- max_gsprims = align(max_gsprims, wavesize);
- max_gsprims = MIN2(max_gsprims, max_gsprims_base);
- if (gsprim_lds_size) {
- /* Don't count unusable vertices to the LDS
- * size. Those are vertices above the maximum
- * number of vertices that can occur in the
- * workgroup, which is e.g. max_gsprims * 3
- * for triangles.
- */
- unsigned usable_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
- max_gsprims = MIN2(max_gsprims,
- (max_lds_size - usable_esverts * esvert_lds_size) / gsprim_lds_size);
- }
- clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
- assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
- } while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims);
-
- /* Verify the restriction. */
- if (gfx_level == GFX10)
- assert(max_esverts >= min_esverts - 1 + max_verts_per_prim);
- else
- assert(max_esverts >= min_esverts);
- } else {
- /* Hardware restriction: minimum value of max_esverts */
- if (gfx_level == GFX10)
- max_esverts = MAX2(max_esverts, min_esverts - 1 + max_verts_per_prim);
- else
- max_esverts = MAX2(max_esverts, min_esverts);
- }
-
- unsigned max_out_vertices = max_vert_out_per_gs_instance ? gs_info->gs.vertices_out
- : gs_stage
- ? max_gsprims * gs_num_invocations * gs_info->gs.vertices_out
- : max_esverts;
- assert(max_out_vertices <= 256);
-
- unsigned prim_amp_factor = 1;
- if (gs_stage) {
- /* Number of output primitives per GS input primitive after
- * GS instancing. */
- prim_amp_factor = gs_info->gs.vertices_out;
- }
-
- /* On Gfx10, the GE only checks against the maximum number of ES verts
- * after allocating a full GS primitive. So we need to ensure that
- * whenever this check passes, there is enough space for a full
- * primitive without vertex reuse.
- */
- if (gfx_level == GFX10)
- out->hw_max_esverts = max_esverts - max_verts_per_prim + 1;
- else
- out->hw_max_esverts = max_esverts;
-
- out->max_gsprims = max_gsprims;
- out->max_out_verts = max_out_vertices;
- out->prim_amp_factor = prim_amp_factor;
- out->max_vert_out_per_gs_instance = max_vert_out_per_gs_instance;
- out->ngg_emit_size = max_gsprims * gsprim_lds_size;
- out->enable_vertex_grouping = true;
-
- /* Don't count unusable vertices. */
- out->esgs_ring_size = MIN2(max_esverts, max_gsprims * max_verts_per_prim) * esvert_lds_size * 4;
-
- if (gs_stage) {
- out->vgt_esgs_ring_itemsize = es_info->esgs_itemsize / 4;
- } else {
- out->vgt_esgs_ring_itemsize = 1;
- }
-
- assert(out->hw_max_esverts >= min_esverts); /* HW limitation */
-
- unsigned workgroup_size =
- ac_compute_ngg_workgroup_size(
- max_esverts, max_gsprims * gs_num_invocations, max_out_vertices, prim_amp_factor);
- if (gs_stage) {
- gs_info->workgroup_size = workgroup_size;
- }
- es_info->workgroup_size = workgroup_size;
-}
-
static void
radv_pipeline_init_gs_ring_state(struct radv_graphics_pipeline *pipeline, const struct gfx9_gs_info *gs)
{
&stages[i].info);
}
- radv_nir_shader_info_link(device, pipeline_key, stages, last_vgt_api_stage);
+ radv_nir_shader_info_link(device, pipeline_key, stages, pipeline_has_ngg, last_vgt_api_stage);
if (stages[MESA_SHADER_TESS_CTRL].nir) {
for (gl_shader_stage s = MESA_SHADER_VERTEX; s <= MESA_SHADER_TESS_CTRL; ++s) {
stages[MESA_SHADER_TASK].nir->info.workgroup_size, false, UINT32_MAX);
}
- if (pipeline_has_ngg) {
- if (last_vgt_api_stage != MESA_SHADER_MESH) {
- struct radv_pipeline_stage *es_stage =
- stages[MESA_SHADER_TESS_EVAL].nir ? &stages[MESA_SHADER_TESS_EVAL] : &stages[MESA_SHADER_VERTEX];
- struct radv_pipeline_stage *gs_stage =
- stages[MESA_SHADER_GEOMETRY].nir ? &stages[MESA_SHADER_GEOMETRY] : NULL;
-
- gfx10_get_ngg_info(device, es_stage, gs_stage);
- }
- } else if (stages[MESA_SHADER_GEOMETRY].nir) {
- struct radv_pipeline_stage *es_stage =
- stages[MESA_SHADER_TESS_EVAL].nir ? &stages[MESA_SHADER_TESS_EVAL] : &stages[MESA_SHADER_VERTEX];
-
- gfx9_get_gs_info(device, es_stage, &stages[MESA_SHADER_GEOMETRY]);
- } else {
+ if (!pipeline_has_ngg && !stages[MESA_SHADER_GEOMETRY].nir) {
gl_shader_stage hw_vs_api_stage =
stages[MESA_SHADER_TESS_EVAL].nir ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX;
stages[hw_vs_api_stage].info.workgroup_size = stages[hw_vs_api_stage].info.wave_size;
info->ballot_bit_size = radv_get_ballot_bit_size(device, nir->info.stage, info);
}
+/* Compute the on-chip geometry-shader (ES -> GS) subgroup configuration for
+ * GFX9+: partitions the ESGS LDS budget between ES vertices and GS primitives
+ * per subgroup, and fills gs_stage->info.gs_ring_info as well as the shared
+ * ES/GS workgroup size.
+ */
+static void
+gfx9_get_gs_info(const struct radv_device *device, struct radv_pipeline_stage *es_stage,
+ struct radv_pipeline_stage *gs_stage)
+{
+ const enum amd_gfx_level gfx_level = device->physical_device->rad_info.gfx_level;
+ struct radv_shader_info *gs_info = &gs_stage->info;
+ struct radv_shader_info *es_info = &es_stage->info;
+ struct gfx9_gs_info *out = &gs_stage->info.gs_ring_info;
+
+ const unsigned gs_num_invocations = MAX2(gs_info->gs.invocations, 1);
+ const bool uses_adjacency = gs_info->gs.input_prim == SHADER_PRIM_LINES_ADJACENCY ||
+ gs_info->gs.input_prim == SHADER_PRIM_TRIANGLES_ADJACENCY;
+
+ /* All these are in dwords: */
+ /* We can't allow using the whole LDS, because GS waves compete with
+ * other shader stages for LDS space. */
+ const unsigned max_lds_size = 8 * 1024;
+ const unsigned esgs_itemsize = es_info->esgs_itemsize / 4;
+ unsigned esgs_lds_size;
+
+ /* All these are per subgroup: */
+ const unsigned max_out_prims = 32 * 1024;
+ const unsigned max_es_verts = 255;
+ const unsigned ideal_gs_prims = 64;
+ unsigned max_gs_prims, gs_prims;
+ unsigned min_es_verts, es_verts, worst_case_es_verts;
+
+ if (uses_adjacency || gs_num_invocations > 1)
+ max_gs_prims = 127 / gs_num_invocations;
+ else
+ max_gs_prims = 255;
+
+ /* MAX_PRIMS_PER_SUBGROUP = gs_prims * max_vert_out * gs_invocations.
+ * Make sure we don't go over the maximum value.
+ */
+ if (gs_info->gs.vertices_out > 0) {
+ max_gs_prims =
+ MIN2(max_gs_prims, max_out_prims / (gs_info->gs.vertices_out * gs_num_invocations));
+ }
+ assert(max_gs_prims > 0);
+
+ /* If the primitive has adjacency, halve the number of vertices
+ * that will be reused in multiple primitives.
+ */
+ min_es_verts = gs_info->gs.vertices_in / (uses_adjacency ? 2 : 1);
+
+ gs_prims = MIN2(ideal_gs_prims, max_gs_prims);
+ worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);
+
+ /* Compute ESGS LDS size based on the worst case number of ES vertices
+ * needed to create the target number of GS prims per subgroup.
+ */
+ esgs_lds_size = esgs_itemsize * worst_case_es_verts;
+
+ /* If total LDS usage is too big, refactor partitions based on ratio
+ * of ESGS item sizes.
+ */
+ if (esgs_lds_size > max_lds_size) {
+ /* Our target GS Prims Per Subgroup was too large. Calculate
+ * the maximum number of GS Prims Per Subgroup that will fit
+ * into LDS, capped by the maximum that the hardware can support.
+ */
+ gs_prims = MIN2((max_lds_size / (esgs_itemsize * min_es_verts)), max_gs_prims);
+ assert(gs_prims > 0);
+ worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);
+
+ esgs_lds_size = esgs_itemsize * worst_case_es_verts;
+ assert(esgs_lds_size <= max_lds_size);
+ }
+
+ /* Now calculate remaining ESGS information. */
+ if (esgs_lds_size)
+ es_verts = MIN2(esgs_lds_size / esgs_itemsize, max_es_verts);
+ else
+ es_verts = max_es_verts;
+
+ /* Vertices for adjacency primitives are not always reused, so restore
+ * it for ES_VERTS_PER_SUBGRP.
+ */
+ min_es_verts = gs_info->gs.vertices_in;
+
+ /* For normal primitives, the VGT only checks if they are past the ES
+ * verts per subgroup after allocating a full GS primitive and if they
+ * are, kick off a new subgroup. But if those additional ES verts are
+ * unique (e.g. not reused) we need to make sure there is enough LDS
+ * space to account for those ES verts beyond ES_VERTS_PER_SUBGRP.
+ */
+ es_verts -= min_es_verts - 1;
+
+ const uint32_t es_verts_per_subgroup = es_verts;
+ const uint32_t gs_prims_per_subgroup = gs_prims;
+ const uint32_t gs_inst_prims_in_subgroup = gs_prims * gs_num_invocations;
+ const uint32_t max_prims_per_subgroup = gs_inst_prims_in_subgroup * gs_info->gs.vertices_out;
+ /* LDS size is programmed in 128-dword granules. */
+ out->lds_size = align(esgs_lds_size, 128) / 128;
+ out->vgt_gs_onchip_cntl = S_028A44_ES_VERTS_PER_SUBGRP(es_verts_per_subgroup) |
+ S_028A44_GS_PRIMS_PER_SUBGRP(gs_prims_per_subgroup) |
+ S_028A44_GS_INST_PRIMS_IN_SUBGRP(gs_inst_prims_in_subgroup);
+ out->vgt_gs_max_prims_per_subgroup = S_028A94_MAX_PRIMS_PER_SUBGROUP(max_prims_per_subgroup);
+ out->vgt_esgs_ring_itemsize = esgs_itemsize;
+ assert(max_prims_per_subgroup <= max_out_prims);
+
+ /* ES and GS get the same ESGS workgroup size. */
+ unsigned workgroup_size = ac_compute_esgs_workgroup_size(gfx_level, es_info->wave_size,
+ es_verts_per_subgroup, gs_inst_prims_in_subgroup);
+ es_info->workgroup_size = workgroup_size;
+ gs_info->workgroup_size = workgroup_size;
+}
+
+/* Clamp *max_gsprims so that max_esverts ES vertices can supply that many
+ * primitives: at most 1 + (max_esverts - min_verts_per_prim) primitives
+ * under best-case vertex reuse, halved for adjacency where fewer vertices
+ * are reused.
+ */
+static void
+clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts, unsigned min_verts_per_prim,
+ bool use_adjacency)
+{
+ unsigned max_reuse = max_esverts - min_verts_per_prim;
+ if (use_adjacency)
+ max_reuse /= 2;
+ *max_gsprims = MIN2(*max_gsprims, 1 + max_reuse);
+}
+
+/* Return the number of input vertices per primitive for the ES/GS input:
+ * the GS input vertex count when a GS is present; for TES, 1/2/3 for
+ * point mode / isolines / triangles; 3 otherwise (plain VS).
+ */
+static unsigned
+radv_get_num_input_vertices(const struct radv_pipeline_stage *es_stage,
+ const struct radv_pipeline_stage *gs_stage)
+{
+ if (gs_stage) {
+ return gs_stage->nir->info.gs.vertices_in;
+ }
+
+ if (es_stage->stage == MESA_SHADER_TESS_EVAL) {
+ if (es_stage->nir->info.tess.point_mode)
+ return 1;
+ if (es_stage->nir->info.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES)
+ return 2;
+ return 3;
+ }
+
+ return 3;
+}
+
+/* Return the input primitive topology of the pre-rasterization stages:
+ * the GS input primitive when a GS is present; for TES, derived from the
+ * tessellation mode; triangles otherwise (plain VS).
+ */
+static unsigned
+radv_get_pre_rast_input_topology(const struct radv_pipeline_stage *es_stage,
+ const struct radv_pipeline_stage *gs_stage)
+{
+ if (gs_stage) {
+ return gs_stage->nir->info.gs.input_primitive;
+ }
+
+ if (es_stage->stage == MESA_SHADER_TESS_EVAL) {
+ if (es_stage->nir->info.tess.point_mode)
+ return SHADER_PRIM_POINTS;
+ if (es_stage->nir->info.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES)
+ return SHADER_PRIM_LINES;
+ return SHADER_PRIM_TRIANGLES;
+ }
+
+ return SHADER_PRIM_TRIANGLES;
+}
+
+/* Compute the NGG subgroup configuration for GFX10+: derive the maximum
+ * number of ES vertices and GS primitives per subgroup from the LDS budget
+ * and hardware limits, fill the gfx10_ngg_info of the last pre-rasterization
+ * stage (GS if present, otherwise VS/TES) and set the ES/GS workgroup sizes.
+ */
+static void
+gfx10_get_ngg_info(const struct radv_device *device, struct radv_pipeline_stage *es_stage,
+ struct radv_pipeline_stage *gs_stage)
+{
+ const enum amd_gfx_level gfx_level = device->physical_device->rad_info.gfx_level;
+ struct radv_shader_info *gs_info = gs_stage ? &gs_stage->info : NULL;
+ struct radv_shader_info *es_info = &es_stage->info;
+ const unsigned max_verts_per_prim = radv_get_num_input_vertices(es_stage, gs_stage);
+ const unsigned min_verts_per_prim = gs_stage ? max_verts_per_prim : 1;
+ struct gfx10_ngg_info *out = gs_stage ? &gs_info->ngg_info : &es_info->ngg_info;
+
+ const unsigned gs_num_invocations = gs_stage ? MAX2(gs_info->gs.invocations, 1) : 1;
+
+ const unsigned input_prim = radv_get_pre_rast_input_topology(es_stage, gs_stage);
+ const bool uses_adjacency = input_prim == SHADER_PRIM_LINES_ADJACENCY ||
+ input_prim == SHADER_PRIM_TRIANGLES_ADJACENCY;
+
+ /* All these are in dwords: */
+ /* We can't allow using the whole LDS, because GS waves compete with
+ * other shader stages for LDS space.
+ *
+ * TODO: We should really take the shader's internal LDS use into
+ * account. The linker will fail if the size is greater than
+ * 8K dwords.
+ */
+ const unsigned max_lds_size = 8 * 1024 - 768;
+ const unsigned target_lds_size = max_lds_size;
+ unsigned esvert_lds_size = 0;
+ unsigned gsprim_lds_size = 0;
+
+ /* All these are per subgroup: */
+ const unsigned min_esverts = gfx_level >= GFX10_3 ? 29 : 24;
+ bool max_vert_out_per_gs_instance = false;
+ unsigned max_esverts_base = 128;
+ unsigned max_gsprims_base = 128; /* default prim group size clamp */
+
+ /* Hardware has the following non-natural restrictions on the value
+ * of GE_CNTL.VERT_GRP_SIZE based on the primitive type of
+ * the draw:
+ * - at most 252 for any line input primitive type
+ * - at most 251 for any quad input primitive type
+ * - at most 251 for triangle strips with adjacency (this happens to
+ * be the natural limit for triangle *lists* with adjacency)
+ */
+ max_esverts_base = MIN2(max_esverts_base, 251 + max_verts_per_prim - 1);
+
+ if (gs_stage) {
+ unsigned max_out_verts_per_gsprim = gs_info->gs.vertices_out * gs_num_invocations;
+
+ if (max_out_verts_per_gsprim <= 256) {
+ if (max_out_verts_per_gsprim) {
+ max_gsprims_base = MIN2(max_gsprims_base, 256 / max_out_verts_per_gsprim);
+ }
+ } else {
+ /* Use special multi-cycling mode in which each GS
+ * instance gets its own subgroup. Does not work with
+ * tessellation. */
+ max_vert_out_per_gs_instance = true;
+ max_gsprims_base = 1;
+ max_out_verts_per_gsprim = gs_info->gs.vertices_out;
+ }
+
+ esvert_lds_size = es_info->esgs_itemsize / 4;
+ gsprim_lds_size = (gs_info->gs.gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim;
+ } else {
+ /* VS and TES. */
+ /* LDS size for passing data from GS to ES. */
+ struct radv_streamout_info *so_info = &es_info->so;
+
+ if (so_info->num_outputs)
+ esvert_lds_size = 4 * so_info->num_outputs + 1;
+
+ /* GS stores Primitive IDs (one DWORD) into LDS at the address
+ * corresponding to the ES thread of the provoking vertex. All
+ * ES threads load and export PrimitiveID for their thread.
+ */
+ if (es_stage->stage == MESA_SHADER_VERTEX && es_stage->info.outinfo.export_prim_id)
+ esvert_lds_size = MAX2(esvert_lds_size, 1);
+ }
+
+ unsigned max_gsprims = max_gsprims_base;
+ unsigned max_esverts = max_esverts_base;
+
+ if (esvert_lds_size)
+ max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size);
+ if (gsprim_lds_size)
+ max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size);
+
+ max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
+ clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
+ assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
+
+ if (esvert_lds_size || gsprim_lds_size) {
+ /* Now that we have a rough proportionality between esverts
+ * and gsprims based on the primitive type, scale both of them
+ * down simultaneously based on required LDS space.
+ *
+ * We could be smarter about this if we knew how much vertex
+ * reuse to expect.
+ */
+ unsigned lds_total = max_esverts * esvert_lds_size + max_gsprims * gsprim_lds_size;
+ if (lds_total > target_lds_size) {
+ max_esverts = max_esverts * target_lds_size / lds_total;
+ max_gsprims = max_gsprims * target_lds_size / lds_total;
+
+ max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
+ clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
+ assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
+ }
+ }
+
+ /* Round up towards full wave sizes for better ALU utilization. */
+ if (!max_vert_out_per_gs_instance) {
+ unsigned orig_max_esverts;
+ unsigned orig_max_gsprims;
+ unsigned wavesize;
+
+ if (gs_stage) {
+ wavesize = gs_info->wave_size;
+ } else {
+ wavesize = es_info->wave_size;
+ }
+
+ /* Iterate to a fixed point: rounding up can violate the LDS or
+ * pairing limits, and re-clamping can change the rounding again.
+ */
+ do {
+ orig_max_esverts = max_esverts;
+ orig_max_gsprims = max_gsprims;
+
+ max_esverts = align(max_esverts, wavesize);
+ max_esverts = MIN2(max_esverts, max_esverts_base);
+ if (esvert_lds_size)
+ max_esverts =
+ MIN2(max_esverts, (max_lds_size - max_gsprims * gsprim_lds_size) / esvert_lds_size);
+ max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
+
+ /* Hardware restriction: minimum value of max_esverts */
+ if (gfx_level == GFX10)
+ max_esverts = MAX2(max_esverts, min_esverts - 1 + max_verts_per_prim);
+ else
+ max_esverts = MAX2(max_esverts, min_esverts);
+
+ max_gsprims = align(max_gsprims, wavesize);
+ max_gsprims = MIN2(max_gsprims, max_gsprims_base);
+ if (gsprim_lds_size) {
+ /* Don't count unusable vertices to the LDS
+ * size. Those are vertices above the maximum
+ * number of vertices that can occur in the
+ * workgroup, which is e.g. max_gsprims * 3
+ * for triangles.
+ */
+ unsigned usable_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
+ max_gsprims = MIN2(max_gsprims,
+ (max_lds_size - usable_esverts * esvert_lds_size) / gsprim_lds_size);
+ }
+ clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
+ assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
+ } while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims);
+
+ /* Verify the restriction. */
+ if (gfx_level == GFX10)
+ assert(max_esverts >= min_esverts - 1 + max_verts_per_prim);
+ else
+ assert(max_esverts >= min_esverts);
+ } else {
+ /* Hardware restriction: minimum value of max_esverts */
+ if (gfx_level == GFX10)
+ max_esverts = MAX2(max_esverts, min_esverts - 1 + max_verts_per_prim);
+ else
+ max_esverts = MAX2(max_esverts, min_esverts);
+ }
+
+ unsigned max_out_vertices = max_vert_out_per_gs_instance ? gs_info->gs.vertices_out
+ : gs_stage
+ ? max_gsprims * gs_num_invocations * gs_info->gs.vertices_out
+ : max_esverts;
+ assert(max_out_vertices <= 256);
+
+ unsigned prim_amp_factor = 1;
+ if (gs_stage) {
+ /* Number of output primitives per GS input primitive after
+ * GS instancing. */
+ prim_amp_factor = gs_info->gs.vertices_out;
+ }
+
+ /* On Gfx10, the GE only checks against the maximum number of ES verts
+ * after allocating a full GS primitive. So we need to ensure that
+ * whenever this check passes, there is enough space for a full
+ * primitive without vertex reuse.
+ */
+ if (gfx_level == GFX10)
+ out->hw_max_esverts = max_esverts - max_verts_per_prim + 1;
+ else
+ out->hw_max_esverts = max_esverts;
+
+ out->max_gsprims = max_gsprims;
+ out->max_out_verts = max_out_vertices;
+ out->prim_amp_factor = prim_amp_factor;
+ out->max_vert_out_per_gs_instance = max_vert_out_per_gs_instance;
+ out->ngg_emit_size = max_gsprims * gsprim_lds_size;
+ out->enable_vertex_grouping = true;
+
+ /* Don't count unusable vertices. */
+ out->esgs_ring_size = MIN2(max_esverts, max_gsprims * max_verts_per_prim) * esvert_lds_size * 4;
+
+ if (gs_stage) {
+ out->vgt_esgs_ring_itemsize = es_info->esgs_itemsize / 4;
+ } else {
+ out->vgt_esgs_ring_itemsize = 1;
+ }
+
+ assert(out->hw_max_esverts >= min_esverts); /* HW limitation */
+
+ /* ES and GS (if present) share the same NGG workgroup size. */
+ unsigned workgroup_size =
+ ac_compute_ngg_workgroup_size(
+ max_esverts, max_gsprims * gs_num_invocations, max_out_vertices, prim_amp_factor);
+ if (gs_stage) {
+ gs_info->workgroup_size = workgroup_size;
+ }
+ es_info->workgroup_size = workgroup_size;
+}
+
void
radv_nir_shader_info_link(struct radv_device *device, const struct radv_pipeline_key *pipeline_key,
- struct radv_pipeline_stage *stages, gl_shader_stage last_vgt_api_stage)
+ struct radv_pipeline_stage *stages, bool pipeline_has_ngg,
+ gl_shader_stage last_vgt_api_stage)
{
if (stages[MESA_SHADER_FRAGMENT].nir) {
assert(last_vgt_api_stage != MESA_SHADER_NONE);
/* Task/mesh I/O uses the task ring buffers. */
stages[MESA_SHADER_MESH].info.ms.has_task = true;
}
+
+ if (pipeline_has_ngg) {
+ if (last_vgt_api_stage != MESA_SHADER_MESH) {
+ struct radv_pipeline_stage *es_stage =
+ stages[MESA_SHADER_TESS_EVAL].nir ? &stages[MESA_SHADER_TESS_EVAL] : &stages[MESA_SHADER_VERTEX];
+ struct radv_pipeline_stage *gs_stage =
+ stages[MESA_SHADER_GEOMETRY].nir ? &stages[MESA_SHADER_GEOMETRY] : NULL;
+
+ gfx10_get_ngg_info(device, es_stage, gs_stage);
+ }
+ } else if (stages[MESA_SHADER_GEOMETRY].nir) {
+ struct radv_pipeline_stage *es_stage =
+ stages[MESA_SHADER_TESS_EVAL].nir ? &stages[MESA_SHADER_TESS_EVAL] : &stages[MESA_SHADER_VERTEX];
+
+ gfx9_get_gs_info(device, es_stage, &stages[MESA_SHADER_GEOMETRY]);
+ }
}