From: Timur Kristóf Date: Tue, 22 Aug 2023 20:39:22 +0000 (+0200) Subject: ac/nir/ngg: Wait for attribute ring stores in mesh shaders. X-Git-Tag: upstream/23.3.3~2548 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=93b4f200dead198e680991a1e95bf3d3b58f87bd;p=platform%2Fupstream%2Fmesa.git ac/nir/ngg: Wait for attribute ring stores in mesh shaders. Make sure that both per-vertex and per-primitive attribute ring stores are finished before position or primitive export instructions are executed. This is necessary because we need to ensure that mesh shaders work correctly when they have either vertex-only or primitive-only waves. Cc: mesa-stable Signed-off-by: Timur Kristóf Reviewed-by: Rhys Perry Part-of: --- diff --git a/src/amd/common/ac_nir_lower_ngg.c b/src/amd/common/ac_nir_lower_ngg.c index 6d59ad5..3ed2a36 100644 --- a/src/amd/common/ac_nir_lower_ngg.c +++ b/src/amd/common/ac_nir_lower_ngg.c @@ -21,6 +21,13 @@ VARYING_BIT_VIEWPORT | \ VARYING_BIT_PRIMITIVE_SHADING_RATE) +#define MS_VERT_ARG_EXP_MASK \ + (VARYING_BIT_CULL_DIST0 | \ + VARYING_BIT_CULL_DIST1 | \ + VARYING_BIT_CLIP_DIST0 | \ + VARYING_BIT_CLIP_DIST1 | \ + VARYING_BIT_PSIZ) + enum { nggc_passflag_used_by_pos = 1, nggc_passflag_used_by_other = 2, @@ -4424,69 +4431,117 @@ emit_ms_finale(nir_builder *b, lower_ngg_ms_state *s) * current thread's vertex attributes in a way the HW can export. */ - /* Export vertices. 
*/ - nir_def *has_output_vertex = nir_ilt(b, invocation_index, num_vtx); - nir_if *if_has_output_vertex = nir_push_if(b, has_output_vertex); - { - const uint64_t per_vertex_outputs = - s->per_vertex_outputs & ~s->layout.attr_ring.vtx_attr.mask; - ms_emit_arrayed_outputs(b, invocation_index, per_vertex_outputs, s); + uint64_t per_vertex_outputs = + s->per_vertex_outputs & ~s->layout.attr_ring.vtx_attr.mask; + uint64_t per_primitive_outputs = + s->per_primitive_outputs & ~s->layout.attr_ring.prm_attr.mask & ~SPECIAL_MS_OUT_MASK; - ac_nir_export_position(b, s->gfx_level, s->clipdist_enable_mask, - !s->has_param_exports, false, true, - s->per_vertex_outputs | VARYING_BIT_POS, s->outputs); + /* Insert layer output store if the pipeline uses multiview but the API shader doesn't write it. */ + if (s->insert_layer_output) { + b->shader->info.outputs_written |= VARYING_BIT_LAYER; + b->shader->info.per_primitive_outputs |= VARYING_BIT_LAYER; + per_primitive_outputs |= VARYING_BIT_LAYER; + } - /* Export generic attributes on GFX10.3 - * (On GFX11 they are already stored in the attribute ring.) - */ - if (s->has_param_exports && s->gfx_level == GFX10_3) { - ac_nir_export_parameters(b, s->vs_output_param_offset, per_vertex_outputs, 0, s->outputs, - NULL, NULL); - } + const bool has_special_param_exports = + (per_vertex_outputs & MS_VERT_ARG_EXP_MASK) || + (per_primitive_outputs & MS_PRIM_ARG_EXP_MASK); - const uint64_t per_vertex_special = VARYING_BIT_CULL_DIST0 | VARYING_BIT_CULL_DIST1 | - VARYING_BIT_CLIP_DIST0 | VARYING_BIT_CLIP_DIST1 | - VARYING_BIT_PSIZ; + const bool wait_attr_ring = s->gfx_level == GFX11 && has_special_param_exports; - /* GFX11+: also store special outputs to the attribute ring so PS can load them. */ - if (s->gfx_level >= GFX11 && (per_vertex_outputs & per_vertex_special)) { - ms_emit_attribute_ring_output_stores(b, per_vertex_outputs & per_vertex_special, s); + /* Export vertices. 
*/ + if ((per_vertex_outputs & ~VARYING_BIT_POS) || !wait_attr_ring) { + nir_def *has_output_vertex = nir_ilt(b, invocation_index, num_vtx); + nir_if *if_has_output_vertex = nir_push_if(b, has_output_vertex); + { + ms_emit_arrayed_outputs(b, invocation_index, per_vertex_outputs, s); + + if (!wait_attr_ring) + ac_nir_export_position(b, s->gfx_level, s->clipdist_enable_mask, + !s->has_param_exports, false, true, + s->per_vertex_outputs | VARYING_BIT_POS, s->outputs); + + /* Export generic attributes on GFX10.3 + * (On GFX11 they are already stored in the attribute ring.) + */ + if (s->has_param_exports && s->gfx_level == GFX10_3) { + ac_nir_export_parameters(b, s->vs_output_param_offset, per_vertex_outputs, 0, s->outputs, + NULL, NULL); + } + + /* GFX11+: also store special outputs to the attribute ring so PS can load them. */ + if (s->gfx_level >= GFX11 && (per_vertex_outputs & MS_VERT_ARG_EXP_MASK)) { + ms_emit_attribute_ring_output_stores(b, per_vertex_outputs & MS_VERT_ARG_EXP_MASK, s); + } } + nir_pop_if(b, if_has_output_vertex); } - nir_pop_if(b, if_has_output_vertex); /* Export primitives. */ - nir_def *has_output_primitive = nir_ilt(b, invocation_index, num_prm); - nir_if *if_has_output_primitive = nir_push_if(b, has_output_primitive); - { - uint64_t per_primitive_outputs = - s->per_primitive_outputs & ~s->layout.attr_ring.prm_attr.mask & ~SPECIAL_MS_OUT_MASK; - ms_emit_arrayed_outputs(b, invocation_index, per_primitive_outputs, s); - - /* Insert layer output store if the pipeline uses multiview but the API shader doesn't write it. 
*/ - if (s->insert_layer_output) { - s->outputs[VARYING_SLOT_LAYER][0] = nir_load_view_index(b); - b->shader->info.outputs_written |= VARYING_BIT_LAYER; - b->shader->info.per_primitive_outputs |= VARYING_BIT_LAYER; - per_primitive_outputs |= VARYING_BIT_LAYER; - } + if (per_primitive_outputs || !wait_attr_ring) { + nir_def *has_output_primitive = nir_ilt(b, invocation_index, num_prm); + nir_if *if_has_output_primitive = nir_push_if(b, has_output_primitive); + { + ms_emit_arrayed_outputs(b, invocation_index, per_primitive_outputs, s); + + /* Insert layer output store if the pipeline uses multiview but the API shader doesn't write it. */ + if (s->insert_layer_output) { + s->outputs[VARYING_SLOT_LAYER][0] = nir_load_view_index(b); + } - ms_emit_primitive_export(b, invocation_index, num_vtx, per_primitive_outputs, s); + if (!wait_attr_ring) + ms_emit_primitive_export(b, invocation_index, num_vtx, per_primitive_outputs, s); - /* Export generic attributes on GFX10.3 - * (On GFX11 they are already stored in the attribute ring.) - */ - if (s->has_param_exports && s->gfx_level == GFX10_3) { - ac_nir_export_parameters(b, s->vs_output_param_offset, per_primitive_outputs, 0, - s->outputs, NULL, NULL); + /* Export generic attributes on GFX10.3 + * (On GFX11 they are already stored in the attribute ring.) + */ + if (s->has_param_exports && s->gfx_level == GFX10_3) { + ac_nir_export_parameters(b, s->vs_output_param_offset, per_primitive_outputs, 0, + s->outputs, NULL, NULL); + } + + /* GFX11+: also store special per-primitive outputs to the attribute ring so PS can load them. 
*/ + if (s->gfx_level >= GFX11) { + ms_emit_attribute_ring_output_stores(b, per_primitive_outputs & MS_PRIM_ARG_EXP_MASK, s); + } } + nir_pop_if(b, if_has_output_primitive); + } + + /* When we need to wait for attribute ring stores, we emit both position and primitive + * export instructions after a barrier to make sure both per-vertex and per-primitive + * attribute ring stores are finished before the GPU starts rasterization. + */ + if (wait_attr_ring) { + /* Wait for attribute stores to finish. */ + nir_barrier(b, .execution_scope = SCOPE_SUBGROUP, + .memory_scope = SCOPE_DEVICE, + .memory_semantics = NIR_MEMORY_RELEASE, + .memory_modes = nir_var_shader_out); + + /* Position export only */ + nir_def *has_output_vertex = nir_ilt(b, invocation_index, num_vtx); + nir_if *if_has_output_vertex = nir_push_if(b, has_output_vertex); + { + ms_emit_arrayed_outputs(b, invocation_index, per_vertex_outputs, s); + ac_nir_export_position(b, s->gfx_level, s->clipdist_enable_mask, + !s->has_param_exports, false, true, + s->per_vertex_outputs | VARYING_BIT_POS, s->outputs); + } + nir_pop_if(b, if_has_output_vertex); + + nir_def *has_output_primitive = nir_ilt(b, invocation_index, num_prm); + nir_if *if_has_output_primitive = nir_push_if(b, has_output_primitive); + { + ms_emit_arrayed_outputs(b, invocation_index, per_primitive_outputs, s); + if (s->insert_layer_output) { + s->outputs[VARYING_SLOT_LAYER][0] = nir_load_view_index(b); + } - /* GFX11+: also store special per-primitive outputs to the attribute ring so PS can load them. */ - if (s->gfx_level >= GFX11) { - ms_emit_attribute_ring_output_stores(b, per_primitive_outputs & MS_PRIM_ARG_EXP_MASK, s); + ms_emit_primitive_export(b, invocation_index, num_vtx, per_primitive_outputs, s); } + nir_pop_if(b, if_has_output_primitive); } - nir_pop_if(b, if_has_output_primitive); } static void