From 80506be31bf3bffea46b3fcd5d58ed7b174a2e4d Mon Sep 17 00:00:00 2001 From: Qiang Yu Date: Sat, 24 Dec 2022 14:55:29 +0800 Subject: [PATCH] ac/nir/ngg,radv,radeonsi: nogs use ac_nir_export_(position|parameter) MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Reviewed-by: Timur Kristóf Reviewed-by: Marek Olšák Signed-off-by: Qiang Yu Part-of: --- src/amd/common/ac_nir.h | 7 ++- src/amd/common/ac_nir_lower_ngg.c | 89 +++++++++++++++----------------- src/amd/vulkan/radv_shader.c | 4 +- src/gallium/drivers/radeonsi/si_shader.c | 14 +++-- 4 files changed, 60 insertions(+), 54 deletions(-) diff --git a/src/amd/common/ac_nir.h b/src/amd/common/ac_nir.h index 81b9b02..fbdec73 100644 --- a/src/amd/common/ac_nir.h +++ b/src/amd/common/ac_nir.h @@ -156,20 +156,23 @@ typedef struct { unsigned max_workgroup_size; unsigned wave_size; + uint32_t clipdist_enable_mask; const uint8_t *vs_output_param_offset; /* GFX11+ */ + bool has_param_exports; bool can_cull; bool disable_streamout; bool has_gen_prim_query; bool has_xfb_prim_query; + bool kill_pointsize; + bool force_vrs; /* VS */ unsigned num_vertices_per_primitive; bool early_prim_export; bool passthrough; bool use_edgeflags; - int primitive_id_location; + bool export_primitive_id; uint32_t instance_rate_inputs; - uint32_t clipdist_enable_mask; uint32_t user_clip_plane_enable_mask; /* GS */ diff --git a/src/amd/common/ac_nir_lower_ngg.c b/src/amd/common/ac_nir_lower_ngg.c index dc20f59..875a54d 100644 --- a/src/amd/common/ac_nir_lower_ngg.c +++ b/src/amd/common/ac_nir_lower_ngg.c @@ -82,7 +82,6 @@ typedef struct bool streamout_enabled; bool has_user_edgeflags; unsigned max_num_waves; - unsigned position_store_base; /* LDS params */ unsigned pervertex_lds_bytes; @@ -597,14 +596,7 @@ emit_store_ngg_nogs_es_primitive_id(nir_builder *b, lower_ngg_nogs_state *st) prim_id = nir_load_primitive_id(b); } - nir_io_semantics io_sem = { - .location = VARYING_SLOT_PRIMITIVE_ID, - .num_slots = 1, - }; - - nir_store_output(b, prim_id, nir_imm_zero(b, 1, 32), - .base = st->options->primitive_id_location, - .src_type = nir_type_uint32, .io_semantics = io_sem); + st->outputs[VARYING_SLOT_PRIMITIVE_ID][0] = prim_id; /* Update outputs_written to reflect that the pass added a new output. */ b->shader->info.outputs_written |= VARYING_BIT_PRIMITIVE_ID; @@ -777,9 +769,6 @@ remove_extra_pos_output(nir_builder *b, nir_instr *instr, void *state) nir_ssa_def *store_val = intrin->src[0].ssa; unsigned store_pos_component = nir_intrinsic_component(intrin); - /* save the store base for re-construct store output instruction */ - s->position_store_base = nir_intrinsic_base(intrin); - nir_instr_remove(instr); if (store_val->parent_instr->type == nir_instr_type_alu) { @@ -2105,12 +2094,8 @@ ngg_nogs_gather_outputs(nir_builder *b, struct exec_list *cf_list, lower_ngg_nog type[c] = src_type; } - /* remove the edge flag output anyway as it should not be passed to next stage */ - bool is_edge_slot = slot == VARYING_SLOT_EDGE; - /* remove non-pos-export slot when GFX11, they are written to buffer memory */ - bool is_pos_export_slot = slot < VARYING_SLOT_MAX && (BITFIELD64_BIT(slot) & POS_EXPORT_MASK); - if (is_edge_slot || (s->options->gfx_level >= GFX11 && !is_pos_export_slot)) - nir_instr_remove(instr); + /* remove all store output instructions */ + nir_instr_remove(instr); } } } @@ -2260,9 +2245,9 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option }; const bool need_prim_id_store_shared = - options->primitive_id_location >= 0 && shader->info.stage == MESA_SHADER_VERTEX; + options->export_primitive_id && shader->info.stage == MESA_SHADER_VERTEX; - if (options->primitive_id_location >= 0) { + if (options->export_primitive_id) { nir_variable *prim_id_var = nir_variable_create(shader, nir_var_shader_out, glsl_uint_type(), "ngg_prim_id"); prim_id_var->data.location = VARYING_SLOT_PRIMITIVE_ID; prim_id_var->data.driver_location = VARYING_SLOT_PRIMITIVE_ID; @@ -2326,7 +2311,7 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option ngg_nogs_get_pervertex_lds_size(shader->info.stage, shader->num_outputs, state.streamout_enabled, - options->primitive_id_location >= 0, + options->export_primitive_id, state.has_user_edgeflags); if (need_prim_id_store_shared) { @@ -2337,7 +2322,6 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option .memory_semantics = NIR_MEMORY_ACQ_REL, .memory_modes = nir_var_mem_shared); } - nir_intrinsic_instr *export_vertex_instr; nir_ssa_def *es_thread = options->can_cull ? nir_load_var(b, es_accepted_var) : has_input_vertex(b); @@ -2357,11 +2341,8 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option nir_cf_reinsert(&extracted, b->cursor); b->cursor = nir_after_cf_list(&if_es_thread->then_list); - if (options->primitive_id_location >= 0) + if (options->export_primitive_id) emit_store_ngg_nogs_es_primitive_id(b, &state); - - /* Export all vertex attributes (including the primitive ID) */ - export_vertex_instr = nir_export_vertex_amd(b); } nir_pop_if(b, if_es_thread); @@ -2376,12 +2357,11 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option * it seems that it's best to put the position export always at the end, and * then let ACO schedule it up (slightly) only when early prim export is used. */ - b->cursor = nir_before_instr(&export_vertex_instr->instr); + b->cursor = nir_after_cf_list(&if_es_thread->then_list); nir_ssa_def *pos_val = nir_load_var(b, state.position_value_var); - nir_io_semantics io_sem = { .location = VARYING_SLOT_POS, .num_slots = 1 }; - nir_store_output(b, pos_val, nir_imm_int(b, 0), .base = state.position_store_base, - .component = 0, .io_semantics = io_sem, .src_type = nir_type_float32); + for (int i = 0; i < 4; i++) + state.outputs[VARYING_SLOT_POS][i] = nir_channel(b, pos_val, i); } /* Gather outputs data and types */ @@ -2407,23 +2387,40 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option emit_ngg_nogs_prim_export(b, &state, nir_load_var(b, prim_exp_arg_var)); } - /* Export varyings for GFX11+ */ - if (state.options->gfx_level >= GFX11) { - vs_output outputs[64]; - - b->cursor = nir_after_cf_list(&if_es_thread->then_list); - unsigned num_outputs = gather_vs_outputs(b, outputs, &state); - - if (num_outputs) { - b->cursor = nir_after_cf_node(&if_es_thread->cf_node); - create_vertex_param_phis(b, num_outputs, outputs); - - b->cursor = nir_after_cf_list(&impl->body); + uint64_t export_outputs = shader->info.outputs_written; + if (options->kill_pointsize) + export_outputs &= ~VARYING_BIT_PSIZ; - if (!num_es_threads) - num_es_threads = nir_load_merged_wave_info_amd(b); - export_vertex_params_gfx11(b, NULL, num_es_threads, num_outputs, outputs, - options->vs_output_param_offset); + b->cursor = nir_after_cf_list(&if_es_thread->then_list); + ac_nir_export_position(b, options->gfx_level, + options->clipdist_enable_mask, + !options->has_param_exports, + options->force_vrs, + export_outputs, state.outputs); + + if (options->has_param_exports) { + if (state.options->gfx_level >= GFX11) { + /* Export varyings for GFX11+ */ + vs_output outputs[64]; + unsigned num_outputs = gather_vs_outputs(b, outputs, &state); + + if (num_outputs) { + b->cursor = nir_after_cf_node(&if_es_thread->cf_node); + create_vertex_param_phis(b, num_outputs, outputs); + + b->cursor = nir_after_cf_list(&impl->body); + + if (!num_es_threads) + num_es_threads = nir_load_merged_wave_info_amd(b); + export_vertex_params_gfx11(b, NULL, num_es_threads, num_outputs, outputs, + options->vs_output_param_offset); + } + } else { + ac_nir_export_parameter(b, options->vs_output_param_offset, + shader->info.outputs_written, + shader->info.outputs_written_16bit, + state.outputs, state.outputs_16bit_lo, + state.outputs_16bit_hi); } } diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index 6ca949f..b05fa2e 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -1444,7 +1444,9 @@ void radv_lower_ngg(struct radv_device *device, struct radv_pipeline_stage *ngg_ options.gfx_level = device->physical_device->rad_info.gfx_level; options.max_workgroup_size = info->workgroup_size; options.wave_size = info->wave_size; + options.clipdist_enable_mask = info->outinfo.clip_dist_mask | info->outinfo.cull_dist_mask; options.vs_output_param_offset = info->outinfo.vs_output_param_offset; + options.has_param_exports = info->outinfo.param_exports; options.can_cull = nir->info.stage != MESA_SHADER_GEOMETRY && info->has_ngg_culling; options.disable_streamout = !device->physical_device->use_ngg_streamout; options.has_gen_prim_query = info->has_ngg_prim_query; @@ -1460,7 +1462,7 @@ void radv_lower_ngg(struct radv_device *device, struct radv_pipeline_stage *ngg_ options.num_vertices_per_primitive = num_vertices_per_prim; options.early_prim_export = info->has_ngg_early_prim_export; options.passthrough = info->is_ngg_passthrough; - options.primitive_id_location = info->outinfo.export_prim_id ? VARYING_SLOT_PRIMITIVE_ID : -1; + options.export_primitive_id = info->outinfo.export_prim_id; options.instance_rate_inputs = pl_key->vs.instance_rate_inputs << VERT_ATTRIB_GENERIC0; NIR_PASS_V(nir, ac_nir_lower_ngg_nogs, &options); diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 0c6e615..2d7d9cb 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -1604,6 +1604,10 @@ static void si_lower_ngg(struct si_shader *shader, nir_shader *nir) const union si_shader_key *key = &shader->key; assert(key->ge.as_ngg); + unsigned clipdist_mask = + (sel->info.clipdist_mask & ~key->ge.opt.kill_clip_distances) | + sel->info.culldist_mask; + ac_nir_lower_ngg_options options = { .family = sel->screen->info.family, .gfx_level = sel->screen->info.gfx_level, @@ -1612,6 +1616,10 @@ static void si_lower_ngg(struct si_shader *shader, nir_shader *nir) .can_cull = !!key->ge.opt.ngg_culling, .disable_streamout = key->ge.opt.remove_streamout, .vs_output_param_offset = shader->info.vs_output_param_offset, + .has_param_exports = shader->info.nr_param_exports, + .clipdist_enable_mask = clipdist_mask, + .kill_pointsize = key->ge.opt.kill_pointsize, + .force_vrs = sel->screen->options.vrs2x2, }; if (nir->info.stage == MESA_SHADER_VERTEX || @@ -1635,8 +1643,6 @@ static void si_lower_ngg(struct si_shader *shader, nir_shader *nir) unsigned clip_plane_enable = SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(key->ge.opt.ngg_culling); - unsigned clipdist_mask = - (sel->info.clipdist_mask & clip_plane_enable) | sel->info.culldist_mask; options.num_vertices_per_primitive = gfx10_ngg_get_vertices_per_prim(shader); options.early_prim_export = gfx10_ngg_export_prim_early(shader); @@ -1644,10 +1650,8 @@ static void si_lower_ngg(struct si_shader *shader, nir_shader *nir) options.use_edgeflags = gfx10_edgeflags_have_effect(shader); options.has_gen_prim_query = options.has_xfb_prim_query = sel->screen->use_ngg_streamout && !sel->info.base.vs.blit_sgprs_amd; - options.primitive_id_location = - key->ge.mono.u.vs_export_prim_id ? sel->info.num_outputs : -1; + options.export_primitive_id = key->ge.mono.u.vs_export_prim_id; options.instance_rate_inputs = instance_rate_inputs; - options.clipdist_enable_mask = clipdist_mask; options.user_clip_plane_enable_mask = clip_plane_enable; NIR_PASS_V(nir, ac_nir_lower_ngg_nogs, &options); -- 2.7.4