From 93fb84237f7a5c104076de4c8561c65a4f4fb51c Mon Sep 17 00:00:00 2001
From: Rhys Perry
Date: Fri, 14 Oct 2022 17:15:39 +0100
Subject: [PATCH] ac/nir: add ac_nir_lower_ngg_options

These signatures were getting ridiculous.

Signed-off-by: Rhys Perry
Reviewed-by: Samuel Pitoiset
Part-of:
---
 src/amd/common/ac_nir.h           |  50 +++++-----
 src/amd/common/ac_nir_lower_ngg.c | 198 ++++++++++++++++----------------------
 src/amd/vulkan/radv_shader.c      |  34 ++++---
 3 files changed, 132 insertions(+), 150 deletions(-)

diff --git a/src/amd/common/ac_nir.h b/src/amd/common/ac_nir.h
index 0f9808a..b7e5e45 100644
--- a/src/amd/common/ac_nir.h
+++ b/src/amd/common/ac_nir.h
@@ -119,32 +119,36 @@ bool
 ac_nir_lower_indirect_derefs(nir_shader *shader,
                              enum amd_gfx_level gfx_level);
 
+typedef struct {
+   enum radeon_family family;
+   enum amd_gfx_level gfx_level;
+
+   unsigned max_workgroup_size;
+   unsigned wave_size;
+   bool can_cull;
+   bool disable_streamout;
+
+   /* VS */
+   unsigned num_vertices_per_primitive;
+   bool early_prim_export;
+   bool passthrough;
+   bool use_edgeflags;
+   bool has_prim_query;
+   int primitive_id_location;
+   uint32_t instance_rate_inputs;
+   uint32_t clipdist_enable_mask;
+   uint32_t user_clip_plane_enable_mask;
+
+   /* GS */
+   unsigned gs_out_vtx_bytes;
+   bool has_xfb_query;
+} ac_nir_lower_ngg_options;
+
 void
-ac_nir_lower_ngg_nogs(nir_shader *shader,
-                      enum radeon_family family,
-                      unsigned num_vertices_per_primitive,
-                      unsigned max_workgroup_size,
-                      unsigned wave_size,
-                      bool can_cull,
-                      bool early_prim_export,
-                      bool passthrough,
-                      bool use_edgeflags,
-                      bool has_prim_query,
-                      bool disable_streamout,
-                      int primitive_id_location,
-                      uint32_t instance_rate_inputs,
-                      uint32_t clipdist_enable_mask,
-                      uint32_t user_clip_plane_enable_mask);
+ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *options);
 
 void
-ac_nir_lower_ngg_gs(nir_shader *shader,
-                    enum amd_gfx_level gfx_level,
-                    unsigned wave_size,
-                    unsigned max_workgroup_size,
-                    unsigned gs_out_vtx_bytes,
-                    bool has_xfb_query,
-                    bool can_cull,
-                    bool disable_streamout);
+ac_nir_lower_ngg_gs(nir_shader *shader, const ac_nir_lower_ngg_options *options);
 
 void
 ac_nir_lower_ngg_ms(nir_shader *shader,
diff --git a/src/amd/common/ac_nir_lower_ngg.c b/src/amd/common/ac_nir_lower_ngg.c
index 523a951..94f6a7a 100644
--- a/src/amd/common/ac_nir_lower_ngg.c
+++ b/src/amd/common/ac_nir_lower_ngg.c
@@ -47,6 +47,8 @@ typedef struct
 
 typedef struct
 {
+   const ac_nir_lower_ngg_options *options;
+
    nir_variable *position_value_var;
    nir_variable *prim_exp_arg_var;
    nir_variable *es_accepted_var;
@@ -57,24 +59,17 @@ typedef struct
 
    struct u_vector saved_uniforms;
 
-   bool passthrough;
    bool early_prim_export;
-   bool use_edgeflags;
-   bool has_prim_query;
    bool streamout_enabled;
    bool has_user_edgeflags;
-   unsigned wave_size;
    unsigned max_num_waves;
-   unsigned num_vertices_per_primitives;
    unsigned position_store_base;
-   int primitive_id_location;
 
    /* LDS params */
    unsigned pervertex_lds_bytes;
 
    uint64_t inputs_needed_by_pos;
    uint64_t inputs_needed_by_others;
-   uint32_t instance_rate_inputs;
 
    nir_instr *compact_arg_stores[4];
    nir_intrinsic_instr *overwrite_args;
@@ -82,8 +77,6 @@ typedef struct
 
    /* clip distance */
    nir_variable *clip_vertex_var;
    nir_variable *clipdist_neg_mask_var;
-   unsigned clipdist_enable_mask;
-   unsigned user_clip_plane_enable_mask;
    bool has_clipdist;
 } lower_ngg_nogs_state;
@@ -102,23 +95,21 @@ typedef struct
 
 typedef struct
 {
+   const ac_nir_lower_ngg_options *options;
+
    nir_function_impl *impl;
-   enum amd_gfx_level gfx_level;
    nir_variable *output_vars[VARYING_SLOT_MAX][4];
    nir_variable *current_clear_primflag_idx_var;
    int const_out_vtxcnt[4];
    int const_out_prmcnt[4];
-   unsigned wave_size;
    unsigned max_num_waves;
    unsigned num_vertices_per_primitive;
    nir_ssa_def *lds_addr_gs_out_vtx;
    nir_ssa_def *lds_addr_gs_scratch;
    unsigned lds_bytes_per_gs_out_vertex;
    unsigned lds_offs_primflags;
-   bool has_xfb_query;
    bool found_out_vtxcnt[4];
    bool output_compile_time_known;
-   bool can_cull;
    bool streamout_enabled;
    gs_output_info output_info[VARYING_SLOT_MAX];
 } lower_ngg_gs_state;
@@ -421,10 +412,10 @@ emit_pack_ngg_prim_exp_arg(nir_builder *b, unsigned num_vertices_per_primitives,
 static void
 ngg_nogs_init_vertex_indices_vars(nir_builder *b, nir_function_impl *impl, lower_ngg_nogs_state *st)
 {
-   for (unsigned v = 0; v < st->num_vertices_per_primitives; ++v) {
+   for (unsigned v = 0; v < st->options->num_vertices_per_primitive; ++v) {
       st->gs_vtx_indices_vars[v] = nir_local_variable_create(impl, glsl_uint_type(), "gs_vtx_addr");
 
-      nir_ssa_def *vtx = st->passthrough ?
+      nir_ssa_def *vtx = st->options->passthrough ?
         nir_ubfe(b, nir_load_packed_passthrough_primitive_amd(b),
                  nir_imm_int(b, 10 * v), nir_imm_int(b, 9)) :
         nir_ubfe(b, nir_load_gs_vertex_offset_amd(b, .base = v / 2u),
@@ -437,15 +428,16 @@ ngg_nogs_init_vertex_indices_vars(nir_builder *b, nir_function_impl *impl, lower
 
 static nir_ssa_def *
 emit_ngg_nogs_prim_exp_arg(nir_builder *b, lower_ngg_nogs_state *st)
 {
-   if (st->passthrough) {
+   if (st->options->passthrough) {
       return nir_load_packed_passthrough_primitive_amd(b);
    } else {
       nir_ssa_def *vtx_idx[3] = {0};
 
-      for (unsigned v = 0; v < st->num_vertices_per_primitives; ++v)
+      for (unsigned v = 0; v < st->options->num_vertices_per_primitive; ++v)
         vtx_idx[v] = nir_load_var(b, st->gs_vtx_indices_vars[v]);
 
-      return emit_pack_ngg_prim_exp_arg(b, st->num_vertices_per_primitives, vtx_idx, NULL, st->use_edgeflags);
+      return emit_pack_ngg_prim_exp_arg(b, st->options->num_vertices_per_primitive, vtx_idx, NULL,
+                                        st->options->use_edgeflags);
    }
 }
@@ -480,7 +472,7 @@ emit_ngg_nogs_prim_export(nir_builder *b, lower_ngg_nogs_state *st, nir_ssa_def
          edge_flag_offset = packed_location * 16;
       }
 
-      for (int i = 0; i < st->num_vertices_per_primitives; i++) {
+      for (int i = 0; i < st->options->num_vertices_per_primitive; i++) {
         nir_ssa_def *vtx_idx = nir_load_var(b, st->gs_vtx_indices_vars[i]);
         nir_ssa_def *addr = pervertex_lds_addr(b, vtx_idx, st->pervertex_lds_bytes);
         nir_ssa_def *edge = nir_load_shared(b, 1, 32, addr, .base = edge_flag_offset);
@@ -489,11 +481,12 @@
         arg = nir_iand(b, arg, mask);
    }
 
-   if (st->has_prim_query) {
+   if (st->options->has_prim_query) {
      nir_if *if_shader_query = nir_push_if(b, nir_load_prim_gen_query_enabled_amd(b));
      {
         /* Number of active GS threads. Each has 1 output primitive. */
-        nir_ssa_def *num_gs_threads = nir_bit_count(b, nir_ballot(b, 1, st->wave_size, nir_imm_bool(b, true)));
+        nir_ssa_def *num_gs_threads =
+           nir_bit_count(b, nir_ballot(b, 1, st->options->wave_size, nir_imm_bool(b, true)));
         /* Activate only 1 lane and add the number of primitives to query result. */
         nir_if *if_elected = nir_push_if(b, nir_elect(b, 1));
         {
@@ -523,12 +516,12 @@ emit_ngg_nogs_prim_id_store_shared(nir_builder *b, lower_ngg_nogs_state *st)
    * It will be exported as a per-vertex attribute.
    */
   nir_ssa_def *gs_vtx_indices[3];
-  for (unsigned i = 0; i < st->num_vertices_per_primitives; i++)
+  for (unsigned i = 0; i < st->options->num_vertices_per_primitive; i++)
      gs_vtx_indices[i] = nir_load_var(b, st->gs_vtx_indices_vars[i]);
 
   nir_ssa_def *provoking_vertex = nir_load_provoking_vtx_in_prim_amd(b);
   nir_ssa_def *provoking_vtx_idx = nir_select_from_ssa_def_array(
-     b, gs_vtx_indices, st->num_vertices_per_primitives, provoking_vertex);
+     b, gs_vtx_indices, st->options->num_vertices_per_primitive, provoking_vertex);
 
   nir_ssa_def *prim_id = nir_load_primitive_id(b);
   nir_ssa_def *addr = pervertex_lds_addr(b, provoking_vtx_idx, st->pervertex_lds_bytes);
@@ -563,7 +556,7 @@ emit_store_ngg_nogs_es_primitive_id(nir_builder *b, lower_ngg_nogs_state *st)
    };
 
    nir_store_output(b, prim_id, nir_imm_zero(b, 1, 32),
-                    .base = st->primitive_id_location,
+                    .base = st->options->primitive_id_location,
                     .src_type = nir_type_uint32, .io_semantics = io_sem);
 }
@@ -640,7 +633,7 @@ remove_culling_shader_output(nir_builder *b, nir_instr *instr, void *state)
       base += component;
 
       /* valid clipdist component mask */
-      unsigned mask = (s->clipdist_enable_mask >> base) & writemask;
+      unsigned mask = (s->options->clipdist_enable_mask >> base) & writemask;
       u_foreach_bit(i, mask) {
          add_clipdist_bit(b, nir_channel(b, store_val, i), base + i,
                           s->clipdist_neg_mask_var);
@@ -838,7 +831,7 @@ cleanup_culling_shader_after_dce(nir_shader *shader,
            uses_vs_instance_id = true;
            break;
         case nir_intrinsic_load_input:
-           if (state->instance_rate_inputs &
+           if (state->options->instance_rate_inputs &
               (1u << (nir_intrinsic_base(intrin) - VERT_ATTRIB_GENERIC0)))
              uses_vs_instance_id = true;
           else
@@ -978,7 +971,7 @@ compact_vertices_after_culling(nir_builder *b,
      nir_ssa_def *exporter_vtx_indices[3] = {0};
 
      /* Load the index of the ES threads that will export the current GS thread's vertices */
-     for (unsigned v = 0; v < nogs_state->num_vertices_per_primitives; ++v) {
+     for (unsigned v = 0; v < nogs_state->options->num_vertices_per_primitive; ++v) {
        nir_ssa_def *vtx_addr = nir_load_var(b, gs_vtxaddr_vars[v]);
        nir_ssa_def *exporter_vtx_idx = nir_load_shared(b, 1, 8, vtx_addr, .base = lds_es_exporter_tid);
        exporter_vtx_indices[v] = nir_u2u32(b, exporter_vtx_idx);
@@ -986,8 +979,8 @@
      }
 
      nir_ssa_def *prim_exp_arg =
-       emit_pack_ngg_prim_exp_arg(b, nogs_state->num_vertices_per_primitives,
-                                  exporter_vtx_indices, NULL, nogs_state->use_edgeflags);
+       emit_pack_ngg_prim_exp_arg(b, nogs_state->options->num_vertices_per_primitive,
+                                  exporter_vtx_indices, NULL, nogs_state->options->use_edgeflags);
      nir_store_var(b, prim_exp_arg_var, prim_exp_arg, 0x1u);
    }
    nir_pop_if(b, if_gs_accepted);
@@ -1260,7 +1253,7 @@ cull_primitive_accepted(nir_builder *b, void *state)
    nir_store_var(b, s->gs_accepted_var, nir_imm_true(b), 0x1u);
 
    /* Store the accepted state to LDS for ES threads */
-   for (unsigned vtx = 0; vtx < s->num_vertices_per_primitives; ++vtx)
+   for (unsigned vtx = 0; vtx < s->options->num_vertices_per_primitive; ++vtx)
      nir_store_shared(b, nir_imm_intN_t(b, 1, 8), s->vtx_addr[vtx], .base = lds_es_vertex_accepted);
 }
 
@@ -1269,7 +1262,7 @@ clipdist_culling_es_part(nir_builder *b, lower_ngg_nogs_state *nogs_state,
                          nir_ssa_def *es_vertex_lds_addr)
 {
    /* no gl_ClipDistance used but we have user-defined clip planes */
-   if (nogs_state->user_clip_plane_enable_mask && !nogs_state->has_clipdist) {
+   if (nogs_state->options->user_clip_plane_enable_mask && !nogs_state->has_clipdist) {
      /* use gl_ClipVertex if defined */
      nir_variable *clip_vertex_var =
        b->shader->info.outputs_written & BITFIELD64_BIT(VARYING_SLOT_CLIP_VERTEX) ?
@@ -1278,7 +1271,7 @@ clipdist_culling_es_part(nir_builder *b, lower_ngg_nogs_state *nogs_state,
 
      /* clip against user defined clip planes */
      for (unsigned i = 0; i < 8; i++) {
-       if (!(nogs_state->user_clip_plane_enable_mask & BITFIELD_BIT(i)))
+       if (!(nogs_state->options->user_clip_plane_enable_mask & BITFIELD_BIT(i)))
          continue;
 
       nir_ssa_def *plane = nir_load_user_clip_plane(b, .ucp_id = i);
@@ -1349,7 +1342,8 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c
      nir_local_variable_create(impl, glsl_uint_type(), "repacked_arg_3"),
    };
 
-   if (nogs_state->clipdist_enable_mask || nogs_state->user_clip_plane_enable_mask) {
+   if (nogs_state->options->clipdist_enable_mask ||
+       nogs_state->options->user_clip_plane_enable_mask) {
      nogs_state->clip_vertex_var =
        nir_local_variable_create(impl, glsl_vec4_type(), "clip_vertex");
      nogs_state->clipdist_neg_mask_var =
@@ -1450,20 +1444,21 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c
    {
      /* Load vertex indices from input VGPRs */
      nir_ssa_def *vtx_idx[3] = {0};
-     for (unsigned vertex = 0; vertex < nogs_state->num_vertices_per_primitives; ++vertex)
+     for (unsigned vertex = 0; vertex < nogs_state->options->num_vertices_per_primitive;
+          ++vertex)
        vtx_idx[vertex] = nir_load_var(b, nogs_state->gs_vtx_indices_vars[vertex]);
 
      nir_ssa_def *pos[3][4] = {0};
 
      /* Load W positions of vertices first because the culling code will use these first */
-     for (unsigned vtx = 0; vtx < nogs_state->num_vertices_per_primitives; ++vtx) {
+     for (unsigned vtx = 0; vtx < nogs_state->options->num_vertices_per_primitive; ++vtx) {
        nogs_state->vtx_addr[vtx] = pervertex_lds_addr(b, vtx_idx[vtx], pervertex_lds_bytes);
        pos[vtx][3] = nir_load_shared(b, 1, 32, nogs_state->vtx_addr[vtx], .base = lds_es_pos_w);
        nir_store_var(b, gs_vtxaddr_vars[vtx], nogs_state->vtx_addr[vtx], 0x1u);
      }
 
      /* Load the X/W, Y/W positions of vertices */
-     for (unsigned vtx = 0; vtx < nogs_state->num_vertices_per_primitives; ++vtx) {
+     for (unsigned vtx = 0; vtx < nogs_state->options->num_vertices_per_primitive; ++vtx) {
        nir_ssa_def *xy = nir_load_shared(b, 2, 32, nogs_state->vtx_addr[vtx], .base = lds_es_pos_x);
        pos[vtx][0] = nir_channel(b, xy, 0);
        pos[vtx][1] = nir_channel(b, xy, 1);
@@ -1472,7 +1467,7 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c
 
      nir_ssa_def *accepted_by_clipdist;
      if (nogs_state->has_clipdist) {
        nir_ssa_def *clipdist_neg_mask = nir_imm_intN_t(b, 0xff, 8);
-       for (unsigned vtx = 0; vtx < nogs_state->num_vertices_per_primitives; ++vtx) {
+       for (unsigned vtx = 0; vtx < nogs_state->options->num_vertices_per_primitive; ++vtx) {
          nir_ssa_def *mask = nir_load_shared(b, 1, 8, nogs_state->vtx_addr[vtx],
                                              .base = lds_es_clipdist_neg_mask);
@@ -1486,7 +1481,7 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c
 
      /* See if the current primitive is accepted */
      ac_nir_cull_primitive(b, accepted_by_clipdist, pos,
-                           nogs_state->num_vertices_per_primitives,
+                           nogs_state->options->num_vertices_per_primitive,
                            cull_primitive_accepted, nogs_state);
    }
    nir_pop_if(b, if_gs_thread);
@@ -1511,7 +1506,7 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c
    /* Repack the vertices that survived the culling.
     */
    wg_repack_result rep = repack_invocations_in_workgroup(b, es_accepted, lds_scratch_base,
                                                           nogs_state->max_num_waves,
-                                                          nogs_state->wave_size);
+                                                          nogs_state->options->wave_size);
    nir_ssa_def *num_live_vertices_in_workgroup = rep.num_repacked_invocations;
    nir_ssa_def *es_exporter_tid = rep.repacked_invocation_index;
@@ -1828,7 +1823,7 @@ ngg_nogs_build_streamout(nir_builder *b, lower_ngg_nogs_state *s)
    nir_ssa_def *so_buffer[4] = {0};
    nir_ssa_def *prim_stride[4] = {0};
    nir_ssa_def *tid_in_tg = nir_load_local_invocation_index(b);
-   ngg_build_streamout_buffer_info(b, info, s->has_prim_query,
+   ngg_build_streamout_buffer_info(b, info, s->options->has_prim_query,
                                    lds_scratch_base, tid_in_tg,
                                    gen_prim_per_stream, prim_stride,
                                    so_buffer, buffer_offsets,
@@ -1841,7 +1836,7 @@ ngg_nogs_build_streamout(nir_builder *b, lower_ngg_nogs_state *s)
      nir_ssa_def *num_vert_per_prim = nir_load_num_vertices_per_primitive_amd(b);
      nir_ssa_def *vtx_buffer_idx = nir_imul(b, tid_in_tg, num_vert_per_prim);
 
-     for (unsigned i = 0; i < s->num_vertices_per_primitives; i++) {
+     for (unsigned i = 0; i < s->options->num_vertices_per_primitive; i++) {
        nir_if *if_valid_vertex =
          nir_push_if(b, nir_ilt(b, nir_imm_int(b, i), num_vert_per_prim));
        {
@@ -1908,34 +1903,23 @@ ngg_nogs_get_pervertex_lds_size(gl_shader_stage stage,
 }
 
 void
-ac_nir_lower_ngg_nogs(nir_shader *shader,
-                      enum radeon_family family,
-                      unsigned num_vertices_per_primitives,
-                      unsigned max_workgroup_size,
-                      unsigned wave_size,
-                      bool can_cull,
-                      bool early_prim_export,
-                      bool passthrough,
-                      bool use_edgeflags,
-                      bool has_prim_query,
-                      bool disable_streamout,
-                      int primitive_id_location,
-                      uint32_t instance_rate_inputs,
-                      uint32_t clipdist_enable_mask,
-                      uint32_t user_clip_plane_enable_mask)
+ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *options)
 {
    nir_function_impl *impl = nir_shader_get_entrypoint(shader);
    assert(impl);
-   assert(max_workgroup_size && wave_size);
-   assert(!(can_cull && passthrough));
+   assert(options->max_workgroup_size && options->wave_size);
+   assert(!(options->can_cull && options->passthrough));
 
    nir_variable *position_value_var = nir_local_variable_create(impl, glsl_vec4_type(), "position_value");
    nir_variable *prim_exp_arg_var = nir_local_variable_create(impl, glsl_uint_type(), "prim_exp_arg");
-   nir_variable *es_accepted_var = can_cull ? nir_local_variable_create(impl, glsl_bool_type(), "es_accepted") : NULL;
-   nir_variable *gs_accepted_var = can_cull ? nir_local_variable_create(impl, glsl_bool_type(), "gs_accepted") : NULL;
-
-   bool streamout_enabled = shader->xfb_info && !disable_streamout;
-   bool has_user_edgeflags = use_edgeflags && (shader->info.outputs_written & VARYING_BIT_EDGE);
+   nir_variable *es_accepted_var =
+      options->can_cull ? nir_local_variable_create(impl, glsl_bool_type(), "es_accepted") : NULL;
+   nir_variable *gs_accepted_var =
+      options->can_cull ? nir_local_variable_create(impl, glsl_bool_type(), "gs_accepted") : NULL;
+
+   bool streamout_enabled = shader->xfb_info && !options->disable_streamout;
+   bool has_user_edgeflags =
+      options->use_edgeflags && (shader->info.outputs_written & VARYING_BIT_EDGE);
    /* streamout needs to be done before either prim or vertex export, because when there is
    * no param export, rasterization can start right after prim and vertex export,
    * which would leave streamout buffer writes unfinished.
@@ -1944,33 +1928,25 @@ ac_nir_lower_ngg_nogs(nir_shader *shader,
    * This is because edge flags are written by ES threads but they
    * are exported by GS threads as part of the primitive export.
    */
-   if (streamout_enabled || has_user_edgeflags)
-      early_prim_export = false;
+   bool early_prim_export =
+      options->early_prim_export && !(streamout_enabled || has_user_edgeflags);
 
    lower_ngg_nogs_state state = {
-      .passthrough = passthrough,
-      .primitive_id_location = primitive_id_location,
+      .options = options,
       .early_prim_export = early_prim_export,
-      .use_edgeflags = use_edgeflags,
-      .has_prim_query = has_prim_query,
      .streamout_enabled = streamout_enabled,
-      .num_vertices_per_primitives = num_vertices_per_primitives,
      .position_value_var = position_value_var,
      .prim_exp_arg_var = prim_exp_arg_var,
      .es_accepted_var = es_accepted_var,
      .gs_accepted_var = gs_accepted_var,
-      .max_num_waves = DIV_ROUND_UP(max_workgroup_size, wave_size),
-      .wave_size = wave_size,
-      .instance_rate_inputs = instance_rate_inputs,
-      .clipdist_enable_mask = clipdist_enable_mask,
-      .user_clip_plane_enable_mask = user_clip_plane_enable_mask,
+      .max_num_waves = DIV_ROUND_UP(options->max_workgroup_size, options->wave_size),
      .has_user_edgeflags = has_user_edgeflags,
    };
 
    const bool need_prim_id_store_shared =
-      primitive_id_location >= 0 && shader->info.stage == MESA_SHADER_VERTEX;
+      options->primitive_id_location >= 0 && shader->info.stage == MESA_SHADER_VERTEX;
 
-   if (primitive_id_location >= 0) {
+   if (options->primitive_id_location >= 0) {
      nir_variable *prim_id_var = nir_variable_create(shader, nir_var_shader_out, glsl_uint_type(), "ngg_prim_id");
      prim_id_var->data.location = VARYING_SLOT_PRIMITIVE_ID;
      prim_id_var->data.driver_location = VARYING_SLOT_PRIMITIVE_ID;
@@ -1982,7 +1958,7 @@ ac_nir_lower_ngg_nogs(nir_shader *shader,
    nir_builder *b = &builder; /* This is to avoid the & */
    nir_builder_init(b, impl);
 
-   if (can_cull) {
+   if (options->can_cull) {
      /* We need divergence info for culling shaders. */
      nir_divergence_analysis(shader);
      analyze_shader_before_culling(shader, &state);
@@ -1995,9 +1971,9 @@ ac_nir_lower_ngg_nogs(nir_shader *shader,
    ngg_nogs_init_vertex_indices_vars(b, impl, &state);
 
-   if (!can_cull) {
+   if (!options->can_cull) {
      /* Newer chips can use PRIMGEN_PASSTHRU_NO_MSG to skip gs_alloc_req for NGG passthrough. */
-     if (!(passthrough && family >= CHIP_NAVI23)) {
+     if (!(options->passthrough && options->family >= CHIP_NAVI23)) {
        /* Allocate export space on wave 0 - confirm to the HW that we want to use all possible space */
        nir_if *if_wave_0 = nir_push_if(b, nir_ieq(b, nir_load_subgroup_id(b),
                                                   nir_imm_int(b, 0)));
        {
@@ -2034,7 +2010,7 @@ ac_nir_lower_ngg_nogs(nir_shader *shader,
      ngg_nogs_get_pervertex_lds_size(shader->info.stage,
                                      shader->num_outputs,
                                      state.streamout_enabled,
-                                     primitive_id_location >= 0,
+                                     options->primitive_id_location >= 0,
                                      state.has_user_edgeflags);
 
    if (need_prim_id_store_shared) {
@@ -2046,7 +2022,8 @@ ac_nir_lower_ngg_nogs(nir_shader *shader,
    }
 
    nir_intrinsic_instr *export_vertex_instr;
-   nir_ssa_def *es_thread = can_cull ? nir_load_var(b, es_accepted_var) : nir_has_input_vertex_amd(b);
+   nir_ssa_def *es_thread =
+      options->can_cull ? nir_load_var(b, es_accepted_var) : nir_has_input_vertex_amd(b);
 
    nir_if *if_es_thread = nir_push_if(b, es_thread);
    {
@@ -2054,7 +2031,7 @@ ac_nir_lower_ngg_nogs(nir_shader *shader,
      nir_cf_reinsert(&extracted, b->cursor);
      b->cursor = nir_after_cf_list(&if_es_thread->then_list);
 
-     if (state.primitive_id_location >= 0)
+     if (options->primitive_id_location >= 0)
        emit_store_ngg_nogs_es_primitive_id(b, &state);
 
      /* Export all vertex attributes (including the primitive ID) */
@@ -2064,7 +2041,7 @@ ac_nir_lower_ngg_nogs(nir_shader *shader,
 
    if (state.streamout_enabled) {
      /* TODO: support culling after streamout. */
-     assert(!can_cull);
+     assert(!options->can_cull);
 
      ngg_nogs_build_streamout(b, &state);
    }
@@ -2080,7 +2057,7 @@ ac_nir_lower_ngg_nogs(nir_shader *shader,
      emit_ngg_nogs_prim_export(b, &state, nir_load_var(b, prim_exp_arg_var));
    }
 
-   if (can_cull) {
+   if (options->can_cull) {
      /* Replace uniforms. */
      apply_reusable_variables(b, &state);
 
@@ -2109,7 +2086,7 @@ ac_nir_lower_ngg_nogs(nir_shader *shader,
    nir_lower_alu_to_scalar(shader, NULL, NULL);
    nir_lower_phis_to_scalar(shader, true);
 
-   if (can_cull) {
+   if (options->can_cull) {
      /* It's beneficial to redo these opts after splitting the shader. */
      nir_opt_sink(shader, nir_move_load_input | nir_move_const_undef | nir_move_copies);
      nir_opt_move(shader, nir_move_load_input | nir_move_copies | nir_move_const_undef);
@@ -2122,7 +2099,7 @@ ac_nir_lower_ngg_nogs(nir_shader *shader,
      NIR_PASS(progress, shader, nir_opt_dce);
      NIR_PASS(progress, shader, nir_opt_dead_cf);
 
-     if (can_cull)
+     if (options->can_cull)
        progress |= cleanup_culling_shader_after_dce(shader, b->impl, &state);
    } while (progress);
 }
@@ -2211,8 +2188,8 @@ ngg_gs_clear_primflags(nir_builder *b, nir_ssa_def *num_vertices, unsigned strea
 static void
 ngg_gs_shader_query(nir_builder *b, nir_intrinsic_instr *intrin, lower_ngg_gs_state *s)
 {
-   bool has_xfb_query = s->has_xfb_query;
-   bool has_pipeline_stats_query = s->gfx_level < GFX11;
+   bool has_xfb_query = s->options->has_xfb_query;
+   bool has_pipeline_stats_query = s->options->gfx_level < GFX11;
 
    nir_ssa_def *pipeline_query_enabled = NULL;
    nir_ssa_def *prim_gen_query_enabled = NULL;
@@ -2244,7 +2221,8 @@ ngg_gs_shader_query(nir_builder *b, nir_intrinsic_instr *intrin, lower_ngg_gs_st
      unsigned gs_vtx_cnt = nir_src_as_uint(intrin->src[0]);
      unsigned gs_prm_cnt = nir_src_as_uint(intrin->src[1]);
      unsigned total_prm_cnt = gs_vtx_cnt - gs_prm_cnt * (s->num_vertices_per_primitive - 1u);
-     nir_ssa_def *num_threads = nir_bit_count(b, nir_ballot(b, 1, s->wave_size, nir_imm_bool(b, true)));
+     nir_ssa_def *num_threads =
+        nir_bit_count(b, nir_ballot(b, 1, s->options->wave_size, nir_imm_bool(b, true)));
      num_prims_in_wave = nir_imul_imm(b, num_threads, total_prm_cnt);
    } else {
      nir_ssa_def *gs_vtx_cnt = intrin->src[0].ssa;
@@ -2427,9 +2405,10 @@ lower_ngg_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intri
    * - bit 2: whether vertex is live (if culling is enabled: set after culling, otherwise always 1)
    */
 
-   nir_ssa_def *vertex_live_flag = !stream && s->can_cull ?
-      nir_ishl_imm(b, nir_b2i32(b, nir_inot(b, nir_load_cull_any_enabled_amd(b))), 2) :
-      nir_imm_int(b, 0b100);
+   nir_ssa_def *vertex_live_flag =
+      !stream && s->options->can_cull
+         ? nir_ishl_imm(b, nir_b2i32(b, nir_inot(b, nir_load_cull_any_enabled_amd(b))), 2)
+         : nir_imm_int(b, 0b100);
 
    nir_ssa_def *completes_prim = nir_ige(b, current_vtx_per_prim, nir_imm_int(b, s->num_vertices_per_primitive - 1));
    nir_ssa_def *complete_flag = nir_b2i32(b, completes_prim);
@@ -2816,7 +2795,7 @@ ngg_gs_build_streamout(nir_builder *b, lower_ngg_gs_state *st)
       */
      wg_repack_result rep =
        repack_invocations_in_workgroup(b, prim_live[stream], scratch_base,
-                                       st->max_num_waves, st->wave_size);
+                                       st->max_num_waves, st->options->wave_size);
 
     /* nir_intrinsic_set_vertex_and_primitive_count can also get the primitive count of
      * the current wave, but we still need LDS to sum all waves' counts to get the workgroup count.
@@ -2837,7 +2816,7 @@ ngg_gs_build_streamout(nir_builder *b, lower_ngg_gs_state *st)
    nir_ssa_def *buffer_offsets[4] = {0};
    nir_ssa_def *so_buffer[4] = {0};
    nir_ssa_def *prim_stride[4] = {0};
-   ngg_build_streamout_buffer_info(b, info, st->has_xfb_query,
+   ngg_build_streamout_buffer_info(b, info, st->options->has_xfb_query,
                                    st->lds_addr_gs_scratch, tid_in_tg, gen_prim,
                                    prim_stride, so_buffer, buffer_offsets,
                                    emit_prim);
@@ -2906,7 +2885,7 @@ ngg_gs_finale(nir_builder *b, lower_ngg_gs_state *s)
    }
 
    /* cull primitives */
-   if (s->can_cull) {
+   if (s->options->can_cull) {
      nir_if *if_cull_en = nir_push_if(b, nir_load_cull_any_enabled_amd(b));
 
      /* culling code will update the primflag */
@@ -2925,7 +2904,8 @@ ngg_gs_finale(nir_builder *b, lower_ngg_gs_state *s)
    * To ensure this, we need to repack invocations that have a live vertex.
    */
    nir_ssa_def *vertex_live = nir_ine(b, out_vtx_primflag_0, nir_imm_zero(b, 1, out_vtx_primflag_0->bit_size));
-   wg_repack_result rep = repack_invocations_in_workgroup(b, vertex_live, s->lds_addr_gs_scratch, s->max_num_waves, s->wave_size);
+   wg_repack_result rep = repack_invocations_in_workgroup(b, vertex_live, s->lds_addr_gs_scratch,
+                                                          s->max_num_waves, s->options->wave_size);
 
    nir_ssa_def *workgroup_num_vertices = rep.num_repacked_invocations;
    nir_ssa_def *exporter_tid_in_tg = rep.repacked_invocation_index;
@@ -2951,31 +2931,21 @@ ngg_gs_finale(nir_builder *b, lower_ngg_gs_state *s)
 }
 
 void
-ac_nir_lower_ngg_gs(nir_shader *shader,
-                    enum amd_gfx_level gfx_level,
-                    unsigned wave_size,
-                    unsigned max_workgroup_size,
-                    unsigned gs_out_vtx_bytes,
-                    bool has_xfb_query,
-                    bool can_cull,
-                    bool disable_streamout)
+ac_nir_lower_ngg_gs(nir_shader *shader, const ac_nir_lower_ngg_options *options)
 {
    nir_function_impl *impl = nir_shader_get_entrypoint(shader);
    assert(impl);
 
    lower_ngg_gs_state state = {
+      .options = options,
      .impl = impl,
-      .gfx_level = gfx_level,
-      .max_num_waves = DIV_ROUND_UP(max_workgroup_size, wave_size),
-      .wave_size = wave_size,
-      .lds_offs_primflags = gs_out_vtx_bytes,
-      .lds_bytes_per_gs_out_vertex = gs_out_vtx_bytes + 4u,
-      .can_cull = can_cull,
-      .streamout_enabled = shader->xfb_info && !disable_streamout,
-      .has_xfb_query = has_xfb_query,
+      .max_num_waves = DIV_ROUND_UP(options->max_workgroup_size, options->wave_size),
+      .lds_offs_primflags = options->gs_out_vtx_bytes,
+      .lds_bytes_per_gs_out_vertex = options->gs_out_vtx_bytes + 4u,
+      .streamout_enabled = shader->xfb_info && !options->disable_streamout,
    };
 
-   if (!can_cull) {
+   if (!options->can_cull) {
      nir_gs_count_vertices_and_primitives(shader, state.const_out_vtxcnt,
                                           state.const_out_prmcnt, 4u);
      state.output_compile_time_known =
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index fe852fd..e31c128 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -1426,31 +1426,39 @@ void radv_lower_ngg(struct radv_device *device, struct radv_pipeline_stage *ngg_
 
    setup_ngg_lds_layout(nir, &ngg_stage->info, max_vtx_in);
 
+   ac_nir_lower_ngg_options options = {0};
+   options.family = device->physical_device->rad_info.family;
+   options.gfx_level = device->physical_device->rad_info.gfx_level;
+   options.max_workgroup_size = info->workgroup_size;
+   options.wave_size = info->wave_size;
+   options.can_cull = nir->info.stage != MESA_SHADER_GEOMETRY && info->has_ngg_culling;
+   options.disable_streamout = true;
+
    if (nir->info.stage == MESA_SHADER_VERTEX || nir->info.stage == MESA_SHADER_TESS_EVAL) {
-      bool export_prim_id = info->outinfo.export_prim_id;
-
      assert(info->is_ngg);
 
      if (info->has_ngg_culling)
        radv_optimize_nir_algebraic(nir, false);
 
-      NIR_PASS_V(nir, ac_nir_lower_ngg_nogs,
-                 device->physical_device->rad_info.family, num_vertices_per_prim,
-                 info->workgroup_size, info->wave_size, info->has_ngg_culling,
-                 info->has_ngg_early_prim_export, info->is_ngg_passthrough,
-                 false, pl_key->primitives_generated_query,
-                 true, export_prim_id ? VARYING_SLOT_PRIMITIVE_ID : -1,
-                 pl_key->vs.instance_rate_inputs, 0, 0);
+      options.num_vertices_per_primitive = num_vertices_per_prim;
+      options.early_prim_export = info->has_ngg_early_prim_export;
+      options.passthrough = info->is_ngg_passthrough;
+      options.has_prim_query = pl_key->primitives_generated_query;
+      options.primitive_id_location = info->outinfo.export_prim_id ? VARYING_SLOT_PRIMITIVE_ID : -1;
+      options.instance_rate_inputs = pl_key->vs.instance_rate_inputs;
+
+      NIR_PASS_V(nir, ac_nir_lower_ngg_nogs, &options);
 
      /* Increase ESGS ring size so the LLVM binary contains the correct LDS size. */
      ngg_stage->info.ngg_info.esgs_ring_size = nir->info.shared_size;
    } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
      assert(info->is_ngg);
-      NIR_PASS_V(nir, ac_nir_lower_ngg_gs,
-                 device->physical_device->rad_info.gfx_level,
-                 info->wave_size, info->workgroup_size,
-                 info->gs.gsvs_vertex_size, true, false, true);
+
+      options.gs_out_vtx_bytes = info->gs.gsvs_vertex_size;
+      options.has_xfb_query = true;
+
+      NIR_PASS_V(nir, ac_nir_lower_ngg_gs, &options);
    } else if (nir->info.stage == MESA_SHADER_MESH) {
      bool scratch_ring = false;
      NIR_PASS_V(nir, ac_nir_lower_ngg_ms, &scratch_ring, info->wave_size,
                 pl_key->has_multiview_view_index);
-- 
2.7.4
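
For callers porting to the new interface, the pattern is the one in the radv_shader.c hunk above: zero-initialize the struct, set the shared fields once, then fill in only the stage-specific fields before invoking the pass. Below is a minimal caller-side sketch in C; the helper name and the incoming parameter values are illustrative assumptions, and only ac_nir_lower_ngg_options and the two entry points come from this patch.

   #include "ac_nir.h" /* ac_nir_lower_ngg_options, ac_nir_lower_ngg_nogs/_gs */

   /* Hypothetical caller, mirroring the radv_shader.c hunk above. Fields
    * left unset stay 0/false thanks to the {0} initializer. */
   static void
   lower_ngg_for_stage(nir_shader *nir, enum radeon_family family,
                       enum amd_gfx_level gfx_level, unsigned workgroup_size,
                       unsigned wave_size, unsigned gs_out_vtx_bytes)
   {
      ac_nir_lower_ngg_options options = {0};
      options.family = family;
      options.gfx_level = gfx_level;
      options.max_workgroup_size = workgroup_size;
      options.wave_size = wave_size;
      options.disable_streamout = true;

      if (nir->info.stage == MESA_SHADER_VERTEX) {
         /* VS-only fields */
         options.num_vertices_per_primitive = 3; /* assumed: triangle topology */
         options.primitive_id_location = -1;     /* negative = no primitive ID export */
         ac_nir_lower_ngg_nogs(nir, &options);
      } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
         /* GS-only fields */
         options.gs_out_vtx_bytes = gs_out_vtx_bytes;
         options.has_xfb_query = true;
         ac_nir_lower_ngg_gs(nir, &options);
      }
   }

Because unset fields default to zero, new options can later be added to the struct without touching existing call sites, which is the point of replacing the long positional argument lists.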