From bb127c2130c550b222541aaaac3da377083e8e56 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Timur=20Krist=C3=B3f?= Date: Thu, 22 Apr 2021 14:44:28 +0200 Subject: [PATCH] radv: Use new NIR lowering of NGG GS when ACO is used. MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Signed-off-by: Timur Kristóf Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_instruction_selection.cpp | 679 +-------------------- src/amd/compiler/aco_instruction_selection.h | 8 - .../compiler/aco_instruction_selection_setup.cpp | 23 +- src/amd/vulkan/radv_shader.c | 11 +- 4 files changed, 16 insertions(+), 705 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 089addd..bb60ca9 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -4290,44 +4290,6 @@ Temp thread_id_in_threadgroup(isel_context *ctx) return bld.vadd32(bld.def(v1), Operand(num_pre_threads), Operand(tid_in_wave)); } -Temp wave_count_in_threadgroup(isel_context *ctx) -{ - Builder bld(ctx->program, ctx->block); - return bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), - get_arg(ctx, ctx->args->ac.merged_wave_info), Operand(28u | (4u << 16))); -} - -Temp ngg_gs_vertex_lds_addr(isel_context *ctx, Temp vertex_idx) -{ - Builder bld(ctx->program, ctx->block); - unsigned write_stride_2exp = ffs(MAX2(ctx->shader->info.gs.vertices_out, 1)) - 1; - - /* gs_max_out_vertices = 2^(write_stride_2exp) * some odd number */ - if (write_stride_2exp) { - Temp row = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(5u), vertex_idx); - Temp swizzle = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand((1u << write_stride_2exp) - 1), row); - vertex_idx = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), vertex_idx, swizzle); - } - - Temp vertex_idx_bytes = bld.v_mul24_imm(bld.def(v1), vertex_idx, ctx->ngg_gs_emit_vtx_bytes); - return bld.vadd32(bld.def(v1), vertex_idx_bytes, Operand(ctx->ngg_gs_emit_addr)); -} - -Temp ngg_gs_emit_vertex_lds_addr(isel_context *ctx, Temp emit_vertex_idx) -{ - /* Should be used by GS threads only (not by the NGG GS epilogue). - * Returns the LDS address of the given vertex index as emitted by the current GS thread. - */ - - Builder bld(ctx->program, ctx->block); - - Temp thread_id_in_tg = thread_id_in_threadgroup(ctx); - Temp thread_vertices_addr = bld.v_mul24_imm(bld.def(v1), thread_id_in_tg, ctx->shader->info.gs.vertices_out); - Temp vertex_idx = bld.vadd32(bld.def(v1), thread_vertices_addr, emit_vertex_idx); - - return ngg_gs_vertex_lds_addr(ctx, vertex_idx); -} - Temp get_tess_rel_patch_id(isel_context *ctx) { Builder bld(ctx->program, ctx->block); @@ -6988,115 +6950,6 @@ void visit_load_sample_mask_in(isel_context *ctx, nir_intrinsic_instr *instr) { } } -unsigned gs_outprim_vertices(unsigned outprim) -{ - switch (outprim) { - case 0: /* GL_POINTS */ - return 1; - case 3: /* GL_LINE_STRIP */ - return 2; - case 5: /* GL_TRIANGLE_STRIP */ - return 3; - default: - unreachable("Unsupported GS output primitive type."); - } -} - -void ngg_visit_emit_vertex_with_counter(isel_context *ctx, nir_intrinsic_instr *instr) -{ - Builder bld(ctx->program, ctx->block); - Temp emit_vertex_idx = get_ssa_temp(ctx, instr->src[0].ssa); - Temp emit_vertex_addr = ngg_gs_emit_vertex_lds_addr(ctx, emit_vertex_idx); - unsigned stream = nir_intrinsic_stream_id(instr); - unsigned out_idx = 0; - - for (unsigned i = 0; i <= VARYING_SLOT_VAR31; i++) { - if (ctx->program->info->gs.output_streams[i] != stream) { - continue; - } else if (!ctx->outputs.mask[i] && ctx->program->info->gs.output_usage_mask[i]) { - /* The GS can write this output, but it's empty for the current vertex. */ - out_idx++; - continue; - } - - uint32_t wrmask = ctx->program->info->gs.output_usage_mask[i] & - ctx->outputs.mask[i]; - - /* Clear output for the next vertex. */ - ctx->outputs.mask[i] = 0; - - if (!wrmask) - continue; - - for (unsigned j = 0; j < 4; j++) { - if (wrmask & (1 << j)) { - Temp elem = ctx->outputs.temps[i * 4u + j]; - store_lds(ctx, elem.bytes(), elem, 0x1u, emit_vertex_addr, out_idx * 4u, 4u); - } - - out_idx++; - } - } - - /* Calculate per-vertex primitive flags based on current and total vertex count per primitive: - * bit 0: whether this vertex finishes a primitive - * bit 1: whether the primitive is odd (if we are emitting triangle strips, otherwise always 0) - * bit 2: always 1 (so that we can use it for determining vertex liveness) - */ - unsigned total_vtx_per_prim = gs_outprim_vertices(ctx->shader->info.gs.output_primitive); - bool calc_odd = stream == 0 && total_vtx_per_prim == 3; - Temp prim_flag; - - if (nir_src_is_const(instr->src[1])) { - uint8_t current_vtx_per_prim = nir_src_as_uint(instr->src[1]); - uint8_t completes_prim = (current_vtx_per_prim >= (total_vtx_per_prim - 1)) ? 1 : 0; - uint8_t odd = (uint8_t)calc_odd & current_vtx_per_prim; - uint8_t flag = completes_prim | (odd << 1) | (1 << 2); - prim_flag = bld.copy(bld.def(v1b), Operand(flag)); - } else if (!instr->src[1].ssa->divergent) { - Temp current_vtx_per_prim = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa)); - Temp completes_prim = bld.sopc(aco_opcode::s_cmp_le_u32, bld.def(s1, scc), Operand(total_vtx_per_prim - 1), current_vtx_per_prim); - prim_flag = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(0b101u), Operand(0b100u), bld.scc(completes_prim)); - if (calc_odd) { - Temp odd = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), current_vtx_per_prim, Operand(0u)); - prim_flag = bld.sop2(aco_opcode::s_lshl1_add_u32, bld.def(s1), bld.def(s1, scc), odd, prim_flag); - } - } else { - Temp current_vtx_per_prim = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)); - Temp completes_prim = bld.vopc(aco_opcode::v_cmp_le_u32, bld.hint_vcc(bld.def(bld.lm)), Operand(total_vtx_per_prim - 1), current_vtx_per_prim); - prim_flag = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0b100u), Operand(0b101u), Operand(completes_prim)); - if (calc_odd) { - Temp odd = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), current_vtx_per_prim); - prim_flag = bld.vop3(aco_opcode::v_lshl_or_b32, bld.def(v1), odd, Operand(1u), prim_flag); - } - } - - /* Store the per-vertex primitive flags at the end of the vertex data */ - prim_flag = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), as_vgpr(ctx, prim_flag), Operand(0u)); - unsigned primflag_offset = ctx->ngg_gs_primflags_offset + stream; - store_lds(ctx, 1, prim_flag, 1u, emit_vertex_addr, primflag_offset, 1); -} - -void ngg_gs_clear_primflags(isel_context *ctx, Temp vtx_cnt, unsigned stream); -void ngg_gs_write_shader_query(isel_context *ctx, nir_intrinsic_instr *instr); - -void ngg_visit_set_vertex_and_primitive_count(isel_context *ctx, nir_intrinsic_instr *instr) -{ - unsigned stream = nir_intrinsic_stream_id(instr); - if (stream > 0 && !ctx->args->shader_info->gs.num_stream_output_components[stream]) - return; - - ctx->ngg_gs_known_vtxcnt[stream] = true; - - /* Clear the primitive flags of non-emitted GS vertices. */ - if (!nir_src_is_const(instr->src[0]) || nir_src_as_uint(instr->src[0]) < ctx->shader->info.gs.vertices_out) { - Temp vtx_cnt = get_ssa_temp(ctx, instr->src[0].ssa); - ngg_gs_clear_primflags(ctx, vtx_cnt, stream); - } - - ngg_gs_write_shader_query(ctx, instr); -} - void visit_emit_vertex_with_counter(isel_context *ctx, nir_intrinsic_instr *instr) { Builder bld(ctx->program, ctx->block); @@ -8521,10 +8374,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) break; } case nir_intrinsic_emit_vertex_with_counter: { - if (ctx->stage.hw == HWStage::NGG) - ngg_visit_emit_vertex_with_counter(ctx, instr); - else - visit_emit_vertex_with_counter(ctx, instr); + assert(ctx->stage.hw == HWStage::GS); + visit_emit_vertex_with_counter(ctx, instr); break; } case nir_intrinsic_end_primitive_with_counter: { @@ -8535,8 +8386,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) break; } case nir_intrinsic_set_vertex_and_primitive_count: { - if (ctx->stage.hw == HWStage::NGG) - ngg_visit_set_vertex_and_primitive_count(ctx, instr); + assert(ctx->stage.hw == HWStage::GS); /* unused in the legacy pipeline, the HW keeps track of this for us */ break; } @@ -11193,20 +11043,6 @@ Temp merged_wave_info_to_mask(isel_context *ctx, unsigned i) return lanecount_to_mask(ctx, count); } -Temp ngg_max_vertex_count(isel_context *ctx) -{ - Builder bld(ctx->program, ctx->block); - return bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), - get_arg(ctx, ctx->args->ac.gs_tg_info), Operand(12u | (9u << 16u))); -} - -Temp ngg_max_primitive_count(isel_context *ctx) -{ - Builder bld(ctx->program, ctx->block); - return bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), - get_arg(ctx, ctx->args->ac.gs_tg_info), Operand(22u | (9u << 16u))); -} - void ngg_emit_sendmsg_gs_alloc_req(isel_context *ctx, Temp vtx_cnt, Temp prm_cnt) { assert(vtx_cnt.id() && prm_cnt.id()); @@ -11214,7 +11050,7 @@ void ngg_emit_sendmsg_gs_alloc_req(isel_context *ctx, Temp vtx_cnt, Temp prm_cnt Builder bld(ctx->program, ctx->block); Temp prm_cnt_0; - if (ctx->program->chip_class == GFX10 && ctx->stage.has(SWStage::GS) && ctx->ngg_gs_const_prmcnt[0] <= 0) { + if (ctx->program->chip_class == GFX10 && ctx->stage.has(SWStage::GS)) { /* Navi 1x workaround: make sure to always export at least 1 vertex and triangle */ prm_cnt_0 = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), prm_cnt, Operand(0u)); prm_cnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(1u), prm_cnt, bld.scc(prm_cnt_0)); @@ -11254,495 +11090,6 @@ void ngg_emit_sendmsg_gs_alloc_req(isel_context *ctx, Temp vtx_cnt, Temp prm_cnt } } -void ngg_emit_wave0_sendmsg_gs_alloc_req(isel_context *ctx, Temp vtx_cnt = Temp(), Temp prm_cnt = Temp()) -{ - Builder bld(ctx->program, ctx->block); - - /* Get the id of the current wave within the threadgroup (workgroup) */ - Builder::Result wave_id_in_tg = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), - get_arg(ctx, ctx->args->ac.merged_wave_info), Operand(24u | (4u << 16))); - - /* Execute the following code only on the first wave (wave id 0), - * use the SCC def to tell if the wave id is zero or not. - */ - Temp waveid_as_cond = wave_id_in_tg.def(1).getTemp(); - if_context ic; - begin_uniform_if_then(ctx, &ic, waveid_as_cond); - begin_uniform_if_else(ctx, &ic); - bld.reset(ctx->block); - - /* VS/TES: we infer the vertex and primitive count from arguments - * GS: the caller needs to supply them - */ - assert(ctx->stage.has(SWStage::GS) - ? (vtx_cnt.id() && prm_cnt.id()) - : (!vtx_cnt.id() && !prm_cnt.id())); - - /* Number of vertices output by VS/TES */ - if (vtx_cnt.id() == 0) - vtx_cnt = ngg_max_vertex_count(ctx); - - /* Number of primitives output by VS/TES */ - if (prm_cnt.id() == 0) - prm_cnt = ngg_max_primitive_count(ctx); - - ngg_emit_sendmsg_gs_alloc_req(ctx, vtx_cnt, prm_cnt); - - end_uniform_if(ctx, &ic); -} - -Temp ngg_pack_prim_exp_arg(isel_context *ctx, unsigned num_vertices, const Temp vtxindex[], const Temp is_null) -{ - Builder bld(ctx->program, ctx->block); - - Temp tmp; - Temp gs_invocation_id; - - if (ctx->stage == vertex_ngg) - gs_invocation_id = get_arg(ctx, ctx->args->ac.gs_invocation_id); - - for (unsigned i = 0; i < num_vertices; ++i) { - assert(vtxindex[i].id()); - - if (i) - tmp = bld.vop3(aco_opcode::v_lshl_or_b32, bld.def(v1), vtxindex[i], Operand(10u * i), tmp); - else - tmp = vtxindex[i]; - - /* The initial edge flag is always false in tess eval shaders. */ - if (ctx->stage == vertex_ngg) { - Temp edgeflag = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), gs_invocation_id, Operand(8u + i), Operand(1u)); - tmp = bld.vop3(aco_opcode::v_lshl_or_b32, bld.def(v1), edgeflag, Operand(10u * i + 9u), tmp); - } - } - - if (is_null.id()) - tmp = bld.vop3(aco_opcode::v_lshl_or_b32, bld.def(v1), is_null, Operand(31u), tmp); - - return tmp; -} - -void ngg_emit_prim_export(isel_context *ctx, unsigned num_vertices_per_primitive, const Temp vtxindex[], const Temp is_null = Temp()) -{ - Builder bld(ctx->program, ctx->block); - Temp prim_exp_arg; - - if (!ctx->stage.has(SWStage::GS) && ctx->args->options->key.vs_common_out.as_ngg_passthrough) - prim_exp_arg = get_arg(ctx, ctx->args->ac.gs_vtx_offset[0]); - else - prim_exp_arg = ngg_pack_prim_exp_arg(ctx, num_vertices_per_primitive, vtxindex, is_null); - - bld.exp(aco_opcode::exp, prim_exp_arg, Operand(v1), Operand(v1), Operand(v1), - 1 /* enabled mask */, V_008DFC_SQ_EXP_PRIM /* dest */, - false /* compressed */, true/* done */, false /* valid mask */); -} - -std::pair ngg_gs_workgroup_reduce_and_scan(isel_context *ctx, Temp src_mask) -{ - /* Workgroup scan for NGG GS. - * This performs a reduction along with an exclusive scan addition accross the workgroup. - * Assumes that all lanes are enabled (exec = -1) where this is emitted. - * - * Input: (1) per-lane bool - * -- 1 if the lane has a live/valid vertex, 0 otherwise - * Output: (1) result of a reduction over the entire workgroup, - * -- the total number of vertices emitted by the workgroup - * (2) result of an exclusive scan over the entire workgroup - * -- used for vertex compaction, in order to determine - * which lane should export the current lane's vertex - */ - - Builder bld(ctx->program, ctx->block); - assert(src_mask.regClass() == bld.lm); - - /* Subgroup reduction and exclusive scan on the per-lane boolean. */ - Temp sg_reduction = bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), src_mask); - Temp sg_excl = emit_mbcnt(ctx, bld.tmp(v1), Operand(src_mask)); - - if (ctx->program->workgroup_size <= ctx->program->wave_size) - return std::make_pair(sg_reduction, sg_excl); - - if_context ic; - - /* Determine if the current lane is the first. */ - Temp is_first_lane = bld.copy(bld.def(bld.lm), Operand(1u, ctx->program->wave_size == 64)); - Temp wave_id_in_tg = wave_id_in_threadgroup(ctx); - begin_divergent_if_then(ctx, &ic, is_first_lane); - bld.reset(ctx->block); - - /* The first lane of each wave stores the result of its subgroup reduction to LDS (NGG scratch). */ - Temp wave_id_in_tg_lds_addr = bld.vop2_e64(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), wave_id_in_tg); - store_lds(ctx, 4u, as_vgpr(ctx, sg_reduction), 0x1u, wave_id_in_tg_lds_addr, ctx->ngg_gs_scratch_addr, 4u); - - /* Wait for all waves to write to LDS. */ - create_workgroup_barrier(bld); - - /* Number of LDS dwords written by all waves (if there is only 1, that is already handled above) */ - unsigned num_lds_dwords = DIV_ROUND_UP(MIN2(ctx->program->workgroup_size, 256), ctx->program->wave_size); - assert(num_lds_dwords >= 2 && num_lds_dwords <= 8); - - /* The first lane of each wave loads every wave's results from LDS, to avoid bank conflicts */ - Temp reduction_per_wave_vector = load_lds(ctx, 4u * num_lds_dwords, bld.tmp(RegClass(RegType::vgpr, num_lds_dwords)), - bld.copy(bld.def(v1), Operand(0u)), ctx->ngg_gs_scratch_addr, 16u); - - begin_divergent_if_else(ctx, &ic); - end_divergent_if(ctx, &ic); - bld.reset(ctx->block); - - /* Create phis which get us the above reduction results, or undef. */ - bld.reset(&ctx->block->instructions, ctx->block->instructions.begin()); - reduction_per_wave_vector = bld.pseudo(aco_opcode::p_phi, bld.def(reduction_per_wave_vector.regClass()), reduction_per_wave_vector, Operand(reduction_per_wave_vector.regClass())); - bld.reset(ctx->block); - - emit_split_vector(ctx, reduction_per_wave_vector, num_lds_dwords); - Temp reduction_per_wave[8]; - - for (unsigned i = 0; i < num_lds_dwords; ++i) { - Temp reduction_current_wave = emit_extract_vector(ctx, reduction_per_wave_vector, i, v1); - reduction_per_wave[i] = bld.readlane(bld.def(s1), reduction_current_wave, Operand(0u)); - } - - Temp wave_count = wave_count_in_threadgroup(ctx); - Temp reduction_result = reduction_per_wave[0]; - Temp excl_base; - - for (unsigned i = 0; i < num_lds_dwords; ++i) { - /* Workgroup reduction: - * Add the reduction results from all waves (up to and including wave_count). - */ - if (i != 0) { - Temp should_add = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), wave_count, Operand(i + 1u)); - Temp addition = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), reduction_per_wave[i], Operand(0u), bld.scc(should_add)); - reduction_result = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), reduction_result, addition); - } - - /* Base of workgroup exclusive scan: - * Add the reduction results from waves up to and excluding wave_id_in_tg. - */ - if (i != (num_lds_dwords - 1)) { - Temp should_add = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), wave_id_in_tg, Operand(i + 1u)); - Temp addition = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), reduction_per_wave[i], Operand(0u), bld.scc(should_add)); - excl_base = !excl_base.id() ? addition : bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), excl_base, addition); - } - } - - assert(excl_base.id()); - - /* WG exclusive scan result: base + subgroup exclusive result. */ - Temp wg_excl = bld.vadd32(bld.def(v1), Operand(excl_base), Operand(sg_excl)); - - return std::make_pair(reduction_result, wg_excl); -} - -void ngg_gs_clear_primflags(isel_context *ctx, Temp vtx_cnt, unsigned stream) -{ - loop_context lc; - if_context ic; - Builder bld(ctx->program, ctx->block); - Temp zero = bld.copy(bld.def(v1b), Operand(uint8_t(0))); - Temp counter_init = bld.copy(bld.def(v1), as_vgpr(ctx, vtx_cnt)); - - begin_loop(ctx, &lc); - - Temp incremented_counter = bld.tmp(counter_init.regClass()); - bld.reset(&ctx->block->instructions, ctx->block->instructions.begin()); - Temp counter = bld.pseudo(aco_opcode::p_phi, bld.def(counter_init.regClass()), Operand(counter_init), incremented_counter); - bld.reset(ctx->block); - Temp break_cond = bld.vopc(aco_opcode::v_cmp_le_u32, bld.def(bld.lm), Operand(ctx->shader->info.gs.vertices_out), counter); - - /* Break when vertices_out <= counter */ - begin_divergent_if_then(ctx, &ic, break_cond); - emit_loop_break(ctx); - begin_divergent_if_else(ctx, &ic); - end_divergent_if(ctx, &ic); - bld.reset(ctx->block); - - /* Store zero to the primitive flag of the current vertex for the current stream */ - Temp emit_vertex_addr = ngg_gs_emit_vertex_lds_addr(ctx, counter); - unsigned primflag_offset = ctx->ngg_gs_primflags_offset + stream; - store_lds(ctx, 1, zero, 0xf, emit_vertex_addr, primflag_offset, 1); - - /* Increment counter */ - bld.vadd32(Definition(incremented_counter), counter, Operand(1u)); - - end_loop(ctx, &lc); -} - -void ngg_gs_write_shader_query(isel_context *ctx, nir_intrinsic_instr *instr) -{ - /* Each subgroup uses a single GDS atomic to collect the total number of primitives. - * TODO: Consider using primitive compaction at the end instead. - */ - - unsigned total_vtx_per_prim = gs_outprim_vertices(ctx->shader->info.gs.output_primitive); - if_context ic_shader_query; - Builder bld(ctx->program, ctx->block); - - Temp shader_query = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), get_arg(ctx, ctx->args->ngg_gs_state), Operand(0u)); - begin_uniform_if_then(ctx, &ic_shader_query, shader_query); - bld.reset(ctx->block); - - Temp sg_prm_cnt; - - /* Calculate the "real" number of emitted primitives from the emitted GS vertices and primitives. - * GS emits points, line strips or triangle strips. - * Real primitives are points, lines or triangles. - */ - if (nir_src_is_const(instr->src[0]) && nir_src_is_const(instr->src[1])) { - unsigned gs_vtx_cnt = nir_src_as_uint(instr->src[0]); - unsigned gs_prm_cnt = nir_src_as_uint(instr->src[1]); - Temp prm_cnt = bld.copy(bld.def(s1), Operand(gs_vtx_cnt - gs_prm_cnt * (total_vtx_per_prim - 1u))); - Temp thread_cnt = bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), Operand(exec, bld.lm)); - sg_prm_cnt = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), prm_cnt, thread_cnt); - } else { - Temp gs_vtx_cnt = get_ssa_temp(ctx, instr->src[0].ssa); - Temp prm_cnt = get_ssa_temp(ctx, instr->src[1].ssa); - if (total_vtx_per_prim > 1) - prm_cnt = bld.vop3(aco_opcode::v_mad_i32_i24, bld.def(v1), prm_cnt, Operand(-1u * (total_vtx_per_prim - 1)), gs_vtx_cnt); - else - prm_cnt = as_vgpr(ctx, prm_cnt); - - /* Reduction calculates the primitive count for the entire subgroup. */ - sg_prm_cnt = emit_reduction_instr(ctx, aco_opcode::p_reduce, ReduceOp::iadd32, - ctx->program->wave_size, bld.def(s1), prm_cnt); - } - - Temp first_lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)); - Temp is_first_lane = bld.sop2(Builder::s_lshl, bld.def(bld.lm), bld.def(s1, scc), - Operand(1u, ctx->program->wave_size == 64), first_lane); - - if_context ic_last_lane; - begin_divergent_if_then(ctx, &ic_last_lane, is_first_lane); - bld.reset(ctx->block); - - Temp gds_addr = bld.copy(bld.def(v1), Operand(0u)); - Operand m = bld.m0((Temp)bld.copy(bld.def(s1, m0), Operand(0x100u))); - bld.ds(aco_opcode::ds_add_u32, gds_addr, as_vgpr(ctx, sg_prm_cnt), m, 0u, 0u, true); - - begin_divergent_if_else(ctx, &ic_last_lane); - end_divergent_if(ctx, &ic_last_lane); - - begin_uniform_if_else(ctx, &ic_shader_query); - end_uniform_if(ctx, &ic_shader_query); -} - -Temp ngg_gs_load_prim_flag_0(isel_context *ctx, Temp tid_in_tg, Temp max_vtxcnt, Temp vertex_lds_addr) -{ - if_context ic; - Builder bld(ctx->program, ctx->block); - - Temp is_vertex_emit_thread = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.def(bld.lm), max_vtxcnt, tid_in_tg); - begin_divergent_if_then(ctx, &ic, is_vertex_emit_thread); - bld.reset(ctx->block); - - Operand m = load_lds_size_m0(bld); - Temp prim_flag_0 = bld.ds(aco_opcode::ds_read_u8, bld.def(v1), vertex_lds_addr, m, ctx->ngg_gs_primflags_offset); - - begin_divergent_if_else(ctx, &ic); - end_divergent_if(ctx, &ic); - - bld.reset(&ctx->block->instructions, ctx->block->instructions.begin()); - prim_flag_0 = bld.pseudo(aco_opcode::p_phi, bld.def(prim_flag_0.regClass()), Operand(prim_flag_0), Operand(0u)); - - return prim_flag_0; -} - -void ngg_gs_setup_vertex_compaction(isel_context *ctx, Temp vertex_live, Temp tid_in_tg, Temp exporter_tid_in_tg) -{ - if_context ic; - Builder bld(ctx->program, ctx->block); - assert(vertex_live.regClass() == bld.lm); - - begin_divergent_if_then(ctx, &ic, vertex_live); - bld.reset(ctx->block); - - /* Setup the vertex compaction. - * Save the current thread's id for the thread which will export the current vertex. - * We reuse stream 1 of the primitive flag of the other thread's vertex for storing this. - */ - Temp export_thread_lds_addr = ngg_gs_vertex_lds_addr(ctx, exporter_tid_in_tg); - tid_in_tg = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tid_in_tg, Operand(0u)); - store_lds(ctx, 1u, tid_in_tg, 1u, export_thread_lds_addr, ctx->ngg_gs_primflags_offset + 1u, 1u); - - begin_divergent_if_else(ctx, &ic); - end_divergent_if(ctx, &ic); - bld.reset(ctx->block); - - /* Wait for all waves to setup the vertex compaction. */ - create_workgroup_barrier(bld); -} - -void ngg_gs_export_primitives(isel_context *ctx, Temp max_prmcnt, Temp tid_in_tg, Temp exporter_tid_in_tg, - Temp prim_flag_0) -{ - if_context ic; - Builder bld(ctx->program, ctx->block); - unsigned total_vtx_per_prim = gs_outprim_vertices(ctx->shader->info.gs.output_primitive); - assert(total_vtx_per_prim <= 3); - - Temp is_prim_export_thread = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.def(bld.lm), max_prmcnt, tid_in_tg); - begin_divergent_if_then(ctx, &ic, is_prim_export_thread); - bld.reset(ctx->block); - - Temp is_null_prim = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(-1u), prim_flag_0); - Temp indices[3]; - - indices[total_vtx_per_prim - 1] = exporter_tid_in_tg; - if (total_vtx_per_prim >= 2) - indices[total_vtx_per_prim - 2] = bld.vsub32(bld.def(v1), exporter_tid_in_tg, Operand(1u)); - if (total_vtx_per_prim == 3) - indices[total_vtx_per_prim - 3] = bld.vsub32(bld.def(v1), exporter_tid_in_tg, Operand(2u)); - - if (total_vtx_per_prim == 3) { - /* API GS outputs triangle strips, but NGG HW needs triangles. - * We already have triangles due to how we set the primitive flags, but we need to - * make sure the vertex order is so that the front/back is correct, and the provoking vertex is kept. - */ - bool flatshade_first = !ctx->args->options->key.vs.provoking_vtx_last; - - /* If the triangle is odd, this will swap its two non-provoking vertices. */ - Temp is_odd = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), Operand(prim_flag_0), Operand(1u), Operand(1u)); - if (flatshade_first) { - indices[1] = bld.vadd32(bld.def(v1), indices[1], Operand(is_odd)); - indices[2] = bld.vsub32(bld.def(v1), indices[2], Operand(is_odd)); - } else { - indices[0] = bld.vadd32(bld.def(v1), indices[0], Operand(is_odd)); - indices[1] = bld.vsub32(bld.def(v1), indices[1], Operand(is_odd)); - } - } - - ngg_emit_prim_export(ctx, total_vtx_per_prim, indices, is_null_prim); - - begin_divergent_if_else(ctx, &ic); - end_divergent_if(ctx, &ic); -} - -void ngg_gs_export_vertices(isel_context *ctx, Temp wg_vtx_cnt, Temp tid_in_tg, Temp vertex_lds_addr) -{ - if_context ic; - Builder bld(ctx->program, ctx->block); - - /* See if the current thread has to export a vertex. */ - Temp is_vtx_export_thread = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.def(bld.lm), wg_vtx_cnt, tid_in_tg); - begin_divergent_if_then(ctx, &ic, is_vtx_export_thread); - bld.reset(ctx->block); - - /* The index of the vertex that the current thread will export. */ - Temp exported_vtx_idx; - - if (ctx->ngg_gs_early_alloc) { - /* No vertex compaction necessary, the thread can export its own vertex. */ - exported_vtx_idx = tid_in_tg; - } else { - /* Vertex compaction: read stream 1 of the primitive flags to see which vertex the current thread needs to export */ - Operand m = load_lds_size_m0(bld); - exported_vtx_idx = bld.ds(aco_opcode::ds_read_u8, bld.def(v1), vertex_lds_addr, m, ctx->ngg_gs_primflags_offset + 1); - } - - /* Get the LDS address of the vertex that the current thread must export. */ - Temp exported_vtx_addr = ngg_gs_vertex_lds_addr(ctx, exported_vtx_idx); - - /* Read the vertex attributes from LDS. */ - unsigned out_idx = 0; - for (unsigned i = 0; i <= VARYING_SLOT_VAR31; i++) { - if (ctx->program->info->gs.output_streams[i] != 0) - continue; - - /* Set the output mask to the GS output usage mask. */ - unsigned rdmask = - ctx->outputs.mask[i] = - ctx->program->info->gs.output_usage_mask[i]; - - if (!rdmask) - continue; - - for (unsigned j = 0; j < 4; j++) { - if (rdmask & (1 << j)) - ctx->outputs.temps[i * 4u + j] = - load_lds(ctx, 4u, bld.tmp(v1), exported_vtx_addr, out_idx * 4u, 4u); - - out_idx++; - } - } - - /* Export the vertex parameters. */ - create_vs_exports(ctx); - - begin_divergent_if_else(ctx, &ic); - end_divergent_if(ctx, &ic); -} - -void ngg_gs_prelude(isel_context *ctx) -{ - if (!ctx->ngg_gs_early_alloc) - return; - - /* We know the GS writes the maximum possible number of vertices, so - * it's likely that most threads need to export a primitive, too. - * Thus, we won't have to worry about primitive compaction here. - */ - Temp num_max_vertices = ngg_max_vertex_count(ctx); - ngg_emit_wave0_sendmsg_gs_alloc_req(ctx, num_max_vertices, num_max_vertices); -} - -void ngg_gs_finale(isel_context *ctx) -{ - /* Sanity check. Make sure the vertex/primitive counts are set and the LDS is correctly initialized. */ - assert(ctx->ngg_gs_known_vtxcnt[0]); - - if_context ic; - Builder bld(ctx->program, ctx->block); - - /* Wait for all waves to reach the epilogue. */ - create_workgroup_barrier(bld); - - /* Thread ID in the entire threadgroup */ - Temp tid_in_tg = thread_id_in_threadgroup(ctx); - /* Number of threads that may need to export a vertex or primitive. */ - Temp max_vtxcnt = ngg_max_vertex_count(ctx); - /* LDS address of the vertex corresponding to the current thread. */ - Temp vertex_lds_addr = ngg_gs_vertex_lds_addr(ctx, tid_in_tg); - /* Primitive flag from stream 0 of the vertex corresponding to the current thread. */ - Temp prim_flag_0 = ngg_gs_load_prim_flag_0(ctx, tid_in_tg, max_vtxcnt, vertex_lds_addr); - - bld.reset(ctx->block); - - /* NIR already filters out incomplete primitives and vertices, - * so any vertex whose primitive flag is non-zero is considered live/valid. - */ - Temp vertex_live = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), Operand(prim_flag_0)); - - /* Total number of vertices emitted by the workgroup. */ - Temp wg_vtx_cnt; - /* ID of the thread which will export the current thread's vertex. */ - Temp exporter_tid_in_tg; - - if (ctx->ngg_gs_early_alloc) { - /* There is no need for a scan or vertex compaction, we know that - * the GS writes all possible vertices so each thread can export its own vertex. - */ - wg_vtx_cnt = max_vtxcnt; - exporter_tid_in_tg = tid_in_tg; - } else { - /* Perform a workgroup reduction and exclusive scan. */ - std::pair wg_scan = ngg_gs_workgroup_reduce_and_scan(ctx, vertex_live); - bld.reset(ctx->block); - /* Total number of vertices emitted by the workgroup. */ - wg_vtx_cnt = wg_scan.first; - /* ID of the thread which will export the current thread's vertex. */ - exporter_tid_in_tg = wg_scan.second; - /* Skip all exports when possible. */ - Temp have_exports = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), wg_vtx_cnt, Operand(0u)); - max_vtxcnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), max_vtxcnt, Operand(0u), bld.scc(have_exports)); - - ngg_emit_wave0_sendmsg_gs_alloc_req(ctx, wg_vtx_cnt, max_vtxcnt); - ngg_gs_setup_vertex_compaction(ctx, vertex_live, tid_in_tg, exporter_tid_in_tg); - } - - ngg_gs_export_primitives(ctx, max_vtxcnt, tid_in_tg, exporter_tid_in_tg, prim_flag_0); - ngg_gs_export_vertices(ctx, wg_vtx_cnt, tid_in_tg, vertex_lds_addr); -} - } /* end namespace */ void select_program(Program *program, @@ -11753,7 +11100,7 @@ void select_program(Program *program, { isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args, false); if_context ic_merged_wave_info; - bool ngg_gs = ctx.stage.hw == HWStage::NGG && ctx.stage.has(SWStage::GS); + bool ngg_gs = ctx.stage.hw == HWStage::NGG && ctx.stage.has(SWStage::GS); for (unsigned i = 0; i < shader_count; i++) { nir_shader *nir = shaders[i]; @@ -11776,9 +11123,6 @@ void select_program(Program *program, } } - if (!i && ngg_gs) - ngg_gs_prelude(&ctx); - /* In a merged VS+TCS HS, the VS implementation can be completely empty. */ nir_function_impl *func = nir_shader_get_entrypoint(nir); bool empty_shader = nir_cf_list_is_empty_block(&func->body) && @@ -11787,14 +11131,8 @@ void select_program(Program *program, (nir->info.stage == MESA_SHADER_TESS_EVAL && ctx.stage == tess_eval_geometry_gs)); - bool check_merged_wave_info = ctx.tcs_in_out_eq ? i == 0 : (shader_count >= 2 && !empty_shader); - bool endif_merged_wave_info = ctx.tcs_in_out_eq ? i == 1 : check_merged_wave_info; - - if (i && ngg_gs) { - /* NGG GS waves need to wait for each other after the GS half is done. */ - Builder bld(ctx.program, ctx.block); - create_workgroup_barrier(bld); - } + bool check_merged_wave_info = ctx.tcs_in_out_eq ? i == 0 : (shader_count >= 2 && !empty_shader && !(ngg_gs && i == 1)); + bool endif_merged_wave_info = ctx.tcs_in_out_eq ? i == 1 : (check_merged_wave_info && !(ngg_gs && i == 1)); if (check_merged_wave_info) { Temp cond = merged_wave_info_to_mask(&ctx, i); @@ -11843,9 +11181,6 @@ void select_program(Program *program, end_divergent_if(&ctx, &ic_merged_wave_info); } - if (ngg_gs && nir->info.stage == MESA_SHADER_GEOMETRY) - ngg_gs_finale(&ctx); - if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) { /* Outputs of the previous stage are inputs to the next stage */ ctx.inputs = ctx.outputs; diff --git a/src/amd/compiler/aco_instruction_selection.h b/src/amd/compiler/aco_instruction_selection.h index 0a5462c..379da77 100644 --- a/src/amd/compiler/aco_instruction_selection.h +++ b/src/amd/compiler/aco_instruction_selection.h @@ -93,15 +93,7 @@ struct isel_context { Temp persp_centroid, linear_centroid; /* GS inputs */ - bool ngg_gs_early_alloc = false; - bool ngg_gs_known_vtxcnt[4] = {false, false, false, false}; Temp gs_wave_id; - unsigned ngg_gs_emit_addr = 0; - unsigned ngg_gs_emit_vtx_bytes = 0; - unsigned ngg_gs_scratch_addr = 0; - unsigned ngg_gs_primflags_offset = 0; - int ngg_gs_const_vtxcnt[4]; - int ngg_gs_const_prmcnt[4]; /* VS output information */ bool export_clip_dists; diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp index 2e44568..e92e6b3 100644 --- a/src/amd/compiler/aco_instruction_selection_setup.cpp +++ b/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -407,28 +407,7 @@ void setup_gs_variables(isel_context *ctx, nir_shader *nir) setup_vs_output_info(ctx, nir, false, ctx->options->key.vs_common_out.export_clip_dists, outinfo); - unsigned ngg_gs_scratch_bytes = ctx->args->shader_info->so.num_outputs ? (44u * 4u) : (8u * 4u); - unsigned ngg_emit_bytes = ctx->args->shader_info->ngg_info.ngg_emit_size * 4u; - unsigned esgs_ring_bytes = ctx->args->shader_info->ngg_info.esgs_ring_size; - - ctx->ngg_gs_primflags_offset = ctx->args->shader_info->gs.gsvs_vertex_size; - ctx->ngg_gs_emit_vtx_bytes = ctx->ngg_gs_primflags_offset + 4u; - ctx->ngg_gs_emit_addr = esgs_ring_bytes; - ctx->ngg_gs_scratch_addr = ctx->ngg_gs_emit_addr + ngg_emit_bytes; - ctx->ngg_gs_scratch_addr = ALIGN(ctx->ngg_gs_scratch_addr, 16u); - - unsigned total_lds_bytes = ctx->ngg_gs_scratch_addr + ngg_gs_scratch_bytes; - assert(total_lds_bytes >= ctx->ngg_gs_emit_addr); - assert(total_lds_bytes >= ctx->ngg_gs_scratch_addr); - ctx->program->config->lds_size = DIV_ROUND_UP(total_lds_bytes, ctx->program->dev.lds_encoding_granule); - - /* Make sure we have enough room for emitted GS vertices */ - if (nir->info.gs.vertices_out) - assert((ngg_emit_bytes % (ctx->ngg_gs_emit_vtx_bytes * nir->info.gs.vertices_out)) == 0); - - /* See if the number of vertices and primitives are compile-time known */ - nir_gs_count_vertices_and_primitives(nir, ctx->ngg_gs_const_vtxcnt, ctx->ngg_gs_const_prmcnt, 4u); - ctx->ngg_gs_early_alloc = ctx->ngg_gs_const_vtxcnt[0] == nir->info.gs.vertices_out && ctx->ngg_gs_const_prmcnt[0] != -1; + ctx->program->config->lds_size = DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule); } if (ctx->stage.has(SWStage::VS)) diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index fd5780a..385cc6d 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -862,11 +862,16 @@ bool radv_lower_ngg(struct radv_device *device, struct nir_shader *nir, bool has info->is_ngg_passthrough = out_conf.passthrough; key->vs_common_out.as_ngg_passthrough = out_conf.passthrough; } else if (nir->info.stage == MESA_SHADER_GEOMETRY) { - if (!key->vs_common_out.as_ngg) + if (!info->is_ngg) return false; - /* TODO: lower NGG GS in NIR */ - return false; + ac_nir_lower_ngg_gs( + nir, info->wave_size, max_workgroup_size, + info->ngg_info.esgs_ring_size, + info->gs.gsvs_vertex_size, + info->ngg_info.ngg_emit_size * 4u, + key->vs.provoking_vtx_last); + return true; } else { return false; } -- 2.7.4