From: Qiang Yu Date: Mon, 6 Jun 2022 09:06:41 +0000 (+0800) Subject: ac/nir/ngg: support line culling X-Git-Tag: upstream/22.3.5~4232 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=db0e9d3caba348d4edbbe6f883fe1ceb8ddad2c8;p=platform%2Fupstream%2Fmesa.git ac/nir/ngg: support line culling Port from ac_llvm_cull.c Acked-by: Marek Olšák Reviewed-by: Timur Kristóf Signed-off-by: Qiang Yu Part-of: --- diff --git a/src/amd/common/ac_nir.h b/src/amd/common/ac_nir.h index f6a9c9b..c2fb8c5 100644 --- a/src/amd/common/ac_nir.h +++ b/src/amd/common/ac_nir.h @@ -162,11 +162,12 @@ ac_nir_lower_mesh_inputs_to_mem(nir_shader *shader, unsigned task_num_entries); nir_ssa_def * -ac_nir_cull_triangle(nir_builder *b, - nir_ssa_def *initially_accepted, - nir_ssa_def *pos[3][4], - ac_nir_cull_accepted accept_func, - void *state); +ac_nir_cull_primitive(nir_builder *b, + nir_ssa_def *initially_accepted, + nir_ssa_def *pos[3][4], + unsigned num_vertices, + ac_nir_cull_accepted accept_func, + void *state); bool ac_nir_lower_global_access(nir_shader *shader); diff --git a/src/amd/common/ac_nir_cull.c b/src/amd/common/ac_nir_cull.c index bdb3be7..3c2ed76 100644 --- a/src/amd/common/ac_nir_cull.c +++ b/src/amd/common/ac_nir_cull.c @@ -36,13 +36,14 @@ typedef struct } position_w_info; static void -analyze_position_w(nir_builder *b, nir_ssa_def *pos[3][4], position_w_info *w_info) +analyze_position_w(nir_builder *b, nir_ssa_def *pos[][4], unsigned num_vertices, + position_w_info *w_info) { w_info->all_w_negative = nir_imm_bool(b, true); w_info->w_reflection = nir_imm_bool(b, false); w_info->any_w_negative = nir_imm_bool(b, false); - for (unsigned i = 0; i < 3; ++i) { + for (unsigned i = 0; i < num_vertices; ++i) { nir_ssa_def *neg_w = nir_flt(b, pos[i][3], nir_imm_float(b, 0.0f)); w_info->w_reflection = nir_ixor(b, neg_w, w_info->w_reflection); w_info->any_w_negative = nir_ior(b, neg_w, w_info->any_w_negative); @@ -51,7 +52,7 @@ analyze_position_w(nir_builder *b, nir_ssa_def 
*pos[3][4], position_w_info *w_in } static nir_ssa_def * -cull_face(nir_builder *b, nir_ssa_def *pos[3][4], const position_w_info *w_info) +cull_face_triangle(nir_builder *b, nir_ssa_def *pos[3][4], const position_w_info *w_info) { nir_ssa_def *det_t0 = nir_fsub(b, pos[2][0], pos[0][0]); nir_ssa_def *det_t1 = nir_fsub(b, pos[1][1], pos[0][1]); @@ -79,7 +80,7 @@ cull_face(nir_builder *b, nir_ssa_def *pos[3][4], const position_w_info *w_info) } static void -calc_bbox(nir_builder *b, nir_ssa_def *pos[3][4], nir_ssa_def *bbox_min[3], nir_ssa_def *bbox_max[3]) +calc_bbox_triangle(nir_builder *b, nir_ssa_def *pos[3][4], nir_ssa_def *bbox_min[2], nir_ssa_def *bbox_max[2]) { for (unsigned chan = 0; chan < 2; ++chan) { bbox_min[chan] = nir_fmin(b, pos[0][chan], nir_fmin(b, pos[1][chan], pos[2][chan])); @@ -88,7 +89,7 @@ calc_bbox(nir_builder *b, nir_ssa_def *pos[3][4], nir_ssa_def *bbox_min[3], nir_ } static nir_ssa_def * -cull_frustrum(nir_builder *b, nir_ssa_def *bbox_min[3], nir_ssa_def *bbox_max[3]) +cull_frustrum(nir_builder *b, nir_ssa_def *bbox_min[2], nir_ssa_def *bbox_max[2]) { nir_ssa_def *prim_outside_view = nir_imm_false(b); @@ -101,8 +102,8 @@ cull_frustrum(nir_builder *b, nir_ssa_def *bbox_min[3], nir_ssa_def *bbox_max[3] } static nir_ssa_def * -cull_small_primitive(nir_builder *b, nir_ssa_def *bbox_min[3], nir_ssa_def *bbox_max[3], - nir_ssa_def *prim_is_small_else) +cull_small_primitive_triangle(nir_builder *b, nir_ssa_def *bbox_min[2], nir_ssa_def *bbox_max[2], + nir_ssa_def *prim_is_small_else) { nir_ssa_def *prim_is_small = NULL; @@ -137,31 +138,193 @@ cull_small_primitive(nir_builder *b, nir_ssa_def *bbox_min[3], nir_ssa_def *bbox return nir_if_phi(b, prim_is_small, prim_is_small_else); } -nir_ssa_def * +static nir_ssa_def * ac_nir_cull_triangle(nir_builder *b, nir_ssa_def *initially_accepted, nir_ssa_def *pos[3][4], + position_w_info *w_info, ac_nir_cull_accepted accept_func, void *state) { - position_w_info w_info = {0}; - analyze_position_w(b, pos, 
&w_info); + nir_ssa_def *accepted = initially_accepted; + accepted = nir_iand(b, accepted, nir_inot(b, w_info->all_w_negative)); + accepted = nir_iand(b, accepted, nir_inot(b, cull_face_triangle(b, pos, w_info))); + + nir_ssa_def *bbox_accepted = NULL; + + nir_if *if_accepted = nir_push_if(b, accepted); + { + nir_ssa_def *bbox_min[2] = {0}, *bbox_max[2] = {0}; + calc_bbox_triangle(b, pos, bbox_min, bbox_max); + + nir_ssa_def *prim_outside_view = cull_frustrum(b, bbox_min, bbox_max); + nir_ssa_def *prim_invisible = + cull_small_primitive_triangle(b, bbox_min, bbox_max, prim_outside_view); + bbox_accepted = nir_ior(b, nir_inot(b, prim_invisible), w_info->any_w_negative); + + /* for caller which need to react when primitive is accepted */ + if (accept_func) { + nir_if *if_still_accepted = nir_push_if(b, bbox_accepted); + { + accept_func(b, state); + } + nir_pop_if(b, if_still_accepted); + } + } + nir_pop_if(b, if_accepted); + + return nir_if_phi(b, bbox_accepted, accepted); +} + +static void +rotate_45degrees(nir_builder *b, nir_ssa_def *v[2]) +{ + /* sin(45) == cos(45) */ + nir_ssa_def *sincos45 = nir_imm_float(b, 0.707106781); + + /* x2 = x*cos45 - y*sin45 = x*sincos45 - y*sincos45 + * y2 = x*sin45 + y*cos45 = x*sincos45 + y*sincos45 + */ + nir_ssa_def *first = nir_fmul(b, v[0], sincos45); + + /* Doing 2x ffma while duplicating the multiplication is 33% faster than fmul+fadd+fadd. 
*/ + nir_ssa_def *result[2] = { + nir_ffma(b, nir_fneg(b, v[1]), sincos45, first), + nir_ffma(b, v[1], sincos45, first), + }; + + memcpy(v, result, sizeof(result)); +} + +static void +calc_bbox_line(nir_builder *b, nir_ssa_def *pos[3][4], nir_ssa_def *bbox_min[2], nir_ssa_def *bbox_max[2]) +{ + nir_ssa_def *clip_half_line_width = nir_load_clip_half_line_width_amd(b); + + for (unsigned chan = 0; chan < 2; ++chan) { + bbox_min[chan] = nir_fmin(b, pos[0][chan], pos[1][chan]); + bbox_max[chan] = nir_fmax(b, pos[0][chan], pos[1][chan]); + + nir_ssa_def *width = nir_channel(b, clip_half_line_width, chan); + bbox_min[chan] = nir_fsub(b, bbox_min[chan], width); /* min edge moves outward (down) by the half line width */ + bbox_max[chan] = nir_fadd(b, bbox_max[chan], width); /* max edge moves outward (up); fsub here would shrink the box */ + } +} + +static nir_ssa_def * +cull_small_primitive_line(nir_builder *b, nir_ssa_def *pos[3][4], + nir_ssa_def *bbox_min[2], nir_ssa_def *bbox_max[2], + nir_ssa_def *prim_is_small_else) +{ + nir_ssa_def *prim_is_small = NULL; + + /* Small primitive filter - eliminate lines that are too small to affect a sample. */ + nir_if *if_cull_small_prims = nir_push_if(b, nir_load_cull_small_primitives_enabled_amd(b)); + { + /* This only works with lines without perpendicular end caps (lines with perpendicular + * end caps are rasterized as quads and thus can't be culled as small prims in 99% of + * cases because line_width >= 1). + * + * This takes advantage of the diamond exit rule, which says that every pixel + * has a diamond inside it touching the pixel boundary and only if a line exits + * the diamond, that pixel is filled. If a line enters the diamond or stays + * outside the diamond, the pixel isn't filled. + * + * This algorithm is a little simpler than that. The space outside all diamonds also + * has the same diamond shape, which we'll call corner diamonds. + * + * The idea is to cull all lines that are entirely inside a diamond, including + * corner diamonds. If a line is entirely inside a diamond, it can be culled because + * it doesn't exit it. 
If a line is entirely inside a corner diamond, it can be culled + * because it doesn't enter any diamond and thus can't exit any diamond. + * + * The viewport is rotated by 45 degrees to turn diamonds into squares, and a bounding + * box test is used to determine whether a line is entirely inside any square (diamond). + * + * The line width doesn't matter. Wide lines only duplicate filled pixels in either X or + * Y direction from the filled pixels. MSAA also doesn't matter. MSAA should ideally use + * perpendicular end caps that enable quad rasterization for lines. Thus, this should + * always use non-MSAA viewport transformation and non-MSAA small prim precision. + * + * A good test is piglit/lineloop because it draws 10k subpixel lines in a circle. + * It should contain no holes if this matches hw behavior. + */ + nir_ssa_def *v0[2], *v1[2]; + nir_ssa_def *vp = nir_load_viewport_xy_scale_and_offset(b); + + /* Get vertex positions in pixels. */ + for (unsigned chan = 0; chan < 2; chan++) { + nir_ssa_def *vp_scale = nir_channel(b, vp, chan); + nir_ssa_def *vp_translate = nir_channel(b, vp, 2 + chan); + + v0[chan] = nir_ffma(b, pos[0][chan], vp_scale, vp_translate); + v1[chan] = nir_ffma(b, pos[1][chan], vp_scale, vp_translate); + } + + /* Rotate the viewport by 45 degrees, so that diamonds become squares. */ + rotate_45degrees(b, v0); + rotate_45degrees(b, v1); + + nir_ssa_def *small_prim_precision = nir_load_cull_small_prim_precision_amd(b); + prim_is_small = prim_is_small_else; + + for (unsigned chan = 0; chan < 2; chan++) { + /* The width of each square is sqrt(0.5), so scale it to 1 because we want + * round() to give us the position of the closest center of a square (diamond). + */ + v0[chan] = nir_fmul_imm(b, v0[chan], 1.414213562); + v1[chan] = nir_fmul_imm(b, v1[chan], 1.414213562); + + /* Compute the bounding box around both vertices. We do this because we must + * enlarge the line area by the precision of the rasterizer. 
+ */ + nir_ssa_def *min = nir_fmin(b, v0[chan], v1[chan]); + nir_ssa_def *max = nir_fmax(b, v0[chan], v1[chan]); + + /* Enlarge the bounding box by the precision of the rasterizer. */ + min = nir_fsub(b, min, small_prim_precision); + max = nir_fadd(b, max, small_prim_precision); + + /* Round the bounding box corners. If both rounded corners are equal, + * the bounding box is entirely inside a square (diamond). + */ + min = nir_fround_even(b, min); + max = nir_fround_even(b, max); + + nir_ssa_def *rounded_to_eq = nir_feq(b, min, max); + prim_is_small = nir_ior(b, prim_is_small, rounded_to_eq); + } + } + nir_pop_if(b, if_cull_small_prims); + + return nir_if_phi(b, prim_is_small, prim_is_small_else); +} + +static nir_ssa_def * +ac_nir_cull_line(nir_builder *b, + nir_ssa_def *initially_accepted, + nir_ssa_def *pos[3][4], + position_w_info *w_info, + ac_nir_cull_accepted accept_func, + void *state) +{ nir_ssa_def *accepted = initially_accepted; - accepted = nir_iand(b, accepted, nir_inot(b, w_info.all_w_negative)); - accepted = nir_iand(b, accepted, nir_inot(b, cull_face(b, pos, &w_info))); + accepted = nir_iand(b, accepted, nir_inot(b, w_info->any_w_negative)); nir_ssa_def *bbox_accepted = NULL; nir_if *if_accepted = nir_push_if(b, accepted); { - nir_ssa_def *bbox_min[3] = {0}, *bbox_max[3] = {0}; - calc_bbox(b, pos, bbox_min, bbox_max); + nir_ssa_def *bbox_min[2] = {0}, *bbox_max[2] = {0}; + calc_bbox_line(b, pos, bbox_min, bbox_max); + /* Frustrum culling - eliminate lines that are fully outside the view. 
*/ nir_ssa_def *prim_outside_view = cull_frustrum(b, bbox_min, bbox_max); - nir_ssa_def *prim_invisible = cull_small_primitive(b, bbox_min, bbox_max, prim_outside_view); + nir_ssa_def *prim_invisible = + cull_small_primitive_line(b, pos, bbox_min, bbox_max, prim_outside_view); - bbox_accepted = nir_ior(b, nir_inot(b, prim_invisible), w_info.any_w_negative); + bbox_accepted = nir_inot(b, prim_invisible); /* for caller which need to react when primitive is accepted */ if (accept_func) { @@ -176,3 +339,24 @@ ac_nir_cull_triangle(nir_builder *b, return nir_if_phi(b, bbox_accepted, accepted); } + +nir_ssa_def * +ac_nir_cull_primitive(nir_builder *b, + nir_ssa_def *initially_accepted, + nir_ssa_def *pos[3][4], + unsigned num_vertices, + ac_nir_cull_accepted accept_func, + void *state) +{ + position_w_info w_info = {0}; + analyze_position_w(b, pos, num_vertices, &w_info); + + if (num_vertices == 3) + return ac_nir_cull_triangle(b, initially_accepted, pos, &w_info, accept_func, state); + else if (num_vertices == 2) + return ac_nir_cull_line(b, initially_accepted, pos, &w_info, accept_func, state); + else + unreachable("point culling not implemented"); + + return NULL; +} diff --git a/src/amd/common/ac_nir_lower_ngg.c b/src/amd/common/ac_nir_lower_ngg.c index 193a182..050f39b 100644 --- a/src/amd/common/ac_nir_lower_ngg.c +++ b/src/amd/common/ac_nir_lower_ngg.c @@ -844,14 +844,16 @@ compact_vertices_after_culling(nir_builder *b, nir_ssa_def *exporter_vtx_indices[3] = {0}; /* Load the index of the ES threads that will export the current GS thread's vertices */ - for (unsigned v = 0; v < 3; ++v) { + for (unsigned v = 0; v < nogs_state->num_vertices_per_primitives; ++v) { nir_ssa_def *vtx_addr = nir_load_var(b, gs_vtxaddr_vars[v]); nir_ssa_def *exporter_vtx_idx = nir_load_shared(b, 1, 8, vtx_addr, .base = lds_es_exporter_tid); exporter_vtx_indices[v] = nir_u2u32(b, exporter_vtx_idx); nir_store_var(b, nogs_state->gs_vtx_indices_vars[v], exporter_vtx_indices[v], 0x1); } - 
nir_ssa_def *prim_exp_arg = emit_pack_ngg_prim_exp_arg(b, 3, exporter_vtx_indices, NULL, nogs_state->use_edgeflags); + nir_ssa_def *prim_exp_arg = + emit_pack_ngg_prim_exp_arg(b, nogs_state->num_vertices_per_primitives, + exporter_vtx_indices, NULL, nogs_state->use_edgeflags); nir_store_var(b, prim_exp_arg_var, prim_exp_arg, 0x1u); } nir_pop_if(b, if_gs_accepted); @@ -1123,7 +1125,7 @@ cull_primitive_accepted(nir_builder *b, void *state) nir_store_var(b, s->gs_accepted_var, nir_imm_true(b), 0x1u); /* Store the accepted state to LDS for ES threads */ - for (unsigned vtx = 0; vtx < 3; ++vtx) + for (unsigned vtx = 0; vtx < s->num_vertices_per_primitives; ++vtx) nir_store_shared(b, nir_imm_intN_t(b, 1, 8), s->vtx_addr[vtx], .base = lds_es_vertex_accepted); } @@ -1253,27 +1255,29 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c { /* Load vertex indices from input VGPRs */ nir_ssa_def *vtx_idx[3] = {0}; - for (unsigned vertex = 0; vertex < 3; ++vertex) + for (unsigned vertex = 0; vertex < nogs_state->num_vertices_per_primitives; ++vertex) vtx_idx[vertex] = nir_load_var(b, nogs_state->gs_vtx_indices_vars[vertex]); nir_ssa_def *pos[3][4] = {0}; /* Load W positions of vertices first because the culling code will use these first */ - for (unsigned vtx = 0; vtx < 3; ++vtx) { + for (unsigned vtx = 0; vtx < nogs_state->num_vertices_per_primitives; ++vtx) { nogs_state->vtx_addr[vtx] = pervertex_lds_addr(b, vtx_idx[vtx], pervertex_lds_bytes); pos[vtx][3] = nir_load_shared(b, 1, 32, nogs_state->vtx_addr[vtx], .base = lds_es_pos_w); nir_store_var(b, gs_vtxaddr_vars[vtx], nogs_state->vtx_addr[vtx], 0x1u); } /* Load the X/W, Y/W positions of vertices */ - for (unsigned vtx = 0; vtx < 3; ++vtx) { + for (unsigned vtx = 0; vtx < nogs_state->num_vertices_per_primitives; ++vtx) { nir_ssa_def *xy = nir_load_shared(b, 2, 32, nogs_state->vtx_addr[vtx], .base = lds_es_pos_x); pos[vtx][0] = nir_channel(b, xy, 0); pos[vtx][1] = nir_channel(b, xy, 1); } /* See 
if the current primitive is accepted */ - ac_nir_cull_triangle(b, nir_imm_bool(b, true), pos, cull_primitive_accepted, nogs_state); + ac_nir_cull_primitive(b, nir_imm_bool(b, true), pos, + nogs_state->num_vertices_per_primitives, + cull_primitive_accepted, nogs_state); } nir_pop_if(b, if_gs_thread);