From f00d3e29094942cf8a35c76646b2cfd82f4b3f8a Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sun, 26 Sep 2021 14:18:45 -0400 Subject: [PATCH] radeonsi: implement shader-based culling for lines This helps some viewperf subtests. Only view XY culling is done. Edgeflags are always disabled with lines. Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/amd/llvm/ac_llvm_cull.c | 15 +++++++++------ src/amd/llvm/ac_llvm_cull.h | 2 ++ src/gallium/drivers/radeonsi/gfx10_shader_ngg.c | 23 ++++++++++++++++++----- src/gallium/drivers/radeonsi/si_pipe.h | 17 ++++++++++++----- src/gallium/drivers/radeonsi/si_shader.h | 6 ++++-- src/gallium/drivers/radeonsi/si_state.h | 2 +- src/gallium/drivers/radeonsi/si_state_draw.cpp | 9 +++++++-- src/gallium/drivers/radeonsi/si_state_shaders.c | 10 ++++++---- 8 files changed, 59 insertions(+), 25 deletions(-) diff --git a/src/amd/llvm/ac_llvm_cull.c b/src/amd/llvm/ac_llvm_cull.c index c9f4a9c..ad406929 100644 --- a/src/amd/llvm/ac_llvm_cull.c +++ b/src/amd/llvm/ac_llvm_cull.c @@ -52,7 +52,7 @@ struct ac_position_w_info { }; static void ac_analyze_position_w(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4], - struct ac_position_w_info *w) + struct ac_position_w_info *w, unsigned num_vertices) { LLVMBuilderRef builder = ctx->builder; LLVMValueRef all_w_negative = ctx->i1true; @@ -60,7 +60,7 @@ static void ac_analyze_position_w(struct ac_llvm_context *ctx, LLVMValueRef pos[ w->w_reflection = ctx->i1false; w->any_w_negative = ctx->i1false; - for (unsigned i = 0; i < 3; i++) { + for (unsigned i = 0; i < num_vertices; i++) { LLVMValueRef neg_w; neg_w = LLVMBuildFCmp(builder, LLVMRealOLT, pos[i][3], ctx->f32_0, ""); @@ -137,11 +137,14 @@ static void cull_bbox(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4], /* Compute the primitive bounding box for easy culling. */ for (unsigned chan = 0; chan < (options->cull_view_near_z || options->cull_view_far_z ? 
3 : 2); chan++) { + assert(options->num_vertices >= 2); bbox_min[chan] = ac_build_fmin(ctx, pos[0][chan], pos[1][chan]); - bbox_min[chan] = ac_build_fmin(ctx, bbox_min[chan], pos[2][chan]); - bbox_max[chan] = ac_build_fmax(ctx, pos[0][chan], pos[1][chan]); - bbox_max[chan] = ac_build_fmax(ctx, bbox_max[chan], pos[2][chan]); + + if (options->num_vertices == 3) { + bbox_min[chan] = ac_build_fmin(ctx, bbox_min[chan], pos[2][chan]); + bbox_max[chan] = ac_build_fmax(ctx, bbox_max[chan], pos[2][chan]); + } } /* View culling. */ @@ -231,7 +234,7 @@ void ac_cull_primitive(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4], void *userdata) { struct ac_position_w_info w; - ac_analyze_position_w(ctx, pos, &w); + ac_analyze_position_w(ctx, pos, &w, options->num_vertices); /* W culling. */ LLVMValueRef accepted = options->cull_w ? w.w_accepted : ctx->i1true; diff --git a/src/amd/llvm/ac_llvm_cull.h b/src/amd/llvm/ac_llvm_cull.h index 6aa52b2..db1dcdd 100644 --- a/src/amd/llvm/ac_llvm_cull.h +++ b/src/amd/llvm/ac_llvm_cull.h @@ -46,6 +46,8 @@ struct ac_cull_options { bool cull_w; /* cull primitives with all W < 0 */ bool use_halfz_clip_space; + + uint8_t num_vertices; /* 1..3 */ }; /* Callback invoked in the inner-most branch where the primitive is accepted. */ diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c index 15cbe31..bcba01f 100644 --- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c +++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c @@ -83,6 +83,9 @@ static LLVMValueRef ngg_get_vertices_per_prim(struct si_shader_context *ctx, uns /* Blits always use axis-aligned rectangles with 3 vertices. */ *num_vertices = 3; return LLVMConstInt(ctx->ac.i32, 3, 0); + } else if (ctx->shader->key.opt.ngg_culling & SI_NGG_CULL_LINES) { + *num_vertices = 2; + return LLVMConstInt(ctx->ac.i32, 2, 0); } else { /* We always build up all three indices for the prim export * independent of the primitive type. 
The additional garbage @@ -994,13 +997,23 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi) /* Execute culling code. */ struct ac_cull_options options = {}; - options.cull_front = shader->key.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE; - options.cull_back = shader->key.opt.ngg_culling & SI_NGG_CULL_BACK_FACE; options.cull_view_xy = true; - options.cull_small_prims = true; /* this would only be false with conservative rasterization */ - options.cull_zero_area = options.cull_front || options.cull_back; options.cull_w = true; + if (shader->key.opt.ngg_culling & SI_NGG_CULL_LINES) { + options.num_vertices = 2; + + assert(!(shader->key.opt.ngg_culling & SI_NGG_CULL_BACK_FACE)); + assert(!(shader->key.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE)); + assert(!(shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL)); + } else { + options.num_vertices = 3; + options.cull_front = shader->key.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE; + options.cull_back = shader->key.opt.ngg_culling & SI_NGG_CULL_BACK_FACE; + options.cull_small_prims = true; /* this would only be false with conservative rasterization */ + options.cull_zero_area = options.cull_front || options.cull_back; + } + /* Tell ES threads whether their vertex survived. */ LLVMValueRef params[] = { gs_accepted, @@ -1995,7 +2008,7 @@ bool gfx10_ngg_calculate_subgroup_info(struct si_shader *shader) shader->previous_stage_sel ? 
shader->previous_stage_sel : gs_sel; const gl_shader_stage gs_stage = gs_sel->info.stage; const unsigned gs_num_invocations = MAX2(gs_sel->info.base.gs.invocations, 1); - const unsigned input_prim = si_get_input_prim(gs_sel); + const unsigned input_prim = si_get_input_prim(gs_sel, &shader->key); const bool use_adjacency = input_prim >= PIPE_PRIM_LINES_ADJACENCY && input_prim <= PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY; const unsigned max_verts_per_prim = u_vertices_per_prim(input_prim); diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index d0bc9c4..36aaa5f 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1819,6 +1819,12 @@ static inline unsigned si_get_total_colormask(struct si_context *sctx) ((1 << PIPE_PRIM_LINES) | (1 << PIPE_PRIM_LINE_LOOP) | (1 << PIPE_PRIM_LINE_STRIP) | \ (1 << PIPE_PRIM_LINES_ADJACENCY) | (1 << PIPE_PRIM_LINE_STRIP_ADJACENCY)) +#define UTIL_ALL_PRIM_TRIANGLE_MODES \ + ((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP) | \ + (1 << PIPE_PRIM_TRIANGLE_FAN) | (1 << PIPE_PRIM_QUADS) | (1 << PIPE_PRIM_QUAD_STRIP) | \ + (1 << PIPE_PRIM_POLYGON) | (1 << PIPE_PRIM_TRIANGLES_ADJACENCY) | \ + (1 << PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY)) + static inline bool util_prim_is_lines(unsigned prim) { return ((1 << prim) & UTIL_ALL_PRIM_LINE_MODES) != 0; @@ -1831,11 +1837,12 @@ static inline bool util_prim_is_points_or_lines(unsigned prim) static inline bool util_rast_prim_is_triangles(unsigned prim) { - return ((1 << prim) & - ((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP) | - (1 << PIPE_PRIM_TRIANGLE_FAN) | (1 << PIPE_PRIM_QUADS) | (1 << PIPE_PRIM_QUAD_STRIP) | - (1 << PIPE_PRIM_POLYGON) | (1 << PIPE_PRIM_TRIANGLES_ADJACENCY) | - (1 << PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY))); + return ((1 << prim) & UTIL_ALL_PRIM_TRIANGLE_MODES) != 0; +} + +static inline bool util_rast_prim_is_lines_or_triangles(unsigned prim) +{ + return ((1 << prim) & 
(UTIL_ALL_PRIM_LINE_MODES | UTIL_ALL_PRIM_TRIANGLE_MODES)) != 0; } /** diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index c55461f..ff32672 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -287,6 +287,7 @@ enum #define SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(x) (((x) & 0x3) << 5) /* 0->0, 1->1, 2->2, 3->4 */ #define SI_GET_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(x) (((x) >> 5) & 0x3) #define SI_NGG_CULL_GS_FAST_LAUNCH_ALL (0xf << 3) /* GS fast launch (both prim types) */ +#define SI_NGG_CULL_LINES (1 << 7) /* the primitive type is lines */ /** * For VS shader keys, describe any fixups required for vertex fetch. @@ -685,7 +686,7 @@ struct si_shader_key { unsigned kill_pointsize : 1; /* For NGG VS and TES. */ - unsigned ngg_culling : 7; /* SI_NGG_CULL_* */ + unsigned ngg_culling : 8; /* SI_NGG_CULL_* */ /* For shaders where monolithic variants have better code. * @@ -963,7 +964,8 @@ static inline bool si_shader_uses_bindless_images(struct si_shader_selector *sel static inline bool gfx10_edgeflags_have_effect(struct si_shader *shader) { if (shader->selector->info.stage == MESA_SHADER_VERTEX && - !shader->selector->info.base.vs.blit_sgprs_amd) + !shader->selector->info.base.vs.blit_sgprs_amd && + !(shader->key.opt.ngg_culling & SI_NGG_CULL_LINES)) return true; return false; diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index cd1bd63..5c28b3b 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -587,7 +587,7 @@ void si_get_vs_key_inputs(struct si_context *sctx, struct si_shader_key *key, void si_update_ps_inputs_read_or_disabled(struct si_context *sctx); void si_update_ps_kill_enable(struct si_context *sctx); void si_update_vrs_flat_shading(struct si_context *sctx); -unsigned si_get_input_prim(const struct si_shader_selector *gs); +unsigned 
si_get_input_prim(const struct si_shader_selector *gs, const struct si_shader_key *key); bool si_update_ngg(struct si_context *sctx); void si_ps_key_update_framebuffer(struct si_context *sctx); void si_ps_key_update_framebuffer_blend(struct si_context *sctx); diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index 671966b..73652a6 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -2157,8 +2157,8 @@ static void si_draw_vbo(struct pipe_context *ctx, if (NGG && !HAS_GS && /* Tessellation sets ngg_cull_vert_threshold to UINT_MAX if the prim type - * is not triangles, so this check is only needed without tessellation. */ - (HAS_TESS || sctx->current_rast_prim == PIPE_PRIM_TRIANGLES) && + * is points, so this check is only needed without tessellation. */ + (HAS_TESS || util_rast_prim_is_lines_or_triangles(sctx->current_rast_prim)) && /* Only the first draw for a shader starts with culling disabled and it's disabled * until we pass the total_direct_count check and then it stays enabled until * the shader is changed. This eliminates most culling on/off state changes. */ @@ -2170,6 +2170,11 @@ static void si_draw_vbo(struct pipe_context *ctx, rs->ngg_cull_flags; assert(ngg_culling); /* rasterizer state should always set this to non-zero */ + if (util_prim_is_lines(sctx->current_rast_prim)) { + /* Overwrite it to mask out face cull flags. */ + ngg_culling = SI_NGG_CULL_ENABLED | SI_NGG_CULL_LINES; + } + /* Use NGG fast launch for certain primitive types. * A draw must have at least 1 full primitive. * The fast launch doesn't work with tessellation. 
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index c69e70b..2414d52 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1094,7 +1094,7 @@ static void gfx10_emit_shader_ngg_tess_gs(struct si_context *sctx) gfx10_emit_shader_ngg_tail(sctx, shader); } -unsigned si_get_input_prim(const struct si_shader_selector *gs) +unsigned si_get_input_prim(const struct si_shader_selector *gs, const struct si_shader_key *key) { if (gs->info.stage == MESA_SHADER_GEOMETRY) return gs->info.base.gs.input_primitive; @@ -1107,7 +1107,9 @@ unsigned si_get_input_prim(const struct si_shader_selector *gs) return PIPE_PRIM_TRIANGLES; } - /* TODO: Set this correctly if the primitive type is set in the shader key. */ + if (key->opt.ngg_culling & SI_NGG_CULL_LINES) + return PIPE_PRIM_LINES; + return PIPE_PRIM_TRIANGLES; /* worst case for all callers */ } @@ -1151,7 +1153,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader gs_info->base.vs.window_space_position : 0; bool es_enable_prim_id = shader->key.mono.u.vs_export_prim_id || es_info->uses_primid; unsigned gs_num_invocations = MAX2(gs_sel->info.base.gs.invocations, 1); - unsigned input_prim = si_get_input_prim(gs_sel); + unsigned input_prim = si_get_input_prim(gs_sel, &shader->key); bool break_wave_at_eoi = false; struct si_pm4_state *pm4 = si_get_shader_pm4_state(shader); if (!pm4) @@ -2987,7 +2989,7 @@ static void *si_create_shader_selector(struct pipe_context *ctx, sel->ngg_cull_vert_threshold = 128; } } else if (sel->info.stage == MESA_SHADER_TESS_EVAL) { - if (sel->rast_prim == PIPE_PRIM_TRIANGLES && + if (sel->rast_prim != PIPE_PRIM_POINTS && (sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING_ALL) || sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING_TESS) || sscreen->info.chip_class == GFX10_3)) -- 2.7.4