From 9151ac3531f05a825b6a07c4977251b45ed34141 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Fri, 5 Nov 2021 21:56:24 -0400 Subject: [PATCH] ac,radeonsi: cull small lines in the shader using the diamond exit rule It also splits clip_half_line_width into X and Y components for tighter view culling. Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/amd/llvm/ac_llvm_cull.c | 96 +++++++++++++++++++++++- src/gallium/drivers/radeonsi/gfx10_shader_ngg.c | 31 ++++---- src/gallium/drivers/radeonsi/si_pipe.h | 2 + src/gallium/drivers/radeonsi/si_shader.h | 3 +- src/gallium/drivers/radeonsi/si_state.c | 3 +- src/gallium/drivers/radeonsi/si_state_viewport.c | 11 ++- 6 files changed, 125 insertions(+), 21 deletions(-) diff --git a/src/amd/llvm/ac_llvm_cull.c b/src/amd/llvm/ac_llvm_cull.c index 87d201f..d37a9f8 100644 --- a/src/amd/llvm/ac_llvm_cull.c +++ b/src/amd/llvm/ac_llvm_cull.c @@ -120,6 +120,25 @@ static LLVMValueRef ac_cull_face(struct ac_llvm_context *ctx, LLVMValueRef pos[3 return accepted; } +static void rotate_45degrees(struct ac_llvm_context *ctx, LLVMValueRef v[2]) +{ + /* sin(45) == cos(45) */ + LLVMValueRef sincos45 = LLVMConstReal(ctx->f32, 0.707106781); + + /* x2 = x*cos45 - y*sin45 = x*sincos45 - y*sincos45 + * y2 = x*sin45 + y*cos45 = x*sincos45 + y*sincos45 + */ + LLVMValueRef first = LLVMBuildFMul(ctx->builder, v[0], sincos45, ""); + + /* Doing 2x ffma while duplicating the multiplication is 33% faster than fmul+fadd+fadd. */ + LLVMValueRef result[2] = { + ac_build_fmad(ctx, LLVMBuildFNeg(ctx->builder, v[1], ""), sincos45, first), + ac_build_fmad(ctx, v[1], sincos45, first), + }; + + memcpy(v, result, sizeof(result)); +} + /* Perform view culling and small primitive elimination and return true * if the primitive is accepted and initially_accepted == true. 
*/ static void cull_bbox(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4], @@ -181,8 +200,8 @@ static void cull_bbox(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4], } } - /* Small primitive elimination. */ - if (options->cull_small_prims) { + /* Small primitive culling - triangles. */ + if (options->cull_small_prims && options->num_vertices == 3) { /* Assuming a sample position at (0.5, 0.5), if we round * the bounding box min/max extents and the results of * the rounding are equal in either the X or Y direction, @@ -214,6 +233,79 @@ static void cull_bbox(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4], accepted = LLVMBuildAnd(builder, accepted, visible, ""); } + /* Small primitive culling - lines. */ + if (options->cull_small_prims && options->num_vertices == 2) { + /* This only works with lines without perpendicular end caps (lines with perpendicular + * end caps are rasterized as quads and thus can't be culled as small prims in 99% of + * cases because line_width >= 1). + * + * This takes advantage of the diamond exit rule, which says that every pixel + * has a diamond inside it touching the pixel boundary and only if a line exits + * the diamond, that pixel is filled. If a line enters the diamond or stays + * outside the diamond, the pixel isn't filled. + * + * This algorithm is a little simpler than that. The space outside all diamonds also + * has the same diamond shape, which we'll call corner diamonds. + * + * The idea is to cull all lines that are entirely inside a diamond, including + * corner diamonds. If a line is entirely inside a diamond, it can be culled because + * it doesn't exit it. If a line is entirely inside a corner diamond, it can be culled + * because it doesn't enter any diamond and thus can't exit any diamond. + * + * The viewport is rotated by 45 degrees to turn diamonds into squares, and a bounding + * box test is used to determine whether a line is entirely inside any square (diamond). + * + * The line width doesn't matter. 
Wide lines only duplicate filled pixels in either X or + * Y direction from the filled pixels. MSAA also doesn't matter. MSAA should ideally use + * perpendicular end caps that enable quad rasterization for lines. Thus, this should + * always use non-MSAA viewport transformation and non-MSAA small prim precision. + * + * A good test is piglit/lineloop because it draws 10k subpixel lines in a circle. + * It should contain no holes if this matches hw behavior. + */ + LLVMValueRef v0[2], v1[2]; + + /* Get vertex positions in pixels. */ + for (unsigned chan = 0; chan < 2; chan++) { + v0[chan] = ac_build_fmad(ctx, pos[0][chan], vp_scale[chan], vp_translate[chan]); + v1[chan] = ac_build_fmad(ctx, pos[1][chan], vp_scale[chan], vp_translate[chan]); + } + + /* Rotate the viewport by 45 degrees, so that diamonds become squares. */ + rotate_45degrees(ctx, v0); + rotate_45degrees(ctx, v1); + + LLVMValueRef not_equal[2]; + + for (unsigned chan = 0; chan < 2; chan++) { + /* The width of each square is sqrt(0.5), so scale it to 1 because we want + * round() to give us the position of the closest center of a square (diamond). + */ + v0[chan] = LLVMBuildFMul(builder, v0[chan], LLVMConstReal(ctx->f32, 1.414213562), ""); + v1[chan] = LLVMBuildFMul(builder, v1[chan], LLVMConstReal(ctx->f32, 1.414213562), ""); + + /* Compute the bounding box around both vertices. We do this because we must + * enlarge the line area by the precision of the rasterizer. + */ + LLVMValueRef min = ac_build_fmin(ctx, v0[chan], v1[chan]); + LLVMValueRef max = ac_build_fmax(ctx, v0[chan], v1[chan]); + + /* Enlarge the bounding box by the precision of the rasterizer. */ + min = LLVMBuildFSub(builder, min, small_prim_precision, ""); + max = LLVMBuildFAdd(builder, max, small_prim_precision, ""); + + /* Round the bounding box corners. If both rounded corners are equal, + * the bounding box is entirely inside a square (diamond). 
+ */ + min = ac_build_round(ctx, min); + max = ac_build_round(ctx, max); + not_equal[chan] = LLVMBuildFCmp(builder, LLVMRealONE, min, max, ""); + } + + accepted = LLVMBuildAnd(builder, accepted, + LLVMBuildOr(builder, not_equal[0], not_equal[1], ""), ""); + } + /* Disregard the bounding box culling if any W is negative because the code * doesn't work with that. */ diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c index 71a14a5..b5369fa 100644 --- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c +++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c @@ -970,44 +970,47 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi) } } + LLVMValueRef vp_scale[2] = {}, vp_translate[2] = {}, small_prim_precision = NULL; LLVMValueRef clip_half_line_width[2] = {}; /* Load the viewport state for small prim culling. */ + bool prim_is_lines = shader->key.ge.opt.ngg_culling & SI_NGG_CULL_LINES; LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->small_prim_cull_info); - LLVMValueRef vp = ac_build_load_to_sgpr(&ctx->ac, ptr, ctx->ac.i32_0); + /* Lines will always use the non-AA viewport transformation. */ + LLVMValueRef vp = ac_build_load_to_sgpr(&ctx->ac, ptr, + prim_is_lines ? ctx->ac.i32_1 : ctx->ac.i32_0); vp = LLVMBuildBitCast(builder, vp, ctx->ac.v4f32, ""); - LLVMValueRef vp_scale[2], vp_translate[2]; vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0); vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1); vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2); vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3); - /* Get the small prim filter precision. 
*/ - LLVMValueRef small_prim_precision = si_unpack_param(ctx, ctx->vs_state_bits, 7, 4); - small_prim_precision = - LLVMBuildOr(builder, small_prim_precision, LLVMConstInt(ctx->ac.i32, 0x70, 0), ""); - small_prim_precision = - LLVMBuildShl(builder, small_prim_precision, LLVMConstInt(ctx->ac.i32, 23, 0), ""); - small_prim_precision = LLVMBuildBitCast(builder, small_prim_precision, ctx->ac.f32, ""); - /* Execute culling code. */ struct ac_cull_options options = {}; options.cull_view_xy = true; options.cull_w = true; - if (shader->key.ge.opt.ngg_culling & SI_NGG_CULL_LINES) { - ptr = LLVMBuildPointerCast(builder, ptr, - LLVMPointerType(ctx->ac.v2i32, AC_ADDR_SPACE_CONST_32BIT), ""); + if (prim_is_lines) { LLVMValueRef terms = ac_build_load_to_sgpr(&ctx->ac, ptr, LLVMConstInt(ctx->ac.i32, 2, 0)); - terms = LLVMBuildBitCast(builder, terms, ctx->ac.v2f32, ""); + terms = LLVMBuildBitCast(builder, terms, ctx->ac.v4f32, ""); clip_half_line_width[0] = ac_llvm_extract_elem(&ctx->ac, terms, 0); clip_half_line_width[1] = ac_llvm_extract_elem(&ctx->ac, terms, 1); + small_prim_precision = ac_llvm_extract_elem(&ctx->ac, terms, 2); options.num_vertices = 2; + options.cull_small_prims = shader->key.ge.opt.ngg_culling & SI_NGG_CULL_SMALL_LINES_DIAMOND_EXIT; assert(!(shader->key.ge.opt.ngg_culling & SI_NGG_CULL_BACK_FACE)); assert(!(shader->key.ge.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE)); } else { + /* Get the small prim filter precision. 
*/ + small_prim_precision = si_unpack_param(ctx, ctx->vs_state_bits, 7, 4); + small_prim_precision = + LLVMBuildOr(builder, small_prim_precision, LLVMConstInt(ctx->ac.i32, 0x70, 0), ""); + small_prim_precision = + LLVMBuildShl(builder, small_prim_precision, LLVMConstInt(ctx->ac.i32, 23, 0), ""); + small_prim_precision = LLVMBuildBitCast(builder, small_prim_precision, ctx->ac.f32, ""); + options.num_vertices = 3; options.cull_front = shader->key.ge.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE; options.cull_back = shader->key.ge.opt.ngg_culling & SI_NGG_CULL_BACK_FACE; diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index a98b648..d5d88a4 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -899,7 +899,9 @@ struct si_saved_cs { struct si_small_prim_cull_info { float scale[2], translate[2]; + float scale_no_aa[2], translate_no_aa[2]; float clip_half_line_width[2]; /* line_width * 0.5 in clip space in X and Y directions */ + float small_prim_precision_no_aa; /* same as the small prim precision, but ignores MSAA */ /* The above fields are uploaded to memory. The below fields are passed via user SGPRs. */ float small_prim_precision; }; diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 118c37e..bc27e82 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -283,6 +283,7 @@ enum #define SI_NGG_CULL_BACK_FACE (1 << 1) /* back faces */ #define SI_NGG_CULL_FRONT_FACE (1 << 2) /* front faces */ #define SI_NGG_CULL_LINES (1 << 3) /* the primitive type is lines */ +#define SI_NGG_CULL_SMALL_LINES_DIAMOND_EXIT (1 << 4) /* cull small lines according to the diamond exit rule */ /** * For VS shader keys, describe any fixups required for vertex fetch. @@ -660,7 +661,7 @@ struct si_shader_key_ge { unsigned kill_pointsize : 1; /* For NGG VS and TES. 
*/ - unsigned ngg_culling : 4; /* SI_NGG_CULL_* */ + unsigned ngg_culling : 5; /* SI_NGG_CULL_* */ /* For shaders where monolithic variants have better code. * diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index d12424c..ffff623 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -969,7 +969,8 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast } else { rs->ngg_cull_flags_tris = rs->ngg_cull_flags_tris_y_inverted = SI_NGG_CULL_ENABLED; rs->ngg_cull_flags_lines = SI_NGG_CULL_ENABLED | - SI_NGG_CULL_LINES; + SI_NGG_CULL_LINES | + (!rs->perpendicular_end_caps ? SI_NGG_CULL_SMALL_LINES_DIAMOND_EXIT : 0); bool cull_front, cull_back; diff --git a/src/gallium/drivers/radeonsi/si_state_viewport.c b/src/gallium/drivers/radeonsi/si_state_viewport.c index be184bf..7db69b9 100644 --- a/src/gallium/drivers/radeonsi/si_state_viewport.c +++ b/src/gallium/drivers/radeonsi/si_state_viewport.c @@ -70,6 +70,9 @@ static void si_get_small_prim_cull_info(struct si_context *sctx, struct si_small info.translate[1] += 0.5; } + memcpy(info.scale_no_aa, info.scale, sizeof(info.scale)); + memcpy(info.translate_no_aa, info.translate, sizeof(info.translate)); + /* Scale the framebuffer up, so that samples become pixels and small * primitive culling is the same for all sample counts. 
* This only works with the standard DX sample positions, because @@ -87,11 +90,13 @@ static void si_get_small_prim_cull_info(struct si_context *sctx, struct si_small unsigned quant_mode = sctx->viewports.as_scissor[0].quant_mode; if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH) - info.small_prim_precision = num_samples / 4096.0; + info.small_prim_precision_no_aa = 1.0 / 4096.0; else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH) - info.small_prim_precision = num_samples / 1024.0; + info.small_prim_precision_no_aa = 1.0 / 1024.0; else - info.small_prim_precision = num_samples / 256.0; + info.small_prim_precision_no_aa = 1.0 / 256.0; + + info.small_prim_precision = num_samples * info.small_prim_precision_no_aa; *out = info; } -- 2.7.4