From aaed7a29bef6637c712c459f84ec6ec7911f1300 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Mon, 9 Nov 2020 17:52:45 -0500 Subject: [PATCH] radeonsi: implement GS fast launch for indexed triangle strips This increases performance for indexed triangle strips up to +100%. In practice, it's limited by memory bandwidth and compute power, so a 256-bit memory bus and a lot of CUs are recommended. Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_shader.c | 5 +- src/gallium/drivers/radeonsi/si_shader.h | 7 ++- .../drivers/radeonsi/si_shader_llvm_vs.c | 42 ++++++++++++++ src/gallium/drivers/radeonsi/si_state_draw.c | 55 +++++++++++++++++-- 4 files changed, 101 insertions(+), 8 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index a11dfdf31de..e361631af4e 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -1582,6 +1582,8 @@ static void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num !!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST); key->vs_prolog.gs_fast_launch_tri_strip = !!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP); + key->vs_prolog.gs_fast_launch_index_size_packed = + SI_GET_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(shader_out->key.opt.ngg_culling); } if (shader_out->selector->info.stage == MESA_SHADER_TESS_CTRL) { @@ -2056,7 +2058,8 @@ si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list, shader.key.as_ngg = key->vs_prolog.as_ngg; shader.key.opt.ngg_culling = (key->vs_prolog.gs_fast_launch_tri_list ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST : 0) | - (key->vs_prolog.gs_fast_launch_tri_strip ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP : 0); + (key->vs_prolog.gs_fast_launch_tri_strip ? 
SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP : 0) | + SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(key->vs_prolog.gs_fast_launch_index_size_packed); shader.key.opt.vs_as_prim_discard_cs = key->vs_prolog.as_prim_discard_cs; break; case MESA_SHADER_TESS_CTRL: diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index b080bfc835d..767247b75b6 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -278,7 +278,9 @@ enum #define SI_NGG_CULL_FRONT_FACE (1 << 2) /* front faces */ #define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST (1 << 3) /* GS fast launch: triangles */ #define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP (1 << 4) /* GS fast launch: triangle strip */ -#define SI_NGG_CULL_GS_FAST_LAUNCH_ALL (0x3 << 3) /* GS fast launch (both prim types) */ +#define SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(x) (((x) & 0x3) << 5) /* 0->0, 1->1, 2->2, 3->4 */ +#define SI_GET_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(x) (((x) >> 5) & 0x3) +#define SI_NGG_CULL_GS_FAST_LAUNCH_ALL (0xf << 3) /* GS fast launch (both prim types) */ /** * For VS shader keys, describe any fixups required for vertex fetch. @@ -559,6 +561,7 @@ union si_shader_part_key { unsigned as_prim_discard_cs : 1; unsigned gs_fast_launch_tri_list : 1; /* for NGG culling */ unsigned gs_fast_launch_tri_strip : 1; /* for NGG culling */ + unsigned gs_fast_launch_index_size_packed : 2; /* Prologs for monolithic shaders shouldn't set EXEC. */ unsigned is_monolithic : 1; } vs_prolog; @@ -652,7 +655,7 @@ struct si_shader_key { unsigned kill_pointsize : 1; /* For NGG VS and TES. */ - unsigned ngg_culling : 5; /* SI_NGG_CULL_* */ + unsigned ngg_culling : 7; /* SI_NGG_CULL_* */ /* For shaders where monolithic variants have better code. 
* diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c index 6632f54b096..8df88a49699 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c @@ -923,6 +923,48 @@ void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part input_vgprs[5] = LLVMBuildAdd(ctx->ac.builder, input_vgprs[5], thread_id_in_tg, ""); /* VertexID */ input_vgprs[8] = input_vgprs[6]; /* InstanceID */ + + if (key->vs_prolog.gs_fast_launch_index_size_packed) { + LLVMTypeRef index_type = ctx->ac.voidt; + + switch (key->vs_prolog.gs_fast_launch_index_size_packed) { + case 1: + index_type = ctx->ac.i8; + break; + case 2: + index_type = ctx->ac.i16; + break; + case 3: + index_type = ctx->ac.i32; + break; + default: + unreachable("invalid gs_fast_launch_index_size_packed"); + } + + LLVMValueRef sgprs[2] = { + ac_get_arg(&ctx->ac, input_sgpr_param[0]), + ac_get_arg(&ctx->ac, input_sgpr_param[1]), + }; + LLVMValueRef indices = ac_build_gather_values(&ctx->ac, sgprs, 2); + indices = LLVMBuildBitCast(ctx->ac.builder, indices, ctx->ac.i64, ""); + indices = LLVMBuildIntToPtr(ctx->ac.builder, indices, + LLVMPointerType(index_type, AC_ADDR_SPACE_CONST), ""); + + LLVMValueRef vertex_id = ac_build_alloca_init(&ctx->ac, input_vgprs[5], ""); + + /* if (is ES thread...) 
*/ + ac_build_ifcc(&ctx->ac, + LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), + si_unpack_param(ctx, merged_wave_info, 0, 8), ""), 0); + /* VertexID = indexBufferLoad(VertexID); */ + LLVMValueRef index = LLVMBuildGEP(ctx->ac.builder, indices, &input_vgprs[5], 1, ""); + index = LLVMBuildLoad(ctx->ac.builder, index, ""); + index = LLVMBuildZExt(ctx->ac.builder, index, ctx->ac.i32, ""); + LLVMBuildStore(ctx->ac.builder, index, vertex_id); + ac_build_endif(&ctx->ac, 0); + + input_vgprs[5] = LLVMBuildLoad(ctx->ac.builder, vertex_id, ""); + } } unsigned vertex_id_vgpr = first_vs_vgpr; diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index 74cfd6ac4d6..5390672f2fc 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -1020,6 +1020,38 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw !(sctx->ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL))); } } else { + /* Set the index buffer for fast launch. The VS prolog will load the indices. */ + if (sctx->ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0)) { + index_max_size = (indexbuf->width0 - index_offset) >> util_logbase2(original_index_size); + + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, si_resource(indexbuf), + RADEON_USAGE_READ, RADEON_PRIO_INDEX_BUFFER); + uint64_t base_index_va = si_resource(indexbuf)->gpu_address + index_offset; + + for (unsigned i = 0; i < num_draws; i++) { + uint64_t index_va = base_index_va + draws[i].start * original_index_size; + + radeon_set_sh_reg_seq(cs, R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS, 2); + radeon_emit(cs, index_va); + radeon_emit(cs, index_va >> 32); + + if (i > 0) { + if (info->increment_draw_id) { + unsigned draw_id = info->drawid + i; + + radeon_set_sh_reg(cs, sh_base_reg + SI_SGPR_DRAWID * 4, draw_id); + sctx->last_drawid = draw_id; + } + } + + /* TODO: Do index buffer bounds checking? 
We don't do it in this case. */ + radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit)); + radeon_emit(cs, draws[i].count); + radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX); + } + return; + } + for (unsigned i = 0; i < num_draws; i++) { if (i > 0) { if (info->increment_draw_id) { @@ -2131,15 +2163,19 @@ static void si_draw_vbo(struct pipe_context *ctx, ngg_culling |= SI_NGG_CULL_BACK_FACE; } - /* Use NGG fast launch for certain non-indexed primitive types. + /* Use NGG fast launch for certain primitive types. * A draw must have at least 1 full primitive. */ - if (ngg_culling && !index_size && min_direct_count >= 3 && !sctx->tes_shader.cso && + if (ngg_culling && min_direct_count >= 3 && !sctx->tes_shader.cso && !sctx->gs_shader.cso) { - if (prim == PIPE_PRIM_TRIANGLES) + if (prim == PIPE_PRIM_TRIANGLES && !index_size) { ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST; - else if (prim == PIPE_PRIM_TRIANGLE_STRIP) - ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP; + } else if (prim == PIPE_PRIM_TRIANGLE_STRIP && !primitive_restart) { + ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP | + SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(MIN2(index_size, 3)); + /* The index buffer will be emulated. */ + index_size = 0; + } } if (ngg_culling != old_ngg_culling) { @@ -2175,6 +2211,15 @@ static void si_draw_vbo(struct pipe_context *ctx, ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) sctx->flags |= SI_CONTEXT_VGT_FLUSH; + if (old_ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0) && + !(ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0))) { + /* Need to re-set these, because we have bound an index buffer there. */ + sctx->shader_pointers_dirty |= + (1u << si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_GEOMETRY)) | + (1u << si_sampler_and_image_descriptors_idx(PIPE_SHADER_GEOMETRY)); + si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); + } + /* Set this to the correct value determined by si_update_shaders. 
*/ sctx->ngg_culling = ngg_culling; } -- 2.34.1