From: Marek Olšák Date: Thu, 2 Jun 2022 19:43:07 +0000 (-0400) Subject: radeonsi/gfx11: rework GDS streamout code to single-lane and enable streamout X-Git-Tag: upstream/22.3.5~7298 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=e24354c1b2e0e2dc23b6acf227f26a55fbf3fabd;p=platform%2Fupstream%2Fmesa.git radeonsi/gfx11: rework GDS streamout code to single-lane and enable streamout GDS is basically scalar in gfx11. This is not exactly how it's supposed to be done (we should be using the GDS_STRMOUT registers), but it works. Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c index 96560ee..8c8623d 100644 --- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c +++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c @@ -369,18 +369,79 @@ static void build_streamout(struct si_shader_context *ctx, struct ngg_streamout tmp = ac_build_quad_swizzle(&ctx->ac, tmp, swizzle[0], swizzle[1], swizzle[2], swizzle[3]); tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, ""); - LLVMValueRef args[] = { + LLVMValueRef args[8] = { LLVMBuildIntToPtr(builder, ngg_get_ordered_id(ctx), gdsptr, ""), - tmp, - ctx->ac.i32_0, // ordering - ctx->ac.i32_0, // scope - ctx->ac.i1false, // isVolatile - LLVMConstInt(ctx->ac.i32, 4 << 24, false), // OA index - ctx->ac.i1true, // wave release - ctx->ac.i1true, // wave done + ctx->ac.i32_0, /* value to add */ + ctx->ac.i32_0, /* ordering */ + ctx->ac.i32_0, /* scope */ + ctx->ac.i1false, /* isVolatile */ + LLVMConstInt(ctx->ac.i32, 1 << 24, false), /* OA index, bits 24+: lane count */ + ctx->ac.i1true, /* wave release */ + ctx->ac.i1true, /* wave done */ }; - tmp = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.ordered.add", ctx->ac.i32, args, - ARRAY_SIZE(args), 0); + + if (ctx->screen->info.gfx_level >= GFX11) { + /* Gfx11 GDS instructions only operate on the first active lane. All other lanes are + * ignored. So are their EXEC bits. This uses the mutex feature of ds_ordered_count + * to emulate a multi-dword atomic. + * + * This is the expected code: + * ds_ordered_count release=0 done=0 // lock mutex + * ds_add_rtn_u32 dwords_written0 + * ds_add_rtn_u32 dwords_written1 + * ds_add_rtn_u32 dwords_written2 + * ds_add_rtn_u32 dwords_written3 + * ds_ordered_count release=1 done=1 // unlock mutex + * + * TODO: Increment GDS_STRMOUT registers instead of GDS memory. + */ + LLVMValueRef dwords_written[4] = {tmp, tmp, tmp, tmp}; + + /* Move all 4 VGPRs from other lanes to lane 0. */ + for (unsigned i = 1; i < 4; i++) { + if (ctx->shader->selector->info.base.xfb_stride[i]) + dwords_written[i] = ac_build_quad_swizzle(&ctx->ac, tmp, i, i, i, i); + } + + /* Set release=0 to start a GDS mutex. Set done=0 because it's not the last one. */ + args[6] = args[7] = ctx->ac.i1false; + ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.ordered.add", ctx->ac.i32, + args, ARRAY_SIZE(args), 0); + ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM); + + for (unsigned i = 0; i < 4; i++) { + if (ctx->shader->selector->info.base.xfb_stride[i]) { + LLVMValueRef gds_ptr = + ac_build_gep_ptr(&ctx->ac, gdsbase, LLVMConstInt(ctx->ac.i32, i, 0)); + + dwords_written[i] = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, + gds_ptr, dwords_written[i], + LLVMAtomicOrderingMonotonic, false); + } + } + + /* TODO: This might not be needed if GDS executes instructions in order. */ + ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM); + + /* Set release=1 to end a GDS mutex. Set done=1 because it's the last one. */ + args[6] = args[7] = ctx->ac.i1true; + ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.ordered.add", ctx->ac.i32, + args, ARRAY_SIZE(args), 0); + + tmp = dwords_written[0]; + for (unsigned i = 1; i < 4; i++) { + if (ctx->shader->selector->info.base.xfb_stride[i]) { + dwords_written[i] = ac_build_readlane(&ctx->ac, dwords_written[i], ctx->ac.i32_0); + tmp = ac_build_writelane(&ctx->ac, tmp, dwords_written[i], LLVMConstInt(ctx->ac.i32, i, 0)); + } + } + } else { + args[1] = tmp; /* value to add */ + args[5] = LLVMConstInt(ctx->ac.i32, 4 << 24, false), /* bits 24+: lane count */ + + tmp = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.ordered.add", ctx->ac.i32, + args, ARRAY_SIZE(args), 0); + } /* Keep offsets in a VGPR for quick retrieval via readlane by * the first wave for bounds checking, and also store in LDS @@ -451,9 +512,28 @@ static void build_streamout(struct si_shader_context *ctx, struct ngg_streamout { tmp = LLVMBuildSub(builder, generated, emit, ""); tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, ""); - tmp2 = LLVMBuildGEP(builder, gdsbase, &tid, 1, ""); - LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpSub, tmp2, tmp, - LLVMAtomicOrderingMonotonic, false); + + if (ctx->screen->info.gfx_level >= GFX11) { + /* Gfx11 GDS instructions only operate on the first active lane. + * This is an unrolled waterfall loop. We only get here when we overflow, + * so it doesn't have to be fast. + */ + for (unsigned i = 0; i < 4; i++) { + if (bufmask_for_stream[stream] & BITFIELD_BIT(i)) { + LLVMValueRef index = LLVMConstInt(ctx->ac.i32, i, 0); + + ac_build_ifcc(&ctx->ac, LLVMBuildICmp(builder, LLVMIntEQ, tid, index, ""), 0); + LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpSub, + LLVMBuildGEP(builder, gdsbase, &index, 1, ""), + tmp, LLVMAtomicOrderingMonotonic, false); + ac_build_endif(&ctx->ac, 0); + } + } + } else { + LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpSub, + LLVMBuildGEP(builder, gdsbase, &tid, 1, ""), + tmp, LLVMAtomicOrderingMonotonic, false); + } } ac_build_endif(&ctx->ac, 5222); ac_build_endif(&ctx->ac, 5221); diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index 1e8e72c..015bb07 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -3158,11 +3158,6 @@ static void *si_create_shader_selector(struct pipe_context *ctx, if (!sel) return NULL; - if (sscreen->info.gfx_level == GFX11 && state->stream_output.num_outputs) { - fprintf(stderr, "radeonsi: streamout unimplemented\n"); - abort(); - } - sel->screen = sscreen; sel->compiler_ctx_state.debug = sctx->debug; sel->compiler_ctx_state.is_debug_context = sctx->is_debug;