}
case nir_intrinsic_ordered_xfb_counter_add_amd: {
/* must be called in a single lane of a workgroup. */
+ /* TODO: Add RADV support. */
+ bool use_gds_registers = ctx->ac.gfx_level >= GFX11 &&
+ ctx->ac.float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL;
LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS);
LLVMValueRef gdsbase = LLVMBuildIntToPtr(ctx->ac.builder, ctx->ac.i32_0, gdsptr, "");
*
* This is the expected code:
* ds_ordered_count release=0 done=0 // lock mutex
- * ds_add_rtn_u32 dwords_written0
- * ds_add_rtn_u32 dwords_written1
- * ds_add_rtn_u32 dwords_written2
- * ds_add_rtn_u32 dwords_written3
+ * if (gfx_level >= GFX11) {
+ * ds_add_gs_reg_rtn GDS_STRMOUT_DWORDS_WRITTEN_0
+ * ds_add_gs_reg_rtn GDS_STRMOUT_DWORDS_WRITTEN_1
+ * ds_add_gs_reg_rtn GDS_STRMOUT_DWORDS_WRITTEN_2
+ * ds_add_gs_reg_rtn GDS_STRMOUT_DWORDS_WRITTEN_3
+ * } else {
+ * ds_add_rtn_u32 dwords_written0
+ * ds_add_rtn_u32 dwords_written1
+ * ds_add_rtn_u32 dwords_written2
+ * ds_add_rtn_u32 dwords_written3
+ * }
* ds_ordered_count release=1 done=1 // unlock mutex
*
- * TODO: Increment GDS_STRMOUT registers instead of GDS memory.
+ * GDS_STRMOUT_DWORDS_WRITTEN_n are just general-purpose global registers. We use them
+ * because MCBP (mid-command-buffer preemption) saves and restores them, and it doesn't
+ * save and restore GDS memory.
*/
LLVMValueRef args[8] = {
LLVMBuildIntToPtr(ctx->ac.builder, get_src(ctx, instr->src[0]), gdsptr, ""),
ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
LLVMValueRef global_count[4];
- LLVMValueRef add_count = get_src(ctx, instr->src[1]);
+ LLVMValueRef count_vec = get_src(ctx, instr->src[1]);
unsigned write_mask = nir_intrinsic_write_mask(instr);
for (unsigned i = 0; i < instr->num_components; i++) {
+ LLVMValueRef value =
+ LLVMBuildExtractElement(ctx->ac.builder, count_vec,
+ LLVMConstInt(ctx->ac.i32, i, false), "");
if (write_mask & (1 << i)) {
- LLVMValueRef gds_ptr =
- ac_build_gep_ptr(&ctx->ac, ctx->ac.i32, gdsbase, LLVMConstInt(ctx->ac.i32, i, 0));
- LLVMValueRef count =
- LLVMBuildExtractElement(ctx->ac.builder, add_count,
- LLVMConstInt(ctx->ac.i32, i, false), "");
-
- global_count[i] =
- LLVMBuildAtomicRMW(ctx->ac.builder, LLVMAtomicRMWBinOpAdd, gds_ptr, count,
- LLVMAtomicOrderingMonotonic, false);
- } else
+ if (use_gds_registers) {
+ /* The offset is relative to GDS_STRMOUT_DWORDS_WRITTEN_0; i * 4 selects register i. */
+ global_count[i] =
+ ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.add.gs.reg.rtn.i32", ctx->ac.i32,
+ (LLVMValueRef[]){value, LLVMConstInt(ctx->ac.i32, i * 4, 0)},
+ 2, 0);
+ } else {
+ LLVMValueRef gds_ptr =
+ ac_build_gep_ptr(&ctx->ac, ctx->ac.i32, gdsbase, LLVMConstInt(ctx->ac.i32, i, 0));
+ global_count[i] =
+ LLVMBuildAtomicRMW(ctx->ac.builder, LLVMAtomicRMWBinOpAdd, gds_ptr, value,
+ LLVMAtomicOrderingMonotonic, false);
+ }
+ } else {
global_count[i] = LLVMGetUndef(ctx->ac.i32);
+ }
}
ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
args[6] = args[7] = ctx->ac.i1true;
ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.ordered.add", ctx->ac.i32,
args, ARRAY_SIZE(args), 0);
-
result = ac_build_gather_values(&ctx->ac, global_count, instr->num_components);
break;
}
case nir_intrinsic_xfb_counter_sub_amd: {
/* must be called in a single lane of a workgroup. */
+ /* TODO: Add RADV support. */
+ bool use_gds_registers = ctx->ac.gfx_level >= GFX11 &&
+ ctx->ac.float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL;
LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS);
LLVMValueRef gdsbase = LLVMBuildIntToPtr(ctx->ac.builder, ctx->ac.i32_0, gdsptr, "");
LLVMValueRef sub_vec = get_src(ctx, instr->src[0]);
LLVMValueRef value =
LLVMBuildExtractElement(ctx->ac.builder, sub_vec,
LLVMConstInt(ctx->ac.i32, i, false), "");
-
- LLVMValueRef gds_ptr =
- ac_build_gep_ptr(&ctx->ac, ctx->ac.i32, gdsbase, LLVMConstInt(ctx->ac.i32, i, 0));
- LLVMBuildAtomicRMW(ctx->ac.builder, LLVMAtomicRMWBinOpSub, gds_ptr, value,
- LLVMAtomicOrderingMonotonic, false);
+ if (use_gds_registers) {
+ /* The offset is relative to GDS_STRMOUT_DWORDS_WRITTEN_0; i * 4 selects register i. */
+ ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.sub.gs.reg.rtn.i32", ctx->ac.i32,
+ (LLVMValueRef[]){value, LLVMConstInt(ctx->ac.i32, i * 4, 0)},
+ 2, 0);
+ } else {
+ LLVMValueRef gds_ptr =
+ ac_build_gep_ptr(&ctx->ac, ctx->ac.i32, gdsbase, LLVMConstInt(ctx->ac.i32, i, 0));
+ LLVMBuildAtomicRMW(ctx->ac.builder, LLVMAtomicRMWBinOpSub, gds_ptr, value,
+ LLVMAtomicOrderingMonotonic, false);
+ }
}
}
break;
*/
sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE;
- /* The BUFFER_FILLED_SIZE is written using a PS_DONE event. */
if (sctx->screen->use_ngg_streamout) {
- sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
+ if (sctx->gfx_level >= GFX11) {
+ sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
+ } else {
+ /* The BUFFER_FILLED_SIZE is written using a PS_DONE event. */
+ sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
+ }
/* Wait now. This is needed to make sure that GDS is not
* busy at the end of IBs.
if (sctx->streamout.num_targets && sctx->streamout.begin_emitted)
si_emit_streamout_end(sctx);
+ /* TODO: This is a hack that fixes streamout failures. It shouldn't be necessary. */
+ if (sctx->gfx_level >= GFX11 && !wait_now)
+ si_flush_gfx_cs(sctx, 0, NULL);
+
/* Set the new targets. */
unsigned enabled_mask = 0, append_bitmask = 0;
for (i = 0; i < num_targets; i++) {
t[i]->stride_in_dw = sctx->streamout.stride_in_dw[i];
- if (sctx->screen->use_ngg_streamout) {
+ if (sctx->gfx_level >= GFX11) {
+ if (sctx->streamout.append_bitmask & (1 << i)) {
+ /* Appending: restore the register value saved in buf_filled_size. */
+ si_cp_copy_data(sctx, cs, COPY_DATA_REG, NULL,
+ (R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 / 4) + i,
+ COPY_DATA_SRC_MEM, t[i]->buf_filled_size,
+ t[i]->buf_filled_size_offset);
+ } else {
+ /* Not appending: reset the register to 0. */
+ radeon_begin(cs);
+ radeon_set_uconfig_reg(R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 + i * 4, 0);
+ radeon_end();
+ }
+ } else if (sctx->screen->use_ngg_streamout) {
bool append = sctx->streamout.append_bitmask & (1 << i);
uint64_t va = 0;
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
struct si_streamout_target **t = sctx->streamout.targets;
- if (!sctx->screen->use_ngg_streamout)
+ if (sctx->gfx_level >= GFX11) {
+ /* Wait for streamout to finish before reading GDS_STRMOUT registers. */
+ sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH;
+ sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+ } else if (!sctx->screen->use_ngg_streamout) {
si_flush_vgt_streamout(sctx);
+ }
for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
if (!t[i])
uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
- if (sctx->screen->use_ngg_streamout) {
+ if (sctx->gfx_level >= GFX11) {
+ si_cp_copy_data(sctx, &sctx->gfx_cs, COPY_DATA_DST_MEM,
+ t[i]->buf_filled_size, t[i]->buf_filled_size_offset,
+ COPY_DATA_REG, NULL,
+ (R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 >> 2) + i);
+ sctx->flags |= SI_CONTEXT_PFP_SYNC_ME;
+ } else if (sctx->screen->use_ngg_streamout) {
/* TODO: PS_DONE doesn't ensure completion of VS if there are no PS waves. */
si_cp_release_mem(sctx, &sctx->gfx_cs, V_028A90_PS_DONE, 0, EOP_DST_SEL_TC_L2,
EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_GDS,