amd/llvm,radeonsi/gfx11: switch to using GDS_STRMOUT registers
author Marek Olšák <marek.olsak@amd.com>
Thu, 2 Jun 2022 19:43:07 +0000 (15:43 -0400)
committer Marge Bot <emma+marge@anholt.net>
Tue, 7 Mar 2023 22:08:47 +0000 (22:08 +0000)
This is required by register shadowing (which is itself required by the new PAIRS packets),
preemption, and user queues. It also means we only have to wait for VS to be idle after
streamout, not PS. This is how gfx11 streamout should have been done.

Reviewed-by: Qiang Yu <yuq825@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21584>

src/amd/llvm/ac_nir_to_llvm.c
src/gallium/drivers/radeonsi/si_gfx_cs.c
src/gallium/drivers/radeonsi/si_state_streamout.c
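
The four counters live in consecutive 32-bit registers, and the patch addresses them three
different ways depending on the path: the shader intrinsics take a byte offset relative to
GDS_STRMOUT_DWORDS_WRITTEN_0, SET_UCONFIG_REG takes the absolute register offset, and CP
COPY_DATA takes a dword index. A minimal sketch of that arithmetic, not part of the patch
(the 0x031088 value is inferred from the R_031088_* macro name used in the hunks below):

/* Sketch: how each path addresses GDS_STRMOUT_DWORDS_WRITTEN_i on gfx11. */
#define R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 0x031088 /* assumed from the macro name */

/* llvm.amdgcn.ds.{add,sub}.gs.reg.rtn.i32: byte offset relative to register 0. */
static unsigned strmout_reg_shader_offset(unsigned i)
{
   return i * 4;
}

/* radeon_set_uconfig_reg(): absolute register offset of counter i. */
static unsigned strmout_reg_uconfig_offset(unsigned i)
{
   return R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 + i * 4;
}

/* si_cp_copy_data() with COPY_DATA_REG: dword-addressed register index. */
static unsigned strmout_reg_copy_data_index(unsigned i)
{
   return R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 / 4 + i;
}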

diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c
index 3e109fc..58b4db2 100644
--- a/src/amd/llvm/ac_nir_to_llvm.c
+++ b/src/amd/llvm/ac_nir_to_llvm.c
@@ -4171,6 +4171,9 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
    }
    case nir_intrinsic_ordered_xfb_counter_add_amd: {
       /* must be called in a single lane of a workgroup. */
+      /* TODO: Add RADV support. */
+      bool use_gds_registers = ctx->ac.gfx_level >= GFX11 &&
+                               ctx->ac.float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL;
       LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS);
       LLVMValueRef gdsbase = LLVMBuildIntToPtr(ctx->ac.builder, ctx->ac.i32_0, gdsptr, "");
 
@@ -4180,13 +4183,22 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
        *
        * This is the expected code:
        *    ds_ordered_count release=0 done=0   // lock mutex
-       *    ds_add_rtn_u32 dwords_written0
-       *    ds_add_rtn_u32 dwords_written1
-       *    ds_add_rtn_u32 dwords_written2
-       *    ds_add_rtn_u32 dwords_written3
+       *    if (gfx_level >= GFX11) {
+       *       ds_add_gs_reg_rtn GDS_STRMOUT_DWORDS_WRITTEN_0
+       *       ds_add_gs_reg_rtn GDS_STRMOUT_DWORDS_WRITTEN_1
+       *       ds_add_gs_reg_rtn GDS_STRMOUT_DWORDS_WRITTEN_2
+       *       ds_add_gs_reg_rtn GDS_STRMOUT_DWORDS_WRITTEN_3
+       *    } else {
+       *       ds_add_rtn_u32 dwords_written0
+       *       ds_add_rtn_u32 dwords_written1
+       *       ds_add_rtn_u32 dwords_written2
+       *       ds_add_rtn_u32 dwords_written3
+       *    }
        *    ds_ordered_count release=1 done=1   // unlock mutex
        *
-       * TODO: Increment GDS_STRMOUT registers instead of GDS memory.
+       * GDS_STRMOUT_DWORDS_WRITTEN_n are just general-purpose global registers. We use them
+       * because MCBP (mid-command-buffer preemption) saves and restores them, and it doesn't
+       * save and restore GDS memory.
        */
       LLVMValueRef args[8] = {
          LLVMBuildIntToPtr(ctx->ac.builder, get_src(ctx, instr->src[0]), gdsptr, ""),
@@ -4205,21 +4217,29 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
       ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
 
       LLVMValueRef global_count[4];
-      LLVMValueRef add_count = get_src(ctx, instr->src[1]);
+      LLVMValueRef count_vec = get_src(ctx, instr->src[1]);
       unsigned write_mask = nir_intrinsic_write_mask(instr);
       for (unsigned i = 0; i < instr->num_components; i++) {
+         LLVMValueRef value =
+            LLVMBuildExtractElement(ctx->ac.builder, count_vec,
+                                    LLVMConstInt(ctx->ac.i32, i, false), "");
          if (write_mask & (1 << i)) {
-            LLVMValueRef gds_ptr =
-               ac_build_gep_ptr(&ctx->ac, ctx->ac.i32, gdsbase, LLVMConstInt(ctx->ac.i32, i, 0));
-            LLVMValueRef count =
-               LLVMBuildExtractElement(ctx->ac.builder, add_count,
-                                       LLVMConstInt(ctx->ac.i32, i, false), "");
-
-            global_count[i] =
-               LLVMBuildAtomicRMW(ctx->ac.builder, LLVMAtomicRMWBinOpAdd, gds_ptr, count,
-                                  LLVMAtomicOrderingMonotonic, false);
-         } else
+            if (use_gds_registers) {
+               /* The offset is a relative offset from GDS_STRMOUT_DWORDS_WRITTEN_0. */
+               global_count[i] =
+                  ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.add.gs.reg.rtn.i32", ctx->ac.i32,
+                                     (LLVMValueRef[]){value, LLVMConstInt(ctx->ac.i32, i * 4, 0)},
+                                     2, 0);
+            } else {
+               LLVMValueRef gds_ptr =
+                  ac_build_gep_ptr(&ctx->ac, ctx->ac.i32, gdsbase, LLVMConstInt(ctx->ac.i32, i, 0));
+               global_count[i] =
+                  LLVMBuildAtomicRMW(ctx->ac.builder, LLVMAtomicRMWBinOpAdd, gds_ptr, value,
+                                     LLVMAtomicOrderingMonotonic, false);
+            }
+         } else {
             global_count[i] = LLVMGetUndef(ctx->ac.i32);
+         }
       }
 
       ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
@@ -4228,12 +4248,14 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
       args[6] = args[7] = ctx->ac.i1true;
       ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.ordered.add", ctx->ac.i32,
                          args, ARRAY_SIZE(args), 0);
-
       result = ac_build_gather_values(&ctx->ac, global_count, instr->num_components);
       break;
    }
    case nir_intrinsic_xfb_counter_sub_amd: {
       /* must be called in a single lane of a workgroup. */
+      /* TODO: Add RADV support. */
+      bool use_gds_registers = ctx->ac.gfx_level >= GFX11 &&
+                               ctx->ac.float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL;
       LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS);
       LLVMValueRef gdsbase = LLVMBuildIntToPtr(ctx->ac.builder, ctx->ac.i32_0, gdsptr, "");
       LLVMValueRef sub_vec = get_src(ctx, instr->src[0]);
@@ -4244,11 +4266,17 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
             LLVMValueRef value =
                LLVMBuildExtractElement(ctx->ac.builder, sub_vec,
                                        LLVMConstInt(ctx->ac.i32, i, false), "");
-
-            LLVMValueRef gds_ptr =
-               ac_build_gep_ptr(&ctx->ac, ctx->ac.i32, gdsbase, LLVMConstInt(ctx->ac.i32, i, 0));
-            LLVMBuildAtomicRMW(ctx->ac.builder, LLVMAtomicRMWBinOpSub, gds_ptr, value,
-                               LLVMAtomicOrderingMonotonic, false);
+            if (use_gds_registers) {
+               /* The offset is a relative offset from GDS_STRMOUT_DWORDS_WRITTEN_0. */
+               ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.sub.gs.reg.rtn.i32", ctx->ac.i32,
+                                  (LLVMValueRef[]){value, LLVMConstInt(ctx->ac.i32, i * 4, 0)},
+                                  2, 0);
+            } else {
+               LLVMValueRef gds_ptr =
+                  ac_build_gep_ptr(&ctx->ac, ctx->ac.i32, gdsbase, LLVMConstInt(ctx->ac.i32, i, 0));
+               LLVMBuildAtomicRMW(ctx->ac.builder, LLVMAtomicRMWBinOpSub, gds_ptr, value,
+                                  LLVMAtomicOrderingMonotonic, false);
+            }
          }
       }
       break;
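
A condensed sketch of the shader-side pattern this file's hunks add (the helper name is
hypothetical; the calls mirror the diff above): on gfx11 the counter update goes to a
GDS_STRMOUT register through the ds_add_gs_reg_rtn intrinsic, otherwise it stays a GDS
memory atomic.

/* Sketch: emit the dwords-written add for streamout buffer i. */
static LLVMValueRef emit_xfb_counter_add(struct ac_nir_context *ctx, bool use_gds_registers,
                                         LLVMValueRef gdsbase, LLVMValueRef value, unsigned i)
{
   if (use_gds_registers) {
      /* Byte offset relative to GDS_STRMOUT_DWORDS_WRITTEN_0. */
      return ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.add.gs.reg.rtn.i32", ctx->ac.i32,
                                (LLVMValueRef[]){value, LLVMConstInt(ctx->ac.i32, i * 4, 0)},
                                2, 0);
   } else {
      /* Pre-gfx11: atomically add to the counter kept in GDS memory. */
      LLVMValueRef gds_ptr =
         ac_build_gep_ptr(&ctx->ac, ctx->ac.i32, gdsbase, LLVMConstInt(ctx->ac.i32, i, 0));
      return LLVMBuildAtomicRMW(ctx->ac.builder, LLVMAtomicRMWBinOpAdd, gds_ptr, value,
                                LLVMAtomicOrderingMonotonic, false);
   }
}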
diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c
index 0a48aee..c9bcc51 100644
--- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
+++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
@@ -103,8 +103,12 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_h
           * idle when we leave the IB, otherwise another process
           * might overwrite it while our shaders are busy.
           */
-         if (sscreen->use_ngg_streamout)
-            wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+         if (sscreen->use_ngg_streamout) {
+            if (ctx->gfx_level >= GFX11)
+               wait_flags |= SI_CONTEXT_VS_PARTIAL_FLUSH;
+            else
+               wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+         }
       }
    }
 
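A minimal sketch of the wait-flag choice above (hypothetical helper, same flag names):
gfx11 updates the GDS_STRMOUT registers from the geometry stage, so waiting for VS is
enough, while the older NGG streamout path writes BUFFER_FILLED_SIZE with a PS_DONE
event and therefore needs PS to be idle.

/* Sketch: which partial flush keeps the streamout counters safe at the end of an IB. */
static unsigned si_streamout_wait_flag(struct si_context *ctx)
{
   return ctx->gfx_level >= GFX11 ? SI_CONTEXT_VS_PARTIAL_FLUSH
                                  : SI_CONTEXT_PS_PARTIAL_FLUSH;
}
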
diff --git a/src/gallium/drivers/radeonsi/si_state_streamout.c b/src/gallium/drivers/radeonsi/si_state_streamout.c
index d6b0f3f..95dc42e 100644
--- a/src/gallium/drivers/radeonsi/si_state_streamout.c
+++ b/src/gallium/drivers/radeonsi/si_state_streamout.c
@@ -110,9 +110,13 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ
        */
       sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE;
 
-      /* The BUFFER_FILLED_SIZE is written using a PS_DONE event. */
       if (sctx->screen->use_ngg_streamout) {
-         sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
+         if (sctx->gfx_level >= GFX11) {
+            sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
+         } else {
+            /* The BUFFER_FILLED_SIZE is written using a PS_DONE event. */
+            sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
+         }
 
          /* Wait now. This is needed to make sure that GDS is not
           * busy at the end of IBs.
@@ -143,6 +147,10 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ
    if (sctx->streamout.num_targets && sctx->streamout.begin_emitted)
       si_emit_streamout_end(sctx);
 
+   /* TODO: This is a hack that fixes streamout failures. It shouldn't be necessary. */
+   if (sctx->gfx_level >= GFX11 && !wait_now)
+      si_flush_gfx_cs(sctx, 0, NULL);
+
    /* Set the new targets. */
    unsigned enabled_mask = 0, append_bitmask = 0;
    for (i = 0; i < num_targets; i++) {
@@ -262,7 +270,20 @@ static void si_emit_streamout_begin(struct si_context *sctx)
 
       t[i]->stride_in_dw = sctx->streamout.stride_in_dw[i];
 
-      if (sctx->screen->use_ngg_streamout) {
+      if (sctx->gfx_level >= GFX11) {
+         if (sctx->streamout.append_bitmask & (1 << i)) {
+            /* Restore the register value. */
+            si_cp_copy_data(sctx, cs, COPY_DATA_REG, NULL,
+                            (R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 / 4) + i,
+                            COPY_DATA_SRC_MEM, t[i]->buf_filled_size,
+                            t[i]->buf_filled_size_offset);
+         } else {
+            /* Set to 0. */
+            radeon_begin(cs);
+            radeon_set_uconfig_reg(R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 + i * 4, 0);
+            radeon_end();
+         }
+      } else if (sctx->screen->use_ngg_streamout) {
          bool append = sctx->streamout.append_bitmask & (1 << i);
          uint64_t va = 0;
 
@@ -330,8 +351,13 @@ void si_emit_streamout_end(struct si_context *sctx)
    struct radeon_cmdbuf *cs = &sctx->gfx_cs;
    struct si_streamout_target **t = sctx->streamout.targets;
 
-   if (!sctx->screen->use_ngg_streamout)
+   if (sctx->gfx_level >= GFX11) {
+      /* Wait for streamout to finish before reading GDS_STRMOUT registers. */
+      sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH;
+      sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+   } else if (!sctx->screen->use_ngg_streamout) {
       si_flush_vgt_streamout(sctx);
+   }
 
    for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
       if (!t[i])
@@ -339,7 +365,13 @@ void si_emit_streamout_end(struct si_context *sctx)
 
       uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
 
-      if (sctx->screen->use_ngg_streamout) {
+      if (sctx->gfx_level >= GFX11) {
+         si_cp_copy_data(sctx, &sctx->gfx_cs, COPY_DATA_DST_MEM,
+                         t[i]->buf_filled_size, t[i]->buf_filled_size_offset,
+                         COPY_DATA_REG, NULL,
+                         (R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 >> 2) + i);
+         sctx->flags |= SI_CONTEXT_PFP_SYNC_ME;
+      } else if (sctx->screen->use_ngg_streamout) {
          /* TODO: PS_DONE doesn't ensure completion of VS if there are no PS waves. */
          si_cp_release_mem(sctx, &sctx->gfx_cs, V_028A90_PS_DONE, 0, EOP_DST_SEL_TC_L2,
                            EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_GDS,
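
Condensing the gfx11 CP-side handling from si_emit_streamout_begin/si_emit_streamout_end
into one hedged sketch (the helper is hypothetical; the si_cp_copy_data arguments mirror
the hunks above): at streamout begin the counter register is loaded from BUFFER_FILLED_SIZE
when appending (or cleared via SET_UCONFIG_REG), and at streamout end it is copied back to
memory once VS is idle.

/* Sketch: save/restore GDS_STRMOUT_DWORDS_WRITTEN_i for streamout target t. */
static void gfx11_strmout_counter(struct si_context *sctx, struct si_streamout_target *t,
                                  unsigned i, bool restore)
{
   if (restore) {
      /* Streamout begin: load DWORDS_WRITTEN_i from the buffer-filled-size BO. */
      si_cp_copy_data(sctx, &sctx->gfx_cs, COPY_DATA_REG, NULL,
                      (R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 / 4) + i,
                      COPY_DATA_SRC_MEM, t->buf_filled_size, t->buf_filled_size_offset);
   } else {
      /* Streamout end: store DWORDS_WRITTEN_i back to memory; the caller has already
       * issued a VS partial flush, so the register value is final. */
      si_cp_copy_data(sctx, &sctx->gfx_cs, COPY_DATA_DST_MEM,
                      t->buf_filled_size, t->buf_filled_size_offset,
                      COPY_DATA_REG, NULL,
                      (R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 >> 2) + i);
   }
}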