radeonsi: inline gfx10_emit_streamout_begin/end
authorMarek Olšák <marek.olsak@amd.com>
Fri, 3 Jun 2022 23:45:31 +0000 (19:45 -0400)
committerMarge Bot <emma+marge@anholt.net>
Sat, 11 Jun 2022 11:14:16 +0000 (11:14 +0000)
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16885>

src/gallium/drivers/radeonsi/si_state_streamout.c

index 464259d..e10174f 100644
@@ -211,70 +211,6 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ
       sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
 }
 
-static void gfx10_emit_streamout_begin(struct si_context *sctx)
-{
-   struct si_streamout_target **t = sctx->streamout.targets;
-   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
-   unsigned last_target = 0;
-
-   for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
-      if (t[i])
-         last_target = i;
-   }
-
-   radeon_begin(cs);
-
-   for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
-      if (!t[i])
-         continue;
-
-      t[i]->stride_in_dw = sctx->streamout.stride_in_dw[i];
-
-      bool append = sctx->streamout.append_bitmask & (1 << i);
-      uint64_t va = 0;
-
-      if (append) {
-         radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, t[i]->buf_filled_size,
-                                   RADEON_USAGE_READ | RADEON_PRIO_SO_FILLED_SIZE);
-
-         va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
-      }
-
-      radeon_emit(PKT3(PKT3_DMA_DATA, 5, 0));
-      radeon_emit(S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) |
-                  S_411_DST_SEL(V_411_GDS) | S_411_CP_SYNC(i == last_target));
-      radeon_emit(va);
-      radeon_emit(va >> 32);
-      radeon_emit(4 * i); /* destination in GDS */
-      radeon_emit(0);
-      radeon_emit(S_415_BYTE_COUNT_GFX9(4) | S_415_DISABLE_WR_CONFIRM_GFX9(i != last_target));
-   }
-   radeon_end();
-
-   sctx->streamout.begin_emitted = true;
-}
-
-static void gfx10_emit_streamout_end(struct si_context *sctx)
-{
-   struct si_streamout_target **t = sctx->streamout.targets;
-
-   for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
-      if (!t[i])
-         continue;
-
-      uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
-
-      /* TODO: PS_DONE doesn't ensure completion of VS if there are no PS waves. */
-      si_cp_release_mem(sctx, &sctx->gfx_cs, V_028A90_PS_DONE, 0, EOP_DST_SEL_TC_L2,
-                        EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_GDS,
-                        t[i]->buf_filled_size, va, EOP_DATA_GDS(i, 1), 0);
-
-      t[i]->buf_filled_size_valid = true;
-   }
-
-   sctx->streamout.begin_emitted = false;
-}
-
 static void si_flush_vgt_streamout(struct si_context *sctx)
 {
    struct radeon_cmdbuf *cs = &sctx->gfx_cs;
@@ -315,97 +251,121 @@ static void si_emit_streamout_begin(struct si_context *sctx)
 {
    struct radeon_cmdbuf *cs = &sctx->gfx_cs;
    struct si_streamout_target **t = sctx->streamout.targets;
-   uint8_t *stride_in_dw = sctx->streamout.stride_in_dw;
-   unsigned i;
 
-   si_flush_vgt_streamout(sctx);
-
-   radeon_begin(cs);
+   if (!sctx->screen->use_ngg_streamout)
+      si_flush_vgt_streamout(sctx);
 
-   for (i = 0; i < sctx->streamout.num_targets; i++) {
+   for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
       if (!t[i])
          continue;
 
-      t[i]->stride_in_dw = stride_in_dw[i];
+      t[i]->stride_in_dw = sctx->streamout.stride_in_dw[i];
 
-      /* AMD GCN binds streamout buffers as shader resources.
-       * VGT only counts primitives and tells the shader
-       * through SGPRs what to do. */
-      radeon_set_context_reg_seq(R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 2);
-      radeon_emit((t[i]->b.buffer_offset + t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */
-      radeon_emit(stride_in_dw[i]);                                    /* VTX_STRIDE (in DW) */
+      if (sctx->screen->use_ngg_streamout) {
+         bool append = sctx->streamout.append_bitmask & (1 << i);
+         uint64_t va = 0;
 
-      if (sctx->streamout.append_bitmask & (1 << i) && t[i]->buf_filled_size_valid) {
-         uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
+         if (append) {
+            radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, t[i]->buf_filled_size,
+                                      RADEON_USAGE_READ | RADEON_PRIO_SO_FILLED_SIZE);
 
-         /* Append. */
-         radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
-         radeon_emit(STRMOUT_SELECT_BUFFER(i) |
-                     STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
-         radeon_emit(0);                                              /* unused */
-         radeon_emit(0);                                              /* unused */
-         radeon_emit(va);                                             /* src address lo */
-         radeon_emit(va >> 32);                                       /* src address hi */
+            va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
+         }
 
-         radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, t[i]->buf_filled_size,
-                                   RADEON_USAGE_READ | RADEON_PRIO_SO_FILLED_SIZE);
+         radeon_begin(cs);
+         radeon_emit(PKT3(PKT3_DMA_DATA, 5, 0));
+         radeon_emit(S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) |
+                     S_411_DST_SEL(V_411_GDS) | S_411_CP_SYNC(1));
+         radeon_emit(va);
+         radeon_emit(va >> 32);
+         radeon_emit(4 * i); /* destination in GDS */
+         radeon_emit(0);
+         radeon_emit(S_415_BYTE_COUNT_GFX9(4));
+         radeon_end();
       } else {
-         /* Start from the beginning. */
-         radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
-         radeon_emit(STRMOUT_SELECT_BUFFER(i) |
-                     STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
-         radeon_emit(0);                                                 /* unused */
-         radeon_emit(0);                                                 /* unused */
-         radeon_emit(t[i]->b.buffer_offset >> 2); /* buffer offset in DW */
-         radeon_emit(0);                          /* unused */
+         /* Legacy streamout.
+          *
+          * The hw binds streamout buffers as shader resources. VGT only counts primitives
+          * and tells the shader through SGPRs what to do.
+          */
+         radeon_begin(cs);
+         radeon_set_context_reg_seq(R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 2);
+         radeon_emit((t[i]->b.buffer_offset + t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */
+         radeon_emit(sctx->streamout.stride_in_dw[i]);                    /* VTX_STRIDE (in DW) */
+
+         if (sctx->streamout.append_bitmask & (1 << i) && t[i]->buf_filled_size_valid) {
+            uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
+
+            /* Append. */
+            radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
+            radeon_emit(STRMOUT_SELECT_BUFFER(i) |
+                        STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
+            radeon_emit(0);                                              /* unused */
+            radeon_emit(0);                                              /* unused */
+            radeon_emit(va);                                             /* src address lo */
+            radeon_emit(va >> 32);                                       /* src address hi */
+
+            radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, t[i]->buf_filled_size,
+                                      RADEON_USAGE_READ | RADEON_PRIO_SO_FILLED_SIZE);
+         } else {
+            /* Start from the beginning. */
+            radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
+            radeon_emit(STRMOUT_SELECT_BUFFER(i) |
+                        STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
+            radeon_emit(0);                                                 /* unused */
+            radeon_emit(0);                                                 /* unused */
+            radeon_emit(t[i]->b.buffer_offset >> 2); /* buffer offset in DW */
+            radeon_emit(0);                          /* unused */
+         }
+         radeon_end_update_context_roll(sctx);
       }
    }
-   radeon_end();
 
    sctx->streamout.begin_emitted = true;
 }
 
 void si_emit_streamout_end(struct si_context *sctx)
 {
-   if (sctx->screen->use_ngg_streamout) {
-      gfx10_emit_streamout_end(sctx);
-      return;
-   }
-
    struct radeon_cmdbuf *cs = &sctx->gfx_cs;
    struct si_streamout_target **t = sctx->streamout.targets;
-   unsigned i;
-   uint64_t va;
 
-   si_flush_vgt_streamout(sctx);
-
-   radeon_begin(cs);
+   if (!sctx->screen->use_ngg_streamout)
+      si_flush_vgt_streamout(sctx);
 
-   for (i = 0; i < sctx->streamout.num_targets; i++) {
+   for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
       if (!t[i])
          continue;
 
-      va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
-      radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
-      radeon_emit(STRMOUT_SELECT_BUFFER(i) | STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
-                  STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
-      radeon_emit(va);                                  /* dst address lo */
-      radeon_emit(va >> 32);                            /* dst address hi */
-      radeon_emit(0);                                   /* unused */
-      radeon_emit(0);                                   /* unused */
+      uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
 
-      radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, t[i]->buf_filled_size,
-                                RADEON_USAGE_WRITE | RADEON_PRIO_SO_FILLED_SIZE);
+      if (sctx->screen->use_ngg_streamout) {
+         /* TODO: PS_DONE doesn't ensure completion of VS if there are no PS waves. */
+         si_cp_release_mem(sctx, &sctx->gfx_cs, V_028A90_PS_DONE, 0, EOP_DST_SEL_TC_L2,
+                           EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_GDS,
+                           t[i]->buf_filled_size, va, EOP_DATA_GDS(i, 1), 0);
+      } else {
+         radeon_begin(cs);
+         radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
+         radeon_emit(STRMOUT_SELECT_BUFFER(i) | STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
+                     STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
+         radeon_emit(va);                                  /* dst address lo */
+         radeon_emit(va >> 32);                            /* dst address hi */
+         radeon_emit(0);                                   /* unused */
+         radeon_emit(0);                                   /* unused */
+
+         /* Zero the buffer size. The counters (primitives generated,
+          * primitives emitted) may be enabled even if there is no
+          * buffer bound. This ensures that the primitives-emitted query
+          * won't increment. */
+         radeon_set_context_reg(R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 0);
+         radeon_end_update_context_roll(sctx);
 
-      /* Zero the buffer size. The counters (primitives generated,
-       * primitives emitted) may be enabled even if there is not
-       * buffer bound. This ensures that the primitives-emitted query
-       * won't increment. */
-      radeon_set_context_reg(R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 0);
+         radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, t[i]->buf_filled_size,
+                                   RADEON_USAGE_WRITE | RADEON_PRIO_SO_FILLED_SIZE);
+      }
 
       t[i]->buf_filled_size_valid = true;
    }
-   radeon_end_update_context_roll(sctx);
 
    sctx->streamout.begin_emitted = false;
 }
@@ -474,11 +434,8 @@ void si_init_streamout_functions(struct si_context *sctx)
    sctx->b.create_stream_output_target = si_create_so_target;
    sctx->b.stream_output_target_destroy = si_so_target_destroy;
    sctx->b.set_stream_output_targets = si_set_streamout_targets;
+   sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin;
 
-   if (sctx->screen->use_ngg_streamout) {
-      sctx->atoms.s.streamout_begin.emit = gfx10_emit_streamout_begin;
-   } else {
-      sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin;
+   if (!sctx->screen->use_ngg_streamout)
       sctx->atoms.s.streamout_enable.emit = si_emit_streamout_enable;
-   }
 }
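
For reference, the net effect of this commit is that one streamout begin/end path branches on NGG per target at emit time, instead of si_init_streamout_functions() installing separate gfx10_*/legacy callbacks. Below is a minimal, self-contained sketch of that dispatch pattern; the struct and function names are simplified stand-ins invented for illustration, not the driver's real API.

/* Minimal sketch (not the driver's real API) of the dispatch pattern this
 * commit adopts: a single streamout-begin callback that branches on the NGG
 * flag per target, instead of installing gfx10_* vs. legacy callbacks at
 * init time. All names below are simplified stand-ins for illustration. */
#include <stdbool.h>
#include <stdio.h>

struct toy_context {
   bool use_ngg_streamout; /* stands in for sctx->screen->use_ngg_streamout */
   unsigned num_targets;   /* stands in for sctx->streamout.num_targets */
};

static void emit_ngg_begin(struct toy_context *ctx, unsigned i)
{
   (void)ctx;
   /* On the NGG path the real code seeds the GDS counter for target i
    * with a PKT3_DMA_DATA packet. */
   printf("target %u: DMA_DATA -> GDS\n", i);
}

static void emit_legacy_begin(struct toy_context *ctx, unsigned i)
{
   (void)ctx;
   /* On the legacy path the real code programs VGT_STRMOUT_BUFFER_SIZE /
    * VTX_STRIDE and a STRMOUT_BUFFER_UPDATE packet for target i. */
   printf("target %u: VGT registers + STRMOUT_BUFFER_UPDATE\n", i);
}

/* One atom callback for both hardware paths; the branch sits inside the
 * per-target loop so the shared per-target setup is written once. */
static void emit_streamout_begin(struct toy_context *ctx)
{
   for (unsigned i = 0; i < ctx->num_targets; i++) {
      if (ctx->use_ngg_streamout)
         emit_ngg_begin(ctx, i);
      else
         emit_legacy_begin(ctx, i);
   }
}

int main(void)
{
   struct toy_context ctx = { .use_ngg_streamout = true, .num_targets = 2 };
   emit_streamout_begin(&ctx);
   return 0;
}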