From 3f900df0713f4d33cdd8d0d963c71668f2b37dec Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Fri, 3 Jun 2022 19:45:31 -0400 Subject: [PATCH] radeonsi: inline gfx10_emit_streamout_begin/end Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_state_streamout.c | 217 +++++++++------------- 1 file changed, 87 insertions(+), 130 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state_streamout.c b/src/gallium/drivers/radeonsi/si_state_streamout.c index 464259d..e10174f 100644 --- a/src/gallium/drivers/radeonsi/si_state_streamout.c +++ b/src/gallium/drivers/radeonsi/si_state_streamout.c @@ -211,70 +211,6 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ sctx->emit_cache_flush(sctx, &sctx->gfx_cs); } -static void gfx10_emit_streamout_begin(struct si_context *sctx) -{ - struct si_streamout_target **t = sctx->streamout.targets; - struct radeon_cmdbuf *cs = &sctx->gfx_cs; - unsigned last_target = 0; - - for (unsigned i = 0; i < sctx->streamout.num_targets; i++) { - if (t[i]) - last_target = i; - } - - radeon_begin(cs); - - for (unsigned i = 0; i < sctx->streamout.num_targets; i++) { - if (!t[i]) - continue; - - t[i]->stride_in_dw = sctx->streamout.stride_in_dw[i]; - - bool append = sctx->streamout.append_bitmask & (1 << i); - uint64_t va = 0; - - if (append) { - radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, t[i]->buf_filled_size, - RADEON_USAGE_READ | RADEON_PRIO_SO_FILLED_SIZE); - - va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset; - } - - radeon_emit(PKT3(PKT3_DMA_DATA, 5, 0)); - radeon_emit(S_411_SRC_SEL(append ? 
V_411_SRC_ADDR_TC_L2 : V_411_DATA) | - S_411_DST_SEL(V_411_GDS) | S_411_CP_SYNC(i == last_target)); - radeon_emit(va); - radeon_emit(va >> 32); - radeon_emit(4 * i); /* destination in GDS */ - radeon_emit(0); - radeon_emit(S_415_BYTE_COUNT_GFX9(4) | S_415_DISABLE_WR_CONFIRM_GFX9(i != last_target)); - } - radeon_end(); - - sctx->streamout.begin_emitted = true; -} - -static void gfx10_emit_streamout_end(struct si_context *sctx) -{ - struct si_streamout_target **t = sctx->streamout.targets; - - for (unsigned i = 0; i < sctx->streamout.num_targets; i++) { - if (!t[i]) - continue; - - uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset; - - /* TODO: PS_DONE doesn't ensure completion of VS if there are no PS waves. */ - si_cp_release_mem(sctx, &sctx->gfx_cs, V_028A90_PS_DONE, 0, EOP_DST_SEL_TC_L2, - EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_GDS, - t[i]->buf_filled_size, va, EOP_DATA_GDS(i, 1), 0); - - t[i]->buf_filled_size_valid = true; - } - - sctx->streamout.begin_emitted = false; -} - static void si_flush_vgt_streamout(struct si_context *sctx) { struct radeon_cmdbuf *cs = &sctx->gfx_cs; @@ -315,97 +251,121 @@ static void si_emit_streamout_begin(struct si_context *sctx) { struct radeon_cmdbuf *cs = &sctx->gfx_cs; struct si_streamout_target **t = sctx->streamout.targets; - uint8_t *stride_in_dw = sctx->streamout.stride_in_dw; - unsigned i; - si_flush_vgt_streamout(sctx); - - radeon_begin(cs); + if (!sctx->screen->use_ngg_streamout) + si_flush_vgt_streamout(sctx); - for (i = 0; i < sctx->streamout.num_targets; i++) { + for (unsigned i = 0; i < sctx->streamout.num_targets; i++) { if (!t[i]) continue; - t[i]->stride_in_dw = stride_in_dw[i]; + t[i]->stride_in_dw = sctx->streamout.stride_in_dw[i]; - /* AMD GCN binds streamout buffers as shader resources. - * VGT only counts primitives and tells the shader - * through SGPRs what to do. 
*/ - radeon_set_context_reg_seq(R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 2); - radeon_emit((t[i]->b.buffer_offset + t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */ - radeon_emit(stride_in_dw[i]); /* VTX_STRIDE (in DW) */ + if (sctx->screen->use_ngg_streamout) { + bool append = sctx->streamout.append_bitmask & (1 << i); + uint64_t va = 0; - if (sctx->streamout.append_bitmask & (1 << i) && t[i]->buf_filled_size_valid) { - uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset; + if (append) { + radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, t[i]->buf_filled_size, + RADEON_USAGE_READ | RADEON_PRIO_SO_FILLED_SIZE); - /* Append. */ - radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); - radeon_emit(STRMOUT_SELECT_BUFFER(i) | - STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */ - radeon_emit(0); /* unused */ - radeon_emit(0); /* unused */ - radeon_emit(va); /* src address lo */ - radeon_emit(va >> 32); /* src address hi */ + va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset; + } - radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, t[i]->buf_filled_size, - RADEON_USAGE_READ | RADEON_PRIO_SO_FILLED_SIZE); + radeon_begin(cs); + radeon_emit(PKT3(PKT3_DMA_DATA, 5, 0)); + radeon_emit(S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) | + S_411_DST_SEL(V_411_GDS) | S_411_CP_SYNC(1)); + radeon_emit(va); + radeon_emit(va >> 32); + radeon_emit(4 * i); /* destination in GDS */ + radeon_emit(0); + radeon_emit(S_415_BYTE_COUNT_GFX9(4)); + radeon_end(); } else { - /* Start from the beginning. */ - radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); - radeon_emit(STRMOUT_SELECT_BUFFER(i) | - STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */ - radeon_emit(0); /* unused */ - radeon_emit(0); /* unused */ - radeon_emit(t[i]->b.buffer_offset >> 2); /* buffer offset in DW */ - radeon_emit(0); /* unused */ + /* Legacy streamout. + * + * The hw binds streamout buffers as shader resources. 
VGT only counts primitives + * and tells the shader through SGPRs what to do. + */ + radeon_begin(cs); + radeon_set_context_reg_seq(R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 2); + radeon_emit((t[i]->b.buffer_offset + t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */ + radeon_emit(sctx->streamout.stride_in_dw[i]); /* VTX_STRIDE (in DW) */ + + if (sctx->streamout.append_bitmask & (1 << i) && t[i]->buf_filled_size_valid) { + uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset; + + /* Append. */ + radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); + radeon_emit(STRMOUT_SELECT_BUFFER(i) | + STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */ + radeon_emit(0); /* unused */ + radeon_emit(0); /* unused */ + radeon_emit(va); /* src address lo */ + radeon_emit(va >> 32); /* src address hi */ + + radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, t[i]->buf_filled_size, + RADEON_USAGE_READ | RADEON_PRIO_SO_FILLED_SIZE); + } else { + /* Start from the beginning. 
*/ + radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); + radeon_emit(STRMOUT_SELECT_BUFFER(i) | + STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */ + radeon_emit(0); /* unused */ + radeon_emit(0); /* unused */ + radeon_emit(t[i]->b.buffer_offset >> 2); /* buffer offset in DW */ + radeon_emit(0); /* unused */ + } + radeon_end_update_context_roll(sctx); } } - radeon_end(); sctx->streamout.begin_emitted = true; } void si_emit_streamout_end(struct si_context *sctx) { - if (sctx->screen->use_ngg_streamout) { - gfx10_emit_streamout_end(sctx); - return; - } - struct radeon_cmdbuf *cs = &sctx->gfx_cs; struct si_streamout_target **t = sctx->streamout.targets; - unsigned i; - uint64_t va; - si_flush_vgt_streamout(sctx); - - radeon_begin(cs); + if (!sctx->screen->use_ngg_streamout) + si_flush_vgt_streamout(sctx); - for (i = 0; i < sctx->streamout.num_targets; i++) { + for (unsigned i = 0; i < sctx->streamout.num_targets; i++) { if (!t[i]) continue; - va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset; - radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); - radeon_emit(STRMOUT_SELECT_BUFFER(i) | STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) | - STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */ - radeon_emit(va); /* dst address lo */ - radeon_emit(va >> 32); /* dst address hi */ - radeon_emit(0); /* unused */ - radeon_emit(0); /* unused */ + uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset; - radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, t[i]->buf_filled_size, - RADEON_USAGE_WRITE | RADEON_PRIO_SO_FILLED_SIZE); + if (sctx->screen->use_ngg_streamout) { + /* TODO: PS_DONE doesn't ensure completion of VS if there are no PS waves. 
*/ + si_cp_release_mem(sctx, &sctx->gfx_cs, V_028A90_PS_DONE, 0, EOP_DST_SEL_TC_L2, + EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_GDS, + t[i]->buf_filled_size, va, EOP_DATA_GDS(i, 1), 0); + } else { + radeon_begin(cs); + radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); + radeon_emit(STRMOUT_SELECT_BUFFER(i) | STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) | + STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */ + radeon_emit(va); /* dst address lo */ + radeon_emit(va >> 32); /* dst address hi */ + radeon_emit(0); /* unused */ + radeon_emit(0); /* unused */ + + /* Zero the buffer size. The counters (primitives generated, + * primitives emitted) may be enabled even if there is no + * buffer bound. This ensures that the primitives-emitted query + * won't increment. */ + radeon_set_context_reg(R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 0); + radeon_end_update_context_roll(sctx); - /* Zero the buffer size. The counters (primitives generated, - * primitives emitted) may be enabled even if there is not - * buffer bound. This ensures that the primitives-emitted query - * won't increment. 
*/ - radeon_set_context_reg(R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 0); + radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, t[i]->buf_filled_size, + RADEON_USAGE_WRITE | RADEON_PRIO_SO_FILLED_SIZE); + } t[i]->buf_filled_size_valid = true; } - radeon_end_update_context_roll(sctx); sctx->streamout.begin_emitted = false; } @@ -474,11 +434,8 @@ void si_init_streamout_functions(struct si_context *sctx) sctx->b.create_stream_output_target = si_create_so_target; sctx->b.stream_output_target_destroy = si_so_target_destroy; sctx->b.set_stream_output_targets = si_set_streamout_targets; + sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin; - if (sctx->screen->use_ngg_streamout) { - sctx->atoms.s.streamout_begin.emit = gfx10_emit_streamout_begin; - } else { - sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin; + if (!sctx->screen->use_ngg_streamout) sctx->atoms.s.streamout_enable.emit = si_emit_streamout_enable; - } } -- 2.7.4