From 1e4b5390425c13b493b05bdaf17a94a6f2f32057 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sun, 16 Jul 2023 10:38:20 -0400 Subject: [PATCH] radeonsi: handle deferred cache flushes as a state (si_atom) This allows us to remove a little bit of code from si_draw, and enable removing more code in the future. Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_blit.c | 9 +++++++-- src/gallium/drivers/radeonsi/si_clear.c | 8 +++++++- src/gallium/drivers/radeonsi/si_compute.c | 11 ++++++++--- src/gallium/drivers/radeonsi/si_compute_blit.c | 14 ++++++++++++-- src/gallium/drivers/radeonsi/si_cp_dma.c | 8 +++++++- src/gallium/drivers/radeonsi/si_descriptors.c | 3 ++- src/gallium/drivers/radeonsi/si_gfx_cs.c | 12 ++++++++++-- src/gallium/drivers/radeonsi/si_pipe.c | 2 +- src/gallium/drivers/radeonsi/si_pipe.h | 23 ++++++++++++++++++++++- src/gallium/drivers/radeonsi/si_query.c | 4 ++++ src/gallium/drivers/radeonsi/si_state.c | 12 ++++++++++++ src/gallium/drivers/radeonsi/si_state.h | 1 + src/gallium/drivers/radeonsi/si_state_draw.cpp | 23 ++++++++++++++--------- src/gallium/drivers/radeonsi/si_state_shaders.cpp | 2 ++ src/gallium/drivers/radeonsi/si_state_streamout.c | 8 ++++++-- src/gallium/drivers/radeonsi/si_test_dma_perf.c | 4 ++-- 16 files changed, 117 insertions(+), 27 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c index 1219542..63c959a 100644 --- a/src/gallium/drivers/radeonsi/si_blit.c +++ b/src/gallium/drivers/radeonsi/si_blit.c @@ -496,16 +496,20 @@ static void si_blit_decompress_color(struct si_context *sctx, struct si_texture /* Required before and after FMASK and DCC_DECOMPRESS. */ if (custom_blend == sctx->custom_blend_fmask_decompress || - custom_blend == sctx->custom_blend_dcc_decompress) + custom_blend == sctx->custom_blend_dcc_decompress) { sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB; + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); + } si_blitter_begin(sctx, SI_DECOMPRESS); util_blitter_custom_color(sctx->blitter, cbsurf, custom_blend); si_blitter_end(sctx); if (custom_blend == sctx->custom_blend_fmask_decompress || - custom_blend == sctx->custom_blend_dcc_decompress) + custom_blend == sctx->custom_blend_dcc_decompress) { sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB; + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); + } /* When running FMASK decompression with DCC, we need to run the "eliminate fast clear" pass * separately because FMASK decompression doesn't eliminate DCC fast clear. This makes @@ -1036,6 +1040,7 @@ static void si_do_CB_resolve(struct si_context *sctx, const struct pipe_blit_inf { /* Required before and after CB_RESOLVE. */ sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB; + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); si_blitter_begin( sctx, SI_COLOR_RESOLVE | (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND)); diff --git a/src/gallium/drivers/radeonsi/si_clear.c b/src/gallium/drivers/radeonsi/si_clear.c index 7c7e7ea..c6051a3 100644 --- a/src/gallium/drivers/radeonsi/si_clear.c +++ b/src/gallium/drivers/radeonsi/si_clear.c @@ -55,6 +55,8 @@ void si_execute_clears(struct si_context *sctx, struct si_clear_info *info, if (sctx->gfx_level <= GFX8) sctx->flags |= SI_CONTEXT_INV_L2; + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); + /* Execute clears. */ for (unsigned i = 0; i < num_clears; i++) { if (info[i].is_dcc_msaa) { @@ -83,6 +85,8 @@ void si_execute_clears(struct si_context *sctx, struct si_clear_info *info, /* GFX6-8: CB and DB don't use L2. */ if (sctx->gfx_level <= GFX8) sctx->flags |= SI_CONTEXT_WB_L2; + + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); } static bool si_alloc_separate_cmask(struct si_screen *sscreen, struct si_texture *tex) @@ -1162,8 +1166,10 @@ static void si_clear(struct pipe_context *ctx, unsigned buffers, si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); } - if (needs_db_flush) + if (needs_db_flush) { sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB; + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); + } } if (unlikely(sctx->sqtt_enabled)) { diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 220b872..ce60c1c 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -947,8 +947,10 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info bool cs_regalloc_hang = sscreen->info.has_cs_regalloc_hang_bug && info->block[0] * info->block[1] * info->block[2] > 256; - if (cs_regalloc_hang) + if (cs_regalloc_hang) { sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH; + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); + } if (program->ir_type != PIPE_SHADER_IR_NATIVE && program->shader.compilation_failed) return; @@ -976,6 +978,7 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info /* Indirect buffers use TC L2 on GFX9, but not older hw. */ if (sctx->gfx_level <= GFX8 && si_resource(info->indirect)->TC_L2_dirty) { sctx->flags |= SI_CONTEXT_WB_L2; + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); si_resource(info->indirect)->TC_L2_dirty = false; } } @@ -1024,7 +1027,7 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info /* Registers that are not read from memory should be set before this: */ if (sctx->flags) - sctx->emit_cache_flush(sctx, &sctx->gfx_cs); + si_emit_cache_flush_direct(sctx); if (sctx->has_graphics && si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond)) { sctx->atoms.s.render_cond.emit(sctx, -1); @@ -1060,8 +1063,10 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info sctx->compute_is_busy = true; sctx->num_compute_calls++; - if (cs_regalloc_hang) + if (cs_regalloc_hang) { sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH; + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); + } } void si_destroy_compute(struct si_compute *program) diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c index fdc9825..9a4af15 100644 --- a/src/gallium/drivers/radeonsi/si_compute_blit.c +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c @@ -163,6 +163,9 @@ static void si_launch_grid_internal(struct si_context *sctx, const struct pipe_g if (sctx->num_hw_pipestat_streamout_queries) sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS; + if (sctx->flags) + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); + if (!(flags & SI_OP_CS_RENDER_COND_ENABLE)) sctx->render_cond_enabled = false; @@ -213,6 +216,9 @@ static void si_launch_grid_internal(struct si_context *sctx, const struct pipe_g sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE | SI_CONTEXT_PFP_SYNC_ME; } } + + if (sctx->flags) + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); } void si_launch_grid_internal_ssbos(struct si_context *sctx, struct pipe_grid_info *info, @@ -220,8 +226,10 @@ void si_launch_grid_internal_ssbos(struct si_context *sctx, struct pipe_grid_inf unsigned num_buffers, const struct pipe_shader_buffer *buffers, unsigned writeable_bitmask) { - if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE)) + if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE)) { sctx->flags |= si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY); + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); + } /* Save states. */ struct pipe_shader_buffer saved_sb[3] = {}; @@ -243,8 +251,10 @@ void si_launch_grid_internal_ssbos(struct si_context *sctx, struct pipe_grid_inf /* Do cache flushing at the end. */ if (get_cache_policy(sctx, coher, 0) == L2_BYPASS) { - if (flags & SI_OP_SYNC_AFTER) + if (flags & SI_OP_SYNC_AFTER) { sctx->flags |= SI_CONTEXT_WB_L2; + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); + } } else { while (writeable_bitmask) si_resource(buffers[u_bit_scan(&writeable_bitmask)].buffer)->TC_L2_dirty = true; diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c index 916f2d9..c26b49d 100644 --- a/src/gallium/drivers/radeonsi/si_cp_dma.c +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -144,7 +144,7 @@ static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst * Also wait for the previous CP DMA operations. */ if (*is_first && sctx->flags) - sctx->emit_cache_flush(sctx, &sctx->gfx_cs); + si_emit_cache_flush_direct(sctx); if (user_flags & SI_OP_SYNC_CPDMA_BEFORE && *is_first && !(*packet_flags & CP_DMA_CLEAR)) *packet_flags |= CP_DMA_RAW_WAIT; @@ -192,6 +192,9 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs, sctx->flags |= si_get_flush_flags(sctx, coher, cache_policy); } + if (sctx->flags) + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); + while (size) { unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx)); unsigned dma_flags = CP_DMA_CLEAR | (sdst ? 0 : CP_DMA_DST_IS_GDS); @@ -330,6 +333,9 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, if ((dst || src) && !(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE)) sctx->flags |= si_get_flush_flags(sctx, coher, cache_policy); + if (sctx->flags) + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); + /* This is the main part doing the copying. Src is always aligned. */ main_dst_offset = dst_offset + skipped_size; main_src_offset = src_offset + skipped_size; diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index 3355013..fabd098 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -1926,7 +1926,7 @@ static void si_upload_bindless_descriptors(struct si_context *sctx) * descriptors directly in memory, in case the GPU is using them. */ sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH; - sctx->emit_cache_flush(sctx, &sctx->gfx_cs); + si_emit_cache_flush_direct(sctx); util_dynarray_foreach (&sctx->resident_tex_handles, struct si_texture_handle *, tex_handle) { unsigned desc_slot = (*tex_handle)->desc_slot; @@ -1950,6 +1950,7 @@ static void si_upload_bindless_descriptors(struct si_context *sctx) /* Invalidate scalar L0 because the cache doesn't know that L2 changed. */ sctx->flags |= SI_CONTEXT_INV_SCACHE; + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); sctx->bindless_descriptors_dirty = false; } diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c index a25f64b..42655e3 100644 --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c @@ -108,7 +108,7 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_h /* Wait for draw calls to finish if needed. */ if (wait_flags) { ctx->flags |= wait_flags; - ctx->emit_cache_flush(ctx, &ctx->gfx_cs); + si_emit_cache_flush_direct(ctx); } ctx->gfx_last_ib_is_busy = (wait_flags & wait_ps_cs) != wait_ps_cs; @@ -396,6 +396,8 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs) if (ctx->screen->info.has_vgt_flush_ngg_legacy_bug && !ctx->ngg) ctx->flags |= SI_CONTEXT_VGT_FLUSH; + si_mark_atom_dirty(ctx, &ctx->atoms.s.cache_flush); + if (ctx->screen->attribute_ring) { radeon_add_to_buffer_list(ctx, &ctx->gfx_cs, ctx->screen->attribute_ring, RADEON_USAGE_READWRITE | RADEON_PRIO_SHADER_RINGS); @@ -658,6 +660,9 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs) unsigned cb_db_event = 0; unsigned flags = ctx->flags; + if (!flags) + return; + if (!ctx->has_graphics) { /* Only process compute flags. */ flags &= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE | @@ -911,10 +916,13 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs) ctx->flags = 0; } -void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs) +void gfx6_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs) { uint32_t flags = sctx->flags; + if (!flags) + return; + if (!sctx->has_graphics) { /* Only process compute flags. */ flags &= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE | diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 0402c92..a9b0034 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -611,7 +611,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign if (sctx->gfx_level >= GFX10) sctx->emit_cache_flush = gfx10_emit_cache_flush; else - sctx->emit_cache_flush = si_emit_cache_flush; + sctx->emit_cache_flush = gfx6_emit_cache_flush; sctx->b.emit_string_marker = si_emit_string_marker; sctx->b.set_debug_callback = si_set_debug_callback; diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index fb0e6e6..c1d7ced 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1531,7 +1531,7 @@ void si_trace_emit(struct si_context *sctx); void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned cp_coher_cntl); void gfx10_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs); -void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs); +void gfx6_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs); /* Replace the sctx->b.draw_vbo function with a wrapper. This can be use to implement * optimizations without affecting the normal draw_vbo functions perf. */ @@ -1851,6 +1851,8 @@ static inline void si_make_CB_shader_coherent(struct si_context *sctx, unsigned /* GFX6-GFX8 */ sctx->flags |= SI_CONTEXT_INV_L2; } + + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); } static inline void si_make_DB_shader_coherent(struct si_context *sctx, unsigned num_samples, @@ -1876,6 +1878,8 @@ static inline void si_make_DB_shader_coherent(struct si_context *sctx, unsigned /* GFX6-GFX8 */ sctx->flags |= SI_CONTEXT_INV_L2; } + + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); } static inline bool si_can_sample_zs(struct si_texture *tex, bool stencil_sampler) @@ -2116,6 +2120,23 @@ si_set_rasterized_prim(struct si_context *sctx, enum mesa_prim rast_prim, } } +/* There are 3 ways to flush caches and all of them are correct. + * + * 1) sctx->flags |= ...; + * si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); // deferred + * + * 2) sctx->flags |= ...; + * si_emit_cache_flush_direct(sctx); // immediate + * + * 3) sctx->flags |= ...; + * sctx->emit_cache_flush(sctx, cs); // immediate (2 is better though) + */ +static inline void si_emit_cache_flush_direct(struct si_context *sctx) +{ + sctx->emit_cache_flush(sctx, &sctx->gfx_cs); + sctx->dirty_atoms &= ~SI_ATOM_BIT(cache_flush); +} + #define PRINT_ERR(fmt, args...) \ fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args) diff --git a/src/gallium/drivers/radeonsi/si_query.c b/src/gallium/drivers/radeonsi/si_query.c index b45abe8..61ae0d8 100644 --- a/src/gallium/drivers/radeonsi/si_query.c +++ b/src/gallium/drivers/radeonsi/si_query.c @@ -859,9 +859,11 @@ static void si_update_hw_pipeline_stats(struct si_context *sctx, unsigned type, if (diff == 1 && sctx->num_hw_pipestat_streamout_queries == 1) { sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS; sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS; + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); } else if (diff == -1 && sctx->num_hw_pipestat_streamout_queries == 0) { sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS; sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS; + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); } } } @@ -1569,6 +1571,7 @@ static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_q } sctx->flags |= sctx->screen->barrier_flags.cp_to_L2; + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) { if (query->b.type != PIPE_QUERY_TIMESTAMP) { @@ -1664,6 +1667,7 @@ static void si_render_condition(struct pipe_context *ctx, struct pipe_query *que /* Settings this in the render cond atom is too late, * so set it here. */ sctx->flags |= sctx->screen->barrier_flags.L2_to_cp | SI_CONTEXT_FLUSH_FOR_RENDER_COND; + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); sctx->render_cond_enabled = old_render_cond_enabled; } diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 83bbf09..a459d4e 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -1494,11 +1494,13 @@ static void si_set_active_query_state(struct pipe_context *ctx, bool enable) if (sctx->num_hw_pipestat_streamout_queries) { sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS; sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS; + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); } } else { if (sctx->num_hw_pipestat_streamout_queries) { sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS; sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS; + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); } } @@ -2893,6 +2895,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, } sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_PS_PARTIAL_FLUSH; + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); /* u_blitter doesn't invoke depth decompression when it does multiple * blits in a row, but the only case when it matters for DB is when @@ -2910,6 +2913,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, * Flushing DB metadata works around the problem. */ sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB_META; + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); } /* Take the maximum of the old and new count. If the new count is lower, @@ -5390,6 +5394,8 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags) /* Indirect buffers use TC L2 on GFX9, but not older hw. */ if (sctx->screen->info.gfx_level <= GFX8 && flags & PIPE_BARRIER_INDIRECT_BUFFER) sctx->flags |= SI_CONTEXT_WB_L2; + + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); } static void *si_create_blend_custom(struct si_context *sctx, unsigned mode) @@ -5402,6 +5408,11 @@ static void *si_create_blend_custom(struct si_context *sctx, unsigned mode) return si_create_blend_state_mode(&sctx->b, &blend, mode); } +static void si_emit_cache_flush_state(struct si_context *sctx, unsigned index) +{ + sctx->emit_cache_flush(sctx, &sctx->gfx_cs); +} + void si_init_state_compute_functions(struct si_context *sctx) { sctx->b.create_sampler_state = si_create_sampler_state; @@ -5434,6 +5445,7 @@ void si_init_state_functions(struct si_context *sctx) sctx->atoms.s.clip_regs.emit = si_emit_clip_regs; sctx->atoms.s.clip_state.emit = si_emit_clip_state; sctx->atoms.s.stencil_ref.emit = si_emit_stencil_ref; + sctx->atoms.s.cache_flush.emit = si_emit_cache_flush_state; sctx->b.create_blend_state = si_create_blend_state; sctx->b.bind_blend_state = si_bind_blend_state; diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 16c0811..ac06ec1 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -208,6 +208,7 @@ union si_state_atoms { struct si_atom ngg_cull_state; struct si_atom vgt_pipeline_state; struct si_atom tess_io_layout; + struct si_atom cache_flush; } s; struct si_atom array[sizeof(struct si_atoms_s) / sizeof(struct si_atom)]; }; diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index 77f63bc..14a3b72 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -833,8 +833,10 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx, if (GFX_VERSION == GFX7 && sctx->family == CHIP_HAWAII && G_028AA8_SWITCH_ON_EOI(ia_multi_vgt_param) && num_instanced_prims_less_than(indirect, prim, min_vertex_count, - instance_count, 2, sctx->patch_vertices)) + instance_count, 2, sctx->patch_vertices)) { sctx->flags |= SI_CONTEXT_VGT_FLUSH; + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); + } } return ia_multi_vgt_param; @@ -2086,6 +2088,7 @@ static void si_draw(struct pipe_context *ctx, /* GFX8 reads index buffers through TC L2, so it doesn't * need this. */ sctx->flags |= SI_CONTEXT_WB_L2; + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); si_resource(indexbuf)->TC_L2_dirty = false; } } @@ -2098,12 +2101,14 @@ static void si_draw(struct pipe_context *ctx, if (GFX_VERSION <= GFX8) { if (indirect->buffer && si_resource(indirect->buffer)->TC_L2_dirty) { sctx->flags |= SI_CONTEXT_WB_L2; + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); si_resource(indirect->buffer)->TC_L2_dirty = false; } if (indirect->indirect_draw_count && si_resource(indirect->indirect_draw_count)->TC_L2_dirty) { sctx->flags |= SI_CONTEXT_WB_L2; + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); si_resource(indirect->indirect_draw_count)->TC_L2_dirty = false; } } @@ -2260,18 +2265,17 @@ static void si_draw(struct pipe_context *ctx, /* Emit all states except possibly render condition. */ si_emit_rasterizer_prim_state(sctx); - si_emit_all_states(sctx, masked_atoms); - - /* Emit draw states. */ - si_emit_vs_state - (sctx, index_size); + /* This must be done before si_emit_all_states because it can set cache flush flags. */ si_emit_draw_registers (sctx, indirect, prim, index_size, instance_count, primitive_restart, info->restart_index, min_direct_count); + /* This emits states and flushes caches. */ + si_emit_all_states(sctx, masked_atoms); + /* <-- CUs are idle here if the cache_flush state waited. */ - if (sctx->flags) - sctx->emit_cache_flush(sctx, &sctx->gfx_cs); - /* <-- CUs are idle here if we waited. */ + /* This must be done after si_emit_all_states, which can affect this. */ + si_emit_vs_state + (sctx, index_size); /* If we haven't emitted the render condition state (because it depends on cache flushes), * do it now. @@ -2328,6 +2332,7 @@ static void si_draw(struct pipe_context *ctx, (GFX_VERSION == GFX8 && (sctx->family == CHIP_TONGA || sctx->family == CHIP_FIJI))) && si_get_strmout_en(sctx)) { sctx->flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC; + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); } if (unlikely(IS_BLIT && sctx->decompression_enabled)) { diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index 4c6d6b0..9496b52 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -3427,6 +3427,8 @@ bool si_update_ngg(struct si_context *sctx) */ if (sctx->screen->info.has_vgt_flush_ngg_legacy_bug && !new_ngg) { sctx->flags |= SI_CONTEXT_VGT_FLUSH; + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); + if (sctx->gfx_level == GFX10) { /* Workaround for https://gitlab.freedesktop.org/mesa/mesa/-/issues/2941 */ si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); diff --git a/src/gallium/drivers/radeonsi/si_state_streamout.c b/src/gallium/drivers/radeonsi/si_state_streamout.c index af7954d..4fa0119 100644 --- a/src/gallium/drivers/radeonsi/si_state_streamout.c +++ b/src/gallium/drivers/radeonsi/si_state_streamout.c @@ -115,6 +115,9 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ SI_CONTEXT_PFP_SYNC_ME; } + if (sctx->flags) + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); + /* Streamout buffers must be bound in 2 places: * 1) in VGT by setting the VGT_STRMOUT registers * 2) as shader resources @@ -193,7 +196,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ si_set_internal_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL); if (wait_now) - sctx->emit_cache_flush(sctx, &sctx->gfx_cs); + si_emit_cache_flush_direct(sctx); } static void si_flush_vgt_streamout(struct si_context *sctx) @@ -309,7 +312,7 @@ void si_emit_streamout_end(struct si_context *sctx) if (sctx->gfx_level >= GFX11) { /* Wait for streamout to finish before reading GDS_STRMOUT registers. */ sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH; - sctx->emit_cache_flush(sctx, &sctx->gfx_cs); + si_emit_cache_flush_direct(sctx); } else { si_flush_vgt_streamout(sctx); } @@ -326,6 +329,7 @@ void si_emit_streamout_end(struct si_context *sctx) COPY_DATA_REG, NULL, (R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 >> 2) + i); sctx->flags |= SI_CONTEXT_PFP_SYNC_ME; + si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); } else { radeon_begin(cs); radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); diff --git a/src/gallium/drivers/radeonsi/si_test_dma_perf.c b/src/gallium/drivers/radeonsi/si_test_dma_perf.c index 026ffa5..0a1d582 100644 --- a/src/gallium/drivers/radeonsi/si_test_dma_perf.c +++ b/src/gallium/drivers/radeonsi/si_test_dma_perf.c @@ -153,7 +153,7 @@ void si_test_dma_perf(struct si_screen *sscreen) sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB; - sctx->emit_cache_flush(sctx, &sctx->gfx_cs); + si_emit_cache_flush_direct(sctx); struct pipe_query *q = ctx->create_query(ctx, query_type, 0); ctx->begin_query(ctx, q); @@ -217,7 +217,7 @@ void si_test_dma_perf(struct si_screen *sscreen) sctx->flags |= SI_CONTEXT_INV_VCACHE | (cache_policy == L2_LRU ? 0 : SI_CONTEXT_INV_L2) | SI_CONTEXT_CS_PARTIAL_FLUSH; - sctx->emit_cache_flush(sctx, &sctx->gfx_cs); + si_emit_cache_flush_direct(sctx); } ctx->end_query(ctx, q); -- 2.7.4