From a77ee8b548d83614b11bbfb654b031b7d464c3e3 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Mon, 26 Aug 2013 17:19:39 +0200 Subject: [PATCH] radeonsi: simplify and improve flushing MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This mimics r600g. The R600_CONTEXT_xxx flags are added to rctx->b.flags and si_emit_cache_flush emits the packets. That's it. The shared radeon code tells us when the streamout cache should be flushed, so we have to check the flags anyway. There is a new atom "cache_flush", because caches must be flushed *after* resource descriptors are changed in memory. Functional changes: * Write caches are flushed at the end of CS and read caches are flushed at its beginning. * Sampler view states are removed from si_state; they only held the flush flags. * Every time a shader is changed, the I cache is flushed. Is this needed? Due to a hw bug, this also flushes the K cache. * The WRITE_DATA packet is changed to use TC, which fixes a rendering issue in openarena. I'm not sure how TC interacts with CP DMA, but for now it seems to work better than any other solution I tried. (BTW CIK allows us to use TC for CP DMA.) * Flush the K cache instead of the texture cache when updating resource descriptors (due to a hw bug, this also flushes the I cache). I think the K cache flush is correct here, but I'm not sure if the texture cache should be flushed too (probably not, considering we use TC for WRITE_DATA, but we don't use TC for CP DMA). * The number of resource contexts is decreased to 16. With all of these cache changes, 4 doesn't work, but 8 works, which suggests I'm actually doing the right thing here and the pipeline isn't drained during flushes. 
Reviewed-by: Michel Dänzer Reviewed-by: Christian König Tested-by: Tom Stellard --- src/gallium/drivers/radeon/r600_pipe_common.h | 1 + src/gallium/drivers/radeonsi/r600.h | 3 - src/gallium/drivers/radeonsi/r600_hw_context.c | 45 +++------- src/gallium/drivers/radeonsi/radeonsi_pipe.c | 4 + src/gallium/drivers/radeonsi/radeonsi_pipe.h | 8 +- src/gallium/drivers/radeonsi/radeonsi_pm4.c | 11 --- src/gallium/drivers/radeonsi/radeonsi_pm4.h | 2 - src/gallium/drivers/radeonsi/si_commands.c | 9 -- src/gallium/drivers/radeonsi/si_descriptors.c | 16 ++-- src/gallium/drivers/radeonsi/si_state.c | 46 +++++----- src/gallium/drivers/radeonsi/si_state.h | 9 +- src/gallium/drivers/radeonsi/si_state_draw.c | 111 ++++++++++++++++--------- 12 files changed, 125 insertions(+), 140 deletions(-) diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h index 98c113f..b5c32bb 100644 --- a/src/gallium/drivers/radeon/r600_pipe_common.h +++ b/src/gallium/drivers/radeon/r600_pipe_common.h @@ -42,6 +42,7 @@ #define R600_CONTEXT_INV_VERTEX_CACHE (1 << 0) #define R600_CONTEXT_INV_TEX_CACHE (1 << 1) #define R600_CONTEXT_INV_CONST_CACHE (1 << 2) +#define R600_CONTEXT_INV_SHADER_CACHE (1 << 3) /* read-write caches */ #define R600_CONTEXT_STREAMOUT_FLUSH (1 << 8) #define R600_CONTEXT_FLUSH_AND_INV (1 << 9) diff --git a/src/gallium/drivers/radeonsi/r600.h b/src/gallium/drivers/radeonsi/r600.h index 531ff40..a914ce2 100644 --- a/src/gallium/drivers/radeonsi/r600.h +++ b/src/gallium/drivers/radeonsi/r600.h @@ -69,9 +69,6 @@ struct r600_query { struct list_head list; }; -#define R600_CONTEXT_DST_CACHES_DIRTY (1 << 1) -#define R600_CONTEXT_CHECK_EVENT_FLUSH (1 << 2) - struct r600_context; struct r600_screen; diff --git a/src/gallium/drivers/radeonsi/r600_hw_context.c b/src/gallium/drivers/radeonsi/r600_hw_context.c index d3c9140..db622ba 100644 --- a/src/gallium/drivers/radeonsi/r600_hw_context.c +++ b/src/gallium/drivers/radeonsi/r600_hw_context.c @@ 
-157,7 +157,7 @@ void si_need_cs_space(struct r600_context *ctx, unsigned num_dw, } /* Count in framebuffer cache flushes at the end of CS. */ - num_dw += 7; /* one SURFACE_SYNC and CACHE_FLUSH_AND_INV (r6xx-only) */ + num_dw += ctx->atoms.cache_flush->num_dw; /* Save 16 dwords for the fence mechanism. */ num_dw += 16; @@ -174,37 +174,6 @@ void si_need_cs_space(struct r600_context *ctx, unsigned num_dw, } } -static void r600_flush_framebuffer(struct r600_context *ctx) -{ - struct si_pm4_state *pm4; - - if (!(ctx->flags & R600_CONTEXT_DST_CACHES_DIRTY)) - return; - - pm4 = si_pm4_alloc_state(ctx); - - if (pm4 == NULL) - return; - - si_cmd_surface_sync(pm4, S_0085F0_CB0_DEST_BASE_ENA(1) | - S_0085F0_CB1_DEST_BASE_ENA(1) | - S_0085F0_CB2_DEST_BASE_ENA(1) | - S_0085F0_CB3_DEST_BASE_ENA(1) | - S_0085F0_CB4_DEST_BASE_ENA(1) | - S_0085F0_CB5_DEST_BASE_ENA(1) | - S_0085F0_CB6_DEST_BASE_ENA(1) | - S_0085F0_CB7_DEST_BASE_ENA(1) | - S_0085F0_DB_ACTION_ENA(1) | - S_0085F0_DB_DEST_BASE_ENA(1)); - si_cmd_flush_and_inv_cb_meta(pm4); - - si_pm4_emit(ctx, pm4); - si_pm4_free_state(ctx, pm4, ~0); - - ctx->flags &= ~R600_CONTEXT_DST_CACHES_DIRTY; - ctx->flush_and_inv_cb_meta = false; -} - void si_context_flush(struct r600_context *ctx, unsigned flags) { struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs; @@ -230,7 +199,11 @@ void si_context_flush(struct r600_context *ctx, unsigned flags) } #endif - r600_flush_framebuffer(ctx); + ctx->b.flags |= R600_CONTEXT_FLUSH_AND_INV_CB | + R600_CONTEXT_FLUSH_AND_INV_CB_META | + R600_CONTEXT_FLUSH_AND_INV_DB | + R600_CONTEXT_INV_TEX_CACHE; + si_emit_cache_flush(&ctx->b, NULL); /* partial flush is needed to avoid lockups on some chips with user fences */ cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0); @@ -275,7 +248,11 @@ void si_context_flush(struct r600_context *ctx, unsigned flags) #endif ctx->pm4_dirty_cdwords = 0; - ctx->flags = 0; + + /* Flush read caches at the beginning of CS. 
*/ + ctx->b.flags |= R600_CONTEXT_INV_TEX_CACHE | + R600_CONTEXT_INV_CONST_CACHE | + R600_CONTEXT_INV_SHADER_CACHE; /* set all valid group as dirty so they get reemited on * next draw command diff --git a/src/gallium/drivers/radeonsi/radeonsi_pipe.c b/src/gallium/drivers/radeonsi/radeonsi_pipe.c index bb44ccc..6ca138f 100644 --- a/src/gallium/drivers/radeonsi/radeonsi_pipe.c +++ b/src/gallium/drivers/radeonsi/radeonsi_pipe.c @@ -244,6 +244,10 @@ static struct pipe_context *r600_create_context(struct pipe_screen *screen, void si_init_all_descriptors(rctx); + /* Initialize cache_flush. */ + rctx->cache_flush = si_atom_cache_flush; + rctx->atoms.cache_flush = &rctx->cache_flush; + switch (rctx->b.chip_class) { case SI: case CIK: diff --git a/src/gallium/drivers/radeonsi/radeonsi_pipe.h b/src/gallium/drivers/radeonsi/radeonsi_pipe.h index 9d84e61..61fdfe2 100644 --- a/src/gallium/drivers/radeonsi/radeonsi_pipe.h +++ b/src/gallium/drivers/radeonsi/radeonsi_pipe.h @@ -131,8 +131,12 @@ struct r600_context { union { struct { + /* The order matters. */ struct r600_atom *const_buffers[SI_NUM_SHADERS]; struct r600_atom *sampler_views[SI_NUM_SHADERS]; + /* Caches must be flushed after resource descriptors are + * updated in memory. */ + struct r600_atom *cache_flush; }; struct r600_atom *array[0]; } atoms; @@ -179,7 +183,6 @@ struct r600_context { unsigned backend_mask; unsigned max_db; /* for OQ */ - unsigned flags; boolean predicate_drawing; unsigned num_so_targets; @@ -198,12 +201,11 @@ struct r600_context { /* With rasterizer discard, there doesn't have to be a pixel shader. 
* In that case, we bind this one: */ struct si_pipe_shader *dummy_pixel_shader; + struct r600_atom cache_flush; /* SI state handling */ union si_state queued; union si_state emitted; - - bool flush_and_inv_cb_meta; }; /* r600_blit.c */ diff --git a/src/gallium/drivers/radeonsi/radeonsi_pm4.c b/src/gallium/drivers/radeonsi/radeonsi_pm4.c index 9d0a7c0..37a199d 100644 --- a/src/gallium/drivers/radeonsi/radeonsi_pm4.c +++ b/src/gallium/drivers/radeonsi/radeonsi_pm4.c @@ -145,17 +145,6 @@ void si_pm4_inval_texture_cache(struct si_pm4_state *state) state->cp_coher_cntl |= S_0085F0_TCL1_ACTION_ENA(1); } -void si_pm4_inval_fb_cache(struct si_pm4_state *state, unsigned nr_cbufs) -{ - state->cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1); - state->cp_coher_cntl |= ((1 << nr_cbufs) - 1) << S_0085F0_CB0_DEST_BASE_ENA_SHIFT; -} - -void si_pm4_inval_zsbuf_cache(struct si_pm4_state *state) -{ - state->cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) | S_0085F0_DB_DEST_BASE_ENA(1); -} - void si_pm4_free_state(struct r600_context *rctx, struct si_pm4_state *state, unsigned idx) diff --git a/src/gallium/drivers/radeonsi/radeonsi_pm4.h b/src/gallium/drivers/radeonsi/radeonsi_pm4.h index b74db08..2e32a19 100644 --- a/src/gallium/drivers/radeonsi/radeonsi_pm4.h +++ b/src/gallium/drivers/radeonsi/radeonsi_pm4.h @@ -80,8 +80,6 @@ void si_pm4_sh_data_end(struct si_pm4_state *state, unsigned base, unsigned idx) void si_pm4_inval_shader_cache(struct si_pm4_state *state); void si_pm4_inval_texture_cache(struct si_pm4_state *state); -void si_pm4_inval_fb_cache(struct si_pm4_state *state, unsigned nr_cbufs); -void si_pm4_inval_zsbuf_cache(struct si_pm4_state *state); void si_pm4_free_state(struct r600_context *rctx, struct si_pm4_state *state, diff --git a/src/gallium/drivers/radeonsi/si_commands.c b/src/gallium/drivers/radeonsi/si_commands.c index e498bd2..bf95924 100644 --- a/src/gallium/drivers/radeonsi/si_commands.c +++ b/src/gallium/drivers/radeonsi/si_commands.c @@ -78,12 +78,3 @@ void 
si_cmd_surface_sync(struct si_pm4_state *pm4, uint32_t cp_coher_cntl) si_pm4_cmd_end(pm4, false); } } - -void si_cmd_flush_and_inv_cb_meta(struct si_pm4_state *pm4) -{ - si_pm4_cmd_begin(pm4, PKT3_EVENT_WRITE); - si_pm4_cmd_add(pm4, - EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | - EVENT_INDEX(0)); - si_pm4_cmd_end(pm4, false); -} diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index 2983d75..5d85448 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -30,7 +30,7 @@ #include "util/u_memory.h" -#define SI_NUM_CONTEXTS 256 +#define SI_NUM_CONTEXTS 16 static uint32_t null_desc[8]; /* zeros */ @@ -142,7 +142,8 @@ static void si_release_descriptors(struct si_descriptors *desc) pipe_resource_reference((struct pipe_resource**)&desc->buffer, NULL); } -static void si_update_descriptors(struct si_descriptors *desc) +static void si_update_descriptors(struct r600_context *rctx, + struct si_descriptors *desc) { if (desc->dirty_mask) { desc->atom.num_dw = @@ -150,6 +151,8 @@ static void si_update_descriptors(struct si_descriptors *desc) (4 + desc->element_dw_size) * util_bitcount(desc->dirty_mask) + /* update */ 4; /* pointer update */ desc->atom.dirty = true; + /* The descriptors are read with the K cache. */ + rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE; } else { desc->atom.dirty = false; } @@ -185,6 +188,7 @@ static void si_emit_descriptors(struct r600_context *rctx, va_base = r600_resource_va(rctx->b.b.screen, &desc->buffer->b.b); /* Copy the descriptors to a new context slot. */ + /* XXX Consider using TC or L2 for this copy on CIK. 
*/ si_emit_cp_dma_copy_buffer(rctx, va_base + new_context_id * desc->context_size, va_base + desc->current_context_id * desc->context_size, @@ -215,7 +219,7 @@ static void si_emit_descriptors(struct r600_context *rctx, packet_size = 2 + desc->element_dw_size; radeon_emit(cs, PKT3(PKT3_WRITE_DATA, packet_size, 0)); - radeon_emit(cs, PKT3_WRITE_DATA_DST_SEL(PKT3_WRITE_DATA_DST_SEL_MEM_SYNC) | + radeon_emit(cs, PKT3_WRITE_DATA_DST_SEL(PKT3_WRITE_DATA_DST_SEL_TC_OR_L2) | PKT3_WRITE_DATA_WR_CONFIRM | PKT3_WRITE_DATA_ENGINE_SEL(PKT3_WRITE_DATA_ENGINE_SEL_ME)); radeon_emit(cs, va & 0xFFFFFFFFUL); @@ -322,7 +326,7 @@ void si_set_sampler_view(struct r600_context *rctx, unsigned shader, } views->desc.dirty_mask |= 1 << slot; - si_update_descriptors(&views->desc); + si_update_descriptors(rctx, &views->desc); } /* BUFFER RESOURCES */ @@ -405,8 +409,6 @@ static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s if (shader >= SI_NUM_SHADERS) return; - rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE; - assert(slot < buffers->num_buffers); pipe_resource_reference(&buffers->buffers[slot], NULL); @@ -451,7 +453,7 @@ static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s } buffers->desc.dirty_mask |= 1 << slot; - si_update_descriptors(&buffers->desc); + si_update_descriptors(rctx, &buffers->desc); } /* INIT/DEINIT */ diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 5ac55f2..3c4197c 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -2238,11 +2238,13 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, if (pm4 == NULL) return; - si_pm4_inval_fb_cache(pm4, state->nr_cbufs); - rctx->flush_and_inv_cb_meta = true; - - if (state->zsbuf) - si_pm4_inval_zsbuf_cache(pm4); + if (rctx->framebuffer.nr_cbufs) { + rctx->b.flags |= R600_CONTEXT_FLUSH_AND_INV_CB | + R600_CONTEXT_FLUSH_AND_INV_CB_META; + } + if (rctx->framebuffer.zsbuf) { + 
rctx->b.flags |= R600_CONTEXT_FLUSH_AND_INV_DB; + } util_copy_framebuffer_state(&rctx->framebuffer, state); @@ -2468,6 +2470,8 @@ static void si_bind_vs_shader(struct pipe_context *ctx, void *state) si_pm4_bind_state(rctx, vs, sel->current->pm4); else si_pm4_bind_state(rctx, vs, rctx->dummy_pixel_shader->pm4); + + rctx->b.flags |= R600_CONTEXT_INV_SHADER_CACHE; } static void si_bind_ps_shader(struct pipe_context *ctx, void *state) @@ -2484,6 +2488,8 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state) si_pm4_bind_state(rctx, ps, sel->current->pm4); else si_pm4_bind_state(rctx, ps, rctx->dummy_pixel_shader->pm4); + + rctx->b.flags |= R600_CONTEXT_INV_SHADER_CACHE; } static void si_delete_shader_selector(struct pipe_context *ctx, @@ -2826,17 +2832,14 @@ static void *si_create_sampler_state(struct pipe_context *ctx, /* XXX consider moving this function to si_descriptors.c for gcc to inline * the si_set_sampler_view calls. LTO might help too. */ -static struct si_pm4_state *si_set_sampler_views(struct r600_context *rctx, - unsigned shader, unsigned count, - struct pipe_sampler_view **views) +static void si_set_sampler_views(struct r600_context *rctx, + unsigned shader, unsigned count, + struct pipe_sampler_view **views) { struct r600_textures_info *samplers = &rctx->samplers[shader]; struct si_pipe_sampler_view **rviews = (struct si_pipe_sampler_view **)views; - struct si_pm4_state *pm4 = si_pm4_alloc_state(rctx); int i; - si_pm4_inval_texture_cache(pm4); - for (i = 0; i < count; i++) { if (views[i]) { struct r600_texture *rtex = @@ -2879,27 +2882,23 @@ static struct si_pm4_state *si_set_sampler_views(struct r600_context *rctx, } samplers->n_views = count; - return pm4; + rctx->b.flags |= R600_CONTEXT_INV_TEX_CACHE; } static void si_set_vs_sampler_views(struct pipe_context *ctx, unsigned count, struct pipe_sampler_view **views) { struct r600_context *rctx = (struct r600_context *)ctx; - struct si_pm4_state *pm4; - pm4 = si_set_sampler_views(rctx, 
PIPE_SHADER_VERTEX, count, views); - si_pm4_set_state(rctx, vs_sampler_views, pm4); + si_set_sampler_views(rctx, PIPE_SHADER_VERTEX, count, views); } static void si_set_ps_sampler_views(struct pipe_context *ctx, unsigned count, struct pipe_sampler_view **views) { struct r600_context *rctx = (struct r600_context *)ctx; - struct si_pm4_state *pm4; - pm4 = si_set_sampler_views(rctx, PIPE_SHADER_FRAGMENT, count, views); - si_pm4_set_state(rctx, ps_sampler_views, pm4); + si_set_sampler_views(rctx, PIPE_SHADER_FRAGMENT, count, views); } static struct si_pm4_state *si_bind_sampler_states(struct r600_context *rctx, unsigned count, @@ -2915,7 +2914,7 @@ static struct si_pm4_state *si_bind_sampler_states(struct r600_context *rctx, un if (!count) goto out; - si_pm4_inval_texture_cache(pm4); + rctx->b.flags |= R600_CONTEXT_INV_TEX_CACHE; si_pm4_sh_data_begin(pm4); for (i = 0; i < count; i++) { @@ -3128,14 +3127,9 @@ static void si_set_polygon_stipple(struct pipe_context *ctx, static void si_texture_barrier(struct pipe_context *ctx) { struct r600_context *rctx = (struct r600_context *)ctx; - struct si_pm4_state *pm4 = si_pm4_alloc_state(rctx); - - if (pm4 == NULL) - return; - si_pm4_inval_texture_cache(pm4); - si_pm4_inval_fb_cache(pm4, rctx->framebuffer.nr_cbufs); - si_pm4_set_state(rctx, texture_barrier, pm4); + rctx->b.flags |= R600_CONTEXT_INV_TEX_CACHE | + R600_CONTEXT_FLUSH_AND_INV_CB; } static void *si_create_blend_custom(struct r600_context *rctx, unsigned mode) diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 82fac4a..94a1521 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -28,6 +28,7 @@ #define SI_STATE_H #include "radeonsi_pm4.h" +#include "../radeon/r600_pipe_common.h" struct si_state_blend { struct si_pm4_state pm4; @@ -74,8 +75,6 @@ struct si_vertex_element union si_state { struct { - struct si_pm4_state *sync; - struct si_pm4_state *flush_and_inv_cb_meta; 
struct si_pm4_state *init; struct si_state_blend *blend; struct si_pm4_state *blend_color; @@ -90,14 +89,11 @@ union si_state { struct si_pm4_state *fb_blend; struct si_pm4_state *dsa_stencil_ref; struct si_pm4_state *vs; - struct si_pm4_state *vs_sampler_views; struct si_pm4_state *vs_sampler; struct si_pm4_state *ps; - struct si_pm4_state *ps_sampler_views; struct si_pm4_state *ps_sampler; struct si_pm4_state *spi; struct si_pm4_state *vertex_buffers; - struct si_pm4_state *texture_barrier; struct si_pm4_state *draw_info; struct si_pm4_state *draw; } named; @@ -214,6 +210,8 @@ void si_init_state_functions(struct r600_context *rctx); void si_init_config(struct r600_context *rctx); /* si_state_draw.c */ +extern const struct r600_atom si_atom_cache_flush; +void si_emit_cache_flush(struct r600_common_context *rctx, struct r600_atom *atom); void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *dinfo); /* si_commands.c */ @@ -224,6 +222,5 @@ void si_cmd_draw_index_2(struct si_pm4_state *pm4, uint32_t max_size, void si_cmd_draw_index_auto(struct si_pm4_state *pm4, uint32_t count, uint32_t initiator, bool predicate); void si_cmd_surface_sync(struct si_pm4_state *pm4, uint32_t cp_coher_cntl); -void si_cmd_flush_and_inv_cb_meta(struct si_pm4_state *pm4); #endif diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index 5c693ad..38d8497 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -52,8 +52,6 @@ static void si_pipe_shader_vs(struct pipe_context *ctx, struct si_pipe_shader *s if (pm4 == NULL) return; - si_pm4_inval_shader_cache(pm4); - /* Certain attributes (position, psize, etc.) don't count as params. * VS is required to export at least one param and r600_shader_from_tgsi() * takes care of adding a dummy export. 
@@ -116,6 +114,7 @@ static void si_pipe_shader_vs(struct pipe_context *ctx, struct si_pipe_shader *s } si_pm4_bind_state(rctx, vs, shader->pm4); + rctx->b.flags |= R600_CONTEXT_INV_SHADER_CACHE; } static void si_pipe_shader_ps(struct pipe_context *ctx, struct si_pipe_shader *shader) @@ -133,8 +132,6 @@ static void si_pipe_shader_ps(struct pipe_context *ctx, struct si_pipe_shader *s if (pm4 == NULL) return; - si_pm4_inval_shader_cache(pm4); - db_shader_control = S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z) | S_02880C_ALPHA_TO_MASK_DISABLE(rctx->fb_cb0_is_integer); @@ -244,6 +241,7 @@ static void si_pipe_shader_ps(struct pipe_context *ctx, struct si_pipe_shader *s shader->cb0_is_integer = rctx->fb_cb0_is_integer; shader->sprite_coord_enable = rctx->sprite_coord_enable; si_pm4_bind_state(rctx, ps, shader->pm4); + rctx->b.flags |= R600_CONTEXT_INV_SHADER_CACHE; } /* @@ -461,9 +459,8 @@ static void si_vertex_buffer_update(struct r600_context *rctx) unsigned i, count; uint64_t va; - si_pm4_inval_texture_cache(pm4); + rctx->b.flags |= R600_CONTEXT_INV_TEX_CACHE; - /* bind vertex buffer once */ count = rctx->vertex_elements->count; assert(count <= 256 / 4); @@ -568,11 +565,74 @@ static void si_state_draw(struct r600_context *rctx, si_pm4_set_state(rctx, draw, pm4); } +void si_emit_cache_flush(struct r600_common_context *rctx, struct r600_atom *atom) +{ + struct radeon_winsys_cs *cs = rctx->rings.gfx.cs; + uint32_t cp_coher_cntl = 0; + + /* XXX SI flushes both ICACHE and KCACHE if either flag is set. + * XXX CIK shouldn't have this issue. Test CIK before separating the flags + * XXX to ensure there is no regression. Also find out if there is another + * XXX way to flush either ICACHE or KCACHE but not both for SI. 
*/ + if (rctx->flags & (R600_CONTEXT_INV_SHADER_CACHE | + R600_CONTEXT_INV_CONST_CACHE)) { + cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1) | + S_0085F0_SH_KCACHE_ACTION_ENA(1); + } + if (rctx->flags & (R600_CONTEXT_INV_TEX_CACHE | + R600_CONTEXT_STREAMOUT_FLUSH)) { + cp_coher_cntl |= S_0085F0_TC_ACTION_ENA(1) | + S_0085F0_TCL1_ACTION_ENA(1); + } + if (rctx->flags & R600_CONTEXT_FLUSH_AND_INV_CB) { + cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) | + S_0085F0_CB0_DEST_BASE_ENA(1) | + S_0085F0_CB1_DEST_BASE_ENA(1) | + S_0085F0_CB2_DEST_BASE_ENA(1) | + S_0085F0_CB3_DEST_BASE_ENA(1) | + S_0085F0_CB4_DEST_BASE_ENA(1) | + S_0085F0_CB5_DEST_BASE_ENA(1) | + S_0085F0_CB6_DEST_BASE_ENA(1) | + S_0085F0_CB7_DEST_BASE_ENA(1); + } + if (rctx->flags & R600_CONTEXT_FLUSH_AND_INV_DB) { + cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) | + S_0085F0_DB_DEST_BASE_ENA(1); + } + + if (cp_coher_cntl) { + if (rctx->chip_class >= CIK) { + radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 5, 0)); + radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */ + radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ + radeon_emit(cs, 0xff); /* CP_COHER_SIZE_HI */ + radeon_emit(cs, 0); /* CP_COHER_BASE */ + radeon_emit(cs, 0); /* CP_COHER_BASE_HI */ + radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ + } else { + radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, 0)); + radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */ + radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ + radeon_emit(cs, 0); /* CP_COHER_BASE */ + radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ + } + } + + if (rctx->flags & R600_CONTEXT_FLUSH_AND_INV_CB_META) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0)); + } + + rctx->flags = 0; +} + +const struct r600_atom si_atom_cache_flush = { si_emit_cache_flush, 9 }; /* number of CS dwords */ + void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) { struct r600_context *rctx = (struct r600_context *)ctx; struct 
pipe_index_buffer ib = {}; - uint32_t cp_coher_cntl, i; + uint32_t i; if (!info->count && (info->indexed || !info->count_from_stream_output)) return; @@ -605,40 +665,15 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) si_state_draw(rctx, info, &ib); - /* Cache flushing via CP_COHER_CNTL. */ - cp_coher_cntl = si_pm4_sync_flags(rctx); - - if (rctx->b.flags & R600_CONTEXT_INV_CONST_CACHE) { - cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1) | - S_0085F0_SH_KCACHE_ACTION_ENA(1); - } - - if (cp_coher_cntl) { - struct si_pm4_state *pm4 = si_pm4_alloc_state(rctx); - - if (pm4 == NULL) - return; - - si_cmd_surface_sync(pm4, cp_coher_cntl); - si_pm4_set_state(rctx, sync, pm4); - } - - if (rctx->flush_and_inv_cb_meta) { - struct si_pm4_state *pm4 = si_pm4_alloc_state(rctx); - - if (pm4 == NULL) - return; - - si_cmd_flush_and_inv_cb_meta(pm4); - si_pm4_set_state(rctx, flush_and_inv_cb_meta, pm4); - rctx->flush_and_inv_cb_meta = false; - } - - /* Emit states. */ rctx->pm4_dirty_cdwords += si_pm4_dirty_dw(rctx); + /* Check flush flags. */ + if (rctx->b.flags) + rctx->atoms.cache_flush->dirty = true; + si_need_cs_space(rctx, 0, TRUE); + /* Emit states. */ for (i = 0; i < SI_NUM_ATOMS(rctx); i++) { if (rctx->atoms.array[i]->dirty) { rctx->atoms.array[i]->emit(&rctx->b, rctx->atoms.array[i]); @@ -663,8 +698,6 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) } #endif - rctx->flags |= R600_CONTEXT_DST_CACHES_DIRTY; - /* Set the depth buffer as dirty. */ if (rctx->framebuffer.zsbuf) { struct pipe_surface *surf = rctx->framebuffer.zsbuf; -- 2.7.4