radeonsi: do not do two full flushes on every compute dispatch
author    Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
          Sun, 27 Mar 2016 09:14:34 +0000 (11:14 +0200)
committer Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
          Tue, 19 Apr 2016 16:10:31 +0000 (18:10 +0200)
v2: Add more CS_PARTIAL_FLUSH events.

Essentially every place that waits for pixel shaders to finish also
has a write-after-read hazard with compute shaders.
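
A minimal sketch of the recurring pattern this patch applies (the
helper name here is hypothetical; the flags and si_emit_cache_flush()
are from the diff below): every path that previously drained only
pixel shaders now also drains compute shaders:

	/* Hypothetical helper illustrating the recurring change: code
	 * that used to wait only for pixel shader work now also waits
	 * for compute work, since a later dispatch may write buffers
	 * that in-flight shaders are still reading. */
	static void si_wait_for_shaders(struct si_context *sctx)
	{
		sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
				 SI_CONTEXT_CS_PARTIAL_FLUSH;
		/* The accumulated flags are emitted later by
		 * si_emit_cache_flush(sctx, NULL). */
	}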

Invalidating L2 implicitly waits for pixel and compute shaders,
so we don't need a CS_PARTIAL_FLUSH when switching the FBO.

v3: Add CS_PARTIAL_FLUSH events even if we already have INV_GLOBAL_L2.

According to Marek, the INV_GLOBAL_L2 events don't wait for compute
shaders to finish, so wait for them explicitly.
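
A sketch of what v3 means in practice (not an exact hunk from the
patch): even where INV_GLOBAL_L2 is already set, an explicit
CS_PARTIAL_FLUSH is added, because the invalidation alone does not
drain in-flight compute shaders:

	/* Cache invalidation and the compute-shader wait are
	 * independent: INV_GLOBAL_L2 invalidates the L2 cache but,
	 * per the note above, does not wait for compute shaders,
	 * so CS_PARTIAL_FLUSH is set alongside it. */
	sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
			 SI_CONTEXT_INV_GLOBAL_L2 |
			 SI_CONTEXT_CS_PARTIAL_FLUSH;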

Signed-off-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
src/gallium/drivers/radeonsi/si_compute.c
src/gallium/drivers/radeonsi/si_cp_dma.c
src/gallium/drivers/radeonsi/si_descriptors.c
src/gallium/drivers/radeonsi/si_hw_context.c
src/gallium/drivers/radeonsi/si_state.c

diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 921b62c..105cf8c 100644
@@ -441,13 +441,8 @@ static void si_launch_grid(
        if (!sctx->cs_shader_state.initialized)
                si_initialize_compute(sctx);
 
-       sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
-                        SI_CONTEXT_INV_GLOBAL_L2 |
-                        SI_CONTEXT_INV_ICACHE |
-                        SI_CONTEXT_INV_SMEM_L1 |
-                        SI_CONTEXT_FLUSH_WITH_INV_L2 |
-                        SI_CONTEXT_FLAG_COMPUTE;
-       si_emit_cache_flush(sctx, NULL);
+       if (sctx->b.flags)
+               si_emit_cache_flush(sctx, NULL);
 
        if (!si_switch_compute_shader(sctx, program, &program->shader, info->pc))
                return;
@@ -480,14 +475,6 @@ static void si_launch_grid(
                si_setup_tgsi_grid(sctx, info);
 
        si_emit_dispatch_packets(sctx, info);
-
-       sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
-                        SI_CONTEXT_INV_VMEM_L1 |
-                        SI_CONTEXT_INV_GLOBAL_L2 |
-                        SI_CONTEXT_INV_ICACHE |
-                        SI_CONTEXT_INV_SMEM_L1 |
-                        SI_CONTEXT_FLAG_COMPUTE;
-       si_emit_cache_flush(sctx, NULL);
 }
 
 
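The net effect in si_launch_grid, summarized (a sketch, not part of
the patch itself): the two unconditional full flushes around each
dispatch are gone; the paths changed in the other files set the
relevant flags only when a real hazard exists, and the dispatch path
drains whatever is pending:

	/* Dispatch now flushes lazily: only flags accumulated by the
	 * hazard-creating paths are emitted, if any. */
	if (sctx->b.flags)
		si_emit_cache_flush(sctx, NULL);
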
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index 001ddd4..38e0ee6 100644
@@ -190,7 +190,8 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
        uint64_t va = r600_resource(dst)->gpu_address + offset;
 
        /* Flush the caches. */
-       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
+       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+                        SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags;
 
        while (size) {
                unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
@@ -296,7 +297,8 @@ void si_copy_buffer(struct si_context *sctx,
        }
 
        /* Flush the caches. */
-       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
+       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+                        SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags;
 
        /* This is the main part doing the copying. Src is always aligned. */
        main_dst_offset = dst_offset + skipped_size;
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 301a865..1580e61 100644
@@ -1032,7 +1032,8 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
         * start writing to the targets.
         */
        if (num_targets)
-               sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+               sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+                                SI_CONTEXT_CS_PARTIAL_FLUSH;
 
        /* Streamout buffers must be bound in 2 places:
         * 1) in VGT by setting the VGT_STRMOUT registers
diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
index 69fecce..e3abb7f 100644
@@ -117,6 +117,7 @@ void si_context_gfx_flush(void *context, unsigned flags,
        ctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER |
                        SI_CONTEXT_INV_VMEM_L1 |
                        SI_CONTEXT_INV_GLOBAL_L2 |
+                       SI_CONTEXT_CS_PARTIAL_FLUSH |
                        /* this is probably not needed anymore */
                        SI_CONTEXT_PS_PARTIAL_FLUSH;
        si_emit_cache_flush(ctx, NULL);
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index af9ffdd..305a70b 100644
@@ -2436,7 +2436,8 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
         */
        sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
                         SI_CONTEXT_INV_GLOBAL_L2 |
-                        SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
+                        SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER |
+                        SI_CONTEXT_CS_PARTIAL_FLUSH;
 
        /* Take the maximum of the old and new count. If the new count is lower,
         * dirtying is needed to disable the unbound colorbuffers.
@@ -3458,7 +3459,8 @@ static void si_texture_barrier(struct pipe_context *ctx)
 
        sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
                         SI_CONTEXT_INV_GLOBAL_L2 |
-                        SI_CONTEXT_FLUSH_AND_INV_CB;
+                        SI_CONTEXT_FLUSH_AND_INV_CB |
+                        SI_CONTEXT_CS_PARTIAL_FLUSH;
 }
 
 static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
@@ -3467,7 +3469,8 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
 
        /* Subsequent commands must wait for all shader invocations to
         * complete. */
-       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+                        SI_CONTEXT_CS_PARTIAL_FLUSH;
 
        if (flags & PIPE_BARRIER_CONSTANT_BUFFER)
                sctx->b.flags |= SI_CONTEXT_INV_SMEM_L1 |
@@ -3477,7 +3480,8 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
                     PIPE_BARRIER_SHADER_BUFFER |
                     PIPE_BARRIER_TEXTURE |
                     PIPE_BARRIER_IMAGE |
-                    PIPE_BARRIER_STREAMOUT_BUFFER)) {
+                    PIPE_BARRIER_STREAMOUT_BUFFER |
+                    PIPE_BARRIER_GLOBAL_BUFFER)) {
                /* As far as I can tell, L1 contents are written back to L2
                 * automatically at end of shader, but the contents of other
                 * L1 caches might still be stale. */