radeonsi: do not do two full flushes on every compute dispatch
author    Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
          Sun, 27 Mar 2016 09:14:34 +0000 (11:14 +0200)
committer Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
          Tue, 19 Apr 2016 16:10:31 +0000 (18:10 +0200)
v2: Add more CS_PARTIAL_FLUSH events.

Essentially every place that waits for pixel shaders to finish also
has a write-after-read hazard with compute shaders.
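
A minimal sketch of the recurring pattern this patch applies (the
helper name here is hypothetical; the flags and si_emit_cache_flush()
are from the diff below): every path that previously drained only
pixel shaders now also drains compute shaders:

	/* Hypothetical helper illustrating the recurring change: code
	 * that used to wait only for pixel shader work now also waits
	 * for compute work, since a later dispatch may write buffers
	 * that in-flight shaders are still reading. */
	static void si_wait_for_shaders(struct si_context *sctx)
	{
		sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
				 SI_CONTEXT_CS_PARTIAL_FLUSH;
		/* The accumulated flags are emitted later by
		 * si_emit_cache_flush(sctx, NULL). */
	}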

Invalidating L2 implicitly waits for pixel and compute shaders,
so we don't need a CS_PARTIAL_FLUSH when switching the FBO.

v3: Add CS_PARTIAL_FLUSH events even if we already have INV_GLOBAL_L2.

According to Marek, the INV_GLOBAL_L2 events don't wait for compute
shaders to finish, so wait for them explicitly.
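
A sketch of what v3 means in practice (not an exact hunk from the
patch): even where INV_GLOBAL_L2 is already set, an explicit
CS_PARTIAL_FLUSH is added, because the invalidation alone does not
drain in-flight compute shaders:

	/* Cache invalidation and the compute-shader wait are
	 * independent: INV_GLOBAL_L2 invalidates the L2 cache but,
	 * per the note above, does not wait for compute shaders,
	 * so CS_PARTIAL_FLUSH is set alongside it. */
	sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
			 SI_CONTEXT_INV_GLOBAL_L2 |
			 SI_CONTEXT_CS_PARTIAL_FLUSH;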

Signed-off-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
src/gallium/drivers/radeonsi/si_compute.c
src/gallium/drivers/radeonsi/si_cp_dma.c
src/gallium/drivers/radeonsi/si_descriptors.c
src/gallium/drivers/radeonsi/si_hw_context.c
src/gallium/drivers/radeonsi/si_state.c

diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 921b62c..105cf8c 100644
@@ -441,13 +441,8 @@ static void si_launch_grid(
        if (!sctx->cs_shader_state.initialized)
                si_initialize_compute(sctx);
 
-       sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
-                        SI_CONTEXT_INV_GLOBAL_L2 |
-                        SI_CONTEXT_INV_ICACHE |
-                        SI_CONTEXT_INV_SMEM_L1 |
-                        SI_CONTEXT_FLUSH_WITH_INV_L2 |
-                        SI_CONTEXT_FLAG_COMPUTE;
-       si_emit_cache_flush(sctx, NULL);
+       if (sctx->b.flags)
+               si_emit_cache_flush(sctx, NULL);
 
        if (!si_switch_compute_shader(sctx, program, &program->shader, info->pc))
                return;
@@ -480,14 +475,6 @@ static void si_launch_grid(
                si_setup_tgsi_grid(sctx, info);
 
        si_emit_dispatch_packets(sctx, info);
-
-       sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
-                        SI_CONTEXT_INV_VMEM_L1 |
-                        SI_CONTEXT_INV_GLOBAL_L2 |
-                        SI_CONTEXT_INV_ICACHE |
-                        SI_CONTEXT_INV_SMEM_L1 |
-                        SI_CONTEXT_FLAG_COMPUTE;
-       si_emit_cache_flush(sctx, NULL);
 }
 
 
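The net effect in si_launch_grid, summarized (a sketch, not part of
the patch itself): the two unconditional full flushes around each
dispatch are gone; the paths changed in the other files set the
relevant flags only when a real hazard exists, and the dispatch path
drains whatever is pending:

	/* Dispatch now flushes lazily: only flags accumulated by the
	 * hazard-creating paths are emitted, if any. */
	if (sctx->b.flags)
		si_emit_cache_flush(sctx, NULL);
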
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index 001ddd4..38e0ee6 100644
@@ -190,7 +190,8 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
        uint64_t va = r600_resource(dst)->gpu_address + offset;
 
        /* Flush the caches. */
-       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
+       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+                        SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags;
 
        while (size) {
                unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
@@ -296,7 +297,8 @@ void si_copy_buffer(struct si_context *sctx,
        }
 
        /* Flush the caches. */
-       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
+       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+                        SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags;
 
        /* This is the main part doing the copying. Src is always aligned. */
        main_dst_offset = dst_offset + skipped_size;
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 301a865..1580e61 100644
@@ -1032,7 +1032,8 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
         * start writing to the targets.
         */
        if (num_targets)
-               sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+               sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+                                SI_CONTEXT_CS_PARTIAL_FLUSH;
 
        /* Streamout buffers must be bound in 2 places:
         * 1) in VGT by setting the VGT_STRMOUT registers
diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
index 69fecce..e3abb7f 100644
@@ -117,6 +117,7 @@ void si_context_gfx_flush(void *context, unsigned flags,
        ctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER |
                        SI_CONTEXT_INV_VMEM_L1 |
                        SI_CONTEXT_INV_GLOBAL_L2 |
+                       SI_CONTEXT_CS_PARTIAL_FLUSH |
                        /* this is probably not needed anymore */
                        SI_CONTEXT_PS_PARTIAL_FLUSH;
        si_emit_cache_flush(ctx, NULL);
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index af9ffdd..305a70b 100644
@@ -2436,7 +2436,8 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
         */
        sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
                         SI_CONTEXT_INV_GLOBAL_L2 |
-                        SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
+                        SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER |
+                        SI_CONTEXT_CS_PARTIAL_FLUSH;
 
        /* Take the maximum of the old and new count. If the new count is lower,
         * dirtying is needed to disable the unbound colorbuffers.
@@ -3458,7 +3459,8 @@ static void si_texture_barrier(struct pipe_context *ctx)
 
        sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
                         SI_CONTEXT_INV_GLOBAL_L2 |
-                        SI_CONTEXT_FLUSH_AND_INV_CB;
+                        SI_CONTEXT_FLUSH_AND_INV_CB |
+                        SI_CONTEXT_CS_PARTIAL_FLUSH;
 }
 
 static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
@@ -3467,7 +3469,8 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
 
        /* Subsequent commands must wait for all shader invocations to
         * complete. */
-       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+                        SI_CONTEXT_CS_PARTIAL_FLUSH;
 
        if (flags & PIPE_BARRIER_CONSTANT_BUFFER)
                sctx->b.flags |= SI_CONTEXT_INV_SMEM_L1 |
@@ -3477,7 +3480,8 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
                     PIPE_BARRIER_SHADER_BUFFER |
                     PIPE_BARRIER_TEXTURE |
                     PIPE_BARRIER_IMAGE |
-                    PIPE_BARRIER_STREAMOUT_BUFFER)) {
+                    PIPE_BARRIER_STREAMOUT_BUFFER |
+                    PIPE_BARRIER_GLOBAL_BUFFER)) {
                /* As far as I can tell, L1 contents are written back to L2
                 * automatically at end of shader, but the contents of other
                 * L1 caches might still be stale. */