radeonsi: always use the L2 LRU cache policy for faster clears and copies

author Marek Olšák <marek.olsak@amd.com>

Sat, 8 May 2021 03:47:23 +0000 (23:47 -0400)

committer Marge Bot <eric+marge@anholt.net>

Tue, 25 May 2021 16:15:44 +0000 (16:15 +0000)
author Marek Olšák <marek.olsak@amd.com>
Sat, 8 May 2021 03:47:23 +0000 (23:47 -0400)
committer Marge Bot <eric+marge@anholt.net>
Tue, 25 May 2021 16:15:44 +0000 (16:15 +0000)
diff --git a/src/gallium/drivers/radeonsi/si_clear.c b/src/gallium/drivers/radeonsi/si_clear.c

index 538c4b1..7359734 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_clear.c
+++ b/src/gallium/drivers/radeonsi/si_clear.c
@@ -62,10 +62,10 @@ void si_execute_clears(struct si_context *sctx, struct si_clear_info *info,
  
     /* Flush caches and wait for idle. */
     if (types & (SI_CLEAR_TYPE_CMASK | SI_CLEAR_TYPE_DCC))
-      sctx->flags |= si_get_flush_flags(sctx, SI_COHERENCY_CB_META, L2_STREAM);
+      sctx->flags |= si_get_flush_flags(sctx, SI_COHERENCY_CB_META, L2_LRU);
  
     if (types & SI_CLEAR_TYPE_HTILE)
-      sctx->flags |= si_get_flush_flags(sctx, SI_COHERENCY_DB_META, L2_STREAM);
+      sctx->flags |= si_get_flush_flags(sctx, SI_COHERENCY_DB_META, L2_LRU);
  
     /* Flush caches in case we use compute. */
     sctx->flags |= SI_CONTEXT_INV_VCACHE;
diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c

index 39fdbed..3dcb0f1 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@@ -27,9 +27,7 @@
  #include "util/format/u_format.h"
  #include "util/format_srgb.h"
  
-/* Note: Compute shaders always use SI_COMPUTE_DST_CACHE_POLICY for dst
- * and L2_STREAM for src.
- */
+/* Determine the cache policy. */
  static enum si_cache_policy get_cache_policy(struct si_context *sctx, enum si_coherency coher,
                                               uint64_t size)
  {
@@ -37,7 +35,7 @@ static enum si_cache_policy get_cache_policy(struct si_context *sctx, enum si_co
                                       coher == SI_COHERENCY_DB_META ||
                                       coher == SI_COHERENCY_CP)) ||
         (sctx->chip_class >= GFX7 && coher == SI_COHERENCY_SHADER))
-      return size <= sctx->screen->info.l2_cache_size / 8 ? L2_LRU : L2_STREAM;
+      return L2_LRU; /* it's faster if L2 doesn't evict anything  */
  
     return L2_BYPASS;
  }
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h

index fbdba27..948ae1f 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -64,7 +64,8 @@ extern "C" {
  /* Tunables for compute-based clear_buffer and copy_buffer: */
  #define SI_COMPUTE_CLEAR_DW_PER_THREAD 4
  #define SI_COMPUTE_COPY_DW_PER_THREAD  4
-#define SI_COMPUTE_DST_CACHE_POLICY    L2_STREAM
+/* L2 LRU is recommended because the compute shader can finish sooner due to fewer L2 evictions. */
+#define SI_COMPUTE_DST_CACHE_POLICY    L2_LRU
  
  /* Pipeline & streamout query controls. */
  #define SI_CONTEXT_START_PIPELINE_STATS  (1 << 0)
author	Marek Olšák <marek.olsak@amd.com>
	Sat, 8 May 2021 03:47:23 +0000 (23:47 -0400)
committer	Marge Bot <eric+marge@anholt.net>
	Tue, 25 May 2021 16:15:44 +0000 (16:15 +0000)
src/gallium/drivers/radeonsi/si_clear.c		patch | blob | history
src/gallium/drivers/radeonsi/si_compute_blit.c		patch | blob | history
src/gallium/drivers/radeonsi/si_pipe.h		patch | blob | history