/* Flush caches and wait for idle. */
if (types & (SI_CLEAR_TYPE_CMASK | SI_CLEAR_TYPE_DCC))
- sctx->flags |= si_get_flush_flags(sctx, SI_COHERENCY_CB_META, L2_STREAM);
+ sctx->flags |= si_get_flush_flags(sctx, SI_COHERENCY_CB_META, L2_LRU);
if (types & SI_CLEAR_TYPE_HTILE)
- sctx->flags |= si_get_flush_flags(sctx, SI_COHERENCY_DB_META, L2_STREAM);
+ sctx->flags |= si_get_flush_flags(sctx, SI_COHERENCY_DB_META, L2_LRU);
/* Flush caches in case we use compute. */
sctx->flags |= SI_CONTEXT_INV_VCACHE;
#include "util/format/u_format.h"
#include "util/format_srgb.h"
-/* Note: Compute shaders always use SI_COMPUTE_DST_CACHE_POLICY for dst
- * and L2_STREAM for src.
- */
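+/* Determine the L2 cache policy from the chip class and the coherency type:
+ * L2_LRU for the combinations checked below, L2_BYPASS otherwise.
+ * NOTE(review): "size" no longer appears to be used in the visible body - confirm.
+ */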
static enum si_cache_policy get_cache_policy(struct si_context *sctx, enum si_coherency coher,
uint64_t size)
{
coher == SI_COHERENCY_DB_META ||
coher == SI_COHERENCY_CP)) ||
(sctx->chip_class >= GFX7 && coher == SI_COHERENCY_SHADER))
- return size <= sctx->screen->info.l2_cache_size / 8 ? L2_LRU : L2_STREAM;
+ return L2_LRU; /* it's faster if L2 doesn't evict anything */
return L2_BYPASS;
}
/* Tunables for compute-based clear_buffer and copy_buffer: */
#define SI_COMPUTE_CLEAR_DW_PER_THREAD 4
#define SI_COMPUTE_COPY_DW_PER_THREAD 4
-#define SI_COMPUTE_DST_CACHE_POLICY L2_STREAM
+/* L2 LRU is recommended because the compute shader can finish sooner due to fewer L2 evictions. */
+#define SI_COMPUTE_DST_CACHE_POLICY L2_LRU
/* Pipeline & streamout query controls. */
#define SI_CONTEXT_START_PIPELINE_STATS (1 << 0)