From 06b6af596c2d73410469733d6ef6fe88a3b7248c Mon Sep 17 00:00:00 2001
From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 21 Mar 2021 16:57:15 -0400
Subject: [PATCH] radeonsi: do Z-only or S-only HTILE clear using a compute
 shader doing RMW

This adds a clear_buffer compute shader that does read-modify-write to
update a subset of bits in HTILE.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10003>
---
 src/gallium/drivers/radeonsi/si_clear.c          | 60 +++++++++++++++++--
 src/gallium/drivers/radeonsi/si_compute_blit.c   | 74 ++++++++++++++++++++++++
 src/gallium/drivers/radeonsi/si_pipe.c           |  2 +
 src/gallium/drivers/radeonsi/si_pipe.h           |  7 +++
 src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c | 43 ++++++++++++++
 5 files changed, 182 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_clear.c b/src/gallium/drivers/radeonsi/si_clear.c
index 09fcff8..d67453e 100644
--- a/src/gallium/drivers/radeonsi/si_clear.c
+++ b/src/gallium/drivers/radeonsi/si_clear.c
@@ -42,6 +42,15 @@ void si_init_buffer_clear(struct si_clear_info *info,
    info->offset = offset;
    info->size = size;
    info->clear_value = clear_value;
+   info->writemask = 0xffffffff;
+}
+
+static void si_init_buffer_clear_rmw(struct si_clear_info *info,
+                                     struct pipe_resource *resource, uint64_t offset,
+                                     uint32_t size, uint32_t clear_value, uint32_t writemask)
+{
+   si_init_buffer_clear(info, resource, offset, size, clear_value);
+   info->writemask = writemask; /* != 0xffffffff selects the RMW path in si_execute_clears */
 }
 
 void si_execute_clears(struct si_context *sctx, struct si_clear_info *info,
@@ -66,10 +75,18 @@ void si_execute_clears(struct si_context *sctx, struct si_clear_info *info,
 
    /* Execute clears. */
    for (unsigned i = 0; i < num_clears; i++) {
-      /* Compute shaders are much faster on both dGPUs and APUs. Don't use CP DMA. */
-      si_clear_buffer(sctx, info[i].resource, info[i].offset, info[i].size,
-                      &info[i].clear_value, 4, SI_OP_SKIP_CACHE_INV_BEFORE,
-                      SI_COHERENCY_CP, SI_COMPUTE_CLEAR_METHOD);
+      assert(info[i].size > 0);
+
+      if (info[i].writemask != 0xffffffff) {
+         si_compute_clear_buffer_rmw(sctx, info[i].resource, info[i].offset, info[i].size,
+                                     info[i].clear_value, info[i].writemask,
+                                     SI_OP_SKIP_CACHE_INV_BEFORE, SI_COHERENCY_CP);
+      } else {
+         /* Compute shaders are much faster on both dGPUs and APUs. Don't use CP DMA. */
+         si_clear_buffer(sctx, info[i].resource, info[i].offset, info[i].size,
+                         &info[i].clear_value, 4, SI_OP_SKIP_CACHE_INV_BEFORE,
+                         SI_COHERENCY_CP, SI_COMPUTE_CLEAR_METHOD);
+      }
    }
 
    /* Wait for idle. */
@@ -885,6 +902,41 @@ static void si_fast_clear(struct si_context *sctx, unsigned *buffers,
                update_db_depth_clear = true;
                update_db_stencil_clear = true;
             }
+         } else {
+            /* Z-only or S-only clear when both Z/S are present using a read-modify-write
+             * compute shader.
+             *
+             * If we get both clears but only one of them can be fast-cleared, we use
+             * the draw-based fast clear to do both at the same time.
+             */
+            const uint32_t htile_depth_writemask = 0xfffffc0f;
+            const uint32_t htile_stencil_writemask = 0x000003f0;
+
+            if (htile_size &&
+                !(*buffers & PIPE_CLEAR_STENCIL) &&
+                si_can_fast_clear_depth(zstex, level, depth, *buffers)) {
+               /* Z-only clear with stencil left intact. */
+               assert(num_clears < ARRAY_SIZE(info));
+               si_init_buffer_clear_rmw(&info[num_clears++], &zstex->buffer.b.b, htile_offset,
+                                        htile_size, si_get_htile_clear_value(zstex, depth),
+                                        htile_depth_writemask);
+               clear_types |= SI_CLEAR_TYPE_HTILE;
+               *buffers &= ~PIPE_CLEAR_DEPTH;
+               zstex->depth_cleared_level_mask |= BITFIELD_BIT(level);
+               update_db_depth_clear = true;
+            } else if (htile_size &&
+                       !(*buffers & PIPE_CLEAR_DEPTH) &&
+                       si_can_fast_clear_stencil(zstex, level, stencil, *buffers)) {
+               /* Stencil-only clear with depth left intact. */
+               assert(num_clears < ARRAY_SIZE(info));
+               si_init_buffer_clear_rmw(&info[num_clears++], &zstex->buffer.b.b, htile_offset,
+                                        htile_size, si_get_htile_clear_value(zstex, depth),
+                                        htile_stencil_writemask);
+               clear_types |= SI_CLEAR_TYPE_HTILE;
+               *buffers &= ~PIPE_CLEAR_STENCIL;
+               zstex->stencil_cleared_level_mask |= BITFIELD_BIT(level);
+               update_db_stencil_clear = true;
+            }
          }
 
          /* Update DB_DEPTH_CLEAR. */
diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c
index 10d12ca..5154d5b 100644
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@@ -115,6 +115,80 @@ void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *inf
    }
 }
 
+/**
+ * Clear a buffer using read-modify-write with a 32-bit clear value and write bitmask: each
+ * dword becomes (dword & ~writebitmask) | (clear_value & writebitmask). Offset/size in bytes.
+ */
+void si_compute_clear_buffer_rmw(struct si_context *sctx, struct pipe_resource *dst,
+                                 unsigned dst_offset, unsigned size,
+                                 uint32_t clear_value, uint32_t writebitmask,
+                                 unsigned flags, enum si_coherency coher)
+{
+   struct pipe_context *ctx = &sctx->b;
+
+   assert(dst_offset % 4 == 0);
+   assert(size % 4 == 0);
+
+   assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0);
+
+   if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE))
+      sctx->flags |= si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);
+
+   /* Save states. */
+   void *saved_cs = sctx->cs_shader_state.program;
+   struct pipe_shader_buffer saved_sb = {};
+   si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb);
+
+   unsigned saved_writable_mask = 0;
+   if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask &
+       (1u << si_get_shaderbuf_slot(0)))
+      saved_writable_mask |= 1 << 0;
+
+   /* Use buffer_load_dwordx4 and buffer_store_dwordx4 per thread. */
+   unsigned dwords_per_instruction = 4;
+   unsigned block_size = 64; /* CS_FIXED_BLOCK_WIDTH of the shader, not the wave size */
+   unsigned dwords_per_block = dwords_per_instruction * block_size;
+
+   unsigned num_dwords = size / 4;
+   unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);
+
+   struct pipe_grid_info info = {};
+   info.block[0] = MIN2(block_size, num_instructions);
+   info.block[1] = 1;
+   info.block[2] = 1;
+   info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_block);
+   info.grid[1] = 1;
+   info.grid[2] = 1;
+
+   struct pipe_shader_buffer sb = {};
+   sb.buffer = dst;
+   sb.buffer_offset = dst_offset;
+   sb.buffer_size = size;
+   ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &sb, 0x1);
+
+   sctx->cs_user_data[0] = clear_value & writebitmask; /* OR'ed into the loaded data */
+   sctx->cs_user_data[1] = ~writebitmask; /* AND mask: bits that must be preserved */
+
+   if (!sctx->cs_clear_buffer_rmw)
+      sctx->cs_clear_buffer_rmw = si_create_clear_buffer_rmw_cs(&sctx->b);
+
+   ctx->bind_compute_state(ctx, sctx->cs_clear_buffer_rmw);
+
+   si_launch_grid_internal(sctx, &info, saved_cs, flags);
+
+   enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
+
+   if (flags & SI_OP_SYNC_AFTER)
+      sctx->flags |= cache_policy == L2_BYPASS ? SI_CONTEXT_WB_L2 : 0;
+
+   if (cache_policy != L2_BYPASS)
+      si_resource(dst)->TC_L2_dirty = true;
+
+   /* Restore states. */
+   ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb, saved_writable_mask);
+   pipe_resource_reference(&saved_sb.buffer, NULL);
+}
+
 static void si_compute_clear_12bytes_buffer(struct si_context *sctx, struct pipe_resource *dst,
                                             unsigned dst_offset, unsigned size,
                                             const uint32_t *clear_value, unsigned flags,
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 7916727..9181aef 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -239,6 +239,8 @@ static void si_destroy_context(struct pipe_context *context)
       sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_texcoord);
    if (sctx->cs_clear_buffer)
       sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_buffer);
+   if (sctx->cs_clear_buffer_rmw)
+      sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_buffer_rmw);
    if (sctx->cs_copy_buffer)
       sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_buffer);
    if (sctx->cs_copy_image)
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index b448a91..3e86e49 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -955,6 +955,7 @@ struct si_context {
    void *vs_blit_color_layered;
    void *vs_blit_texcoord;
    void *cs_clear_buffer;
+   void *cs_clear_buffer_rmw;
    void *cs_copy_buffer;
    void *cs_copy_image;
    void *cs_copy_image_1d_array;
@@ -1368,6 +1369,7 @@ struct si_clear_info {
    uint64_t offset;
    uint32_t size;
    uint32_t clear_value;
+   uint32_t writemask;
 };
 
 enum pipe_format si_simplify_cb_format(enum pipe_format format);
@@ -1406,6 +1408,10 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
                      uint64_t offset, uint64_t size, uint32_t *clear_value,
                      uint32_t clear_value_size, unsigned flags,
                      enum si_coherency coher, enum si_clear_method method);
+void si_compute_clear_buffer_rmw(struct si_context *sctx, struct pipe_resource *dst,
+                                 unsigned dst_offset, unsigned size,
+                                 uint32_t clear_value, uint32_t writebitmask,
+                                 unsigned flags, enum si_coherency coher);
 void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst, uint64_t offset,
                             uint64_t size, unsigned value, unsigned flags);
 void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src,
@@ -1539,6 +1545,7 @@ void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
 void *si_create_fixed_func_tcs(struct si_context *sctx);
 void *si_create_dma_compute_shader(struct pipe_context *ctx, unsigned num_dwords_per_thread,
                                    bool dst_stream_cache_policy, bool is_copy);
+void *si_create_clear_buffer_rmw_cs(struct pipe_context *ctx);
 void *si_create_copy_image_compute_shader(struct pipe_context *ctx);
 void *si_create_copy_image_compute_shader_1d_array(struct pipe_context *ctx);
 void *si_create_dcc_decompress_cs(struct pipe_context *ctx);
diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
index 8ada0ba..7c6f73c 100644
--- a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
+++ b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
@@ -212,6 +212,49 @@ void *si_create_dma_compute_shader(struct pipe_context *ctx, unsigned num_dwords
    return cs;
 }
 
+/* Create a compute shader doing a read-modify-write buffer clear with a 32-bit writemask. */
+void *si_create_clear_buffer_rmw_cs(struct pipe_context *ctx)
+{
+   const char *text = "COMP\n"
+                      "PROPERTY CS_FIXED_BLOCK_WIDTH 64\n"
+                      "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
+                      "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
+                      "PROPERTY CS_USER_DATA_COMPONENTS_AMD 2\n"
+                      "DCL SV[0], THREAD_ID\n"
+                      "DCL SV[1], BLOCK_ID\n"
+                      "DCL SV[2], CS_USER_DATA_AMD\n"
+                      "DCL BUFFER[0]\n"
+                      "DCL TEMP[0..1]\n"
+                      "IMM[0] UINT32 {64, 16, 0, 0}\n"
+                      /* ADDRESS = BLOCK_ID * 64 + THREAD_ID; */
+                      "UMAD TEMP[0].x, SV[1].xxxx, IMM[0].xxxx, SV[0].xxxx\n"
+                      /* ADDRESS = ADDRESS * 16; (byte offset, loading one vec4 per thread) */
+                      "UMUL TEMP[0].x, TEMP[0].xxxx, IMM[0].yyyy\n"
+                      "LOAD TEMP[1], BUFFER[0], TEMP[0].xxxx\n"
+                      /* DATA &= inverted_writemask; */
+                      "AND TEMP[1], TEMP[1], SV[2].yyyy\n"
+                      /* DATA |= clear_value_masked; */
+                      "OR TEMP[1], TEMP[1], SV[2].xxxx\n"
+                      "STORE BUFFER[0].xyzw, TEMP[0], TEMP[1]%s\n"
+                      "END\n";
+   char final_text[2048];
+   struct tgsi_token tokens[1024];
+   struct pipe_compute_state state = {0};
+
+   snprintf(final_text, sizeof(final_text), text,
+            SI_COMPUTE_DST_CACHE_POLICY != L2_LRU ? ", STREAM_CACHE_POLICY" : "");
+
+   if (!tgsi_text_translate(final_text, tokens, ARRAY_SIZE(tokens))) {
+      assert(false);
+      return NULL;
+   }
+
+   state.ir_type = PIPE_SHADER_IR_TGSI;
+   state.prog = tokens;
+
+   return ctx->create_compute_state(ctx, &state);
+}
+
 /* Create a compute shader that copies DCC from one buffer to another
  * where each DCC buffer has a different layout.
  *
-- 
2.7.4