From 06b6af596c2d73410469733d6ef6fe88a3b7248c Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sun, 21 Mar 2021 16:57:15 -0400 Subject: [PATCH] radeonsi: do Z-only or S-only HTILE clear using a compute shader doing RMW This adds a clear_buffer compute shader that does read-modify-write to update a subset of bits in HTILE. Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_clear.c | 60 +++++++++++++++++-- src/gallium/drivers/radeonsi/si_compute_blit.c | 74 ++++++++++++++++++++++++ src/gallium/drivers/radeonsi/si_pipe.c | 2 + src/gallium/drivers/radeonsi/si_pipe.h | 7 +++ src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c | 43 ++++++++++++++ 5 files changed, 182 insertions(+), 4 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_clear.c b/src/gallium/drivers/radeonsi/si_clear.c index 09fcff8..d67453e 100644 --- a/src/gallium/drivers/radeonsi/si_clear.c +++ b/src/gallium/drivers/radeonsi/si_clear.c @@ -42,6 +42,15 @@ void si_init_buffer_clear(struct si_clear_info *info, info->offset = offset; info->size = size; info->clear_value = clear_value; + info->writemask = 0xffffffff; /* default: full mask, so si_execute_clears takes the plain si_clear_buffer path */ +} + +static void si_init_buffer_clear_rmw(struct si_clear_info *info, + struct pipe_resource *resource, uint64_t offset, + uint32_t size, uint32_t clear_value, uint32_t writemask) +{ + si_init_buffer_clear(info, resource, offset, size, clear_value); + info->writemask = writemask; /* any value other than 0xffffffff makes si_execute_clears call si_compute_clear_buffer_rmw */ } void si_execute_clears(struct si_context *sctx, struct si_clear_info *info, @@ -66,10 +75,18 @@ void si_execute_clears(struct si_context *sctx, struct si_clear_info *info, /* Execute clears. */ for (unsigned i = 0; i < num_clears; i++) { - /* Compute shaders are much faster on both dGPUs and APUs. Don't use CP DMA. 
*/ - si_clear_buffer(sctx, info[i].resource, info[i].offset, info[i].size, - &info[i].clear_value, 4, SI_OP_SKIP_CACHE_INV_BEFORE, - SI_COHERENCY_CP, SI_COMPUTE_CLEAR_METHOD); + assert(info[i].size > 0); + + if (info[i].writemask != 0xffffffff) { + si_compute_clear_buffer_rmw(sctx, info[i].resource, info[i].offset, info[i].size, + info[i].clear_value, info[i].writemask, + SI_OP_SKIP_CACHE_INV_BEFORE, SI_COHERENCY_CP); + } else { + /* Compute shaders are much faster on both dGPUs and APUs. Don't use CP DMA. */ + si_clear_buffer(sctx, info[i].resource, info[i].offset, info[i].size, + &info[i].clear_value, 4, SI_OP_SKIP_CACHE_INV_BEFORE, + SI_COHERENCY_CP, SI_COMPUTE_CLEAR_METHOD); + } } /* Wait for idle. */ @@ -885,6 +902,41 @@ static void si_fast_clear(struct si_context *sctx, unsigned *buffers, update_db_depth_clear = true; update_db_stencil_clear = true; } + } else { + /* Z-only or S-only clear when both Z/S are present using a read-modify-write + * compute shader. + * + * If we get both clears but only one of them can be fast-cleared, we use + * the draw-based fast clear to do both at the same time. + */ + const uint32_t htile_depth_writemask = 0xfffffc0f; + const uint32_t htile_stencil_writemask = 0x000003f0; + + if (htile_size && + !(*buffers & PIPE_CLEAR_STENCIL) && + si_can_fast_clear_depth(zstex, level, depth, *buffers)) { + /* Z-only clear with stencil left intact. */ + assert(num_clears < ARRAY_SIZE(info)); + si_init_buffer_clear_rmw(&info[num_clears++], &zstex->buffer.b.b, htile_offset, + htile_size, si_get_htile_clear_value(zstex, depth), + htile_depth_writemask); + clear_types |= SI_CLEAR_TYPE_HTILE; + *buffers &= ~PIPE_CLEAR_DEPTH; + zstex->depth_cleared_level_mask |= BITFIELD_BIT(level); + update_db_depth_clear = true; + } else if (htile_size && + !(*buffers & PIPE_CLEAR_DEPTH) && + si_can_fast_clear_stencil(zstex, level, stencil, *buffers)) { + /* Stencil-only clear with depth left intact. 
*/ + assert(num_clears < ARRAY_SIZE(info)); + si_init_buffer_clear_rmw(&info[num_clears++], &zstex->buffer.b.b, htile_offset, + htile_size, si_get_htile_clear_value(zstex, depth), + htile_stencil_writemask); + clear_types |= SI_CLEAR_TYPE_HTILE; + *buffers &= ~PIPE_CLEAR_STENCIL; + zstex->stencil_cleared_level_mask |= BITFIELD_BIT(level); + update_db_stencil_clear = true; + } } /* Update DB_DEPTH_CLEAR. */ diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c index 10d12ca..5154d5b 100644 --- a/src/gallium/drivers/radeonsi/si_compute_blit.c +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c @@ -115,6 +115,80 @@ void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *inf } } +/** + * Clear a buffer using read-modify-write with a 32-bit write bitmask. + * The clear value has 32 bits; each dword becomes (old & ~writebitmask) | (clear_value & writebitmask). + */ +void si_compute_clear_buffer_rmw(struct si_context *sctx, struct pipe_resource *dst, + unsigned dst_offset, unsigned size, + uint32_t clear_value, uint32_t writebitmask, + unsigned flags, enum si_coherency coher) +{ + struct pipe_context *ctx = &sctx->b; + + assert(dst_offset % 4 == 0); + assert(size % 4 == 0); + + assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0); + + if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE)) + sctx->flags |= si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY); + + /* Save states: the bound compute shader and shader buffer slot 0. */ + void *saved_cs = sctx->cs_shader_state.program; + struct pipe_shader_buffer saved_sb = {}; + si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb); + + unsigned saved_writable_mask = 0; + if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask & + (1u << si_get_shaderbuf_slot(0))) + saved_writable_mask |= 1 << 0; + + /* Use buffer_load_dwordx4 and buffer_store_dwordx4 per thread. 
*/ + unsigned dwords_per_instruction = 4; + unsigned block_size = 64; /* must match CS_FIXED_BLOCK_WIDTH: the shader addresses data as BLOCK_ID * 64 + THREAD_ID, independent of the wave size */ + unsigned dwords_per_block = dwords_per_instruction * block_size; + + unsigned num_dwords = size / 4; + unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction); + + struct pipe_grid_info info = {}; + info.block[0] = MIN2(block_size, num_instructions); + info.block[1] = 1; + info.block[2] = 1; + info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_block); + info.grid[1] = 1; + info.grid[2] = 1; + + struct pipe_shader_buffer sb = {}; + sb.buffer = dst; + sb.buffer_offset = dst_offset; + sb.buffer_size = size; + ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &sb, 0x1); + + sctx->cs_user_data[0] = clear_value & writebitmask; /* OR operand in the shader (user data .x) */ + sctx->cs_user_data[1] = ~writebitmask; /* AND operand in the shader (user data .y): the bits to keep */ + + if (!sctx->cs_clear_buffer_rmw) + sctx->cs_clear_buffer_rmw = si_create_clear_buffer_rmw_cs(&sctx->b); + + ctx->bind_compute_state(ctx, sctx->cs_clear_buffer_rmw); + + si_launch_grid_internal(sctx, &info, saved_cs, flags); + + enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size); + + if (flags & SI_OP_SYNC_AFTER) + sctx->flags |= cache_policy == L2_BYPASS ? SI_CONTEXT_WB_L2 : 0; + + if (cache_policy != L2_BYPASS) + si_resource(dst)->TC_L2_dirty = true; + + /* Restore states. 
*/ + ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb, saved_writable_mask); + pipe_resource_reference(&saved_sb.buffer, NULL); +} + static void si_compute_clear_12bytes_buffer(struct si_context *sctx, struct pipe_resource *dst, unsigned dst_offset, unsigned size, const uint32_t *clear_value, unsigned flags, diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 7916727..9181aef 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -239,6 +239,8 @@ static void si_destroy_context(struct pipe_context *context) sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_texcoord); if (sctx->cs_clear_buffer) sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_buffer); + if (sctx->cs_clear_buffer_rmw) + sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_buffer_rmw); if (sctx->cs_copy_buffer) sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_buffer); if (sctx->cs_copy_image) diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index b448a91..3e86e49 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -955,6 +955,7 @@ struct si_context { void *vs_blit_color_layered; void *vs_blit_texcoord; void *cs_clear_buffer; + void *cs_clear_buffer_rmw; void *cs_copy_buffer; void *cs_copy_image; void *cs_copy_image_1d_array; @@ -1368,6 +1369,7 @@ struct si_clear_info { uint64_t offset; uint32_t size; uint32_t clear_value; + uint32_t writemask; }; enum pipe_format si_simplify_cb_format(enum pipe_format format); @@ -1406,6 +1408,10 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset, uint64_t size, uint32_t *clear_value, uint32_t clear_value_size, unsigned flags, enum si_coherency coher, enum si_clear_method method); +void si_compute_clear_buffer_rmw(struct si_context *sctx, struct pipe_resource *dst, + unsigned dst_offset, unsigned size, + uint32_t 
clear_value, uint32_t writebitmask, + unsigned flags, enum si_coherency coher); void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst, uint64_t offset, uint64_t size, unsigned value, unsigned flags); void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src, @@ -1539,6 +1545,7 @@ void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type, void *si_create_fixed_func_tcs(struct si_context *sctx); void *si_create_dma_compute_shader(struct pipe_context *ctx, unsigned num_dwords_per_thread, bool dst_stream_cache_policy, bool is_copy); +void *si_create_clear_buffer_rmw_cs(struct pipe_context *ctx); void *si_create_copy_image_compute_shader(struct pipe_context *ctx); void *si_create_copy_image_compute_shader_1d_array(struct pipe_context *ctx); void *si_create_dcc_decompress_cs(struct pipe_context *ctx); diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c index 8ada0ba..7c6f73c 100644 --- a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c +++ b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c @@ -212,6 +212,49 @@ void *si_create_dma_compute_shader(struct pipe_context *ctx, unsigned num_dwords return cs; } +/* Create a compute shader doing a masked read-modify-write buffer clear: dst = (dst & user_data.y) | user_data.x. 
*/ +void *si_create_clear_buffer_rmw_cs(struct pipe_context *ctx) +{ + const char *text = "COMP\n" + "PROPERTY CS_FIXED_BLOCK_WIDTH 64\n" + "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n" + "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" + "PROPERTY CS_USER_DATA_COMPONENTS_AMD 2\n" + "DCL SV[0], THREAD_ID\n" + "DCL SV[1], BLOCK_ID\n" + "DCL SV[2], CS_USER_DATA_AMD\n" + "DCL BUFFER[0]\n" + "DCL TEMP[0..1]\n" + "IMM[0] UINT32 {64, 16, 0, 0}\n" + /* ADDRESS = BLOCK_ID * 64 + THREAD_ID; (64 is IMM[0].x, the same value as CS_FIXED_BLOCK_WIDTH) */ + "UMAD TEMP[0].x, SV[1].xxxx, IMM[0].xxxx, SV[0].xxxx\n" + /* ADDRESS = ADDRESS * 16; (byte offset, loading one vec4 per thread) */ + "UMUL TEMP[0].x, TEMP[0].xxxx, IMM[0].yyyy\n" + "LOAD TEMP[1], BUFFER[0], TEMP[0].xxxx\n" + /* DATA &= inverted_writemask; (user data component y) */ + "AND TEMP[1], TEMP[1], SV[2].yyyy\n" + /* DATA |= clear_value_masked; (user data component x) */ + "OR TEMP[1], TEMP[1], SV[2].xxxx\n" + "STORE BUFFER[0].xyzw, TEMP[0], TEMP[1]%s\n" + "END\n"; + char final_text[2048]; + struct tgsi_token tokens[1024]; + struct pipe_compute_state state = {0}; + + snprintf(final_text, sizeof(final_text), text, + SI_COMPUTE_DST_CACHE_POLICY != L2_LRU ? ", STREAM_CACHE_POLICY" : ""); /* substituted for the %s that ends the STORE line */ + + if (!tgsi_text_translate(final_text, tokens, ARRAY_SIZE(tokens))) { + assert(false); /* the built-in shader text did not translate */ + return NULL; + } + + state.ir_type = PIPE_SHADER_IR_TGSI; + state.prog = tokens; + + return ctx->create_compute_state(ctx, &state); +} + /* Create a compute shader that copies DCC from one buffer to another * where each DCC buffer has a different layout. * -- 2.7.4