From b42a4a7f07c363ad55c54fdbddf7cdaa569c57ac Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Mon, 18 Jul 2022 20:51:20 -0400 Subject: [PATCH] radeonsi: remove compute-based DCC decompression because it's broken The new blit test discovered that it doesn't always work. Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_blit.c | 53 ++----------- src/gallium/drivers/radeonsi/si_compute_blit.c | 94 ++++++++---------------- src/gallium/drivers/radeonsi/si_pipe.c | 2 - src/gallium/drivers/radeonsi/si_pipe.h | 5 +- src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c | 39 ---------- 5 files changed, 37 insertions(+), 156 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c index 26511dc..59c0cda 100644 --- a/src/gallium/drivers/radeonsi/si_blit.c +++ b/src/gallium/drivers/radeonsi/si_blit.c @@ -464,7 +464,6 @@ static void si_blit_decompress_color(struct si_context *sctx, struct si_texture first_level, last_level, level_mask); if (need_dcc_decompress) { - assert(sctx->gfx_level == GFX8 || tex->buffer.b.b.nr_storage_samples >= 2); custom_blend = sctx->custom_blend_dcc_decompress; assert(vi_dcc_enabled(tex, first_level)); @@ -971,7 +970,7 @@ void si_resource_copy_region(struct pipe_context *ctx, struct pipe_resource *dst si_can_use_compute_blit(sctx, src->format, src->nr_samples, false, vi_dcc_enabled(ssrc, src_level)))) { si_compute_copy_image(sctx, dst, dst_level, src, src_level, dstx, dsty, dstz, - src_box, false, SI_OP_SYNC_BEFORE_AFTER); + src_box, SI_OP_SYNC_BEFORE_AFTER); return; } @@ -1247,7 +1246,7 @@ static void si_blit(struct pipe_context *ctx, const struct pipe_blit_info *info) if (sscreen->async_compute_context) { si_compute_copy_image((struct si_context*)sctx->screen->async_compute_context, info->dst.resource, 0, info->src.resource, 0, 0, 0, 0, - &info->src.box, false, 0); + &info->src.box, 0); si_flush_gfx_cs((struct si_context*)sctx->screen->async_compute_context, 0, NULL); simple_mtx_unlock(&sscreen->async_compute_context_lock); return; @@ -1354,53 +1353,11 @@ void si_decompress_dcc(struct si_context *sctx, struct si_texture *tex) /* If graphics is disabled, we can't decompress DCC, but it shouldn't * be compressed either. The caller should simply discard it. */ - if (!tex->surface.meta_offset || !sctx->has_graphics || sctx->in_dcc_decompress) + if (!tex->surface.meta_offset || !sctx->has_graphics) return; - sctx->in_dcc_decompress = true; - - if (sctx->gfx_level == GFX8 || tex->buffer.b.b.nr_storage_samples >= 2) { - si_blit_decompress_color(sctx, tex, 0, tex->buffer.b.b.last_level, 0, - util_max_layer(&tex->buffer.b.b, 0), true, false); - } else { - struct pipe_resource *ptex = &tex->buffer.b.b; - assert(ptex->nr_storage_samples <= 1); - - /* DCC decompression using a compute shader. */ - for (unsigned level = 0; level < tex->surface.num_meta_levels; level++) { - struct pipe_box box; - - u_box_3d(0, 0, 0, u_minify(ptex->width0, level), - u_minify(ptex->height0, level), - util_num_layers(ptex, level), &box); - si_compute_copy_image(sctx, ptex, level, ptex, level, 0, 0, 0, &box, true, - /* Sync before the first copy and after the last copy */ - (level == 0 ? SI_OP_SYNC_BEFORE : 0) | - (level == tex->surface.num_meta_levels - 1 ? SI_OP_SYNC_AFTER : 0)); - } - - /* Now clear DCC metadata to uncompressed. - * - * This uses SI_COMPUTE_CLEAR_METHOD to avoid a failure when running this - * deqp caselist on gfx10: - * dEQP-GLES31.functional.image_load_store.2d.format_reinterpret.rgba32f_rgba32ui - * dEQP-GLES31.functional.image_load_store.2d.format_reinterpret.rgba32f_rgba32i - */ - uint32_t clear_value = DCC_UNCOMPRESSED; - si_clear_buffer(sctx, ptex, tex->surface.meta_offset, - tex->surface.meta_size, &clear_value, 4, SI_OP_SYNC_AFTER, - SI_COHERENCY_CB_META, SI_COMPUTE_CLEAR_METHOD); - si_mark_display_dcc_dirty(sctx, tex); - - /* Clearing DCC metadata requires flushing L2 and invalidating L2 metadata to make - * the metadata visible to L2 caches. This is because clear_buffer uses plain stores - * that can go to different L2 channels than where L2 metadata caches expect them. - * This is not done for fast clears because plain stores are visible to CB/DB. Only - * L2 metadata caches have the problem. - */ - sctx->flags |= SI_CONTEXT_WB_L2 | SI_CONTEXT_INV_L2_METADATA; - } - sctx->in_dcc_decompress = false; + si_blit_decompress_color(sctx, tex, 0, tex->buffer.b.b.last_level, 0, + util_max_layer(&tex->buffer.b.b, 0), true, false); } void si_init_blit_functions(struct si_context *sctx) diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c index 95fe540d..ecc83e9 100644 --- a/src/gallium/drivers/radeonsi/si_compute_blit.c +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c @@ -543,9 +543,8 @@ static void si_launch_grid_internal_images(struct si_context *sctx, void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, unsigned dst_level, struct pipe_resource *src, unsigned src_level, unsigned dstx, unsigned dsty, unsigned dstz, const struct pipe_box *src_box, - bool is_dcc_decompress, unsigned flags) + unsigned flags) { - struct pipe_context *ctx = &sctx->b; struct si_texture *ssrc = (struct si_texture*)src; struct si_texture *sdst = (struct si_texture*)dst; enum pipe_format src_format = util_format_linear(src->format); @@ -652,75 +651,44 @@ void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, u image[1].u.tex.first_layer = 0; image[1].u.tex.last_layer = util_max_layer(dst, dst_level); - if (is_dcc_decompress) - image[1].access |= SI_IMAGE_ACCESS_DCC_OFF; - struct pipe_grid_info info = {0}; - if (is_dcc_decompress) { - /* The DCC decompression is a normal blit where the load is compressed - * and the store is uncompressed. The workgroup size is either equal to - * the DCC block size or a multiple thereof. The shader uses a barrier - * between loads and stores to safely overwrite each DCC block of pixels. - */ - assert(src == dst); - assert(dst->target != PIPE_TEXTURE_1D && dst->target != PIPE_TEXTURE_1D_ARRAY); - - if (!sctx->cs_dcc_decompress) - sctx->cs_dcc_decompress = si_create_dcc_decompress_cs(ctx); - - unsigned block_x = ssrc->surface.u.gfx9.color.dcc_block_width; - unsigned block_y = ssrc->surface.u.gfx9.color.dcc_block_height; - unsigned block_z = ssrc->surface.u.gfx9.color.dcc_block_depth; - - unsigned default_wave_size = si_determine_wave_size(sctx->screen, NULL);; - - /* Make sure the block size is at least the same as wave size. */ - while (block_x * block_y * block_z < default_wave_size) { - block_x *= 2; - } - - set_work_size(&info, block_x, block_y, block_z, src_box->width, src_box->height, src_box->depth); - - si_launch_grid_internal_images(sctx, image, 2, &info, sctx->cs_dcc_decompress, flags); + bool dst_is_1d = dst->target == PIPE_TEXTURE_1D || + dst->target == PIPE_TEXTURE_1D_ARRAY; + bool src_is_1d = src->target == PIPE_TEXTURE_1D || + src->target == PIPE_TEXTURE_1D_ARRAY; + int block_x, block_y; + int block_z = 1; + + /* Choose the block dimensions based on the copy area size. */ + if (src_box->height <= 4) { + block_y = util_next_power_of_two(src_box->height); + block_x = 64 / block_y; + } else if (src_box->width <= 4) { + block_x = util_next_power_of_two(src_box->width); + block_y = 64 / block_x; + } else if (is_linear) { + block_x = 64; + block_y = 1; } else { - bool dst_is_1d = dst->target == PIPE_TEXTURE_1D || - dst->target == PIPE_TEXTURE_1D_ARRAY; - bool src_is_1d = src->target == PIPE_TEXTURE_1D || - src->target == PIPE_TEXTURE_1D_ARRAY; - int block_x, block_y; - int block_z = 1; - - /* Choose the block dimensions based on the copy area size. */ - if (src_box->height <= 4) { - block_y = util_next_power_of_two(src_box->height); - block_x = 64 / block_y; - } else if (src_box->width <= 4) { - block_x = util_next_power_of_two(src_box->width); - block_y = 64 / block_x; - } else if (is_linear) { - block_x = 64; - block_y = 1; - } else { - block_x = 8; - block_y = 8; - } + block_x = 8; + block_y = 8; + } - sctx->cs_user_data[0] = src_box->x | (dstx << 16); - sctx->cs_user_data[1] = src_box->y | (dsty << 16); - sctx->cs_user_data[2] = src_box->z | (dstz << 16); + sctx->cs_user_data[0] = src_box->x | (dstx << 16); + sctx->cs_user_data[1] = src_box->y | (dsty << 16); + sctx->cs_user_data[2] = src_box->z | (dstz << 16); - set_work_size(&info, block_x, block_y, block_z, - src_box->width, src_box->height, src_box->depth); + set_work_size(&info, block_x, block_y, block_z, + src_box->width, src_box->height, src_box->depth); - void **copy_image_cs_ptr = &sctx->cs_copy_image[src_is_1d][dst_is_1d]; - if (!*copy_image_cs_ptr) - *copy_image_cs_ptr = si_create_copy_image_cs(sctx, src_is_1d, dst_is_1d); + void **copy_image_cs_ptr = &sctx->cs_copy_image[src_is_1d][dst_is_1d]; + if (!*copy_image_cs_ptr) + *copy_image_cs_ptr = si_create_copy_image_cs(sctx, src_is_1d, dst_is_1d); - assert(*copy_image_cs_ptr); + assert(*copy_image_cs_ptr); - si_launch_grid_internal_images(sctx, image, 2, &info, *copy_image_cs_ptr, flags); - } + si_launch_grid_internal_images(sctx, image, 2, &info, *copy_image_cs_ptr, flags); } void si_retile_dcc(struct si_context *sctx, struct si_texture *tex) diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index f82f8d9..2cbaa9b 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -273,8 +273,6 @@ static void si_destroy_context(struct pipe_context *context) sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_render_target_1d_array); if (sctx->cs_clear_12bytes_buffer) sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_12bytes_buffer); - if (sctx->cs_dcc_decompress) - sctx->b.delete_compute_state(&sctx->b, sctx->cs_dcc_decompress); for (unsigned i = 0; i < ARRAY_SIZE(sctx->cs_dcc_retile); i++) { if (sctx->cs_dcc_retile[i]) sctx->b.delete_compute_state(&sctx->b, sctx->cs_dcc_retile[i]); diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 5ddc271..f833c0b 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -984,7 +984,6 @@ struct si_context { void *cs_clear_render_target; void *cs_clear_render_target_1d_array; void *cs_clear_12bytes_buffer; - void *cs_dcc_decompress; void *cs_dcc_retile[32]; void *cs_fmask_expand[3][2]; /* [log2(samples)-1][is_array] */ struct si_screen *screen; @@ -998,7 +997,6 @@ struct si_context { bool blitter_running; bool in_update_ps_colorbuf0_slot; - bool in_dcc_decompress; bool is_noop:1; bool has_graphics:1; bool gfx_flush_in_progress : 1; @@ -1428,7 +1426,7 @@ void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct p void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, unsigned dst_level, struct pipe_resource *src, unsigned src_level, unsigned dstx, unsigned dsty, unsigned dstz, const struct pipe_box *src_box, - bool is_dcc_decompress, unsigned flags); + unsigned flags); void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surface *dstsurf, const union pipe_color_union *color, unsigned dstx, unsigned dsty, unsigned width, unsigned height, @@ -1554,7 +1552,6 @@ void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type, void *si_create_dma_compute_shader(struct pipe_context *ctx, unsigned num_dwords_per_thread, bool dst_stream_cache_policy, bool is_copy); void *si_create_clear_buffer_rmw_cs(struct si_context *sctx); -void *si_create_dcc_decompress_cs(struct pipe_context *ctx); void *si_clear_render_target_shader(struct pipe_context *ctx); void *si_clear_render_target_shader_1d_array(struct pipe_context *ctx); void *si_clear_12bytes_buffer_shader(struct pipe_context *ctx); diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c index 0cc5348..821def5 100644 --- a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c +++ b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c @@ -402,45 +402,6 @@ void *si_create_query_result_cs(struct si_context *sctx) return sctx->b.create_compute_state(&sctx->b, &state); } -/* Create a compute shader implementing DCC decompression via a blit. - * This is a trivial copy_image shader except that it has a variable block - * size and a barrier. - */ -void *si_create_dcc_decompress_cs(struct pipe_context *ctx) -{ - static const char text[] = - "COMP\n" - "DCL SV[0], THREAD_ID\n" - "DCL SV[1], BLOCK_ID\n" - "DCL SV[2], BLOCK_SIZE\n" - "DCL IMAGE[0], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n" - "DCL IMAGE[1], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n" - "DCL TEMP[0..1]\n" - - "UMAD TEMP[0].xyz, SV[1].xyzz, SV[2].xyzz, SV[0].xyzz\n" - "LOAD TEMP[1], IMAGE[0], TEMP[0].xyzz, 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n" - /* Wait for the whole threadgroup (= DCC block) to load texels before - * overwriting them, because overwriting any pixel within a DCC block - * can break compression for the whole block. - */ - "BARRIER\n" - "STORE IMAGE[1], TEMP[0].xyzz, TEMP[1], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n" - "END\n"; - - struct tgsi_token tokens[1024]; - struct pipe_compute_state state = {0}; - - if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) { - assert(false); - return NULL; - } - - state.ir_type = PIPE_SHADER_IR_TGSI; - state.prog = tokens; - - return ctx->create_compute_state(ctx, &state); -} - void *si_clear_render_target_shader(struct pipe_context *ctx) { static const char text[] = -- 2.7.4