From 3952b89ebb80d10cbcfa2ef30255e204782c0ba8 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sun, 6 Aug 2023 22:15:42 -0400 Subject: [PATCH] radeonsi: don't use threadID.yz/blockID.yz for copy_image if those are always 0 This can improve performance because fewer VGPRs and SGPRs need to be initialized. Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_compute_blit.c | 13 ++++++++----- src/gallium/drivers/radeonsi/si_pipe.c | 6 ++++-- src/gallium/drivers/radeonsi/si_pipe.h | 5 +++-- src/gallium/drivers/radeonsi/si_shaderlib_nir.c | 12 +++++++++--- 4 files changed, 24 insertions(+), 12 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c index 9a4af15..571fbbb 100644 --- a/src/gallium/drivers/radeonsi/si_compute_blit.c +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c @@ -537,7 +537,7 @@ void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct p } } -static void +static unsigned set_work_size(struct pipe_grid_info *info, unsigned block_x, unsigned block_y, unsigned block_z, unsigned work_x, unsigned work_y, unsigned work_z) { @@ -550,6 +550,8 @@ set_work_size(struct pipe_grid_info *info, unsigned block_x, unsigned block_y, u info->last_block[i] = work[i] % info->block[i]; info->grid[i] = DIV_ROUND_UP(work[i], info->block[i]); } + + return work_z > 1 ? 3 : (work_y > 1 ? 2 : 1); } static void si_launch_grid_internal_images(struct si_context *sctx, @@ -764,12 +766,13 @@ bool si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, u sctx->cs_user_data[1] = src_box->y | (dsty << 16); sctx->cs_user_data[2] = src_box->z | (dstz << 16); - set_work_size(&info, block_x, block_y, block_z, - src_box->width, src_box->height, src_box->depth); + unsigned wg_dim = + set_work_size(&info, block_x, block_y, block_z, + src_box->width, src_box->height, src_box->depth); - void **copy_image_cs_ptr = &sctx->cs_copy_image[src_is_1d][dst_is_1d]; + void **copy_image_cs_ptr = &sctx->cs_copy_image[wg_dim - 1][src_is_1d][dst_is_1d]; if (!*copy_image_cs_ptr) - *copy_image_cs_ptr = si_create_copy_image_cs(sctx, src_is_1d, dst_is_1d); + *copy_image_cs_ptr = si_create_copy_image_cs(sctx, wg_dim, src_is_1d, dst_is_1d); assert(*copy_image_cs_ptr); diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index dcfdad9..e2c6a35 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -259,8 +259,10 @@ static void si_destroy_context(struct pipe_context *context) sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_buffer); for (unsigned i = 0; i < ARRAY_SIZE(sctx->cs_copy_image); i++) { for (unsigned j = 0; j < ARRAY_SIZE(sctx->cs_copy_image[i]); j++) { - if (sctx->cs_copy_image[i][j]) - sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image[i][j]); + for (unsigned k = 0; k < ARRAY_SIZE(sctx->cs_copy_image[i][j]); k++) { + if (sctx->cs_copy_image[i][j][k]) + sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image[i][j][k]); + } } } if (sctx->cs_clear_render_target) diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index eb25910..1cf4798 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1006,7 +1006,7 @@ struct si_context { void *cs_clear_buffer; void *cs_clear_buffer_rmw; void *cs_copy_buffer; - void *cs_copy_image[2][2]; /* [src_is_1d][dst_is_1d] */ + void *cs_copy_image[3][2][2]; /* [wg_dim-1][src_is_1d][dst_is_1d] */ void *cs_clear_render_target; void *cs_clear_render_target_1d_array; void *cs_clear_12bytes_buffer; @@ -1582,7 +1582,8 @@ void si_suspend_queries(struct si_context *sctx); void si_resume_queries(struct si_context *sctx); /* si_shaderlib_nir.c */ -void *si_create_copy_image_cs(struct si_context *sctx, bool src_is_1d_array, bool dst_is_1d_array); +void *si_create_copy_image_cs(struct si_context *sctx, unsigned wg_dim, + bool src_is_1d_array, bool dst_is_1d_array); void *si_create_dcc_retile_cs(struct si_context *sctx, struct radeon_surf *surf); void *gfx9_create_clear_dcc_msaa_cs(struct si_context *sctx, struct si_texture *tex); void *si_create_passthrough_tcs(struct si_context *sctx); diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c index ca2f631..1a400cd 100644 --- a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c +++ b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c @@ -73,7 +73,8 @@ deref_ssa(nir_builder *b, nir_variable *var) * It expects the source and destination (x,y,z) coords as user_data_amd, * packed into 3 SGPRs as 2x16bits per component. */ -void *si_create_copy_image_cs(struct si_context *sctx, bool src_is_1d_array, bool dst_is_1d_array) +void *si_create_copy_image_cs(struct si_context *sctx, unsigned wg_dim, + bool src_is_1d_array, bool dst_is_1d_array) { const nir_shader_compiler_options *options = sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE); @@ -87,14 +88,19 @@ void *si_create_copy_image_cs(struct si_context *sctx, bool src_is_1d_array, boo b.shader->info.workgroup_size_variable = true; b.shader->info.cs.user_data_components_amd = 3; - nir_def *ids = get_global_ids(&b, 3); + nir_def *ids = nir_pad_vector_imm_int(&b, get_global_ids(&b, wg_dim), 0, 3); nir_def *coord_src = NULL, *coord_dst = NULL; - unpack_2x16(&b, nir_load_user_data_amd(&b), &coord_src, &coord_dst); + unpack_2x16(&b, nir_trim_vector(&b, nir_load_user_data_amd(&b), 3), + &coord_src, &coord_dst); coord_src = nir_iadd(&b, coord_src, ids); coord_dst = nir_iadd(&b, coord_dst, ids); + /* Coordinates must have 4 channels in NIR. */ + coord_src = nir_pad_vector(&b, coord_src, 4); + coord_dst = nir_pad_vector(&b, coord_dst, 4); + static unsigned swizzle_xz[] = {0, 2, 0, 0}; if (src_is_1d_array) -- 2.7.4