From 69bc1180b77ae366e05298e892f3e83079414cd8 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?=
Date: Sun, 11 Jun 2023 18:37:26 -0400
Subject: [PATCH] radeonsi/gfx11: use SET_SH_REG_PAIRS_PACKED for compute by buffering reg writes

This is the compute portion of the work. It uses a separate buffer for
compute SH registers in si_context.

Reviewed-by: Pierre-Eric Pelloux-Prayer
Part-of:
---
Review notes (placed after the three-dash line, so they are not part of the
commit message):

  NOTE(review): this copy of the patch had its newlines collapsed. Line
  breaks, blank context lines and indentation below were re-derived from the
  hunk headers; verify against the original commit before applying.

  si_build_pm4.h
    * radeon_push_compute_sh_reg() appends one (register offset, value) entry
      per call: entry __i lands in pair __i / 2, slot __i % 2. With the
      32-element buffered_compute_sh_regs array added in si_pipe.h that gives
      room for 64 entries. The bound is checked by assert() only.
    * radeon_opt_push_compute_sh_reg() pushes only when the tracked register
      has no saved value or the saved value differs, then records the new
      value in tracked_regs.other_reg_saved_mask / other_reg_value.

  si_compute.c
    * On GFX11 the multi-register helpers (radeon_opt_set_sh_reg2/3,
      radeon_set_sh_reg_seq + radeon_emit*) are replaced by one push per
      register; the pre-GFX11 code paths are kept unchanged in the else
      branches.
    * The GFX11 branch of si_switch_compute_shader() no longer calls
      radeon_begin()/radeon_end(); it only fills the buffer.
    * si_emit_dispatch_packets() flushes the buffer with
      gfx11_emit_buffered_compute_sh_regs() between radeon_end() and
      radeon_begin_again(cs), ahead of the "if (info->indirect)" code.
    * NOTE(review): SI_TRACKED_COMPUTE_PGM_RSRC2,
      SI_TRACKED_COMPUTE_DISPATCH_SCRATCH_BASE_HI and
      SI_TRACKED_COMPUTE_NUM_THREAD_Y/_Z are used here but not defined by
      this patch; presumably they already exist in the tracked-register
      enum. Confirm.

  si_descriptors.c
    * si_emit_consecutive_shader_pointers() gains a "type" argument that is
      token-pasted into radeon_push_##type##_sh_reg, and the GFX11 condition
      no longer excludes R_00B900_COMPUTE_USER_DATA_0.
    * The compute bindless descriptor pointer is pushed into the compute
      buffer on GFX11 instead of going through
      radeon_emit_one_32bit_pointer().

  si_gfx_cs.c
    * si_begin_new_gfx_cs() asserts that the compute buffer is empty and
      resets its counter, mirroring the existing gfx buffer handling.

  si_state_draw.cpp / si_state.h
    * gfx11_emit_buffered_compute_sh_regs() is a wrapper around
      gfx11_emit_buffered_sh_regs_inline() and is compiled only when
      GFX_VER == 6; its prototype is added to si_state.h.

 src/gallium/drivers/radeonsi/si_build_pm4.h    | 17 +++++
 src/gallium/drivers/radeonsi/si_compute.c      | 94 ++++++++++++++++++--------
 src/gallium/drivers/radeonsi/si_descriptors.c  | 25 ++++---
 src/gallium/drivers/radeonsi/si_gfx_cs.c       |  2 +
 src/gallium/drivers/radeonsi/si_pipe.h         |  2 +
 src/gallium/drivers/radeonsi/si_state.h        |  1 +
 src/gallium/drivers/radeonsi/si_state_draw.cpp | 10 +++
 7 files changed, 112 insertions(+), 39 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_build_pm4.h b/src/gallium/drivers/radeonsi/si_build_pm4.h
index ab145e4..6dd4d3c 100644
--- a/src/gallium/drivers/radeonsi/si_build_pm4.h
+++ b/src/gallium/drivers/radeonsi/si_build_pm4.h
@@ -125,6 +125,13 @@
    sctx->buffered_gfx_sh_regs[__i / 2].reg_value[__i % 2] = value; \
 } while (0)
 
+#define radeon_push_compute_sh_reg(reg, value) do { \
+   unsigned __i = sctx->num_buffered_compute_sh_regs++; \
+   assert(__i / 2 < ARRAY_SIZE(sctx->buffered_compute_sh_regs)); \
+   sctx->buffered_compute_sh_regs[__i / 2].reg_offset[__i % 2] = ((reg) - SI_SH_REG_OFFSET) >> 2; \
+   sctx->buffered_compute_sh_regs[__i / 2].reg_value[__i % 2] = value; \
+} while (0)
+
 #define radeon_set_or_push_gfx_sh_reg(reg, value) do { \
    if (GFX_VERSION >= GFX11) { \
       radeon_push_gfx_sh_reg(reg, value); \
@@ -144,6 +151,16 @@
    } \
 } while (0)
 
+#define radeon_opt_push_compute_sh_reg(offset, reg, val) do { \
+   unsigned __value = val; \
+   if (((sctx->tracked_regs.other_reg_saved_mask >> (reg)) & 0x1) != 0x1 || \
+       sctx->tracked_regs.other_reg_value[reg] != __value) { \
+      radeon_push_compute_sh_reg(offset, __value); \
+      sctx->tracked_regs.other_reg_saved_mask |= BITFIELD64_BIT(reg); \
+      sctx->tracked_regs.other_reg_value[reg] = __value; \
+   } \
+} while (0)
+
 #define radeon_set_uconfig_reg_seq(reg, num, perfctr) do { \
    assert((reg) >= CIK_UCONFIG_REG_OFFSET && (reg) < CIK_UCONFIG_REG_END); \
    radeon_emit(PKT3(PKT3_SET_UCONFIG_REG, num, perfctr)); \
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index cede9b4..8dd6fd4 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -499,24 +499,24 @@ static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute
                              RADEON_USAGE_READ | RADEON_PRIO_SHADER_BINARY);
 
    if (sctx->gfx_level >= GFX11) {
-      radeon_begin(cs);
-      radeon_set_sh_reg(R_00B830_COMPUTE_PGM_LO, shader_va >> 8);
-      radeon_opt_set_sh_reg2(sctx, R_00B848_COMPUTE_PGM_RSRC1,
-                             SI_TRACKED_COMPUTE_PGM_RSRC1,
-                             config->rsrc1, rsrc2);
-      radeon_opt_set_sh_reg(sctx, R_00B8A0_COMPUTE_PGM_RSRC3,
-                            SI_TRACKED_COMPUTE_PGM_RSRC3,
-                            S_00B8A0_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)));
-      radeon_opt_set_sh_reg(sctx, R_00B860_COMPUTE_TMPRING_SIZE,
-                            SI_TRACKED_COMPUTE_TMPRING_SIZE, tmpring_size);
-
+      radeon_push_compute_sh_reg(R_00B830_COMPUTE_PGM_LO, shader_va >> 8);
+      radeon_opt_push_compute_sh_reg(R_00B848_COMPUTE_PGM_RSRC1,
+                                     SI_TRACKED_COMPUTE_PGM_RSRC1, config->rsrc1);
+      radeon_opt_push_compute_sh_reg(R_00B84C_COMPUTE_PGM_RSRC2,
+                                     SI_TRACKED_COMPUTE_PGM_RSRC2, rsrc2);
+      radeon_opt_push_compute_sh_reg(R_00B8A0_COMPUTE_PGM_RSRC3,
+                                     SI_TRACKED_COMPUTE_PGM_RSRC3,
+                                     S_00B8A0_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)));
+      radeon_opt_push_compute_sh_reg(R_00B860_COMPUTE_TMPRING_SIZE,
+                                     SI_TRACKED_COMPUTE_TMPRING_SIZE, tmpring_size);
       if (shader->scratch_bo) {
-         radeon_opt_set_sh_reg2(sctx, R_00B840_COMPUTE_DISPATCH_SCRATCH_BASE_LO,
-                                SI_TRACKED_COMPUTE_DISPATCH_SCRATCH_BASE_LO,
-                                sctx->compute_scratch_buffer->gpu_address >> 8,
-                                sctx->compute_scratch_buffer->gpu_address >> 40);
+         radeon_opt_push_compute_sh_reg(R_00B840_COMPUTE_DISPATCH_SCRATCH_BASE_LO,
+                                        SI_TRACKED_COMPUTE_DISPATCH_SCRATCH_BASE_LO,
+                                        sctx->compute_scratch_buffer->gpu_address >> 8);
+         radeon_opt_push_compute_sh_reg(R_00B844_COMPUTE_DISPATCH_SCRATCH_BASE_HI,
+                                        SI_TRACKED_COMPUTE_DISPATCH_SCRATCH_BASE_HI,
+                                        sctx->compute_scratch_buffer->gpu_address >> 40);
       }
-      radeon_end();
    } else {
       radeon_begin(cs);
       radeon_set_sh_reg(R_00B830_COMPUTE_PGM_LO, shader_va >> 8);
@@ -730,24 +730,39 @@ static void si_setup_nir_user_data(struct si_context *sctx, const struct pipe_gr
             }
             radeon_begin_again(cs);
          } else {
-            radeon_set_sh_reg_seq(grid_size_reg, 3);
-            radeon_emit(info->grid[0]);
-            radeon_emit(info->grid[1]);
-            radeon_emit(info->grid[2]);
+            if (sctx->gfx_level >= GFX11) {
+               radeon_push_compute_sh_reg(grid_size_reg, info->grid[0]);
+               radeon_push_compute_sh_reg(grid_size_reg + 4, info->grid[1]);
+               radeon_push_compute_sh_reg(grid_size_reg + 8, info->grid[2]);
+            } else {
+               radeon_set_sh_reg_seq(grid_size_reg, 3);
+               radeon_emit(info->grid[0]);
+               radeon_emit(info->grid[1]);
+               radeon_emit(info->grid[2]);
+            }
          }
       }
 
       if (sel->info.uses_variable_block_size) {
          uint32_t value = info->block[0] | (info->block[1] << 10) | (info->block[2] << 20);
 
-         radeon_set_sh_reg(block_size_reg, value);
+         if (sctx->gfx_level >= GFX11) {
+            radeon_push_compute_sh_reg(block_size_reg, value);
+         } else {
+            radeon_set_sh_reg(block_size_reg, value);
+         }
       }
 
       if (sel->info.base.cs.user_data_components_amd) {
          unsigned num = sel->info.base.cs.user_data_components_amd;
 
-         radeon_set_sh_reg_seq(cs_user_data_reg, num);
-         radeon_emit_array(sctx->cs_user_data, num);
+         if (sctx->gfx_level >= GFX11) {
+            for (unsigned i = 0; i < num; i++)
+               radeon_push_compute_sh_reg(cs_user_data_reg + i * 4, sctx->cs_user_data[i]);
+         } else {
+            radeon_set_sh_reg_seq(cs_user_data_reg, num);
+            radeon_emit_array(sctx->cs_user_data, num);
+         }
       }
       radeon_end();
    }
@@ -777,9 +792,15 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_
                                             sctx->cs_max_waves_per_sh,
                                             threadgroups_per_cu);
 
-   radeon_opt_set_sh_reg(sctx, R_00B854_COMPUTE_RESOURCE_LIMITS,
-                         SI_TRACKED_COMPUTE_RESOURCE_LIMITS,
-                         compute_resource_limits);
+   if (sctx->gfx_level >= GFX11) {
+      radeon_opt_push_compute_sh_reg(R_00B854_COMPUTE_RESOURCE_LIMITS,
+                                     SI_TRACKED_COMPUTE_RESOURCE_LIMITS,
+                                     compute_resource_limits);
+   } else {
+      radeon_opt_set_sh_reg(sctx, R_00B854_COMPUTE_RESOURCE_LIMITS,
+                            SI_TRACKED_COMPUTE_RESOURCE_LIMITS,
+                            compute_resource_limits);
+   }
 
    unsigned dispatch_initiator = S_00B800_COMPUTE_SHADER_EN(1) | S_00B800_FORCE_START_AT_000(1) |
                                  /* If the KMD allows it (there is a KMD hw register for it),
@@ -816,9 +837,24 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_
       num_threads[2] = S_00B824_NUM_THREAD_FULL(info->block[2]);
    }
 
-   radeon_opt_set_sh_reg3(sctx, R_00B81C_COMPUTE_NUM_THREAD_X,
-                          SI_TRACKED_COMPUTE_NUM_THREAD_X,
-                          num_threads[0], num_threads[1], num_threads[2]);
+   if (sctx->gfx_level >= GFX11) {
+      radeon_opt_push_compute_sh_reg(R_00B81C_COMPUTE_NUM_THREAD_X,
+                                     SI_TRACKED_COMPUTE_NUM_THREAD_X, num_threads[0]);
+      radeon_opt_push_compute_sh_reg(R_00B820_COMPUTE_NUM_THREAD_Y,
+                                     SI_TRACKED_COMPUTE_NUM_THREAD_Y, num_threads[1]);
+      radeon_opt_push_compute_sh_reg(R_00B824_COMPUTE_NUM_THREAD_Z,
+                                     SI_TRACKED_COMPUTE_NUM_THREAD_Z, num_threads[2]);
+   } else {
+      radeon_opt_set_sh_reg3(sctx, R_00B81C_COMPUTE_NUM_THREAD_X,
+                             SI_TRACKED_COMPUTE_NUM_THREAD_X,
+                             num_threads[0], num_threads[1], num_threads[2]);
+   }
+
+   if (sctx->gfx_level >= GFX11) {
+      radeon_end();
+      gfx11_emit_buffered_compute_sh_regs(sctx);
+      radeon_begin_again(cs);
+   }
 
    if (info->indirect) {
       uint64_t base_va = si_resource(info->indirect)->gpu_address;
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 8397e67..f738401 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -2155,17 +2155,17 @@ void si_shader_change_notify(struct si_context *sctx)
    }
 }
 
-#define si_emit_consecutive_shader_pointers(sctx, pointer_mask, sh_base) do { \
+#define si_emit_consecutive_shader_pointers(sctx, pointer_mask, sh_base, type) do { \
    unsigned sh_reg_base = (sh_base); \
    if (sh_reg_base) { \
       unsigned mask = sctx->shader_pointers_dirty & (pointer_mask); \
       \
-      if (sctx->gfx_level >= GFX11 && sh_reg_base != R_00B900_COMPUTE_USER_DATA_0) { \
+      if (sctx->gfx_level >= GFX11) { \
          u_foreach_bit(i, mask) { \
             struct si_descriptors *descs = &sctx->descriptors[i]; \
             unsigned sh_reg = sh_reg_base + descs->shader_userdata_offset; \
             \
-            radeon_push_gfx_sh_reg(sh_reg, descs->gpu_address); \
+            radeon_push_##type##_sh_reg(sh_reg, descs->gpu_address); \
          } \
       } else { \
          while (mask) { \
@@ -2231,15 +2231,15 @@ void si_emit_graphics_shader_pointers(struct si_context *sctx)
 
    radeon_begin(&sctx->gfx_cs);
    si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(VERTEX),
-                                       sh_base[PIPE_SHADER_VERTEX]);
+                                       sh_base[PIPE_SHADER_VERTEX], gfx);
    si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(TESS_EVAL),
-                                       sh_base[PIPE_SHADER_TESS_EVAL]);
+                                       sh_base[PIPE_SHADER_TESS_EVAL], gfx);
    si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(FRAGMENT),
-                                       sh_base[PIPE_SHADER_FRAGMENT]);
+                                       sh_base[PIPE_SHADER_FRAGMENT], gfx);
    si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(TESS_CTRL),
-                                       sh_base[PIPE_SHADER_TESS_CTRL]);
+                                       sh_base[PIPE_SHADER_TESS_CTRL], gfx);
    si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(GEOMETRY),
-                                       sh_base[PIPE_SHADER_GEOMETRY]);
+                                       sh_base[PIPE_SHADER_GEOMETRY], gfx);
 
    if (sctx->gs_attribute_ring_pointer_dirty) {
       assert(sctx->gfx_level >= GFX11);
@@ -2266,11 +2266,16 @@ void si_emit_compute_shader_pointers(struct si_context *sctx)
 
    radeon_begin(cs);
    si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(COMPUTE),
-                                       R_00B900_COMPUTE_USER_DATA_0);
+                                       R_00B900_COMPUTE_USER_DATA_0, compute);
    sctx->shader_pointers_dirty &= ~SI_DESCS_SHADER_MASK(COMPUTE);
 
    if (sctx->compute_bindless_pointer_dirty) {
-      radeon_emit_one_32bit_pointer(sctx, &sctx->bindless_descriptors, base);
+      if (sctx->gfx_level >= GFX11) {
+         radeon_push_compute_sh_reg(base + sctx->bindless_descriptors.shader_userdata_offset,
+                                    sctx->bindless_descriptors.gpu_address);
+      } else {
+         radeon_emit_one_32bit_pointer(sctx, &sctx->bindless_descriptors, base);
+      }
       sctx->compute_bindless_pointer_dirty = false;
    }
 
diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c
index 4aca0e0..bec1c1e 100644
--- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
+++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
@@ -546,7 +546,9 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
    ctx->last_num_tcs_input_cp = -1;
 
    assert(ctx->num_buffered_gfx_sh_regs == 0);
+   assert(ctx->num_buffered_compute_sh_regs == 0);
    ctx->num_buffered_gfx_sh_regs = 0;
+   ctx->num_buffered_compute_sh_regs = 0;
 
    if (ctx->scratch_buffer) {
       si_context_add_resource_size(ctx, &ctx->scratch_buffer->b.b);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 8805508..3a8e1af 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1034,6 +1034,8 @@ struct si_context {
    /* Gfx11+: Buffered SH registers for SET_SH_REG_PAIRS_PACKED*. */
    unsigned num_buffered_gfx_sh_regs;
    struct si_sh_reg_pair buffered_gfx_sh_regs[32];
+   unsigned num_buffered_compute_sh_regs;
+   struct si_sh_reg_pair buffered_compute_sh_regs[32];
 
    /* Atom declarations. */
    struct si_framebuffer framebuffer;
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index f7d7489..0abc887 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -624,6 +624,7 @@ void si_cp_dma_prefetch(struct si_context *sctx, struct pipe_resource *buf,
 void si_set_vertex_buffer_descriptor(struct si_screen *sscreen, struct si_vertex_elements *velems,
                                      struct pipe_vertex_buffer *vb, unsigned element_index,
                                      uint32_t *out);
+void gfx11_emit_buffered_compute_sh_regs(struct si_context *sctx);
 void si_init_draw_functions_GFX6(struct si_context *sctx);
 void si_init_draw_functions_GFX7(struct si_context *sctx);
 void si_init_draw_functions_GFX8(struct si_context *sctx);
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp
index c7dcfae..d388182 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
@@ -1517,6 +1517,16 @@ gfx11_emit_buffered_sh_regs_inline(struct si_context *sctx, unsigned *num_regs,
    radeon_end();
 }
 
+#if GFX_VER == 6 /* declare this function only once because there is only one variant. */
+
+void gfx11_emit_buffered_compute_sh_regs(struct si_context *sctx)
+{
+   gfx11_emit_buffered_sh_regs_inline(sctx, &sctx->num_buffered_compute_sh_regs,
+                                      sctx->buffered_compute_sh_regs);
+}
+
+#endif
+
 #define EMIT_SQTT_END_DRAW \
 do { \
    if (GFX_VERSION >= GFX9 && unlikely(sctx->sqtt_enabled)) { \
-- 
2.7.4