From 54ebd9073988ea22e0bc67c02b156bc581e1e497 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Wed, 22 Feb 2023 06:38:00 -0500 Subject: [PATCH] radeonsi: merge si_emit_initial_compute_regs with si_init_cs_preamble_state It's better to set all immutable registers in one place. Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_compute.c | 103 --------------------- src/gallium/drivers/radeonsi/si_cp_reg_shadowing.c | 5 +- src/gallium/drivers/radeonsi/si_gfx_cs.c | 21 ++--- src/gallium/drivers/radeonsi/si_pipe.c | 4 +- src/gallium/drivers/radeonsi/si_pipe.h | 1 - src/gallium/drivers/radeonsi/si_state.c | 86 +++++++++++++++-- 6 files changed, 93 insertions(+), 127 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index d8491fb..7aff257 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -379,102 +379,6 @@ static void si_set_global_binding(struct pipe_context *ctx, unsigned first, unsi } } -void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf *cs) -{ - const struct radeon_info *info = &sctx->screen->info; - - radeon_begin(cs); - radeon_set_sh_reg(R_00B834_COMPUTE_PGM_HI, - S_00B834_DATA(sctx->screen->info.address32_hi >> 8)); - - radeon_set_sh_reg_seq(R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, 2); - /* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1, - * renamed COMPUTE_DESTINATION_EN_SEn on gfx10. */ - radeon_emit(S_00B858_SH0_CU_EN(info->spi_cu_en) | S_00B858_SH1_CU_EN(info->spi_cu_en)); - radeon_emit(S_00B858_SH0_CU_EN(info->spi_cu_en) | S_00B858_SH1_CU_EN(info->spi_cu_en)); - - if (sctx->gfx_level == GFX6) { - /* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID - * and is now per pipe, so it should be handled in the - * kernel if we want to use something other than the default value. - * - * TODO: This should be: - * (number of compute units) * 4 * (waves per simd) - 1 - */ - radeon_set_sh_reg(R_00B82C_COMPUTE_MAX_WAVE_ID, 0x190 /* Default value */); - radeon_set_config_reg(R_00950C_TA_CS_BC_BASE_ADDR, sctx->border_color_buffer->gpu_address >> 8); - } - - if (sctx->gfx_level >= GFX7) { - /* Also set R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE2 / SE3 */ - radeon_set_sh_reg_seq(R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, 2); - radeon_emit(S_00B858_SH0_CU_EN(info->spi_cu_en) | S_00B858_SH1_CU_EN(info->spi_cu_en)); - radeon_emit(S_00B858_SH0_CU_EN(info->spi_cu_en) | S_00B858_SH1_CU_EN(info->spi_cu_en)); - - /* Disable profiling on compute queues. */ - if (cs != &sctx->gfx_cs || !sctx->screen->info.has_graphics) { - radeon_set_sh_reg(R_00B82C_COMPUTE_PERFCOUNT_ENABLE, 0); - radeon_set_sh_reg(R_00B878_COMPUTE_THREAD_TRACE_ENABLE, 0); - } - - /* Set the pointer to border colors. */ - /* MI200 doesn't support border colors. */ - if (sctx->border_color_buffer) { - uint64_t bc_va = sctx->border_color_buffer->gpu_address; - - radeon_set_uconfig_reg_seq(R_030E00_TA_CS_BC_BASE_ADDR, 2, false); - radeon_emit(bc_va >> 8); /* R_030E00_TA_CS_BC_BASE_ADDR */ - radeon_emit(S_030E04_ADDRESS(bc_va >> 40)); /* R_030E04_TA_CS_BC_BASE_ADDR_HI */ - } - } - - /* cs_preamble_state initializes this for the gfx queue, so only do this - * if we are on a compute queue. - */ - if (sctx->gfx_level >= GFX9 && sctx->gfx_level < GFX11 && - (cs != &sctx->gfx_cs || !sctx->screen->info.has_graphics)) { - radeon_set_uconfig_reg(R_0301EC_CP_COHER_START_DELAY, - sctx->gfx_level >= GFX10 ? 0x20 : 0); - } - - if (!info->has_graphics && info->family >= CHIP_MI100) { - radeon_set_sh_reg_seq(R_00B894_COMPUTE_STATIC_THREAD_MGMT_SE4, 4); - radeon_emit(S_00B858_SH0_CU_EN(info->spi_cu_en) | S_00B858_SH1_CU_EN(info->spi_cu_en)); - radeon_emit(S_00B858_SH0_CU_EN(info->spi_cu_en) | S_00B858_SH1_CU_EN(info->spi_cu_en)); - radeon_emit(S_00B858_SH0_CU_EN(info->spi_cu_en) | S_00B858_SH1_CU_EN(info->spi_cu_en)); - radeon_emit(S_00B858_SH0_CU_EN(info->spi_cu_en) | S_00B858_SH1_CU_EN(info->spi_cu_en)); - } - - if (sctx->gfx_level >= GFX10) { - radeon_set_sh_reg_seq(R_00B890_COMPUTE_USER_ACCUM_0, 4); - radeon_emit(0); /* R_00B890_COMPUTE_USER_ACCUM_0 */ - radeon_emit(0); /* R_00B894_COMPUTE_USER_ACCUM_1 */ - radeon_emit(0); /* R_00B898_COMPUTE_USER_ACCUM_2 */ - radeon_emit(0); /* R_00B89C_COMPUTE_USER_ACCUM_3 */ - - radeon_set_sh_reg(R_00B9F4_COMPUTE_DISPATCH_TUNNEL, 0); - - if (sctx->gfx_level < GFX11) - radeon_set_sh_reg(R_00B8A0_COMPUTE_PGM_RSRC3, 0); - } - - if (sctx->gfx_level >= GFX11) { - radeon_set_sh_reg_seq(R_00B8AC_COMPUTE_STATIC_THREAD_MGMT_SE4, 4); - radeon_emit(S_00B8AC_SA0_CU_EN(info->spi_cu_en) | S_00B8AC_SA1_CU_EN(info->spi_cu_en)); /* SE4 */ - radeon_emit(S_00B8AC_SA0_CU_EN(info->spi_cu_en) | S_00B8AC_SA1_CU_EN(info->spi_cu_en)); /* SE5 */ - radeon_emit(S_00B8AC_SA0_CU_EN(info->spi_cu_en) | S_00B8AC_SA1_CU_EN(info->spi_cu_en)); /* SE6 */ - radeon_emit(S_00B8AC_SA0_CU_EN(info->spi_cu_en) | S_00B8AC_SA1_CU_EN(info->spi_cu_en)); /* SE7 */ - - /* How many threads should go to 1 SE before moving onto the next. Think of GL1 cache hits. - * Only these values are valid: 0 (disabled), 64, 128, 256, 512 - * Recommendation: 64 = RT, 256 = non-RT (run benchmarks to be sure) - */ - radeon_set_sh_reg(R_00B8BC_COMPUTE_DISPATCH_INTERLEAVE, S_00B8BC_INTERLEAVE(256)); - } - - radeon_end(); -} - static bool si_setup_compute_scratch_buffer(struct si_context *sctx, struct si_shader *shader) { uint64_t scratch_bo_size, scratch_needed; @@ -1019,13 +923,6 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info if (sctx->bo_list_add_all_compute_resources) si_compute_resources_add_all_to_bo_list(sctx); - if (!sctx->cs_shader_state.initialized) { - si_emit_initial_compute_regs(sctx, &sctx->gfx_cs); - - sctx->cs_shader_state.emitted_program = NULL; - sctx->cs_shader_state.initialized = true; - } - /* First emit registers. */ bool prefetch; if (!si_switch_compute_shader(sctx, program, &program->shader, code_object, info->pc, &prefetch, diff --git a/src/gallium/drivers/radeonsi/si_cp_reg_shadowing.c b/src/gallium/drivers/radeonsi/si_cp_reg_shadowing.c index 9b5c15ab..ecdbc5e 100644 --- a/src/gallium/drivers/radeonsi/si_cp_reg_shadowing.c +++ b/src/gallium/drivers/radeonsi/si_cp_reg_shadowing.c @@ -38,8 +38,9 @@ static void si_set_context_reg_array(struct radeon_cmdbuf *cs, unsigned reg, uns void si_init_cp_reg_shadowing(struct si_context *sctx) { - if (sctx->screen->info.mid_command_buffer_preemption_enabled || - sctx->screen->debug_flags & DBG(SHADOW_REGS)) { + if (sctx->has_graphics && + (sctx->screen->info.mid_command_buffer_preemption_enabled || + sctx->screen->debug_flags & DBG(SHADOW_REGS))) { sctx->shadowed_regs = si_aligned_buffer_create(sctx->b.screen, PIPE_RESOURCE_FLAG_UNMAPPABLE | SI_RESOURCE_FLAG_DRIVER_INTERNAL, diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c index a400789..e6e94ce 100644 --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c @@ -423,9 +423,17 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs) } si_add_all_descriptors_to_bo_list(ctx); - si_shader_pointers_mark_dirty(ctx); - ctx->cs_shader_state.initialized = false; + ctx->cs_shader_state.emitted_program = NULL; + + /* The CS initialization should be emitted before everything else. */ + if (ctx->cs_preamble_state) { + struct si_pm4_state *preamble = is_secure ? ctx->cs_preamble_state_tmz : + ctx->cs_preamble_state; + ctx->ws->cs_set_preamble(&ctx->gfx_cs, preamble->pm4, preamble->ndw, + preamble != ctx->last_preamble); + ctx->last_preamble = preamble; + } if (!ctx->has_graphics) { ctx->initial_gfx_cs_size = ctx->gfx_cs.current.cdw; @@ -443,15 +451,6 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs) */ si_pm4_reset_emitted(ctx); - /* The CS initialization should be emitted before everything else. */ - if (ctx->cs_preamble_state) { - struct si_pm4_state *preamble = is_secure ? ctx->cs_preamble_state_tmz : - ctx->cs_preamble_state; - ctx->ws->cs_set_preamble(&ctx->gfx_cs, preamble->pm4, preamble->ndw, - preamble != ctx->last_preamble); - ctx->last_preamble = preamble; - } - if (ctx->queued.named.ls) ctx->prefetch_L2_mask |= SI_PREFETCH_LS; if (ctx->queued.named.hs) diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index a358b62..06b13ba 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -753,9 +753,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign /* The remainder of this function initializes the gfx CS and must be last. */ assert(sctx->gfx_cs.current.cdw == 0); - if (sctx->has_graphics) { - si_init_cp_reg_shadowing(sctx); - } + si_init_cp_reg_shadowing(sctx); /* Set immutable fields of shader keys. */ if (sctx->gfx_level >= GFX9) { diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 792fb4a..368e3ab 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -734,7 +734,6 @@ struct si_cs_shader_state { struct si_compute *emitted_program; unsigned offset; uint32_t variable_shared_size; - bool initialized; }; struct si_samplers { diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 7560a07..d03fe3b 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -5592,7 +5592,10 @@ unsigned gfx103_get_cu_mask_ps(struct si_screen *sscreen) void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing) { struct si_screen *sscreen = sctx->screen; - uint64_t border_color_va = sctx->border_color_buffer->gpu_address; + uint64_t border_color_va = + sctx->border_color_buffer ? sctx->border_color_buffer->gpu_address : 0; + uint32_t compute_cu_en = S_00B858_SH0_CU_EN(sscreen->info.spi_cu_en) | + S_00B858_SH1_CU_EN(sscreen->info.spi_cu_en); bool has_clear_state = sscreen->info.has_clear_state; struct si_cs_preamble { @@ -5607,7 +5610,7 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing) /* Add all the space that we allocated. */ pm4->max_dw = (sizeof(struct si_cs_preamble) - offsetof(struct si_cs_preamble, pm4.pm4)) / 4; - if (!uses_reg_shadowing) { + if (sctx->has_graphics && !uses_reg_shadowing) { si_pm4_cmd_add(pm4, PKT3(PKT3_CONTEXT_CONTROL, 1, 0)); si_pm4_cmd_add(pm4, CC0_UPDATE_LOAD_ENABLES(1)); si_pm4_cmd_add(pm4, CC1_UPDATE_SHADOW_ENABLES(1)); @@ -5623,6 +5626,79 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing) } } + /* Compute registers. */ + si_pm4_set_reg(pm4, R_00B834_COMPUTE_PGM_HI, S_00B834_DATA(sctx->screen->info.address32_hi >> 8)); + si_pm4_set_reg(pm4, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, compute_cu_en); + si_pm4_set_reg(pm4, R_00B85C_COMPUTE_STATIC_THREAD_MGMT_SE1, compute_cu_en); + + if (sctx->gfx_level == GFX6) { + /* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID and is now per pipe, + * so it should be handled in the kernel if we want to use something other than + * the default value. + * TODO: This should be: (number of compute units) * 4 * (waves per simd) - 1 + */ + si_pm4_set_reg(pm4, R_00B82C_COMPUTE_MAX_WAVE_ID, 0x190 /* Default value */); + } + + if (sctx->gfx_level >= GFX7) { + si_pm4_set_reg(pm4, R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, compute_cu_en); + si_pm4_set_reg(pm4, R_00B868_COMPUTE_STATIC_THREAD_MGMT_SE3, compute_cu_en); + + /* Disable profiling on compute chips. */ + if (!sscreen->info.has_graphics) { + si_pm4_set_reg(pm4, R_00B82C_COMPUTE_PERFCOUNT_ENABLE, 0); + si_pm4_set_reg(pm4, R_00B878_COMPUTE_THREAD_TRACE_ENABLE, 0); + } + } + + if (sctx->gfx_level >= GFX9 && sctx->gfx_level < GFX11) + si_pm4_set_reg(pm4, R_0301EC_CP_COHER_START_DELAY, sctx->gfx_level >= GFX10 ? 0x20 : 0); + + if (!sscreen->info.has_graphics && sscreen->info.family >= CHIP_MI100) { + si_pm4_set_reg(pm4, R_00B894_COMPUTE_STATIC_THREAD_MGMT_SE4, compute_cu_en); + si_pm4_set_reg(pm4, R_00B898_COMPUTE_STATIC_THREAD_MGMT_SE5, compute_cu_en); + si_pm4_set_reg(pm4, R_00B89C_COMPUTE_STATIC_THREAD_MGMT_SE6, compute_cu_en); + si_pm4_set_reg(pm4, R_00B8A0_COMPUTE_STATIC_THREAD_MGMT_SE7, compute_cu_en); + } + + if (sctx->gfx_level >= GFX10) { + si_pm4_set_reg(pm4, R_00B890_COMPUTE_USER_ACCUM_0, 0); + si_pm4_set_reg(pm4, R_00B894_COMPUTE_USER_ACCUM_1, 0); + si_pm4_set_reg(pm4, R_00B898_COMPUTE_USER_ACCUM_2, 0); + si_pm4_set_reg(pm4, R_00B89C_COMPUTE_USER_ACCUM_3, 0); + + if (sctx->gfx_level < GFX11) + si_pm4_set_reg(pm4, R_00B8A0_COMPUTE_PGM_RSRC3, 0); + + si_pm4_set_reg(pm4, R_00B9F4_COMPUTE_DISPATCH_TUNNEL, 0); + } + + if (sctx->gfx_level >= GFX11) { + si_pm4_set_reg(pm4, R_00B8AC_COMPUTE_STATIC_THREAD_MGMT_SE4, compute_cu_en); + si_pm4_set_reg(pm4, R_00B8B0_COMPUTE_STATIC_THREAD_MGMT_SE5, compute_cu_en); + si_pm4_set_reg(pm4, R_00B8B4_COMPUTE_STATIC_THREAD_MGMT_SE6, compute_cu_en); + si_pm4_set_reg(pm4, R_00B8B8_COMPUTE_STATIC_THREAD_MGMT_SE7, compute_cu_en); + + /* How many threads should go to 1 SE before moving onto the next. Think of GL1 cache hits. + * Only these values are valid: 0 (disabled), 64, 128, 256, 512 + * Recommendation: 64 = RT, 256 = non-RT (run benchmarks to be sure) + */ + si_pm4_set_reg(pm4, R_00B8BC_COMPUTE_DISPATCH_INTERLEAVE, S_00B8BC_INTERLEAVE(256)); + } + + /* Set the pointer to border colors. MI200 doesn't support border colors. */ + if (sctx->gfx_level >= GFX7 && sctx->border_color_buffer) { + si_pm4_set_reg(pm4, R_030E00_TA_CS_BC_BASE_ADDR, border_color_va >> 8); + si_pm4_set_reg(pm4, R_030E04_TA_CS_BC_BASE_ADDR_HI, + S_030E04_ADDRESS(border_color_va >> 40)); + } else if (sctx->gfx_level == GFX6) { + si_pm4_set_reg(pm4, R_00950C_TA_CS_BC_BASE_ADDR, border_color_va >> 8); + } + + if (!sctx->has_graphics) + goto done; + + /* Graphics registers. */ /* CLEAR_STATE doesn't restore these correctly. */ si_pm4_set_reg(pm4, R_028240_PA_SC_GENERIC_SCISSOR_TL, S_028240_WINDOW_OFFSET_DISABLE(1)); si_pm4_set_reg(pm4, R_028244_PA_SC_GENERIC_SCISSOR_BR, @@ -5806,11 +5882,6 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing) si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE, 1); si_pm4_set_reg(pm4, R_030968_VGT_INSTANCE_BASE_ID, 0); - - if (sctx->gfx_level < GFX11) { - si_pm4_set_reg(pm4, R_0301EC_CP_COHER_START_DELAY, - sctx->gfx_level >= GFX10 ? 0x20 : 0); - } } if (sctx->gfx_level >= GFX10) { @@ -5993,6 +6064,7 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing) S_03111C_L1_POLICY(1)); } +done: sctx->cs_preamble_state = pm4; /* Make a copy of the preamble for TMZ. */ -- 2.7.4