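It's better to set all immutable registers in one place, so the compute registers previously emitted by si_emit_initial_compute_regs are now set in the CS preamble built by si_init_cs_preamble_state.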
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21403>
}
}
-void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf *cs)
-{
- const struct radeon_info *info = &sctx->screen->info;
-
- radeon_begin(cs);
- radeon_set_sh_reg(R_00B834_COMPUTE_PGM_HI,
- S_00B834_DATA(sctx->screen->info.address32_hi >> 8));
-
- radeon_set_sh_reg_seq(R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, 2);
- /* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1,
- * renamed COMPUTE_DESTINATION_EN_SEn on gfx10. */
- radeon_emit(S_00B858_SH0_CU_EN(info->spi_cu_en) | S_00B858_SH1_CU_EN(info->spi_cu_en));
- radeon_emit(S_00B858_SH0_CU_EN(info->spi_cu_en) | S_00B858_SH1_CU_EN(info->spi_cu_en));
-
- if (sctx->gfx_level == GFX6) {
- /* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID
- * and is now per pipe, so it should be handled in the
- * kernel if we want to use something other than the default value.
- *
- * TODO: This should be:
- * (number of compute units) * 4 * (waves per simd) - 1
- */
- radeon_set_sh_reg(R_00B82C_COMPUTE_MAX_WAVE_ID, 0x190 /* Default value */);
- radeon_set_config_reg(R_00950C_TA_CS_BC_BASE_ADDR, sctx->border_color_buffer->gpu_address >> 8);
- }
-
- if (sctx->gfx_level >= GFX7) {
- /* Also set R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE2 / SE3 */
- radeon_set_sh_reg_seq(R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, 2);
- radeon_emit(S_00B858_SH0_CU_EN(info->spi_cu_en) | S_00B858_SH1_CU_EN(info->spi_cu_en));
- radeon_emit(S_00B858_SH0_CU_EN(info->spi_cu_en) | S_00B858_SH1_CU_EN(info->spi_cu_en));
-
- /* Disable profiling on compute queues. */
- if (cs != &sctx->gfx_cs || !sctx->screen->info.has_graphics) {
- radeon_set_sh_reg(R_00B82C_COMPUTE_PERFCOUNT_ENABLE, 0);
- radeon_set_sh_reg(R_00B878_COMPUTE_THREAD_TRACE_ENABLE, 0);
- }
-
- /* Set the pointer to border colors. */
- /* MI200 doesn't support border colors. */
- if (sctx->border_color_buffer) {
- uint64_t bc_va = sctx->border_color_buffer->gpu_address;
-
- radeon_set_uconfig_reg_seq(R_030E00_TA_CS_BC_BASE_ADDR, 2, false);
- radeon_emit(bc_va >> 8); /* R_030E00_TA_CS_BC_BASE_ADDR */
- radeon_emit(S_030E04_ADDRESS(bc_va >> 40)); /* R_030E04_TA_CS_BC_BASE_ADDR_HI */
- }
- }
-
- /* cs_preamble_state initializes this for the gfx queue, so only do this
- * if we are on a compute queue.
- */
- if (sctx->gfx_level >= GFX9 && sctx->gfx_level < GFX11 &&
- (cs != &sctx->gfx_cs || !sctx->screen->info.has_graphics)) {
- radeon_set_uconfig_reg(R_0301EC_CP_COHER_START_DELAY,
- sctx->gfx_level >= GFX10 ? 0x20 : 0);
- }
-
- if (!info->has_graphics && info->family >= CHIP_MI100) {
- radeon_set_sh_reg_seq(R_00B894_COMPUTE_STATIC_THREAD_MGMT_SE4, 4);
- radeon_emit(S_00B858_SH0_CU_EN(info->spi_cu_en) | S_00B858_SH1_CU_EN(info->spi_cu_en));
- radeon_emit(S_00B858_SH0_CU_EN(info->spi_cu_en) | S_00B858_SH1_CU_EN(info->spi_cu_en));
- radeon_emit(S_00B858_SH0_CU_EN(info->spi_cu_en) | S_00B858_SH1_CU_EN(info->spi_cu_en));
- radeon_emit(S_00B858_SH0_CU_EN(info->spi_cu_en) | S_00B858_SH1_CU_EN(info->spi_cu_en));
- }
-
- if (sctx->gfx_level >= GFX10) {
- radeon_set_sh_reg_seq(R_00B890_COMPUTE_USER_ACCUM_0, 4);
- radeon_emit(0); /* R_00B890_COMPUTE_USER_ACCUM_0 */
- radeon_emit(0); /* R_00B894_COMPUTE_USER_ACCUM_1 */
- radeon_emit(0); /* R_00B898_COMPUTE_USER_ACCUM_2 */
- radeon_emit(0); /* R_00B89C_COMPUTE_USER_ACCUM_3 */
-
- radeon_set_sh_reg(R_00B9F4_COMPUTE_DISPATCH_TUNNEL, 0);
-
- if (sctx->gfx_level < GFX11)
- radeon_set_sh_reg(R_00B8A0_COMPUTE_PGM_RSRC3, 0);
- }
-
- if (sctx->gfx_level >= GFX11) {
- radeon_set_sh_reg_seq(R_00B8AC_COMPUTE_STATIC_THREAD_MGMT_SE4, 4);
- radeon_emit(S_00B8AC_SA0_CU_EN(info->spi_cu_en) | S_00B8AC_SA1_CU_EN(info->spi_cu_en)); /* SE4 */
- radeon_emit(S_00B8AC_SA0_CU_EN(info->spi_cu_en) | S_00B8AC_SA1_CU_EN(info->spi_cu_en)); /* SE5 */
- radeon_emit(S_00B8AC_SA0_CU_EN(info->spi_cu_en) | S_00B8AC_SA1_CU_EN(info->spi_cu_en)); /* SE6 */
- radeon_emit(S_00B8AC_SA0_CU_EN(info->spi_cu_en) | S_00B8AC_SA1_CU_EN(info->spi_cu_en)); /* SE7 */
-
- /* How many threads should go to 1 SE before moving onto the next. Think of GL1 cache hits.
- * Only these values are valid: 0 (disabled), 64, 128, 256, 512
- * Recommendation: 64 = RT, 256 = non-RT (run benchmarks to be sure)
- */
- radeon_set_sh_reg(R_00B8BC_COMPUTE_DISPATCH_INTERLEAVE, S_00B8BC_INTERLEAVE(256));
- }
-
- radeon_end();
-}
-
static bool si_setup_compute_scratch_buffer(struct si_context *sctx, struct si_shader *shader)
{
uint64_t scratch_bo_size, scratch_needed;
if (sctx->bo_list_add_all_compute_resources)
si_compute_resources_add_all_to_bo_list(sctx);
- if (!sctx->cs_shader_state.initialized) {
- si_emit_initial_compute_regs(sctx, &sctx->gfx_cs);
-
- sctx->cs_shader_state.emitted_program = NULL;
- sctx->cs_shader_state.initialized = true;
- }
-
/* First emit registers. */
bool prefetch;
if (!si_switch_compute_shader(sctx, program, &program->shader, code_object, info->pc, &prefetch,
void si_init_cp_reg_shadowing(struct si_context *sctx)
{
- if (sctx->screen->info.mid_command_buffer_preemption_enabled ||
- sctx->screen->debug_flags & DBG(SHADOW_REGS)) {
+ if (sctx->has_graphics &&
+ (sctx->screen->info.mid_command_buffer_preemption_enabled ||
+ sctx->screen->debug_flags & DBG(SHADOW_REGS))) {
sctx->shadowed_regs =
si_aligned_buffer_create(sctx->b.screen,
PIPE_RESOURCE_FLAG_UNMAPPABLE | SI_RESOURCE_FLAG_DRIVER_INTERNAL,
}
si_add_all_descriptors_to_bo_list(ctx);
-
si_shader_pointers_mark_dirty(ctx);
- ctx->cs_shader_state.initialized = false;
+ ctx->cs_shader_state.emitted_program = NULL;
+
+ /* The CS initialization should be emitted before everything else. */
+ if (ctx->cs_preamble_state) {
+ struct si_pm4_state *preamble = is_secure ? ctx->cs_preamble_state_tmz :
+ ctx->cs_preamble_state;
+ ctx->ws->cs_set_preamble(&ctx->gfx_cs, preamble->pm4, preamble->ndw,
+ preamble != ctx->last_preamble);
+ ctx->last_preamble = preamble;
+ }
if (!ctx->has_graphics) {
ctx->initial_gfx_cs_size = ctx->gfx_cs.current.cdw;
*/
si_pm4_reset_emitted(ctx);
- /* The CS initialization should be emitted before everything else. */
- if (ctx->cs_preamble_state) {
- struct si_pm4_state *preamble = is_secure ? ctx->cs_preamble_state_tmz :
- ctx->cs_preamble_state;
- ctx->ws->cs_set_preamble(&ctx->gfx_cs, preamble->pm4, preamble->ndw,
- preamble != ctx->last_preamble);
- ctx->last_preamble = preamble;
- }
-
if (ctx->queued.named.ls)
ctx->prefetch_L2_mask |= SI_PREFETCH_LS;
if (ctx->queued.named.hs)
/* The remainder of this function initializes the gfx CS and must be last. */
assert(sctx->gfx_cs.current.cdw == 0);
- if (sctx->has_graphics) {
- si_init_cp_reg_shadowing(sctx);
- }
+ si_init_cp_reg_shadowing(sctx);
/* Set immutable fields of shader keys. */
if (sctx->gfx_level >= GFX9) {
struct si_compute *emitted_program;
unsigned offset;
uint32_t variable_shared_size;
- bool initialized;
};
struct si_samplers {
void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
{
struct si_screen *sscreen = sctx->screen;
- uint64_t border_color_va = sctx->border_color_buffer->gpu_address;
+ uint64_t border_color_va =
+ sctx->border_color_buffer ? sctx->border_color_buffer->gpu_address : 0;
+ uint32_t compute_cu_en = S_00B858_SH0_CU_EN(sscreen->info.spi_cu_en) |
+ S_00B858_SH1_CU_EN(sscreen->info.spi_cu_en);
bool has_clear_state = sscreen->info.has_clear_state;
struct si_cs_preamble {
/* Add all the space that we allocated. */
pm4->max_dw = (sizeof(struct si_cs_preamble) - offsetof(struct si_cs_preamble, pm4.pm4)) / 4;
- if (!uses_reg_shadowing) {
+ if (sctx->has_graphics && !uses_reg_shadowing) {
si_pm4_cmd_add(pm4, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
si_pm4_cmd_add(pm4, CC0_UPDATE_LOAD_ENABLES(1));
si_pm4_cmd_add(pm4, CC1_UPDATE_SHADOW_ENABLES(1));
}
}
+ /* Compute registers. */
+ si_pm4_set_reg(pm4, R_00B834_COMPUTE_PGM_HI, S_00B834_DATA(sctx->screen->info.address32_hi >> 8));
+ si_pm4_set_reg(pm4, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, compute_cu_en);
+ si_pm4_set_reg(pm4, R_00B85C_COMPUTE_STATIC_THREAD_MGMT_SE1, compute_cu_en);
+
+ if (sctx->gfx_level == GFX6) {
+ /* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID and is now per pipe,
+ * so it should be handled in the kernel if we want to use something other than
+ * the default value.
+ * TODO: This should be: (number of compute units) * 4 * (waves per simd) - 1
+ */
+ si_pm4_set_reg(pm4, R_00B82C_COMPUTE_MAX_WAVE_ID, 0x190 /* Default value */);
+ }
+
+ if (sctx->gfx_level >= GFX7) {
+ si_pm4_set_reg(pm4, R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, compute_cu_en);
+ si_pm4_set_reg(pm4, R_00B868_COMPUTE_STATIC_THREAD_MGMT_SE3, compute_cu_en);
+
+ /* Disable profiling on compute chips. */
+ if (!sscreen->info.has_graphics) {
+ si_pm4_set_reg(pm4, R_00B82C_COMPUTE_PERFCOUNT_ENABLE, 0);
+ si_pm4_set_reg(pm4, R_00B878_COMPUTE_THREAD_TRACE_ENABLE, 0);
+ }
+ }
+
+ if (sctx->gfx_level >= GFX9 && sctx->gfx_level < GFX11)
+ si_pm4_set_reg(pm4, R_0301EC_CP_COHER_START_DELAY, sctx->gfx_level >= GFX10 ? 0x20 : 0);
+
+ if (!sscreen->info.has_graphics && sscreen->info.family >= CHIP_MI100) {
+ si_pm4_set_reg(pm4, R_00B894_COMPUTE_STATIC_THREAD_MGMT_SE4, compute_cu_en);
+ si_pm4_set_reg(pm4, R_00B898_COMPUTE_STATIC_THREAD_MGMT_SE5, compute_cu_en);
+ si_pm4_set_reg(pm4, R_00B89C_COMPUTE_STATIC_THREAD_MGMT_SE6, compute_cu_en);
+ si_pm4_set_reg(pm4, R_00B8A0_COMPUTE_STATIC_THREAD_MGMT_SE7, compute_cu_en);
+ }
+
+ if (sctx->gfx_level >= GFX10) {
+ si_pm4_set_reg(pm4, R_00B890_COMPUTE_USER_ACCUM_0, 0);
+ si_pm4_set_reg(pm4, R_00B894_COMPUTE_USER_ACCUM_1, 0);
+ si_pm4_set_reg(pm4, R_00B898_COMPUTE_USER_ACCUM_2, 0);
+ si_pm4_set_reg(pm4, R_00B89C_COMPUTE_USER_ACCUM_3, 0);
+
+ if (sctx->gfx_level < GFX11)
+ si_pm4_set_reg(pm4, R_00B8A0_COMPUTE_PGM_RSRC3, 0);
+
+ si_pm4_set_reg(pm4, R_00B9F4_COMPUTE_DISPATCH_TUNNEL, 0);
+ }
+
+ if (sctx->gfx_level >= GFX11) {
+ si_pm4_set_reg(pm4, R_00B8AC_COMPUTE_STATIC_THREAD_MGMT_SE4, compute_cu_en);
+ si_pm4_set_reg(pm4, R_00B8B0_COMPUTE_STATIC_THREAD_MGMT_SE5, compute_cu_en);
+ si_pm4_set_reg(pm4, R_00B8B4_COMPUTE_STATIC_THREAD_MGMT_SE6, compute_cu_en);
+ si_pm4_set_reg(pm4, R_00B8B8_COMPUTE_STATIC_THREAD_MGMT_SE7, compute_cu_en);
+
+ /* How many threads should go to 1 SE before moving onto the next. Think of GL1 cache hits.
+ * Only these values are valid: 0 (disabled), 64, 128, 256, 512
+ * Recommendation: 64 = RT, 256 = non-RT (run benchmarks to be sure)
+ */
+ si_pm4_set_reg(pm4, R_00B8BC_COMPUTE_DISPATCH_INTERLEAVE, S_00B8BC_INTERLEAVE(256));
+ }
+
+ /* Set the pointer to border colors. MI200 doesn't support border colors. */
+ if (sctx->gfx_level >= GFX7 && sctx->border_color_buffer) {
+ si_pm4_set_reg(pm4, R_030E00_TA_CS_BC_BASE_ADDR, border_color_va >> 8);
+ si_pm4_set_reg(pm4, R_030E04_TA_CS_BC_BASE_ADDR_HI,
+ S_030E04_ADDRESS(border_color_va >> 40));
+ } else if (sctx->gfx_level == GFX6) {
+ si_pm4_set_reg(pm4, R_00950C_TA_CS_BC_BASE_ADDR, border_color_va >> 8);
+ }
+
+ if (!sctx->has_graphics)
+ goto done;
+
+ /* Graphics registers. */
/* CLEAR_STATE doesn't restore these correctly. */
si_pm4_set_reg(pm4, R_028240_PA_SC_GENERIC_SCISSOR_TL, S_028240_WINDOW_OFFSET_DISABLE(1));
si_pm4_set_reg(pm4, R_028244_PA_SC_GENERIC_SCISSOR_BR,
si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE, 1);
si_pm4_set_reg(pm4, R_030968_VGT_INSTANCE_BASE_ID, 0);
-
- if (sctx->gfx_level < GFX11) {
- si_pm4_set_reg(pm4, R_0301EC_CP_COHER_START_DELAY,
- sctx->gfx_level >= GFX10 ? 0x20 : 0);
- }
}
if (sctx->gfx_level >= GFX10) {
S_03111C_L1_POLICY(1));
}
+done:
sctx->cs_preamble_state = pm4;
/* Make a copy of the preamble for TMZ. */