radeonsi: merge si_emit_initial_compute_regs with si_init_cs_preamble_state
author: Marek Olšák <marek.olsak@amd.com>
Wed, 22 Feb 2023 11:38:00 +0000 (06:38 -0500)
committer: Marge Bot <emma+marge@anholt.net>
Wed, 8 Mar 2023 07:29:09 +0000 (07:29 +0000)
It's better to set all immutable registers in one place.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21403>

src/gallium/drivers/radeonsi/si_compute.c
src/gallium/drivers/radeonsi/si_cp_reg_shadowing.c
src/gallium/drivers/radeonsi/si_gfx_cs.c
src/gallium/drivers/radeonsi/si_pipe.c
src/gallium/drivers/radeonsi/si_pipe.h
src/gallium/drivers/radeonsi/si_state.c

index d8491fb..7aff257 100644 (file)
@@ -379,102 +379,6 @@ static void si_set_global_binding(struct pipe_context *ctx, unsigned first, unsi
    }
 }
 
-void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf *cs)
-{
-   const struct radeon_info *info = &sctx->screen->info;
-
-   radeon_begin(cs);
-   radeon_set_sh_reg(R_00B834_COMPUTE_PGM_HI,
-                     S_00B834_DATA(sctx->screen->info.address32_hi >> 8));
-
-   radeon_set_sh_reg_seq(R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, 2);
-   /* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1,
-    * renamed COMPUTE_DESTINATION_EN_SEn on gfx10. */
-   radeon_emit(S_00B858_SH0_CU_EN(info->spi_cu_en) | S_00B858_SH1_CU_EN(info->spi_cu_en));
-   radeon_emit(S_00B858_SH0_CU_EN(info->spi_cu_en) | S_00B858_SH1_CU_EN(info->spi_cu_en));
-
-   if (sctx->gfx_level == GFX6) {
-      /* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID
-       * and is now per pipe, so it should be handled in the
-       * kernel if we want to use something other than the default value.
-       *
-       * TODO: This should be:
-       * (number of compute units) * 4 * (waves per simd) - 1
-       */
-      radeon_set_sh_reg(R_00B82C_COMPUTE_MAX_WAVE_ID, 0x190 /* Default value */);
-      radeon_set_config_reg(R_00950C_TA_CS_BC_BASE_ADDR, sctx->border_color_buffer->gpu_address >> 8);
-   }
-
-   if (sctx->gfx_level >= GFX7) {
-      /* Also set R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE2 / SE3 */
-      radeon_set_sh_reg_seq(R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, 2);
-      radeon_emit(S_00B858_SH0_CU_EN(info->spi_cu_en) | S_00B858_SH1_CU_EN(info->spi_cu_en));
-      radeon_emit(S_00B858_SH0_CU_EN(info->spi_cu_en) | S_00B858_SH1_CU_EN(info->spi_cu_en));
-
-      /* Disable profiling on compute queues. */
-      if (cs != &sctx->gfx_cs || !sctx->screen->info.has_graphics) {
-         radeon_set_sh_reg(R_00B82C_COMPUTE_PERFCOUNT_ENABLE, 0);
-         radeon_set_sh_reg(R_00B878_COMPUTE_THREAD_TRACE_ENABLE, 0);
-      }
-
-      /* Set the pointer to border colors. */
-      /* MI200 doesn't support border colors. */
-      if (sctx->border_color_buffer) {
-         uint64_t bc_va = sctx->border_color_buffer->gpu_address;
-
-         radeon_set_uconfig_reg_seq(R_030E00_TA_CS_BC_BASE_ADDR, 2, false);
-         radeon_emit(bc_va >> 8);                    /* R_030E00_TA_CS_BC_BASE_ADDR */
-         radeon_emit(S_030E04_ADDRESS(bc_va >> 40)); /* R_030E04_TA_CS_BC_BASE_ADDR_HI */
-      }
-   }
-
-   /* cs_preamble_state initializes this for the gfx queue, so only do this
-    * if we are on a compute queue.
-    */
-   if (sctx->gfx_level >= GFX9 && sctx->gfx_level < GFX11 &&
-       (cs != &sctx->gfx_cs || !sctx->screen->info.has_graphics)) {
-      radeon_set_uconfig_reg(R_0301EC_CP_COHER_START_DELAY,
-                             sctx->gfx_level >= GFX10 ? 0x20 : 0);
-   }
-
-   if (!info->has_graphics && info->family >= CHIP_MI100) {
-      radeon_set_sh_reg_seq(R_00B894_COMPUTE_STATIC_THREAD_MGMT_SE4, 4);
-      radeon_emit(S_00B858_SH0_CU_EN(info->spi_cu_en) | S_00B858_SH1_CU_EN(info->spi_cu_en));
-      radeon_emit(S_00B858_SH0_CU_EN(info->spi_cu_en) | S_00B858_SH1_CU_EN(info->spi_cu_en));
-      radeon_emit(S_00B858_SH0_CU_EN(info->spi_cu_en) | S_00B858_SH1_CU_EN(info->spi_cu_en));
-      radeon_emit(S_00B858_SH0_CU_EN(info->spi_cu_en) | S_00B858_SH1_CU_EN(info->spi_cu_en));
-   }
-
-   if (sctx->gfx_level >= GFX10) {
-      radeon_set_sh_reg_seq(R_00B890_COMPUTE_USER_ACCUM_0, 4);
-      radeon_emit(0); /* R_00B890_COMPUTE_USER_ACCUM_0 */
-      radeon_emit(0); /* R_00B894_COMPUTE_USER_ACCUM_1 */
-      radeon_emit(0); /* R_00B898_COMPUTE_USER_ACCUM_2 */
-      radeon_emit(0); /* R_00B89C_COMPUTE_USER_ACCUM_3 */
-
-      radeon_set_sh_reg(R_00B9F4_COMPUTE_DISPATCH_TUNNEL, 0);
-
-      if (sctx->gfx_level < GFX11)
-         radeon_set_sh_reg(R_00B8A0_COMPUTE_PGM_RSRC3, 0);
-   }
-
-   if (sctx->gfx_level >= GFX11) {
-      radeon_set_sh_reg_seq(R_00B8AC_COMPUTE_STATIC_THREAD_MGMT_SE4, 4);
-      radeon_emit(S_00B8AC_SA0_CU_EN(info->spi_cu_en) | S_00B8AC_SA1_CU_EN(info->spi_cu_en)); /* SE4 */
-      radeon_emit(S_00B8AC_SA0_CU_EN(info->spi_cu_en) | S_00B8AC_SA1_CU_EN(info->spi_cu_en)); /* SE5 */
-      radeon_emit(S_00B8AC_SA0_CU_EN(info->spi_cu_en) | S_00B8AC_SA1_CU_EN(info->spi_cu_en)); /* SE6 */
-      radeon_emit(S_00B8AC_SA0_CU_EN(info->spi_cu_en) | S_00B8AC_SA1_CU_EN(info->spi_cu_en)); /* SE7 */
-
-      /* How many threads should go to 1 SE before moving onto the next. Think of GL1 cache hits.
-       * Only these values are valid: 0 (disabled), 64, 128, 256, 512
-       * Recommendation: 64 = RT, 256 = non-RT (run benchmarks to be sure)
-       */
-      radeon_set_sh_reg(R_00B8BC_COMPUTE_DISPATCH_INTERLEAVE, S_00B8BC_INTERLEAVE(256));
-   }
-
-   radeon_end();
-}
-
 static bool si_setup_compute_scratch_buffer(struct si_context *sctx, struct si_shader *shader)
 {
    uint64_t scratch_bo_size, scratch_needed;
@@ -1019,13 +923,6 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info
    if (sctx->bo_list_add_all_compute_resources)
       si_compute_resources_add_all_to_bo_list(sctx);
 
-   if (!sctx->cs_shader_state.initialized) {
-      si_emit_initial_compute_regs(sctx, &sctx->gfx_cs);
-
-      sctx->cs_shader_state.emitted_program = NULL;
-      sctx->cs_shader_state.initialized = true;
-   }
-
    /* First emit registers. */
    bool prefetch;
    if (!si_switch_compute_shader(sctx, program, &program->shader, code_object, info->pc, &prefetch,
index 9b5c15a..ecdbc5e 100644 (file)
@@ -38,8 +38,9 @@ static void si_set_context_reg_array(struct radeon_cmdbuf *cs, unsigned reg, uns
 
 void si_init_cp_reg_shadowing(struct si_context *sctx)
 {
-   if (sctx->screen->info.mid_command_buffer_preemption_enabled ||
-       sctx->screen->debug_flags & DBG(SHADOW_REGS)) {
+   if (sctx->has_graphics &&
+       (sctx->screen->info.mid_command_buffer_preemption_enabled ||
+        sctx->screen->debug_flags & DBG(SHADOW_REGS))) {
       sctx->shadowed_regs =
             si_aligned_buffer_create(sctx->b.screen,
                                      PIPE_RESOURCE_FLAG_UNMAPPABLE | SI_RESOURCE_FLAG_DRIVER_INTERNAL,
index a400789..e6e94ce 100644 (file)
@@ -423,9 +423,17 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
    }
 
    si_add_all_descriptors_to_bo_list(ctx);
-
    si_shader_pointers_mark_dirty(ctx);
-   ctx->cs_shader_state.initialized = false;
+   ctx->cs_shader_state.emitted_program = NULL;
+
+   /* The CS initialization should be emitted before everything else. */
+   if (ctx->cs_preamble_state) {
+      struct si_pm4_state *preamble = is_secure ? ctx->cs_preamble_state_tmz :
+                                                  ctx->cs_preamble_state;
+      ctx->ws->cs_set_preamble(&ctx->gfx_cs, preamble->pm4, preamble->ndw,
+                               preamble != ctx->last_preamble);
+      ctx->last_preamble = preamble;
+   }
 
    if (!ctx->has_graphics) {
       ctx->initial_gfx_cs_size = ctx->gfx_cs.current.cdw;
@@ -443,15 +451,6 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
     */
    si_pm4_reset_emitted(ctx);
 
-   /* The CS initialization should be emitted before everything else. */
-   if (ctx->cs_preamble_state) {
-      struct si_pm4_state *preamble = is_secure ? ctx->cs_preamble_state_tmz :
-                                                  ctx->cs_preamble_state;
-      ctx->ws->cs_set_preamble(&ctx->gfx_cs, preamble->pm4, preamble->ndw,
-                               preamble != ctx->last_preamble);
-      ctx->last_preamble = preamble;
-   }
-
    if (ctx->queued.named.ls)
       ctx->prefetch_L2_mask |= SI_PREFETCH_LS;
    if (ctx->queued.named.hs)
index a358b62..06b13ba 100644 (file)
@@ -753,9 +753,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign
    /* The remainder of this function initializes the gfx CS and must be last. */
    assert(sctx->gfx_cs.current.cdw == 0);
 
-   if (sctx->has_graphics) {
-      si_init_cp_reg_shadowing(sctx);
-   }
+   si_init_cp_reg_shadowing(sctx);
 
    /* Set immutable fields of shader keys. */
    if (sctx->gfx_level >= GFX9) {
index 792fb4a..368e3ab 100644 (file)
@@ -734,7 +734,6 @@ struct si_cs_shader_state {
    struct si_compute *emitted_program;
    unsigned offset;
    uint32_t variable_shared_size;
-   bool initialized;
 };
 
 struct si_samplers {
index 7560a07..d03fe3b 100644 (file)
@@ -5592,7 +5592,10 @@ unsigned gfx103_get_cu_mask_ps(struct si_screen *sscreen)
 void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
 {
    struct si_screen *sscreen = sctx->screen;
-   uint64_t border_color_va = sctx->border_color_buffer->gpu_address;
+   uint64_t border_color_va =
+      sctx->border_color_buffer ? sctx->border_color_buffer->gpu_address : 0;
+   uint32_t compute_cu_en = S_00B858_SH0_CU_EN(sscreen->info.spi_cu_en) |
+                            S_00B858_SH1_CU_EN(sscreen->info.spi_cu_en);
    bool has_clear_state = sscreen->info.has_clear_state;
 
    struct si_cs_preamble {
@@ -5607,7 +5610,7 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
    /* Add all the space that we allocated. */
    pm4->max_dw = (sizeof(struct si_cs_preamble) - offsetof(struct si_cs_preamble, pm4.pm4)) / 4;
 
-   if (!uses_reg_shadowing) {
+   if (sctx->has_graphics && !uses_reg_shadowing) {
       si_pm4_cmd_add(pm4, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
       si_pm4_cmd_add(pm4, CC0_UPDATE_LOAD_ENABLES(1));
       si_pm4_cmd_add(pm4, CC1_UPDATE_SHADOW_ENABLES(1));
@@ -5623,6 +5626,79 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
       }
    }
 
+   /* Compute registers. */
+   si_pm4_set_reg(pm4, R_00B834_COMPUTE_PGM_HI, S_00B834_DATA(sctx->screen->info.address32_hi >> 8));
+   si_pm4_set_reg(pm4, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, compute_cu_en);
+   si_pm4_set_reg(pm4, R_00B85C_COMPUTE_STATIC_THREAD_MGMT_SE1, compute_cu_en);
+
+   if (sctx->gfx_level == GFX6) {
+      /* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID and is now per pipe,
+       * so it should be handled in the kernel if we want to use something other than
+       * the default value.
+       * TODO: This should be: (number of compute units) * 4 * (waves per simd) - 1
+       */
+      si_pm4_set_reg(pm4, R_00B82C_COMPUTE_MAX_WAVE_ID, 0x190 /* Default value */);
+   }
+
+   if (sctx->gfx_level >= GFX7) {
+      si_pm4_set_reg(pm4, R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, compute_cu_en);
+      si_pm4_set_reg(pm4, R_00B868_COMPUTE_STATIC_THREAD_MGMT_SE3, compute_cu_en);
+
+      /* Disable profiling on compute chips. */
+      if (!sscreen->info.has_graphics) {
+         si_pm4_set_reg(pm4, R_00B82C_COMPUTE_PERFCOUNT_ENABLE, 0);
+         si_pm4_set_reg(pm4, R_00B878_COMPUTE_THREAD_TRACE_ENABLE, 0);
+      }
+   }
+
+   if (sctx->gfx_level >= GFX9 && sctx->gfx_level < GFX11)
+      si_pm4_set_reg(pm4, R_0301EC_CP_COHER_START_DELAY, sctx->gfx_level >= GFX10 ? 0x20 : 0);
+
+   if (!sscreen->info.has_graphics && sscreen->info.family >= CHIP_MI100) {
+      si_pm4_set_reg(pm4, R_00B894_COMPUTE_STATIC_THREAD_MGMT_SE4, compute_cu_en);
+      si_pm4_set_reg(pm4, R_00B898_COMPUTE_STATIC_THREAD_MGMT_SE5, compute_cu_en);
+      si_pm4_set_reg(pm4, R_00B89C_COMPUTE_STATIC_THREAD_MGMT_SE6, compute_cu_en);
+      si_pm4_set_reg(pm4, R_00B8A0_COMPUTE_STATIC_THREAD_MGMT_SE7, compute_cu_en);
+   }
+
+   if (sctx->gfx_level >= GFX10) {
+      si_pm4_set_reg(pm4, R_00B890_COMPUTE_USER_ACCUM_0, 0);
+      si_pm4_set_reg(pm4, R_00B894_COMPUTE_USER_ACCUM_1, 0);
+      si_pm4_set_reg(pm4, R_00B898_COMPUTE_USER_ACCUM_2, 0);
+      si_pm4_set_reg(pm4, R_00B89C_COMPUTE_USER_ACCUM_3, 0);
+
+      if (sctx->gfx_level < GFX11)
+         si_pm4_set_reg(pm4, R_00B8A0_COMPUTE_PGM_RSRC3, 0);
+
+      si_pm4_set_reg(pm4, R_00B9F4_COMPUTE_DISPATCH_TUNNEL, 0);
+   }
+
+   if (sctx->gfx_level >= GFX11) {
+      si_pm4_set_reg(pm4, R_00B8AC_COMPUTE_STATIC_THREAD_MGMT_SE4, compute_cu_en);
+      si_pm4_set_reg(pm4, R_00B8B0_COMPUTE_STATIC_THREAD_MGMT_SE5, compute_cu_en);
+      si_pm4_set_reg(pm4, R_00B8B4_COMPUTE_STATIC_THREAD_MGMT_SE6, compute_cu_en);
+      si_pm4_set_reg(pm4, R_00B8B8_COMPUTE_STATIC_THREAD_MGMT_SE7, compute_cu_en);
+
+      /* How many threads should go to 1 SE before moving onto the next. Think of GL1 cache hits.
+       * Only these values are valid: 0 (disabled), 64, 128, 256, 512
+       * Recommendation: 64 = RT, 256 = non-RT (run benchmarks to be sure)
+       */
+      si_pm4_set_reg(pm4, R_00B8BC_COMPUTE_DISPATCH_INTERLEAVE, S_00B8BC_INTERLEAVE(256));
+   }
+
+   /* Set the pointer to border colors. MI200 doesn't support border colors. */
+   if (sctx->gfx_level >= GFX7 && sctx->border_color_buffer) {
+      si_pm4_set_reg(pm4, R_030E00_TA_CS_BC_BASE_ADDR, border_color_va >> 8);
+      si_pm4_set_reg(pm4, R_030E04_TA_CS_BC_BASE_ADDR_HI,
+                     S_030E04_ADDRESS(border_color_va >> 40));
+   } else if (sctx->gfx_level == GFX6) {
+      si_pm4_set_reg(pm4, R_00950C_TA_CS_BC_BASE_ADDR, border_color_va >> 8);
+   }
+
+   if (!sctx->has_graphics)
+      goto done;
+
+   /* Graphics registers. */
    /* CLEAR_STATE doesn't restore these correctly. */
    si_pm4_set_reg(pm4, R_028240_PA_SC_GENERIC_SCISSOR_TL, S_028240_WINDOW_OFFSET_DISABLE(1));
    si_pm4_set_reg(pm4, R_028244_PA_SC_GENERIC_SCISSOR_BR,
@@ -5806,11 +5882,6 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
 
       si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE, 1);
       si_pm4_set_reg(pm4, R_030968_VGT_INSTANCE_BASE_ID, 0);
-
-      if (sctx->gfx_level < GFX11) {
-         si_pm4_set_reg(pm4, R_0301EC_CP_COHER_START_DELAY,
-                        sctx->gfx_level >= GFX10 ? 0x20 : 0);
-      }
    }
 
    if (sctx->gfx_level >= GFX10) {
@@ -5993,6 +6064,7 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
                      S_03111C_L1_POLICY(1));
    }
 
+done:
    sctx->cs_preamble_state = pm4;
 
    /* Make a copy of the preamble for TMZ. */