amd: massively simplify how info->spi_cu_en is applied
authorMarek Olšák <marek.olsak@amd.com>
Tue, 21 Feb 2023 17:22:38 +0000 (12:22 -0500)
committerMarge Bot <emma+marge@anholt.net>
Fri, 3 Mar 2023 00:41:48 +0000 (00:41 +0000)
Instead of having ac_set_reg_cu_en that sets the register, replace it with
ac_apply_cu_en that only returns the modified register value,
which allows a large simplification in both drivers because a lot of code
becomes duplicated after it's switched to ac_apply_cu_en.

RADV also didn't apply it to a few registers. Fixed.

This removes 82 lines of code in total.

Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21641>

src/amd/common/ac_shader_util.c
src/amd/common/ac_shader_util.h
src/amd/vulkan/radv_cs.h
src/amd/vulkan/radv_pipeline.c
src/amd/vulkan/si_cmd_buffer.c
src/gallium/drivers/radeonsi/si_build_pm4.h
src/gallium/drivers/radeonsi/si_pm4.c
src/gallium/drivers/radeonsi/si_pm4.h
src/gallium/drivers/radeonsi/si_state.c
src/gallium/drivers/radeonsi/si_state_shaders.cpp

index 9ffb90c..340d51f 100644 (file)
@@ -958,9 +958,8 @@ unsigned ac_compute_ngg_workgroup_size(unsigned es_verts, unsigned gs_inst_prims
    return CLAMP(workgroup_size, 1, 256);
 }
 
-void ac_set_reg_cu_en(void *cs, unsigned reg_offset, uint32_t value, uint32_t clear_mask,
-                      unsigned value_shift, const struct radeon_info *info,
-                      void set_sh_reg(void*, unsigned, uint32_t))
+uint32_t ac_apply_cu_en(uint32_t value, uint32_t clear_mask, unsigned value_shift,
+                        const struct radeon_info *info)
 {
    /* Register field position and mask. */
    uint32_t cu_en_mask = ~clear_mask;
@@ -970,10 +969,8 @@ void ac_set_reg_cu_en(void *cs, unsigned reg_offset, uint32_t value, uint32_t cl
 
    /* AND the field by spi_cu_en. */
    uint32_t spi_cu_en = info->spi_cu_en >> value_shift;
-   uint32_t new_value = (value & ~cu_en_mask) |
-                        (((cu_en & spi_cu_en) << cu_en_shift) & cu_en_mask);
-
-   set_sh_reg(cs, reg_offset, new_value);
+   return (value & ~cu_en_mask) |
+          (((cu_en & spi_cu_en) << cu_en_shift) & cu_en_mask);
 }
 
 /* Return the register value and tune bytes_per_wave to increase scratch performance. */
index 3b28ae2..1c0d2fd 100644 (file)
@@ -166,9 +166,8 @@ unsigned ac_compute_esgs_workgroup_size(enum amd_gfx_level gfx_level, unsigned w
 unsigned ac_compute_ngg_workgroup_size(unsigned es_verts, unsigned gs_inst_prims,
                                        unsigned max_vtx_out, unsigned prim_amp_factor);
 
-void ac_set_reg_cu_en(void *cs, unsigned reg_offset, uint32_t value, uint32_t clear_mask,
-                      unsigned value_shift, const struct radeon_info *info,
-                      void set_sh_reg(void*, unsigned, uint32_t));
+uint32_t ac_apply_cu_en(uint32_t value, uint32_t clear_mask, unsigned value_shift,
+                        const struct radeon_info *info);
 
 void ac_get_scratch_tmpring_size(const struct radeon_info *info,
                                  unsigned bytes_per_wave, unsigned *max_seen_bytes_per_wave,
index ac3d107..351d8e2 100644 (file)
@@ -118,17 +118,6 @@ radeon_set_sh_reg_idx(const struct radv_physical_device *pdevice, struct radeon_
 }
 
 static inline void
-gfx10_set_sh_reg_idx3(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
-{
-   assert(reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END);
-   assert(cs->cdw + 3 <= cs->max_dw);
-
-   radeon_emit(cs, PKT3(PKT3_SET_SH_REG_INDEX, 1, 0));
-   radeon_emit(cs, (reg - SI_SH_REG_OFFSET) >> 2 | (3 << 28));
-   radeon_emit(cs, value);
-}
-
-static inline void
 radeon_set_uconfig_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
 {
    assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END);
index 0f13744..3382e41 100644 (file)
@@ -3853,15 +3853,10 @@ radv_pipeline_emit_hw_vs(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
                          &late_alloc_wave64, &cu_mask);
 
    if (pdevice->rad_info.gfx_level >= GFX7) {
-      if (pdevice->rad_info.gfx_level >= GFX10) {
-         ac_set_reg_cu_en(cs, R_00B118_SPI_SHADER_PGM_RSRC3_VS,
-                          S_00B118_CU_EN(cu_mask) | S_00B118_WAVE_LIMIT(0x3F),
-                          C_00B118_CU_EN, 0, &pdevice->rad_info,
-                          (void*)gfx10_set_sh_reg_idx3);
-      } else {
-         radeon_set_sh_reg_idx(pdevice, cs, R_00B118_SPI_SHADER_PGM_RSRC3_VS, 3,
-                               S_00B118_CU_EN(cu_mask) | S_00B118_WAVE_LIMIT(0x3F));
-      }
+      radeon_set_sh_reg_idx(pdevice, cs, R_00B118_SPI_SHADER_PGM_RSRC3_VS, 3,
+                            ac_apply_cu_en(S_00B118_CU_EN(cu_mask) |
+                                           S_00B118_WAVE_LIMIT(0x3F),
+                                           C_00B118_CU_EN, 0, &pdevice->rad_info));
       radeon_set_sh_reg(cs, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_wave64));
    }
    if (pdevice->rad_info.gfx_level >= GFX10) {
@@ -4032,28 +4027,21 @@ radv_pipeline_emit_hw_ngg(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs
    ac_compute_late_alloc(&pdevice->rad_info, true, shader->info.has_ngg_culling,
                          shader->config.scratch_bytes_per_wave > 0, &late_alloc_wave64, &cu_mask);
 
+   radeon_set_sh_reg_idx(pdevice, cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, 3,
+                         ac_apply_cu_en(S_00B21C_CU_EN(cu_mask) |
+                                        S_00B21C_WAVE_LIMIT(0x3F),
+                                        C_00B21C_CU_EN, 0, &pdevice->rad_info));
+
    if (pdevice->rad_info.gfx_level >= GFX11) {
-      /* TODO: figure out how S_00B204_CU_EN_GFX11 interacts with ac_set_reg_cu_en */
-      gfx10_set_sh_reg_idx3(cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
-                            S_00B21C_CU_EN(cu_mask) | S_00B21C_WAVE_LIMIT(0x3F));
-      gfx10_set_sh_reg_idx3(
-         cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
-         S_00B204_CU_EN_GFX11(0x1) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64));
-   } else if (pdevice->rad_info.gfx_level >= GFX10) {
-      ac_set_reg_cu_en(cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
-                       S_00B21C_CU_EN(cu_mask) | S_00B21C_WAVE_LIMIT(0x3F),
-                       C_00B21C_CU_EN, 0, &pdevice->rad_info, (void*)gfx10_set_sh_reg_idx3);
-      ac_set_reg_cu_en(cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
-                       S_00B204_CU_EN_GFX10(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64),
-                       C_00B204_CU_EN_GFX10, 16, &pdevice->rad_info,
-                       (void*)gfx10_set_sh_reg_idx3);
+      radeon_set_sh_reg_idx(pdevice, cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, 3,
+                            ac_apply_cu_en(S_00B204_CU_EN_GFX11(0x1) |
+                                           S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64),
+                                           C_00B204_CU_EN_GFX11, 16, &pdevice->rad_info));
    } else {
-      radeon_set_sh_reg_idx(
-         pdevice, cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, 3,
-         S_00B21C_CU_EN(cu_mask) | S_00B21C_WAVE_LIMIT(0x3F));
-      radeon_set_sh_reg_idx(
-         pdevice, cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, 3,
-         S_00B204_CU_EN_GFX10(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64));
+      radeon_set_sh_reg_idx(pdevice, cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, 3,
+                            ac_apply_cu_en(S_00B204_CU_EN_GFX10(0xffff) |
+                                           S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64),
+                                           C_00B204_CU_EN_GFX10, 16, &pdevice->rad_info));
    }
 
    uint32_t oversub_pc_lines = late_alloc_wave64 ? pdevice->rad_info.pc_lines / 4 : 0;
@@ -4213,25 +4201,16 @@ radv_pipeline_emit_hw_gs(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
       radeon_emit(cs, gs->config.rsrc2);
    }
 
-   if (pdevice->rad_info.gfx_level >= GFX10) {
-      ac_set_reg_cu_en(cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
-                       S_00B21C_CU_EN(0xffff) | S_00B21C_WAVE_LIMIT(0x3F),
-                       C_00B21C_CU_EN, 0, &pdevice->rad_info,
-                       (void*)gfx10_set_sh_reg_idx3);
-      ac_set_reg_cu_en(cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
-                       S_00B204_CU_EN_GFX10(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0),
-                       C_00B204_CU_EN_GFX10, 16, &pdevice->rad_info,
-                       (void*)gfx10_set_sh_reg_idx3);
-   } else if (pdevice->rad_info.gfx_level >= GFX7) {
-      radeon_set_sh_reg_idx(
-         pdevice, cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, 3,
-         S_00B21C_CU_EN(0xffff) | S_00B21C_WAVE_LIMIT(0x3F));
+   radeon_set_sh_reg_idx(pdevice, cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, 3,
+                         ac_apply_cu_en(S_00B21C_CU_EN(0xffff) |
+                                        S_00B21C_WAVE_LIMIT(0x3F),
+                                        C_00B21C_CU_EN, 0, &pdevice->rad_info));
 
-      if (pdevice->rad_info.gfx_level >= GFX10) {
-         radeon_set_sh_reg_idx(
-            pdevice, cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, 3,
-            S_00B204_CU_EN_GFX10(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0));
-      }
+   if (pdevice->rad_info.gfx_level >= GFX10) {
+      radeon_set_sh_reg_idx(pdevice, cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, 3,
+                            ac_apply_cu_en(S_00B204_CU_EN_GFX10(0xffff) |
+                                           S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0),
+                                           C_00B204_CU_EN_GFX10, 16, &pdevice->rad_info));
    }
 
    radv_pipeline_emit_hw_vs(ctx_cs, cs, pipeline, pipeline->base.gs_copy_shader);
index 6641cc8..e399282 100644 (file)
@@ -353,34 +353,35 @@ si_emit_graphics(struct radv_device *device, struct radeon_cmdbuf *cs)
       if (physical_device->rad_info.gfx_level >= GFX10 &&
           physical_device->rad_info.gfx_level < GFX11) {
          /* Logical CUs 16 - 31 */
-         ac_set_reg_cu_en(cs, R_00B104_SPI_SHADER_PGM_RSRC4_VS, S_00B104_CU_EN(0xffff),
-                          C_00B104_CU_EN, 16, &physical_device->rad_info,
-                          (void*)gfx10_set_sh_reg_idx3);
+         radeon_set_sh_reg_idx(physical_device, cs, R_00B104_SPI_SHADER_PGM_RSRC4_VS, 3,
+                               ac_apply_cu_en(S_00B104_CU_EN(0xffff),
+                                              C_00B104_CU_EN, 16, &physical_device->rad_info));
       }
 
       if (physical_device->rad_info.gfx_level >= GFX10) {
-         ac_set_reg_cu_en(cs, R_00B404_SPI_SHADER_PGM_RSRC4_HS, S_00B404_CU_EN(0xffff),
-                          C_00B404_CU_EN, 16, &physical_device->rad_info,
-                          (void*)gfx10_set_sh_reg_idx3);
-         ac_set_reg_cu_en(cs, R_00B004_SPI_SHADER_PGM_RSRC4_PS, S_00B004_CU_EN(cu_mask_ps >> 16),
-                          C_00B004_CU_EN, 16, &physical_device->rad_info,
-                          (void*)gfx10_set_sh_reg_idx3);
+         radeon_set_sh_reg_idx(physical_device, cs, R_00B404_SPI_SHADER_PGM_RSRC4_HS, 3,
+                               ac_apply_cu_en(S_00B404_CU_EN(0xffff),
+                                              C_00B404_CU_EN, 16, &physical_device->rad_info));
+         radeon_set_sh_reg_idx(physical_device, cs, R_00B004_SPI_SHADER_PGM_RSRC4_PS, 3,
+                               ac_apply_cu_en(S_00B004_CU_EN(cu_mask_ps >> 16),
+                                              C_00B004_CU_EN, 16, &physical_device->rad_info));
       }
 
-      if (physical_device->rad_info.gfx_level >= GFX10) {
-         ac_set_reg_cu_en(cs, R_00B41C_SPI_SHADER_PGM_RSRC3_HS,
-                          S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F),
-                          C_00B41C_CU_EN, 0, &physical_device->rad_info,
-                          (void*)gfx10_set_sh_reg_idx3);
-      } else if (physical_device->rad_info.gfx_level == GFX9) {
+      if (physical_device->rad_info.gfx_level >= GFX9) {
          radeon_set_sh_reg_idx(physical_device, cs, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, 3,
-                               S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F));
+                               ac_apply_cu_en(S_00B41C_CU_EN(0xffff) |
+                                              S_00B41C_WAVE_LIMIT(0x3F),
+                                              C_00B41C_CU_EN, 0, &physical_device->rad_info));
       } else {
          radeon_set_sh_reg(cs, R_00B51C_SPI_SHADER_PGM_RSRC3_LS,
-                           S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F));
+                           ac_apply_cu_en(S_00B51C_CU_EN(0xffff) |
+                                          S_00B51C_WAVE_LIMIT(0x3F),
+                                          C_00B51C_CU_EN, 0, &physical_device->rad_info));
          radeon_set_sh_reg(cs, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, S_00B41C_WAVE_LIMIT(0x3F));
          radeon_set_sh_reg(cs, R_00B31C_SPI_SHADER_PGM_RSRC3_ES,
-                           S_00B31C_CU_EN(0xffff) | S_00B31C_WAVE_LIMIT(0x3F));
+                           ac_apply_cu_en(S_00B31C_CU_EN(0xffff) |
+                                          S_00B31C_WAVE_LIMIT(0x3F),
+                                          C_00B31C_CU_EN, 0, &physical_device->rad_info));
          /* If this is 0, Bonaire can hang even if GS isn't being used.
           * Other chips are unaffected. These are suboptimal values,
           * but we don't use on-chip GS.
@@ -389,16 +390,11 @@ si_emit_graphics(struct radv_device *device, struct radeon_cmdbuf *cs)
                                 S_028A44_ES_VERTS_PER_SUBGRP(64) | S_028A44_GS_PRIMS_PER_SUBGRP(4));
       }
 
-      if (physical_device->rad_info.gfx_level >= GFX10) {
-         ac_set_reg_cu_en(cs, R_00B01C_SPI_SHADER_PGM_RSRC3_PS,
-                          S_00B01C_CU_EN(cu_mask_ps) | S_00B01C_WAVE_LIMIT(0x3F) |
-                          S_00B01C_LDS_GROUP_SIZE(physical_device->rad_info.gfx_level >= GFX11),
-                          C_00B01C_CU_EN, 0, &physical_device->rad_info,
-                          (void*)gfx10_set_sh_reg_idx3);
-      } else {
-         radeon_set_sh_reg_idx(physical_device, cs, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, 3,
-                               S_00B01C_CU_EN(cu_mask_ps) | S_00B01C_WAVE_LIMIT(0x3F));
-      }
+      radeon_set_sh_reg_idx(physical_device, cs, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, 3,
+                            ac_apply_cu_en(S_00B01C_CU_EN(cu_mask_ps) |
+                                           S_00B01C_WAVE_LIMIT(0x3F) |
+                                           S_00B01C_LDS_GROUP_SIZE(physical_device->rad_info.gfx_level >= GFX11),
+                                           C_00B01C_CU_EN, 0, &physical_device->rad_info));
    }
 
    if (physical_device->rad_info.gfx_level >= GFX10) {
index 3578270..eb96eff 100644 (file)
    radeon_emit(((reg) - SI_SH_REG_OFFSET) >> 2); \
 } while (0)
 
-#define radeon_set_sh_reg_idx3_seq(reg, num) do { \
+#define radeon_set_sh_reg_idx3_seq(sctx, reg, num) do { \
    SI_CHECK_SHADOWED_REGS(reg, num); \
    assert((reg) >= SI_SH_REG_OFFSET && (reg) < SI_SH_REG_END); \
    radeon_emit(PKT3(PKT3_SET_SH_REG_INDEX, num, 0)); \
-   radeon_emit((((reg) - SI_SH_REG_OFFSET) >> 2) | (3 << 28)); \
+   radeon_emit((((reg) - SI_SH_REG_OFFSET) >> 2) | ((sctx)->gfx_level >= GFX10 ? 3 << 28 : 0)); \
 } while (0)
 
 #define radeon_set_sh_reg(reg, value) do { \
    radeon_emit(value); \
 } while (0)
 
-#define radeon_set_sh_reg_idx3(reg, value) do { \
-   radeon_set_sh_reg_idx3_seq(reg, 1); \
+#define radeon_set_sh_reg_idx3(sctx, reg, value) do { \
+   radeon_set_sh_reg_idx3_seq(sctx, reg, 1); \
    radeon_emit(value); \
 } while (0)
 
    unsigned __value = val; \
    if (((sctx->tracked_regs.reg_saved >> (reg)) & 0x1) != 0x1 || \
        sctx->tracked_regs.reg_value[reg] != __value) { \
-      if (sctx->gfx_level >= GFX10) \
-         radeon_set_sh_reg_idx3(offset, __value); \
-      else \
-         radeon_set_sh_reg(offset, __value); \
+      radeon_set_sh_reg_idx3(sctx, offset, __value); \
       sctx->tracked_regs.reg_saved |= BITFIELD64_BIT(reg); \
       sctx->tracked_regs.reg_value[reg] = __value; \
    } \
    radeon_emit_32bit_pointer(sctx->screen, (desc)->gpu_address); \
 } while (0)
 
-/* Wrappers that are only used when they are passed as function pointers. */
-static inline void radeon_set_sh_reg_func(struct radeon_cmdbuf *cs, unsigned reg_offset,
-                                          uint32_t value)
-{
-   radeon_begin(cs);
-   radeon_set_sh_reg(reg_offset, value);
-   radeon_end();
-}
-
-static inline void radeon_set_sh_reg_idx3_func(struct radeon_cmdbuf *cs, unsigned reg_offset,
-                                               uint32_t value)
-{
-   radeon_begin(cs);
-   radeon_set_sh_reg_idx3(reg_offset, value);
-   radeon_end();
-}
-
 /* This should be evaluated at compile time if all parameters are constants. */
 static ALWAYS_INLINE unsigned
 si_get_user_data_base(enum amd_gfx_level gfx_level, enum si_has_tess has_tess,
index 280125b..d79cd01 100644 (file)
@@ -104,11 +104,13 @@ void si_pm4_set_reg(struct si_pm4_state *state, unsigned reg, uint32_t val)
    si_pm4_set_reg_custom(state, reg, val, opcode, 0);
 }
 
-void si_pm4_set_reg_idx3(struct si_pm4_state *state, unsigned reg, uint32_t val)
+void si_pm4_set_reg_idx3(struct si_screen *sscreen, struct si_pm4_state *state,
+                         unsigned reg, uint32_t val)
 {
    SI_CHECK_SHADOWED_REGS(reg, 1);
 
-   si_pm4_set_reg_custom(state, reg - SI_SH_REG_OFFSET, val, PKT3_SET_SH_REG_INDEX, 3);
+   si_pm4_set_reg_custom(state, reg - SI_SH_REG_OFFSET, val, PKT3_SET_SH_REG_INDEX,
+                         sscreen->info.gfx_level >= GFX10 ? 3 : 0);
 }
 
 void si_pm4_clear_state(struct si_pm4_state *state)
index 486b627..0add231 100644 (file)
@@ -31,7 +31,8 @@
 extern "C" {
 #endif
 
-// forward defines
+/* forward definitions */
+struct si_screen;
 struct si_context;
 
 /* State atoms are callbacks which write a sequence of packets into a GPU
@@ -64,7 +65,8 @@ struct si_pm4_state {
 
 void si_pm4_cmd_add(struct si_pm4_state *state, uint32_t dw);
 void si_pm4_set_reg(struct si_pm4_state *state, unsigned reg, uint32_t val);
-void si_pm4_set_reg_idx3(struct si_pm4_state *state, unsigned reg, uint32_t val);
+void si_pm4_set_reg_idx3(struct si_screen *sscreen, struct si_pm4_state *state,
+                         unsigned reg, uint32_t val);
 
 void si_pm4_clear_state(struct si_pm4_state *state);
 void si_pm4_free_state(struct si_context *sctx, struct si_pm4_state *state, unsigned idx);
index 72e6451..fcd8900 100644 (file)
@@ -5671,12 +5671,11 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
       cu_mask_ps = gfx103_get_cu_mask_ps(sscreen);
 
    if (sctx->gfx_level >= GFX7) {
-      ac_set_reg_cu_en(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS,
-                       S_00B01C_CU_EN(cu_mask_ps) |
-                       S_00B01C_WAVE_LIMIT(0x3F) |
-                       S_00B01C_LDS_GROUP_SIZE(sctx->gfx_level >= GFX11),
-                       C_00B01C_CU_EN, 0, &sscreen->info,
-                       (void*)(sctx->gfx_level >= GFX10 ? si_pm4_set_reg_idx3 : si_pm4_set_reg));
+      si_pm4_set_reg_idx3(sscreen, pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS,
+                          ac_apply_cu_en(S_00B01C_CU_EN(cu_mask_ps) |
+                                         S_00B01C_WAVE_LIMIT(0x3F) |
+                                         S_00B01C_LDS_GROUP_SIZE(sctx->gfx_level >= GFX11),
+                                         C_00B01C_CU_EN, 0, &sscreen->info));
    }
 
    if (sctx->gfx_level <= GFX8) {
@@ -5711,13 +5710,13 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
    }
 
    if (sctx->gfx_level >= GFX7 && sctx->gfx_level <= GFX8) {
-      ac_set_reg_cu_en(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS,
-                       S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F),
-                       C_00B51C_CU_EN, 0, &sscreen->info, (void*)si_pm4_set_reg);
+      si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS,
+                     ac_apply_cu_en(S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F),
+                                    C_00B51C_CU_EN, 0, &sscreen->info));
       si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, S_00B41C_WAVE_LIMIT(0x3F));
-      ac_set_reg_cu_en(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES,
-                       S_00B31C_CU_EN(0xffff) | S_00B31C_WAVE_LIMIT(0x3F),
-                       C_00B31C_CU_EN, 0, &sscreen->info, (void*)si_pm4_set_reg);
+      si_pm4_set_reg(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES,
+                     ac_apply_cu_en(S_00B31C_CU_EN(0xffff) | S_00B31C_WAVE_LIMIT(0x3F),
+                                    C_00B31C_CU_EN, 0, &sscreen->info));
 
       /* If this is 0, Bonaire can hang even if GS isn't being used.
        * Other chips are unaffected. These are suboptimal values,
@@ -5774,10 +5773,9 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
    }
 
    if (sctx->gfx_level >= GFX9) {
-      ac_set_reg_cu_en(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS,
-                       S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F), C_00B41C_CU_EN,
-                       0, &sscreen->info,
-                       (void*)(sctx->gfx_level >= GFX10 ? si_pm4_set_reg_idx3 : si_pm4_set_reg));
+      si_pm4_set_reg_idx3(sscreen, pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS,
+                          ac_apply_cu_en(S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F),
+                                         C_00B41C_CU_EN, 0, &sscreen->info));
 
       si_pm4_set_reg(pm4, R_028C48_PA_SC_BINNER_CNTL_1,
                      S_028C48_MAX_ALLOC_COUNT(sscreen->info.pbb_max_alloc_count - 1) |
@@ -5885,12 +5883,15 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
 
    if (sctx->gfx_level >= GFX10 && sctx->gfx_level <= GFX10_3) {
       /* Logical CUs 16 - 31 */
-      ac_set_reg_cu_en(pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS, S_00B004_CU_EN(cu_mask_ps >> 16),
-                       C_00B004_CU_EN, 16, &sscreen->info, (void*)si_pm4_set_reg_idx3);
-      ac_set_reg_cu_en(pm4, R_00B104_SPI_SHADER_PGM_RSRC4_VS, S_00B104_CU_EN(0xffff),
-                       C_00B104_CU_EN, 16, &sscreen->info, (void*)si_pm4_set_reg_idx3);
-      ac_set_reg_cu_en(pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS, S_00B404_CU_EN(0xffff),
-                       C_00B404_CU_EN, 16, &sscreen->info, (void*)si_pm4_set_reg_idx3);
+      si_pm4_set_reg_idx3(sscreen, pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS,
+                          ac_apply_cu_en(S_00B004_CU_EN(cu_mask_ps >> 16),
+                                         C_00B004_CU_EN, 16, &sscreen->info));
+      si_pm4_set_reg_idx3(sscreen, pm4, R_00B104_SPI_SHADER_PGM_RSRC4_VS,
+                          ac_apply_cu_en(S_00B104_CU_EN(0xffff),
+                                         C_00B104_CU_EN, 16, &sscreen->info));
+      si_pm4_set_reg_idx3(sscreen, pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS,
+                          ac_apply_cu_en(S_00B404_CU_EN(0xffff),
+                                         C_00B404_CU_EN, 16, &sscreen->info));
 
       si_pm4_set_reg(pm4, R_00B1C0_SPI_SHADER_REQ_CTRL_VS, 0);
       si_pm4_set_reg(pm4, R_00B1C8_SPI_SHADER_USER_ACCUM_VS_0, 0);
index 9d5b3a3..09dd033 100644 (file)
@@ -713,11 +713,10 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader)
 
    if (sscreen->info.gfx_level >= GFX9) {
       if (sscreen->info.gfx_level >= GFX11) {
-         ac_set_reg_cu_en(pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS,
-                          S_00B404_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)) |
-                          S_00B404_CU_EN(0xffff),
-                          C_00B404_CU_EN, 16, &sscreen->info,
-                          (void (*)(void*, unsigned, uint32_t))si_pm4_set_reg_idx3);
+         si_pm4_set_reg_idx3(sscreen, pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS,
+                             ac_apply_cu_en(S_00B404_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)) |
+                                            S_00B404_CU_EN(0xffff),
+                                            C_00B404_CU_EN, 16, &sscreen->info));
       }
       if (sscreen->info.gfx_level >= GFX10) {
          si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
@@ -983,37 +982,18 @@ static void si_emit_shader_gs(struct si_context *sctx)
    radeon_end_update_context_roll(sctx);
 
    /* These don't cause any context rolls. */
-   if (sctx->screen->info.spi_cu_en_has_effect) {
-      if (sctx->gfx_level >= GFX7) {
-         ac_set_reg_cu_en(&sctx->gfx_cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
-                          shader->gs.spi_shader_pgm_rsrc3_gs,
-                          C_00B21C_CU_EN, 0, &sctx->screen->info,
-                          (void (*)(void*, unsigned, uint32_t))
-                          (sctx->gfx_level >= GFX10 ? radeon_set_sh_reg_idx3_func : radeon_set_sh_reg_func));
-         sctx->tracked_regs.reg_saved &= ~BITFIELD64_BIT(SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS);
-      }
-      if (sctx->gfx_level >= GFX10) {
-         ac_set_reg_cu_en(&sctx->gfx_cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
-                          shader->gs.spi_shader_pgm_rsrc4_gs,
-                          C_00B204_CU_EN_GFX10, 16, &sctx->screen->info,
-                          (void (*)(void*, unsigned, uint32_t))
-                          (sctx->gfx_level >= GFX10 ? radeon_set_sh_reg_idx3_func : radeon_set_sh_reg_func));
-         sctx->tracked_regs.reg_saved &= ~BITFIELD64_BIT(SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS);
-      }
-   } else {
-      radeon_begin_again(&sctx->gfx_cs);
-      if (sctx->gfx_level >= GFX7) {
-         radeon_opt_set_sh_reg_idx3(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
-                                    SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS,
-                                    shader->gs.spi_shader_pgm_rsrc3_gs);
-      }
-      if (sctx->gfx_level >= GFX10) {
-         radeon_opt_set_sh_reg_idx3(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
-                                    SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS,
-                                    shader->gs.spi_shader_pgm_rsrc4_gs);
-      }
-      radeon_end();
+   radeon_begin_again(&sctx->gfx_cs);
+   if (sctx->gfx_level >= GFX7) {
+      radeon_opt_set_sh_reg_idx3(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
+                                 SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS,
+                                 shader->gs.spi_shader_pgm_rsrc3_gs);
    }
+   if (sctx->gfx_level >= GFX10) {
+      radeon_opt_set_sh_reg_idx3(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
+                                 SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS,
+                                 shader->gs.spi_shader_pgm_rsrc4_gs);
+   }
+   radeon_end();
 }
 
 static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
@@ -1125,10 +1105,14 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
       si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, rsrc1);
       si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2);
 
-      shader->gs.spi_shader_pgm_rsrc3_gs = S_00B21C_CU_EN(0xffff) |
-                                           S_00B21C_WAVE_LIMIT(0x3F);
-      shader->gs.spi_shader_pgm_rsrc4_gs = S_00B204_CU_EN_GFX10(0xffff) |
-                                           S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0);
+      shader->gs.spi_shader_pgm_rsrc3_gs =
+         ac_apply_cu_en(S_00B21C_CU_EN(0xffff) |
+                        S_00B21C_WAVE_LIMIT(0x3F),
+                        C_00B21C_CU_EN, 0, &sscreen->info);
+      shader->gs.spi_shader_pgm_rsrc4_gs =
+         ac_apply_cu_en(S_00B204_CU_EN_GFX10(0xffff) |
+                        S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0),
+                        C_00B204_CU_EN_GFX10, 16, &sscreen->info);
 
       shader->gs.vgt_gs_onchip_cntl =
          S_028A44_ES_VERTS_PER_SUBGRP(shader->gs_info.es_verts_per_subgroup) |
@@ -1143,8 +1127,10 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
 
       polaris_set_vgt_vertex_reuse(sscreen, shader->key.ge.part.gs.es, shader);
    } else {
-      shader->gs.spi_shader_pgm_rsrc3_gs = S_00B21C_CU_EN(0xffff) |
-                                           S_00B21C_WAVE_LIMIT(0x3F);
+      shader->gs.spi_shader_pgm_rsrc3_gs =
+         ac_apply_cu_en(S_00B21C_CU_EN(0xffff) |
+                        S_00B21C_WAVE_LIMIT(0x3F),
+                        C_00B21C_CU_EN, 0, &sscreen->info);
 
       si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8);
       pm4->reg_va_low_idx = pm4->ndw - 1;
@@ -1218,30 +1204,13 @@ static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, struct si_shader
    radeon_begin_again(&sctx->gfx_cs);
    radeon_opt_set_uconfig_reg(sctx, R_030980_GE_PC_ALLOC, SI_TRACKED_GE_PC_ALLOC,
                               shader->ngg.ge_pc_alloc);
-   if (sctx->screen->info.spi_cu_en_has_effect) {
-      radeon_end();
-      ac_set_reg_cu_en(&sctx->gfx_cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
-                       shader->ngg.spi_shader_pgm_rsrc3_gs,
-                       C_00B21C_CU_EN, 0, &sctx->screen->info,
-                       (void (*)(void*, unsigned, uint32_t))
-                       (sctx->gfx_level >= GFX10 ? radeon_set_sh_reg_idx3_func : radeon_set_sh_reg_func));
-      ac_set_reg_cu_en(&sctx->gfx_cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
-                       shader->ngg.spi_shader_pgm_rsrc4_gs,
-                       sctx->gfx_level >= GFX11 ? C_00B204_CU_EN_GFX11 : C_00B204_CU_EN_GFX10, 16,
-                       &sctx->screen->info,
-                       (void (*)(void*, unsigned, uint32_t))
-                       (sctx->gfx_level >= GFX10 ? radeon_set_sh_reg_idx3_func : radeon_set_sh_reg_func));
-      sctx->tracked_regs.reg_saved &= ~BITFIELD64_BIT(SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS) &
-                                      ~BITFIELD64_BIT(SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS);
-   } else {
-      radeon_opt_set_sh_reg_idx3(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
-                                 SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS,
-                                 shader->ngg.spi_shader_pgm_rsrc3_gs);
-      radeon_opt_set_sh_reg_idx3(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
-                                 SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS,
-                                 shader->ngg.spi_shader_pgm_rsrc4_gs);
-      radeon_end();
-   }
+   radeon_opt_set_sh_reg_idx3(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
+                              SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS,
+                              shader->ngg.spi_shader_pgm_rsrc3_gs);
+   radeon_opt_set_sh_reg_idx3(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
+                              SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS,
+                              shader->ngg.spi_shader_pgm_rsrc4_gs);
+   radeon_end();
 }
 
 static void gfx10_emit_shader_ngg_notess_nogs(struct si_context *sctx)
@@ -1439,15 +1408,21 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
                      S_00B22C_OC_LDS_EN(es_stage == MESA_SHADER_TESS_EVAL) |
                      S_00B22C_LDS_SIZE(shader->config.lds_size));
 
-   shader->ngg.spi_shader_pgm_rsrc3_gs = S_00B21C_CU_EN(cu_mask) |
-                                         S_00B21C_WAVE_LIMIT(0x3F);
+   shader->ngg.spi_shader_pgm_rsrc3_gs =
+      ac_apply_cu_en(S_00B21C_CU_EN(cu_mask) |
+                     S_00B21C_WAVE_LIMIT(0x3F),
+                     C_00B21C_CU_EN, 0, &sscreen->info);
    if (sscreen->info.gfx_level >= GFX11) {
       shader->ngg.spi_shader_pgm_rsrc4_gs =
-         S_00B204_CU_EN_GFX11(0x1) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64) |
-         S_00B204_INST_PREF_SIZE(si_get_shader_prefetch_size(shader));
+         ac_apply_cu_en(S_00B204_CU_EN_GFX11(0x1) |
+                        S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64) |
+                        S_00B204_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)),
+                        C_00B204_CU_EN_GFX11, 16, &sscreen->info);
    } else {
       shader->ngg.spi_shader_pgm_rsrc4_gs =
-         S_00B204_CU_EN_GFX10(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64);
+         ac_apply_cu_en(S_00B204_CU_EN_GFX10(0xffff) |
+                        S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64),
+                        C_00B204_CU_EN_GFX10, 16, &sscreen->info);
    }
 
    nparams = MAX2(shader->info.nr_param_exports, 1);
@@ -1727,11 +1702,10 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
    oc_lds_en = shader->selector->stage == MESA_SHADER_TESS_EVAL ? 1 : 0;
 
    if (sscreen->info.gfx_level >= GFX7) {
-      ac_set_reg_cu_en(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS,
-                       S_00B118_CU_EN(cu_mask) | S_00B118_WAVE_LIMIT(0x3F),
-                       C_00B118_CU_EN, 0, &sscreen->info,
-                       (void (*)(void*, unsigned, uint32_t))
-                       (sscreen->info.gfx_level >= GFX10 ? si_pm4_set_reg_idx3 : si_pm4_set_reg));
+      si_pm4_set_reg_idx3(sscreen, pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS,
+                          ac_apply_cu_en(S_00B118_CU_EN(cu_mask) |
+                                         S_00B118_WAVE_LIMIT(0x3F),
+                                         C_00B118_CU_EN, 0, &sscreen->info));
       si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_wave64));
    }
 
@@ -2049,11 +2023,10 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader)
    if (sscreen->info.gfx_level >= GFX11) {
       unsigned cu_mask_ps = gfx103_get_cu_mask_ps(sscreen);
 
-      ac_set_reg_cu_en(pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS,
-                       S_00B004_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)) |
-                       S_00B004_CU_EN(cu_mask_ps >> 16),
-                       C_00B004_CU_EN, 16, &sscreen->info,
-                       (void (*)(void*, unsigned, uint32_t))si_pm4_set_reg_idx3);
+      si_pm4_set_reg_idx3(sscreen, pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS,
+                          ac_apply_cu_en(S_00B004_CU_EN(cu_mask_ps >> 16) |
+                                         S_00B004_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)),
+                                         C_00B004_CU_EN, 16, &sscreen->info));
    }
 }