radeonsi: merge pm4 state and atom emit loops into one

author Marek Olšák <marek.olsak@amd.com>
Sun, 16 Jul 2023 12:38:17 +0000 (08:38 -0400)
committer Marge Bot <emma+marge@anholt.net>
Thu, 17 Aug 2023 15:34:06 +0000 (15:34 +0000)

diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h

index efcd4a3..fb0e6e6 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1033,11 +1033,9 @@ struct si_context {
     unsigned last_num_draw_calls;
     unsigned flags; /* flush flags */
  
-   /* Atoms (direct states). */
+   /* Atoms (state emit functions). */
     union si_state_atoms atoms;
-   unsigned dirty_atoms; /* mask */
-   /* PM4 states (precomputed immutable states) */
-   unsigned dirty_states;
+   uint64_t dirty_atoms; /* mask */
     union si_state queued;
     union si_state emitted;
     /* Gfx11+: Buffered SH registers for SET_SH_REG_PAIRS_PACKED*. */
@@ -1759,14 +1757,14 @@ static inline unsigned si_get_minimum_num_gfx_cs_dwords(struct si_context *sctx,
     return 2048 + sctx->num_cs_dw_queries_suspend + num_draws * 10;
  }
  
-static inline unsigned si_get_atom_bit(struct si_context *sctx, struct si_atom *atom)
+static inline uint64_t si_get_atom_bit(struct si_context *sctx, struct si_atom *atom)
  {
-   return 1 << (atom - sctx->atoms.array);
+   return 1ull << (atom - sctx->atoms.array);
  }
  
  static inline void si_set_atom_dirty(struct si_context *sctx, struct si_atom *atom, bool dirty)
  {
-   unsigned bit = si_get_atom_bit(sctx, atom);
+   uint64_t bit = si_get_atom_bit(sctx, atom);
  
     if (dirty)
        sctx->dirty_atoms |= bit;
diff --git a/src/gallium/drivers/radeonsi/si_pm4.c b/src/gallium/drivers/radeonsi/si_pm4.c

index e6e51c7..9263f35 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_pm4.c
+++ b/src/gallium/drivers/radeonsi/si_pm4.c
@@ -316,7 +316,7 @@ void si_pm4_free_state(struct si_context *sctx, struct si_pm4_state *state, unsi
  
        if (sctx->queued.array[idx] == state) {
           sctx->queued.array[idx] = NULL;
-         sctx->dirty_states &= ~BITFIELD_BIT(idx);
+         sctx->dirty_atoms &= ~BITFIELD64_BIT(idx);
        }
     }
  
@@ -361,7 +361,7 @@ void si_pm4_reset_emitted(struct si_context *sctx)
  
     for (unsigned i = 0; i < SI_NUM_STATES; i++) {
        if (sctx->queued.array[i])
-         sctx->dirty_states |= BITFIELD_BIT(i);
+         sctx->dirty_atoms |= BITFIELD64_BIT(i);
     }
  }
  
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c

index 536e358..2b7e6d9 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -5413,6 +5413,9 @@ void si_init_state_compute_functions(struct si_context *sctx)
  
  void si_init_state_functions(struct si_context *sctx)
  {
+   for (unsigned i = 0; i < ARRAY_SIZE(sctx->atoms.s.pm4_states); i++)
+      sctx->atoms.s.pm4_states[i].emit = si_pm4_emit_state;
+
     sctx->atoms.s.framebuffer.emit = si_emit_framebuffer_state;
     sctx->atoms.s.db_render_state.emit = si_emit_db_render_state;
     sctx->atoms.s.dpbb_state.emit = si_emit_dpbb_state;
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h

index 5700b23..16c0811 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -176,18 +176,13 @@ union si_state {
  };
  
  #define SI_STATE_IDX(name) (offsetof(union si_state, named.name) / sizeof(struct si_pm4_state *))
-#define SI_STATE_BIT(name) (1 << SI_STATE_IDX(name))
+#define SI_STATE_BIT(name) (1ull << SI_STATE_IDX(name))
  #define SI_NUM_STATES      (sizeof(union si_state) / sizeof(struct si_pm4_state *))
  
-static inline unsigned si_states_that_always_roll_context(void)
-{
-   return (SI_STATE_BIT(blend) | SI_STATE_BIT(rasterizer) | SI_STATE_BIT(dsa) |
-           SI_STATE_BIT(poly_offset));
-}
-
  union si_state_atoms {
     struct si_atoms_s {
-      /* The order matters. */
+      /* This must be first. */
+      struct si_atom pm4_states[SI_NUM_STATES];
        struct si_atom render_cond;
        struct si_atom streamout_begin;
        struct si_atom streamout_enable; /* must be after streamout_begin */
@@ -217,15 +212,17 @@ union si_state_atoms {
     struct si_atom array[sizeof(struct si_atoms_s) / sizeof(struct si_atom)];
  };
  
-#define SI_ATOM_BIT(name) (1 << (offsetof(union si_state_atoms, s.name) / sizeof(struct si_atom)))
+#define SI_ATOM_BIT(name) (1ull << (offsetof(union si_state_atoms, s.name) / sizeof(struct si_atom)))
  #define SI_NUM_ATOMS      (sizeof(union si_state_atoms) / sizeof(struct si_atom))
  
-static inline unsigned si_atoms_that_always_roll_context(void)
+static inline uint64_t si_atoms_that_always_roll_context(void)
  {
-   return (SI_ATOM_BIT(streamout_begin) | SI_ATOM_BIT(streamout_enable) | SI_ATOM_BIT(framebuffer) |
-           SI_ATOM_BIT(sample_locations) | SI_ATOM_BIT(sample_mask) | SI_ATOM_BIT(blend_color) |
-           SI_ATOM_BIT(clip_state) | SI_ATOM_BIT(scissors) | SI_ATOM_BIT(viewports) |
-           SI_ATOM_BIT(stencil_ref) | SI_ATOM_BIT(scratch_state) | SI_ATOM_BIT(window_rectangles));
+   return SI_STATE_BIT(blend) | SI_STATE_BIT(rasterizer) | SI_STATE_BIT(dsa) |
+          SI_STATE_BIT(poly_offset) |
+          SI_ATOM_BIT(streamout_begin) | SI_ATOM_BIT(streamout_enable) | SI_ATOM_BIT(framebuffer) |
+          SI_ATOM_BIT(sample_locations) | SI_ATOM_BIT(sample_mask) | SI_ATOM_BIT(blend_color)|
+          SI_ATOM_BIT(clip_state) | SI_ATOM_BIT(scissors) | SI_ATOM_BIT(viewports)|
+          SI_ATOM_BIT(stencil_ref) | SI_ATOM_BIT(scratch_state) | SI_ATOM_BIT(window_rectangles);
  }
  
  struct si_shader_data {
@@ -516,9 +513,9 @@ struct si_buffer_resources {
     do {                                                                                            \
        (sctx)->queued.named.member = (value);                                                       \
        if (value && value != (sctx)->emitted.named.member)                                          \
-         (sctx)->dirty_states |= SI_STATE_BIT(member);                                             \
+         (sctx)->dirty_atoms |= SI_STATE_BIT(member);                                              \
        else                                                                                         \
-         (sctx)->dirty_states &= ~SI_STATE_BIT(member);                                            \
+         (sctx)->dirty_atoms &= ~SI_STATE_BIT(member);                                             \
     } while (0)
  
  /* si_descriptors.c */
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp

index c830aad..77f63bc 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
@@ -1936,28 +1936,33 @@ static void si_get_draw_start_count(struct si_context *sctx, const struct pipe_d
  }
  
  ALWAYS_INLINE
-static void si_emit_all_states(struct si_context *sctx, unsigned skip_atom_mask)
+static void si_emit_all_states(struct si_context *sctx, uint64_t skip_atom_mask)
  {
-   /* Emit state atoms. */
-   unsigned mask = sctx->dirty_atoms & ~skip_atom_mask;
-   if (mask) {
-      do {
-         unsigned i = u_bit_scan(&mask);
-         sctx->atoms.array[i].emit(sctx, i);
-      } while (mask);
+   /* Emit states by calling their emit functions. */
+   uint64_t dirty = sctx->dirty_atoms & ~skip_atom_mask;
  
+   if (dirty) {
        sctx->dirty_atoms &= skip_atom_mask;
-   }
  
-   /* Emit states. */
-   mask = sctx->dirty_states;
-   if (mask) {
-      do {
-         unsigned i = u_bit_scan(&mask);
-         si_pm4_emit_state(sctx, i);
-      } while (mask);
+      /* u_bit_scan64 is too slow on i386. */
+      if (sizeof(void*) == 8) {
+         do {
+            unsigned i = u_bit_scan64(&dirty);
+            sctx->atoms.array[i].emit(sctx, i);
+         } while (dirty);
+      } else {
+         unsigned dirty_lo = dirty;
+         unsigned dirty_hi = dirty >> 32;
  
-      sctx->dirty_states = 0;
+         while (dirty_lo) {
+            unsigned i = u_bit_scan(&dirty_lo);
+            sctx->atoms.array[i].emit(sctx, i);
+         }
+         while (dirty_hi) {
+            unsigned i = 32 + u_bit_scan(&dirty_hi);
+            sctx->atoms.array[i].emit(sctx, i);
+         }
+      }
     }
  }
  
@@ -2230,7 +2235,7 @@ static void si_draw(struct pipe_context *ctx,
      * It's better to draw before prefetches because we want to start fetching indices before
      * shaders. The idea is to minimize the time when the CUs are idle.
      */
-   unsigned masked_atoms = 0;
+   uint64_t masked_atoms = 0;
     if (unlikely(sctx->flags & SI_CONTEXT_FLUSH_FOR_RENDER_COND)) {
        /* The render condition state should be emitted after cache flushes. */
        masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.render_cond);
@@ -2247,8 +2252,7 @@ static void si_draw(struct pipe_context *ctx,
        gfx9_scissor_bug = true;
  
        if ((!IS_DRAW_VERTEX_STATE && indirect && indirect->count_from_stream_output) ||
-          sctx->dirty_atoms & si_atoms_that_always_roll_context() ||
-          sctx->dirty_states & si_states_that_always_roll_context())
+          sctx->dirty_atoms & si_atoms_that_always_roll_context())
           sctx->context_roll = true;
     }
author	Marek Olšák <marek.olsak@amd.com>
	Sun, 16 Jul 2023 12:38:17 +0000 (08:38 -0400)
committer	Marge Bot <emma+marge@anholt.net>
	Thu, 17 Aug 2023 15:34:06 +0000 (15:34 +0000)
src/gallium/drivers/radeonsi/si_pipe.h		patch \| blob \| history
src/gallium/drivers/radeonsi/si_pm4.c		patch \| blob \| history
src/gallium/drivers/radeonsi/si_state.c		patch \| blob \| history
src/gallium/drivers/radeonsi/si_state.h		patch \| blob \| history
src/gallium/drivers/radeonsi/si_state_draw.cpp		patch \| blob \| history