radeonsi: rewrite how occlusion query precision is determined for performance
author Marek Olšák <marek.olsak@amd.com>
Thu, 27 Jul 2023 05:09:47 +0000 (01:09 -0400)
committer Marge Bot <emma+marge@anholt.net>
Thu, 17 Aug 2023 15:34:06 +0000 (15:34 +0000)
The precision of occlusion queries is determined from active queries.
Then the register programming is determined from the precision and other
states.

This has the effect that we no longer set PERFECT_ZPASS_COUNTS
for PIPE_QUERY_OCCLUSION_PREDICATE in some cases, resulting in higher
performance.

This also disables conservative occlusion queries for gfx11 because they are
not recommended with late Z. Detecting late Z vs early Z would be more
complicated, so they are simply never used there, which results in better
performance with late Z.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24732>

src/gallium/drivers/radeonsi/si_pipe.h
src/gallium/drivers/radeonsi/si_query.c
src/gallium/drivers/radeonsi/si_state.c
src/gallium/drivers/radeonsi/si_state.h

index a6bd3a7..eb25910 100644 (file)
@@ -165,6 +165,13 @@ enum si_clear_code
 #define SI_IMAGE_ACCESS_ALLOW_DCC_STORE      (1 << 9)
 #define SI_IMAGE_ACCESS_BLOCK_FORMAT_AS_UINT (1 << 10) /* for compressed/subsampled images */
 
+enum si_occlusion_query_mode { /* derived from the active occlusion queries in si_query.c */
+   SI_OCCLUSION_QUERY_MODE_DISABLE, /* no occlusion query is active */
+   SI_OCCLUSION_QUERY_MODE_PRECISE_INTEGER, /* an OCCLUSION_COUNTER query is active */
+   SI_OCCLUSION_QUERY_MODE_PRECISE_BOOLEAN, /* OCCLUSION_PREDICATE, or conservative fallback */
+   SI_OCCLUSION_QUERY_MODE_CONSERVATIVE_BOOLEAN, /* only conservative predicates; GFX10 <= level < GFX11 */
+};
+
 /* Debug flags. */
 enum
 {
@@ -1286,8 +1293,10 @@ struct si_context {
 
    /* Queries. */
    /* Maintain the list of active queries for pausing between IBs. */
-   int num_occlusion_queries;
-   int num_perfect_occlusion_queries;
+   enum si_occlusion_query_mode occlusion_query_mode;
+   int num_integer_occlusion_queries;
+   int num_boolean_occlusion_queries;
+   int num_conservative_occlusion_queries;
    int num_pipeline_stat_queries;
    int num_pipeline_stat_emulated_queries;
    int num_hw_pipestat_streamout_queries;
index fea2d13..9373625 100644 (file)
@@ -707,23 +707,45 @@ static void si_update_occlusion_query_state(struct si_context *sctx, unsigned ty
 {
    if (type == PIPE_QUERY_OCCLUSION_COUNTER || type == PIPE_QUERY_OCCLUSION_PREDICATE ||
        type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
-      bool old_enable = sctx->num_occlusion_queries != 0;
-      bool old_perfect_enable = sctx->num_perfect_occlusion_queries != 0;
-      bool enable, perfect_enable;
+      switch (type) {
+      case PIPE_QUERY_OCCLUSION_COUNTER:
+         sctx->num_integer_occlusion_queries += diff;
+         break;
+      case PIPE_QUERY_OCCLUSION_PREDICATE:
+         sctx->num_boolean_occlusion_queries += diff;
+         break;
+      case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+         sctx->num_conservative_occlusion_queries += diff;
+         break;
+      }
 
-      sctx->num_occlusion_queries += diff;
-      assert(sctx->num_occlusion_queries >= 0);
+      assert(sctx->num_integer_occlusion_queries >= 0);
+      assert(sctx->num_boolean_occlusion_queries >= 0);
+      assert(sctx->num_conservative_occlusion_queries >= 0);
 
-      if (type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
-         sctx->num_perfect_occlusion_queries += diff;
-         assert(sctx->num_perfect_occlusion_queries >= 0);
-      }
+      enum si_occlusion_query_mode new_mode =
+         sctx->num_integer_occlusion_queries ? SI_OCCLUSION_QUERY_MODE_PRECISE_INTEGER :
+         sctx->num_boolean_occlusion_queries ? SI_OCCLUSION_QUERY_MODE_PRECISE_BOOLEAN :
+         sctx->num_conservative_occlusion_queries ? SI_OCCLUSION_QUERY_MODE_CONSERVATIVE_BOOLEAN :
+         SI_OCCLUSION_QUERY_MODE_DISABLE;
+
+      /* Conservative queries are only available on gfx10+. On gfx11+, they perform worse
+       * with late Z, but not early Z. Instead of trying to detect late Z, never enable
+       * conservative queries to keep it simple. This is the recommended programming.
+       */
+      if (new_mode == SI_OCCLUSION_QUERY_MODE_CONSERVATIVE_BOOLEAN &&
+          (sctx->gfx_level < GFX10 || sctx ->gfx_level >= GFX11))
+         new_mode = SI_OCCLUSION_QUERY_MODE_PRECISE_BOOLEAN;
+
+      if (sctx->occlusion_query_mode != new_mode) {
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
 
-      enable = sctx->num_occlusion_queries != 0;
-      perfect_enable = sctx->num_perfect_occlusion_queries != 0;
+         if (sctx->screen->info.has_out_of_order_rast &&
+             (sctx->occlusion_query_mode == SI_OCCLUSION_QUERY_MODE_PRECISE_INTEGER) !=
+             (new_mode == SI_OCCLUSION_QUERY_MODE_PRECISE_INTEGER))
+            si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
 
-      if (enable != old_enable || perfect_enable != old_perfect_enable) {
-         si_set_occlusion_query_state(sctx, old_perfect_enable);
+         sctx->occlusion_query_mode = new_mode;
       }
    }
 }
index a459d4e..d0076b3 100644 (file)
@@ -739,8 +739,10 @@ static void si_bind_blend_state(struct pipe_context *ctx, void *state)
         sctx->framebuffer.has_dcc_msaa))
       si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);
 
-   if (sctx->screen->info.has_export_conflict_bug &&
-       old_blend->blend_enable_4bit != blend->blend_enable_4bit)
+   if ((sctx->screen->info.has_export_conflict_bug &&
+        old_blend->blend_enable_4bit != blend->blend_enable_4bit) ||
+       (sctx->occlusion_query_mode == SI_OCCLUSION_QUERY_MODE_PRECISE_BOOLEAN &&
+        !!old_blend->cb_target_mask != !!blend->cb_target_enabled_4bit))
       si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
 
    if (old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit ||
@@ -1454,6 +1456,11 @@ static void si_bind_dsa_state(struct pipe_context *ctx, void *state)
       sctx->do_update_shaders = true;
    }
 
+   if (sctx->occlusion_query_mode == SI_OCCLUSION_QUERY_MODE_PRECISE_BOOLEAN &&
+       (old_dsa->depth_enabled != dsa->depth_enabled ||
+        old_dsa->depth_write_enabled != dsa->depth_write_enabled))
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+
    if (sctx->screen->dpbb_allowed && ((old_dsa->depth_enabled != dsa->depth_enabled ||
                                        old_dsa->stencil_enabled != dsa->stencil_enabled ||
                                        old_dsa->db_can_write != dsa->db_can_write)))
@@ -1511,16 +1518,6 @@ static void si_set_active_query_state(struct pipe_context *ctx, bool enable)
    }
 }
 
-void si_set_occlusion_query_state(struct si_context *sctx, bool old_perfect_enable)
-{
-   si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
-
-   bool perfect_enable = sctx->num_perfect_occlusion_queries != 0;
-
-   if (perfect_enable != old_perfect_enable)
-      si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
-}
-
 void si_save_qbo_state(struct si_context *sctx, struct si_qbo_state *st)
 {
    si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &st->saved_const0);
@@ -1574,30 +1571,41 @@ static void si_emit_db_render_state(struct si_context *sctx, unsigned index)
    }
 
    /* DB_COUNT_CONTROL (occlusion queries) */
-   if (sctx->num_occlusion_queries > 0 && !sctx->occlusion_queries_disabled) {
-      bool perfect = sctx->num_perfect_occlusion_queries > 0;
-      bool gfx10_perfect = sctx->gfx_level >= GFX10 && perfect;
-
-      if (sctx->gfx_level >= GFX7) {
-         unsigned log_sample_rate = sctx->framebuffer.log_samples;
-
-         db_count_control = S_028004_PERFECT_ZPASS_COUNTS(perfect) |
-                            S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(gfx10_perfect) |
-                            S_028004_SAMPLE_RATE(log_sample_rate) | S_028004_ZPASS_ENABLE(1) |
-                            S_028004_SLICE_EVEN_ENABLE(1) | S_028004_SLICE_ODD_ENABLE(1);
-      } else {
-         db_count_control = S_028004_PERFECT_ZPASS_COUNTS(perfect) |
-                            S_028004_SAMPLE_RATE(sctx->framebuffer.log_samples);
-      }
+   if (sctx->occlusion_query_mode == SI_OCCLUSION_QUERY_MODE_DISABLE ||
+       sctx->occlusion_queries_disabled) {
+      /* Occlusion queries disabled. */
+      if (sctx->gfx_level >= GFX7)
+         db_count_control = S_028004_ZPASS_ENABLE(0);
+      else
+         db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1);
    } else {
-      /* Disable occlusion queries. */
+      /* Occlusion queries enabled. */
+      db_count_control = S_028004_SAMPLE_RATE(sctx->framebuffer.log_samples);
+
       if (sctx->gfx_level >= GFX7) {
-         db_count_control = 0;
-      } else {
-         db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1);
+         db_count_control |= S_028004_ZPASS_ENABLE(1) |
+                             S_028004_SLICE_EVEN_ENABLE(1) |
+                             S_028004_SLICE_ODD_ENABLE(1);
       }
+
+      if (sctx->occlusion_query_mode == SI_OCCLUSION_QUERY_MODE_PRECISE_INTEGER ||
+          /* Boolean occlusion queries must set PERFECT_ZPASS_COUNTS for depth-only rendering
+           * without depth writes or when depth testing is disabled. */
+          (sctx->occlusion_query_mode == SI_OCCLUSION_QUERY_MODE_PRECISE_BOOLEAN &&
+           (!sctx->queued.named.dsa->depth_enabled ||
+            (!sctx->queued.named.blend->cb_target_mask &&
+             !sctx->queued.named.dsa->depth_write_enabled))))
+         db_count_control |= S_028004_PERFECT_ZPASS_COUNTS(1);
+
+      if (sctx->gfx_level >= GFX10 &&
+          sctx->occlusion_query_mode != SI_OCCLUSION_QUERY_MODE_CONSERVATIVE_BOOLEAN)
+         db_count_control |= S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(1);
    }
 
+   /* This should always be set on GFX11. */
+   if (sctx->gfx_level >= GFX11)
+      db_count_control |= S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(1);
+
    db_shader_control = sctx->ps_db_shader_control;
 
    if (sctx->screen->info.has_export_conflict_bug &&
@@ -3597,7 +3605,8 @@ static bool si_out_of_order_rasterization(struct si_context *sctx)
           !dsa_order_invariant.pass_set)
          return false;
 
-      if (sctx->num_perfect_occlusion_queries != 0 && !dsa_order_invariant.pass_set)
+      if (sctx->occlusion_query_mode == SI_OCCLUSION_QUERY_MODE_PRECISE_INTEGER &&
+          !dsa_order_invariant.pass_set)
          return false;
    }
 
index 0f9d684..1db7f7e 100644 (file)
@@ -580,7 +580,6 @@ void si_mark_display_dcc_dirty(struct si_context *sctx, struct si_texture *tex);
 void si_update_ps_iter_samples(struct si_context *sctx);
 void si_save_qbo_state(struct si_context *sctx, struct si_qbo_state *st);
 void si_restore_qbo_state(struct si_context *sctx, struct si_qbo_state *st);
-void si_set_occlusion_query_state(struct si_context *sctx, bool old_perfect_enable);
 unsigned gfx103_get_cu_mask_ps(struct si_screen *sscreen);
 
 struct si_fast_udiv_info32 {