tu: Split out some state into a separate struct

author Connor Abbott <cwabbott0@gmail.com>

Wed, 29 Jun 2022 09:03:25 +0000 (11:03 +0200)

committer Marge Bot <emma+marge@anholt.net>

Wed, 27 Jul 2022 19:40:44 +0000 (19:40 +0000)
author Connor Abbott <cwabbott0@gmail.com>
Wed, 29 Jun 2022 09:03:25 +0000 (11:03 +0200)
committer Marge Bot <emma+marge@anholt.net>
Wed, 27 Jul 2022 19:40:44 +0000 (19:40 +0000)
diff --git a/src/freedreno/vulkan/tu_autotune.c b/src/freedreno/vulkan/tu_autotune.c

index 80c732c..4feee54 100644 (file)
--- a/src/freedreno/vulkan/tu_autotune.c
+++ b/src/freedreno/vulkan/tu_autotune.c
@@ -480,7 +480,7 @@ fallback_use_bypass(const struct tu_render_pass *pass,
                      const struct tu_framebuffer *framebuffer,
                      const struct tu_cmd_buffer *cmd_buffer)
  {
-   if (cmd_buffer->state.drawcall_count > 5)
+   if (cmd_buffer->state.rp.drawcall_count > 5)
        return false;
  
     for (unsigned i = 0; i < pass->subpass_count; i++) {
@@ -504,12 +504,12 @@ estimate_drawcall_bandwidth(const struct tu_cmd_buffer *cmd,
  {
     const struct tu_cmd_state *state = &cmd->state;
  
-   if (!state->drawcall_count)
+   if (!state->rp.drawcall_count)
        return 0;
  
     /* sample count times drawcall_bandwidth_per_sample */
     return (uint64_t)avg_renderpass_sample_count *
-      state->drawcall_bandwidth_per_sample_sum / state->drawcall_count;
+      state->rp.drawcall_bandwidth_per_sample_sum / state->rp.drawcall_count;
  }
  
  bool
@@ -583,12 +583,12 @@ tu_autotune_use_bypass(struct tu_autotune *at,
        if (TU_AUTOTUNE_DEBUG_LOG) {
           const VkExtent2D *extent = &cmd_buffer->state.render_area.extent;
           const float drawcall_bandwidth_per_sample =
-            (float)cmd_buffer->state.drawcall_bandwidth_per_sample_sum /
-            cmd_buffer->state.drawcall_count;
+            (float)cmd_buffer->state.rp.drawcall_bandwidth_per_sample_sum /
+            cmd_buffer->state.rp.drawcall_count;
  
           mesa_logi("autotune %016" PRIx64 ":%u selecting %s",
                 renderpass_key,
-               cmd_buffer->state.drawcall_count,
+               cmd_buffer->state.rp.drawcall_count,
                 select_sysmem ? "sysmem" : "gmem");
           mesa_logi("   avg_samples=%u, draw_bandwidth_per_sample=%.2f, total_draw_call_bandwidth=%" PRIu64,
                 avg_samples,
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c

index 3b45a7a..2efe79e 100644 (file)
--- a/src/freedreno/vulkan/tu_cmd_buffer.c
+++ b/src/freedreno/vulkan/tu_cmd_buffer.c
@@ -569,7 +569,7 @@ use_hw_binning(struct tu_cmd_buffer *cmd)
      * use_sysmem_rendering() should have made sure we only ended up here if no
      * XFB was used.
      */
-   if (cmd->state.xfb_used) {
+   if (cmd->state.rp.xfb_used) {
        assert(fb->binning_possible);
        return true;
     }
@@ -579,7 +579,7 @@ use_hw_binning(struct tu_cmd_buffer *cmd)
      * be multiplied by tile count.
      * See https://gitlab.khronos.org/vulkan/vulkan/-/issues/3131
      */
-   if (cmd->state.has_prim_generated_query_in_rp ||
+   if (cmd->state.rp.has_prim_generated_query_in_rp ||
         cmd->state.prim_generated_query_running_before_rp) {
        assert(fb->binning_possible);
        return true;
@@ -607,20 +607,20 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd,
         cmd->state.render_area.extent.height == 0)
        return true;
  
-   if (cmd->state.has_tess)
+   if (cmd->state.rp.has_tess)
        return true;
  
-   if (cmd->state.disable_gmem)
+   if (cmd->state.rp.disable_gmem)
        return true;
  
     /* XFB is incompatible with non-hw binning GMEM rendering, see use_hw_binning */
-   if (cmd->state.xfb_used && !cmd->state.framebuffer->binning_possible)
+   if (cmd->state.rp.xfb_used && !cmd->state.framebuffer->binning_possible)
        return true;
  
     /* QUERY_TYPE_PRIMITIVES_GENERATED is incompatible with non-hw binning
      * GMEM rendering, see use_hw_binning.
      */
-   if ((cmd->state.has_prim_generated_query_in_rp ||
+   if ((cmd->state.rp.has_prim_generated_query_in_rp ||
          cmd->state.prim_generated_query_running_before_rp) &&
         !cmd->state.framebuffer->binning_possible)
        return true;
@@ -1411,7 +1411,7 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
     }
  
     /* Predicate is changed in draw_cs so we have to re-emit it */
-   if (cmd->state.draw_cs_writes_to_cond_pred)
+   if (cmd->state.rp.draw_cs_writes_to_cond_pred)
        tu6_emit_cond_for_load_stores(cmd, cs, pipe, slot, false);
  
     tu_cs_emit_call(cs, &cmd->tile_store_cs);
@@ -2270,7 +2270,7 @@ tu_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
  
     tu_cond_exec_end(cs);
  
-   cmd->state.xfb_used = true;
+   cmd->state.rp.xfb_used = true;
  }
  
  VKAPI_ATTR void VKAPI_CALL
@@ -2396,7 +2396,7 @@ tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
     }
  
     if (pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT) {
-      cmd->state.has_tess = true;
+      cmd->state.rp.has_tess = true;
  
        /* maximum number of patches that can fit in tess factor/param buffers */
        uint32_t subdraw_size = MIN2(TU_TESS_FACTOR_SIZE / ir3_tess_factor_stride(pipeline->tess.patch_type),
@@ -3331,6 +3331,21 @@ tu_flush_for_stage(struct tu_cache_state *cache,
     }
  }
  
+static void
+tu_render_pass_state_merge(struct tu_render_pass_state *dst,
+                           const struct tu_render_pass_state *src)
+{  /* Fold src into dst: boolean flags are OR-ed, draw counters are summed. */
+   dst->xfb_used |= src->xfb_used;
+   dst->has_tess |= src->has_tess;
+   dst->has_prim_generated_query_in_rp |= src->has_prim_generated_query_in_rp;
+   dst->disable_gmem |= src->disable_gmem;
+   dst->draw_cs_writes_to_cond_pred |= src->draw_cs_writes_to_cond_pred;
+   /* Autotuner draw statistics accumulate across the merged states. */
+   dst->drawcall_count += src->drawcall_count;
+   dst->drawcall_bandwidth_per_sample_sum +=
+      src->drawcall_bandwidth_per_sample_sum;
+}
+
  VKAPI_ATTR void VKAPI_CALL
  tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
                        uint32_t commandBufferCount,
@@ -3370,28 +3385,13 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
              break;
           }
  
-         if (secondary->state.has_tess) {
-            cmd->state.has_tess = true;
-         }
-         if (secondary->state.disable_gmem)
-            cmd->state.disable_gmem = true;
-         if (secondary->state.xfb_used)
-            cmd->state.xfb_used = true;
-         if (secondary->state.has_prim_generated_query_in_rp)
-            cmd->state.has_prim_generated_query_in_rp = true;
-
-         cmd->state.drawcall_count += secondary->state.drawcall_count;
-         cmd->state.drawcall_bandwidth_per_sample_sum +=
-            secondary->state.drawcall_bandwidth_per_sample_sum;
-
-         cmd->state.draw_cs_writes_to_cond_pred |=
-            secondary->state.draw_cs_writes_to_cond_pred;
-
           /* If LRZ was made invalid in secondary - we should disable
            * LRZ retroactively for the whole renderpass.
            */
           if (!secondary->state.lrz.valid)
              cmd->state.lrz.valid = false;
+
+         tu_render_pass_state_merge(&cmd->state.rp, &secondary->state.rp);
        } else {
           assert(tu_cs_is_empty(&secondary->draw_cs));
           assert(tu_cs_is_empty(&secondary->draw_epilogue_cs));
@@ -3588,8 +3588,6 @@ tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
        return;
     }
  
-   cmd->state.draw_cs_writes_to_cond_pred = false;
-
     for (unsigned i = 0; i < pass->attachment_count; i++) {
        cmd->state.attachments[i] = pAttachmentInfo ?
           tu_image_view_from_handle(pAttachmentInfo->pAttachments[i]) :
@@ -3635,8 +3633,6 @@ tu_CmdBeginRendering(VkCommandBuffer commandBuffer,
  
     cmd->state.attachments = cmd->dynamic_attachments;
  
-   cmd->state.draw_cs_writes_to_cond_pred = false;
-
     for (unsigned i = 0; i < pRenderingInfo->colorAttachmentCount; i++) {
        uint32_t a = cmd->dynamic_subpass.color_attachments[i].attachment;
        if (!pRenderingInfo->pColorAttachments[i].imageView)
@@ -4020,23 +4016,23 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
     const struct tu_pipeline *pipeline = cmd->state.pipeline;
  
     /* Fill draw stats for autotuner */
-   cmd->state.drawcall_count++;
+   cmd->state.rp.drawcall_count++;
  
-   cmd->state.drawcall_bandwidth_per_sample_sum +=
+   cmd->state.rp.drawcall_bandwidth_per_sample_sum +=
        cmd->state.pipeline->color_bandwidth_per_sample;
  
     /* add depth memory bandwidth cost */
     const uint32_t depth_bandwidth = cmd->state.pipeline->depth_cpp_per_sample;
     if (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE)
-      cmd->state.drawcall_bandwidth_per_sample_sum += depth_bandwidth;
+      cmd->state.rp.drawcall_bandwidth_per_sample_sum += depth_bandwidth;
     if (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE)
-      cmd->state.drawcall_bandwidth_per_sample_sum += depth_bandwidth;
+      cmd->state.rp.drawcall_bandwidth_per_sample_sum += depth_bandwidth;
  
     /* add stencil memory bandwidth cost */
     const uint32_t stencil_bandwidth =
        cmd->state.pipeline->stencil_cpp_per_sample;
     if (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE)
-      cmd->state.drawcall_bandwidth_per_sample_sum += stencil_bandwidth * 2;
+      cmd->state.rp.drawcall_bandwidth_per_sample_sum += stencil_bandwidth * 2;
  
     tu_emit_cache_flush_renderpass(cmd, cs);
  
@@ -4813,7 +4809,7 @@ tu_end_rendering(struct tu_cmd_buffer *cmd_buffer)
  
     cmd_buffer->trace_renderpass_end = u_trace_end_iterator(&cmd_buffer->trace);
  
-   if (cmd_buffer->state.has_tess)
+   if (cmd_buffer->state.rp.has_tess)
        tu6_lazy_emit_tessfactor_addr(cmd_buffer);
  
     struct tu_renderpass_result *autotune_result = NULL;
@@ -4839,12 +4835,7 @@ tu_end_rendering(struct tu_cmd_buffer *cmd_buffer)
     cmd_buffer->state.subpass = NULL;
     cmd_buffer->state.framebuffer = NULL;
     cmd_buffer->state.attachments = NULL;
-   cmd_buffer->state.has_tess = false;
-   cmd_buffer->state.xfb_used = false;
-   cmd_buffer->state.disable_gmem = false;
-   cmd_buffer->state.drawcall_count = 0;
-   cmd_buffer->state.drawcall_bandwidth_per_sample_sum = 0;
-   cmd_buffer->state.has_prim_generated_query_in_rp = false;
+   memset(&cmd_buffer->state.rp, 0, sizeof(cmd_buffer->state.rp));
  
     /* LRZ is not valid next time we use it */
     cmd_buffer->state.lrz.valid = false;
@@ -4969,7 +4960,7 @@ tu_barrier(struct tu_cmd_buffer *cmd,
         */
        if ((srcStage & ~framebuffer_space_stages) ||
            (dstStage & ~framebuffer_space_stages)) {
-         cmd->state.disable_gmem = true;
+         cmd->state.rp.disable_gmem = true;
        }
     }
  
diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h

index d73a834..55afea1 100644 (file)
--- a/src/freedreno/vulkan/tu_private.h
+++ b/src/freedreno/vulkan/tu_private.h
@@ -1285,6 +1285,48 @@ struct tu_vs_params {
     uint32_t first_instance;
  };
  
+/* State that is set inside a renderpass and consumed at renderpass end time,
+ * e.g. to decide whether to use sysmem. State recorded by secondary cmdbufs
+ * (and by suspended/resumed render passes) may need to be combined afterwards;
+ * tu_CmdExecuteCommands() does so via tu_render_pass_state_merge().
+ */
+struct tu_render_pass_state
+{
+   bool xfb_used; /* set by tu_CmdEndTransformFeedbackEXT() */
+   bool has_tess; /* set when a pipeline with a tess control stage is bound */
+   bool has_prim_generated_query_in_rp; /* query begun while inside a pass */
+   bool disable_gmem; /* set by tu_barrier(); forces sysmem rendering */
+
+   /* Track whether conditional predicate for COND_REG_EXEC is changed in draw_cs */
+   bool draw_cs_writes_to_cond_pred;
+
+   uint32_t drawcall_count; /* incremented once per draw in tu6_draw_common() */
+
+   /* A calculated "draw cost" value for renderpass, which tries to
+    * estimate the bandwidth-per-sample of all the draws according
+    * to:
+    *
+    *    foreach_draw (...) {
+    *      sum += pipeline->color_bandwidth_per_sample;
+    *      if (depth_test_enabled)
+    *        sum += pipeline->depth_cpp_per_sample;
+    *      if (depth_write_enabled)
+    *        sum += pipeline->depth_cpp_per_sample;
+    *      if (stencil_enabled)
+    *        sum += pipeline->stencil_cpp_per_sample * 2;
+    *    }
+    *    drawcall_bandwidth_per_sample = sum / drawcall_count;
+    *
+    * It allows us to estimate the total bandwidth of drawcalls later, by
+    * calculating (drawcall_bandwidth_per_sample * zpass_sample_count).
+    *
+    * This does ignore depth buffer traffic for samples which do not
+    * pass due to depth-test fail, and some other details.  But it is
+    * just intended to be a rough estimate that is easy to calculate.
+    */
+   uint32_t drawcall_bandwidth_per_sample_sum;
+};
+
  struct tu_cmd_state
  {
     uint32_t dirty;
@@ -1292,6 +1334,8 @@ struct tu_cmd_state
     struct tu_pipeline *pipeline;
     struct tu_pipeline *compute_pipeline;
  
+   struct tu_render_pass_state rp;
+
     /* Vertex buffers, viewports, and scissors
      * the states for these can be updated partially, so we need to save these
      * to be able to emit a complete draw state
@@ -1358,43 +1402,12 @@ struct tu_cmd_state
     VkRect2D render_area;
  
     const struct tu_image_view **attachments;
-   /* Track whether conditional predicate for COND_REG_EXEC is changed in draw_cs */
-   bool draw_cs_writes_to_cond_pred;
  
-   bool xfb_used;
-   bool has_tess;
     bool tessfactor_addr_set;
     bool predication_active;
-   bool disable_gmem;
     enum a5xx_line_mode line_mode;
     bool z_negative_one_to_one;
  
-   uint32_t drawcall_count;
-
-   /* A calculated "draw cost" value for renderpass, which tries to
-    * estimate the bandwidth-per-sample of all the draws according
-    * to:
-    *
-    *    foreach_draw (...) {
-    *      sum += pipeline->color_bandwidth_per_sample;
-    *      if (depth_test_enabled)
-    *        sum += pipeline->depth_cpp_per_sample;
-    *      if (depth_write_enabled)
-    *        sum += pipeline->depth_cpp_per_sample;
-    *      if (stencil_write_enabled)
-    *        sum += pipeline->stencil_cpp_per_sample * 2;
-    *    }
-    *    drawcall_bandwidth_per_sample = sum / drawcall_count;
-    *
-    * It allows us to estimate the total bandwidth of drawcalls later, by
-    * calculating (drawcall_bandwidth_per_sample * zpass_sample_count).
-    *
-    * This does ignore depth buffer traffic for samples which do not
-    * pass due to depth-test fail, and some other details.  But it is
-    * just intended to be a rough estimate that is easy to calculate.
-    */
-   uint32_t drawcall_bandwidth_per_sample_sum;
-
     /* VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT and
      * VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT are allowed to run simultaniously,
      * but they use the same {START,STOP}_PRIMITIVE_CTRS control.
@@ -1402,7 +1415,6 @@ struct tu_cmd_state
     uint32_t prim_counters_running;
  
     bool prim_generated_query_running_before_rp;
-   bool has_prim_generated_query_in_rp;
  
     struct tu_lrz_state lrz;
  
diff --git a/src/freedreno/vulkan/tu_query.c b/src/freedreno/vulkan/tu_query.c

index 3a13d37..3ce688d 100644 (file)
--- a/src/freedreno/vulkan/tu_query.c
+++ b/src/freedreno/vulkan/tu_query.c
@@ -932,7 +932,7 @@ emit_begin_perf_query(struct tu_cmd_buffer *cmdbuf,
     uint32_t last_pass = ~0;
  
     if (cmdbuf->state.pass) {
-      cmdbuf->state.draw_cs_writes_to_cond_pred = true;
+      cmdbuf->state.rp.draw_cs_writes_to_cond_pred = true;
     }
  
     /* Querying perf counters happens in these steps:
@@ -1023,7 +1023,7 @@ emit_begin_prim_generated_query(struct tu_cmd_buffer *cmdbuf,
     uint64_t begin_iova = primitives_generated_query_iova(pool, query, begin);
  
     if (cmdbuf->state.pass) {
-      cmdbuf->state.has_prim_generated_query_in_rp = true;
+      cmdbuf->state.rp.has_prim_generated_query_in_rp = true;
     } else {
        cmdbuf->state.prim_generated_query_running_before_rp = true;
     }
author	Connor Abbott <cwabbott0@gmail.com>
	Wed, 29 Jun 2022 09:03:25 +0000 (11:03 +0200)
committer	Marge Bot <emma+marge@anholt.net>
	Wed, 27 Jul 2022 19:40:44 +0000 (19:40 +0000)
src/freedreno/vulkan/tu_autotune.c		patch \| blob \| history
src/freedreno/vulkan/tu_cmd_buffer.c		patch \| blob \| history
src/freedreno/vulkan/tu_private.h		patch \| blob \| history
src/freedreno/vulkan/tu_query.c		patch \| blob \| history