return true;
}
+/* Returns the number of pixels covered by the current render area
+ * (width * height), used to scale the per-pixel bandwidth estimates.
+ * NOTE(review): assumes width * height fits in 32 bits — presumably
+ * guaranteed by maximum framebuffer dimensions, TODO confirm.
+ */
+static uint32_t
+get_render_pass_pixel_count(const struct tu_cmd_buffer *cmd)
+{
+ const VkExtent2D *extent = &cmd->state.render_area.extent;
+ return extent->width * extent->height;
+}
+
+/* Estimates the total memory traffic (in bytes) generated by the draw
+ * calls of the render pass: the historical average sample count for
+ * this pass multiplied by the average per-sample bandwidth of the
+ * recorded draws (sum / count).
+ * NOTE(review): divides by state->drawcall_count with no guard here —
+ * caller must ensure at least one draw was recorded, TODO confirm.
+ */
+static uint64_t
+estimate_drawcall_bandwidth(const struct tu_cmd_buffer *cmd,
+ uint32_t avg_renderpass_sample_count)
+{
+ const struct tu_cmd_state *state = &cmd->state;
+
+ /* sample count times drawcall_bandwidth_per_sample */
+ return (uint64_t)avg_renderpass_sample_count *
+ state->drawcall_bandwidth_per_sample_sum / state->drawcall_count;
+}
+
bool
tu_autotune_use_bypass(struct tu_autotune *at,
struct tu_cmd_buffer *cmd_buffer,
uint32_t avg_samples = 0;
if (get_history(at, renderpass_key, &avg_samples)) {
- /* TODO we should account for load/stores/clears/resolves especially
- * with low drawcall count and ~fb_size samples passed, in D3D11 games
- * we are seeing many renderpasses like:
- * - color attachment load
- * - single fullscreen draw
- * - color attachment store
+ const uint32_t pass_pixel_count =
+ get_render_pass_pixel_count(cmd_buffer);
+ uint64_t sysmem_bandwidth =
+ (uint64_t)pass->sysmem_bandwidth_per_pixel * pass_pixel_count;
+ uint64_t gmem_bandwidth =
+ (uint64_t)pass->gmem_bandwidth_per_pixel * pass_pixel_count;
+
+ const uint64_t total_draw_call_bandwidth =
+ estimate_drawcall_bandwidth(cmd_buffer, avg_samples);
+
+ /* drawcalls access the memory in sysmem rendering (ignoring CCU) */
+ sysmem_bandwidth += total_draw_call_bandwidth;
+
+ /* drawcalls access gmem in gmem rendering, but we do not want to ignore
+ * them completely. The state changes between tiles also have an
+ * overhead. The magic numbers of 11 and 10 are randomly chosen.
*/
+ gmem_bandwidth = (gmem_bandwidth * 11 + total_draw_call_bandwidth) / 10;
- /* Low sample count could mean there was only a clear.. or there was
- * a clear plus draws that touch no or few samples
- */
- if (avg_samples < 500) {
- if (TU_AUTOTUNE_DEBUG_LOG) {
- mesa_logi("%016"PRIx64":%u\t avg_samples=%u selecting sysmem",
- renderpass_key, cmd_buffer->state.drawcall_count, avg_samples);
- }
- return true;
- }
-
- /* Cost-per-sample is an estimate for the average number of reads+
- * writes for a given passed sample.
- */
- float sample_cost = cmd_buffer->state.total_drawcalls_cost;
- sample_cost /= cmd_buffer->state.drawcall_count;
-
- float single_draw_cost = (avg_samples * sample_cost) / cmd_buffer->state.drawcall_count;
-
- bool select_sysmem = single_draw_cost < 6000.0;
-
+ const bool select_sysmem = sysmem_bandwidth <= gmem_bandwidth;
if (TU_AUTOTUNE_DEBUG_LOG) {
- mesa_logi("%016"PRIx64":%u\t avg_samples=%u, "
- "sample_cost=%f, single_draw_cost=%f selecting %s",
- renderpass_key, cmd_buffer->state.drawcall_count, avg_samples,
- sample_cost, single_draw_cost, select_sysmem ? "sysmem" : "gmem");
+ const VkExtent2D *extent = &cmd_buffer->state.render_area.extent;
+ const float drawcall_bandwidth_per_sample =
+ (float)cmd_buffer->state.drawcall_bandwidth_per_sample_sum /
+ cmd_buffer->state.drawcall_count;
+
+ mesa_logi("autotune %016" PRIx64 ":%u selecting %s",
+ renderpass_key,
+ cmd_buffer->state.drawcall_count,
+ select_sysmem ? "sysmem" : "gmem");
+ mesa_logi(" avg_samples=%u, draw_bandwidth_per_sample=%.2f, total_draw_call_bandwidth=%" PRIu64,
+ avg_samples,
+ drawcall_bandwidth_per_sample,
+ total_draw_call_bandwidth);
+ mesa_logi(" render_area=%ux%u, sysmem_bandwidth_per_pixel=%u, gmem_bandwidth_per_pixel=%u",
+ extent->width, extent->height,
+ pass->sysmem_bandwidth_per_pixel,
+ pass->gmem_bandwidth_per_pixel);
+ mesa_logi(" sysmem_bandwidth=%" PRIu64 ", gmem_bandwidth=%" PRIu64,
+ sysmem_bandwidth, gmem_bandwidth);
}
return select_sysmem;
/* Fill draw stats for autotuner */
cmd->state.drawcall_count++;
- cmd->state.total_drawcalls_cost += cmd->state.pipeline->drawcall_base_cost;
+ cmd->state.drawcall_bandwidth_per_sample_sum +=
+ cmd->state.pipeline->color_bandwidth_per_sample;
/* add depth memory bandwidth cost */
+ const uint32_t depth_bandwidth = cmd->state.pipeline->depth_cpp_per_sample;
if (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE)
- cmd->state.total_drawcalls_cost++;
+ cmd->state.drawcall_bandwidth_per_sample_sum += depth_bandwidth;
if (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE)
- cmd->state.total_drawcalls_cost++;
+ cmd->state.drawcall_bandwidth_per_sample_sum += depth_bandwidth;
/* add stencil memory bandwidth cost */
+ const uint32_t stencil_bandwidth =
+ cmd->state.pipeline->stencil_cpp_per_sample;
if (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE)
- cmd->state.total_drawcalls_cost += 2;
+ cmd->state.drawcall_bandwidth_per_sample_sum += stencil_bandwidth * 2;
tu_emit_cache_flush_renderpass(cmd, cs);
cmd_buffer->state.has_subpass_predication = false;
cmd_buffer->state.disable_gmem = false;
cmd_buffer->state.drawcall_count = 0;
- cmd_buffer->state.total_drawcalls_cost = 0;
+ cmd_buffer->state.drawcall_bandwidth_per_sample_sum = 0;
/* LRZ is not valid next time we use it */
cmd_buffer->state.lrz.valid = false;
const VkPipelineColorBlendStateCreateInfo *blend_info,
const VkFormat attachment_formats[MAX_RTS],
uint32_t *blend_enable_mask,
- uint8_t *drawcall_base_cost)
+ uint32_t *color_bandwidth_per_sample)
{
*blend_enable_mask = 0;
- *drawcall_base_cost = 0;
+ *color_bandwidth_per_sample = 0;
bool rop_reads_dst = false;
uint32_t rb_mrt_control_rop = 0;
A6XX_RB_MRT_CONTROL_ROP_CODE(tu6_rop(blend_info->logicOp));
}
- uint32_t total_comps = 0;
+ uint32_t total_bpp = 0;
for (uint32_t i = 0; i < blend_info->attachmentCount; i++) {
const VkPipelineColorBlendAttachmentState *att =
&blend_info->pAttachments[i];
uint32_t rb_mrt_blend_control = 0;
if (format != VK_FORMAT_UNDEFINED) {
const bool has_alpha = vk_format_has_alpha(format);
- const uint32_t write_comps = util_bitcount(att->colorWriteMask);
rb_mrt_control =
tu6_rb_mrt_control(att, rb_mrt_control_rop, has_alpha);
rb_mrt_blend_control = tu6_rb_mrt_blend_control(att, has_alpha);
- total_comps += write_comps;
+ /* calculate bpp based on format and write mask */
+ uint32_t write_bpp = 0;
+ if (att->colorWriteMask == 0xf) {
+ write_bpp = vk_format_get_blocksizebits(format);
+ } else {
+ const enum pipe_format pipe_format = vk_format_to_pipe_format(format);
+ for (uint32_t i = 0; i < 4; i++) {
+ if (att->colorWriteMask & (1 << i)) {
+ write_bpp += util_format_get_component_bits(pipe_format,
+ UTIL_FORMAT_COLORSPACE_RGB, i);
+ }
+ }
+ }
+ total_bpp += write_bpp;
if (att->blendEnable || rop_reads_dst) {
*blend_enable_mask |= 1 << i;
- total_comps += write_comps;
+ total_bpp += write_bpp;
}
}
tu_cs_emit(cs, rb_mrt_blend_control);
}
- *drawcall_base_cost = total_comps / 4;
+ *color_bandwidth_per_sample = total_bpp / 8;
}
static void
builder->create_info->pDepthStencilState;
const VkPipelineRasterizationStateCreateInfo *rast_info =
builder->create_info->pRasterizationState;
+ const enum pipe_format pipe_format =
+ vk_format_to_pipe_format(builder->depth_attachment_format);
uint32_t rb_depth_cntl = 0, rb_stencil_cntl = 0;
struct tu_cs cs;
if (ds_info->depthBoundsTestEnable && !ds_info->depthTestEnable)
tu6_apply_depth_bounds_workaround(builder->device, &rb_depth_cntl);
+
+ pipeline->depth_cpp_per_sample = util_format_get_component_bits(
+ pipe_format, UTIL_FORMAT_COLORSPACE_ZS, 0) / 8;
} else {
/* if RB_DEPTH_CNTL is set dynamically, we need to make sure it is set
* to 0 when this pipeline is used, as enabling depth test when there
A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF |
A6XX_RB_STENCIL_CONTROL_STENCIL_READ;
}
+
+ pipeline->stencil_cpp_per_sample = util_format_get_component_bits(
+ pipe_format, UTIL_FORMAT_COLORSPACE_ZS, 1) / 8;
}
if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_RB_DEPTH_CNTL, 2)) {
tu6_emit_rb_mrt_controls(&cs, blend_info,
builder->color_attachment_formats,
&blend_enable_mask,
- &pipeline->drawcall_base_cost);
+ &pipeline->color_bandwidth_per_sample);
tu6_emit_blend_control(&cs, blend_enable_mask,
builder->use_dual_src_blend, msaa_info);
* to:
*
* foreach_draw (...) {
- * cost += num_frag_outputs;
- * if (blend_enabled)
- * cost += num_blend_enabled;
+ * sum += pipeline->color_bandwidth_per_sample;
* if (depth_test_enabled)
- * cost++;
+ * sum += pipeline->depth_cpp_per_sample;
* if (depth_write_enabled)
- * cost++;
+ * sum += pipeline->depth_cpp_per_sample;
+ * if (stencil_write_enabled)
+ * sum += pipeline->stencil_cpp_per_sample * 2;
* }
+ * drawcall_bandwidth_per_sample = sum / drawcall_count;
*
- * The idea is that each sample-passed minimally does one write
- * per MRT. If blend is enabled, the hw will additionally do
- * a framebuffer read per sample-passed (for each MRT with blend
- * enabled). If depth-test is enabled, the hw will additionally
- * a depth buffer read. If depth-write is enable, the hw will
- * additionally do a depth buffer write.
+ * It allows us to estimate the total bandwidth of drawcalls later, by
+ * calculating (drawcall_bandwidth_per_sample * zpass_sample_count).
*
* This does ignore depth buffer traffic for samples which do not
- * pass do to depth-test fail, and some other details. But it is
+ * pass due to depth-test fail, and some other details. But it is
* just intended to be a rough estimate that is easy to calculate.
*/
- uint32_t total_drawcalls_cost;
+ uint32_t drawcall_bandwidth_per_sample_sum;
struct tu_lrz_state lrz;
bool z_negative_one_to_one;
- /* Base drawcall cost for sysmem vs gmem autotuner */
- uint8_t drawcall_base_cost;
+ /* memory bandwidth cost (in bytes) for color attachments */
+ uint32_t color_bandwidth_per_sample;
+
+ uint32_t depth_cpp_per_sample;
+ uint32_t stencil_cpp_per_sample;
void *executables_mem_ctx;
/* tu_pipeline_executable */