return true;
}
+/* Returns the number of pixels covered by the current render area
+ * (width * height), used to scale the per-pixel bandwidth estimates.
+ * NOTE(review): assumes width * height fits in 32 bits — presumably
+ * guaranteed by maximum framebuffer dimensions, TODO confirm.
+ */
+static uint32_t
+get_render_pass_pixel_count(const struct tu_cmd_buffer *cmd)
+{
+ const VkExtent2D *extent = &cmd->state.render_area.extent;
+ return extent->width * extent->height;
+}
+
+/* Estimates the total memory traffic (in bytes) generated by the draw
+ * calls of the render pass: the historical average sample count for
+ * this pass multiplied by the average per-sample bandwidth of the
+ * recorded draws (sum / count).
+ * NOTE(review): divides by state->drawcall_count with no guard here —
+ * caller must ensure at least one draw was recorded, TODO confirm.
+ */
+static uint64_t
+estimate_drawcall_bandwidth(const struct tu_cmd_buffer *cmd,
+ uint32_t avg_renderpass_sample_count)
+{
+ const struct tu_cmd_state *state = &cmd->state;
+
+ /* sample count times drawcall_bandwidth_per_sample */
+ return (uint64_t)avg_renderpass_sample_count *
+ state->drawcall_bandwidth_per_sample_sum / state->drawcall_count;
+}
+
bool
tu_autotune_use_bypass(struct tu_autotune *at,
struct tu_cmd_buffer *cmd_buffer,
uint32_t avg_samples = 0;
if (get_history(at, renderpass_key, &avg_samples)) {
- /* TODO we should account for load/stores/clears/resolves especially
- * with low drawcall count and ~fb_size samples passed, in D3D11 games
- * we are seeing many renderpasses like:
- * - color attachment load
- * - single fullscreen draw
- * - color attachment store
+ const uint32_t pass_pixel_count =
+ get_render_pass_pixel_count(cmd_buffer);
+ uint64_t sysmem_bandwidth =
+ (uint64_t)pass->sysmem_bandwidth_per_pixel * pass_pixel_count;
+ uint64_t gmem_bandwidth =
+ (uint64_t)pass->gmem_bandwidth_per_pixel * pass_pixel_count;
+
+ const uint64_t total_draw_call_bandwidth =
+ estimate_drawcall_bandwidth(cmd_buffer, avg_samples);
+
+ /* drawcalls access the memory in sysmem rendering (ignoring CCU) */
+ sysmem_bandwidth += total_draw_call_bandwidth;
+
+ /* drawcalls access gmem in gmem rendering, but we do not want to ignore
+ * them completely. The state changes between tiles also have an
+ * overhead. The magic numbers of 11 and 10 are randomly chosen.
*/
+ gmem_bandwidth = (gmem_bandwidth * 11 + total_draw_call_bandwidth) / 10;
- /* Low sample count could mean there was only a clear.. or there was
- * a clear plus draws that touch no or few samples
- */
- if (avg_samples < 500) {
- if (TU_AUTOTUNE_DEBUG_LOG) {
- mesa_logi("%016"PRIx64":%u\t avg_samples=%u selecting sysmem",
- renderpass_key, cmd_buffer->state.drawcall_count, avg_samples);
- }
- return true;
- }
-
- /* Cost-per-sample is an estimate for the average number of reads+
- * writes for a given passed sample.
- */
- float sample_cost = cmd_buffer->state.total_drawcalls_cost;
- sample_cost /= cmd_buffer->state.drawcall_count;
-
- float single_draw_cost = (avg_samples * sample_cost) / cmd_buffer->state.drawcall_count;
-
- bool select_sysmem = single_draw_cost < 6000.0;
-
+ const bool select_sysmem = sysmem_bandwidth <= gmem_bandwidth;
if (TU_AUTOTUNE_DEBUG_LOG) {
- mesa_logi("%016"PRIx64":%u\t avg_samples=%u, "
- "sample_cost=%f, single_draw_cost=%f selecting %s",
- renderpass_key, cmd_buffer->state.drawcall_count, avg_samples,
- sample_cost, single_draw_cost, select_sysmem ? "sysmem" : "gmem");
+ const VkExtent2D *extent = &cmd_buffer->state.render_area.extent;
+ const float drawcall_bandwidth_per_sample =
+ (float)cmd_buffer->state.drawcall_bandwidth_per_sample_sum /
+ cmd_buffer->state.drawcall_count;
+
+ mesa_logi("autotune %016" PRIx64 ":%u selecting %s",
+ renderpass_key,
+ cmd_buffer->state.drawcall_count,
+ select_sysmem ? "sysmem" : "gmem");
+ mesa_logi(" avg_samples=%u, draw_bandwidth_per_sample=%.2f, total_draw_call_bandwidth=%" PRIu64,
+ avg_samples,
+ drawcall_bandwidth_per_sample,
+ total_draw_call_bandwidth);
+ mesa_logi(" render_area=%ux%u, sysmem_bandwidth_per_pixel=%u, gmem_bandwidth_per_pixel=%u",
+ extent->width, extent->height,
+ pass->sysmem_bandwidth_per_pixel,
+ pass->gmem_bandwidth_per_pixel);
+ mesa_logi(" sysmem_bandwidth=%" PRIu64 ", gmem_bandwidth=%" PRIu64,
+ sysmem_bandwidth, gmem_bandwidth);
}
return select_sysmem;
/* Fill draw stats for autotuner */
cmd->state.drawcall_count++;
- cmd->state.total_drawcalls_cost += cmd->state.pipeline->drawcall_base_cost;
+ cmd->state.drawcall_bandwidth_per_sample_sum +=
+ cmd->state.pipeline->color_bandwidth_per_sample;
/* add depth memory bandwidth cost */
+ const uint32_t depth_bandwidth = cmd->state.pipeline->depth_cpp_per_sample;
if (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE)
- cmd->state.total_drawcalls_cost++;
+ cmd->state.drawcall_bandwidth_per_sample_sum += depth_bandwidth;
if (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE)
- cmd->state.total_drawcalls_cost++;
+ cmd->state.drawcall_bandwidth_per_sample_sum += depth_bandwidth;
/* add stencil memory bandwidth cost */
+ const uint32_t stencil_bandwidth =
+ cmd->state.pipeline->stencil_cpp_per_sample;
if (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE)
- cmd->state.total_drawcalls_cost += 2;
+ cmd->state.drawcall_bandwidth_per_sample_sum += stencil_bandwidth * 2;
tu_emit_cache_flush_renderpass(cmd, cs);
cmd_buffer->state.has_subpass_predication = false;
cmd_buffer->state.disable_gmem = false;
cmd_buffer->state.drawcall_count = 0;
- cmd_buffer->state.total_drawcalls_cost = 0;
+ cmd_buffer->state.drawcall_bandwidth_per_sample_sum = 0;
/* LRZ is not valid next time we use it */
cmd_buffer->state.lrz.valid = false;
const VkPipelineColorBlendStateCreateInfo *blend_info,
const VkFormat attachment_formats[MAX_RTS],
uint32_t *blend_enable_mask,
- uint8_t *drawcall_base_cost)
+ uint32_t *color_bandwidth_per_sample)
{
*blend_enable_mask = 0;
- *drawcall_base_cost = 0;
+ *color_bandwidth_per_sample = 0;
bool rop_reads_dst = false;
uint32_t rb_mrt_control_rop = 0;
A6XX_RB_MRT_CONTROL_ROP_CODE(tu6_rop(blend_info->logicOp));
}
- uint32_t total_comps = 0;
+ uint32_t total_bpp = 0;
for (uint32_t i = 0; i < blend_info->attachmentCount; i++) {
const VkPipelineColorBlendAttachmentState *att =
&blend_info->pAttachments[i];
uint32_t rb_mrt_blend_control = 0;
if (format != VK_FORMAT_UNDEFINED) {
const bool has_alpha = vk_format_has_alpha(format);
- const uint32_t write_comps = util_bitcount(att->colorWriteMask);
rb_mrt_control =
tu6_rb_mrt_control(att, rb_mrt_control_rop, has_alpha);
rb_mrt_blend_control = tu6_rb_mrt_blend_control(att, has_alpha);
- total_comps += write_comps;
+ /* calculate bpp based on format and write mask */
+ uint32_t write_bpp = 0;
+ if (att->colorWriteMask == 0xf) {
+ write_bpp = vk_format_get_blocksizebits(format);
+ } else {
+ const enum pipe_format pipe_format = vk_format_to_pipe_format(format);
+ for (uint32_t i = 0; i < 4; i++) {
+ if (att->colorWriteMask & (1 << i)) {
+ write_bpp += util_format_get_component_bits(pipe_format,
+ UTIL_FORMAT_COLORSPACE_RGB, i);
+ }
+ }
+ }
+ total_bpp += write_bpp;
if (att->blendEnable || rop_reads_dst) {
*blend_enable_mask |= 1 << i;
- total_comps += write_comps;
+ total_bpp += write_bpp;
}
}
tu_cs_emit(cs, rb_mrt_blend_control);
}
- *drawcall_base_cost = total_comps / 4;
+ *color_bandwidth_per_sample = total_bpp / 8;
}
static void
builder->create_info->pDepthStencilState;
const VkPipelineRasterizationStateCreateInfo *rast_info =
builder->create_info->pRasterizationState;
+ const enum pipe_format pipe_format =
+ vk_format_to_pipe_format(builder->depth_attachment_format);
uint32_t rb_depth_cntl = 0, rb_stencil_cntl = 0;
struct tu_cs cs;
if (ds_info->depthBoundsTestEnable && !ds_info->depthTestEnable)
tu6_apply_depth_bounds_workaround(builder->device, &rb_depth_cntl);
+
+ pipeline->depth_cpp_per_sample = util_format_get_component_bits(
+ pipe_format, UTIL_FORMAT_COLORSPACE_ZS, 0) / 8;
} else {
/* if RB_DEPTH_CNTL is set dynamically, we need to make sure it is set
* to 0 when this pipeline is used, as enabling depth test when there
A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF |
A6XX_RB_STENCIL_CONTROL_STENCIL_READ;
}
+
+ pipeline->stencil_cpp_per_sample = util_format_get_component_bits(
+ pipe_format, UTIL_FORMAT_COLORSPACE_ZS, 1) / 8;
}
if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_RB_DEPTH_CNTL, 2)) {
tu6_emit_rb_mrt_controls(&cs, blend_info,
builder->color_attachment_formats,
&blend_enable_mask,
- &pipeline->drawcall_base_cost);
+ &pipeline->color_bandwidth_per_sample);
tu6_emit_blend_control(&cs, blend_enable_mask,
builder->use_dual_src_blend, msaa_info);
* to:
*
* foreach_draw (...) {
- * cost += num_frag_outputs;
- * if (blend_enabled)
- * cost += num_blend_enabled;
+ * sum += pipeline->color_bandwidth_per_sample;
* if (depth_test_enabled)
- * cost++;
+ * sum += pipeline->depth_cpp_per_sample;
* if (depth_write_enabled)
- * cost++;
+ * sum += pipeline->depth_cpp_per_sample;
+ * if (stencil_write_enabled)
+ * sum += pipeline->stencil_cpp_per_sample * 2;
* }
+ * drawcall_bandwidth_per_sample = sum / drawcall_count;
*
- * The idea is that each sample-passed minimally does one write
- * per MRT. If blend is enabled, the hw will additionally do
- * a framebuffer read per sample-passed (for each MRT with blend
- * enabled). If depth-test is enabled, the hw will additionally
- * a depth buffer read. If depth-write is enable, the hw will
- * additionally do a depth buffer write.
+ * It allows us to estimate the total bandwidth of drawcalls later, by
+ * calculating (drawcall_bandwidth_per_sample * zpass_sample_count).
*
* This does ignore depth buffer traffic for samples which do not
- * pass do to depth-test fail, and some other details. But it is
+ * pass due to depth-test fail, and some other details. But it is
* just intended to be a rough estimate that is easy to calculate.
*/
- uint32_t total_drawcalls_cost;
+ uint32_t drawcall_bandwidth_per_sample_sum;
struct tu_lrz_state lrz;
bool z_negative_one_to_one;
- /* Base drawcall cost for sysmem vs gmem autotuner */
- uint8_t drawcall_base_cost;
+ /* memory bandwidth cost (in bytes) for color attachments */
+ uint32_t color_bandwidth_per_sample;
+
+ uint32_t depth_cpp_per_sample;
+ uint32_t stencil_cpp_per_sample;
void *executables_mem_ctx;
/* tu_pipeline_executable */