const struct tu_framebuffer *framebuffer,
const struct tu_cmd_buffer *cmd_buffer)
{
- if (cmd_buffer->state.drawcall_count > 5)
+ if (cmd_buffer->state.rp.drawcall_count > 5)
return false;
for (unsigned i = 0; i < pass->subpass_count; i++) {
{
const struct tu_cmd_state *state = &cmd->state;
- if (!state->drawcall_count)
+ if (!state->rp.drawcall_count)
return 0;
/* sample count times drawcall_bandwidth_per_sample */
return (uint64_t)avg_renderpass_sample_count *
- state->drawcall_bandwidth_per_sample_sum / state->drawcall_count;
+ state->rp.drawcall_bandwidth_per_sample_sum / state->rp.drawcall_count;
}
bool
if (TU_AUTOTUNE_DEBUG_LOG) {
const VkExtent2D *extent = &cmd_buffer->state.render_area.extent;
const float drawcall_bandwidth_per_sample =
- (float)cmd_buffer->state.drawcall_bandwidth_per_sample_sum /
- cmd_buffer->state.drawcall_count;
+ (float)cmd_buffer->state.rp.drawcall_bandwidth_per_sample_sum /
+ cmd_buffer->state.rp.drawcall_count;
mesa_logi("autotune %016" PRIx64 ":%u selecting %s",
renderpass_key,
- cmd_buffer->state.drawcall_count,
+ cmd_buffer->state.rp.drawcall_count,
select_sysmem ? "sysmem" : "gmem");
mesa_logi(" avg_samples=%u, draw_bandwidth_per_sample=%.2f, total_draw_call_bandwidth=%" PRIu64,
avg_samples,
* use_sysmem_rendering() should have made sure we only ended up here if no
* XFB was used.
*/
- if (cmd->state.xfb_used) {
+ if (cmd->state.rp.xfb_used) {
assert(fb->binning_possible);
return true;
}
* be multiplied by tile count.
* See https://gitlab.khronos.org/vulkan/vulkan/-/issues/3131
*/
- if (cmd->state.has_prim_generated_query_in_rp ||
+ if (cmd->state.rp.has_prim_generated_query_in_rp ||
cmd->state.prim_generated_query_running_before_rp) {
assert(fb->binning_possible);
return true;
cmd->state.render_area.extent.height == 0)
return true;
- if (cmd->state.has_tess)
+ if (cmd->state.rp.has_tess)
return true;
- if (cmd->state.disable_gmem)
+ if (cmd->state.rp.disable_gmem)
return true;
/* XFB is incompatible with non-hw binning GMEM rendering, see use_hw_binning */
- if (cmd->state.xfb_used && !cmd->state.framebuffer->binning_possible)
+ if (cmd->state.rp.xfb_used && !cmd->state.framebuffer->binning_possible)
return true;
/* QUERY_TYPE_PRIMITIVES_GENERATED is incompatible with non-hw binning
* GMEM rendering, see use_hw_binning.
*/
- if ((cmd->state.has_prim_generated_query_in_rp ||
+ if ((cmd->state.rp.has_prim_generated_query_in_rp ||
cmd->state.prim_generated_query_running_before_rp) &&
!cmd->state.framebuffer->binning_possible)
return true;
}
/* Predicate is changed in draw_cs so we have to re-emit it */
- if (cmd->state.draw_cs_writes_to_cond_pred)
+ if (cmd->state.rp.draw_cs_writes_to_cond_pred)
tu6_emit_cond_for_load_stores(cmd, cs, pipe, slot, false);
tu_cs_emit_call(cs, &cmd->tile_store_cs);
tu_cond_exec_end(cs);
- cmd->state.xfb_used = true;
+ cmd->state.rp.xfb_used = true;
}
VKAPI_ATTR void VKAPI_CALL
}
if (pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT) {
- cmd->state.has_tess = true;
+ cmd->state.rp.has_tess = true;
/* maximum number of patches that can fit in tess factor/param buffers */
uint32_t subdraw_size = MIN2(TU_TESS_FACTOR_SIZE / ir3_tess_factor_stride(pipeline->tess.patch_type),
}
}
+/* Fold the per-renderpass state of one command buffer into another's.
+ * Used when a secondary command buffer's renderpass state has to be
+ * combined into the primary's (see tu_CmdExecuteCommands): boolean
+ * flags are sticky (OR'd), draw-cost counters are summed.
+ */
+static void
+tu_render_pass_state_merge(struct tu_render_pass_state *dst,
+ const struct tu_render_pass_state *src)
+{
+ /* Sticky flags: once set in either buffer, they stay set. */
+ dst->xfb_used |= src->xfb_used;
+ dst->has_tess |= src->has_tess;
+ dst->has_prim_generated_query_in_rp |= src->has_prim_generated_query_in_rp;
+ dst->disable_gmem |= src->disable_gmem;
+ dst->draw_cs_writes_to_cond_pred |= src->draw_cs_writes_to_cond_pred;
+
+ /* Autotuner draw stats are additive across command buffers. */
+ dst->drawcall_count += src->drawcall_count;
+ dst->drawcall_bandwidth_per_sample_sum +=
+ src->drawcall_bandwidth_per_sample_sum;
+}
+
VKAPI_ATTR void VKAPI_CALL
tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
uint32_t commandBufferCount,
break;
}
- if (secondary->state.has_tess) {
- cmd->state.has_tess = true;
- }
- if (secondary->state.disable_gmem)
- cmd->state.disable_gmem = true;
- if (secondary->state.xfb_used)
- cmd->state.xfb_used = true;
- if (secondary->state.has_prim_generated_query_in_rp)
- cmd->state.has_prim_generated_query_in_rp = true;
-
- cmd->state.drawcall_count += secondary->state.drawcall_count;
- cmd->state.drawcall_bandwidth_per_sample_sum +=
- secondary->state.drawcall_bandwidth_per_sample_sum;
-
- cmd->state.draw_cs_writes_to_cond_pred |=
- secondary->state.draw_cs_writes_to_cond_pred;
-
/* If LRZ was made invalid in secondary - we should disable
* LRZ retroactively for the whole renderpass.
*/
if (!secondary->state.lrz.valid)
cmd->state.lrz.valid = false;
+
+ tu_render_pass_state_merge(&cmd->state.rp, &secondary->state.rp);
} else {
assert(tu_cs_is_empty(&secondary->draw_cs));
assert(tu_cs_is_empty(&secondary->draw_epilogue_cs));
return;
}
- cmd->state.draw_cs_writes_to_cond_pred = false;
-
for (unsigned i = 0; i < pass->attachment_count; i++) {
cmd->state.attachments[i] = pAttachmentInfo ?
tu_image_view_from_handle(pAttachmentInfo->pAttachments[i]) :
cmd->state.attachments = cmd->dynamic_attachments;
- cmd->state.draw_cs_writes_to_cond_pred = false;
-
for (unsigned i = 0; i < pRenderingInfo->colorAttachmentCount; i++) {
uint32_t a = cmd->dynamic_subpass.color_attachments[i].attachment;
if (!pRenderingInfo->pColorAttachments[i].imageView)
const struct tu_pipeline *pipeline = cmd->state.pipeline;
/* Fill draw stats for autotuner */
- cmd->state.drawcall_count++;
+ cmd->state.rp.drawcall_count++;
- cmd->state.drawcall_bandwidth_per_sample_sum +=
+ cmd->state.rp.drawcall_bandwidth_per_sample_sum +=
cmd->state.pipeline->color_bandwidth_per_sample;
/* add depth memory bandwidth cost */
const uint32_t depth_bandwidth = cmd->state.pipeline->depth_cpp_per_sample;
if (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE)
- cmd->state.drawcall_bandwidth_per_sample_sum += depth_bandwidth;
+ cmd->state.rp.drawcall_bandwidth_per_sample_sum += depth_bandwidth;
if (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE)
- cmd->state.drawcall_bandwidth_per_sample_sum += depth_bandwidth;
+ cmd->state.rp.drawcall_bandwidth_per_sample_sum += depth_bandwidth;
/* add stencil memory bandwidth cost */
const uint32_t stencil_bandwidth =
cmd->state.pipeline->stencil_cpp_per_sample;
if (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE)
- cmd->state.drawcall_bandwidth_per_sample_sum += stencil_bandwidth * 2;
+ cmd->state.rp.drawcall_bandwidth_per_sample_sum += stencil_bandwidth * 2;
tu_emit_cache_flush_renderpass(cmd, cs);
cmd_buffer->trace_renderpass_end = u_trace_end_iterator(&cmd_buffer->trace);
- if (cmd_buffer->state.has_tess)
+ if (cmd_buffer->state.rp.has_tess)
tu6_lazy_emit_tessfactor_addr(cmd_buffer);
struct tu_renderpass_result *autotune_result = NULL;
cmd_buffer->state.subpass = NULL;
cmd_buffer->state.framebuffer = NULL;
cmd_buffer->state.attachments = NULL;
- cmd_buffer->state.has_tess = false;
- cmd_buffer->state.xfb_used = false;
- cmd_buffer->state.disable_gmem = false;
- cmd_buffer->state.drawcall_count = 0;
- cmd_buffer->state.drawcall_bandwidth_per_sample_sum = 0;
- cmd_buffer->state.has_prim_generated_query_in_rp = false;
+ memset(&cmd_buffer->state.rp, 0, sizeof(cmd_buffer->state.rp));
/* LRZ is not valid next time we use it */
cmd_buffer->state.lrz.valid = false;
*/
if ((srcStage & ~framebuffer_space_stages) ||
(dstStage & ~framebuffer_space_stages)) {
- cmd->state.disable_gmem = true;
+ cmd->state.rp.disable_gmem = true;
}
}
uint32_t first_instance;
};
+/* This should be for state that is set inside a renderpass and used at
+ * renderpass end time, e.g. to decide whether to use sysmem. This needs
+ * special handling for secondary cmdbufs and suspending/resuming render
+ * passes where the state may need to be combined afterwards.
+ */
+struct tu_render_pass_state
+{
+ /* Transform feedback was used inside the renderpass. XFB is
+ * incompatible with non-hw-binning GMEM rendering, so this feeds the
+ * sysmem-vs-gmem decision at renderpass end.
+ */
+ bool xfb_used;
+ /* A pipeline with a tessellation control stage was bound; triggers
+ * lazy emission of the tess factor address at renderpass end.
+ */
+ bool has_tess;
+ /* A VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT query ran inside the
+ * renderpass; like XFB, this forces sysmem when hw binning is not
+ * possible.
+ */
+ bool has_prim_generated_query_in_rp;
+ /* Something in the renderpass (e.g. a barrier involving
+ * non-framebuffer-space stages) ruled out GMEM rendering entirely.
+ */
+ bool disable_gmem;
+
+ /* Track whether conditional predicate for COND_REG_EXEC is changed in draw_cs */
+ bool draw_cs_writes_to_cond_pred;
+
+ /* Number of draws recorded in the renderpass; input to the autotuner's
+ * sysmem-vs-gmem heuristic.
+ */
+ uint32_t drawcall_count;
+
+ /* A calculated "draw cost" value for renderpass, which tries to
+ * estimate the bandwidth-per-sample of all the draws according
+ * to:
+ *
+ * foreach_draw (...) {
+ * sum += pipeline->color_bandwidth_per_sample;
+ * if (depth_test_enabled)
+ * sum += pipeline->depth_cpp_per_sample;
+ * if (depth_write_enabled)
+ * sum += pipeline->depth_cpp_per_sample;
+ * if (stencil_write_enabled)
+ * sum += pipeline->stencil_cpp_per_sample * 2;
+ * }
+ * drawcall_bandwidth_per_sample = sum / drawcall_count;
+ *
+ * It allows us to estimate the total bandwidth of drawcalls later, by
+ * calculating (drawcall_bandwidth_per_sample * zpass_sample_count).
+ *
+ * This does ignore depth buffer traffic for samples which do not
+ * pass due to depth-test fail, and some other details. But it is
+ * just intended to be a rough estimate that is easy to calculate.
+ */
+ uint32_t drawcall_bandwidth_per_sample_sum;
+};
+
struct tu_cmd_state
{
uint32_t dirty;
struct tu_pipeline *pipeline;
struct tu_pipeline *compute_pipeline;
+ struct tu_render_pass_state rp;
+
/* Vertex buffers, viewports, and scissors
* the states for these can be updated partially, so we need to save these
* to be able to emit a complete draw state
VkRect2D render_area;
const struct tu_image_view **attachments;
- /* Track whether conditional predicate for COND_REG_EXEC is changed in draw_cs */
- bool draw_cs_writes_to_cond_pred;
- bool xfb_used;
- bool has_tess;
bool tessfactor_addr_set;
bool predication_active;
- bool disable_gmem;
enum a5xx_line_mode line_mode;
bool z_negative_one_to_one;
- uint32_t drawcall_count;
-
- /* A calculated "draw cost" value for renderpass, which tries to
- * estimate the bandwidth-per-sample of all the draws according
- * to:
- *
- * foreach_draw (...) {
- * sum += pipeline->color_bandwidth_per_sample;
- * if (depth_test_enabled)
- * sum += pipeline->depth_cpp_per_sample;
- * if (depth_write_enabled)
- * sum += pipeline->depth_cpp_per_sample;
- * if (stencil_write_enabled)
- * sum += pipeline->stencil_cpp_per_sample * 2;
- * }
- * drawcall_bandwidth_per_sample = sum / drawcall_count;
- *
- * It allows us to estimate the total bandwidth of drawcalls later, by
- * calculating (drawcall_bandwidth_per_sample * zpass_sample_count).
- *
- * This does ignore depth buffer traffic for samples which do not
- * pass due to depth-test fail, and some other details. But it is
- * just intended to be a rough estimate that is easy to calculate.
- */
- uint32_t drawcall_bandwidth_per_sample_sum;
-
/* VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT and
* VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT are allowed to run simultaniously,
* but they use the same {START,STOP}_PRIMITIVE_CTRS control.
uint32_t prim_counters_running;
bool prim_generated_query_running_before_rp;
- bool has_prim_generated_query_in_rp;
struct tu_lrz_state lrz;