tu_cs_emit_regs(cs,
A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE));
- tu_cs_emit_regs(cs,
- A6XX_GRAS_LRZ_BUFFER_BASE(0),
- A6XX_GRAS_LRZ_BUFFER_PITCH(0),
- A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0));
-
tu_cs_emit_regs(cs, A6XX_RB_STENCIL_INFO(0));
return;
tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_FLAG_BUFFER_BASE, 3);
tu_cs_image_flag_ref(cs, &iview->view, 0);
- tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_BUFFER_BASE(.qword = iview->image->iova + iview->image->lrz_offset),
- A6XX_GRAS_LRZ_BUFFER_PITCH(.pitch = iview->image->lrz_pitch),
- A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE());
-
if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT ||
attachment->format == VK_FORMAT_S8_UINT) {
tu_emit_input_attachments(cmd, subpass, false);
}
+
static void
tu_emit_renderpass_begin(struct tu_cmd_buffer *cmd,
const VkRenderPassBeginInfo *info)
{
const struct tu_framebuffer *fb = cmd->state.framebuffer;
+ tu_lrz_sysmem_begin(cmd, cs);
+
assert(fb->width > 0 && fb->height > 0);
tu6_emit_window_scissor(cs, 0, 0, fb->width - 1, fb->height - 1);
tu6_emit_window_offset(cs, 0, 0);
tu6_emit_bin_size(cs, 0, 0,
- A6XX_RB_BIN_CONTROL_BUFFERS_LOCATION(BUFFERS_IN_SYSMEM));
-
- tu6_emit_event_write(cmd, cs, LRZ_FLUSH);
+ A6XX_RB_BIN_CONTROL_BUFFERS_LOCATION(BUFFERS_IN_SYSMEM) |
+ A6XX_RB_BIN_CONTROL_FORCE_LRZ_WRITE_DIS);
tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BYPASS));
tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
tu_cs_emit(cs, 0x0);
- tu6_emit_event_write(cmd, cs, LRZ_FLUSH);
+ tu_lrz_sysmem_end(cmd, cs);
tu_cs_sanity_check(cs);
}
{
struct tu_physical_device *phys_dev = cmd->device->physical_device;
- tu6_emit_event_write(cmd, cs, LRZ_FLUSH);
+ tu_lrz_tiling_begin(cmd, cs);
tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
tu_cs_emit(cs, 0x0);
tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);
- tu_cs_emit_regs(cs,
- A6XX_GRAS_LRZ_CNTL(0));
-
- tu6_emit_event_write(cmd, cs, LRZ_FLUSH);
+ tu_lrz_tiling_end(cmd, cs);
tu6_emit_event_write(cmd, cs, PC_CCU_RESOLVE_TS);
}
if (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
+ TU_FROM_HANDLE(tu_framebuffer, fb, pBeginInfo->pInheritanceInfo->framebuffer);
+
cmd_buffer->state.pass = tu_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
cmd_buffer->state.subpass =
&cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
+
+ tu_lrz_begin_secondary_cmdbuf(cmd_buffer, fb);
} else {
/* When executing in the middle of another command buffer, the CCU
* state is unknown.
cmd->state.draw_cs_writes_to_cond_pred |=
secondary->state.draw_cs_writes_to_cond_pred;
+ /* If LRZ was invalidated in the secondary, we should disable
+ * LRZ retroactively for the whole renderpass.
+ */
+ if (!secondary->state.lrz.valid)
+ cmd->state.lrz.valid = false;
} else {
assert(tu_cs_is_empty(&secondary->draw_cs));
assert(tu_cs_is_empty(&secondary->draw_epilogue_cs));
}
cmd->state.dirty = ~0u; /* TODO: set dirty only what needs to be */
- if (cmd->state.pass) {
+ if (!cmd->state.lrz.gpu_dir_tracking && cmd->state.pass) {
/* After a secondary command buffer is executed, LRZ is not valid
* until it is cleared again.
*/
if (pass->subpasses[0].feedback_invalidate)
cmd->state.renderpass_cache.flush_bits |= TU_CMD_FLAG_CACHE_INVALIDATE;
- /* Track LRZ valid state */
- uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
- if (a != VK_ATTACHMENT_UNUSED) {
- const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
- struct tu_image *image = cmd->state.attachments[a]->image;
- /* if image has lrz and it isn't a stencil-only clear: */
- if (image->lrz_height &&
- (att->clear_mask & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT))) {
- cmd->state.lrz.image = image;
- cmd->state.lrz.valid = true;
- cmd->state.lrz.prev_direction = TU_LRZ_UNKNOWN;
-
- tu6_clear_lrz(cmd, &cmd->cs, image, &pRenderPassBegin->pClearValues[a]);
-
- /* Clearing writes via CCU color in the PS stage, and LRZ is read via
- * UCHE in the earlier GRAS stage.
- */
- cmd->state.cache.flush_bits |=
- TU_CMD_FLAG_CCU_FLUSH_COLOR | TU_CMD_FLAG_CACHE_INVALIDATE |
- TU_CMD_FLAG_WAIT_FOR_IDLE;
- } else {
- cmd->state.lrz.valid = false;
- }
- cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
- }
+ tu_lrz_begin_renderpass(cmd, pRenderPassBegin);
cmd->trace_renderpass_start = u_trace_end_iterator(&cmd->trace);
return tu_cs_end_draw_state(&cmd->sub_cs, &cs);
}
-/* update lrz state based on stencil-test func:
- *
- * Conceptually the order of the pipeline is:
- *
- *
- * FS -> Alpha-Test -> Stencil-Test -> Depth-Test
- * | |
- * if wrmask != 0 if wrmask != 0
- * | |
- * v v
- * Stencil-Write Depth-Write
- *
- * Because Stencil-Test can have side effects (Stencil-Write) prior
- * to depth test, in this case we potentially need to disable early
- * lrz-test. See:
- *
- * https://www.khronos.org/opengl/wiki/Per-Sample_Processing
- */
-static void
-tu6_lrz_stencil_op(struct A6XX_GRAS_LRZ_CNTL *gras_lrz_cntl,
- VkCompareOp func,
- bool stencil_write,
- bool *invalidate_lrz)
-{
- switch (func) {
- case VK_COMPARE_OP_ALWAYS:
- /* nothing to do for LRZ, but for stencil test when stencil-
- * write is enabled, we need to disable lrz-test, since
- * conceptually stencil test and write happens before depth-test.
- */
- if (stencil_write) {
- gras_lrz_cntl->enable = false;
- gras_lrz_cntl->z_test_enable = false;
- *invalidate_lrz = true;
- }
- break;
- case VK_COMPARE_OP_NEVER:
- /* fragment never passes, disable lrz_write for this draw. */
- gras_lrz_cntl->lrz_write = false;
- break;
- default:
- /* whether the fragment passes or not depends on result
- * of stencil test, which we cannot know when doing binning
- * pass.
- */
- gras_lrz_cntl->lrz_write = false;
- /* similarly to the VK_COMPARE_OP_ALWAYS case, if there are side-
- * effects from stencil test we need to disable lrz-test.
- */
- if (stencil_write) {
- gras_lrz_cntl->enable = false;
- gras_lrz_cntl->z_test_enable = false;
- *invalidate_lrz = true;
- }
- break;
- }
-}
-
-static struct A6XX_GRAS_LRZ_CNTL
-tu6_calculate_lrz_state(struct tu_cmd_buffer *cmd,
- const uint32_t a)
-{
- struct tu_pipeline *pipeline = cmd->state.pipeline;
- bool z_test_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE;
- bool z_write_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
- bool z_read_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE;
- bool z_bounds_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE;
- VkCompareOp depth_compare_op = (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_ZFUNC__MASK) >> A6XX_RB_DEPTH_CNTL_ZFUNC__SHIFT;
-
- struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = { 0 };
-
- /* What happens in FS could affect LRZ, e.g.: writes to gl_FragDepth
- * or early fragment tests.
- */
- if (pipeline->lrz.force_disable_mask & TU_LRZ_FORCE_DISABLE_LRZ) {
- cmd->state.lrz.valid = false;
- return gras_lrz_cntl;
- }
-
- /* If depth test is disabled we shouldn't touch LRZ.
- * Same if there is no depth attachment.
- */
- if (a == VK_ATTACHMENT_UNUSED || !z_test_enable ||
- (cmd->device->instance->debug_flags & TU_DEBUG_NOLRZ))
- return gras_lrz_cntl;
-
- if (!cmd->state.attachments) {
- /* Secondary cmdbuf - there is nothing we could do. */
- return gras_lrz_cntl;
- }
-
- gras_lrz_cntl.enable = z_test_enable;
- gras_lrz_cntl.lrz_write =
- z_write_enable &&
- !(pipeline->lrz.force_disable_mask & TU_LRZ_FORCE_DISABLE_WRITE);
- gras_lrz_cntl.z_test_enable = z_read_enable;
- gras_lrz_cntl.z_bounds_enable = z_bounds_enable;
-
- /* See comment in tu_pipeline about disabling LRZ write for blending. */
- if ((cmd->state.pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_LOGIC_OP)) &&
- cmd->state.logic_op_enabled && cmd->state.rop_reads_dst)
- gras_lrz_cntl.lrz_write = false;
-
- if ((cmd->state.pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_COLOR_WRITE_ENABLE)) &&
- cmd->state.color_write_enable != MASK(cmd->state.pipeline->num_rts))
- gras_lrz_cntl.lrz_write = false;
-
- /* LRZ is disabled until it is cleared, which means that one "wrong"
- * depth test or shader could disable LRZ until depth buffer is cleared.
- */
- bool disable_lrz = false;
- bool temporary_disable_lrz = false;
-
- /* If Z is not written - it doesn't affect LRZ buffer state.
- * Which means two things:
- * - Don't lock direction until Z is written for the first time;
- * - If Z isn't written and direction IS locked it's possible to just
- * temporary disable LRZ instead of fully bailing out, when direction
- * is changed.
- */
-
- enum tu_lrz_direction lrz_direction = TU_LRZ_UNKNOWN;
- switch (depth_compare_op) {
- case VK_COMPARE_OP_ALWAYS:
- case VK_COMPARE_OP_NOT_EQUAL:
- /* OP_ALWAYS and OP_NOT_EQUAL could have depth value of any direction,
- * so if there is a depth write - LRZ must be disabled.
- */
- if (z_write_enable) {
- perf_debug(cmd->device, "Invalidating LRZ due to ALWAYS/NOT_EQUAL");
- disable_lrz = true;
- } else {
- perf_debug(cmd->device, "Skipping LRZ due to ALWAYS/NOT_EQUAL");
- temporary_disable_lrz = true;
- }
- break;
- case VK_COMPARE_OP_EQUAL:
- case VK_COMPARE_OP_NEVER:
- /* Blob disables LRZ for OP_EQUAL, and from our empirical
- * evidence it is a right thing to do.
- *
- * Both OP_EQUAL and OP_NEVER don't change LRZ buffer so
- * we could just temporary disable LRZ.
- */
- temporary_disable_lrz = true;
- break;
- case VK_COMPARE_OP_GREATER:
- case VK_COMPARE_OP_GREATER_OR_EQUAL:
- lrz_direction = TU_LRZ_GREATER;
- gras_lrz_cntl.greater = true;
- break;
- case VK_COMPARE_OP_LESS:
- case VK_COMPARE_OP_LESS_OR_EQUAL:
- lrz_direction = TU_LRZ_LESS;
- gras_lrz_cntl.greater = false;
- break;
- default:
- unreachable("bad VK_COMPARE_OP value or uninitialized");
- break;
- };
-
- /* If depthfunc direction is changed, bail out on using LRZ. The
- * LRZ buffer encodes a min/max depth value per block, but if
- * we switch from GT/GE <-> LT/LE, those values cannot be
- * interpreted properly.
- */
- if (cmd->state.lrz.prev_direction != TU_LRZ_UNKNOWN &&
- lrz_direction != TU_LRZ_UNKNOWN &&
- cmd->state.lrz.prev_direction != lrz_direction) {
- if (z_write_enable) {
- perf_debug(cmd->device, "Invalidating LRZ due to direction change");
- disable_lrz = true;
- } else {
- perf_debug(cmd->device, "Skipping LRZ due to direction change");
- temporary_disable_lrz = true;
- }
- }
-
- /* Consider the following sequence of depthfunc changes:
- *
- * - COMPARE_OP_GREATER -> COMPARE_OP_EQUAL -> COMPARE_OP_GREATER
- * LRZ is disabled during COMPARE_OP_EQUAL but could be enabled
- * during second VK_COMPARE_OP_GREATER.
- *
- * - COMPARE_OP_GREATER -> COMPARE_OP_EQUAL -> COMPARE_OP_LESS
- * Here, LRZ is disabled during COMPARE_OP_EQUAL and should become
- * invalid during COMPARE_OP_LESS.
- *
- * This shows that we should keep last KNOWN direction.
- */
- if (z_write_enable && lrz_direction != TU_LRZ_UNKNOWN)
- cmd->state.lrz.prev_direction = lrz_direction;
-
- /* Invalidate LRZ and disable write if stencil test is enabled */
- bool stencil_test_enable = cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE;
- if (stencil_test_enable) {
- bool stencil_front_writemask =
- (pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ?
- (cmd->state.dynamic_stencil_wrmask & 0xff) :
- (pipeline->stencil_wrmask & 0xff);
-
- bool stencil_back_writemask =
- (pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ?
- ((cmd->state.dynamic_stencil_wrmask & 0xff00) >> 8) :
- (pipeline->stencil_wrmask & 0xff00) >> 8;
-
- VkCompareOp stencil_front_compare_op =
- (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FUNC__MASK) >> A6XX_RB_STENCIL_CONTROL_FUNC__SHIFT;
-
- VkCompareOp stencil_back_compare_op =
- (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FUNC_BF__MASK) >> A6XX_RB_STENCIL_CONTROL_FUNC_BF__SHIFT;
-
- tu6_lrz_stencil_op(&gras_lrz_cntl, stencil_front_compare_op,
- stencil_front_writemask, &disable_lrz);
-
- tu6_lrz_stencil_op(&gras_lrz_cntl, stencil_back_compare_op,
- stencil_back_writemask, &disable_lrz);
- }
-
- if (disable_lrz)
- cmd->state.lrz.valid = false;
-
- if (temporary_disable_lrz)
- gras_lrz_cntl.enable = false;
-
- cmd->state.lrz.enabled = cmd->state.lrz.valid && gras_lrz_cntl.enable;
- if (!cmd->state.lrz.enabled)
- memset(&gras_lrz_cntl, 0, sizeof(gras_lrz_cntl));
-
- return gras_lrz_cntl;
-}
-
-static void
-tu6_build_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
-{
- const uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
- struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = tu6_calculate_lrz_state(cmd, a);
-
- tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(
- .enable = gras_lrz_cntl.enable,
- .greater = gras_lrz_cntl.greater,
- .lrz_write = gras_lrz_cntl.lrz_write,
- .z_test_enable = gras_lrz_cntl.z_test_enable,
- .z_bounds_enable = gras_lrz_cntl.z_bounds_enable));
- tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(.enable = gras_lrz_cntl.enable));
-}
-
static bool
tu6_writes_depth(struct tu_cmd_buffer *cmd, bool depth_test_enable)
{
struct tu_cs cs;
cmd->state.lrz_and_depth_plane_state =
tu_cs_draw_state(&cmd->sub_cs, &cs, 8);
- tu6_build_lrz(cmd, &cs);
+ tu6_emit_lrz(cmd, &cs);
tu6_build_depth_plane_z_mode(cmd, &cs);
}
--- /dev/null
+/*
+ * Copyright © 2022 Igalia S.L.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "tu_private.h"
+
+#include "tu_cs.h"
+
+/* The low-resolution Z buffer is very similar to a depth prepass: it helps
+ * the HW avoid executing the fragment shader on fragments that will be
+ * subsequently discarded by the depth test.
+ *
+ * The interesting part of this feature is that it allows applications
+ * to submit the vertices in any order.
+ *
+ * In the binning pass it is possible to store the depth value of each
+ * vertex into the internal low-resolution depth buffer and to quickly
+ * test primitives against it during the render pass.
+ *
+ * There are a number of cases in which LRZ cannot be used:
+ * - Fragment shader side-effects (writing to SSBOs, atomic operations, etc);
+ * - Writing to stencil buffer
+ * - Writing depth while:
+ * - Changing direction of depth test (e.g. from OP_GREATER to OP_LESS);
+ * - Using OP_ALWAYS or OP_NOT_EQUAL;
+ * - Clearing depth with vkCmdClearAttachments;
+ * - (pre-a650) Not clearing depth attachment with LOAD_OP_CLEAR;
+ * - (pre-a650) Using secondary command buffers;
+ * - Sysmem rendering (with small caveat).
+ *
+ * Pre-a650 (before gen3)
+ * ======================
+ *
+ * The direction is fully tracked on the CPU. Within a renderpass LRZ starts
+ * with an unknown direction; the direction is set the first time a depth
+ * write occurs, and if it changes afterwards the direction becomes invalid
+ * and LRZ is disabled for the rest of the renderpass.
+ *
+ * Since the direction is not tracked by the GPU, it's impossible to know
+ * whether LRZ is enabled during construction of secondary command buffers.
+ *
+ * For the same reason it's impossible to reuse LRZ between renderpasses.
+ *
+ * A650+ (gen3+)
+ * =============
+ *
+ * Now the LRZ direction can be tracked on the GPU. There are two parts:
+ * - Direction byte which stores the current LRZ direction;
+ * - Parameters of the last used depth view.
+ *
+ * The idea is the same as with CPU tracking: when GRAS_LRZ_CNTL is used,
+ * its direction is compared to the previously known direction, and the
+ * direction byte is set to disabled when the directions are incompatible.
+ *
+ * Additionally, to reuse LRZ between renderpasses, GRAS_LRZ_CNTL checks
+ * whether the current value of GRAS_LRZ_DEPTH_VIEW is equal to the value
+ * stored in the buffer; if not, LRZ is disabled. (This is necessary
+ * because the depth buffer may have several layers and mip levels, while
+ * the LRZ buffer represents only a single layer + mip level.)
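+ *
+ * For example (an illustration, not from hardware docs): rendering to
+ * layer 2 / mip 1 of a depth image programs GRAS_LRZ_DEPTH_VIEW with
+ * .base_layer = 2, .layer_count = 1, .base_mip_level = 1; a later
+ * renderpass binding any other subresource of the same image produces a
+ * different view value, the comparison fails and LRZ is disabled.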
+ *
+ * LRZ direction between renderpasses is disabled when the underlying depth
+ * buffer is changed; the following commands can change the depth image:
+ * - vkCmdBlitImage*
+ * - vkCmdCopyBufferToImage*
+ * - vkCmdCopyImage*
+ *
+ * LRZ Fast-Clear
+ * ==============
+ *
+ * The LRZ fast-clear buffer is initialized to zeroes and read/written
+ * when GRAS_LRZ_CNTL.FC_ENABLE (b3) is set. It appears to store 1b/block.
+ * '0' means block has original depth clear value, and '1' means that the
+ * corresponding block in LRZ has been modified.
+ *
+ * LRZ fast-clear conservatively clears the LRZ buffer: at the point where
+ * LRZ is written, the LRZ block which corresponds to a single fast-clear
+ * bit is cleared:
+ * - To 0.0 if depth comparison is GREATER;
+ * - To 1.0 if depth comparison is LESS;
+ *
+ * This way it's always valid to fast-clear. On the other hand, we disable
+ * fast-clear if the depth clear value is not 0.0 or 1.0, because it may
+ * hurt performance if some primitives are expected to fail the depth test
+ * against the actual depth clear value.
+ *
+ * LRZ Precision
+ * =============
+ *
+ * LRZ always uses Z16_UNORM. Its epsilon is 1.f / (1 << 16), which is not
+ * enough to represent all values of Z32_UNORM or Z32_FLOAT.
+ * This especially raises questions in the context of fast-clear: if
+ * fast-clear uses a value which cannot be precisely represented by LRZ,
+ * we wouldn't be able to round it in the correct direction, since the
+ * direction is tracked on the GPU.
+ *
+ * However, it seems that depth comparisons with LRZ values have some "slack"
+ * and nothing special should be done for such depth clear values.
+ *
+ * How it was tested:
+ * - Clear Z32_FLOAT attachment to 1.f / (1 << 17)
+ * - LRZ buffer contains all zeroes
+ * - Do draws and check whether all samples are passing:
+ * - OP_GREATER with (1.f / (1 << 17) + float32_epsilon) - passing;
+ * - OP_GREATER with (1.f / (1 << 17) - float32_epsilon) - not passing;
+ * - OP_LESS with (1.f / (1 << 17) - float32_epsilon) - passing;
+ * - OP_LESS with (1.f / (1 << 17) + float32_epsilon) - not passing;
+ * - OP_LESS_OR_EQ with (1.f / (1 << 17) + float32_epsilon) - not passing;
+ * In all cases resulting LRZ buffer is all zeroes and LRZ direction is updated.
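+ *
+ * A worked example of the quantization gap (illustrative arithmetic): one
+ * Z16_UNORM step is 1.f / (1 << 16) ~= 1.53e-5, while the clear value
+ * 1.f / (1 << 17) ~= 7.6e-6 lies strictly between 0 and the first
+ * representable step, so LRZ itself could only store 0 or 1.f / (1 << 16)
+ * for it.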
+ *
+ * LRZ Caches
+ * ==========
+ *
+ * ! The policy here is to flush the LRZ cache right after it is changed,
+ * so if LRZ data is needed afterwards there is no need to flush it
+ * before using LRZ.
+ *
+ * LRZ_FLUSH flushes and invalidates the LRZ caches; there are two of them:
+ * - Cache for the fast-clear buffer;
+ * - Cache for the direction byte + depth view params.
+ * They can be cleared by LRZ_CLEAR. To become visible in GPU memory
+ * the caches should be flushed with LRZ_FLUSH afterwards.
+ *
+ * GRAS_LRZ_CNTL reads from these caches.
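+ *
+ * For example, tu6_disable_lrz_via_depth_view and tu_lrz_clear_depth_image
+ * below follow this policy by emitting LRZ_CLEAR directly followed by
+ * LRZ_FLUSH.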
+ */
+
+static void
+tu6_emit_lrz_buffer(struct tu_cs *cs, struct tu_image *depth_image)
+{
+ if (!depth_image) {
+ tu_cs_emit_regs(cs,
+ A6XX_GRAS_LRZ_BUFFER_BASE(0),
+ A6XX_GRAS_LRZ_BUFFER_PITCH(0),
+ A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0));
+ return;
+ }
+
+ uint64_t lrz_iova = depth_image->iova + depth_image->lrz_offset;
+ uint64_t lrz_fc_iova = depth_image->iova + depth_image->lrz_fc_offset;
+ if (!depth_image->lrz_fc_offset)
+ lrz_fc_iova = 0;
+
+ tu_cs_emit_regs(cs,
+ A6XX_GRAS_LRZ_BUFFER_BASE(.qword = lrz_iova),
+ A6XX_GRAS_LRZ_BUFFER_PITCH(.pitch = depth_image->lrz_pitch),
+ A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(.qword = lrz_fc_iova));
+}
+
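+/* On GPUs with the lrz_track_quirk the CP firmware has to observe every
+ * write to the LRZ registers, so they are routed through CP_REG_WRITE with
+ * the TRACK_LRZ tracker instead of a plain PKT4 write, e.g.:
+ *
+ *    tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(.enable = true));
+ */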
+static void
+tu6_write_lrz_reg(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
+ struct tu_reg_value reg)
+{
+ if (cmd->device->physical_device->info->a6xx.lrz_track_quirk) {
+ tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
+ tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(TRACK_LRZ));
+ tu_cs_emit(cs, reg.reg);
+ tu_cs_emit(cs, reg.value);
+ } else {
+ tu_cs_emit_pkt4(cs, reg.reg, 1);
+ tu_cs_emit(cs, reg.value);
+ }
+}
+
+static void
+tu6_disable_lrz_via_depth_view(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
+{
+ /* Disable direction by writing invalid depth view. */
+ tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_DEPTH_VIEW(
+ .base_layer = 0b11111111111,
+ .layer_count = 0b11111111111,
+ .base_mip_level = 0b1111,
+ ));
+
+ tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(
+ .enable = true,
+ .disable_on_wrong_dir = true,
+ ));
+
+ tu6_emit_event_write(cmd, cs, LRZ_CLEAR);
+ tu6_emit_event_write(cmd, cs, LRZ_FLUSH);
+}
+
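+/* Set up the initial CPU-side LRZ state for a depth attachment. Without
+ * GPU direction tracking, LRZ is only usable when the renderpass clears
+ * depth; with tracking, the previous LRZ state may also be reused when
+ * the attachment is loaded.
+ */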
+static void
+tu_lrz_init_state(struct tu_cmd_buffer *cmd,
+ const struct tu_render_pass_attachment *att,
+ const struct tu_image_view *view)
+{
+ if (!view->image->lrz_height)
+ return;
+
+ bool clears_depth = att->clear_mask &
+ (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT);
+ bool has_gpu_tracking =
+ cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking;
+
+ if (!has_gpu_tracking && !clears_depth)
+ return;
+
+ if (!clears_depth && !att->load)
+ return;
+
+ cmd->state.lrz.image_view = view;
+ cmd->state.lrz.valid = true;
+ cmd->state.lrz.prev_direction = TU_LRZ_UNKNOWN;
+ /* Be optimistic and unconditionally enable fast-clear in
+ * secondary cmdbufs and when reusing previous LRZ state.
+ */
+ cmd->state.lrz.fast_clear = view->image->lrz_fc_size > 0;
+
+ cmd->state.lrz.gpu_dir_tracking = has_gpu_tracking;
+ cmd->state.lrz.reuse_previous_state = !clears_depth;
+}
+
+void
+tu_lrz_begin_renderpass(struct tu_cmd_buffer *cmd,
+ const VkRenderPassBeginInfo *pRenderPassBegin)
+{
+ const struct tu_render_pass *pass = cmd->state.pass;
+
+ int lrz_img_count = 0;
+ for (unsigned i = 0; i < pass->attachment_count; i++) {
+ if (cmd->state.attachments[i]->image->lrz_height)
+ lrz_img_count++;
+ }
+
+ if (cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking &&
+ cmd->state.pass->subpass_count > 1 && lrz_img_count > 1) {
+ /* Theoretically we could switch between LRZ buffers during the binning
+ * and tiling passes, but it is untested and would add complexity for
+ * a presumably extremely rare case.
+ */
+ perf_debug(cmd->device,
+ "Invalidating LRZ because there are several subpasses with "
+ "different depth attachments in a single renderpass");
+
+ for (unsigned i = 0; i < pass->attachment_count; i++) {
+ struct tu_image *image = cmd->state.attachments[i]->image;
+ tu_disable_lrz(cmd, &cmd->cs, image);
+ }
+ }
+
+ /* Track LRZ valid state */
+ cmd->state.lrz.valid = false;
+ uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
+ if (a != VK_ATTACHMENT_UNUSED) {
+ const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
+ tu_lrz_init_state(cmd, att, cmd->state.attachments[a]);
+ if (att->clear_mask & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT)) {
+ VkClearValue clear = pRenderPassBegin->pClearValues[a];
+ cmd->state.lrz.depth_clear_value = clear;
+ cmd->state.lrz.fast_clear = cmd->state.lrz.fast_clear &&
+ (clear.depthStencil.depth == 0.f ||
+ clear.depthStencil.depth == 1.f);
+ }
+ cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
+ }
+
+ if (!cmd->state.lrz.valid) {
+ memset(&cmd->state.lrz, 0, sizeof(cmd->state.lrz));
+ tu6_emit_lrz_buffer(&cmd->cs, NULL);
+ }
+}
+
+void
+tu_lrz_begin_secondary_cmdbuf(struct tu_cmd_buffer *cmd,
+ struct tu_framebuffer *fb)
+{
+ uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
+ if (a != VK_ATTACHMENT_UNUSED &&
+ cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking) {
+ const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
+ struct tu_image_view *view = fb->attachments[a].attachment;
+
+ tu_lrz_init_state(cmd, att, view);
+ }
+}
+
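+/* Prepare LRZ for the tiled renderpass: bind the LRZ buffer and either
+ * reuse the previous LRZ state, invalidate LRZ via a bogus depth view,
+ * or clear it (with fast-clear when possible, otherwise via a full
+ * tu6_clear_lrz blit).
+ */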
+void
+tu_lrz_tiling_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
+{
+ if (!cmd->state.lrz.image_view)
+ return;
+
+ struct tu_lrz_state *lrz = &cmd->state.lrz;
+
+ tu6_emit_lrz_buffer(cs, lrz->image_view->image);
+
+ if (lrz->reuse_previous_state) {
+ /* Reuse the previous LRZ state; the LRZ cache is assumed to have been
+ * invalidated by the previous renderpass.
+ */
+ assert(lrz->gpu_dir_tracking);
+
+ tu6_write_lrz_reg(cmd, cs,
+ A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = lrz->image_view->view.GRAS_LRZ_DEPTH_VIEW));
+ return;
+ }
+
+ bool invalidate_lrz = !lrz->valid && lrz->gpu_dir_tracking;
+ if (invalidate_lrz) {
+ /* Following the blob we elect to disable LRZ for the whole renderpass
+ * if it is known that LRZ is disabled somewhere in the renderpass.
+ *
+ * This is accomplished by making the later GRAS_LRZ_CNTL (in the binning
+ * pass) fail the comparison of depth views.
+ */
+ tu6_disable_lrz_via_depth_view(cmd, cs);
+ tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = 0));
+ } else if (lrz->fast_clear || lrz->gpu_dir_tracking) {
+ if (lrz->gpu_dir_tracking) {
+ tu6_write_lrz_reg(cmd, cs,
+ A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = lrz->image_view->view.GRAS_LRZ_DEPTH_VIEW));
+ }
+
+ tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(
+ .enable = true,
+ .fc_enable = lrz->fast_clear,
+ .disable_on_wrong_dir = lrz->gpu_dir_tracking,
+ ));
+
+ /* GRAS_LRZ_CNTL.fc_enable + LRZ_CLEAR - clears the fast-clear buffer;
+ * GRAS_LRZ_CNTL.disable_on_wrong_dir + LRZ_CLEAR - sets the direction to
+ * CUR_DIR_UNSET.
+ */
+ tu6_emit_event_write(cmd, cs, LRZ_CLEAR);
+ }
+
+ if (!lrz->fast_clear && !invalidate_lrz) {
+ tu6_clear_lrz(cmd, cs, lrz->image_view->image, &lrz->depth_clear_value);
+
+ /* Even though we disable fast-clear we still have to dirty the
+ * fast-clear buffer, because both secondary cmdbufs and following
+ * renderpasses won't know that fast-clear is disabled.
+ *
+ * TODO: we could avoid this if we don't store depth and don't
+ * expect secondary cmdbufs.
+ */
+ if (lrz->image_view->image->lrz_fc_size) {
+ tu6_dirty_lrz_fc(cmd, cs, lrz->image_view->image);
+ }
+ }
+}
+
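+/* Finish LRZ at the end of the tiled renderpass: flush the LRZ caches so
+ * that the fast-clear and direction data become visible in memory for a
+ * following renderpass.
+ */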
+void
+tu_lrz_tiling_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
+{
+ if (cmd->state.lrz.fast_clear || cmd->state.lrz.gpu_dir_tracking) {
+ tu6_emit_lrz_buffer(cs, cmd->state.lrz.image_view->image);
+
+ if (cmd->state.lrz.gpu_dir_tracking) {
+ tu6_write_lrz_reg(cmd, &cmd->cs,
+ A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = cmd->state.lrz.image_view->view.GRAS_LRZ_DEPTH_VIEW));
+ }
+
+ /* Enable flushing of LRZ fast-clear and of direction buffer */
+ tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(
+ .enable = true,
+ .fc_enable = cmd->state.lrz.fast_clear,
+ .disable_on_wrong_dir = cmd->state.lrz.gpu_dir_tracking,
+ ));
+ } else {
+ tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(0));
+ }
+
+ tu6_emit_event_write(cmd, cs, LRZ_FLUSH);
+
+ /* If gpu_dir_tracking is enabled and LRZ is not valid, the blob at this
+ * point additionally clears the direction buffer:
+ *    GRAS_LRZ_DEPTH_VIEW(.dword = 0)
+ *    GRAS_LRZ_DEPTH_VIEW(.dword = 0xffffffff)
+ *    A6XX_GRAS_LRZ_CNTL(.enable = true, .disable_on_wrong_dir = true)
+ *    LRZ_CLEAR
+ *    LRZ_FLUSH
+ * Since this happens after all of the rendering is done, there is no known
+ * reason to do such a clear.
+ */
+}
+
+void
+tu_lrz_sysmem_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
+{
+ if (!cmd->state.lrz.image_view)
+ return;
+
+ /* In theory the LRZ buffer could be filled during sysmem rendering and
+ * reused in another renderpass, but the benefit is rather dubious.
+ */
+
+ struct tu_lrz_state *lrz = &cmd->state.lrz;
+
+ if (cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking) {
+ tu_disable_lrz(cmd, cs, lrz->image_view->image);
+ /* Make sure depth view comparison will fail. */
+ tu6_write_lrz_reg(cmd, cs,
+ A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = 0));
+ } else {
+ tu6_emit_lrz_buffer(cs, lrz->image_view->image);
+ /* Even though we disable LRZ writes in sysmem mode, the LRZ test still
+ * happens, so LRZ should be cleared.
+ */
+ if (lrz->fast_clear) {
+ tu6_write_lrz_reg(cmd, &cmd->cs, A6XX_GRAS_LRZ_CNTL(
+ .enable = true,
+ .fc_enable = true,
+ ));
+ tu6_emit_event_write(cmd, &cmd->cs, LRZ_CLEAR);
+ tu6_emit_event_write(cmd, &cmd->cs, LRZ_FLUSH);
+ } else {
+ tu6_clear_lrz(cmd, cs, lrz->image_view->image, &lrz->depth_clear_value);
+ }
+ }
+}
+
+void
+tu_lrz_sysmem_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
+{
+ tu6_emit_event_write(cmd, &cmd->cs, LRZ_FLUSH);
+}
+
+/* Disable LRZ outside of renderpass. */
+void
+tu_disable_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
+ struct tu_image *image)
+{
+ if (!cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking)
+ return;
+
+ if (!image->lrz_height)
+ return;
+
+ tu6_emit_lrz_buffer(cs, image);
+ tu6_disable_lrz_via_depth_view(cmd, cs);
+}
+
+/* Clear LRZ, used for out of renderpass depth clears. */
+void
+tu_lrz_clear_depth_image(struct tu_cmd_buffer *cmd,
+ struct tu_image *image,
+ const VkClearDepthStencilValue *pDepthStencil,
+ uint32_t rangeCount,
+ const VkImageSubresourceRange *pRanges)
+{
+ if (!rangeCount || !image->lrz_height ||
+ !cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking)
+ return;
+
+ /* We cannot predict which depth subresource would be used later on,
+ * so we just pick the first one with depth cleared and clear the LRZ.
+ */
+ const VkImageSubresourceRange *range = NULL;
+ for (unsigned i = 0; i < rangeCount; i++) {
+ if (pRanges[i].aspectMask &
+ (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT)) {
+ range = &pRanges[i];
+ break;
+ }
+ }
+
+ if (!range)
+ return;
+
+ bool fast_clear = image->lrz_fc_size && (pDepthStencil->depth == 0.f ||
+ pDepthStencil->depth == 1.f);
+
+ tu6_emit_lrz_buffer(&cmd->cs, image);
+
+ tu6_write_lrz_reg(cmd, &cmd->cs, A6XX_GRAS_LRZ_DEPTH_VIEW(
+ .base_layer = range->baseArrayLayer,
+ .layer_count = tu_get_layerCount(image, range),
+ .base_mip_level = range->baseMipLevel,
+ ));
+
+ tu6_write_lrz_reg(cmd, &cmd->cs, A6XX_GRAS_LRZ_CNTL(
+ .enable = true,
+ .fc_enable = fast_clear,
+ .disable_on_wrong_dir = true,
+ ));
+
+ tu6_emit_event_write(cmd, &cmd->cs, LRZ_CLEAR);
+ tu6_emit_event_write(cmd, &cmd->cs, LRZ_FLUSH);
+
+ if (!fast_clear) {
+ tu6_clear_lrz(cmd, &cmd->cs, image, (const VkClearValue*) pDepthStencil);
+ }
+}
+
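+/* Invalidate LRZ from within a renderpass, e.g. when depth is cleared
+ * with vkCmdClearAttachments (see the limitations above).
+ */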
+void
+tu_lrz_disable_during_renderpass(struct tu_cmd_buffer *cmd)
+{
+ assert(cmd->state.pass);
+
+ cmd->state.lrz.valid = false;
+ cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
+
+ if (cmd->state.lrz.gpu_dir_tracking) {
+ tu6_write_lrz_reg(cmd, &cmd->cs, A6XX_GRAS_LRZ_CNTL(
+ .enable = true,
+ .dir = LRZ_DIR_INVALID,
+ .disable_on_wrong_dir = true,
+ ));
+ }
+}
+
+/* Update LRZ state and check whether LRZ-test is allowed, based on the
+ * stencil-test func:
+ *
+ * Conceptually the order of the pipeline is:
+ *
+ *
+ * FS -> Alpha-Test -> Stencil-Test -> Depth-Test
+ * | |
+ * if wrmask != 0 if wrmask != 0
+ * | |
+ * v v
+ * Stencil-Write Depth-Write
+ *
+ * Because Stencil-Test can have side effects (Stencil-Write) prior
+ * to depth test, in this case we potentially need to disable early
+ * lrz-test. See:
+ *
+ * https://www.khronos.org/opengl/wiki/Per-Sample_Processing
+ */
+static bool
+tu6_stencil_op_lrz_allowed(struct A6XX_GRAS_LRZ_CNTL *gras_lrz_cntl,
+ VkCompareOp func,
+ bool stencil_write)
+{
+ switch (func) {
+ case VK_COMPARE_OP_ALWAYS:
+ /* Nothing to do for LRZ, but when stencil-write is enabled we need
+ * to disable lrz-test, since conceptually the stencil test and write
+ * happen before the depth-test.
+ */
+ if (stencil_write) {
+ return false;
+ }
+ break;
+ case VK_COMPARE_OP_NEVER:
+ /* fragment never passes, disable lrz_write for this draw. */
+ gras_lrz_cntl->lrz_write = false;
+ break;
+ default:
+ /* Whether the fragment passes or not depends on the result of the
+ * stencil test, which we cannot know when doing the binning pass.
+ */
+ gras_lrz_cntl->lrz_write = false;
+ /* similarly to the VK_COMPARE_OP_ALWAYS case, if there are side-
+ * effects from stencil test we need to disable lrz-test.
+ */
+ if (stencil_write) {
+ return false;
+ }
+ break;
+ }
+
+ return true;
+}
+
+static struct A6XX_GRAS_LRZ_CNTL
+tu6_calculate_lrz_state(struct tu_cmd_buffer *cmd,
+ const uint32_t a)
+{
+ struct tu_pipeline *pipeline = cmd->state.pipeline;
+ bool z_test_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE;
+ bool z_write_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
+ bool z_read_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE;
+ bool z_bounds_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE;
+ VkCompareOp depth_compare_op = (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_ZFUNC__MASK) >> A6XX_RB_DEPTH_CNTL_ZFUNC__SHIFT;
+
+ struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = { 0 };
+
+ if (!cmd->state.lrz.valid) {
+ return gras_lrz_cntl;
+ }
+
+ /* If depth test is disabled we shouldn't touch LRZ.
+ * Same if there is no depth attachment.
+ */
+ if (a == VK_ATTACHMENT_UNUSED || !z_test_enable ||
+ (cmd->device->instance->debug_flags & TU_DEBUG_NOLRZ))
+ return gras_lrz_cntl;
+
+ if (!cmd->state.lrz.gpu_dir_tracking && !cmd->state.attachments) {
+ /* Without on-GPU LRZ direction tracking there is nothing we
+ * can do to enable LRZ in secondary command buffers.
+ */
+ return gras_lrz_cntl;
+ }
+
+ gras_lrz_cntl.enable = true;
+ gras_lrz_cntl.lrz_write =
+ z_write_enable &&
+ !(pipeline->lrz.force_disable_mask & TU_LRZ_FORCE_DISABLE_WRITE);
+ gras_lrz_cntl.z_test_enable = z_read_enable && z_write_enable;
+ gras_lrz_cntl.z_bounds_enable = z_bounds_enable;
+ gras_lrz_cntl.fc_enable = cmd->state.lrz.fast_clear;
+ gras_lrz_cntl.dir_write = cmd->state.lrz.gpu_dir_tracking;
+ gras_lrz_cntl.disable_on_wrong_dir = cmd->state.lrz.gpu_dir_tracking;
+
+ /* See comment in tu_pipeline about disabling LRZ write for blending. */
+ if ((cmd->state.pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_LOGIC_OP)) &&
+ cmd->state.logic_op_enabled && cmd->state.rop_reads_dst)
+ gras_lrz_cntl.lrz_write = false;
+
+ if ((cmd->state.pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_COLOR_WRITE_ENABLE)) &&
+ cmd->state.color_write_enable != MASK(cmd->state.pipeline->num_rts))
+ gras_lrz_cntl.lrz_write = false;
+
+ /* LRZ is disabled until it is cleared, which means that one "wrong"
+ * depth test or shader could disable LRZ until depth buffer is cleared.
+ */
+ bool disable_lrz = false;
+ bool temporary_disable_lrz = false;
+
+ /* What happens in FS could affect LRZ, e.g.: writes to gl_FragDepth
+ * or early fragment tests.
+ */
+ if (pipeline->lrz.force_disable_mask & TU_LRZ_FORCE_DISABLE_LRZ) {
+ perf_debug(cmd->device, "Invalidating LRZ due to FS");
+ disable_lrz = true;
+ }
+
+ /* If Z is not written, it doesn't affect the LRZ buffer state, which
+ * means two things:
+ * - Don't lock the direction until Z is written for the first time;
+ * - If Z isn't written and the direction IS locked, it's possible to just
+ * temporarily disable LRZ instead of fully bailing out when the direction
+ * is changed.
+ */
+
+ enum tu_lrz_direction lrz_direction = TU_LRZ_UNKNOWN;
+ switch (depth_compare_op) {
+ case VK_COMPARE_OP_ALWAYS:
+ case VK_COMPARE_OP_NOT_EQUAL:
+ /* OP_ALWAYS and OP_NOT_EQUAL could have depth value of any direction,
+ * so if there is a depth write - LRZ must be disabled.
+ */
+ if (z_write_enable) {
+ perf_debug(cmd->device, "Invalidating LRZ due to ALWAYS/NOT_EQUAL");
+ disable_lrz = true;
+ gras_lrz_cntl.dir = LRZ_DIR_INVALID;
+ } else {
+ perf_debug(cmd->device, "Skipping LRZ due to ALWAYS/NOT_EQUAL");
+ temporary_disable_lrz = true;
+ }
+ break;
+ case VK_COMPARE_OP_EQUAL:
+ case VK_COMPARE_OP_NEVER:
+ /* Blob disables LRZ for OP_EQUAL, and from our empirical
+ * evidence it is the right thing to do.
+ *
+ * Both OP_EQUAL and OP_NEVER don't change the LRZ buffer, so
+ * we can just temporarily disable LRZ.
+ */
+ temporary_disable_lrz = true;
+ break;
+ case VK_COMPARE_OP_GREATER:
+ case VK_COMPARE_OP_GREATER_OR_EQUAL:
+ lrz_direction = TU_LRZ_GREATER;
+ gras_lrz_cntl.greater = true;
+ gras_lrz_cntl.dir = LRZ_DIR_GE;
+ break;
+ case VK_COMPARE_OP_LESS:
+ case VK_COMPARE_OP_LESS_OR_EQUAL:
+ lrz_direction = TU_LRZ_LESS;
+ gras_lrz_cntl.greater = false;
+ gras_lrz_cntl.dir = LRZ_DIR_LE;
+ break;
+ default:
+ unreachable("bad VK_COMPARE_OP value or uninitialized");
+ break;
+ };
+
+ /* If depthfunc direction is changed, bail out on using LRZ. The
+ * LRZ buffer encodes a min/max depth value per block, but if
+ * we switch from GT/GE <-> LT/LE, those values cannot be
+ * interpreted properly.
+ */
+ if (cmd->state.lrz.prev_direction != TU_LRZ_UNKNOWN &&
+ lrz_direction != TU_LRZ_UNKNOWN &&
+ cmd->state.lrz.prev_direction != lrz_direction) {
+ if (z_write_enable) {
+ perf_debug(cmd->device, "Invalidating LRZ due to direction change");
+ disable_lrz = true;
+ } else {
+ perf_debug(cmd->device, "Skipping LRZ due to direction change");
+ temporary_disable_lrz = true;
+ }
+ }
+
+ /* Consider the following sequence of depthfunc changes:
+ *
+ * - COMPARE_OP_GREATER -> COMPARE_OP_EQUAL -> COMPARE_OP_GREATER
+ * LRZ is disabled during COMPARE_OP_EQUAL but could be enabled
+ * during second VK_COMPARE_OP_GREATER.
+ *
+ * - COMPARE_OP_GREATER -> COMPARE_OP_EQUAL -> COMPARE_OP_LESS
+ * Here, LRZ is disabled during COMPARE_OP_EQUAL and should become
+ * invalid during COMPARE_OP_LESS.
+ *
+ * This shows that we should keep last KNOWN direction.
+ */
+ if (z_write_enable && lrz_direction != TU_LRZ_UNKNOWN)
+ cmd->state.lrz.prev_direction = lrz_direction;
+
+ /* Invalidate LRZ and disable write if stencil test is enabled */
+ bool stencil_test_enable = cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE;
+ if (!disable_lrz && stencil_test_enable) {
+ bool stencil_front_writemask =
+ (pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ?
+ (cmd->state.dynamic_stencil_wrmask & 0xff) :
+ (pipeline->stencil_wrmask & 0xff);
+
+ bool stencil_back_writemask =
+ (pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ?
+ ((cmd->state.dynamic_stencil_wrmask & 0xff00) >> 8) :
+ (pipeline->stencil_wrmask & 0xff00) >> 8;
+
+ VkCompareOp stencil_front_compare_op =
+ (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FUNC__MASK) >> A6XX_RB_STENCIL_CONTROL_FUNC__SHIFT;
+
+ VkCompareOp stencil_back_compare_op =
+ (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FUNC_BF__MASK) >> A6XX_RB_STENCIL_CONTROL_FUNC_BF__SHIFT;
+
+ bool lrz_allowed = true;
+ lrz_allowed = lrz_allowed && tu6_stencil_op_lrz_allowed(
+ &gras_lrz_cntl, stencil_front_compare_op,
+ stencil_front_writemask);
+
+ lrz_allowed = lrz_allowed && tu6_stencil_op_lrz_allowed(
+ &gras_lrz_cntl, stencil_back_compare_op,
+ stencil_back_writemask);
+
+ /* Without a depth write it's enough to make sure that the depth test
+ * is executed after the stencil test, so temporarily disabling LRZ suffices.
+ */
+ if (!lrz_allowed) {
+ if (z_write_enable) {
+ perf_debug(cmd->device, "Invalidating LRZ due to stencil write");
+ disable_lrz = true;
+ } else {
+ perf_debug(cmd->device, "Skipping LRZ due to stencil write");
+ temporary_disable_lrz = true;
+ }
+ }
+ }
+
+ if (disable_lrz)
+ cmd->state.lrz.valid = false;
+
+ if (disable_lrz && cmd->state.lrz.gpu_dir_tracking) {
+ /* The direction byte on the GPU should be set to CUR_DIR_DISABLED;
+ * emitting an empty GRAS_LRZ_CNTL is not enough for this.
+ */
+ gras_lrz_cntl.enable = true;
+ gras_lrz_cntl.dir = LRZ_DIR_INVALID;
+
+ return gras_lrz_cntl;
+ }
+
+ if (temporary_disable_lrz)
+ gras_lrz_cntl.enable = false;
+
+ cmd->state.lrz.enabled = cmd->state.lrz.valid && gras_lrz_cntl.enable;
+ if (!cmd->state.lrz.enabled)
+ memset(&gras_lrz_cntl, 0, sizeof(gras_lrz_cntl));
+
+ return gras_lrz_cntl;
+}
+
+void
+tu6_emit_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
+{
+ const uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
+ struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = tu6_calculate_lrz_state(cmd, a);
+
+ tu6_write_lrz_reg(cmd, cs, pack_A6XX_GRAS_LRZ_CNTL(gras_lrz_cntl));
+ tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(.enable = gras_lrz_cntl.enable));
+}