From 07017aa137b9fa48ec25ef5d1e3fd2d137beb651 Mon Sep 17 00:00:00 2001
From: Mike Blumenkrantz
Date: Fri, 14 Oct 2022 11:53:03 -0400
Subject: [PATCH] util/tc: implement renderpass tracking
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

this allows tc to track metadata for framebuffer attachments so that
drivers can optimize their renderpasses

Acked-by: Pierre-Eric Pelloux-Prayer
Reviewed-by: Marek Olšák
Part-of:
---
 src/gallium/auxiliary/util/u_threaded_context.c | 317 ++++++++++++++++++++++--
 src/gallium/auxiliary/util/u_threaded_context.h |  94 +++++++
 2 files changed, 390 insertions(+), 21 deletions(-)

diff --git a/src/gallium/auxiliary/util/u_threaded_context.c b/src/gallium/auxiliary/util/u_threaded_context.c
index bac20b1..6b3929d 100644
--- a/src/gallium/auxiliary/util/u_threaded_context.c
+++ b/src/gallium/auxiliary/util/u_threaded_context.c
@@ -116,6 +116,103 @@ tc_clear_driver_thread(struct threaded_context *tc)
 #endif
 }
 
+/* ensure the batch's array of renderpass data is large enough for the current index */
+static void
+tc_batch_renderpass_infos_resize(struct tc_batch *batch)
+{
+   unsigned size = batch->renderpass_infos.capacity;
+   unsigned cur_num = batch->renderpass_info_idx;
+
+   if (size / sizeof(struct tc_renderpass_info) > cur_num)
+      return;
+
+   if (!util_dynarray_resize(&batch->renderpass_infos, struct tc_renderpass_info, cur_num + 10))
+      mesa_loge("tc: memory alloc fail!");
+
+   if (size != batch->renderpass_infos.capacity) {
+      /* zero new allocation region */
+      uint8_t *data = batch->renderpass_infos.data;
+      memset(data + size, 0, batch->renderpass_infos.capacity - size);
+      unsigned start = size / sizeof(struct tc_renderpass_info);
+      unsigned count = (batch->renderpass_infos.capacity - size) /
+                       sizeof(struct tc_renderpass_info);
+      struct tc_renderpass_info *infos = batch->renderpass_infos.data;
+      for (unsigned i = 0; i < count; i++)
+         util_queue_fence_init(&infos[start + i].ready);
+   }
+}
+
+/* signal that the renderpass info is "ready" for use by drivers and will no longer be updated */
+static void
+tc_signal_renderpass_info_ready(struct threaded_context *tc)
+{
+   if (tc->renderpass_info_recording &&
+       !util_queue_fence_is_signalled(&tc->renderpass_info_recording->ready))
+      util_queue_fence_signal(&tc->renderpass_info_recording->ready);
+}
+
+/* increment the current renderpass info struct for recording
+ * 'full_copy' is used for preserving data across non-blocking tc batch flushes
+ */
+static void
+tc_batch_increment_renderpass_info(struct threaded_context *tc, bool full_copy)
+{
+   struct tc_batch *batch = &tc->batch_slots[tc->next];
+   struct tc_renderpass_info *tc_info = batch->renderpass_infos.data;
+
+   /* signal existing info since it will not be used anymore */
+   tc_signal_renderpass_info_ready(tc);
+   /* increment rp info and initialize it */
+   batch->renderpass_info_idx++;
+   tc_batch_renderpass_infos_resize(batch);
+   tc_info = batch->renderpass_infos.data;
+
+   if (full_copy) {
+      /* copy the previous data in its entirety: this is still the same renderpass */
+      if (tc->renderpass_info_recording)
+         tc_info[batch->renderpass_info_idx].data = tc->renderpass_info_recording->data;
+      else
+         tc_info[batch->renderpass_info_idx].data = 0;
+   } else {
+      /* selectively copy: only the CSO metadata is copied, and a new framebuffer state will be added later */
+      tc_info[batch->renderpass_info_idx].data = 0;
+      if (tc->renderpass_info_recording)
+         tc_info[batch->renderpass_info_idx].data16[2] = tc->renderpass_info_recording->data16[2];
+   }
+
+   util_queue_fence_reset(&tc_info[batch->renderpass_info_idx].ready);
+   assert(tc->renderpass_info_recording != &tc_info[batch->renderpass_info_idx]);
+   /* this is now the current recording renderpass info */
+   tc->renderpass_info_recording = &tc_info[batch->renderpass_info_idx];
+}
+
+static ALWAYS_INLINE struct tc_renderpass_info *
+tc_get_renderpass_info(struct threaded_context *tc)
+{
+   return tc->renderpass_info_recording;
+}
+
+/* update metadata at draw time */
+static void
+tc_parse_draw(struct threaded_context *tc)
+{
+   struct tc_renderpass_info *info = tc_get_renderpass_info(tc);
+
+   if (info) {
+      /* all buffers that aren't cleared are considered loaded */
+      info->cbuf_load |= ~info->cbuf_clear;
+      if (!info->zsbuf_clear)
+         info->zsbuf_load = true;
+      /* previous invalidates are no longer relevant */
+      info->cbuf_invalidate = 0;
+      info->zsbuf_invalidate = false;
+      info->has_draw = true;
+   }
+
+   tc->in_renderpass = true;
+   tc->seen_fb_state = true;
+}
+
 static void *
 to_call_check(void *ptr, unsigned num_slots)
 {
@@ -194,18 +291,13 @@ tc_drop_vertex_state_references(struct pipe_vertex_state *dst, int num_refs)
 #define DRAW_INFO_SIZE_WITHOUT_MIN_MAX_INDEX \
    offsetof(struct pipe_draw_info, min_index)
 
-static void
-tc_batch_execute(void *job, UNUSED void *gdata, int thread_index)
+ALWAYS_INLINE static void
+batch_execute(struct tc_batch *batch, struct pipe_context *pipe, uint64_t *last, bool parsing)
 {
-   struct tc_batch *batch = job;
-   struct pipe_context *pipe = batch->tc->pipe;
-   uint64_t *last = &batch->slots[batch->num_total_slots];
-
-   tc_batch_check(batch);
-   tc_set_driver_thread(batch->tc);
-
-   assert(!batch->token);
-
+   /* if the framebuffer state is persisting from a previous batch,
+    * begin incrementing renderpass info on the first set_framebuffer_state call
+    */
+   bool first = !batch->first_set_fb;
    for (uint64_t *iter = batch->slots; iter != last;) {
       struct tc_call_base *call = (struct tc_call_base *)iter;
 
@@ -218,7 +310,50 @@ tc_batch_execute(void *job, UNUSED void *gdata, int thread_index)
       TC_TRACE_SCOPE(call->call_id);
 
       iter += execute_func[call->call_id](pipe, call, last);
+
+      if (parsing) {
+         if (call->call_id == TC_CALL_flush) {
+            /* always increment renderpass info for non-deferred flushes */
+            batch->tc->renderpass_info++;
+            /* if a flush happens, renderpass info is always incremented after */
+            first = false;
+         } else if (call->call_id == TC_CALL_set_framebuffer_state) {
+            /* the renderpass info pointer is already set at the start of the batch,
+             * so don't increment on the first set_framebuffer_state call
+             */
+            if (!first)
+               batch->tc->renderpass_info++;
+            first = false;
+         } else if (call->call_id >= TC_CALL_draw_single &&
+                    call->call_id <= TC_CALL_draw_vstate_multi) {
+            /* if a draw happens before a set_framebuffer_state on this batch,
+             * begin incrementing renderpass data
+             */
+            first = false;
+         }
+      }
    }
+}
+
+static void
+tc_batch_execute(void *job, UNUSED void *gdata, int thread_index)
+{
+   struct tc_batch *batch = job;
+   struct pipe_context *pipe = batch->tc->pipe;
+   uint64_t *last = &batch->slots[batch->num_total_slots];
+
+   tc_batch_check(batch);
+   tc_set_driver_thread(batch->tc);
+
+   assert(!batch->token);
+
+   /* setup renderpass info */
+   batch->tc->renderpass_info = batch->renderpass_infos.data;
+
+   if (batch->tc->options.parse_renderpass_info)
+      batch_execute(batch, pipe, last, true);
+   else
+      batch_execute(batch, pipe, last, false);
 
    /* Add the fence to the list of fences for the driver to signal at the next
    * flush, which we use for tracking which buffers are referenced by
@@ -247,6 +382,7 @@ tc_batch_execute(void *job, UNUSED void *gdata, int thread_index)
    tc_batch_check(batch);
    batch->num_total_slots = 0;
    batch->last_mergeable_call = NULL;
+   batch->first_set_fb = false;
 }
 
 static void
@@ -267,7 +403,7 @@ tc_begin_next_buffer_list(struct threaded_context *tc)
 }
 
 static void
-tc_batch_flush(struct threaded_context *tc)
+tc_batch_flush(struct threaded_context *tc, bool full_copy)
 {
    struct tc_batch *next = &tc->batch_slots[tc->next];
 
@@ -281,12 +417,22 @@ tc_batch_flush(struct threaded_context *tc)
       next->token->tc = NULL;
       tc_unflushed_batch_token_reference(&next->token, NULL);
    }
+   /* reset renderpass info index for subsequent use */
+   next->renderpass_info_idx = -1;
 
    util_queue_add_job(&tc->queue, next, &next->fence, tc_batch_execute,
                       NULL, 0);
    tc->last = tc->next;
    tc->next = (tc->next + 1) % TC_MAX_BATCHES;
    tc_begin_next_buffer_list(tc);
+
+   /* always increment renderpass info on batch flush;
+    * renderpass info can only be accessed by its owner batch during execution
+    */
+   if (tc->renderpass_info_recording) {
+      tc->batch_slots[tc->next].first_set_fb = full_copy;
+      tc_batch_increment_renderpass_info(tc, full_copy);
+   }
 }
 
 /* This is the function that adds variable-sized calls into the current
@@ -303,7 +449,8 @@ tc_add_sized_call(struct threaded_context *tc, enum tc_call_id id,
    tc_debug_check(tc);
 
    if (unlikely(next->num_total_slots + num_slots > TC_SLOTS_PER_BATCH)) {
-      tc_batch_flush(tc);
+      /* copy existing renderpass info during flush */
+      tc_batch_flush(tc, true);
      next = &tc->batch_slots[tc->next];
      tc_assert(next->num_total_slots == 0);
      tc_assert(next->last_mergeable_call == NULL);
@@ -406,6 +553,8 @@ _tc_sync(struct threaded_context *tc, UNUSED const char *info, UNUSED const char
 
    tc_debug_check(tc);
 
+   tc_signal_renderpass_info_ready(tc);
+
    /* Only wait for queued calls... */
    if (!util_queue_fence_is_signalled(&last->fence)) {
       util_queue_fence_wait(&last->fence);
@@ -438,6 +587,17 @@ _tc_sync(struct threaded_context *tc, UNUSED const char *info, UNUSED const char
 
    tc_debug_check(tc);
 
+   if (tc->options.parse_renderpass_info) {
+      int renderpass_info_idx = next->renderpass_info_idx;
+      if (renderpass_info_idx > 0) {
+         next->renderpass_info_idx = -1;
+         tc_batch_increment_renderpass_info(tc, false);
+      } else if (tc->renderpass_info_recording->has_draw) {
+         tc->renderpass_info_recording->data32[0] = 0;
+      }
+      tc->seen_fb_state = false;
+   }
+
    MESA_TRACE_END();
 }
 
@@ -466,7 +626,7 @@ threaded_context_flush(struct pipe_context *_pipe,
     * running. That should be better for cache locality.
     */
    if (prefer_async || !util_queue_fence_is_signalled(&last->fence))
-      tc_batch_flush(tc);
+      tc_batch_flush(tc, false);
    else
       tc_sync(token->tc);
 }
@@ -1097,11 +1257,33 @@ TC_CSO_WHOLE(blend)
 TC_CSO_WHOLE(rasterizer)
 TC_CSO_CREATE(depth_stencil_alpha, depth_stencil_alpha)
 TC_CSO_BIND(depth_stencil_alpha,
+   if (param && tc->options.parse_renderpass_info) {
+      /* dsa info is only ever added during a renderpass;
+       * changes outside of a renderpass reset the data
+       */
+      if (!tc->in_renderpass) {
+         tc_get_renderpass_info(tc)->zsbuf_write_dsa = 0;
+         tc_get_renderpass_info(tc)->zsbuf_read_dsa = 0;
+      }
+      /* let the driver parse its own state */
+      tc->options.dsa_parse(param, tc_get_renderpass_info(tc));
+   }
 )
 TC_CSO_DELETE(depth_stencil_alpha)
 TC_CSO_WHOLE(compute)
 TC_CSO_CREATE(fs, shader)
 TC_CSO_BIND(fs,
+   if (param && tc->options.parse_renderpass_info) {
+      /* fs info is only ever added during a renderpass;
+       * changes outside of a renderpass reset the data
+       */
+      if (!tc->in_renderpass) {
+         tc_get_renderpass_info(tc)->cbuf_fbfetch = 0;
+         tc_get_renderpass_info(tc)->zsbuf_write_fs = 0;
+      }
+      /* let the driver parse its own state */
+      tc->options.fs_parse(param, tc_get_renderpass_info(tc));
+   }
 )
 TC_CSO_DELETE(fs)
 TC_CSO_SHADER(vs)
@@ -1199,10 +1381,48 @@ tc_set_framebuffer_state(struct pipe_context *_pipe,
    p->state.layers = fb->layers;
    p->state.nr_cbufs = nr_cbufs;
-   for (unsigned i = 0; i < nr_cbufs; i++) {
-      p->state.cbufs[i] = NULL;
-      pipe_surface_reference(&p->state.cbufs[i], fb->cbufs[i]);
+
+   if (tc->options.parse_renderpass_info) {
+      /* store existing zsbuf data for possible persistence */
+      uint8_t zsbuf = tc->renderpass_info_recording->has_draw ?
+                      0 :
+                      tc->renderpass_info_recording->data8[3];
+      bool zsbuf_changed = tc->fb_resources[PIPE_MAX_COLOR_BUFS] !=
+                           (fb->zsbuf ? fb->zsbuf->texture : NULL);
+
+      for (unsigned i = 0; i < nr_cbufs; i++) {
+         p->state.cbufs[i] = NULL;
+         pipe_surface_reference(&p->state.cbufs[i], fb->cbufs[i]);
+         /* full tracking requires storing the fb attachment resources */
+         tc->fb_resources[i] = fb->cbufs[i] ? fb->cbufs[i]->texture : NULL;
+      }
+      memset(&tc->fb_resources[nr_cbufs], 0,
+             sizeof(void*) * (PIPE_MAX_COLOR_BUFS - nr_cbufs));
+
+      tc->fb_resources[PIPE_MAX_COLOR_BUFS] = fb->zsbuf ?
+                                              fb->zsbuf->texture : NULL;
+      if (tc->seen_fb_state) {
+         /* this is the end of a renderpass, so increment the renderpass info */
+         tc_batch_increment_renderpass_info(tc, false);
+         /* if zsbuf hasn't changed (i.e., possibly just adding a color buffer):
+          * keep zsbuf usage data
+          */
+         if (!zsbuf_changed)
+            tc->renderpass_info_recording->data8[3] = zsbuf;
+      } else {
+         /* this is the first time a set_framebuffer_state call is triggered;
+          * just increment the index and keep using the existing info for recording
+          */
+         tc->batch_slots[tc->next].renderpass_info_idx = 0;
+      }
+      /* future fb state changes will increment the index */
+      tc->seen_fb_state = true;
+   } else {
+      for (unsigned i = 0; i < nr_cbufs; i++) {
+         p->state.cbufs[i] = NULL;
+         pipe_surface_reference(&p->state.cbufs[i], fb->cbufs[i]);
+      }
    }
+   tc->in_renderpass = false;
    p->state.zsbuf = NULL;
    pipe_surface_reference(&p->state.zsbuf, fb->zsbuf);
 }
@@ -3185,6 +3405,8 @@ tc_flush(struct pipe_context *_pipe, struct pipe_fence_handle **fence,
    struct pipe_screen *screen = pipe->screen;
    bool async = flags & (PIPE_FLUSH_DEFERRED | PIPE_FLUSH_ASYNC);
 
+   tc->in_renderpass = false;
+
    if (async && tc->options.create_fence) {
       if (fence) {
          struct tc_batch *next = &tc->batch_slots[tc->next];
@@ -3215,17 +3437,25 @@ tc_flush(struct pipe_context *_pipe, struct pipe_fence_handle **fence,
       p->fence = fence ? *fence : NULL;
       p->flags = flags | TC_FLUSH_ASYNC;
 
-      if (!(flags & PIPE_FLUSH_DEFERRED))
-         tc_batch_flush(tc);
+      if (!(flags & PIPE_FLUSH_DEFERRED)) {
+         /* non-deferred async flushes indicate completion of existing renderpass info */
+         tc_signal_renderpass_info_ready(tc);
+         tc_batch_flush(tc, false);
+         tc->seen_fb_state = false;
+      }
+
       return;
    }
 
 out_of_memory:
+   /* renderpass info is signaled during sync */
    tc_sync_msg(tc, flags & PIPE_FLUSH_END_OF_FRAME ? "end of frame" :
                    flags & PIPE_FLUSH_DEFERRED ? "deferred fence" : "normal");
 
-   if (!(flags & PIPE_FLUSH_DEFERRED))
+   if (!(flags & PIPE_FLUSH_DEFERRED)) {
      tc_flush_queries(tc);
+      tc->seen_fb_state = false;
+   }
    tc_set_driver_thread(tc);
    pipe->flush(pipe, fence, flags);
    tc_clear_driver_thread(tc);
@@ -3440,6 +3670,7 @@ tc_draw_vbo(struct pipe_context *_pipe, const struct pipe_draw_info *info,
    struct threaded_context *tc = threaded_context(_pipe);
    unsigned index_size = info->index_size;
    bool has_user_indices = info->has_user_indices;
+   tc_parse_draw(tc);
 
    if (unlikely(indirect)) {
      assert(!has_user_indices);
@@ -3759,6 +3990,7 @@ tc_draw_vertex_state(struct pipe_context *_pipe,
                     unsigned num_draws)
 {
    struct threaded_context *tc = threaded_context(_pipe);
+   tc_parse_draw(tc);
 
    if (num_draws == 1) {
       /* Single draw. */
@@ -4053,6 +4285,18 @@ tc_invalidate_resource(struct pipe_context *_pipe,
 
    struct tc_resource_call *call = tc_add_call(tc, TC_CALL_invalidate_resource,
                                                tc_resource_call);
    tc_set_resource_reference(&call->resource, resource);
+
+   struct tc_renderpass_info *info = tc_get_renderpass_info(tc);
+   if (info) {
+      if (tc->fb_resources[PIPE_MAX_COLOR_BUFS] == resource) {
+         info->zsbuf_invalidate = true;
+      } else {
+         for (unsigned i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
+            if (tc->fb_resources[i] == resource)
+               info->cbuf_invalidate |= BITFIELD_BIT(i);
+         }
+      }
+   }
 }
 
 struct tc_clear {
@@ -4083,8 +4327,23 @@ tc_clear(struct pipe_context *_pipe, unsigned buffers, const struct pipe_scissor
 
    struct tc_clear *p = tc_add_call(tc, TC_CALL_clear, tc_clear);
    p->buffers = buffers;
-   if (scissor_state)
+   if (scissor_state) {
       p->scissor_state = *scissor_state;
+      struct tc_renderpass_info *info = tc_get_renderpass_info(tc);
+      /* partial clear info is useful for drivers to know whether any zs writes occur;
+       * drivers are responsible for optimizing partial clear -> full clear
+       */
+      if (info && buffers & PIPE_CLEAR_DEPTHSTENCIL)
+         info->zsbuf_clear_partial |= !info->zsbuf_clear;
+   } else {
+      struct tc_renderpass_info *info = tc_get_renderpass_info(tc);
+      if (info) {
+         /* full clears use a different load operation, but are only valid if draws haven't occurred yet */
+         info->cbuf_clear |= (buffers >> 2) & ~info->cbuf_load;
+         if (buffers & PIPE_CLEAR_DEPTHSTENCIL && !info->zsbuf_load && !info->zsbuf_clear_partial)
+            info->zsbuf_clear = true;
+      }
+   }
    p->scissor_state_set = !!scissor_state;
    p->color = *color;
    p->depth = depth;
@@ -4480,6 +4739,7 @@ tc_destroy(struct pipe_context *_pipe)
 
    for (unsigned i = 0; i < TC_MAX_BATCHES; i++) {
       util_queue_fence_destroy(&tc->batch_slots[i].fence);
+      util_dynarray_fini(&tc->batch_slots[i].renderpass_infos);
      assert(!tc->batch_slots[i].token);
    }
 }
@@ -4595,6 +4855,11 @@ threaded_context_create(struct pipe_context *pipe,
 #endif
      tc->batch_slots[i].tc = tc;
      util_queue_fence_init(&tc->batch_slots[i].fence);
+      tc->batch_slots[i].renderpass_info_idx = -1;
+      if (tc->options.parse_renderpass_info) {
+         util_dynarray_init(&tc->batch_slots[i].renderpass_infos, NULL);
+         tc_batch_renderpass_infos_resize(&tc->batch_slots[i]);
+      }
    }
    for (unsigned i = 0; i < TC_MAX_BUFFER_LISTS; i++)
       util_queue_fence_init(&tc->buffer_lists[i].driver_flushed_fence);
@@ -4756,6 +5021,8 @@ threaded_context_create(struct pipe_context *pipe,
 
    *out = tc;
    tc_begin_next_buffer_list(tc);
+   if (tc->options.parse_renderpass_info)
+      tc_batch_increment_renderpass_info(tc, false);
 
    return &tc->base;
 
 fail:
@@ -4773,3 +5040,11 @@ threaded_context_init_bytes_mapped_limit(struct threaded_context *tc, unsigned d
      tc->bytes_mapped_limit = MIN2(tc->bytes_mapped_limit, 512*1024*1024UL);
    }
 }
+
+const struct tc_renderpass_info *
+threaded_context_get_renderpass_info(struct threaded_context *tc, bool wait)
+{
+   if (tc->renderpass_info && wait)
+      util_queue_fence_wait(&tc->renderpass_info->ready);
+   return tc->renderpass_info;
+}
\ No newline at end of file
diff --git a/src/gallium/auxiliary/util/u_threaded_context.h b/src/gallium/auxiliary/util/u_threaded_context.h
index 67296fc..e87b006 100644
--- a/src/gallium/auxiliary/util/u_threaded_context.h
+++ b/src/gallium/auxiliary/util/u_threaded_context.h
@@ -204,6 +204,7 @@
 #include "util/u_range.h"
 #include "util/u_thread.h"
 #include "util/slab.h"
+#include "util/u_dynarray.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -424,6 +425,63 @@ struct tc_unflushed_batch_token {
    struct threaded_context *tc;
 };
 
+struct tc_renderpass_info {
+   union {
+      struct {
+         /* bitmask of full-cleared color buffers */
+         uint8_t cbuf_clear;
+         /* bitmask of not-full-cleared color buffers */
+         uint8_t cbuf_load;
+         /* bitmask of color buffers that have their stores invalidated */
+         uint8_t cbuf_invalidate;
+         /* whether the zsbuf is full-cleared */
+         bool zsbuf_clear : 1;
+         /* whether the zsbuf is partial-cleared */
+         bool zsbuf_clear_partial : 1;
+         /* whether the zsbuf is not-full-cleared */
+         bool zsbuf_load : 1;
+         /* whether the zsbuf is invalidated */
+         bool zsbuf_invalidate : 1;
+         /* whether a draw occurs */
+         bool has_draw : 1;
+         uint8_t pad : 3;
+         /* 32 bits offset */
+         /* bitmask of color buffers using fbfetch */
+         uint8_t cbuf_fbfetch;
+         /* whether the fragment shader writes to the zsbuf */
+         bool zsbuf_write_fs : 1;
+         /* whether the DSA state writes to the zsbuf */
+         bool zsbuf_write_dsa : 1;
+         /* whether the DSA state reads the zsbuf */
+         bool zsbuf_read_dsa : 1;
+         /* whether the zsbuf is used for fbfetch */
+         bool zsbuf_fbfetch : 1;
+         uint8_t pad2 : 4;
+         uint16_t pad3;
+      };
+      uint64_t data;
+      /* fb info is in data32[0] */
+      uint32_t data32[2];
+      /* cso info is in data16[2] */
+      uint16_t data16[4];
+      /* zsbuf fb info is in data8[3] */
+      uint8_t data8[8];
+   };
+   /* determines whether the info can be "safely" read by drivers or if it may still be in use */
+   struct util_queue_fence ready;
+};
+
+static inline bool
+tc_renderpass_info_is_zsbuf_used(const struct tc_renderpass_info *info)
+{
+   return info->zsbuf_clear ||
+          info->zsbuf_clear_partial ||
+          info->zsbuf_write_fs ||
+          info->zsbuf_write_dsa ||
+          info->zsbuf_read_dsa ||
+          info->zsbuf_fbfetch;
+}
+
 struct tc_batch {
    struct threaded_context *tc;
 #if !defined(NDEBUG) && TC_DEBUG >= 1
@@ -431,6 +489,8 @@ struct tc_batch {
 #endif
    uint16_t num_total_slots;
    uint16_t buffer_list_index;
+   /* the index of the current renderpass info for recording */
+   int renderpass_info_idx;
 
    /* The last mergeable call that was added to this batch (i.e.
    * buffer subdata). This might be out-of-date or NULL.
@@ -438,8 +498,11 @@ struct tc_batch {
    struct tc_call_base *last_mergeable_call;
 
    struct util_queue_fence fence;
+   /* whether the first set_framebuffer_state call has been seen by this batch */
+   bool first_set_fb;
    struct tc_unflushed_batch_token *token;
    uint64_t slots[TC_SLOTS_PER_BATCH];
+   struct util_dynarray renderpass_infos;
 };
 
 struct tc_buffer_list {
@@ -468,6 +531,13 @@ struct threaded_context_options {
    /* If true, create_fence_fd doesn't access the context in the driver.
     */
    bool unsynchronized_create_fence_fd;
+   /* if true, parse and track renderpass info during execution */
+   bool parse_renderpass_info;
+   /* callbacks for drivers to read their DSA/FS state and update renderpass info accordingly
+    * note: drivers must ONLY append to renderpass info using |=
+    */
+   void (*dsa_parse)(void *state, struct tc_renderpass_info *info);
+   void (*fs_parse)(void *state, struct tc_renderpass_info *info);
 };
 
 struct threaded_context {
@@ -511,6 +581,10 @@ struct threaded_context {
    bool seen_tcs;
    bool seen_tes;
    bool seen_gs;
+   /* whether the current renderpass has seen a set_framebuffer_state call */
+   bool seen_fb_state;
+   /* whether a renderpass is currently active */
+   bool in_renderpass;
 
    bool seen_streamout_buffers;
    bool seen_shader_buffers[PIPE_SHADER_TYPES];
@@ -545,13 +619,33 @@ struct threaded_context {
    struct tc_batch batch_slots[TC_MAX_BATCHES];
    struct tc_buffer_list buffer_lists[TC_MAX_BUFFER_LISTS];
+   /* the current framebuffer attachments; [PIPE_MAX_COLOR_BUFS] is the zsbuf */
+   struct pipe_resource *fb_resources[PIPE_MAX_COLOR_BUFS + 1];
+   /* accessed by main thread; preserves info across batches */
+   struct tc_renderpass_info *renderpass_info_recording;
+   /* accessed by driver thread */
+   struct tc_renderpass_info *renderpass_info;
 };
 
+
 void threaded_resource_init(struct pipe_resource *res, bool allow_cpu_storage);
 void threaded_resource_deinit(struct pipe_resource *res);
 struct pipe_context *threaded_context_unwrap_sync(struct pipe_context *pipe);
 void tc_driver_internal_flush_notify(struct threaded_context *tc);
 
+/** function for getting the current renderpass info:
+ * - renderpass info is always valid
+ * - set 'wait=true' when calling during normal execution
+ * - set 'wait=false' when calling from flush
+ *
+ * Rules:
+ * 1) this must be called with 'wait=true' after the driver receives a pipe_context::set_framebuffer_state callback
+ * 2) this should be called with 'wait=false' when the driver receives a blocking pipe_context::flush call
+ * 3) this must not be used during any internal driver operations (e.g., u_blitter)
+ */
+const struct tc_renderpass_info *
+threaded_context_get_renderpass_info(struct threaded_context *tc, bool wait);
+
 struct pipe_context *
 threaded_context_create(struct pipe_context *pipe,
                         struct slab_parent_pool *parent_transfer_pool,
-- 
2.7.4
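
Some context on the design: the tc_renderpass_info union overlays the bitfields
with data/data32/data16/data8 so whole groups of fields can be copied or reset
at once — per the comments in the struct, data32[0] holds the per-renderpass
framebuffer info, data16[2] the CSO (DSA/FS) info, and data8[3] the zsbuf usage
bits — which is how tc_batch_increment_renderpass_info() preserves CSO metadata
across a renderpass boundary with a single 16-bit copy.

A minimal sketch of the intended driver-side wiring, assuming hypothetical
mydrv_* types and fields (only the tc_*/threaded_context_* symbols and
BITFIELD_BIT come from this patch or existing Mesa utils; everything else is
illustrative, not a real driver):

    /* assumes #include "util/u_threaded_context.h" */

    /* CSO parse callbacks, invoked by tc at bind time on the app thread;
     * per the header note, only append bits with |=
     */
    static void
    mydrv_dsa_parse(void *state, struct tc_renderpass_info *info)
    {
       struct mydrv_dsa_state *dsa = state;       /* hypothetical type */

       info->zsbuf_write_dsa |= dsa->writes_zs;   /* hypothetical field */
       info->zsbuf_read_dsa |= dsa->reads_zs;     /* hypothetical field */
    }

    static void
    mydrv_fs_parse(void *state, struct tc_renderpass_info *info)
    {
       struct mydrv_fs_state *fs = state;         /* hypothetical type */

       info->zsbuf_write_fs |= fs->writes_zs;     /* hypothetical field */
       info->cbuf_fbfetch |= fs->fbfetch_cbufs;   /* hypothetical field */
    }

    /* at context creation, passed to threaded_context_create() */
    struct threaded_context_options options = {
       .parse_renderpass_info = true,
       .dsa_parse = mydrv_dsa_parse,
       .fs_parse = mydrv_fs_parse,
    };

    /* in the driver's set_framebuffer_state hook, running on the driver
     * thread: wait=true per Rule 1, since the info for this renderpass may
     * still be recording on the app thread
     */
    static void
    mydrv_set_framebuffer_state(struct pipe_context *ctx,
                                const struct pipe_framebuffer_state *fb)
    {
       struct mydrv_context *mctx = mydrv_context(ctx); /* hypothetical */
       const struct tc_renderpass_info *info =
          threaded_context_get_renderpass_info(mctx->tc, true);

       if (info->cbuf_clear & BITFIELD_BIT(0))
          ; /* cbuf 0 can use a CLEAR load op instead of LOAD */
       if (info->cbuf_invalidate & BITFIELD_BIT(0))
          ; /* cbuf 0's store op can be DONT_CARE */
       if (!tc_renderpass_info_is_zsbuf_used(info))
          ; /* zsbuf access can be elided from this renderpass entirely */
    }

The wait=true call blocks on the info's ready fence, which the app thread
signals once the renderpass can no longer be appended to (sync, non-deferred
flush, or the next set_framebuffer_state), so by the time the driver reads the
struct it describes the complete renderpass.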