venus: check and configure new ringMonitoring feature
author Ryan Neph <ryanneph@google.com>
Wed, 22 Mar 2023 19:47:42 +0000 (12:47 -0700)
committer Marge Bot <emma+marge@anholt.net>
Fri, 24 Mar 2023 23:27:53 +0000 (23:27 +0000)
At ring creation, if supported by the renderer, we can request
ringMonitoring. During driver ring waits, the ring's new ALIVE status
bit will be checked periodically at the configured rate. If the bit is
not set, the renderer must have crashed, and the driver should abort as
well to signal the problem to the app/user.

Signed-off-by: Ryan Neph <ryanneph@google.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22036>

src/virtio/vulkan/vn_common.c
src/virtio/vulkan/vn_common.h
src/virtio/vulkan/vn_instance.c
src/virtio/vulkan/vn_ring.c
src/virtio/vulkan/vn_ring.h

index 115961f..73042de 100644 (file)
@@ -123,9 +123,52 @@ vn_extension_get_spec_version(const char *name)
    return index >= 0 ? vn_info_extension_get(index)->spec_version : 0;
 }
 
+/* Try to make the calling thread the ring's single "monitor" thread.
+ *
+ * The role is claimed only when no thread id is currently registered and
+ * the monitor mutex can be taken without blocking. On success the mutex is
+ * left locked; vn_ring_monitor_release() unlocks it.
+ *
+ * Returns true if the calling thread is the registered monitor thread on
+ * return (newly registered or already registered), false otherwise.
+ */
+static bool
+vn_ring_monitor_acquire(struct vn_ring *ring)
+{
+   const pid_t self = gettid();
+
+   /* a thread is already registered: only report whether it is us */
+   if (ring->monitor.threadid)
+      return ring->monitor.threadid == self;
+
+   if (ring->monitor.threadid != self &&
+       mtx_trylock(&ring->monitor.mutex) == thrd_success) {
+      /* we are now the only waiting thread that monitors the ring */
+      ring->monitor.threadid = self;
+   }
+
+   return ring->monitor.threadid == self;
+}
+
+/* Give up the ring "monitor" role if the calling thread holds it.
+ *
+ * Does nothing when the calling thread's id is not the registered monitor
+ * thread id. Otherwise clears the registered id and unlocks the monitor
+ * mutex.
+ */
+void
+vn_ring_monitor_release(struct vn_ring *ring)
+{
+   const bool is_monitor = gettid() == ring->monitor.threadid;
+
+   if (is_monitor) {
+      ring->monitor.threadid = 0;
+      mtx_unlock(&ring->monitor.mutex);
+   }
+}
+
 struct vn_relax_state
 vn_relax_init(struct vn_ring *ring, const char *reason)
 {
+   if (ring->monitor.report_period_us) {
+#ifndef NDEBUG
+      /* ensure minimum check period is greater than maximum renderer
+       * reporting period (with margin of safety to ensure no false
+       * positives).
+       *
+       * first_warn_time is pre-calculated based on parameters in vn_relax
+       * and must be updated together with them.
+       */
+      const uint32_t first_warn_time = 3481600;
+      const uint32_t safety_margin = 250000;
+      assert(first_warn_time - safety_margin >=
+             ring->monitor.report_period_us);
+#endif
+
+      if (vn_ring_monitor_acquire(ring)) {
+         ring->monitor.alive = true;
+         vn_ring_unset_status_bits(ring, VK_RING_STATUS_ALIVE_BIT_MESA);
+      }
+   }
+
    return (struct vn_relax_state){
       .ring = ring,
       .iter = 0,
@@ -143,6 +186,7 @@ vn_relax(struct vn_relax_state *state)
    /* Yield for the first 2^busy_wait_order times and then sleep for
     * base_sleep_us microseconds for the same number of times.  After that,
     * keep doubling both sleep length and count.
+    * Must also update pre-calculated "first_warn_time" in vn_relax_init().
     */
    const uint32_t busy_wait_order = 8;
    const uint32_t base_sleep_us = vn_env.relax_base_sleep_us;
@@ -167,6 +211,19 @@ vn_relax(struct vn_relax_state *state)
          abort();
       }
 
+      if (ring->monitor.report_period_us) {
+         if (vn_ring_monitor_acquire(ring)) {
+            ring->monitor.alive = status & VK_RING_STATUS_ALIVE_BIT_MESA;
+            vn_ring_unset_status_bits(ring, VK_RING_STATUS_ALIVE_BIT_MESA);
+         }
+
+         if (!ring->monitor.alive) {
+            vn_log(NULL, "aborting on expired ring alive status at iter %d",
+                   *iter);
+            abort();
+         }
+      }
+
       if (*iter >= (1 << abort_order) && !VN_DEBUG(NO_ABORT)) {
          vn_log(NULL, "aborting");
          abort();
index 26477d2..7ca5c5b 100644 (file)
@@ -229,6 +229,9 @@ vn_refcount_dec(struct vn_refcount *ref)
 uint32_t
 vn_extension_get_spec_version(const char *name);
 
+void
+vn_ring_monitor_release(struct vn_ring *ring);
+
 struct vn_relax_state
 vn_relax_init(struct vn_ring *ring, const char *reason);
 
@@ -238,6 +241,7 @@ vn_relax(struct vn_relax_state *state);
 static inline void
 vn_relax_fini(struct vn_relax_state *state)
 {
+   /* Hand back the ring "monitor" role; vn_ring_monitor_release() returns
+    * early when the calling thread is not the registered monitor thread.
+    */
+   vn_ring_monitor_release(state->ring);
 }
 
 static_assert(sizeof(vn_object_id) >= sizeof(uintptr_t), "");
index 7b5b475..ad0b9bf 100644 (file)
@@ -133,8 +133,19 @@ vn_instance_init_ring(struct vn_instance *instance)
 
    instance->ring.id = (uintptr_t)ring;
 
+   struct VkRingMonitorInfoMESA monitor_info;
+   if (instance->experimental.ringMonitoring) {
+      ring->monitor.report_period_us = 3000000;
+      mtx_init(&ring->monitor.mutex, mtx_plain);
+      monitor_info = (struct VkRingMonitorInfoMESA){
+         .sType = VK_STRUCTURE_TYPE_RING_MONITOR_INFO_MESA,
+         .maxReportingPeriodMicroseconds = ring->monitor.report_period_us,
+      };
+   }
+
    const struct VkRingCreateInfoMESA info = {
       .sType = VK_STRUCTURE_TYPE_RING_CREATE_INFO_MESA,
+      .pNext = instance->experimental.ringMonitoring ? &monitor_info : NULL,
       .resourceId = instance->ring.shmem->res_id,
       .size = layout.shmem_size,
       .idleTimeout = 50ull * 1000 * 1000,
@@ -231,12 +242,14 @@ vn_instance_init_experimental_features(struct vn_instance *instance)
              "\n\tglobalFencing = %u"
              "\n\tlargeRing = %u"
              "\n\tsyncFdFencing = %u"
-             "\n\tasyncRoundtrip = %u",
+             "\n\tasyncRoundtrip = %u"
+             "\n\tringMonitoring = %u",
              instance->experimental.memoryResourceAllocationSize,
              instance->experimental.globalFencing,
              instance->experimental.largeRing,
              instance->experimental.syncFdFencing,
-             instance->experimental.asyncRoundtrip);
+             instance->experimental.asyncRoundtrip,
+             instance->experimental.ringMonitoring);
    }
 
    return VK_SUCCESS;
index 189fe4b..715cbaa 100644 (file)
@@ -208,6 +208,9 @@ vn_ring_fini(struct vn_ring *ring)
    list_for_each_entry_safe(struct vn_ring_submit, submit,
                             &ring->free_submits, head)
       free(submit);
+
+   if (ring->monitor.report_period_us)
+      mtx_destroy(&ring->monitor.mutex);
 }
 
 struct vn_ring_submit *
index 6959450..9e15a3d 100644 (file)
@@ -70,6 +70,22 @@ struct vn_ring {
 
    struct list_head submits;
    struct list_head free_submits;
+
+   /* Only one "waiting" thread may fulfill the "monitor" role at a time.
+    * Every "report_period_us" or longer, the waiting "monitor" thread tests
+    * the ring's ALIVE status, updates the "alive" atomic, and resets the
+    * ALIVE status for the next cycle. Waiting non-"monitor" threads just
+    * check the "alive" atomic. The "monitor" role may be released and
+    * acquired by another waiting thread dynamically.
+    */
+   struct {
+      mtx_t mutex;
+      atomic_int threadid;
+      atomic_bool alive;
+
+      /* constant and non-zero after ring init, if monitoring is enabled */
+      uint32_t report_period_us;
+   } monitor;
 };
 
 void