zink: add env var to abort on device-lost if no reset callback is set
author Mike Blumenkrantz <michael.blumenkrantz@gmail.com>
Tue, 12 Jul 2022 13:17:25 +0000 (09:17 -0400)
committer Marge Bot <emma+marge@anholt.net>
Thu, 21 Jul 2022 14:02:27 +0000 (14:02 +0000)
the alternative here is to just spin aimlessly until the process OOMs,
which causes problems when trying to detect failures in CTS caselists

a separate env var (ZINK_HANG_ABORT) is used so that it can be exported
without affecting ZINK_DEBUG

Acked-by: Erik Faye-Lund <erik.faye-lund@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17525>

src/gallium/drivers/zink/zink_batch.c
src/gallium/drivers/zink/zink_context.c
src/gallium/drivers/zink/zink_screen.c
src/gallium/drivers/zink/zink_screen.h

index f041e09..91447aa 100644 (file)
@@ -332,6 +332,9 @@ post_submit(void *data, void *gdata, int thread_index)
    if (bs->is_device_lost) {
       if (bs->ctx->reset.reset)
          bs->ctx->reset.reset(bs->ctx->reset.data, PIPE_GUILTY_CONTEXT_RESET);
+      else if (screen->abort_on_hang && !screen->robust_ctx_count)
+         /* if nothing can save us, abort */
+         abort();
       screen->device_lost = true;
    } else if (bs->ctx->batch_states_count > 5000) {
       zink_screen_timeline_wait(screen, bs->fence.batch_id - 2500, PIPE_TIMEOUT_INFINITE);
index 3494733..1ad4825 100644 (file)
@@ -217,11 +217,20 @@ zink_set_device_reset_callback(struct pipe_context *pctx,
                                const struct pipe_device_reset_callback *cb)
 {
    struct zink_context *ctx = zink_context(pctx);
+   bool had_reset = !!ctx->reset.reset;
 
    if (cb)
       ctx->reset = *cb;
    else
       memset(&ctx->reset, 0, sizeof(ctx->reset));
+
+   bool have_reset = !!ctx->reset.reset;
+   if (had_reset != have_reset) {
+      if (have_reset)
+         p_atomic_inc(&zink_screen(pctx->screen)->robust_ctx_count);
+      else
+         p_atomic_dec(&zink_screen(pctx->screen)->robust_ctx_count);
+   }
 }
 
 static void
index f5a5436..b81052b 100644 (file)
@@ -2103,6 +2103,7 @@ zink_internal_create_screen(const struct pipe_screen_config *config)
       return NULL;
 
    screen->threaded = util_get_cpu_caps()->nr_cpus > 1 && debug_get_bool_option("GALLIUM_THREAD", util_get_cpu_caps()->nr_cpus > 1);
+   screen->abort_on_hang = debug_get_bool_option("ZINK_HANG_ABORT", false);
 
    zink_debug = debug_get_option_zink_debug();
    zink_descriptor_mode = debug_get_option_zink_descriptor_mode();
index 1fcc7f6..24f0a63 100644 (file)
@@ -101,6 +101,7 @@ struct zink_screen {
 
    bool threaded;
    bool is_cpu;
+   bool abort_on_hang;
    uint64_t curr_batch; //the current batch id
    uint32_t last_finished;
    VkSemaphore sem;
@@ -110,6 +111,7 @@ struct zink_screen {
 
    unsigned buffer_rebind_counter;
    unsigned image_rebind_counter;
+   unsigned robust_ctx_count;
 
    struct hash_table dts;
    simple_mtx_t dt_lock;
@@ -258,6 +260,9 @@ zink_screen_handle_vkresult(struct zink_screen *screen, VkResult ret)
    case VK_ERROR_DEVICE_LOST:
       screen->device_lost = true;
       mesa_loge("zink: DEVICE LOST!\n");
+      /* if nothing can save us, abort */
+      if (screen->abort_on_hang && !screen->robust_ctx_count)
+         abort();
       FALLTHROUGH;
    default:
       success = false;