Mesa (main): zink: add env var to abort on device-lost if no reset callback is set

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Thu Jul 21 14:19:37 UTC 2022


Module: Mesa
Branch: main
Commit: 2ea0d735d4a24f995434e0b1ef27cc14b5a5e80d
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=2ea0d735d4a24f995434e0b1ef27cc14b5a5e80d

Author: Mike Blumenkrantz <michael.blumenkrantz at gmail.com>
Date:   Tue Jul 12 09:17:25 2022 -0400

zink: add env var to abort on device-lost if no reset callback is set

without this, the driver just spins aimlessly after a device-lost until the
process OOMs, which makes it hard to detect failures in CTS caselists

a separate env var (ZINK_HANG_ABORT) is used so that it can be exported
without affecting ZINK_DEBUG

Acked-by: Erik Faye-Lund <erik.faye-lund at collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17525>

---

 src/gallium/drivers/zink/zink_batch.c   | 3 +++
 src/gallium/drivers/zink/zink_context.c | 9 +++++++++
 src/gallium/drivers/zink/zink_screen.c  | 1 +
 src/gallium/drivers/zink/zink_screen.h  | 5 +++++
 4 files changed, 18 insertions(+)

diff --git a/src/gallium/drivers/zink/zink_batch.c b/src/gallium/drivers/zink/zink_batch.c
index f041e091c85..91447aa0355 100644
--- a/src/gallium/drivers/zink/zink_batch.c
+++ b/src/gallium/drivers/zink/zink_batch.c
@@ -332,6 +332,9 @@ post_submit(void *data, void *gdata, int thread_index)
    if (bs->is_device_lost) {
       if (bs->ctx->reset.reset)
          bs->ctx->reset.reset(bs->ctx->reset.data, PIPE_GUILTY_CONTEXT_RESET);
+      else if (screen->abort_on_hang && !screen->robust_ctx_count)
+         /* if nothing can save us, abort */
+         abort();
       screen->device_lost = true;
    } else if (bs->ctx->batch_states_count > 5000) {
       zink_screen_timeline_wait(screen, bs->fence.batch_id - 2500, PIPE_TIMEOUT_INFINITE);
diff --git a/src/gallium/drivers/zink/zink_context.c b/src/gallium/drivers/zink/zink_context.c
index 3494733e1ce..1ad4825582b 100644
--- a/src/gallium/drivers/zink/zink_context.c
+++ b/src/gallium/drivers/zink/zink_context.c
@@ -217,11 +217,20 @@ zink_set_device_reset_callback(struct pipe_context *pctx,
                                const struct pipe_device_reset_callback *cb)
 {
    struct zink_context *ctx = zink_context(pctx);
+   bool had_reset = !!ctx->reset.reset;
 
    if (cb)
       ctx->reset = *cb;
    else
       memset(&ctx->reset, 0, sizeof(ctx->reset));
+
+   bool have_reset = !!ctx->reset.reset;
+   if (had_reset != have_reset) {
+      if (have_reset)
+         p_atomic_inc(&zink_screen(pctx->screen)->robust_ctx_count);
+      else
+         p_atomic_dec(&zink_screen(pctx->screen)->robust_ctx_count);
+   }
 }
 
 static void
diff --git a/src/gallium/drivers/zink/zink_screen.c b/src/gallium/drivers/zink/zink_screen.c
index f5a54361ca9..b81052bedb0 100644
--- a/src/gallium/drivers/zink/zink_screen.c
+++ b/src/gallium/drivers/zink/zink_screen.c
@@ -2103,6 +2103,7 @@ zink_internal_create_screen(const struct pipe_screen_config *config)
       return NULL;
 
    screen->threaded = util_get_cpu_caps()->nr_cpus > 1 && debug_get_bool_option("GALLIUM_THREAD", util_get_cpu_caps()->nr_cpus > 1);
+   screen->abort_on_hang = debug_get_bool_option("ZINK_HANG_ABORT", false);
 
    zink_debug = debug_get_option_zink_debug();
    zink_descriptor_mode = debug_get_option_zink_descriptor_mode();
diff --git a/src/gallium/drivers/zink/zink_screen.h b/src/gallium/drivers/zink/zink_screen.h
index 1fcc7f6a618..24f0a63fa33 100644
--- a/src/gallium/drivers/zink/zink_screen.h
+++ b/src/gallium/drivers/zink/zink_screen.h
@@ -101,6 +101,7 @@ struct zink_screen {
 
    bool threaded;
    bool is_cpu;
+   bool abort_on_hang;
    uint64_t curr_batch; //the current batch id
    uint32_t last_finished;
    VkSemaphore sem;
@@ -110,6 +111,7 @@ struct zink_screen {
 
    unsigned buffer_rebind_counter;
    unsigned image_rebind_counter;
+   unsigned robust_ctx_count;
 
    struct hash_table dts;
    simple_mtx_t dt_lock;
@@ -258,6 +260,9 @@ zink_screen_handle_vkresult(struct zink_screen *screen, VkResult ret)
    case VK_ERROR_DEVICE_LOST:
       screen->device_lost = true;
       mesa_loge("zink: DEVICE LOST!\n");
+      /* if nothing can save us, abort */
+      if (screen->abort_on_hang && !screen->robust_ctx_count)
+         abort();
       FALLTHROUGH;
    default:
       success = false;



More information about the mesa-commit mailing list