[Mesa-dev] [PATCH 2/2] i965: Be resilient in the face of GPU hangs

Chris Wilson chris at chris-wilson.co.uk
Thu Feb 14 12:05:00 UTC 2019


If we hang the GPU and end up banning our context, we will no longer be
able to submit and abort with an error (exit(1) no less). As we submit
minimal incremental batches that rely on the logical context state of
previous batches, we can not rely on the kernel's recovery mechanism
which tries to restore the context back to a "golden" renderstate (the
default HW setup) and replay the batches in flight. Instead, we must
create a new context and set it up, including all the lost register
settings that we only apply once during setup, before allow the user to
continue rendering. The batches already submitted are lost
(unrecoverable) so there will be a momentarily glitch and lost rendering
across frames, but the application should be able to recover and
continue on fairly oblivious.

To make wedging even more likely, we use a new "no recovery" context
parameter that tells the kernel to not even attempt to replay any
batches in flight against the default context image, as experience shows
the HW is not always robust enough to cope with the conflicting state.

Cc: Kenneth Graunke <kenneth at whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_bufmgr.c        | 25 +++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_bufmgr.h        |  2 ++
 src/mesa/drivers/dri/i965/intel_batchbuffer.c | 19 ++++++++++++++
 3 files changed, 46 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.c b/src/mesa/drivers/dri/i965/brw_bufmgr.c
index b33a30930db..289b39cd584 100644
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.c
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.c
@@ -1589,6 +1589,16 @@ init_cache_buckets(struct brw_bufmgr *bufmgr)
    }
 }
 
+static void init_context(struct brw_bufmgr *bufmgr, uint32_t ctx_id)
+{
+   struct drm_i915_gem_context_param p = {
+      .ctx_id = ctx_id,
+      .param = 0x7, // I915_CONTEXT_PARAM_RECOVERABLE,
+   };
+
+   drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p);
+}
+
 uint32_t
 brw_create_hw_context(struct brw_bufmgr *bufmgr)
 {
@@ -1599,6 +1609,8 @@ brw_create_hw_context(struct brw_bufmgr *bufmgr)
       return 0;
    }
 
+   init_context(bufmgr, create.ctx_id);
+
    return create.ctx_id;
 }
 
@@ -1621,6 +1633,19 @@ brw_hw_context_set_priority(struct brw_bufmgr *bufmgr,
    return err;
 }
 
+int
+brw_hw_context_get_priority(struct brw_bufmgr *bufmgr, uint32_t ctx_id)
+{
+   struct drm_i915_gem_context_param p = {
+      .ctx_id = ctx_id,
+      .param = I915_CONTEXT_PARAM_PRIORITY,
+   };
+
+   drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &p);
+
+   return p.value; /* on error, return 0 i.e. default priority */
+}
+
 void
 brw_destroy_hw_context(struct brw_bufmgr *bufmgr, uint32_t ctx_id)
 {
diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.h b/src/mesa/drivers/dri/i965/brw_bufmgr.h
index 32fc7a553c9..886b2e607ce 100644
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.h
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.h
@@ -356,6 +356,8 @@ uint32_t brw_create_hw_context(struct brw_bufmgr *bufmgr);
 int brw_hw_context_set_priority(struct brw_bufmgr *bufmgr,
                                 uint32_t ctx_id,
                                 int priority);
+int
+brw_hw_context_get_priority(struct brw_bufmgr *bufmgr, uint32_t ctx_id);
 
 void brw_destroy_hw_context(struct brw_bufmgr *bufmgr, uint32_t ctx_id);
 
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index 8097392d22b..afb6e2401e3 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -748,6 +748,18 @@ execbuffer(int fd,
    return ret;
 }
 
+static void recreate_context(struct brw_context *brw)
+{
+   struct brw_bufmgr *bufmgr = brw->bufmgr;
+   int prio;
+
+   prio = brw_hw_context_get_priority(bufmgr, brw->hw_ctx);
+   brw_destroy_hw_context(bufmgr, brw->hw_ctx);
+
+   brw->hw_ctx = brw_create_hw_context(bufmgr);
+   brw_hw_context_set_priority(bufmgr, brw->hw_ctx, prio);
+}
+
 static int
 submit_batch(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
 {
@@ -834,6 +846,13 @@ submit_batch(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
    if (brw->ctx.Const.ResetStrategy == GL_LOSE_CONTEXT_ON_RESET_ARB)
       brw_check_for_reset(brw);
 
+   if (ret == -EIO) {
+      recreate_context(brw);
+      brw->ctx.NewDriverState |= BRW_NEW_CONTEXT;
+      brw_upload_invariant_state(brw);
+      ret = 0;
+   }
+
    if (ret != 0) {
       fprintf(stderr, "i965: Failed to submit batchbuffer: %s\n",
               strerror(-ret));
-- 
2.20.1



More information about the mesa-dev mailing list