[Mesa-dev] [PATCH 1/3] i965: Be resilient in the face of GPU hangs

Chris Wilson chris at chris-wilson.co.uk
Mon Feb 18 12:22:09 UTC 2019


If we hang the GPU and our context ends up banned, we can no longer submit
batches and instead abort with an error (exit(1) no less). As we submit
minimal incremental batches that rely on the logical context state of
previous batches, we cannot rely on the kernel's recovery mechanism
which tries to restore the context back to a "golden" renderstate (the
default HW setup) and replay the batches in flight. Instead, we must
create a new context and set it up, including all the lost register
settings that we only apply once during setup, before allowing the user to
continue rendering. The batches already submitted are lost
(unrecoverable) so there will be a momentary glitch and lost rendering
across frames, but the application should be able to recover and
continue on fairly oblivious.

To make wedging even more likely, we use a new "no recovery" context
parameter that tells the kernel to not even attempt to replay any
batches in flight against the default context image, as experience shows
the HW is not always robust enough to cope with the conflicting state.

v2: Export brw_reset_state() to improve the amount of state we clobber
on return to a starting context. (Kenneth)

Cc: Kenneth Graunke <kenneth at whitecape.org>
Reviewed-by: Kenneth Graunke <kenneth at whitecape.org> # pre-uapi split
---
 src/mesa/drivers/dri/i965/brw_bufmgr.c        | 11 +++++++++
 src/mesa/drivers/dri/i965/brw_bufmgr.h        |  1 +
 src/mesa/drivers/dri/i965/brw_context.h       |  3 +++
 src/mesa/drivers/dri/i965/brw_state_upload.c  | 22 ++++++++++++++----
 src/mesa/drivers/dri/i965/intel_batchbuffer.c | 23 +++++++++++++++++++
 5 files changed, 56 insertions(+), 4 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.c b/src/mesa/drivers/dri/i965/brw_bufmgr.c
index b33a30930db..1248f8b9fa4 100644
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.c
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.c
@@ -1621,6 +1621,17 @@ brw_hw_context_set_priority(struct brw_bufmgr *bufmgr,
    return err;
 }
 
+int
+brw_hw_context_get_priority(struct brw_bufmgr *bufmgr, uint32_t ctx_id)
+{
+   struct drm_i915_gem_context_param p = {
+      .ctx_id = ctx_id,
+      .param = I915_CONTEXT_PARAM_PRIORITY,
+   };
+   drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &p);
+   return p.value; /* on error, return 0 i.e. default priority */
+}
+
 void
 brw_destroy_hw_context(struct brw_bufmgr *bufmgr, uint32_t ctx_id)
 {
diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.h b/src/mesa/drivers/dri/i965/brw_bufmgr.h
index 32fc7a553c9..9e80c2a831b 100644
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.h
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.h
@@ -356,6 +356,7 @@ uint32_t brw_create_hw_context(struct brw_bufmgr *bufmgr);
 int brw_hw_context_set_priority(struct brw_bufmgr *bufmgr,
                                 uint32_t ctx_id,
                                 int priority);
+int brw_hw_context_get_priority(struct brw_bufmgr *bufmgr, uint32_t ctx_id);
 
 void brw_destroy_hw_context(struct brw_bufmgr *bufmgr, uint32_t ctx_id);
 
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 66fe5b3a8a0..4a306c4217a 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -1647,6 +1647,9 @@ brw_get_graphics_reset_status(struct gl_context *ctx);
 void
 brw_check_for_reset(struct brw_context *brw);
 
+void
+brw_reset_state(struct brw_context *brw);
+
 /* brw_compute.c */
 extern void
 brw_init_compute_functions(struct dd_function_table *functions);
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index 50049d325b3..b873cf1b58a 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -228,12 +228,8 @@ brw_copy_pipeline_atoms(struct brw_context *brw,
 
 void brw_init_state( struct brw_context *brw )
 {
-   struct gl_context *ctx = &brw->ctx;
    const struct gen_device_info *devinfo = &brw->screen->devinfo;
 
-   /* Force the first brw_select_pipeline to emit pipeline select */
-   brw->last_pipeline = BRW_NUM_PIPELINES;
-
    brw_init_caches(brw);
 
    if (devinfo->gen >= 11)
@@ -257,6 +253,17 @@ void brw_init_state( struct brw_context *brw )
    else
       gen4_init_atoms(brw);
 
+   brw_reset_state(brw);
+}
+
+
+void brw_reset_state( struct brw_context *brw )
+{
+   struct gl_context *ctx = &brw->ctx;
+
+   /* Force the first brw_select_pipeline to emit pipeline select */
+   brw->last_pipeline = BRW_NUM_PIPELINES;
+
    brw_upload_initial_gpu_state(brw);
 
    brw->NewGLState = ~0;
@@ -267,6 +274,13 @@ void brw_init_state( struct brw_context *brw )
     */
    brw->pma_stall_bits = ~0;
 
+   brw->no_depth_or_stencil = false;
+
+   brw->urb.vsize = 0;
+   brw->urb.gsize = 0;
+   brw->urb.hsize = 0;
+   brw->urb.dsize = 0;
+
    /* Make sure that brw->ctx.NewDriverState has enough bits to hold all possible
     * dirty flags.
     */
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index 8097392d22b..84620d0a5b9 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -748,6 +748,22 @@ execbuffer(int fd,
    return ret;
 }
 
+static int recreate_context(struct brw_context *brw)
+{
+   struct brw_bufmgr *bufmgr = brw->bufmgr;
+   int prio;
+
+   prio = brw_hw_context_get_priority(bufmgr, brw->hw_ctx);
+   brw_destroy_hw_context(bufmgr, brw->hw_ctx);
+
+   brw->hw_ctx = brw_create_hw_context(bufmgr);
+   if (!brw->hw_ctx)
+      return -EIO;
+
+   brw_hw_context_set_priority(bufmgr, brw->hw_ctx, prio);
+   return 0;
+}
+
 static int
 submit_batch(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
 {
@@ -834,6 +850,13 @@ submit_batch(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
    if (brw->ctx.Const.ResetStrategy == GL_LOSE_CONTEXT_ON_RESET_ARB)
       brw_check_for_reset(brw);
 
+   if (ret == -EIO) { /* Abandon all hope; and restart from scratch */
+      if (unlikely(INTEL_DEBUG & DEBUG_SUBMIT))
+         fprintf(stderr, "detected GPU hang; recreating context\n");
+      ret = recreate_context(brw);
+      brw_reset_state(brw);
+   }
+
    if (ret != 0) {
       fprintf(stderr, "i965: Failed to submit batchbuffer: %s\n",
               strerror(-ret));
-- 
2.20.1



More information about the mesa-dev mailing list