[Mesa-dev] [PATCH] i965: Be resilient in the face of GPU hangs

Chris Wilson chris at chris-wilson.co.uk
Sat Feb 16 12:46:27 UTC 2019


If we hang the GPU and end up banning our context, we will no longer be
able to submit, and we abort with an error (exit(1) no less). As we submit
minimal incremental batches that rely on the logical context state of
previous batches, we cannot rely on the kernel's recovery mechanism
which tries to restore the context back to a "golden" renderstate (the
default HW setup) and replay the batches in flight. Instead, we must
create a new context and set it up, including all the lost register
settings that we only apply once during setup, before allowing the user to
continue rendering. The batches already submitted are lost
(unrecoverable) so there will be a momentary glitch and lost rendering
across frames, but the application should be able to recover and
continue on fairly oblivious.

To make wedging even more likely, we use a new "no recovery" context
parameter that tells the kernel to not even attempt to replay any
batches in flight against the default context image, as experience shows
the HW is not always robust enough to cope with the conflicting state.

v2: Export brw_reset_state() to improve the amount of state we clobber
on return to a starting context. (Kenneth)

Cc: Kenneth Graunke <kenneth at whitecape.org>
---
The intent was to refactor the existing brw_reset_state() out of
brw_init_state() so that we could reuse it, so reuse it!
---
 src/mesa/drivers/dri/i965/brw_bufmgr.c        | 25 +++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_bufmgr.h        |  2 ++
 src/mesa/drivers/dri/i965/brw_context.h       |  3 +++
 src/mesa/drivers/dri/i965/brw_state_upload.c  | 22 ++++++++++++----
 src/mesa/drivers/dri/i965/intel_batchbuffer.c | 20 +++++++++++++++
 5 files changed, 67 insertions(+), 5 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.c b/src/mesa/drivers/dri/i965/brw_bufmgr.c
index b33a30930db..d8a9f0c450d 100644
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.c
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.c
@@ -1589,6 +1589,16 @@ init_cache_buckets(struct brw_bufmgr *bufmgr)
    }
 }
 
+static void init_context(struct brw_bufmgr *bufmgr, uint32_t ctx_id)
+{ /* Applies one context parameter to ctx_id via CONTEXT_SETPARAM. */
+   struct drm_i915_gem_context_param p = {
+      .ctx_id = ctx_id,
+      .param = 0x8, /* I915_CONTEXT_PARAM_RECOVERABLE; .value is left zero-initialized */
+   };
+
+   drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p); /* result ignored */
+}
+
 uint32_t
 brw_create_hw_context(struct brw_bufmgr *bufmgr)
 {
@@ -1599,6 +1609,8 @@ brw_create_hw_context(struct brw_bufmgr *bufmgr)
       return 0;
    }
 
+   init_context(bufmgr, create.ctx_id);
+
    return create.ctx_id;
 }
 
@@ -1621,6 +1633,19 @@ brw_hw_context_set_priority(struct brw_bufmgr *bufmgr,
    return err;
 }
 
+int
+brw_hw_context_get_priority(struct brw_bufmgr *bufmgr, uint32_t ctx_id)
+{ /* Reads I915_CONTEXT_PARAM_PRIORITY for ctx_id via CONTEXT_GETPARAM. */
+   struct drm_i915_gem_context_param p = {
+      .ctx_id = ctx_id,
+      .param = I915_CONTEXT_PARAM_PRIORITY,
+   };
+
+   drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &p); /* result not checked */
+
+   return p.value; /* on error, return 0 i.e. default priority (.value starts zero-initialized) */
+}
+
 void
 brw_destroy_hw_context(struct brw_bufmgr *bufmgr, uint32_t ctx_id)
 {
diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.h b/src/mesa/drivers/dri/i965/brw_bufmgr.h
index 32fc7a553c9..886b2e607ce 100644
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.h
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.h
@@ -356,6 +356,8 @@ uint32_t brw_create_hw_context(struct brw_bufmgr *bufmgr);
 int brw_hw_context_set_priority(struct brw_bufmgr *bufmgr,
                                 uint32_t ctx_id,
                                 int priority);
+int
+brw_hw_context_get_priority(struct brw_bufmgr *bufmgr, uint32_t ctx_id);
 
 void brw_destroy_hw_context(struct brw_bufmgr *bufmgr, uint32_t ctx_id);
 
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 66fe5b3a8a0..4a306c4217a 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -1647,6 +1647,9 @@ brw_get_graphics_reset_status(struct gl_context *ctx);
 void
 brw_check_for_reset(struct brw_context *brw);
 
+void
+brw_reset_state(struct brw_context *brw);
+
 /* brw_compute.c */
 extern void
 brw_init_compute_functions(struct dd_function_table *functions);
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index 50049d325b3..a320c24edc5 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -228,12 +228,8 @@ brw_copy_pipeline_atoms(struct brw_context *brw,
 
 void brw_init_state( struct brw_context *brw )
 {
-   struct gl_context *ctx = &brw->ctx;
    const struct gen_device_info *devinfo = &brw->screen->devinfo;
 
-   /* Force the first brw_select_pipeline to emit pipeline select */
-   brw->last_pipeline = BRW_NUM_PIPELINES;
-
    brw_init_caches(brw);
 
    if (devinfo->gen >= 11)
@@ -257,6 +253,16 @@ void brw_init_state( struct brw_context *brw )
    else
       gen4_init_atoms(brw);
 
+   brw_reset_state(brw);
+}
+
+void brw_reset_state( struct brw_context *brw )
+{
+   struct gl_context *ctx = &brw->ctx;
+
+   /* Force the first brw_select_pipeline to emit pipeline select */
+   brw->last_pipeline = BRW_NUM_PIPELINES;
+
    brw_upload_initial_gpu_state(brw);
 
    brw->NewGLState = ~0;
@@ -267,6 +273,13 @@ void brw_init_state( struct brw_context *brw )
     */
    brw->pma_stall_bits = ~0;
 
+   brw->no_depth_or_stencil = false;
+
+   brw->urb.vsize = 0;
+   brw->urb.gsize = 0;
+   brw->urb.hsize = 0;
+   brw->urb.dsize = 0;
+
    /* Make sure that brw->ctx.NewDriverState has enough bits to hold all possible
     * dirty flags.
     */
@@ -284,7 +297,6 @@ void brw_init_state( struct brw_context *brw )
    ctx->DriverFlags.NewIntelConservativeRasterization = BRW_NEW_CONSERVATIVE_RASTERIZATION;
 }
 
-
 void brw_destroy_state( struct brw_context *brw )
 {
    brw_destroy_caches(brw);
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index 1cdf6fd65f5..145885e3b40 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -749,6 +749,18 @@ execbuffer(int fd,
    return ret;
 }
 
+static void recreate_context(struct brw_context *brw)
+{ /* Swaps brw->hw_ctx for a newly created context, reapplying the old priority. */
+   struct brw_bufmgr *bufmgr = brw->bufmgr;
+   int prio;
+
+   prio = brw_hw_context_get_priority(bufmgr, brw->hw_ctx); /* read before the old context is destroyed */
+   brw_destroy_hw_context(bufmgr, brw->hw_ctx);
+
+   brw->hw_ctx = brw_create_hw_context(bufmgr); /* NOTE(review): create has a `return 0` failure path; not checked here */
+   brw_hw_context_set_priority(bufmgr, brw->hw_ctx, prio); /* returned error code is ignored */
+}
+
 static int
 submit_batch(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
 {
@@ -835,6 +847,14 @@ submit_batch(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
    if (brw->ctx.Const.ResetStrategy == GL_LOSE_CONTEXT_ON_RESET_ARB)
       brw_check_for_reset(brw);
 
+   if (ret == -EIO) { /* Abandon all hope; and restart from scratch */
+      if (unlikely(INTEL_DEBUG & DEBUG_SUBMIT))
+         fprintf(stderr, "detected GPU hang; recreating context\n");
+      recreate_context(brw);
+      brw_reset_state(brw);
+      ret = 0;
+   }
+
    if (ret != 0) {
       fprintf(stderr, "i965: Failed to submit batchbuffer: %s\n",
               strerror(-ret));
-- 
2.20.1



More information about the mesa-dev mailing list