Mesa (master): iris: Try to recover from GPU hangs.

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Thu May 9 23:49:50 UTC 2019


Module: Mesa
Branch: master
Commit: c5c12bdd004994abfe0f5723e2d285cc69706b1a
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=c5c12bdd004994abfe0f5723e2d285cc69706b1a

Author: Kenneth Graunke <kenneth at whitecape.org>
Date:   Tue May  7 23:19:30 2019 -0700

iris: Try to recover from GPU hangs.

The iris batch module now tries to detect that the kernel has banned
our GEM context, creates a new non-banned context, and informs the
iris context module that all assumptions about state are now invalid
and it needs to reinitialize the relevant state.

Based on Chris Wilson's work, but significantly rewritten by me.

---

 src/gallium/drivers/iris/iris_batch.c   | 31 +++++++++++++++++++++++++++
 src/gallium/drivers/iris/iris_context.c | 38 +++++++++++++++++++++++++++++++++
 src/gallium/drivers/iris/iris_context.h |  2 ++
 3 files changed, 71 insertions(+)

diff --git a/src/gallium/drivers/iris/iris_batch.c b/src/gallium/drivers/iris/iris_batch.c
index d2b4fc88fe6..f3d2e569aa7 100644
--- a/src/gallium/drivers/iris/iris_batch.c
+++ b/src/gallium/drivers/iris/iris_batch.c
@@ -452,6 +452,28 @@ iris_finish_batch(struct iris_batch *batch)
 }
 
 /**
+ * Replace our current GEM context with a new one (in case it got banned).
+ */
+static bool
+replace_hw_ctx(struct iris_batch *batch)
+{
+   struct iris_screen *screen = batch->screen;
+   struct iris_bufmgr *bufmgr = screen->bufmgr;
+
+   uint32_t new_ctx = iris_clone_hw_context(bufmgr, batch->hw_ctx_id);
+   if (!new_ctx)
+      return false;
+
+   iris_destroy_hw_context(bufmgr, batch->hw_ctx_id);
+   batch->hw_ctx_id = new_ctx;
+
+   /* Notify the context that state must be re-initialized. */
+   iris_lost_context_state(batch);
+
+   return true;
+}
+
+/**
  * Submit the batch to the GPU via execbuffer2.
  */
 static int
@@ -583,6 +605,15 @@ _iris_batch_flush(struct iris_batch *batch, const char *file, int line)
    /* Start a new batch buffer. */
    iris_batch_reset(batch);
 
+   /* EIO means our context is banned.  In this case, try and replace it
+    * with a new logical context, and inform iris_context that all state
+    * has been lost and needs to be re-initialized.  If this succeeds,
+    * dubiously claim success...
+    */
+   if (ret == -EIO && replace_hw_ctx(batch)) {
+      ret = 0;
+   }
+
    if (ret >= 0) {
       //if (iris->ctx.Const.ResetStrategy == GL_LOSE_CONTEXT_ON_RESET_ARB)
          //iris_check_for_reset(ice);
diff --git a/src/gallium/drivers/iris/iris_context.c b/src/gallium/drivers/iris/iris_context.c
index a1d11755a24..7ed4fdcd8d7 100644
--- a/src/gallium/drivers/iris/iris_context.c
+++ b/src/gallium/drivers/iris/iris_context.c
@@ -63,6 +63,44 @@ iris_set_debug_callback(struct pipe_context *ctx,
       memset(&ice->dbg, 0, sizeof(ice->dbg));
 }
 
+/**
+ * Called from the batch module when it detects a GPU hang.
+ *
+ * In this case, we've lost our GEM context, and can't rely on any existing
+ * state on the GPU.  We must mark everything dirty and wipe away any saved
+ * assumptions about the last known state of the GPU.
+ */
+void
+iris_lost_context_state(struct iris_batch *batch)
+{
+   /* The batch module doesn't have an iris_context, because we want to
+    * avoid introducing lots of layering violations.  Unfortunately, here
+    * we do need to inform the context of batch catastrophe.  We know the
+    * batch is one of our context's, so hackily claw our way back.
+    */
+   struct iris_context *ice = NULL;
+   struct iris_screen *screen;
+
+   if (batch->name == IRIS_BATCH_RENDER) {
+      ice = container_of(batch, ice, batches[IRIS_BATCH_RENDER]);
+      assert(&ice->batches[IRIS_BATCH_RENDER] == batch);
+      screen = (void *) ice->ctx.screen;
+
+      ice->vtbl.init_render_context(screen, batch, &ice->vtbl, &ice->dbg);
+   } else if (batch->name == IRIS_BATCH_COMPUTE) {
+      ice = container_of(batch, ice, batches[IRIS_BATCH_COMPUTE]);
+      assert(&ice->batches[IRIS_BATCH_COMPUTE] == batch);
+      screen = (void *) ice->ctx.screen;
+
+      ice->vtbl.init_compute_context(screen, batch, &ice->vtbl, &ice->dbg);
+   } else {
+      unreachable("unhandled batch reset");
+   }
+
+   ice->state.dirty = ~0ull;
+   memset(ice->state.last_grid, 0, sizeof(ice->state.last_grid));
+}
+
 static void
 iris_get_sample_position(struct pipe_context *ctx,
                          unsigned sample_count,
diff --git a/src/gallium/drivers/iris/iris_context.h b/src/gallium/drivers/iris/iris_context.h
index 31f345d36b0..4501c4fcad9 100644
--- a/src/gallium/drivers/iris/iris_context.h
+++ b/src/gallium/drivers/iris/iris_context.h
@@ -662,6 +662,8 @@ double get_time(void);
 struct pipe_context *
 iris_create_context(struct pipe_screen *screen, void *priv, unsigned flags);
 
+void iris_lost_context_state(struct iris_batch *batch);
+
 void iris_init_blit_functions(struct pipe_context *ctx);
 void iris_init_clear_functions(struct pipe_context *ctx);
 void iris_init_program_functions(struct pipe_context *ctx);




More information about the mesa-commit mailing list