[Intel-gfx] [PATCH] drm/i915: Split I915_RESET_IN_PROGRESS into two flags

Thu Feb 23 16:59:45 UTC 2017

I915_RESET_IN_PROGRESS is being used for both signaling the requirement
to i915_mutex_lock_interruptible() to avoid taking the struct_mutex and
to instruct a waiter (already holding the struct_mutex) to perform the
reset. To allow for a little more coordination, split these two meaning
into a couple of distinct flags. I915_RESET_BACKOFF tells
i915_mutex_lock_interruptible() not to acquire the mutex and
I915_RESET_HANDOFF tells the waiter to call i915_reset().

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
---
This is part of a much bigger problem to try and restore the balance
between atomic modeset and resets. However, now that the waiter has
been revamped, this patch should help us ease forward with TDR.
-Chris
---
 drivers/gpu/drm/i915/i915_debugfs.c              |  8 +++--
 drivers/gpu/drm/i915/i915_drv.c                  |  5 +--
 drivers/gpu/drm/i915/i915_drv.h                  | 40 +++++++++++++++++++++---
 drivers/gpu/drm/i915/i915_gem.c                  |  5 +--
 drivers/gpu/drm/i915/i915_gem_request.c          |  2 +-
 drivers/gpu/drm/i915/i915_irq.c                  | 11 ++++---
 drivers/gpu/drm/i915/intel_display.c             |  4 +--
 drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 18 +++++++----
 8 files changed, 65 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 1a28b5279bec..ed63d3f7ddc0 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1331,8 +1331,10 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 
 	if (test_bit(I915_WEDGED, &dev_priv->gpu_error.flags))
 		seq_printf(m, "Wedged\n");
-	if (test_bit(I915_RESET_IN_PROGRESS, &dev_priv->gpu_error.flags))
-		seq_printf(m, "Reset in progress\n");
+	if (test_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags))
+		seq_printf(m, "Reset in progress: struct_mutex backoff\n");
+	if (test_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags))
+		seq_printf(m, "Reset in progress: reset handoff to waiter\n");
 	if (waitqueue_active(&dev_priv->gpu_error.wait_queue))
 		seq_printf(m, "Waiter holding struct mutex\n");
 	if (waitqueue_active(&dev_priv->gpu_error.reset_queue))
@@ -4126,7 +4128,7 @@ i915_wedged_set(void *data, u64 val)
 	 * while it is writing to 'i915_wedged'
 	 */
 
-	if (i915_reset_in_progress(&dev_priv->gpu_error))
+	if (i915_reset_backoff(&dev_priv->gpu_error))
 		return -EAGAIN;
 
 	i915_handle_error(dev_priv, val,
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index b76e8f7ac174..5db713a5d540 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1823,8 +1823,9 @@ void i915_reset(struct drm_i915_private *dev_priv)
 	int ret;
 
 	lockdep_assert_held(&dev_priv->drm.struct_mutex);
+	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
 
-	if (!test_and_clear_bit(I915_RESET_IN_PROGRESS, &error->flags))
+	if (!test_and_clear_bit(I915_RESET_HANDOFF, &error->flags))
 		return;
 
 	/* Clear any previous failed attempts at recovery. Time to try again. */
@@ -1877,7 +1878,7 @@ void i915_reset(struct drm_i915_private *dev_priv)
 wakeup:
 	i915_gem_reset_finish(dev_priv);
 	enable_irq(dev_priv->drm.irq);
-	wake_up_bit(&error->flags, I915_RESET_IN_PROGRESS);
+	wake_up_bit(&error->flags, I915_RESET_HANDOFF);
 	return;
 
 error:
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index eed9ead1b592..7e9b1a008134 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1578,8 +1578,33 @@ struct i915_gpu_error {
 	 */
 	unsigned long reset_count;
 
+	/**
+	 * flags: Control various stages of the GPU reset
+	 *
+	 * #I915_RESET_BACKOFF - When we start a reset, we want to stop any
+	 * other users acquiring the struct_mutex. To do this we set the
+	 * #I915_RESET_BACKOFF bit in the error flags when we detect a reset
+	 * and then check for that bit before acquiring the struct_mutex (in
+	 * i915_mutex_lock_interruptible()?). I915_RESET_BACKOFF serves a
+	 * secondary role in preventing two concurrent global reset attempts.
+	 *
+	 * #I915_RESET_HANDOFF - To perform the actual GPU reset, we need the
+	 * struct_mutex. We try to acquire the struct_mutex in the reset worker,
+	 * but it may be held by some long running waiter (that we cannot
+	 * interrupt without causing trouble). Once we are ready to do the GPU
+	 * reset, we set the I915_RESET_HANDOFF bit and wakeup any waiters. If
+	 * they already hold the struct_mutex and want to participate they can
+	 * inspect the bit and do the reset directly, otherwise the worker
+	 * waits for the struct_mutex.
+	 *
+	 * #I915_WEDGED - If reset fails and we can no longer use the GPU,
+	 * we set the #I915_WEDGED bit. Prior to command submission, e.g.
+	 * i915_gem_request_alloc(), this bit is checked and the sequence
+	 * aborted (with -EIO reported to userspace) if set.
+	 */
 	unsigned long flags;
-#define I915_RESET_IN_PROGRESS	0
+#define I915_RESET_BACKOFF	0
+#define I915_RESET_HANDOFF	1
 #define I915_WEDGED		(BITS_PER_LONG - 1)
 
 	/**
@@ -3366,9 +3391,14 @@ i915_gem_find_active_request(struct intel_engine_cs *engine);
 
 void i915_gem_retire_requests(struct drm_i915_private *dev_priv);
 
-static inline bool i915_reset_in_progress(struct i915_gpu_error *error)
+static inline bool i915_reset_backoff(struct i915_gpu_error *error)
+{
+	return unlikely(test_bit(I915_RESET_BACKOFF, &error->flags));
+}
+
+static inline bool i915_reset_handoff(struct i915_gpu_error *error)
 {
-	return unlikely(test_bit(I915_RESET_IN_PROGRESS, &error->flags));
+	return unlikely(test_bit(I915_RESET_HANDOFF, &error->flags));
 }
 
 static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
@@ -3376,9 +3406,9 @@ static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
 	return unlikely(test_bit(I915_WEDGED, &error->flags));
 }
 
-static inline bool i915_reset_in_progress_or_wedged(struct i915_gpu_error *error)
+static inline bool i915_reset_backoff_or_wedged(struct i915_gpu_error *error)
 {
-	return i915_reset_in_progress(error) | i915_terminally_wedged(error);
+	return i915_reset_backoff(error) | i915_terminally_wedged(error);
 }
 
 static inline u32 i915_reset_count(struct i915_gpu_error *error)
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index a8167003c10b..235e823c97d4 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -103,16 +103,13 @@ i915_gem_wait_for_error(struct i915_gpu_error *error)
 
 	might_sleep();
 
-	if (!i915_reset_in_progress(error))
-		return 0;
-
 	/*
 	 * Only wait 10 seconds for the gpu reset to complete to avoid hanging
 	 * userspace. If it takes that long something really bad is going on and
 	 * we should simply try to bail out and fail as gracefully as possible.
 	 */
 	ret = wait_event_interruptible_timeout(error->reset_queue,
-					       !i915_reset_in_progress(error),
+					       !i915_reset_backoff(error),
 					       I915_RESET_TIMEOUT);
 	if (ret == 0) {
 		DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index fbfeb5f8d069..88929a35f54c 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -1025,7 +1025,7 @@ bool __i915_spin_request(const struct drm_i915_gem_request *req,
 
 static bool __i915_wait_request_check_and_reset(struct drm_i915_gem_request *request)
 {
-	if (likely(!i915_reset_in_progress(&request->i915->gpu_error)))
+	if (likely(!i915_reset_handoff(&request->i915->gpu_error)))
 		return false;
 
 	__set_current_state(TASK_RUNNING);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index bc70e2c451b2..2a5a71afd594 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2623,9 +2623,6 @@ static void i915_error_wake_up(struct drm_i915_private *dev_priv)
 	 * a gpu reset pending so that i915_error_work_func can acquire them).
 	 */
 
-	/* Wake up __wait_seqno, potentially holding dev->struct_mutex. */
-	wake_up_all(&dev_priv->gpu_error.wait_queue);
-
 	/* Wake up intel_crtc_wait_for_pending_flips, holding crtc->mutex. */
 	wake_up_all(&dev_priv->pending_flip_queue);
 }
@@ -2659,6 +2656,9 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
 	intel_runtime_pm_get(dev_priv);
 	intel_prepare_reset(dev_priv);
 
+	set_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags);
+	wake_up_all(&dev_priv->gpu_error.wait_queue);
+
 	do {
 		/*
 		 * All state reset _must_ be completed before we update the
@@ -2673,7 +2673,7 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
 
 		/* We need to wait for anyone holding the lock to wakeup */
 	} while (wait_on_bit_timeout(&dev_priv->gpu_error.flags,
-				     I915_RESET_IN_PROGRESS,
+				     I915_RESET_HANDOFF,
 				     TASK_UNINTERRUPTIBLE,
 				     HZ));
 
@@ -2688,6 +2688,7 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
 	 * Note: The wake_up also serves as a memory barrier so that
 	 * waiters see the updated value of the dev_priv->gpu_error.
 	 */
+	clear_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags);
 	wake_up_all(&dev_priv->gpu_error.reset_queue);
 }
 
@@ -2771,7 +2772,7 @@ void i915_handle_error(struct drm_i915_private *dev_priv,
 	if (!engine_mask)
 		return;
 
-	if (test_and_set_bit(I915_RESET_IN_PROGRESS,
+	if (test_and_set_bit(I915_RESET_BACKOFF,
 			     &dev_priv->gpu_error.flags))
 		return;
 
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index d1831809ad3d..e9469697cc01 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -3581,7 +3581,7 @@ static bool abort_flip_on_reset(struct intel_crtc *crtc)
 {
 	struct i915_gpu_error *error = &to_i915(crtc->base.dev)->gpu_error;
 
-	if (i915_reset_in_progress(error))
+	if (i915_reset_backoff(error))
 		return true;
 
 	if (crtc->reset_count != i915_reset_count(error))
@@ -10635,7 +10635,7 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc,
 		goto cleanup;
 
 	intel_crtc->reset_count = i915_reset_count(&dev_priv->gpu_error);
-	if (i915_reset_in_progress_or_wedged(&dev_priv->gpu_error)) {
+	if (i915_reset_backoff_or_wedged(&dev_priv->gpu_error)) {
 		ret = -EIO;
 		goto unlock;
 	}
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index d4acee6730e9..6ec7c731a267 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -301,7 +301,8 @@ static int igt_global_reset(void *arg)
 
 	/* Check that we can issue a global GPU reset */
 
-	set_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags);
+	set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
+	set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
 
 	mutex_lock(&i915->drm.struct_mutex);
 	reset_count = i915_reset_count(&i915->gpu_error);
@@ -314,7 +315,8 @@ static int igt_global_reset(void *arg)
 	}
 	mutex_unlock(&i915->drm.struct_mutex);
 
-	GEM_BUG_ON(test_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags));
+	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
+	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
 	if (i915_terminally_wedged(&i915->gpu_error))
 		err = -EIO;
 
@@ -330,7 +332,7 @@ static u32 fake_hangcheck(struct drm_i915_gem_request *rq)
 
 	reset_count = i915_reset_count(&rq->i915->gpu_error);
 
-	set_bit(I915_RESET_IN_PROGRESS, &rq->i915->gpu_error.flags);
+	set_bit(I915_RESET_HANDOFF, &rq->i915->gpu_error.flags);
 	wake_up_all(&rq->i915->gpu_error.wait_queue);
 
 	return reset_count;
@@ -357,7 +359,7 @@ static int igt_wait_reset(void *arg)
 
 	/* Check that we detect a stuck waiter and issue a reset */
 
-	set_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags);
+	set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
 
 	mutex_lock(&i915->drm.struct_mutex);
 	err = hang_init(&h, i915);
@@ -388,8 +390,8 @@ static int igt_wait_reset(void *arg)
 		err = timeout;
 		goto out_rq;
 	}
-	GEM_BUG_ON(test_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags));
 
+	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
 	if (i915_reset_count(&i915->gpu_error) == reset_count) {
 		pr_err("No GPU reset recorded!\n");
 		err = -EINVAL;
@@ -402,6 +404,7 @@ static int igt_wait_reset(void *arg)
 	hang_fini(&h);
 unlock:
 	mutex_unlock(&i915->drm.struct_mutex);
+	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
 
 	if (i915_terminally_wedged(&i915->gpu_error))
 		return -EIO;
@@ -422,6 +425,7 @@ static int igt_reset_queue(void *arg)
 	if (!igt_can_mi_store_dword_imm(i915))
 		return 0;
 
+	set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
 	mutex_lock(&i915->drm.struct_mutex);
 	err = hang_init(&h, i915);
 	if (err)
@@ -470,8 +474,9 @@ static int igt_reset_queue(void *arg)
 
 			i915_reset(i915);
 
-			GEM_BUG_ON(test_bit(I915_RESET_IN_PROGRESS,
+			GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
 					    &i915->gpu_error.flags));
+
 			if (prev->fence.error != -EIO) {
 				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
 				       prev->fence.error);
@@ -514,6 +519,7 @@ static int igt_reset_queue(void *arg)
 	hang_fini(&h);
 unlock:
 	mutex_unlock(&i915->drm.struct_mutex);
+	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
 
 	if (i915_terminally_wedged(&i915->gpu_error))
 		return -EIO;
-- 
2.11.0