[Intel-gfx] [CI 12/21] drm/i915: Replace wait-on-mutex with wait-on-bit in reset worker

Fri Sep 9 11:00:52 UTC 2016

Since we have a cooperative mode now with a direct reset, we can avoid
the contention on struct_mutex and instead try then sleep on the
I915_RESET_IN_PROGRESS bit. If the mutex is held and that bit is
cleared, all is fine. Otherwise, we sleep for a bit and try again. In
the worst case we sleep for an extra second waiting for the mutex to be
released (no one touching the GPU is allowed the struct_mutex whilst the
I915_RESET_IN_PROGRESS bit is set). But when we have a direct reset,
this allows us to clean up the reset worker faster.

v2: Remember to call wake_up_bit() after changing (for the faster wakeup
as promised)

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
Reviewed-by: Mika Kuoppala <mika.kuoppala at intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c | 14 ++++++++------
 drivers/gpu/drm/i915/i915_drv.h |  2 +-
 drivers/gpu/drm/i915/i915_irq.c | 31 ++++++++++++++++++-------------
 3 files changed, 27 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index ff4173e6e298..f2614b2f59f7 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1726,8 +1726,8 @@ int i915_resume_switcheroo(struct drm_device *dev)
  * i915_reset - reset chip after a hang
  * @dev: drm device to reset
  *
- * Reset the chip.  Useful if a hang is detected. Returns zero on successful
- * reset or otherwise an error code.
+ * Reset the chip.  Useful if a hang is detected. Marks the device as wedged
+ * on failure.
  *
  * Caller must hold the struct_mutex.
  *
@@ -1739,7 +1739,7 @@ int i915_resume_switcheroo(struct drm_device *dev)
  *   - re-init interrupt state
  *   - re-init display
  */
-int i915_reset(struct drm_i915_private *dev_priv)
+void i915_reset(struct drm_i915_private *dev_priv)
 {
 	struct drm_device *dev = &dev_priv->drm;
 	struct i915_gpu_error *error = &dev_priv->gpu_error;
@@ -1748,7 +1748,7 @@ int i915_reset(struct drm_i915_private *dev_priv)
 	lockdep_assert_held(&dev->struct_mutex);
 
 	if (!test_and_clear_bit(I915_RESET_IN_PROGRESS, &error->flags))
-		return test_bit(I915_WEDGED, &error->flags) ? -EIO : 0;
+		return;
 
 	/* Clear any previous failed attempts at recovery. Time to try again. */
 	__clear_bit(I915_WEDGED, &error->flags);
@@ -1798,11 +1798,13 @@ int i915_reset(struct drm_i915_private *dev_priv)
 	intel_sanitize_gt_powersave(dev_priv);
 	intel_autoenable_gt_powersave(dev_priv);
 
-	return 0;
+wakeup:
+	wake_up_bit(&error->flags, I915_RESET_IN_PROGRESS);
+	return;
 
 error:
 	set_bit(I915_WEDGED, &error->flags);
-	return ret;
+	goto wakeup;
 }
 
 static int i915_pm_suspend(struct device *kdev)
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 15f1977e356a..9a9f07f3574c 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2884,7 +2884,7 @@ extern long i915_compat_ioctl(struct file *filp, unsigned int cmd,
 #endif
 extern int intel_gpu_reset(struct drm_i915_private *dev_priv, u32 engine_mask);
 extern bool intel_has_gpu_reset(struct drm_i915_private *dev_priv);
-extern int i915_reset(struct drm_i915_private *dev_priv);
+extern void i915_reset(struct drm_i915_private *dev_priv);
 extern int intel_guc_reset(struct drm_i915_private *dev_priv);
 extern void intel_engine_init_hangcheck(struct intel_engine_cs *engine);
 extern unsigned long i915_chipset_val(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 2c7cb5041511..ef2d40278191 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2497,7 +2497,6 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
 	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
 	char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
 	char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
-	int ret;
 
 	kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
 
@@ -2512,24 +2511,30 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
 	 * simulated reset via debugs, so get an RPM reference.
 	 */
 	intel_runtime_pm_get(dev_priv);
-
 	intel_prepare_reset(dev_priv);
 
-	/*
-	 * All state reset _must_ be completed before we update the
-	 * reset counter, for otherwise waiters might miss the reset
-	 * pending state and not properly drop locks, resulting in
-	 * deadlocks with the reset work.
-	 */
-	mutex_lock(&dev_priv->drm.struct_mutex);
-	ret = i915_reset(dev_priv);
-	mutex_unlock(&dev_priv->drm.struct_mutex);
+	do {
+		/*
+		 * All state reset _must_ be completed before we update the
+		 * reset counter, for otherwise waiters might miss the reset
+		 * pending state and not properly drop locks, resulting in
+		 * deadlocks with the reset work.
+		 */
+		if (mutex_trylock(&dev_priv->drm.struct_mutex)) {
+			i915_reset(dev_priv);
+			mutex_unlock(&dev_priv->drm.struct_mutex);
+		}
 
-	intel_finish_reset(dev_priv);
+		/* We need to wait for anyone holding the lock to wakeup */
+	} while (wait_on_bit_timeout(&dev_priv->gpu_error.flags,
+				     I915_RESET_IN_PROGRESS,
+				     TASK_UNINTERRUPTIBLE,
+				     HZ));
 
+	intel_finish_reset(dev_priv);
 	intel_runtime_pm_put(dev_priv);
 
-	if (ret == 0)
+	if (!test_bit(I915_WEDGED, &dev_priv->gpu_error.flags))
 		kobject_uevent_env(kobj,
 				   KOBJ_CHANGE, reset_done_event);
 
-- 
2.9.3