[Intel-gfx] [PATCH] drm/i915: Fix up -EIO ABI per igt/gem_eio

Wed Nov 25 09:45:26 PST 2015

My apologies to Chris Wilson for being dense on this topic for
essentially for years.

This patch doesn't do any big redesign of the overall reset flow, but
instead just applies changes where it's needed to obey gem_eio. We can
redesign later on, but for now I prefer to make the testcase happy
with some minimally invasive changes:

- Ensure that set_domain_ioctl never returns -EIO. Tricky part there
  is that we might race with the reset handler when doing the lockless
  wait.

- Make sure debugfs/i915_wedged is actually useful to reset a wedged
  gpu - atm it bails out with -EAGAIN for a terminally wedged gpu.
  That's because reset_in_progress actually includes that terminal
  state, which is super-confusing (and most callers got this wrong).
  Fix this by changing the semantics of reset_in_progress to do what
  it says on the tin (and fixup all other callers).

  Also make sure that reset_in_progress is cleared when we reach the
  terminal "wedged" state. Without this we kept returning -EAGAIN in
  some places forever.

- Second problem with debugfs/i915_wedge was that we never got out of
  the terminal wedged state. Hence manually set the reset counter out
  of that state before starting the hung gpu recovery.

  For safety also make sure that we are consintent with resetting the
  gpu between i915_reset_and_wakeup and i915_handler_error by just
  passing the boolean "wedged" around instead of trying to recompute
  it wrongly. This isn't an issue for gem_eio with the debugfs fix,
  but just general robustness of the code.

- Finally make sure that when gpu reset is disabled through the module
  paramter the kernel doesn't generate dmesg noise that would upset
  our CI/piglit. Simplest solution for that was to lift the i915.reset
  check up into i915_reset().

With all these changes, plus the igt fixes I've posted, gem_eio is now
happy on many snb.

v2: Don't upset lockdep with my set_domain_ioctl changes.

Cc: Chris Wilson <chris at chris-wilson.co.uk>
Testcase: igt/gem_eio/*
Signed-off-by: Daniel Vetter <daniel.vetter at intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c |  4 ++++
 drivers/gpu/drm/i915/i915_drv.c     |  6 ++++++
 drivers/gpu/drm/i915/i915_drv.h     |  2 +-
 drivers/gpu/drm/i915/i915_gem.c     | 23 +++++++++++++++--------
 drivers/gpu/drm/i915/i915_irq.c     | 11 ++++++-----
 drivers/gpu/drm/i915/intel_uncore.c |  3 ---
 6 files changed, 32 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 411a9c68b4ee..c4006c09ef1b 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -4744,6 +4744,10 @@ i915_wedged_set(void *data, u64 val)
 	if (i915_reset_in_progress(&dev_priv->gpu_error))
 		return -EAGAIN;
 
+	/* Already wedged, force us out of this terminal state. */
+	if (i915_terminally_wedged(&dev_priv->gpu_error))
+		atomic_set(&dev_priv->gpu_error.reset_counter, 0);
+
 	intel_runtime_pm_get(dev_priv);
 
 	i915_handle_error(dev, val,
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index ec1e877c4a36..1f5386bb78f4 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -909,6 +909,12 @@ int i915_reset(struct drm_device *dev)
 
 	simulated = dev_priv->gpu_error.stop_rings != 0;
 
+	if (!i915.reset) {
+		DRM_INFO("GPU reset disabled in module option, not resetting\n");
+		mutex_unlock(&dev->struct_mutex);
+		return -ENODEV;
+	}
+
 	ret = intel_gpu_reset(dev);
 
 	/* Also reset the gpu hangman. */
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index a47e0f4fab56..8876b4891d56 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2939,7 +2939,7 @@ int __must_check i915_gem_check_wedge(struct i915_gpu_error *error,
 static inline bool i915_reset_in_progress(struct i915_gpu_error *error)
 {
 	return unlikely(atomic_read(&error->reset_counter)
-			& (I915_RESET_IN_PROGRESS_FLAG | I915_WEDGED));
+			& I915_RESET_IN_PROGRESS_FLAG);
 }
 
 static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index f10a5d57377e..f794e8f37f92 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -85,8 +85,7 @@ i915_gem_wait_for_error(struct i915_gpu_error *error)
 {
 	int ret;
 
-#define EXIT_COND (!i915_reset_in_progress(error) || \
-		   i915_terminally_wedged(error))
+#define EXIT_COND (!i915_reset_in_progress(error))
 	if (EXIT_COND)
 		return 0;
 
@@ -1113,16 +1112,16 @@ int
 i915_gem_check_wedge(struct i915_gpu_error *error,
 		     bool interruptible)
 {
+	/* Recovery complete, but the reset failed ... */
+	if (i915_terminally_wedged(error))
+		return -EIO;
+
 	if (i915_reset_in_progress(error)) {
 		/* Non-interruptible callers can't handle -EAGAIN, hence return
 		 * -EIO unconditionally for these. */
 		if (!interruptible)
 			return -EIO;
 
-		/* Recovery complete, but the reset failed ... */
-		if (i915_terminally_wedged(error))
-			return -EIO;
-
 		/*
 		 * Check if GPU Reset is in progress - we need intel_ring_begin
 		 * to work properly to reinit the hw state while the gpu is
@@ -1577,7 +1576,7 @@ i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
 
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret)
-		return ret;
+		goto out;
 
 	obj = to_intel_bo(drm_gem_object_lookup(dev, file, args->handle));
 	if (&obj->base == NULL) {
@@ -1592,7 +1591,8 @@ i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
 	ret = i915_gem_object_wait_rendering__nonblocking(obj,
 							  to_rps_client(file),
 							  !write_domain);
-	if (ret)
+	/* ABI: We must do cache management even when the gpu is dead. */
+	if (ret && ret != -EIO)
 		goto unref;
 
 	if (read_domains & I915_GEM_DOMAIN_GTT)
@@ -1605,10 +1605,17 @@ i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
 					write_domain == I915_GEM_DOMAIN_GTT ?
 					ORIGIN_GTT : ORIGIN_CPU);
 
+	/* ABI: set_domain_ioctl must not fail even when the gpu is wedged. */
+	if (ret == -EIO)
+		ret = 0;
+
 unref:
 	drm_gem_object_unreference(&obj->base);
 unlock:
 	mutex_unlock(&dev->struct_mutex);
+out:
+	WARN_ON(ret == -EIO);
+
 	return ret;
 }
 
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index c8ba94968aaf..dbbc6159584b 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2401,7 +2401,8 @@ static void i915_error_wake_up(struct drm_i915_private *dev_priv,
  * Fire an error uevent so userspace can see that a hang or error
  * was detected.
  */
-static void i915_reset_and_wakeup(struct drm_device *dev)
+static void i915_reset_and_wakeup(struct drm_device *dev,
+				  bool wedged)
 {
 	struct drm_i915_private *dev_priv = to_i915(dev);
 	struct i915_gpu_error *error = &dev_priv->gpu_error;
@@ -2422,7 +2423,7 @@ static void i915_reset_and_wakeup(struct drm_device *dev)
 	 * the reset in-progress bit is only ever set by code outside of this
 	 * work we don't need to worry about any other races.
 	 */
-	if (i915_reset_in_progress(error) && !i915_terminally_wedged(error)) {
+	if (wedged) {
 		DRM_DEBUG_DRIVER("resetting chip\n");
 		kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE,
 				   reset_event);
@@ -2432,7 +2433,7 @@ static void i915_reset_and_wakeup(struct drm_device *dev)
 		 * reference held, for example because there is a pending GPU
 		 * request that won't finish until the reset is done. This
 		 * isn't the case at least when we get here by doing a
-		 * simulated reset via debugs, so get an RPM reference.
+		 * simulated reset via debugfs, so get an RPM reference.
 		 */
 		intel_runtime_pm_get(dev_priv);
 
@@ -2467,7 +2468,7 @@ static void i915_reset_and_wakeup(struct drm_device *dev)
 			kobject_uevent_env(&dev->primary->kdev->kobj,
 					   KOBJ_CHANGE, reset_done_event);
 		} else {
-			atomic_or(I915_WEDGED, &error->reset_counter);
+			atomic_set(&error->reset_counter, I915_WEDGED);
 		}
 
 		/*
@@ -2614,7 +2615,7 @@ void i915_handle_error(struct drm_device *dev, bool wedged,
 		i915_error_wake_up(dev_priv, false);
 	}
 
-	i915_reset_and_wakeup(dev);
+	i915_reset_and_wakeup(dev, wedged);
 }
 
 /* Called from drm generic code, passed 'crtc' which
diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index c2358ba78b30..4c5ae1154dd1 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -1543,9 +1543,6 @@ not_ready:
 
 static int (*intel_get_gpu_reset(struct drm_device *dev))(struct drm_device *)
 {
-	if (!i915.reset)
-		return NULL;
-
 	if (INTEL_INFO(dev)->gen >= 8)
 		return gen8_do_reset;
 	else if (INTEL_INFO(dev)->gen >= 6)
-- 
2.1.0