[Intel-gfx] [PATCH] drm/i915: Fix up -EIO ABI per igt/gem_eio
Daniel Vetter
daniel at ffwll.ch
Thu Nov 26 03:33:24 PST 2015
On Thu, Nov 26, 2015 at 11:51:09AM +0100, Daniel Vetter wrote:
> My apologies to Chris Wilson for being dense on this topic for
> essentially for years.
>
> This patch doesn't do any big redesign of the overall reset flow, but
> instead just applies changes where it's needed to obey gem_eio. We can
> redesign later on, but for now I prefer to make the testcase happy
> with some minimally invasive changes:
>
> - Ensure that set_domain_ioctl never returns -EIO. Tricky part there
> is that we might race with the reset handler when doing the lockless
> wait.
>
> - Make sure debugfs/i915_wedged is actually useful to reset a wedged
> gpu - atm it bails out with -EAGAIN for a terminally wedged gpu.
> That's because reset_in_progress actually includes that terminal
> state, which is super-confusing (and most callers got this wrong).
> Fix this by changing the semantics of reset_in_progress to do what
> it says on the tin (and fixup all other callers).
>
> Also make sure that reset_in_progress is cleared when we reach the
> terminal "wedged" state. Without this we kept returning -EAGAIN in
> some places forever.
>
> - Second problem with debugfs/i915_wedge was that we never got out of
> the terminal wedged state. Hence manually set the reset counter out
> of that state before starting the hung gpu recovery.
>
> For safety also make sure that we are consintent with resetting the
> gpu between i915_reset_and_wakeup and i915_handler_error by just
> passing the boolean "wedged" around instead of trying to recompute
> it wrongly. This isn't an issue for gem_eio with the debugfs fix,
> but just general robustness of the code.
>
> - Finally make sure that when gpu reset is disabled through the module
> paramter the kernel doesn't generate dmesg noise that would upset
> our CI/piglit. Simplest solution for that was to lift the i915.reset
> check up into i915_reset().
>
> With all these changes, plus the igt fixes I've posted, gem_eio is now
> happy on many snb.
>
> v2: Don't upset lockdep with my set_domain_ioctl changes.
Blergh, forgotten to update the commit message:
v3: Don't special case set_domain_ioctl, instead curb all -EIO at the
source in wait_request, like in Chris' patch. That means anyone waiting
for a request will not notice the -EIO and fall over due to that. We
definitely want that for modeset code.
> Cc: Chris Wilson <chris at chris-wilson.co.uk>
> Testcase: igt/gem_eio/*
> Signed-off-by: Daniel Vetter <daniel.vetter at intel.com>
> ---
> drivers/gpu/drm/i915/i915_debugfs.c | 4 ++++
> drivers/gpu/drm/i915/i915_drv.c | 6 ++++++
> drivers/gpu/drm/i915/i915_drv.h | 2 +-
> drivers/gpu/drm/i915/i915_gem.c | 23 +++++++++++------------
> drivers/gpu/drm/i915/i915_irq.c | 11 ++++++-----
> drivers/gpu/drm/i915/intel_uncore.c | 3 ---
> 6 files changed, 28 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 411a9c68b4ee..c4006c09ef1b 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -4744,6 +4744,10 @@ i915_wedged_set(void *data, u64 val)
> if (i915_reset_in_progress(&dev_priv->gpu_error))
> return -EAGAIN;
>
> + /* Already wedged, force us out of this terminal state. */
> + if (i915_terminally_wedged(&dev_priv->gpu_error))
> + atomic_set(&dev_priv->gpu_error.reset_counter, 0);
> +
> intel_runtime_pm_get(dev_priv);
>
> i915_handle_error(dev, val,
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index ec1e877c4a36..1f5386bb78f4 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -909,6 +909,12 @@ int i915_reset(struct drm_device *dev)
>
> simulated = dev_priv->gpu_error.stop_rings != 0;
>
> + if (!i915.reset) {
> + DRM_INFO("GPU reset disabled in module option, not resetting\n");
> + mutex_unlock(&dev->struct_mutex);
> + return -ENODEV;
> + }
> +
> ret = intel_gpu_reset(dev);
>
> /* Also reset the gpu hangman. */
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index a47e0f4fab56..8876b4891d56 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2939,7 +2939,7 @@ int __must_check i915_gem_check_wedge(struct i915_gpu_error *error,
> static inline bool i915_reset_in_progress(struct i915_gpu_error *error)
> {
> return unlikely(atomic_read(&error->reset_counter)
> - & (I915_RESET_IN_PROGRESS_FLAG | I915_WEDGED));
> + & I915_RESET_IN_PROGRESS_FLAG);
> }
>
> static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index f10a5d57377e..64c63409b9d0 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -85,8 +85,7 @@ i915_gem_wait_for_error(struct i915_gpu_error *error)
> {
> int ret;
>
> -#define EXIT_COND (!i915_reset_in_progress(error) || \
> - i915_terminally_wedged(error))
> +#define EXIT_COND (!i915_reset_in_progress(error))
> if (EXIT_COND)
> return 0;
>
> @@ -1113,16 +1112,16 @@ int
> i915_gem_check_wedge(struct i915_gpu_error *error,
> bool interruptible)
> {
> + /* Recovery complete, but the reset failed ... */
> + if (i915_terminally_wedged(error))
> + return -EIO;
> +
> if (i915_reset_in_progress(error)) {
> /* Non-interruptible callers can't handle -EAGAIN, hence return
> * -EIO unconditionally for these. */
> if (!interruptible)
> return -EIO;
>
> - /* Recovery complete, but the reset failed ... */
> - if (i915_terminally_wedged(error))
> - return -EIO;
> -
> /*
> * Check if GPU Reset is in progress - we need intel_ring_begin
> * to work properly to reinit the hw state while the gpu is
> @@ -1239,11 +1238,7 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
> /* We need to check whether any gpu reset happened in between
> * the caller grabbing the seqno and now ... */
> if (reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) {
> - /* ... but upgrade the -EAGAIN to an -EIO if the gpu
> - * is truely gone. */
> - ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible);
> - if (ret == 0)
> - ret = -EAGAIN;
> + ret = -EAGAIN;
> break;
> }
>
> @@ -1577,7 +1572,7 @@ i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
>
> ret = i915_mutex_lock_interruptible(dev);
> if (ret)
> - return ret;
> + goto out;
>
> obj = to_intel_bo(drm_gem_object_lookup(dev, file, args->handle));
> if (&obj->base == NULL) {
> @@ -1609,6 +1604,10 @@ unref:
> drm_gem_object_unreference(&obj->base);
> unlock:
> mutex_unlock(&dev->struct_mutex);
> +out:
> + /* ABI: set_domain_ioctl must not fail even when the gpu is wedged. */
> + WARN_ON(ret == -EIO);
> +
> return ret;
> }
>
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index c8ba94968aaf..dbbc6159584b 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2401,7 +2401,8 @@ static void i915_error_wake_up(struct drm_i915_private *dev_priv,
> * Fire an error uevent so userspace can see that a hang or error
> * was detected.
> */
> -static void i915_reset_and_wakeup(struct drm_device *dev)
> +static void i915_reset_and_wakeup(struct drm_device *dev,
> + bool wedged)
> {
> struct drm_i915_private *dev_priv = to_i915(dev);
> struct i915_gpu_error *error = &dev_priv->gpu_error;
> @@ -2422,7 +2423,7 @@ static void i915_reset_and_wakeup(struct drm_device *dev)
> * the reset in-progress bit is only ever set by code outside of this
> * work we don't need to worry about any other races.
> */
> - if (i915_reset_in_progress(error) && !i915_terminally_wedged(error)) {
> + if (wedged) {
> DRM_DEBUG_DRIVER("resetting chip\n");
> kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE,
> reset_event);
> @@ -2432,7 +2433,7 @@ static void i915_reset_and_wakeup(struct drm_device *dev)
> * reference held, for example because there is a pending GPU
> * request that won't finish until the reset is done. This
> * isn't the case at least when we get here by doing a
> - * simulated reset via debugs, so get an RPM reference.
> + * simulated reset via debugfs, so get an RPM reference.
> */
> intel_runtime_pm_get(dev_priv);
>
> @@ -2467,7 +2468,7 @@ static void i915_reset_and_wakeup(struct drm_device *dev)
> kobject_uevent_env(&dev->primary->kdev->kobj,
> KOBJ_CHANGE, reset_done_event);
> } else {
> - atomic_or(I915_WEDGED, &error->reset_counter);
> + atomic_set(&error->reset_counter, I915_WEDGED);
> }
>
> /*
> @@ -2614,7 +2615,7 @@ void i915_handle_error(struct drm_device *dev, bool wedged,
> i915_error_wake_up(dev_priv, false);
> }
>
> - i915_reset_and_wakeup(dev);
> + i915_reset_and_wakeup(dev, wedged);
> }
>
> /* Called from drm generic code, passed 'crtc' which
> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
> index c2358ba78b30..4c5ae1154dd1 100644
> --- a/drivers/gpu/drm/i915/intel_uncore.c
> +++ b/drivers/gpu/drm/i915/intel_uncore.c
> @@ -1543,9 +1543,6 @@ not_ready:
>
> static int (*intel_get_gpu_reset(struct drm_device *dev))(struct drm_device *)
> {
> - if (!i915.reset)
> - return NULL;
> -
> if (INTEL_INFO(dev)->gen >= 8)
> return gen8_do_reset;
> else if (INTEL_INFO(dev)->gen >= 6)
> --
> 2.1.0
>
--
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
More information about the Intel-gfx
mailing list