[Intel-gfx] [PATCH v2] drm/i915: Break modeset deadlocks on reset
Chris Wilson
chris at chris-wilson.co.uk
Wed Jun 21 16:06:36 UTC 2017
Quoting Maarten Lankhorst (2017-06-21 16:55:06)
> Op 21-06-17 om 16:24 schreef Chris Wilson:
> > Trying to do a modeset from within a reset is fraught with danger. We
> > can fall into a cyclic deadlock where the modeset is waiting on a
> > previous modeset that is waiting on a request, and since the GPU hung
> > that request completion is waiting on the reset. As modesetting doesn't
> > allow its locks to be broken and restarted, or for its *own* reset
> > mechanism to take over the display, we have to do something very
> > evil instead. If we detect that we are stuck waiting to prepare the
> > display reset (by using a very simple timeout), resort to cancelling all
> > in-flight requests and throwing the user data into /dev/null, which is
> > marginally better than the driver locking up and keeping that data to
> > itself.
> >
> > This is not a fix; this is just a workaround that unbreaks machines
> > until we can resolve the deadlock in a way that doesn't lose data!
> >
> > v2: Move the retirement from set-wedged to the i915_reset() error path,
> > after which we no longer need any delayed worker cleanup for
> > i915_handle_error()
> > v3: C abuse for syntactic sugar
> >
> > References: https://bugs.freedesktop.org/show_bug.cgi?id=99093
> > Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> > Cc: Maarten Lankhorst <maarten.lankhorst at linux.intel.com>
> > Cc: Mika Kuoppala <mika.kuoppala at intel.com>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> > ---
> > drivers/gpu/drm/i915/i915_drv.c | 1 +
> > drivers/gpu/drm/i915/i915_gem.c | 18 ++++-------------
> > drivers/gpu/drm/i915/i915_irq.c | 45 ++++++++++++++++++++++++++++++++++++++++-
> > 3 files changed, 49 insertions(+), 15 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> > index d1aa10c9cc5d..1df957b986c7 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.c
> > +++ b/drivers/gpu/drm/i915/i915_drv.c
> > @@ -1914,6 +1914,7 @@ void i915_reset(struct drm_i915_private *dev_priv)
> >  
> >  error:
> >  	i915_gem_set_wedged(dev_priv);
> > +	i915_gem_retire_requests(dev_priv);
> >  	goto finish;
> >  }
> >
> > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> > index ae3ce1314bd1..36d838677982 100644
> > --- a/drivers/gpu/drm/i915/i915_gem.c
> > +++ b/drivers/gpu/drm/i915/i915_gem.c
> > @@ -3062,7 +3062,8 @@ static void engine_set_wedged(struct intel_engine_cs *engine)
> >  	/* Mark all executing requests as skipped */
> >  	spin_lock_irqsave(&engine->timeline->lock, flags);
> >  	list_for_each_entry(request, &engine->timeline->requests, link)
> > -		dma_fence_set_error(&request->fence, -EIO);
> > +		if (!i915_gem_request_completed(request))
> > +			dma_fence_set_error(&request->fence, -EIO);
> >  	spin_unlock_irqrestore(&engine->timeline->lock, flags);
> >  
> >  	/* Mark all pending requests as complete so that any concurrent
> > @@ -3108,6 +3109,7 @@ static int __i915_gem_set_wedged_BKL(void *data)
> >  	struct intel_engine_cs *engine;
> >  	enum intel_engine_id id;
> >  
> > +	set_bit(I915_WEDGED, &i915->gpu_error.flags);
> >  	for_each_engine(engine, i915, id)
> >  		engine_set_wedged(engine);
> >  
> > @@ -3116,20 +3118,7 @@ static int __i915_gem_set_wedged_BKL(void *data)
> >  
> >  void i915_gem_set_wedged(struct drm_i915_private *dev_priv)
> >  {
> > -	lockdep_assert_held(&dev_priv->drm.struct_mutex);
> > -	set_bit(I915_WEDGED, &dev_priv->gpu_error.flags);
> > -
> > -	/* Retire completed requests first so the list of inflight/incomplete
> > -	 * requests is accurate and we don't try and mark successful requests
> > -	 * as in error during __i915_gem_set_wedged_BKL().
> > -	 */
> > -	i915_gem_retire_requests(dev_priv);
> > -
> >  	stop_machine(__i915_gem_set_wedged_BKL, dev_priv, NULL);
> > -
> > -	i915_gem_contexts_lost(dev_priv);
> > -
> > -	mod_delayed_work(dev_priv->wq, &dev_priv->gt.idle_work, 0);
> >  }
> >  
> >  bool i915_gem_unset_wedged(struct drm_i915_private *i915)
> > @@ -3184,6 +3173,7 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
> >  	 * context and do not require stop_machine().
> >  	 */
> >  	intel_engines_reset_default_submission(i915);
> > +	i915_gem_contexts_lost(i915);
> >  
> >  	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
> >  	clear_bit(I915_WEDGED, &i915->gpu_error.flags);
> > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> > index b1c7d1a04612..c948a5bd031c 100644
> > --- a/drivers/gpu/drm/i915/i915_irq.c
> > +++ b/drivers/gpu/drm/i915/i915_irq.c
> > @@ -2599,6 +2599,46 @@ static irqreturn_t gen8_irq_handler(int irq, void *arg)
> >  	return ret;
> >  }
> >  
> > +struct wedge_me {
> > +	struct delayed_work work;
> > +	struct drm_i915_private *i915;
> > +	const char *name;
> > +};
> > +
> > +static void wedge_me(struct work_struct *work)
> > +{
> > +	struct wedge_me *w = container_of(work, typeof(*w), work.work);
> > +
> > +	dev_err(w->i915->drm.dev,
> > +		"%s timed out, cancelling all in-flight rendering.\n",
> > +		w->name);
> > +	i915_gem_set_wedged(w->i915);
> > +}
> > +
> > +static void __init_wedge(struct wedge_me *w,
> > +			 struct drm_i915_private *i915,
> > +			 long timeout,
> > +			 const char *name)
> > +{
> > +	w->i915 = i915;
> > +	w->name = name;
> > +
> > +	INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me);
> > +	schedule_delayed_work(&w->work, timeout);
> > +}
> > +
> > +static void __fini_wedge(struct wedge_me *w)
> > +{
> > +	cancel_delayed_work_sync(&w->work);
> > +	destroy_delayed_work_on_stack(&w->work);
> > +	w->i915 = NULL;
> > +}
> > +
> > +#define i915_wedge_on_timeout(W, DEV, TIMEOUT, MSG)		\
> > +	for (__init_wedge((W), (DEV), (TIMEOUT), (MSG));	\
> > +	     (W)->i915;						\
> > +	     __fini_wedge((W)))
> > +
> >  /**
> >   * i915_reset_device - do process context error handling work
> >   * @dev_priv: i915 device private
> > @@ -2612,13 +2652,16 @@ static void i915_reset_device(struct drm_i915_private *dev_priv)
> >  	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
> >  	char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
> >  	char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
> > +	struct wedge_me w;
> >  
> >  	kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
> >  
> >  	DRM_DEBUG_DRIVER("resetting chip\n");
> >  	kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
> >  
> > -	intel_prepare_reset(dev_priv);
> > +	i915_wedge_on_timeout(&w, dev_priv, HZ, "intel_prepare_reset") {
> > +		intel_prepare_reset(dev_priv);
> > +	}
> I think v2 was more clear here.
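Fwiw, the v3 sugar is only the run-once for-loop guard: __init_wedge()
arms the delayed work, the body executes exactly once, and __fini_wedge()
cancels the work and clears w->i915 so the loop condition goes false. A
minimal userspace analogue of the idiom (the names scoped_guard,
guard_arm and guard_disarm are illustrative only, not from the patch):

/*
 * Hypothetical userspace analogue of the i915_wedge_on_timeout() idiom.
 * The names (scoped_guard, guard_arm, guard_disarm) are illustrative only.
 */
#include <stdio.h>

struct guard {
	const char *name;
	int armed;
};

static void guard_arm(struct guard *g, const char *name)
{
	g->name = name;
	g->armed = 1;	/* stand-in for schedule_delayed_work() */
}

static void guard_disarm(struct guard *g)
{
	g->armed = 0;	/* stand-in for cancel_delayed_work_sync() */
}

/* Body runs exactly once: guard_disarm() makes the condition false. */
#define scoped_guard(G, NAME) \
	for (guard_arm((G), (NAME)); (G)->armed; guard_disarm((G)))

int main(void)
{
	struct guard g;

	scoped_guard(&g, "intel_prepare_reset") {
		printf("guarded section for %s\n", g.name);
	}

	return 0;
}

In the patch itself the armed work is wedge_me(), which calls
i915_gem_set_wedged() if the guarded section has not completed within
the timeout.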
>
> But if this is just for the modeset locks, why not add a timeout to i915_sw_fence_await_reservation on old_obj?
That wait prevents further GPU death in batches that may still be
pending. The mechanism that we need to use is the reset. However,
resetting the GPU wants to meddle with modesetting, yet modesetting
offers no way to reset the display from within a GPU reset.
> That would allow us to end the deadlocks without having to kill off all fences first.
Instead you simply ignore fences. It is no more a fix to the deadlock
than cancelling the fence. The issue is not the wait on the rendering.
-Chris