[Intel-gfx] [PATCH v2] drm/i915: Break modeset deadlocks on reset
Maarten Lankhorst
maarten.lankhorst at linux.intel.com
Wed Jun 21 17:27:25 UTC 2017
On 21-06-17 at 18:06, Chris Wilson wrote:
> Quoting Maarten Lankhorst (2017-06-21 16:55:06)
>> On 21-06-17 at 16:24, Chris Wilson wrote:
>>> Trying to do a modeset from within a reset is fraught with danger. We
>>> can fall into a cyclic deadlock where the modeset is waiting on a
>>> previous modeset that is waiting on a request, and since the GPU hung
>>> that request completion is waiting on the reset. As modesetting doesn't
>>> allow its locks to be broken and restarted, or for its *own* reset
>>> mechanism to take over the display, we have to do something very
>>> evil instead. If we detect that we are stuck waiting to prepare the
>>> display reset (by using a very simple timeout), resort to cancelling all
>>> in-flight requests and throwing the user data into /dev/null, which is
>>> marginally better than the driver locking up and keeping that data to
>>> itself.
>>>
>>> This is not a fix; this is just a workaround that unbreaks machines
>>> until we can resolve the deadlock in a way that doesn't lose data!
>>>
>>> v2: Move the retirement from set-wedged to the i915_reset() error path,
>>> after which we no longer need any delayed worker cleanup for
>>> i915_handle_error()
>>> v3: C abuse for syntactic sugar
>>>
>>> References: https://bugs.freedesktop.org/show_bug.cgi?id=99093
>>> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
>>> Cc: Maarten Lankhorst <maarten.lankhorst at linux.intel.com>
>>> Cc: Mika Kuoppala <mika.kuoppala at intel.com>
>>> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>>> ---
>>> drivers/gpu/drm/i915/i915_drv.c | 1 +
>>> drivers/gpu/drm/i915/i915_gem.c | 18 ++++-------------
>>> drivers/gpu/drm/i915/i915_irq.c | 45 ++++++++++++++++++++++++++++++++++++++++-
>>> 3 files changed, 49 insertions(+), 15 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
>>> index d1aa10c9cc5d..1df957b986c7 100644
>>> --- a/drivers/gpu/drm/i915/i915_drv.c
>>> +++ b/drivers/gpu/drm/i915/i915_drv.c
>>> @@ -1914,6 +1914,7 @@ void i915_reset(struct drm_i915_private *dev_priv)
>>>
>>> error:
>>> i915_gem_set_wedged(dev_priv);
>>> + i915_gem_retire_requests(dev_priv);
>>> goto finish;
>>> }
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
>>> index ae3ce1314bd1..36d838677982 100644
>>> --- a/drivers/gpu/drm/i915/i915_gem.c
>>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>>> @@ -3062,7 +3062,8 @@ static void engine_set_wedged(struct intel_engine_cs *engine)
>>> /* Mark all executing requests as skipped */
>>> spin_lock_irqsave(&engine->timeline->lock, flags);
>>> list_for_each_entry(request, &engine->timeline->requests, link)
>>> - dma_fence_set_error(&request->fence, -EIO);
>>> + if (!i915_gem_request_completed(request))
>>> + dma_fence_set_error(&request->fence, -EIO);
>>> spin_unlock_irqrestore(&engine->timeline->lock, flags);
>>>
>>> /* Mark all pending requests as complete so that any concurrent
>>> @@ -3108,6 +3109,7 @@ static int __i915_gem_set_wedged_BKL(void *data)
>>> struct intel_engine_cs *engine;
>>> enum intel_engine_id id;
>>>
>>> + set_bit(I915_WEDGED, &i915->gpu_error.flags);
>>> for_each_engine(engine, i915, id)
>>> engine_set_wedged(engine);
>>>
>>> @@ -3116,20 +3118,7 @@ static int __i915_gem_set_wedged_BKL(void *data)
>>>
>>> void i915_gem_set_wedged(struct drm_i915_private *dev_priv)
>>> {
>>> - lockdep_assert_held(&dev_priv->drm.struct_mutex);
>>> - set_bit(I915_WEDGED, &dev_priv->gpu_error.flags);
>>> -
>>> - /* Retire completed requests first so the list of inflight/incomplete
>>> - * requests is accurate and we don't try and mark successful requests
>>> - * as in error during __i915_gem_set_wedged_BKL().
>>> - */
>>> - i915_gem_retire_requests(dev_priv);
>>> -
>>> stop_machine(__i915_gem_set_wedged_BKL, dev_priv, NULL);
>>> -
>>> - i915_gem_contexts_lost(dev_priv);
>>> -
>>> - mod_delayed_work(dev_priv->wq, &dev_priv->gt.idle_work, 0);
>>> }
>>>
>>> bool i915_gem_unset_wedged(struct drm_i915_private *i915)
>>> @@ -3184,6 +3173,7 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
>>> * context and do not require stop_machine().
>>> */
>>> intel_engines_reset_default_submission(i915);
>>> + i915_gem_contexts_lost(i915);
>>>
>>> smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
>>> clear_bit(I915_WEDGED, &i915->gpu_error.flags);
>>> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
>>> index b1c7d1a04612..c948a5bd031c 100644
>>> --- a/drivers/gpu/drm/i915/i915_irq.c
>>> +++ b/drivers/gpu/drm/i915/i915_irq.c
>>> @@ -2599,6 +2599,46 @@ static irqreturn_t gen8_irq_handler(int irq, void *arg)
>>> return ret;
>>> }
>>>
>>> +struct wedge_me {
>>> + struct delayed_work work;
>>> + struct drm_i915_private *i915;
>>> + const char *name;
>>> +};
>>> +
>>> +static void wedge_me(struct work_struct *work)
>>> +{
>>> + struct wedge_me *w = container_of(work, typeof(*w), work.work);
>>> +
>>> + dev_err(w->i915->drm.dev,
>>> + "%s timed out, cancelling all in-flight rendering.\n",
>>> + w->name);
>>> + i915_gem_set_wedged(w->i915);
>>> +}
>>> +
>>> +static void __init_wedge(struct wedge_me *w,
>>> + struct drm_i915_private *i915,
>>> + long timeout,
>>> + const char *name)
>>> +{
>>> + w->i915 = i915;
>>> + w->name = name;
>>> +
>>> + INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me);
>>> + schedule_delayed_work(&w->work, timeout);
>>> +}
>>> +
>>> +static void __fini_wedge(struct wedge_me *w)
>>> +{
>>> + cancel_delayed_work_sync(&w->work);
>>> + destroy_delayed_work_on_stack(&w->work);
>>> + w->i915 = NULL;
>>> +}
>>> +
>>> +#define i915_wedge_on_timeout(W, DEV, TIMEOUT, MSG) \
>>> + for (__init_wedge((W), (DEV), (TIMEOUT), (MSG)); \
>>> + (W)->i915; \
>>> + __fini_wedge((W)))
>>> +
>>> /**
>>> * i915_reset_device - do process context error handling work
>>> * @dev_priv: i915 device private
>>> @@ -2612,13 +2652,16 @@ static void i915_reset_device(struct drm_i915_private *dev_priv)
>>> char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
>>> char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
>>> char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
>>> + struct wedge_me w;
>>>
>>> kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
>>>
>>> DRM_DEBUG_DRIVER("resetting chip\n");
>>> kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
>>>
>>> - intel_prepare_reset(dev_priv);
>>> + i915_wedge_on_timeout(&w, dev_priv, HZ, "intel_prepare_reset") {
>>> + intel_prepare_reset(dev_priv);
>>> + }
>> I think v2 was clearer here.
>>
>> But if this is just for the modeset locks, why not add a timeout to i915_sw_fence_await_reservation on old_obj?
> That wait prevents further GPU death in batches that may still be
> pending. The mechanism that we need to use is the reset. However,
> resetting the GPU wants to meddle with modesetting, yet offers no way to
> reset the display from within reset.
True. However, it doesn't prevent GPU death when userspace disables the plane separately from a modeset.
>> That would allow us to end the deadlocks without having to kill off all fences first.
> Instead you simply ignore fences. It is no more a fix to the deadlock
> than cancelling the fence. The issue is not the wait on the rendering.
> -Chris
Could we do both?
~Maarten