[Intel-gfx] [PATCH] drm/i915: Convert hangcheck from a timer into a delayed work item

Thu Sep 4 15:31:14 CEST 2014

On Thu, Sep 04, 2014 at 01:26:36PM +0100, Chris Wilson wrote:
> On Thu, Sep 04, 2014 at 03:17:57PM +0300, Jani Nikula wrote:
> > On Thu, 04 Sep 2014, Chris Wilson <chris at chris-wilson.co.uk> wrote:
> > > When run as a timer, i915_hangcheck_elapsed() must adhere to all the
> > > rules of running in a softirq context. This is advantageous to us as we
> > > want to minimise the risk that a driver bug will prevent us from
> > > detecting a hung GPU. However, that is irrelevant if the driver bug
> > > prevents us from resetting and recovering. Still it is prudent not to
> > > rely on mutexes inside the checker, but given the coarseness of
> > > dev->struct_mutex doing so is extremely hard.
> > >
> > > Give in and run from a work queue, i.e. outside of softirq.
> > >
> > > Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> > > ---
> > >  drivers/gpu/drm/i915/i915_dma.c |  2 +-
> > >  drivers/gpu/drm/i915/i915_drv.c |  2 +-
> > >  drivers/gpu/drm/i915/i915_drv.h |  2 +-
> > >  drivers/gpu/drm/i915/i915_gem.c |  2 +-
> > >  drivers/gpu/drm/i915/i915_irq.c | 15 ++++++++-------
> > >  5 files changed, 12 insertions(+), 11 deletions(-)
> > >
> > > diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> > > index 6502ae2d7b7d..6a8e71cb2be8 100644
> > > --- a/drivers/gpu/drm/i915/i915_dma.c
> > > +++ b/drivers/gpu/drm/i915/i915_dma.c
> > > @@ -1002,7 +1002,7 @@ int i915_driver_unload(struct drm_device *dev)
> > >  	}
> > >  
> > >  	/* Free error state after interrupts are fully disabled. */
> > > -	del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
> > > +	cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
> > >  	cancel_work_sync(&dev_priv->gpu_error.work);
> > >  	i915_destroy_error_state(dev);
> > >  
> > > diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> > > index f7cc6a9c14fd..ea9224a977c1 100644
> > > --- a/drivers/gpu/drm/i915/i915_drv.c
> > > +++ b/drivers/gpu/drm/i915/i915_drv.c
> > > @@ -1431,7 +1431,7 @@ static int intel_runtime_suspend(struct device *device)
> > >  		return ret;
> > >  	}
> > >  
> > > -	del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
> > > +	cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
> > >  	intel_uncore_forcewake_reset(dev, false);
> > >  	dev_priv->pm.suspended = true;
> > >  
> > > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > > index a920ca3789d8..e1f8ffcb2cf3 100644
> > > --- a/drivers/gpu/drm/i915/i915_drv.h
> > > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > > @@ -1209,7 +1209,7 @@ struct i915_gpu_error {
> > >  	/* Hang gpu twice in this window and your context gets banned */
> > >  #define DRM_I915_CTX_BAN_PERIOD DIV_ROUND_UP(8*DRM_I915_HANGCHECK_PERIOD, 1000)
> > >  
> > > -	struct timer_list hangcheck_timer;
> > > +	struct delayed_work hangcheck_work;
> > >  
> > >  	/* For reset and error_state handling. */
> > >  	spinlock_t lock;
> > > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> > > index b37177afc3c0..3e80c777bf12 100644
> > > --- a/drivers/gpu/drm/i915/i915_gem.c
> > > +++ b/drivers/gpu/drm/i915/i915_gem.c
> > > @@ -4530,7 +4530,7 @@ i915_gem_suspend(struct drm_device *dev)
> > >  							     DRIVER_MODESET);
> > >  	mutex_unlock(&dev->struct_mutex);
> > >  
> > > -	del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
> > > +	cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
> > >  	cancel_delayed_work_sync(&dev_priv->mm.retire_work);
> > >  	flush_delayed_work(&dev_priv->mm.idle_work);
> > >  
> > > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> > > index 2ade9efe078c..5fe9cdb323b3 100644
> > > --- a/drivers/gpu/drm/i915/i915_irq.c
> > > +++ b/drivers/gpu/drm/i915/i915_irq.c
> > > @@ -3189,9 +3189,11 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
> > >   * we kick the ring. If we see no progress on three subsequent calls
> > >   * we assume chip is wedged and try to fix it by resetting the chip.
> > >   */
> > > -static void i915_hangcheck_elapsed(unsigned long data)
> > > +static void i915_hangcheck_elapsed(struct work_struct *work)
> > >  {
> > > -	struct drm_i915_private *dev_priv = (struct drm_i915_private *)data;
> > > +	struct drm_i915_private *dev_priv =
> > > +		container_of(work, typeof(*dev_priv),
> > > +			     gpu_error.hangcheck_work.work);
> > >  	struct intel_engine_cs *engine;
> > >  	int i;
> > >  	int busy_count = 0, rings_hung = 0;
> > > @@ -3312,8 +3314,8 @@ void i915_queue_hangcheck(struct drm_device *dev)
> > >  	if (!i915_module.enable_hangcheck)
> > >  		return;
> > >  
> > > -	mod_timer(&to_i915(dev)->gpu_error.hangcheck_timer,
> > > -		  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
> > > +	schedule_delayed_work(&to_i915(dev)->gpu_error.hangcheck_work,
> > > +			      round_jiffies_up(DRM_I915_HANGCHECK_JIFFIES));
> > 
> > schedule_delayed_work does not push the work timer forward if the work
> > is already queued, and it also expects a delay, not absolute jiffies.
> 
> Indeed. We don't need to push the work forward, but we do need the
> ability to requeue work. I do pass in a relative offset, but I guess we
> want to replace the round_jiffies into its relative variant.

Didn't 3.17 just gain the mod_delayed_work interface?
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch