[Intel-gfx] [PATCH 01/62] drm/i915: Only start retire worker when idle

Wed Jun 8 12:07:45 UTC 2016

On ke, 2016-06-08 at 12:06 +0100, Chris Wilson wrote:
> On Wed, Jun 08, 2016 at 11:53:15AM +0100, Chris Wilson wrote:
> > 
> > On Tue, Jun 07, 2016 at 02:31:07PM +0300, Joonas Lahtinen wrote:
> > > 
> > > On pe, 2016-06-03 at 17:36 +0100, Chris Wilson wrote:
> > > > 
> > > >  i915_gem_idle_work_handler(struct work_struct *work)
> > > >  {
> > > >  	struct drm_i915_private *dev_priv =
> > > > -		container_of(work, typeof(*dev_priv), mm.idle_work.work);
> > > > +		container_of(work, typeof(*dev_priv), gt.idle_work.work);
> > > >  	struct drm_device *dev = dev_priv->dev;
> > > >  	struct intel_engine_cs *engine;
> > > >  
> > > > -	for_each_engine(engine, dev_priv)
> > > > -		if (!list_empty(&engine->request_list))
> > > > -			return;
> > > > +	if (!READ_ONCE(dev_priv->gt.awake))
> > > > +		return;
> > > >  
> > > > -	/* we probably should sync with hangcheck here, using cancel_work_sync.
> > > > -	 * Also locking seems to be fubar here, engine->request_list is protected
> > > > -	 * by dev->struct_mutex. */
> > > > +	mutex_lock(&dev->struct_mutex);
> > > > +	if (dev_priv->gt.active_engines)
> > > > +		goto out;
> > > >  
> > > > -	intel_mark_idle(dev_priv);
> > > > +	for_each_engine(engine, dev_priv)
> > > > +		i915_gem_batch_pool_fini(&engine->batch_pool);
> > > >  
> > > > -	if (mutex_trylock(&dev->struct_mutex)) {
> > > > -		for_each_engine(engine, dev_priv)
> > > > -			i915_gem_batch_pool_fini(&engine->batch_pool);
> > > > +	GEM_BUG_ON(!dev_priv->gt.awake);
> > > > +	dev_priv->gt.awake = false;
> > > >  
> > > > -		mutex_unlock(&dev->struct_mutex);
> > > > +	if (INTEL_INFO(dev_priv)->gen >= 6)
> > > > +		gen6_rps_idle(dev_priv);
> > > > +	intel_runtime_pm_put(dev_priv);
> > > > +out:
> > > > +	mutex_unlock(&dev->struct_mutex);
> > > > +
> > > > +	if (!dev_priv->gt.awake &&
> > > No READ_ONCE here, even we just unlocked the mutex. So lacks some
> > > consistency.
> > > 
> > > Also, this assumes we might be pre-empted between unlocking mutex and
> > > making this test, so I'm little bit confused. Do you want to optimize
> > > by avoiding calling cancel_delayed_work_sync?
> > General principle to never call work_sync functions with locks held. I
> > had actually thought I had fixed this up (but realized that I just
> > rewrote hangcheck later on instead ;)
> > 
> > Ok, what I think is safer here is
> > 
> > 	bool hangcheck = cancel_delay_work_sync(hangcheck_work)
> > 
> > 	mutex_lock()
> > 	if (actually_idle()) {
> > 		awake = false;
> > 		missed_irq_rings |= intel_kick_waiters();
> > 	}
> > 	mutex_unlock();
> > 
> > 	if (awake && hangcheck)
> > 		queue_hangcheck()
> > 	
> > So always kick the hangcheck and reeanble if we tried to idle too early.
> > This will potentially delay hangcheck by one full hangcheck period if we
> > do encounter that race. But we shouldn't be hitting this race that
> > often, or hanging the GPU for that mterr.
> Actual delta:
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 406046f66e36..856da4036fb3 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3066,10 +3066,15 @@ i915_gem_idle_work_handler(struct work_struct *work)
>                 container_of(work, typeof(*dev_priv), gt.idle_work.work);
>         struct drm_device *dev = dev_priv->dev;
>         struct intel_engine_cs *engine;
> +       unsigned stuck_engines;
> +       bool rearm_hangcheck;
>  
>         if (!READ_ONCE(dev_priv->gt.awake))
>                 return;
>  
> +       rearm_hangcheck =
> +               cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
> +
>         mutex_lock(&dev->struct_mutex);
>         if (dev_priv->gt.active_engines)
>                 goto out;
> @@ -3079,6 +3084,13 @@ i915_gem_idle_work_handler(struct work_struct *work)
>  
>         GEM_BUG_ON(!dev_priv->gt.awake);
>         dev_priv->gt.awake = false;
> +       rearm_hangcheck = false;
> +
> +       stuck_engines = intel_kick_waiters(dev_priv);
> +       if (unlikely(stuck_engines)) {
> +               DRM_DEBUG_DRIVER("kicked stuck waiters...missed irq\n");
> +               dev_priv->gpu_error.missed_irq_rings |= stuck_engines;
> +       }
>  
>         if (INTEL_INFO(dev_priv)->gen >= 6)
>                 gen6_rps_idle(dev_priv);
> @@ -3086,14 +3098,8 @@ i915_gem_idle_work_handler(struct work_struct *work)
>  out:
>         mutex_unlock(&dev->struct_mutex);
>  
> -       if (!dev_priv->gt.awake &&
> -           cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work)) {
> -               unsigned stuck = intel_kick_waiters(dev_priv);
> -               if (unlikely(stuck)) {
> -                       DRM_DEBUG_DRIVER("kicked stuck waiters...missed irq\n");
> -                       dev_priv->gpu_error.missed_irq_rings |= stuck;
> -               }
> -       }
> +       if (rearm_hangcheck)
> +               i915_queue_hangcheck(dev_priv);

As discussed in IRC, should not race, so with above hunk;

Reviewed-by: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>

>  }
> -Chris
> 
-- 
Joonas Lahtinen
Open Source Technology Center
Intel Corporation