[Intel-gfx] [PATCH 01/62] drm/i915: Only start retire worker when idle

Wed Jun 8 11:06:02 UTC 2016

On Wed, Jun 08, 2016 at 11:53:15AM +0100, Chris Wilson wrote:
> On Tue, Jun 07, 2016 at 02:31:07PM +0300, Joonas Lahtinen wrote:
> > On pe, 2016-06-03 at 17:36 +0100, Chris Wilson wrote:
> > >  i915_gem_idle_work_handler(struct work_struct *work)
> > >  {
> > >  	struct drm_i915_private *dev_priv =
> > > -		container_of(work, typeof(*dev_priv), mm.idle_work.work);
> > > +		container_of(work, typeof(*dev_priv), gt.idle_work.work);
> > >  	struct drm_device *dev = dev_priv->dev;
> > >  	struct intel_engine_cs *engine;
> > >  
> > > -	for_each_engine(engine, dev_priv)
> > > -		if (!list_empty(&engine->request_list))
> > > -			return;
> > > +	if (!READ_ONCE(dev_priv->gt.awake))
> > > +		return;
> > >  
> > > -	/* we probably should sync with hangcheck here, using cancel_work_sync.
> > > -	 * Also locking seems to be fubar here, engine->request_list is protected
> > > -	 * by dev->struct_mutex. */
> > > +	mutex_lock(&dev->struct_mutex);
> > > +	if (dev_priv->gt.active_engines)
> > > +		goto out;
> > >  
> > > -	intel_mark_idle(dev_priv);
> > > +	for_each_engine(engine, dev_priv)
> > > +		i915_gem_batch_pool_fini(&engine->batch_pool);
> > >  
> > > -	if (mutex_trylock(&dev->struct_mutex)) {
> > > -		for_each_engine(engine, dev_priv)
> > > -			i915_gem_batch_pool_fini(&engine->batch_pool);
> > > +	GEM_BUG_ON(!dev_priv->gt.awake);
> > > +	dev_priv->gt.awake = false;
> > >  
> > > -		mutex_unlock(&dev->struct_mutex);
> > > +	if (INTEL_INFO(dev_priv)->gen >= 6)
> > > +		gen6_rps_idle(dev_priv);
> > > +	intel_runtime_pm_put(dev_priv);
> > > +out:
> > > +	mutex_unlock(&dev->struct_mutex);
> > > +
> > > +	if (!dev_priv->gt.awake &&
> > 
> > No READ_ONCE here, even we just unlocked the mutex. So lacks some
> > consistency.
> > 
> > Also, this assumes we might be pre-empted between unlocking mutex and
> > making this test, so I'm little bit confused. Do you want to optimize
> > by avoiding calling cancel_delayed_work_sync?
> 
> General principle to never call work_sync functions with locks held. I
> had actually thought I had fixed this up (but realized that I just
> rewrote hangcheck later on instead ;)
> 
> Ok, what I think is safer here is
> 
> 	bool hangcheck = cancel_delay_work_sync(hangcheck_work)
> 
> 	mutex_lock()
> 	if (actually_idle()) {
> 		awake = false;
> 		missed_irq_rings |= intel_kick_waiters();
> 	}
> 	mutex_unlock();
> 
> 	if (awake && hangcheck)
> 		queue_hangcheck()
> 	
> So always kick the hangcheck and reeanble if we tried to idle too early.
> This will potentially delay hangcheck by one full hangcheck period if we
> do encounter that race. But we shouldn't be hitting this race that
> often, or hanging the GPU for that mterr.

Actual delta:

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 406046f66e36..856da4036fb3 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3066,10 +3066,15 @@ i915_gem_idle_work_handler(struct work_struct *work)
                container_of(work, typeof(*dev_priv), gt.idle_work.work);
        struct drm_device *dev = dev_priv->dev;
        struct intel_engine_cs *engine;
+       unsigned stuck_engines;
+       bool rearm_hangcheck;
 
        if (!READ_ONCE(dev_priv->gt.awake))
                return;
 
+       rearm_hangcheck =
+               cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
+
        mutex_lock(&dev->struct_mutex);
        if (dev_priv->gt.active_engines)
                goto out;
@@ -3079,6 +3084,13 @@ i915_gem_idle_work_handler(struct work_struct *work)
 
        GEM_BUG_ON(!dev_priv->gt.awake);
        dev_priv->gt.awake = false;
+       rearm_hangcheck = false;
+
+       stuck_engines = intel_kick_waiters(dev_priv);
+       if (unlikely(stuck_engines)) {
+               DRM_DEBUG_DRIVER("kicked stuck waiters...missed irq\n");
+               dev_priv->gpu_error.missed_irq_rings |= stuck_engines;
+       }
 
        if (INTEL_INFO(dev_priv)->gen >= 6)
                gen6_rps_idle(dev_priv);
@@ -3086,14 +3098,8 @@ i915_gem_idle_work_handler(struct work_struct *work)
 out:
        mutex_unlock(&dev->struct_mutex);
 
-       if (!dev_priv->gt.awake &&
-           cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work)) {
-               unsigned stuck = intel_kick_waiters(dev_priv);
-               if (unlikely(stuck)) {
-                       DRM_DEBUG_DRIVER("kicked stuck waiters...missed irq\n");
-                       dev_priv->gpu_error.missed_irq_rings |= stuck;
-               }
-       }
+       if (rearm_hangcheck)
+               i915_queue_hangcheck(dev_priv);
 }
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre