[Intel-gfx] [PATCH] drm/i915: Always run hangcheck while the GPU is busy

Tue Jan 30 12:18:17 UTC 2018

Chris Wilson <chris at chris-wilson.co.uk> writes:

> Previously, we relied on only running the hangcheck while somebody was
> waiting on the GPU, in order to minimise the amount of time hangcheck
> had to run. (If nobody was watching the GPU, nobody would notice if the
> GPU wasn't responding -- eventually somebody would care and so kick
> hangcheck into action.) However, this falls apart from around commit
> 4680816be336 ("drm/i915: Wait first for submission, before waiting for
> request completion"), as not all waiters declare themselves to hangcheck
> and so we could switch off hangcheck and miss GPU hangs even when
> waiting under the struct_mutex.
>
> If we enable hangcheck from the first request submission, and let it run
> until the GPU is idle again, we forgo all the complexity involved with
> only enabling around waiters. Instead we have to be careful that we do
> not declare a GPU hang when idly waiting for the next request to be come
> ready.

For the complexity part I agree that this is simple and elegant. But
I think I have not understood it fully as I don't connect the part where
we need to be careful in idly waiting for next request.
Could you elaborate and point it the relevant portion in the patch for it?

-Mika

>
> Fixes: 4680816be336 ("drm/i915: Wait first for submission, before waiting for request completion"
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala at linux.intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gem.c          |  7 +++----
>  drivers/gpu/drm/i915/i915_gem_request.c  |  2 ++
>  drivers/gpu/drm/i915/intel_breadcrumbs.c | 11 -----------
>  drivers/gpu/drm/i915/intel_hangcheck.c   |  7 +------
>  4 files changed, 6 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 062b21408698..63308ec016a3 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3322,16 +3322,15 @@ i915_gem_retire_work_handler(struct work_struct *work)
>  		mutex_unlock(&dev->struct_mutex);
>  	}
>  
> -	/* Keep the retire handler running until we are finally idle.
> +	/*
> +	 * Keep the retire handler running until we are finally idle.
>  	 * We do not need to do this test under locking as in the worst-case
>  	 * we queue the retire worker once too often.
>  	 */
> -	if (READ_ONCE(dev_priv->gt.awake)) {
> -		i915_queue_hangcheck(dev_priv);
> +	if (READ_ONCE(dev_priv->gt.awake))
>  		queue_delayed_work(dev_priv->wq,
>  				   &dev_priv->gt.retire_work,
>  				   round_jiffies_up_relative(HZ));
> -	}
>  }
>  
>  static void shrink_caches(struct drm_i915_private *i915)
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index 9b02310307fc..6cacd78cc849 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -393,6 +393,8 @@ static void mark_busy(struct drm_i915_private *i915)
>  
>  	intel_engines_unpark(i915);
>  
> +	i915_queue_hangcheck(i915);
> +
>  	queue_delayed_work(i915->wq,
>  			   &i915->gt.retire_work,
>  			   round_jiffies_up_relative(HZ));
> diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> index 86acac010bb8..62b2a20bc24e 100644
> --- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
> +++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> @@ -149,17 +149,6 @@ static void intel_breadcrumbs_fake_irq(struct timer_list *t)
>  		return;
>  
>  	mod_timer(&b->fake_irq, jiffies + 1);
> -
> -	/* Ensure that even if the GPU hangs, we get woken up.
> -	 *
> -	 * However, note that if no one is waiting, we never notice
> -	 * a gpu hang. Eventually, we will have to wait for a resource
> -	 * held by the GPU and so trigger a hangcheck. In the most
> -	 * pathological case, this will be upon memory starvation! To
> -	 * prevent this, we also queue the hangcheck from the retire
> -	 * worker.
> -	 */
> -	i915_queue_hangcheck(engine->i915);
>  }
>  
>  static void irq_enable(struct intel_engine_cs *engine)
> diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
> index 31f01d64c021..348a4f7ffb67 100644
> --- a/drivers/gpu/drm/i915/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/intel_hangcheck.c
> @@ -411,7 +411,6 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
>  	struct intel_engine_cs *engine;
>  	enum intel_engine_id id;
>  	unsigned int hung = 0, stuck = 0;
> -	int busy_count = 0;
>  
>  	if (!i915_modparams.enable_hangcheck)
>  		return;
> @@ -429,7 +428,6 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
>  	intel_uncore_arm_unclaimed_mmio_detection(dev_priv);
>  
>  	for_each_engine(engine, dev_priv, id) {
> -		const bool busy = intel_engine_has_waiter(engine);
>  		struct intel_engine_hangcheck hc;
>  
>  		semaphore_clear_deadlocks(dev_priv);
> @@ -443,16 +441,13 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
>  			if (hc.action != ENGINE_DEAD)
>  				stuck |= intel_engine_flag(engine);
>  		}
> -
> -		busy_count += busy;
>  	}
>  
>  	if (hung)
>  		hangcheck_declare_hang(dev_priv, hung, stuck);
>  
>  	/* Reset timer in case GPU hangs without another request being added */
> -	if (busy_count)
> -		i915_queue_hangcheck(dev_priv);
> +	i915_queue_hangcheck(dev_priv);
>  }
>  
>  void intel_engine_init_hangcheck(struct intel_engine_cs *engine)
> -- 
> 2.15.1