[Intel-gfx] [PATCH] drm/i915: Always run hangcheck while the GPU is busy

Antonio Argenziano antonio.argenziano at intel.com
Mon Jan 29 23:41:55 UTC 2018



On 29/01/18 06:41, Chris Wilson wrote:
> Previously, we relied on only running the hangcheck while somebody was
> waiting on the GPU, in order to minimise the amount of time hangcheck
> had to run. (If nobody was watching the GPU, nobody would notice if the
> GPU wasn't responding -- eventually somebody would care and so kick
> hangcheck into action.) However, this falls apart from around commit
> 4680816be336 ("drm/i915: Wait first for submission, before waiting for
> request completion"), as not all waiters declare themselves to hangcheck
> and so we could switch off hangcheck and miss GPU hangs even when
> waiting under the struct_mutex.
> 
> If we enable hangcheck from the first request submission, and let it run
> until the GPU is idle again, we forgo all the complexity involved with
> only enabling around waiters. Instead we have to be careful that we do
> not declare a GPU hang when idly waiting for the next request to be come
> ready.
> 
> Fixes: 4680816be336 ("drm/i915: Wait first for submission, before waiting for request completion"

It also fixes https://bugs.freedesktop.org/show_bug.cgi?id=104840.

Thanks,
Antonio

> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala at linux.intel.com>
> ---
>   drivers/gpu/drm/i915/i915_gem.c          |  7 +++----
>   drivers/gpu/drm/i915/i915_gem_request.c  |  2 ++
>   drivers/gpu/drm/i915/intel_breadcrumbs.c | 11 -----------
>   drivers/gpu/drm/i915/intel_hangcheck.c   |  7 +------
>   4 files changed, 6 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 062b21408698..63308ec016a3 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3322,16 +3322,15 @@ i915_gem_retire_work_handler(struct work_struct *work)
>   		mutex_unlock(&dev->struct_mutex);
>   	}
>   
> -	/* Keep the retire handler running until we are finally idle.
> +	/*
> +	 * Keep the retire handler running until we are finally idle.
>   	 * We do not need to do this test under locking as in the worst-case
>   	 * we queue the retire worker once too often.
>   	 */
> -	if (READ_ONCE(dev_priv->gt.awake)) {
> -		i915_queue_hangcheck(dev_priv);
> +	if (READ_ONCE(dev_priv->gt.awake))
>   		queue_delayed_work(dev_priv->wq,
>   				   &dev_priv->gt.retire_work,
>   				   round_jiffies_up_relative(HZ));
> -	}
>   }
>   
>   static void shrink_caches(struct drm_i915_private *i915)
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index 9b02310307fc..6cacd78cc849 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -393,6 +393,8 @@ static void mark_busy(struct drm_i915_private *i915)
>   
>   	intel_engines_unpark(i915);
>   
> +	i915_queue_hangcheck(i915);
> +
>   	queue_delayed_work(i915->wq,
>   			   &i915->gt.retire_work,
>   			   round_jiffies_up_relative(HZ));
> diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> index 86acac010bb8..62b2a20bc24e 100644
> --- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
> +++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> @@ -149,17 +149,6 @@ static void intel_breadcrumbs_fake_irq(struct timer_list *t)
>   		return;
>   
>   	mod_timer(&b->fake_irq, jiffies + 1);
> -
> -	/* Ensure that even if the GPU hangs, we get woken up.
> -	 *
> -	 * However, note that if no one is waiting, we never notice
> -	 * a gpu hang. Eventually, we will have to wait for a resource
> -	 * held by the GPU and so trigger a hangcheck. In the most
> -	 * pathological case, this will be upon memory starvation! To
> -	 * prevent this, we also queue the hangcheck from the retire
> -	 * worker.
> -	 */
> -	i915_queue_hangcheck(engine->i915);
>   }
>   
>   static void irq_enable(struct intel_engine_cs *engine)
> diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
> index 31f01d64c021..348a4f7ffb67 100644
> --- a/drivers/gpu/drm/i915/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/intel_hangcheck.c
> @@ -411,7 +411,6 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
>   	struct intel_engine_cs *engine;
>   	enum intel_engine_id id;
>   	unsigned int hung = 0, stuck = 0;
> -	int busy_count = 0;
>   
>   	if (!i915_modparams.enable_hangcheck)
>   		return;
> @@ -429,7 +428,6 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
>   	intel_uncore_arm_unclaimed_mmio_detection(dev_priv);
>   
>   	for_each_engine(engine, dev_priv, id) {
> -		const bool busy = intel_engine_has_waiter(engine);
>   		struct intel_engine_hangcheck hc;
>   
>   		semaphore_clear_deadlocks(dev_priv);
> @@ -443,16 +441,13 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
>   			if (hc.action != ENGINE_DEAD)
>   				stuck |= intel_engine_flag(engine);
>   		}
> -
> -		busy_count += busy;
>   	}
>   
>   	if (hung)
>   		hangcheck_declare_hang(dev_priv, hung, stuck);
>   
>   	/* Reset timer in case GPU hangs without another request being added */
> -	if (busy_count)
> -		i915_queue_hangcheck(dev_priv);
> +	i915_queue_hangcheck(dev_priv);
>   }
>   
>   void intel_engine_init_hangcheck(struct intel_engine_cs *engine)
> 


More information about the Intel-gfx mailing list