[Intel-gfx] [PATCH] drm/i915: Always run hangcheck while the GPU is busy
Mika Kuoppala
mika.kuoppala at linux.intel.com
Wed Jan 31 09:41:35 UTC 2018
Chris Wilson <chris at chris-wilson.co.uk> writes:
> Previously, we relied on only running the hangcheck while somebody was
> waiting on the GPU, in order to minimise the amount of time hangcheck
> had to run. (If nobody was watching the GPU, nobody would notice if the
> GPU wasn't responding -- eventually somebody would care and so kick
> hangcheck into action.) However, this falls apart from around commit
> 4680816be336 ("drm/i915: Wait first for submission, before waiting for
> request completion"), as not all waiters declare themselves to hangcheck
> and so we could switch off hangcheck and miss GPU hangs even when
> waiting under the struct_mutex.
>
> If we enable hangcheck from the first request submission, and let it run
> until the GPU is idle again, we forgo all the complexity involved with
> only enabling around waiters. Instead we have to be careful that we do
> not declare a GPU hang when idly waiting for the next request to be come
> ready.
>
> Fixes: 4680816be336 ("drm/i915: Wait first for submission, before waiting for request completion"
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala at linux.intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala at linux.intel.com>
> ---
> drivers/gpu/drm/i915/i915_gem.c | 7 +++----
> drivers/gpu/drm/i915/i915_gem_request.c | 2 ++
> drivers/gpu/drm/i915/intel_breadcrumbs.c | 11 -----------
> drivers/gpu/drm/i915/intel_hangcheck.c | 7 +------
> 4 files changed, 6 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 062b21408698..63308ec016a3 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3322,16 +3322,15 @@ i915_gem_retire_work_handler(struct work_struct *work)
> mutex_unlock(&dev->struct_mutex);
> }
>
> - /* Keep the retire handler running until we are finally idle.
> + /*
> + * Keep the retire handler running until we are finally idle.
> * We do not need to do this test under locking as in the worst-case
> * we queue the retire worker once too often.
> */
> - if (READ_ONCE(dev_priv->gt.awake)) {
> - i915_queue_hangcheck(dev_priv);
> + if (READ_ONCE(dev_priv->gt.awake))
> queue_delayed_work(dev_priv->wq,
> &dev_priv->gt.retire_work,
> round_jiffies_up_relative(HZ));
> - }
> }
>
> static void shrink_caches(struct drm_i915_private *i915)
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index 9b02310307fc..6cacd78cc849 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -393,6 +393,8 @@ static void mark_busy(struct drm_i915_private *i915)
>
> intel_engines_unpark(i915);
>
> + i915_queue_hangcheck(i915);
> +
> queue_delayed_work(i915->wq,
> &i915->gt.retire_work,
> round_jiffies_up_relative(HZ));
> diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> index 86acac010bb8..62b2a20bc24e 100644
> --- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
> +++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> @@ -149,17 +149,6 @@ static void intel_breadcrumbs_fake_irq(struct timer_list *t)
> return;
>
> mod_timer(&b->fake_irq, jiffies + 1);
> -
> - /* Ensure that even if the GPU hangs, we get woken up.
> - *
> - * However, note that if no one is waiting, we never notice
> - * a gpu hang. Eventually, we will have to wait for a resource
> - * held by the GPU and so trigger a hangcheck. In the most
> - * pathological case, this will be upon memory starvation! To
> - * prevent this, we also queue the hangcheck from the retire
> - * worker.
> - */
> - i915_queue_hangcheck(engine->i915);
> }
>
> static void irq_enable(struct intel_engine_cs *engine)
> diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
> index 31f01d64c021..348a4f7ffb67 100644
> --- a/drivers/gpu/drm/i915/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/intel_hangcheck.c
> @@ -411,7 +411,6 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
> struct intel_engine_cs *engine;
> enum intel_engine_id id;
> unsigned int hung = 0, stuck = 0;
> - int busy_count = 0;
>
> if (!i915_modparams.enable_hangcheck)
> return;
> @@ -429,7 +428,6 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
> intel_uncore_arm_unclaimed_mmio_detection(dev_priv);
>
> for_each_engine(engine, dev_priv, id) {
> - const bool busy = intel_engine_has_waiter(engine);
> struct intel_engine_hangcheck hc;
>
> semaphore_clear_deadlocks(dev_priv);
> @@ -443,16 +441,13 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
> if (hc.action != ENGINE_DEAD)
> stuck |= intel_engine_flag(engine);
> }
> -
> - busy_count += busy;
> }
>
> if (hung)
> hangcheck_declare_hang(dev_priv, hung, stuck);
>
> /* Reset timer in case GPU hangs without another request being added */
> - if (busy_count)
> - i915_queue_hangcheck(dev_priv);
> + i915_queue_hangcheck(dev_priv);
> }
>
> void intel_engine_init_hangcheck(struct intel_engine_cs *engine)
> --
> 2.15.1
More information about the Intel-gfx
mailing list