[Intel-gfx] [PATCH 1/3] drm/i915: Provide a timeout to i915_gem_wait_for_idle()
Mika Kuoppala
mika.kuoppala at linux.intel.com
Mon Jul 9 11:38:10 UTC 2018
Chris Wilson <chris at chris-wilson.co.uk> writes:
> Usually we have no idea about the upper bound we need to wait to catch
> up with userspace when idling the device, but in a few situations we
> know the system was idle beforehand and can provide a short timeout in
> order to very quickly catch a failure, long before hangcheck kicks in.
>
> In the following patches, we will use the timeout to curtain two overly
> long waits, where we know we can expect the GPU to complete within a
> reasonable time or declare it broken.
>
> In particular, with a broken GPU we expect it to fail during the initial
> GPU setup where do a couple of context switches to record the defaults.
> This is a task that takes a few milliseconds even on the slowest of
> devices, but we may have to wait 60s for hangcheck to give in and
> declare the machine inoperable. In this a case where any gpu hang is
> unacceptable, both from a timeliness and practical standpoint.
>
> The other improvement is that in selftests, we do not need to arm an
> independent timer to inject a wedge, as we can just limit the timeout on
> the wait directly.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
> Cc: Mika Kuoppala <mika.kuoppala at linux.intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
> drivers/gpu/drm/i915/i915_debugfs.c | 6 ++-
> drivers/gpu/drm/i915/i915_drv.h | 2 +-
> drivers/gpu/drm/i915/i915_gem.c | 43 +++++++++++--------
> drivers/gpu/drm/i915/i915_gem_evict.c | 3 +-
> drivers/gpu/drm/i915/i915_gem_gtt.c | 2 +-
> drivers/gpu/drm/i915/i915_gem_shrinker.c | 11 +++--
> drivers/gpu/drm/i915/i915_perf.c | 4 +-
> drivers/gpu/drm/i915/i915_request.c | 6 ++-
> .../gpu/drm/i915/selftests/i915_gem_context.c | 4 +-
> drivers/gpu/drm/i915/selftests/i915_request.c | 4 +-
> .../gpu/drm/i915/selftests/igt_flush_test.c | 2 +-
> 11 files changed, 56 insertions(+), 31 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 544e5e7f011f..099f97ef2303 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -4105,7 +4105,8 @@ fault_irq_set(struct drm_i915_private *i915,
>
> err = i915_gem_wait_for_idle(i915,
> I915_WAIT_LOCKED |
> - I915_WAIT_INTERRUPTIBLE);
> + I915_WAIT_INTERRUPTIBLE,
> + MAX_SCHEDULE_TIMEOUT);
> if (err)
> goto err_unlock;
>
> @@ -4210,7 +4211,8 @@ i915_drop_caches_set(void *data, u64 val)
> if (val & DROP_ACTIVE)
> ret = i915_gem_wait_for_idle(dev_priv,
> I915_WAIT_INTERRUPTIBLE |
> - I915_WAIT_LOCKED);
> + I915_WAIT_LOCKED,
> + MAX_SCHEDULE_TIMEOUT);
>
> if (val & DROP_RETIRE)
> i915_retire_requests(dev_priv);
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 09ab12458244..bec0a2796c37 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -3154,7 +3154,7 @@ void i915_gem_init_swizzling(struct drm_i915_private *dev_priv);
> void i915_gem_fini(struct drm_i915_private *dev_priv);
> void i915_gem_cleanup_engines(struct drm_i915_private *dev_priv);
> int i915_gem_wait_for_idle(struct drm_i915_private *dev_priv,
> - unsigned int flags);
> + unsigned int flags, long timeout);
> int __must_check i915_gem_suspend(struct drm_i915_private *dev_priv);
> void i915_gem_suspend_late(struct drm_i915_private *dev_priv);
> void i915_gem_resume(struct drm_i915_private *dev_priv);
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 1a9dab302433..093d98ed7755 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2267,7 +2267,9 @@ static int i915_gem_object_create_mmap_offset(struct drm_i915_gem_object *obj)
>
> /* Attempt to reap some mmap space from dead objects */
> do {
> - err = i915_gem_wait_for_idle(dev_priv, I915_WAIT_INTERRUPTIBLE);
> + err = i915_gem_wait_for_idle(dev_priv,
> + I915_WAIT_INTERRUPTIBLE,
> + MAX_SCHEDULE_TIMEOUT);
> if (err)
> break;
>
> @@ -3742,14 +3744,14 @@ i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> return ret;
> }
>
> -static int wait_for_timeline(struct i915_timeline *tl, unsigned int flags)
> +static long wait_for_timeline(struct i915_timeline *tl,
> + unsigned int flags, long timeout)
> {
> struct i915_request *rq;
> - long ret;
>
> rq = i915_gem_active_get_unlocked(&tl->last_request);
> if (!rq)
> - return 0;
> + return timeout;
>
> /*
> * "Race-to-idle".
> @@ -3763,10 +3765,10 @@ static int wait_for_timeline(struct i915_timeline *tl, unsigned int flags)
> if (flags & I915_WAIT_FOR_IDLE_BOOST)
> gen6_rps_boost(rq, NULL);
>
> - ret = i915_request_wait(rq, flags, MAX_SCHEDULE_TIMEOUT);
> + timeout = i915_request_wait(rq, flags, timeout);
> i915_request_put(rq);
>
> - return ret < 0 ? ret : 0;
> + return timeout;
> }
>
> static int wait_for_engines(struct drm_i915_private *i915)
> @@ -3782,7 +3784,8 @@ static int wait_for_engines(struct drm_i915_private *i915)
> return 0;
> }
>
> -int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
> +int i915_gem_wait_for_idle(struct drm_i915_private *i915,
> + unsigned int flags, long timeout)
> {
> GEM_TRACE("flags=%x (%s)\n",
> flags, flags & I915_WAIT_LOCKED ? "locked" : "unlocked");
Did you omit enhancing the trace on purpose?
Eventually i915_request_wait will assert for silly timeout values, but
should we add assertion here too as wait_for_timeline will
return what we put, for nonactive timelines?
> @@ -3798,9 +3801,9 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
> lockdep_assert_held(&i915->drm.struct_mutex);
>
> list_for_each_entry(tl, &i915->gt.timelines, link) {
> - err = wait_for_timeline(tl, flags);
> - if (err)
> - return err;
> + timeout = wait_for_timeline(tl, flags, timeout);
> + if (timeout < 0)
> + return timeout;
> }
>
> err = wait_for_engines(i915);
To truely serve the caller, the remaining timeout could be passed to
wait_for_engines too, but that can be followup.
So the only real thing is the missing trace.
-Mika
> @@ -3812,12 +3815,13 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
> } else {
> struct intel_engine_cs *engine;
> enum intel_engine_id id;
> - int err;
>
> for_each_engine(engine, i915, id) {
> - err = wait_for_timeline(&engine->timeline, flags);
> - if (err)
> - return err;
> + struct i915_timeline *tl = &engine->timeline;
> +
> + timeout = wait_for_timeline(tl, flags, timeout);
> + if (timeout < 0)
> + return timeout;
> }
> }
>
> @@ -5052,7 +5056,8 @@ int i915_gem_suspend(struct drm_i915_private *dev_priv)
> ret = i915_gem_wait_for_idle(dev_priv,
> I915_WAIT_INTERRUPTIBLE |
> I915_WAIT_LOCKED |
> - I915_WAIT_FOR_IDLE_BOOST);
> + I915_WAIT_FOR_IDLE_BOOST,
> + MAX_SCHEDULE_TIMEOUT);
> if (ret && ret != -EIO)
> goto err_unlock;
>
> @@ -5356,7 +5361,9 @@ static int __intel_engines_record_defaults(struct drm_i915_private *i915)
> if (err)
> goto err_active;
>
> - err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
> + err = i915_gem_wait_for_idle(i915,
> + I915_WAIT_LOCKED,
> + MAX_SCHEDULE_TIMEOUT);
> if (err)
> goto err_active;
>
> @@ -5421,7 +5428,9 @@ static int __intel_engines_record_defaults(struct drm_i915_private *i915)
> if (WARN_ON(i915_gem_switch_to_kernel_context(i915)))
> goto out_ctx;
>
> - if (WARN_ON(i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED)))
> + if (WARN_ON(i915_gem_wait_for_idle(i915,
> + I915_WAIT_LOCKED,
> + MAX_SCHEDULE_TIMEOUT)))
> goto out_ctx;
>
> i915_gem_contexts_lost(i915);
> diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c b/drivers/gpu/drm/i915/i915_gem_evict.c
> index 54814a196ee4..02b83a5ed96c 100644
> --- a/drivers/gpu/drm/i915/i915_gem_evict.c
> +++ b/drivers/gpu/drm/i915/i915_gem_evict.c
> @@ -69,7 +69,8 @@ static int ggtt_flush(struct drm_i915_private *i915)
>
> err = i915_gem_wait_for_idle(i915,
> I915_WAIT_INTERRUPTIBLE |
> - I915_WAIT_LOCKED);
> + I915_WAIT_LOCKED,
> + MAX_SCHEDULE_TIMEOUT);
> if (err)
> return err;
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index 2ad319e17e39..abd81fb9b0b6 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -2793,7 +2793,7 @@ void i915_gem_gtt_finish_pages(struct drm_i915_gem_object *obj,
> struct i915_ggtt *ggtt = &dev_priv->ggtt;
>
> if (unlikely(ggtt->do_idle_maps)) {
> - if (i915_gem_wait_for_idle(dev_priv, 0)) {
> + if (i915_gem_wait_for_idle(dev_priv, 0, MAX_SCHEDULE_TIMEOUT)) {
> DRM_ERROR("Failed to wait for idle; VT'd may hang.\n");
> /* Wait a bit, in hopes it avoids the hang */
> udelay(10);
> diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
> index 55e84e71f526..c61f5b80fee3 100644
> --- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
> +++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
> @@ -172,7 +172,9 @@ i915_gem_shrink(struct drm_i915_private *i915,
> * we will free as much as we can and hope to get a second chance.
> */
> if (flags & I915_SHRINK_ACTIVE)
> - i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
> + i915_gem_wait_for_idle(i915,
> + I915_WAIT_LOCKED,
> + MAX_SCHEDULE_TIMEOUT);
>
> trace_i915_gem_shrink(i915, target, flags);
> i915_retire_requests(i915);
> @@ -392,7 +394,8 @@ shrinker_lock_uninterruptible(struct drm_i915_private *i915, bool *unlock,
> unsigned long timeout = jiffies + msecs_to_jiffies_timeout(timeout_ms);
>
> do {
> - if (i915_gem_wait_for_idle(i915, 0) == 0 &&
> + if (i915_gem_wait_for_idle(i915,
> + 0, MAX_SCHEDULE_TIMEOUT) == 0 &&
> shrinker_lock(i915, unlock))
> break;
>
> @@ -466,7 +469,9 @@ i915_gem_shrinker_vmap(struct notifier_block *nb, unsigned long event, void *ptr
> return NOTIFY_DONE;
>
> /* Force everything onto the inactive lists */
> - ret = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
> + ret = i915_gem_wait_for_idle(i915,
> + I915_WAIT_LOCKED,
> + MAX_SCHEDULE_TIMEOUT);
> if (ret)
> goto out;
>
> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
> index 447407fee3b8..6bf10952c724 100644
> --- a/drivers/gpu/drm/i915/i915_perf.c
> +++ b/drivers/gpu/drm/i915/i915_perf.c
> @@ -1836,7 +1836,9 @@ static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
> * So far the best way to work around this issue seems to be draining
> * the GPU from any submitted work.
> */
> - ret = i915_gem_wait_for_idle(dev_priv, wait_flags);
> + ret = i915_gem_wait_for_idle(dev_priv,
> + wait_flags,
> + MAX_SCHEDULE_TIMEOUT);
> if (ret)
> goto out;
>
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index 3248369dbcfb..5c2c93cbab12 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -206,7 +206,8 @@ static int reset_all_global_seqno(struct drm_i915_private *i915, u32 seqno)
> /* Carefully retire all requests without writing to the rings */
> ret = i915_gem_wait_for_idle(i915,
> I915_WAIT_INTERRUPTIBLE |
> - I915_WAIT_LOCKED);
> + I915_WAIT_LOCKED,
> + MAX_SCHEDULE_TIMEOUT);
> if (ret)
> return ret;
>
> @@ -735,7 +736,8 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
> /* Ratelimit ourselves to prevent oom from malicious clients */
> ret = i915_gem_wait_for_idle(i915,
> I915_WAIT_LOCKED |
> - I915_WAIT_INTERRUPTIBLE);
> + I915_WAIT_INTERRUPTIBLE,
> + MAX_SCHEDULE_TIMEOUT);
> if (ret)
> goto err_unreserve;
>
> diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
> index 5fbe15f4effd..ab2590242033 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
> @@ -478,7 +478,9 @@ static int __igt_switch_to_kernel_context(struct drm_i915_private *i915,
> }
> }
>
> - err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
> + err = i915_gem_wait_for_idle(i915,
> + I915_WAIT_LOCKED,
> + MAX_SCHEDULE_TIMEOUT);
> if (err)
> return err;
>
> diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
> index 43995fc3534d..c4aac6141e04 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_request.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_request.c
> @@ -286,7 +286,9 @@ static int begin_live_test(struct live_test *t,
> t->func = func;
> t->name = name;
>
> - err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
> + err = i915_gem_wait_for_idle(i915,
> + I915_WAIT_LOCKED,
> + MAX_SCHEDULE_TIMEOUT);
> if (err) {
> pr_err("%s(%s): failed to idle before, with err=%d!",
> func, name, err);
> diff --git a/drivers/gpu/drm/i915/selftests/igt_flush_test.c b/drivers/gpu/drm/i915/selftests/igt_flush_test.c
> index 0d06f559243f..09ab037ce803 100644
> --- a/drivers/gpu/drm/i915/selftests/igt_flush_test.c
> +++ b/drivers/gpu/drm/i915/selftests/igt_flush_test.c
> @@ -64,7 +64,7 @@ int igt_flush_test(struct drm_i915_private *i915, unsigned int flags)
> }
>
> wedge_on_timeout(&w, i915, HZ)
> - i915_gem_wait_for_idle(i915, flags);
> + i915_gem_wait_for_idle(i915, flags, MAX_SCHEDULE_TIMEOUT);
>
> return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0;
> }
> --
> 2.18.0
More information about the Intel-gfx
mailing list