[Intel-gfx] [PATCH 1/3] drm/i915: Provide a timeout to i915_gem_wait_for_idle()

Mika Kuoppala mika.kuoppala at linux.intel.com
Mon Jul 9 11:38:10 UTC 2018


Chris Wilson <chris at chris-wilson.co.uk> writes:

> Usually we have no idea about the upper bound we need to wait to catch
> up with userspace when idling the device, but in a few situations we
> know the system was idle beforehand and can provide a short timeout in
> order to very quickly catch a failure, long before hangcheck kicks in.
>
> In the following patches, we will use the timeout to curtain two overly
> long waits, where we know we can expect the GPU to complete within a
> reasonable time or declare it broken.
>
> In particular, with a broken GPU we expect it to fail during the initial
> GPU setup where do a couple of context switches to record the defaults.
> This is a task that takes a few milliseconds even on the slowest of
> devices, but we may have to wait 60s for hangcheck to give in and
> declare the machine inoperable. In this a case where any gpu hang is
> unacceptable, both from a timeliness and practical standpoint.
>
> The other improvement is that in selftests, we do not need to arm an
> independent timer to inject a wedge, as we can just limit the timeout on
> the wait directly.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
> Cc: Mika Kuoppala <mika.kuoppala at linux.intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c           |  6 ++-
>  drivers/gpu/drm/i915/i915_drv.h               |  2 +-
>  drivers/gpu/drm/i915/i915_gem.c               | 43 +++++++++++--------
>  drivers/gpu/drm/i915/i915_gem_evict.c         |  3 +-
>  drivers/gpu/drm/i915/i915_gem_gtt.c           |  2 +-
>  drivers/gpu/drm/i915/i915_gem_shrinker.c      | 11 +++--
>  drivers/gpu/drm/i915/i915_perf.c              |  4 +-
>  drivers/gpu/drm/i915/i915_request.c           |  6 ++-
>  .../gpu/drm/i915/selftests/i915_gem_context.c |  4 +-
>  drivers/gpu/drm/i915/selftests/i915_request.c |  4 +-
>  .../gpu/drm/i915/selftests/igt_flush_test.c   |  2 +-
>  11 files changed, 56 insertions(+), 31 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 544e5e7f011f..099f97ef2303 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -4105,7 +4105,8 @@ fault_irq_set(struct drm_i915_private *i915,
>  
>  	err = i915_gem_wait_for_idle(i915,
>  				     I915_WAIT_LOCKED |
> -				     I915_WAIT_INTERRUPTIBLE);
> +				     I915_WAIT_INTERRUPTIBLE,
> +				     MAX_SCHEDULE_TIMEOUT);
>  	if (err)
>  		goto err_unlock;
>  
> @@ -4210,7 +4211,8 @@ i915_drop_caches_set(void *data, u64 val)
>  		if (val & DROP_ACTIVE)
>  			ret = i915_gem_wait_for_idle(dev_priv,
>  						     I915_WAIT_INTERRUPTIBLE |
> -						     I915_WAIT_LOCKED);
> +						     I915_WAIT_LOCKED,
> +						     MAX_SCHEDULE_TIMEOUT);
>  
>  		if (val & DROP_RETIRE)
>  			i915_retire_requests(dev_priv);
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 09ab12458244..bec0a2796c37 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -3154,7 +3154,7 @@ void i915_gem_init_swizzling(struct drm_i915_private *dev_priv);
>  void i915_gem_fini(struct drm_i915_private *dev_priv);
>  void i915_gem_cleanup_engines(struct drm_i915_private *dev_priv);
>  int i915_gem_wait_for_idle(struct drm_i915_private *dev_priv,
> -			   unsigned int flags);
> +			   unsigned int flags, long timeout);
>  int __must_check i915_gem_suspend(struct drm_i915_private *dev_priv);
>  void i915_gem_suspend_late(struct drm_i915_private *dev_priv);
>  void i915_gem_resume(struct drm_i915_private *dev_priv);
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 1a9dab302433..093d98ed7755 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2267,7 +2267,9 @@ static int i915_gem_object_create_mmap_offset(struct drm_i915_gem_object *obj)
>  
>  	/* Attempt to reap some mmap space from dead objects */
>  	do {
> -		err = i915_gem_wait_for_idle(dev_priv, I915_WAIT_INTERRUPTIBLE);
> +		err = i915_gem_wait_for_idle(dev_priv,
> +					     I915_WAIT_INTERRUPTIBLE,
> +					     MAX_SCHEDULE_TIMEOUT);
>  		if (err)
>  			break;
>  
> @@ -3742,14 +3744,14 @@ i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>  	return ret;
>  }
>  
> -static int wait_for_timeline(struct i915_timeline *tl, unsigned int flags)
> +static long wait_for_timeline(struct i915_timeline *tl,
> +			      unsigned int flags, long timeout)
>  {
>  	struct i915_request *rq;
> -	long ret;
>  
>  	rq = i915_gem_active_get_unlocked(&tl->last_request);
>  	if (!rq)
> -		return 0;
> +		return timeout;
>  
>  	/*
>  	 * "Race-to-idle".
> @@ -3763,10 +3765,10 @@ static int wait_for_timeline(struct i915_timeline *tl, unsigned int flags)
>  	if (flags & I915_WAIT_FOR_IDLE_BOOST)
>  		gen6_rps_boost(rq, NULL);
>  
> -	ret = i915_request_wait(rq, flags, MAX_SCHEDULE_TIMEOUT);
> +	timeout = i915_request_wait(rq, flags, timeout);
>  	i915_request_put(rq);
>  
> -	return ret < 0 ? ret : 0;
> +	return timeout;
>  }
>  
>  static int wait_for_engines(struct drm_i915_private *i915)
> @@ -3782,7 +3784,8 @@ static int wait_for_engines(struct drm_i915_private *i915)
>  	return 0;
>  }
>  
> -int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
> +int i915_gem_wait_for_idle(struct drm_i915_private *i915,
> +			   unsigned int flags, long timeout)
>  {
>  	GEM_TRACE("flags=%x (%s)\n",
>  		  flags, flags & I915_WAIT_LOCKED ? "locked" : "unlocked");

Did you omit enhancing the trace on purpose?

Eventually i915_request_wait will assert for silly timeout values, but
should we add assertion here too as wait_for_timeline will
return what we put, for nonactive timelines?

> @@ -3798,9 +3801,9 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
>  		lockdep_assert_held(&i915->drm.struct_mutex);
>  
>  		list_for_each_entry(tl, &i915->gt.timelines, link) {
> -			err = wait_for_timeline(tl, flags);
> -			if (err)
> -				return err;
> +			timeout = wait_for_timeline(tl, flags, timeout);
> +			if (timeout < 0)
> +				return timeout;
>  		}
>  
>  		err = wait_for_engines(i915);

To truely serve the caller, the remaining timeout could be passed to
wait_for_engines too, but that can be followup.

So the only real thing is the missing trace.
-Mika


> @@ -3812,12 +3815,13 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
>  	} else {
>  		struct intel_engine_cs *engine;
>  		enum intel_engine_id id;
> -		int err;
>  
>  		for_each_engine(engine, i915, id) {
> -			err = wait_for_timeline(&engine->timeline, flags);
> -			if (err)
> -				return err;
> +			struct i915_timeline *tl = &engine->timeline;
> +
> +			timeout = wait_for_timeline(tl, flags, timeout);
> +			if (timeout < 0)
> +				return timeout;
>  		}
>  	}
>  
> @@ -5052,7 +5056,8 @@ int i915_gem_suspend(struct drm_i915_private *dev_priv)
>  		ret = i915_gem_wait_for_idle(dev_priv,
>  					     I915_WAIT_INTERRUPTIBLE |
>  					     I915_WAIT_LOCKED |
> -					     I915_WAIT_FOR_IDLE_BOOST);
> +					     I915_WAIT_FOR_IDLE_BOOST,
> +					     MAX_SCHEDULE_TIMEOUT);
>  		if (ret && ret != -EIO)
>  			goto err_unlock;
>  
> @@ -5356,7 +5361,9 @@ static int __intel_engines_record_defaults(struct drm_i915_private *i915)
>  	if (err)
>  		goto err_active;
>  
> -	err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
> +	err = i915_gem_wait_for_idle(i915,
> +				     I915_WAIT_LOCKED,
> +				     MAX_SCHEDULE_TIMEOUT);
>  	if (err)
>  		goto err_active;
>  
> @@ -5421,7 +5428,9 @@ static int __intel_engines_record_defaults(struct drm_i915_private *i915)
>  	if (WARN_ON(i915_gem_switch_to_kernel_context(i915)))
>  		goto out_ctx;
>  
> -	if (WARN_ON(i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED)))
> +	if (WARN_ON(i915_gem_wait_for_idle(i915,
> +					   I915_WAIT_LOCKED,
> +					   MAX_SCHEDULE_TIMEOUT)))
>  		goto out_ctx;
>  
>  	i915_gem_contexts_lost(i915);
> diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c b/drivers/gpu/drm/i915/i915_gem_evict.c
> index 54814a196ee4..02b83a5ed96c 100644
> --- a/drivers/gpu/drm/i915/i915_gem_evict.c
> +++ b/drivers/gpu/drm/i915/i915_gem_evict.c
> @@ -69,7 +69,8 @@ static int ggtt_flush(struct drm_i915_private *i915)
>  
>  	err = i915_gem_wait_for_idle(i915,
>  				     I915_WAIT_INTERRUPTIBLE |
> -				     I915_WAIT_LOCKED);
> +				     I915_WAIT_LOCKED,
> +				     MAX_SCHEDULE_TIMEOUT);
>  	if (err)
>  		return err;
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index 2ad319e17e39..abd81fb9b0b6 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -2793,7 +2793,7 @@ void i915_gem_gtt_finish_pages(struct drm_i915_gem_object *obj,
>  	struct i915_ggtt *ggtt = &dev_priv->ggtt;
>  
>  	if (unlikely(ggtt->do_idle_maps)) {
> -		if (i915_gem_wait_for_idle(dev_priv, 0)) {
> +		if (i915_gem_wait_for_idle(dev_priv, 0, MAX_SCHEDULE_TIMEOUT)) {
>  			DRM_ERROR("Failed to wait for idle; VT'd may hang.\n");
>  			/* Wait a bit, in hopes it avoids the hang */
>  			udelay(10);
> diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
> index 55e84e71f526..c61f5b80fee3 100644
> --- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
> +++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
> @@ -172,7 +172,9 @@ i915_gem_shrink(struct drm_i915_private *i915,
>  	 * we will free as much as we can and hope to get a second chance.
>  	 */
>  	if (flags & I915_SHRINK_ACTIVE)
> -		i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
> +		i915_gem_wait_for_idle(i915,
> +				       I915_WAIT_LOCKED,
> +				       MAX_SCHEDULE_TIMEOUT);
>  
>  	trace_i915_gem_shrink(i915, target, flags);
>  	i915_retire_requests(i915);
> @@ -392,7 +394,8 @@ shrinker_lock_uninterruptible(struct drm_i915_private *i915, bool *unlock,
>  	unsigned long timeout = jiffies + msecs_to_jiffies_timeout(timeout_ms);
>  
>  	do {
> -		if (i915_gem_wait_for_idle(i915, 0) == 0 &&
> +		if (i915_gem_wait_for_idle(i915,
> +					   0, MAX_SCHEDULE_TIMEOUT) == 0 &&
>  		    shrinker_lock(i915, unlock))
>  			break;
>  
> @@ -466,7 +469,9 @@ i915_gem_shrinker_vmap(struct notifier_block *nb, unsigned long event, void *ptr
>  		return NOTIFY_DONE;
>  
>  	/* Force everything onto the inactive lists */
> -	ret = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
> +	ret = i915_gem_wait_for_idle(i915,
> +				     I915_WAIT_LOCKED,
> +				     MAX_SCHEDULE_TIMEOUT);
>  	if (ret)
>  		goto out;
>  
> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
> index 447407fee3b8..6bf10952c724 100644
> --- a/drivers/gpu/drm/i915/i915_perf.c
> +++ b/drivers/gpu/drm/i915/i915_perf.c
> @@ -1836,7 +1836,9 @@ static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
>  	 * So far the best way to work around this issue seems to be draining
>  	 * the GPU from any submitted work.
>  	 */
> -	ret = i915_gem_wait_for_idle(dev_priv, wait_flags);
> +	ret = i915_gem_wait_for_idle(dev_priv,
> +				     wait_flags,
> +				     MAX_SCHEDULE_TIMEOUT);
>  	if (ret)
>  		goto out;
>  
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index 3248369dbcfb..5c2c93cbab12 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -206,7 +206,8 @@ static int reset_all_global_seqno(struct drm_i915_private *i915, u32 seqno)
>  	/* Carefully retire all requests without writing to the rings */
>  	ret = i915_gem_wait_for_idle(i915,
>  				     I915_WAIT_INTERRUPTIBLE |
> -				     I915_WAIT_LOCKED);
> +				     I915_WAIT_LOCKED,
> +				     MAX_SCHEDULE_TIMEOUT);
>  	if (ret)
>  		return ret;
>  
> @@ -735,7 +736,8 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
>  		/* Ratelimit ourselves to prevent oom from malicious clients */
>  		ret = i915_gem_wait_for_idle(i915,
>  					     I915_WAIT_LOCKED |
> -					     I915_WAIT_INTERRUPTIBLE);
> +					     I915_WAIT_INTERRUPTIBLE,
> +					     MAX_SCHEDULE_TIMEOUT);
>  		if (ret)
>  			goto err_unreserve;
>  
> diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
> index 5fbe15f4effd..ab2590242033 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
> @@ -478,7 +478,9 @@ static int __igt_switch_to_kernel_context(struct drm_i915_private *i915,
>  		}
>  	}
>  
> -	err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
> +	err = i915_gem_wait_for_idle(i915,
> +				     I915_WAIT_LOCKED,
> +				     MAX_SCHEDULE_TIMEOUT);
>  	if (err)
>  		return err;
>  
> diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
> index 43995fc3534d..c4aac6141e04 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_request.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_request.c
> @@ -286,7 +286,9 @@ static int begin_live_test(struct live_test *t,
>  	t->func = func;
>  	t->name = name;
>  
> -	err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
> +	err = i915_gem_wait_for_idle(i915,
> +				     I915_WAIT_LOCKED,
> +				     MAX_SCHEDULE_TIMEOUT);
>  	if (err) {
>  		pr_err("%s(%s): failed to idle before, with err=%d!",
>  		       func, name, err);
> diff --git a/drivers/gpu/drm/i915/selftests/igt_flush_test.c b/drivers/gpu/drm/i915/selftests/igt_flush_test.c
> index 0d06f559243f..09ab037ce803 100644
> --- a/drivers/gpu/drm/i915/selftests/igt_flush_test.c
> +++ b/drivers/gpu/drm/i915/selftests/igt_flush_test.c
> @@ -64,7 +64,7 @@ int igt_flush_test(struct drm_i915_private *i915, unsigned int flags)
>  	}
>  
>  	wedge_on_timeout(&w, i915, HZ)
> -		i915_gem_wait_for_idle(i915, flags);
> +		i915_gem_wait_for_idle(i915, flags, MAX_SCHEDULE_TIMEOUT);
>  
>  	return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0;
>  }
> -- 
> 2.18.0


More information about the Intel-gfx mailing list