[Intel-gfx] [PATCH] drm/i915: Prevent lock-cycles between GPU waits and GPU resets

Mika Kuoppala mika.kuoppala at linux.intel.com
Wed Jun 12 11:00:07 UTC 2019


Chris Wilson <chris at chris-wilson.co.uk> writes:

> We cannot allow ourselves to wait on the GPU while holding any lock we

s/we/as we?

My english parser is not strong.

> may need to reset the GPU. While there is not an explicit lock between
> the two operations, lockdep cannot detect the dependency. So let's tell
> lockdep about the wait/reset dependency with an explicit lockmap.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala at linux.intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
> This is *annoyingly* good at detecting lock cycles in GPU reset.
> -Chris
> ---
>  drivers/gpu/drm/i915/gt/intel_reset.c            | 5 ++++-
>  drivers/gpu/drm/i915/i915_drv.h                  | 8 ++++++++
>  drivers/gpu/drm/i915/i915_gem.c                  | 3 +++
>  drivers/gpu/drm/i915/i915_request.c              | 2 ++
>  drivers/gpu/drm/i915/selftests/mock_gem_device.c | 2 ++
>  5 files changed, 19 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
> index 60d24110af80..6368b37f26d1 100644
> --- a/drivers/gpu/drm/i915/gt/intel_reset.c
> +++ b/drivers/gpu/drm/i915/gt/intel_reset.c
> @@ -978,10 +978,11 @@ void i915_reset(struct drm_i915_private *i915,
>  
>  	might_sleep();
>  	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
> +	lock_map_acquire(&i915->gt.reset_lockmap);
>  
>  	/* Clear any previous failed attempts at recovery. Time to try again. */
>  	if (!__i915_gem_unset_wedged(i915))
> -		return;
> +		goto unlock;
>  
>  	if (reason)
>  		dev_notice(i915->drm.dev, "Resetting chip for %s\n", reason);
> @@ -1029,6 +1030,8 @@ void i915_reset(struct drm_i915_private *i915,
>  
>  finish:
>  	reset_finish(i915);
> +unlock:
> +	lock_map_release(&i915->gt.reset_lockmap);
>  	return;

The return patter in this function is rather unorthodox. Might be
even that I reviewed it. Very close that I fell into trap of thinking
that you return with lock held.
>  
>  taint:
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 0ea7f78ae227..9cfa9500fcc4 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1919,6 +1919,14 @@ struct drm_i915_private {
>  		ktime_t last_init_time;
>  
>  		struct i915_vma *scratch;
> +
> +		/*
> +		 * We must never wait on the GPU while holding a lock we may

My english parser still expected 'as' somewhere in there.

Regardless, a neat way to use lockdep to enforce our expectations.

Reviewed-by: Mika Kuoppala <mika.kuoppala at linux.intel.com>

> +		 * need to perform a GPU reset. So while we don't need to
> +		 * serialise wait/reset with an explicit lock, we do want
> +		 * lockdep to detect potential dependency cycles.
> +		 */
> +		struct lockdep_map reset_lockmap;
>  	} gt;
>  
>  	struct {
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index e980c1ee3dcf..24f0f3db1bfb 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1782,6 +1782,7 @@ static void i915_gem_init__mm(struct drm_i915_private *i915)
>  
>  int i915_gem_init_early(struct drm_i915_private *dev_priv)
>  {
> +	static struct lock_class_key reset_key;
>  	int err;
>  
>  	intel_gt_pm_init(dev_priv);
> @@ -1789,6 +1790,8 @@ int i915_gem_init_early(struct drm_i915_private *dev_priv)
>  	INIT_LIST_HEAD(&dev_priv->gt.active_rings);
>  	INIT_LIST_HEAD(&dev_priv->gt.closed_vma);
>  	spin_lock_init(&dev_priv->gt.closed_lock);
> +	lockdep_init_map(&dev_priv->gt.reset_lockmap,
> +			 "i915.reset", &reset_key, 0);
>  
>  	i915_gem_init__mm(dev_priv);
>  	i915_gem_init__pm(dev_priv);
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index 1a948471829d..cd4ce44dc4e5 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -1444,6 +1444,7 @@ long i915_request_wait(struct i915_request *rq,
>  		return -ETIME;
>  
>  	trace_i915_request_wait_begin(rq, flags);
> +	lock_map_acquire(&rq->i915->gt.reset_lockmap);
>  
>  	/*
>  	 * Optimistic spin before touching IRQs.
> @@ -1517,6 +1518,7 @@ long i915_request_wait(struct i915_request *rq,
>  	dma_fence_remove_callback(&rq->fence, &wait.cb);
>  
>  out:
> +	lock_map_release(&rq->i915->gt.reset_lockmap);
>  	trace_i915_request_wait_end(rq);
>  	return timeout;
>  }
> diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> index b7f3fbb4ae89..1e9ffced78c1 100644
> --- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> +++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> @@ -130,6 +130,7 @@ static struct dev_pm_domain pm_domain = {
>  
>  struct drm_i915_private *mock_gem_device(void)
>  {
> +	static struct lock_class_key reset_key;
>  	struct drm_i915_private *i915;
>  	struct pci_dev *pdev;
>  	int err;
> @@ -204,6 +205,7 @@ struct drm_i915_private *mock_gem_device(void)
>  	INIT_LIST_HEAD(&i915->gt.active_rings);
>  	INIT_LIST_HEAD(&i915->gt.closed_vma);
>  	spin_lock_init(&i915->gt.closed_lock);
> +	lockdep_init_map(&i915->gt.reset_lockmap, "i915.reset", &reset_key, 0);
>  
>  	mutex_lock(&i915->drm.struct_mutex);
>  
> -- 
> 2.20.1


More information about the Intel-gfx mailing list