[Intel-gfx] [PATCH 3/3] drm/i915/userptr: Avoid struct_mutex recursion for mmu_invalidate_range_start

Thu Nov 8 16:45:01 UTC 2018

On 08/11/2018 08:17, Chris Wilson wrote:
> Since commit 93065ac753e4 ("mm, oom: distinguish blockable mode for mmu
> notifiers") we have been able to report failure from
> mmu_invalidate_range_start which allows us to use a trylock on the
> struct_mutex to avoid potential recursion and report -EBUSY instead.
> Furthermore, this allows us to pull the work into the main callback and
> avoid the sleight-of-hand in using a workqueue to avoid lockdep.
> 
> However, not all paths to mmu_invalidate_range_start are prepared to
> handle failure, so instead of reporting the recursion, deal with it by
> propagating the failure upwards, who can decide themselves to handle it
> or report it.
> 
> v2: Mark up the recursive lock behaviour and comment on the various weak
> points.
> 
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=108375
> References: 93065ac753e4 ("mm, oom: distinguish blockable mode for mmu notifiers")
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
>   drivers/gpu/drm/i915/i915_drv.h         |   4 +-
>   drivers/gpu/drm/i915/i915_gem.c         |  30 +++-
>   drivers/gpu/drm/i915/i915_gem_object.h  |   7 +
>   drivers/gpu/drm/i915/i915_gem_userptr.c | 221 +++++++++++-------------
>   4 files changed, 136 insertions(+), 126 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 0c8438de3c1b..b95afa05976d 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -3076,8 +3076,8 @@ enum i915_mm_subclass { /* lockdep subclass for obj->mm.lock */
>   	I915_MM_SHRINKER
>   };
>   
> -void __i915_gem_object_put_pages(struct drm_i915_gem_object *obj,
> -				 enum i915_mm_subclass subclass);
> +int __i915_gem_object_put_pages(struct drm_i915_gem_object *obj,
> +				enum i915_mm_subclass subclass);
>   void __i915_gem_object_invalidate(struct drm_i915_gem_object *obj);
>   
>   enum i915_map_type {
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index dc120b5d8e05..53fb3605c4df 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2447,8 +2447,8 @@ __i915_gem_object_unset_pages(struct drm_i915_gem_object *obj)
>   	struct sg_table *pages;
>   
>   	pages = fetch_and_zero(&obj->mm.pages);
> -	if (!pages)
> -		return NULL;
> +	if (IS_ERR_OR_NULL(pages))

Where can we set pages to some ERR_PTR value?

> +		return pages;
>   
>   	spin_lock(&i915->mm.obj_lock);
>   	list_del(&obj->mm.link);
> @@ -2472,22 +2472,23 @@ __i915_gem_object_unset_pages(struct drm_i915_gem_object *obj)
>   	return pages;
>   }
>   
> -void __i915_gem_object_put_pages(struct drm_i915_gem_object *obj,
> -				 enum i915_mm_subclass subclass)
> +int __i915_gem_object_put_pages(struct drm_i915_gem_object *obj,
> +				enum i915_mm_subclass subclass)
>   {
>   	struct sg_table *pages;
> +	int ret;
>   
>   	if (i915_gem_object_has_pinned_pages(obj))
> -		return;
> +		return -EBUSY;
>   
>   	GEM_BUG_ON(obj->bind_count);
> -	if (!i915_gem_object_has_pages(obj))
> -		return;
>   
>   	/* May be called by shrinker from within get_pages() (on another bo) */
>   	mutex_lock_nested(&obj->mm.lock, subclass);
> -	if (unlikely(atomic_read(&obj->mm.pages_pin_count)))
> +	if (unlikely(atomic_read(&obj->mm.pages_pin_count))) {
> +		ret = -EBUSY;
>   		goto unlock;
> +	}
>   
>   	/*
>   	 * ->put_pages might need to allocate memory for the bit17 swizzle
> @@ -2495,11 +2496,24 @@ void __i915_gem_object_put_pages(struct drm_i915_gem_object *obj,
>   	 * lists early.
>   	 */
>   	pages = __i915_gem_object_unset_pages(obj);
> +
> +	/*
> +	 * XXX Temporary hijinx to avoid updating all backends to handle
> +	 * NULL pages. In the future, when we have more asynchronous
> +	 * get_pages backends we should be better able to handle the
> +	 * cancellation of the async task in a more uniform manner.
> +	 */
> +	if (!pages && !i915_gem_object_needs_async_cancel(obj))
> +		pages = ERR_PTR(-EINVAL);
> +
>   	if (!IS_ERR(pages))
>   		obj->ops->put_pages(obj, pages);
>   
> +	ret = 0;
>   unlock:
>   	mutex_unlock(&obj->mm.lock);
> +
> +	return ret;
>   }
>   
>   bool i915_sg_trim(struct sg_table *orig_st)
> diff --git a/drivers/gpu/drm/i915/i915_gem_object.h b/drivers/gpu/drm/i915/i915_gem_object.h
> index a6dd7c46de0d..49ce797173b5 100644
> --- a/drivers/gpu/drm/i915/i915_gem_object.h
> +++ b/drivers/gpu/drm/i915/i915_gem_object.h
> @@ -56,6 +56,7 @@ struct drm_i915_gem_object_ops {
>   #define I915_GEM_OBJECT_HAS_STRUCT_PAGE	BIT(0)
>   #define I915_GEM_OBJECT_IS_SHRINKABLE	BIT(1)
>   #define I915_GEM_OBJECT_IS_PROXY	BIT(2)
> +#define I915_GEM_OBJECT_ASYNC_CANCEL	BIT(3)
>   
>   	/* Interface between the GEM object and its backing storage.
>   	 * get_pages() is called once prior to the use of the associated set
> @@ -386,6 +387,12 @@ i915_gem_object_is_proxy(const struct drm_i915_gem_object *obj)
>   	return obj->ops->flags & I915_GEM_OBJECT_IS_PROXY;
>   }
>   
> +static inline bool
> +i915_gem_object_needs_async_cancel(const struct drm_i915_gem_object *obj)
> +{
> +	return obj->ops->flags & I915_GEM_OBJECT_ASYNC_CANCEL;
> +}
> +
>   static inline bool
>   i915_gem_object_is_active(const struct drm_i915_gem_object *obj)
>   {
> diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
> index 2c9b284036d1..8b07fd44731f 100644
> --- a/drivers/gpu/drm/i915/i915_gem_userptr.c
> +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
> @@ -50,79 +50,84 @@ struct i915_mmu_notifier {
>   	struct hlist_node node;
>   	struct mmu_notifier mn;
>   	struct rb_root_cached objects;
> -	struct workqueue_struct *wq;
> +	struct i915_mm_struct *mm;
>   };
>   
>   struct i915_mmu_object {
>   	struct i915_mmu_notifier *mn;
>   	struct drm_i915_gem_object *obj;
>   	struct interval_tree_node it;
> -	struct list_head link;
> -	struct work_struct work;
> -	bool attached;
>   };
>   
> -static void cancel_userptr(struct work_struct *work)
> -{
> -	struct i915_mmu_object *mo = container_of(work, typeof(*mo), work);
> -	struct drm_i915_gem_object *obj = mo->obj;
> -	struct work_struct *active;
> -
> -	/* Cancel any active worker and force us to re-evaluate gup */
> -	mutex_lock(&obj->mm.lock);
> -	active = fetch_and_zero(&obj->userptr.work);
> -	mutex_unlock(&obj->mm.lock);
> -	if (active)
> -		goto out;
> -
> -	i915_gem_object_wait(obj, I915_WAIT_ALL, MAX_SCHEDULE_TIMEOUT, NULL);
> -
> -	mutex_lock(&obj->base.dev->struct_mutex);
> -
> -	/* We are inside a kthread context and can't be interrupted */
> -	if (i915_gem_object_unbind(obj) == 0)
> -		__i915_gem_object_put_pages(obj, I915_MM_NORMAL);
> -	WARN_ONCE(i915_gem_object_has_pages(obj),
> -		  "Failed to release pages: bind_count=%d, pages_pin_count=%d, pin_global=%d\n",
> -		  obj->bind_count,
> -		  atomic_read(&obj->mm.pages_pin_count),
> -		  obj->pin_global);
> -
> -	mutex_unlock(&obj->base.dev->struct_mutex);
> -
> -out:
> -	i915_gem_object_put(obj);
> -}
> -
>   static void add_object(struct i915_mmu_object *mo)
>   {
> -	if (mo->attached)
> +	if (!RB_EMPTY_NODE(&mo->it.rb))
>   		return;
>   
>   	interval_tree_insert(&mo->it, &mo->mn->objects);
> -	mo->attached = true;
>   }
>   
>   static void del_object(struct i915_mmu_object *mo)
>   {
> -	if (!mo->attached)
> +	if (RB_EMPTY_NODE(&mo->it.rb))
>   		return;
>   
>   	interval_tree_remove(&mo->it, &mo->mn->objects);
> -	mo->attached = false;
> +	RB_CLEAR_NODE(&mo->it.rb);
> +}
> +
> +static void
> +__i915_gem_userptr_set_active(struct drm_i915_gem_object *obj, bool value)
> +{
> +	struct i915_mmu_object *mo = obj->userptr.mmu_object;
> +
> +	/*
> +	 * During mm_invalidate_range we need to cancel any userptr that
> +	 * overlaps the range being invalidated. Doing so requires the
> +	 * struct_mutex, and that risks recursion. In order to cause
> +	 * recursion, the user must alias the userptr address space with
> +	 * a GTT mmapping (possible with a MAP_FIXED) - then when we have
> +	 * to invalidate that mmaping, mm_invalidate_range is called with
> +	 * the userptr address *and* the struct_mutex held.  To prevent that
> +	 * we set a flag under the i915_mmu_notifier spinlock to indicate
> +	 * whether this object is valid.
> +	 */
> +	if (!mo)
> +		return;
> +
> +	spin_lock(&mo->mn->lock);
> +	if (value)
> +		add_object(mo);
> +	else
> +		del_object(mo);
> +	spin_unlock(&mo->mn->lock);
> +}
> +
> +static struct mutex *__i915_mutex_lock_recursive(struct mutex *m)
> +{
> +	switch (mutex_trylock_recursive(m)) {
> +	default:
> +	case MUTEX_TRYLOCK_FAILED:
> +		mutex_lock_nested(m, I915_MM_SHRINKER);

I am thinking that perhaps we need a new namespace for struct mutex lock 
nesting, like I915_STRUCT_MUTEX...

And this one would then be called something other than shrinker. Since 
it is not triggered by the shrinker but page migration, right?

> +	case MUTEX_TRYLOCK_SUCCESS:
> +		return m;
> +
> +	case MUTEX_TRYLOCK_RECURSIVE:
> +		return ERR_PTR(-EEXIST);
> +	}
>   }
>   
>   static int i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
> -						       struct mm_struct *mm,
> -						       unsigned long start,
> -						       unsigned long end,
> -						       bool blockable)
> +						      struct mm_struct *mm,
> +						      unsigned long start,
> +						      unsigned long end,
> +						      bool blockable)
>   {
>   	struct i915_mmu_notifier *mn =
>   		container_of(_mn, struct i915_mmu_notifier, mn);
> -	struct i915_mmu_object *mo;
>   	struct interval_tree_node *it;
> -	LIST_HEAD(cancelled);
> +	struct mutex *unlock = NULL;
> +	int ret = 0;
>   
>   	if (RB_EMPTY_ROOT(&mn->objects.rb_root))
>   		return 0;
> @@ -133,11 +138,15 @@ static int i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
>   	spin_lock(&mn->lock);
>   	it = interval_tree_iter_first(&mn->objects, start, end);
>   	while (it) {
> +		struct drm_i915_gem_object *obj;
> +
>   		if (!blockable) {
> -			spin_unlock(&mn->lock);
> -			return -EAGAIN;
> +			ret = -EAGAIN;
> +			break;
>   		}
> -		/* The mmu_object is released late when destroying the
> +
> +		/*
> +		 * The mmu_object is released late when destroying the
>   		 * GEM object so it is entirely possible to gain a
>   		 * reference on an object in the process of being freed
>   		 * since our serialisation is via the spinlock and not
> @@ -146,21 +155,39 @@ static int i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
>   		 * use-after-free we only acquire a reference on the
>   		 * object if it is not in the process of being destroyed.
>   		 */
> -		mo = container_of(it, struct i915_mmu_object, it);
> -		if (kref_get_unless_zero(&mo->obj->base.refcount))
> -			queue_work(mn->wq, &mo->work);
> +		obj = container_of(it, struct i915_mmu_object, it)->obj;
> +		if (!kref_get_unless_zero(&obj->base.refcount)) {
> +			it = interval_tree_iter_next(it, start, end);
> +			continue;
> +		}
> +		spin_unlock(&mn->lock);
>   
> -		list_add(&mo->link, &cancelled);
> -		it = interval_tree_iter_next(it, start, end);
> +		if (!unlock)
> +			unlock = __i915_mutex_lock_recursive(&mn->mm->i915->drm.struct_mutex);
> +		ret = i915_gem_object_unbind(obj);
> +		if (ret == 0)
> +			ret = __i915_gem_object_put_pages(obj, I915_MM_SHRINKER);
> +		i915_gem_object_put(obj);
> +		if (ret)
> +			goto unlock;
> +
> +		spin_lock(&mn->lock);
> +
> +		/*
> +		 * As we do not (yet) protect the mmu from concurrent insertion
> +		 * over this range, there is no guarantee that this search will
> +		 * terminate given a pathologic workload.
> +		 */

Hmm my concern from the other day. But it must exit, some client will 
eventually stall for a bit I think. If not sooner then when everything 
grinds to a halt due everyone being stuck in this loop.

> +		it = interval_tree_iter_first(&mn->objects, start, end);
>   	}
> -	list_for_each_entry(mo, &cancelled, link)
> -		del_object(mo);
>   	spin_unlock(&mn->lock);
>   
> -	if (!list_empty(&cancelled))
> -		flush_workqueue(mn->wq);
> +unlock:
> +	if (!IS_ERR_OR_NULL(unlock))
> +		mutex_unlock(unlock);
> +
> +	return ret;
>   
> -	return 0;
>   }
>   
>   static const struct mmu_notifier_ops i915_gem_userptr_notifier = {
> @@ -168,7 +195,7 @@ static const struct mmu_notifier_ops i915_gem_userptr_notifier = {
>   };
>   
>   static struct i915_mmu_notifier *
> -i915_mmu_notifier_create(struct mm_struct *mm)
> +i915_mmu_notifier_create(struct i915_mm_struct *mm)
>   {
>   	struct i915_mmu_notifier *mn;
>   
> @@ -179,13 +206,7 @@ i915_mmu_notifier_create(struct mm_struct *mm)
>   	spin_lock_init(&mn->lock);
>   	mn->mn.ops = &i915_gem_userptr_notifier;
>   	mn->objects = RB_ROOT_CACHED;
> -	mn->wq = alloc_workqueue("i915-userptr-release",
> -				 WQ_UNBOUND | WQ_MEM_RECLAIM,
> -				 0);
> -	if (mn->wq == NULL) {
> -		kfree(mn);
> -		return ERR_PTR(-ENOMEM);
> -	}
> +	mn->mm = mm;
>   
>   	return mn;
>   }
> @@ -195,16 +216,14 @@ i915_gem_userptr_release__mmu_notifier(struct drm_i915_gem_object *obj)
>   {
>   	struct i915_mmu_object *mo;
>   
> -	mo = obj->userptr.mmu_object;
> -	if (mo == NULL)
> +	mo = fetch_and_zero(&obj->userptr.mmu_object);
> +	if (!mo)
>   		return;
>   
>   	spin_lock(&mo->mn->lock);
>   	del_object(mo);
>   	spin_unlock(&mo->mn->lock);
>   	kfree(mo);
> -
> -	obj->userptr.mmu_object = NULL;
>   }
>   
>   static struct i915_mmu_notifier *
> @@ -217,7 +236,7 @@ i915_mmu_notifier_find(struct i915_mm_struct *mm)
>   	if (mn)
>   		return mn;
>   
> -	mn = i915_mmu_notifier_create(mm->mm);
> +	mn = i915_mmu_notifier_create(mm);
>   	if (IS_ERR(mn))
>   		err = PTR_ERR(mn);
>   
> @@ -240,10 +259,8 @@ i915_mmu_notifier_find(struct i915_mm_struct *mm)
>   	mutex_unlock(&mm->i915->mm_lock);
>   	up_write(&mm->mm->mmap_sem);
>   
> -	if (mn && !IS_ERR(mn)) {
> -		destroy_workqueue(mn->wq);
> +	if (mn && !IS_ERR(mn))
>   		kfree(mn);
> -	}
>   
>   	return err ? ERR_PTR(err) : mm->mn;
>   }
> @@ -266,14 +283,14 @@ i915_gem_userptr_init__mmu_notifier(struct drm_i915_gem_object *obj,
>   		return PTR_ERR(mn);
>   
>   	mo = kzalloc(sizeof(*mo), GFP_KERNEL);
> -	if (mo == NULL)
> +	if (!mo)
>   		return -ENOMEM;
>   
>   	mo->mn = mn;
>   	mo->obj = obj;
>   	mo->it.start = obj->userptr.ptr;
>   	mo->it.last = obj->userptr.ptr + obj->base.size - 1;
> -	INIT_WORK(&mo->work, cancel_userptr);
> +	RB_CLEAR_NODE(&mo->it.rb);
>   
>   	obj->userptr.mmu_object = mo;
>   	return 0;
> @@ -287,12 +304,16 @@ i915_mmu_notifier_free(struct i915_mmu_notifier *mn,
>   		return;
>   
>   	mmu_notifier_unregister(&mn->mn, mm);
> -	destroy_workqueue(mn->wq);
>   	kfree(mn);
>   }
>   
>   #else
>   
> +static void
> +__i915_gem_userptr_set_active(struct drm_i915_gem_object *obj, bool value)
> +{
> +}
> +
>   static void
>   i915_gem_userptr_release__mmu_notifier(struct drm_i915_gem_object *obj)
>   {
> @@ -461,42 +482,6 @@ __i915_gem_userptr_alloc_pages(struct drm_i915_gem_object *obj,
>   	return st;
>   }
>   
> -static int
> -__i915_gem_userptr_set_active(struct drm_i915_gem_object *obj,
> -			      bool value)
> -{
> -	int ret = 0;
> -
> -	/* During mm_invalidate_range we need to cancel any userptr that
> -	 * overlaps the range being invalidated. Doing so requires the
> -	 * struct_mutex, and that risks recursion. In order to cause
> -	 * recursion, the user must alias the userptr address space with
> -	 * a GTT mmapping (possible with a MAP_FIXED) - then when we have
> -	 * to invalidate that mmaping, mm_invalidate_range is called with
> -	 * the userptr address *and* the struct_mutex held.  To prevent that
> -	 * we set a flag under the i915_mmu_notifier spinlock to indicate
> -	 * whether this object is valid.
> -	 */
> -#if defined(CONFIG_MMU_NOTIFIER)
> -	if (obj->userptr.mmu_object == NULL)
> -		return 0;
> -
> -	spin_lock(&obj->userptr.mmu_object->mn->lock);
> -	/* In order to serialise get_pages with an outstanding
> -	 * cancel_userptr, we must drop the struct_mutex and try again.
> -	 */
> -	if (!value)
> -		del_object(obj->userptr.mmu_object);
> -	else if (!work_pending(&obj->userptr.mmu_object->work))
> -		add_object(obj->userptr.mmu_object);
> -	else
> -		ret = -EAGAIN;
> -	spin_unlock(&obj->userptr.mmu_object->mn->lock);
> -#endif
> -
> -	return ret;
> -}
> -
>   static void
>   __i915_gem_userptr_get_pages_worker(struct work_struct *_work)
>   {
> @@ -682,8 +667,11 @@ i915_gem_userptr_put_pages(struct drm_i915_gem_object *obj,
>   	struct sgt_iter sgt_iter;
>   	struct page *page;
>   
> -	BUG_ON(obj->userptr.work != NULL);
> +	/* Cancel any inflight work and force them to restart their gup */
> +	obj->userptr.work = NULL;
>   	__i915_gem_userptr_set_active(obj, false);
> +	if (!pages)
> +		return;
>   
>   	if (obj->mm.madv != I915_MADV_WILLNEED)
>   		obj->mm.dirty = false;
> @@ -721,7 +709,8 @@ i915_gem_userptr_dmabuf_export(struct drm_i915_gem_object *obj)
>   
>   static const struct drm_i915_gem_object_ops i915_gem_userptr_ops = {
>   	.flags = I915_GEM_OBJECT_HAS_STRUCT_PAGE |
> -		 I915_GEM_OBJECT_IS_SHRINKABLE,
> +		 I915_GEM_OBJECT_IS_SHRINKABLE |
> +		 I915_GEM_OBJECT_ASYNC_CANCEL,
>   	.get_pages = i915_gem_userptr_get_pages,
>   	.put_pages = i915_gem_userptr_put_pages,
>   	.dmabuf_export = i915_gem_userptr_dmabuf_export,
> 

Regards,

Tvrtko