[Intel-gfx] [PATCH 04/25] drm/i915/userptr: Avoid struct_mutex recursion for mmu_invalidate_range_start

Tue Nov 6 12:41:53 UTC 2018

On 02/11/2018 16:12, Chris Wilson wrote:
> Since commit 93065ac753e4 ("mm, oom: distinguish blockable mode for mmu
> notifiers") we have been able to report failure from
> mmu_invalidate_range_start which allows us to use a trylock on the
> struct_mutex to avoid potential recursion and report -EBUSY instead.
> Furthermore, this allows us to pull the work into the main callback and
> avoid the sleight-of-hand in using a workqueue to avoid lockdep.
> 
> However, not all paths to mmu_invalidate_range_start are prepared to
> handle failure, so instead of reporting the recursion, deal with it.

Please add a paragraph on how the patch deals with it.

> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=108375
> References: 93065ac753e4 ("mm, oom: distinguish blockable mode for mmu notifiers")
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_drv.h         |   4 +-
>   drivers/gpu/drm/i915/i915_gem.c         |  18 +-
>   drivers/gpu/drm/i915/i915_gem_object.h  |   7 +
>   drivers/gpu/drm/i915/i915_gem_userptr.c | 217 +++++++++++-------------
>   4 files changed, 120 insertions(+), 126 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 2a88a7eb871b..1056b12c3bc8 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -3073,8 +3073,8 @@ enum i915_mm_subclass { /* lockdep subclass for obj->mm.lock */
>   	I915_MM_SHRINKER
>   };
>   
> -void __i915_gem_object_put_pages(struct drm_i915_gem_object *obj,
> -				 enum i915_mm_subclass subclass);
> +int __i915_gem_object_put_pages(struct drm_i915_gem_object *obj,
> +				enum i915_mm_subclass subclass);
>   void __i915_gem_object_invalidate(struct drm_i915_gem_object *obj);
>   
>   enum i915_map_type {
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 93d09282710d..9a8af9454a53 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2429,8 +2429,8 @@ __i915_gem_object_unset_pages(struct drm_i915_gem_object *obj)
>   	struct sg_table *pages;
>   
>   	pages = fetch_and_zero(&obj->mm.pages);
> -	if (!pages)
> -		return NULL;
> +	if (IS_ERR_OR_NULL(pages))
> +		return pages;
>   
>   	spin_lock(&i915->mm.obj_lock);
>   	list_del(&obj->mm.link);
> @@ -2454,17 +2454,16 @@ __i915_gem_object_unset_pages(struct drm_i915_gem_object *obj)
>   	return pages;
>   }
>   
> -void __i915_gem_object_put_pages(struct drm_i915_gem_object *obj,
> -				 enum i915_mm_subclass subclass)
> +int __i915_gem_object_put_pages(struct drm_i915_gem_object *obj,
> +				enum i915_mm_subclass subclass)
>   {
>   	struct sg_table *pages;
> +	int ret = -EBUSY;
>   
>   	if (i915_gem_object_has_pinned_pages(obj))
> -		return;
> +		return -EBUSY;

Could return ret since you just loaded it with -EBUSY but not sure if it 
makes any difference.

>   
>   	GEM_BUG_ON(obj->bind_count);
> -	if (!i915_gem_object_has_pages(obj))
> -		return;
>   
>   	/* May be called by shrinker from within get_pages() (on another bo) */
>   	mutex_lock_nested(&obj->mm.lock, subclass);
> @@ -2477,11 +2476,16 @@ void __i915_gem_object_put_pages(struct drm_i915_gem_object *obj,
>   	 * lists early.
>   	 */
>   	pages = __i915_gem_object_unset_pages(obj);
> +	if (!pages && !i915_gem_object_needs_async_cancel(obj))

Would async_put_pages be better?

> +		pages = ERR_PTR(-EINVAL);
>   	if (!IS_ERR(pages))
>   		obj->ops->put_pages(obj, pages);
>   
> +	ret = 0;

Funnily enough, no callers seem to be bothered with the return code from 
this function.

>   unlock:
>   	mutex_unlock(&obj->mm.lock);
> +
> +	return ret;
>   }
>   
>   bool i915_sg_trim(struct sg_table *orig_st)
> diff --git a/drivers/gpu/drm/i915/i915_gem_object.h b/drivers/gpu/drm/i915/i915_gem_object.h
> index a6dd7c46de0d..49ce797173b5 100644
> --- a/drivers/gpu/drm/i915/i915_gem_object.h
> +++ b/drivers/gpu/drm/i915/i915_gem_object.h
> @@ -56,6 +56,7 @@ struct drm_i915_gem_object_ops {
>   #define I915_GEM_OBJECT_HAS_STRUCT_PAGE	BIT(0)
>   #define I915_GEM_OBJECT_IS_SHRINKABLE	BIT(1)
>   #define I915_GEM_OBJECT_IS_PROXY	BIT(2)
> +#define I915_GEM_OBJECT_ASYNC_CANCEL	BIT(3)

Somewhere we need a comment blurb explaining what async_cancel means.

>   
>   	/* Interface between the GEM object and its backing storage.
>   	 * get_pages() is called once prior to the use of the associated set
> @@ -386,6 +387,12 @@ i915_gem_object_is_proxy(const struct drm_i915_gem_object *obj)
>   	return obj->ops->flags & I915_GEM_OBJECT_IS_PROXY;
>   }
>   
> +static inline bool
> +i915_gem_object_needs_async_cancel(const struct drm_i915_gem_object *obj)
> +{
> +	return obj->ops->flags & I915_GEM_OBJECT_ASYNC_CANCEL;
> +}
> +
>   static inline bool
>   i915_gem_object_is_active(const struct drm_i915_gem_object *obj)
>   {
> diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
> index 2c9b284036d1..ab5ae426e27b 100644
> --- a/drivers/gpu/drm/i915/i915_gem_userptr.c
> +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
> @@ -50,79 +50,84 @@ struct i915_mmu_notifier {
>   	struct hlist_node node;
>   	struct mmu_notifier mn;
>   	struct rb_root_cached objects;
> -	struct workqueue_struct *wq;
> +	struct i915_mm_struct *mm;
>   };
>   
>   struct i915_mmu_object {
>   	struct i915_mmu_notifier *mn;
>   	struct drm_i915_gem_object *obj;
>   	struct interval_tree_node it;
> -	struct list_head link;
> -	struct work_struct work;
> -	bool attached;
>   };
>   
> -static void cancel_userptr(struct work_struct *work)
> -{
> -	struct i915_mmu_object *mo = container_of(work, typeof(*mo), work);
> -	struct drm_i915_gem_object *obj = mo->obj;
> -	struct work_struct *active;
> -
> -	/* Cancel any active worker and force us to re-evaluate gup */
> -	mutex_lock(&obj->mm.lock);
> -	active = fetch_and_zero(&obj->userptr.work);
> -	mutex_unlock(&obj->mm.lock);
> -	if (active)
> -		goto out;
> -
> -	i915_gem_object_wait(obj, I915_WAIT_ALL, MAX_SCHEDULE_TIMEOUT, NULL);
> -
> -	mutex_lock(&obj->base.dev->struct_mutex);
> -
> -	/* We are inside a kthread context and can't be interrupted */
> -	if (i915_gem_object_unbind(obj) == 0)
> -		__i915_gem_object_put_pages(obj, I915_MM_NORMAL);
> -	WARN_ONCE(i915_gem_object_has_pages(obj),
> -		  "Failed to release pages: bind_count=%d, pages_pin_count=%d, pin_global=%d\n",
> -		  obj->bind_count,
> -		  atomic_read(&obj->mm.pages_pin_count),
> -		  obj->pin_global);
> -
> -	mutex_unlock(&obj->base.dev->struct_mutex);
> -
> -out:
> -	i915_gem_object_put(obj);
> -}
> -
>   static void add_object(struct i915_mmu_object *mo)
>   {
> -	if (mo->attached)
> +	if (!RB_EMPTY_NODE(&mo->it.rb))
>   		return;
>   
>   	interval_tree_insert(&mo->it, &mo->mn->objects);
> -	mo->attached = true;
>   }
>   
>   static void del_object(struct i915_mmu_object *mo)
>   {
> -	if (!mo->attached)
> +	if (RB_EMPTY_NODE(&mo->it.rb))
>   		return;
>   
>   	interval_tree_remove(&mo->it, &mo->mn->objects);
> -	mo->attached = false;
> +	RB_CLEAR_NODE(&mo->it.rb);
> +}
> +
> +static void
> +__i915_gem_userptr_set_active(struct drm_i915_gem_object *obj, bool value)
> +{
> +	struct i915_mmu_object *mo = obj->userptr.mmu_object;
> +
> +	/*
> +	 * During mm_invalidate_range we need to cancel any userptr that
> +	 * overlaps the range being invalidated. Doing so requires the
> +	 * struct_mutex, and that risks recursion. In order to cause
> +	 * recursion, the user must alias the userptr address space with
> +	 * a GTT mmapping (possible with a MAP_FIXED) - then when we have
> +	 * to invalidate that mmaping, mm_invalidate_range is called with
> +	 * the userptr address *and* the struct_mutex held.  To prevent that
> +	 * we set a flag under the i915_mmu_notifier spinlock to indicate
> +	 * whether this object is valid.
> +	 */
> +	if (!mo)
> +		return;
> +
> +	spin_lock(&mo->mn->lock);
> +	if (value)
> +		add_object(mo);
> +	else
> +		del_object(mo);
> +	spin_unlock(&mo->mn->lock);
> +}
> +
> +static struct mutex *__i915_mutex_lock_recursive(struct mutex *m)
> +{
> +	switch (mutex_trylock_recursive(m)) {
> +	default:
> +	case MUTEX_TRYLOCK_FAILED:
> +		mutex_lock(m);
> +	case MUTEX_TRYLOCK_SUCCESS:
> +		return m;
> +
> +	case MUTEX_TRYLOCK_RECURSIVE:
> +		return ERR_PTR(-EEXIST); > +	}
>   }
>   
>   static int i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
> -						       struct mm_struct *mm,
> -						       unsigned long start,
> -						       unsigned long end,
> -						       bool blockable)
> +						      struct mm_struct *mm,
> +						      unsigned long start,
> +						      unsigned long end,
> +						      bool blockable)
>   {
>   	struct i915_mmu_notifier *mn =
>   		container_of(_mn, struct i915_mmu_notifier, mn);
> -	struct i915_mmu_object *mo;
>   	struct interval_tree_node *it;
> -	LIST_HEAD(cancelled);
> +	struct mutex *unlock = NULL;
> +	int ret = 0;
>   
>   	if (RB_EMPTY_ROOT(&mn->objects.rb_root))
>   		return 0;
> @@ -133,11 +138,15 @@ static int i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
>   	spin_lock(&mn->lock);
>   	it = interval_tree_iter_first(&mn->objects, start, end);
>   	while (it) {
> +		struct drm_i915_gem_object *obj;
> +
>   		if (!blockable) {
> -			spin_unlock(&mn->lock);
> -			return -EAGAIN;
> +			ret = -EAGAIN;
> +			break;
>   		}
> -		/* The mmu_object is released late when destroying the
> +
> +		/*
> +		 * The mmu_object is released late when destroying the
>   		 * GEM object so it is entirely possible to gain a
>   		 * reference on an object in the process of being freed
>   		 * since our serialisation is via the spinlock and not
> @@ -146,21 +155,33 @@ static int i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
>   		 * use-after-free we only acquire a reference on the
>   		 * object if it is not in the process of being destroyed.
>   		 */
> -		mo = container_of(it, struct i915_mmu_object, it);
> -		if (kref_get_unless_zero(&mo->obj->base.refcount))
> -			queue_work(mn->wq, &mo->work);
> -
> -		list_add(&mo->link, &cancelled);
> -		it = interval_tree_iter_next(it, start, end);
> +		obj = container_of(it, struct i915_mmu_object, it)->obj;
> +		if (!kref_get_unless_zero(&obj->base.refcount)) {
> +			it = interval_tree_iter_next(it, start, end);
> +			continue;
> +		}
> +		spin_unlock(&mn->lock); > +
> +		if (!unlock)
> +			unlock = __i915_mutex_lock_recursive(&mn->mm->i915->drm.struct_mutex);
> +		ret = i915_gem_object_unbind(obj);
> +		if (ret == 0)
> +			ret = __i915_gem_object_put_pages(obj, I915_MM_SHRINKER);

When gup_worker has been scheduled but not executed, we'll have object 
in the interval tree with pages == -EINVAL. __i915_gem_object_put_pages 
will therefore skip the call user_ptr->put_pages which would cancel the 
worker. So canceling the worker seems will be missed under these 
circumstances.

I think at least, if I haven't lost myself in the flows here..

I am also worried about the lock dropping a few lines above. It's 
definitely correct to restart every time, but can we get in a situation 
where we never exit the loop? Maybe not even because new objects are 
coming along, but because for some reason we wouldn't be deleting all 
objects. Because AFAICT we can only exit the loop when all intervals 
have been deleted or an error happened on some object. Maybe 
kref_get_unless_zero failing? That could be RCU period (long) and this 
loop would essentially busy loop for that time.

Regards,

Tvrtko

> +		i915_gem_object_put(obj);
> +		if (ret)
> +			goto unlock;
> +
> +		spin_lock(&mn->lock);
> +		it = interval_tree_iter_first(&mn->objects, start, end);
>   	}
> -	list_for_each_entry(mo, &cancelled, link)
> -		del_object(mo);
>   	spin_unlock(&mn->lock);
>   
> -	if (!list_empty(&cancelled))
> -		flush_workqueue(mn->wq);
> +unlock:
> +	if (!IS_ERR_OR_NULL(unlock))
> +		mutex_unlock(unlock);
> +
> +	return ret;
>   
> -	return 0;
>   }
>   
>   static const struct mmu_notifier_ops i915_gem_userptr_notifier = {
> @@ -168,7 +189,7 @@ static const struct mmu_notifier_ops i915_gem_userptr_notifier = {
>   };
>   
>   static struct i915_mmu_notifier *
> -i915_mmu_notifier_create(struct mm_struct *mm)
> +i915_mmu_notifier_create(struct i915_mm_struct *mm)
>   {
>   	struct i915_mmu_notifier *mn;
>   
> @@ -179,13 +200,7 @@ i915_mmu_notifier_create(struct mm_struct *mm)
>   	spin_lock_init(&mn->lock);
>   	mn->mn.ops = &i915_gem_userptr_notifier;
>   	mn->objects = RB_ROOT_CACHED;
> -	mn->wq = alloc_workqueue("i915-userptr-release",
> -				 WQ_UNBOUND | WQ_MEM_RECLAIM,
> -				 0);
> -	if (mn->wq == NULL) {
> -		kfree(mn);
> -		return ERR_PTR(-ENOMEM);
> -	}
> +	mn->mm = mm;
>   
>   	return mn;
>   }
> @@ -195,16 +210,14 @@ i915_gem_userptr_release__mmu_notifier(struct drm_i915_gem_object *obj)
>   {
>   	struct i915_mmu_object *mo;
>   
> -	mo = obj->userptr.mmu_object;
> -	if (mo == NULL)
> +	mo = fetch_and_zero(&obj->userptr.mmu_object);
> +	if (!mo)
>   		return;
>   
>   	spin_lock(&mo->mn->lock);
>   	del_object(mo);
>   	spin_unlock(&mo->mn->lock);
>   	kfree(mo);
> -
> -	obj->userptr.mmu_object = NULL;
>   }
>   
>   static struct i915_mmu_notifier *
> @@ -217,7 +230,7 @@ i915_mmu_notifier_find(struct i915_mm_struct *mm)
>   	if (mn)
>   		return mn;
>   
> -	mn = i915_mmu_notifier_create(mm->mm);
> +	mn = i915_mmu_notifier_create(mm);
>   	if (IS_ERR(mn))
>   		err = PTR_ERR(mn);
>   
> @@ -240,10 +253,8 @@ i915_mmu_notifier_find(struct i915_mm_struct *mm)
>   	mutex_unlock(&mm->i915->mm_lock);
>   	up_write(&mm->mm->mmap_sem);
>   
> -	if (mn && !IS_ERR(mn)) {
> -		destroy_workqueue(mn->wq);
> +	if (mn && !IS_ERR(mn))
>   		kfree(mn);
> -	}
>   
>   	return err ? ERR_PTR(err) : mm->mn;
>   }
> @@ -266,14 +277,14 @@ i915_gem_userptr_init__mmu_notifier(struct drm_i915_gem_object *obj,
>   		return PTR_ERR(mn);
>   
>   	mo = kzalloc(sizeof(*mo), GFP_KERNEL);
> -	if (mo == NULL)
> +	if (!mo)
>   		return -ENOMEM;
>   
>   	mo->mn = mn;
>   	mo->obj = obj;
>   	mo->it.start = obj->userptr.ptr;
>   	mo->it.last = obj->userptr.ptr + obj->base.size - 1;
> -	INIT_WORK(&mo->work, cancel_userptr);
> +	RB_CLEAR_NODE(&mo->it.rb);
>   
>   	obj->userptr.mmu_object = mo;
>   	return 0;
> @@ -287,12 +298,16 @@ i915_mmu_notifier_free(struct i915_mmu_notifier *mn,
>   		return;
>   
>   	mmu_notifier_unregister(&mn->mn, mm);
> -	destroy_workqueue(mn->wq);
>   	kfree(mn);
>   }
>   
>   #else
>   
> +static void
> +__i915_gem_userptr_set_active(struct drm_i915_gem_object *obj, bool value)
> +{
> +}
> +
>   static void
>   i915_gem_userptr_release__mmu_notifier(struct drm_i915_gem_object *obj)
>   {
> @@ -461,42 +476,6 @@ __i915_gem_userptr_alloc_pages(struct drm_i915_gem_object *obj,
>   	return st;
>   }
>   
> -static int
> -__i915_gem_userptr_set_active(struct drm_i915_gem_object *obj,
> -			      bool value)
> -{
> -	int ret = 0;
> -
> -	/* During mm_invalidate_range we need to cancel any userptr that
> -	 * overlaps the range being invalidated. Doing so requires the
> -	 * struct_mutex, and that risks recursion. In order to cause
> -	 * recursion, the user must alias the userptr address space with
> -	 * a GTT mmapping (possible with a MAP_FIXED) - then when we have
> -	 * to invalidate that mmaping, mm_invalidate_range is called with
> -	 * the userptr address *and* the struct_mutex held.  To prevent that
> -	 * we set a flag under the i915_mmu_notifier spinlock to indicate
> -	 * whether this object is valid.
> -	 */
> -#if defined(CONFIG_MMU_NOTIFIER)
> -	if (obj->userptr.mmu_object == NULL)
> -		return 0;
> -
> -	spin_lock(&obj->userptr.mmu_object->mn->lock);
> -	/* In order to serialise get_pages with an outstanding
> -	 * cancel_userptr, we must drop the struct_mutex and try again.
> -	 */
> -	if (!value)
> -		del_object(obj->userptr.mmu_object);
> -	else if (!work_pending(&obj->userptr.mmu_object->work))
> -		add_object(obj->userptr.mmu_object);
> -	else
> -		ret = -EAGAIN;
> -	spin_unlock(&obj->userptr.mmu_object->mn->lock);
> -#endif
> -
> -	return ret;
> -}
> -
>   static void
>   __i915_gem_userptr_get_pages_worker(struct work_struct *_work)
>   {
> @@ -682,8 +661,11 @@ i915_gem_userptr_put_pages(struct drm_i915_gem_object *obj,
>   	struct sgt_iter sgt_iter;
>   	struct page *page;
>   
> -	BUG_ON(obj->userptr.work != NULL);
> +	/* Cancel any inflight work and force them to restart their gup */
> +	obj->userptr.work = NULL;
>   	__i915_gem_userptr_set_active(obj, false);
> +	if (!pages)
> +		return;
>   
>   	if (obj->mm.madv != I915_MADV_WILLNEED)
>   		obj->mm.dirty = false;
> @@ -721,7 +703,8 @@ i915_gem_userptr_dmabuf_export(struct drm_i915_gem_object *obj)
>   
>   static const struct drm_i915_gem_object_ops i915_gem_userptr_ops = {
>   	.flags = I915_GEM_OBJECT_HAS_STRUCT_PAGE |
> -		 I915_GEM_OBJECT_IS_SHRINKABLE,
> +		 I915_GEM_OBJECT_IS_SHRINKABLE |
> +		 I915_GEM_OBJECT_ASYNC_CANCEL,
>   	.get_pages = i915_gem_userptr_get_pages,
>   	.put_pages = i915_gem_userptr_put_pages,
>   	.dmabuf_export = i915_gem_userptr_dmabuf_export,
>