[Intel-gfx] [PATCH v4 2/3] drm/i915: Fix userptr deadlock with aliased GTT mmappings
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Thu Sep 10 03:25:26 PDT 2015
On 09/10/2015 10:51 AM, Chris Wilson wrote:
> Michał Winiarski found a really evil way to trigger a struct_mutex
> deadlock with userptr. He found that if he allocated a userptr bo and
> then GTT mmaped another bo, or even itself, at the same address as the
> userptr using MAP_FIXED, he could then cause a deadlock any time we then
> had to invalidate the GTT mmappings (so at will). Tvrtko then found by
> repeatedly allocating GTT mmappings he could alias with an old userptr
> mmap and also trigger the deadlock.
>
> To counteract the deadlock, we make the observation that we only need
> to take the struct_mutex if the object has any pages to revoke, and that
> before userspace can alias with the userptr address space, it must have
> invalidated the userptr->pages. Thus if we can check for those pages
> outside of the struct_mutex, we can avoid the deadlock. To do so we
> introduce a separate flag for userptr objects that we can inspect from
> the mmu-notifier underneath its spinlock.
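This is the crux of the fix: the notifier's fast path only has to peek at a
flag under its own spinlock before committing to struct_mutex. Condensed from
the hunks below (just the shape of the check, not a standalone build), with
the names taken from the patch:

	spin_lock(&mn->lock);
	it = interval_tree_iter_first(&mn->objects, start, end);
	if (it != NULL) {
		struct i915_mmu_object *mo =
			container_of(it, struct i915_mmu_object, it);

		/* Only an object that still owns pages (mo->active) needs
		 * struct_mutex taken to revoke them; an aliased GTT mmap
		 * has already invalidated userptr->pages, so we can skip
		 * it and avoid the recursive deadlock.
		 */
		if (mo->active &&
		    kref_get_unless_zero(&mo->obj->base.refcount))
			obj = mo->obj;
	}
	spin_unlock(&mn->lock);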
>
> The patch makes one eye-catching change. That is the removal of serial=0
> after detecting a to-be-freed object inside the invalidate walker. I
> felt setting serial=0 was a questionable pessimisation: it denies us the
> chance to reuse the current iterator for the next loop (before it is
> freed) and being explicit makes the reader question the validity of the
> locking (since the object-free race could occur elsewhere). The
> serialisation of the iterator is through the spinlock; if the object is
> freed before the next loop then notifier.serial will be incremented
> and we start the walk from the beginning as we detect the invalid cache.
>
> To try and tame the error paths and interactions with the userptr->active
> flag, we have to do a fair amount of rearranging of get_pages_userptr().
>
> v2: Grammar fixes
> v3: Reorder set-active so that it is only set when obj->pages is set
> (and so needs cancellation). Only the order of setting obj->pages and
> the active-flag is crucial. Calling gup after invalidate-range begin
> means the userptr sees the new set of backing storage (and so will not
> need to invalidate its new pages), but we have to be careful not to set
> the active-flag prior to successfully establishing obj->pages.
> v4: Take the active flag early so we know in the mmu-notifier when we
> have to cancel a pending gup-worker.
> v5: Rearrange the error path so that it is not so convoluted
>
> Reported-by: Michał Winiarski <michal.winiarski at intel.com>
> Testcase: igt/gem_userptr_blits/map-fixed*
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Michał Winiarski <michal.winiarski at intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> Cc: stable at vger.kernel.org
> ---
> drivers/gpu/drm/i915/i915_gem_userptr.c | 175 ++++++++++++++++++++------------
> 1 file changed, 109 insertions(+), 66 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
> index 800a5394aa1e..505c608ade7e 100644
> --- a/drivers/gpu/drm/i915/i915_gem_userptr.c
> +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
> @@ -59,6 +59,7 @@ struct i915_mmu_object {
> struct interval_tree_node it;
> struct list_head link;
> struct drm_i915_gem_object *obj;
> + bool active;
> bool is_linear;
> };
>
> @@ -114,7 +115,8 @@ restart:
>
> obj = mo->obj;
>
> - if (!kref_get_unless_zero(&obj->base.refcount))
> + if (!mo->active ||
> + !kref_get_unless_zero(&obj->base.refcount))
> continue;
>
> spin_unlock(&mn->lock);
> @@ -151,7 +153,8 @@ static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
> else
> it = interval_tree_iter_first(&mn->objects, start, end);
> if (it != NULL) {
> - obj = container_of(it, struct i915_mmu_object, it)->obj;
> + struct i915_mmu_object *mo =
> + container_of(it, struct i915_mmu_object, it);
>
> /* The mmu_object is released late when destroying the
> * GEM object so it is entirely possible to gain a
> @@ -160,11 +163,9 @@ static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
> * the struct_mutex - and consequently use it after it
> * is freed and then double free it.
> */
> - if (!kref_get_unless_zero(&obj->base.refcount)) {
> - spin_unlock(&mn->lock);
> - serial = 0;
> - continue;
> - }
> + if (mo->active &&
> + kref_get_unless_zero(&mo->obj->base.refcount))
> + obj = mo->obj;
>
> serial = mn->serial;
> }
> @@ -566,6 +567,30 @@ __i915_gem_userptr_set_pages(struct drm_i915_gem_object *obj,
> }
>
> static void
> +__i915_gem_userptr_set_active(struct drm_i915_gem_object *obj,
> + bool value)
> +{
> + /* During mm_invalidate_range we need to cancel any userptr that
> + * overlaps the range being invalidated. Doing so requires the
> + * struct_mutex, and that risks recursion. In order to cause
> + * recursion, the user must alias the userptr address space with
> + * a GTT mmapping (possible with a MAP_FIXED) - then when we have
> + * to invalidate that mmaping, mm_invalidate_range is called with
> + * the userptr address *and* the struct_mutex held. To prevent that
> + * we set a flag under the i915_mmu_notifier spinlock to indicate
> + * whether this object is valid.
> + */
> +#if defined(CONFIG_MMU_NOTIFIER)
> + if (obj->userptr.mmu_object == NULL)
> + return;
> +
> + spin_lock(&obj->userptr.mmu_object->mn->lock);
> + obj->userptr.mmu_object->active = value;
> + spin_unlock(&obj->userptr.mmu_object->mn->lock);
> +#endif
> +}
> +
> +static void
> __i915_gem_userptr_get_pages_worker(struct work_struct *_work)
> {
> struct get_pages_work *work = container_of(_work, typeof(*work), work);
> @@ -613,6 +638,8 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work)
> }
> }
> obj->userptr.work = ERR_PTR(ret);
> + if (ret)
> + __i915_gem_userptr_set_active(obj, false);
> }
>
> obj->userptr.workers--;
> @@ -627,11 +654,60 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work)
> }
>
> static int
> +__i915_gem_userptr_get_pages_schedule(struct drm_i915_gem_object *obj,
> + bool *active)
> +{
> + struct get_pages_work *work;
> +
> + /* Spawn a worker so that we can acquire the
> + * user pages without holding our mutex. Access
> + * to the user pages requires mmap_sem, and we have
> + * a strict lock ordering of mmap_sem, struct_mutex -
> + * we already hold struct_mutex here and so cannot
> + * call gup without encountering a lock inversion.
> + *
> + * Userspace will keep on repeating the operation
> + * (thanks to EAGAIN) until either we hit the fast
> + * path or the worker completes. If the worker is
> + * cancelled or superseded, the task is still run
> + * but the results ignored. (This leads to
> + * complications that we may have a stray object
> + * refcount that we need to be wary of when
> + * checking for existing objects during creation.)
> + * If the worker encounters an error, it reports
> + * that error back to this function through
> + * obj->userptr.work = ERR_PTR.
> + */
> + if (obj->userptr.workers >= I915_GEM_USERPTR_MAX_WORKERS)
> + return -EAGAIN;
> +
> + work = kmalloc(sizeof(*work), GFP_KERNEL);
> + if (work == NULL)
> + return -ENOMEM;
> +
> + obj->userptr.work = &work->work;
> + obj->userptr.workers++;
> +
> + work->obj = obj;
> + drm_gem_object_reference(&obj->base);
> +
> + work->task = current;
> + get_task_struct(work->task);
> +
> + INIT_WORK(&work->work, __i915_gem_userptr_get_pages_worker);
> + schedule_work(&work->work);
> +
> + *active = true;
> + return -EAGAIN;
> +}
> +
> +static int
> i915_gem_userptr_get_pages(struct drm_i915_gem_object *obj)
> {
> const int num_pages = obj->base.size >> PAGE_SHIFT;
> struct page **pvec;
> int pinned, ret;
> + bool active;
>
> /* If userspace should engineer that these pages are replaced in
> * the vma between us binding this page into the GTT and completion
> @@ -649,6 +725,18 @@ i915_gem_userptr_get_pages(struct drm_i915_gem_object *obj)
> * to the vma (discard or cloning) which should prevent the more
> * egregious cases from causing harm.
> */
> + if (IS_ERR(obj->userptr.work)) {
> + /* active flag will have been dropped already by the worker */
> + ret = PTR_ERR(obj->userptr.work);
> + obj->userptr.work = NULL;
> + return ret;
> + }
> + if (obj->userptr.work)
> + /* active flag should still be held for the pending work */
> + return -EAGAIN;
> +
> + /* Let the mmu-notifier know that we have begun and need cancellation */
> + __i915_gem_userptr_set_active(obj, true);
>
> pvec = NULL;
> pinned = 0;
> @@ -657,73 +745,27 @@ i915_gem_userptr_get_pages(struct drm_i915_gem_object *obj)
> GFP_TEMPORARY | __GFP_NOWARN | __GFP_NORETRY);
> if (pvec == NULL) {
> pvec = drm_malloc_ab(num_pages, sizeof(struct page *));
> - if (pvec == NULL)
> + if (pvec == NULL) {
> + __i915_gem_userptr_set_active(obj, false);
> return -ENOMEM;
> + }
> }
>
> pinned = __get_user_pages_fast(obj->userptr.ptr, num_pages,
> !obj->userptr.read_only, pvec);
> }
> - if (pinned < num_pages) {
> - if (pinned < 0) {
> - ret = pinned;
> - pinned = 0;
> - } else {
> - /* Spawn a worker so that we can acquire the
> - * user pages without holding our mutex. Access
> - * to the user pages requires mmap_sem, and we have
> - * a strict lock ordering of mmap_sem, struct_mutex -
> - * we already hold struct_mutex here and so cannot
> - * call gup without encountering a lock inversion.
> - *
> - * Userspace will keep on repeating the operation
> - * (thanks to EAGAIN) until either we hit the fast
> - * path or the worker completes. If the worker is
> - * cancelled or superseded, the task is still run
> - * but the results ignored. (This leads to
> - * complications that we may have a stray object
> - * refcount that we need to be wary of when
> - * checking for existing objects during creation.)
> - * If the worker encounters an error, it reports
> - * that error back to this function through
> - * obj->userptr.work = ERR_PTR.
> - */
> - ret = -EAGAIN;
> - if (obj->userptr.work == NULL &&
> - obj->userptr.workers < I915_GEM_USERPTR_MAX_WORKERS) {
> - struct get_pages_work *work;
> -
> - work = kmalloc(sizeof(*work), GFP_KERNEL);
> - if (work != NULL) {
> - obj->userptr.work = &work->work;
> - obj->userptr.workers++;
> -
> - work->obj = obj;
> - drm_gem_object_reference(&obj->base);
> -
> - work->task = current;
> - get_task_struct(work->task);
> -
> - INIT_WORK(&work->work, __i915_gem_userptr_get_pages_worker);
> - schedule_work(&work->work);
> - } else
> - ret = -ENOMEM;
> - } else {
> - if (IS_ERR(obj->userptr.work)) {
> - ret = PTR_ERR(obj->userptr.work);
> - obj->userptr.work = NULL;
> - }
> - }
> - }
> - } else {
> +
> + active = false;
> + if (pinned < 0)
> + ret = pinned;
> + else if (pinned < num_pages)
> + ret = __i915_gem_userptr_get_pages_schedule(obj, &active);
> + else
> ret = __i915_gem_userptr_set_pages(obj, pvec, num_pages);
> - if (ret == 0) {
> - obj->userptr.work = NULL;
> - pinned = 0;
> - }
> + if (ret) {
> + __i915_gem_userptr_set_active(obj, active);
> + release_pages(pvec, pinned, 0);
I think it would be safer to set pinned to zero in the pinned < 0 case,
rather than relying on release_pages() implementation knowledge (see the
sketch below). Otherwise it is too fragile.
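Something along these lines (untested, against the quoted hunk) is what I
have in mind:

	active = false;
	if (pinned < 0) {
		ret = pinned;
		pinned = 0;	/* nothing was pinned; don't pass -errno to release_pages() */
	} else if (pinned < num_pages) {
		ret = __i915_gem_userptr_get_pages_schedule(obj, &active);
	} else {
		ret = __i915_gem_userptr_set_pages(obj, pvec, num_pages);
	}
	if (ret) {
		__i915_gem_userptr_set_active(obj, active);
		release_pages(pvec, pinned, 0);
	}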
If you can put that back you can put my R-B on the patch.
Regards,
Tvrtko