[Intel-xe] [PATCH] drm/xe: Drop -EAGAIN userptr invalidation dance from VM bind IOCTL

Thomas Hellström thomas.hellstrom at linux.intel.com
Mon Oct 2 09:00:30 UTC 2023


On 9/27/23 06:31, Matthew Brost wrote:
> This isn't needed as the nexr exec, preempt rebind worker, or page fault
> will issue a rebind. Drop thing complication.

s/thing/the/ ?

Also, considering that not a lot of code can actually be dropped, and 
that this is what enables us to build the page-tables in the IOCTL 
rather than in the rebind worker, perhaps it needs reconsidering? Also 
the prefetch OP will be more likely to succeed....

>
> Signed-off-by: Matthew Brost <matthew.brost at intel.com>
> ---
>   drivers/gpu/drm/xe/xe_pt.c | 61 +-------------------------------------
>   drivers/gpu/drm/xe/xe_vm.c | 10 -------
>   2 files changed, 1 insertion(+), 70 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
> index aec469842471..34954d354013 100644
> --- a/drivers/gpu/drm/xe/xe_pt.c
> +++ b/drivers/gpu/drm/xe/xe_pt.c
> @@ -1042,36 +1042,6 @@ static void xe_vm_dbg_print_entries(struct xe_device *xe,
>   {}
>   #endif
>   
> -#ifdef CONFIG_DRM_XE_USERPTR_INVAL_INJECT
> -
> -static int xe_pt_userptr_inject_eagain(struct xe_vma *vma)
Hmm. Regardless of the way we choose to handle a racing invalidation, 
wouldn't it make sense to keep this, but perhaps rename it to 
inject_inval, so that we keep properly testing racing invalidations? A 
rough sketch of what I mean follows after the removed hunk below.
> -{
> -	u32 divisor = vma->userptr.divisor ? vma->userptr.divisor : 2;
> -	static u32 count;
> -
> -	if (count++ % divisor == divisor - 1) {
> -		struct xe_vm *vm = xe_vma_vm(vma);
> -
> -		vma->userptr.divisor = divisor << 1;
> -		spin_lock(&vm->userptr.invalidated_lock);
> -		list_move_tail(&vma->userptr.invalidate_link,
> -			       &vm->userptr.invalidated);
> -		spin_unlock(&vm->userptr.invalidated_lock);
> -		return true;
> -	}
> -
> -	return false;
> -}
> -
> -#else
> -
> -static bool xe_pt_userptr_inject_eagain(struct xe_vma *vma)
> -{
> -	return false;
> -}
> -
> -#endif
> -
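
Something along these lines is roughly what I mean (untested sketch; 
the inject_inval name is just a suggestion, not from the patch): keep 
the divisor-based injection, but have it only simulate a racing 
invalidation by queueing the VMA on the invalidated list, rather than 
forcing an -EAGAIN:

#ifdef CONFIG_DRM_XE_USERPTR_INVAL_INJECT

/* Periodically pretend a racing invalidation hit this VMA. */
static bool xe_pt_userptr_inject_inval(struct xe_vma *vma)
{
	u32 divisor = vma->userptr.divisor ? vma->userptr.divisor : 2;
	static u32 count;

	if (count++ % divisor == divisor - 1) {
		struct xe_vm *vm = xe_vma_vm(vma);

		vma->userptr.divisor = divisor << 1;
		/* Queue the VMA for rebind, like the real notifier does. */
		spin_lock(&vm->userptr.invalidated_lock);
		list_move_tail(&vma->userptr.invalidate_link,
			       &vm->userptr.invalidated);
		spin_unlock(&vm->userptr.invalidated_lock);
		return true;
	}

	return false;
}

#else

static bool xe_pt_userptr_inject_inval(struct xe_vma *vma)
{
	return false;
}

#endif

The caller would then treat a true return as "proceed, but the binding 
is already known to be stale", instead of bailing out with -EAGAIN.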
>   /**
>    * struct xe_pt_migrate_pt_update - Callback argument for pre-commit callbacks
>    * @base: Base we derive from.
> @@ -1139,7 +1109,6 @@ static int xe_pt_userptr_pre_commit(struct xe_migrate_pt_update *pt_update)
>   	struct xe_pt_migrate_pt_update *userptr_update =
>   		container_of(pt_update, typeof(*userptr_update), base);
>   	struct xe_vma *vma = pt_update->vma;
> -	unsigned long notifier_seq = vma->userptr.notifier_seq;
>   	struct xe_vm *vm = xe_vma_vm(vma);
>   	int err = xe_pt_vm_dependencies(pt_update->job,
>   					&vm->rftree[pt_update->tile_id],
> @@ -1149,36 +1118,8 @@ static int xe_pt_userptr_pre_commit(struct xe_migrate_pt_update *pt_update)
>   	if (err)
>   		return err;
>   
> -	userptr_update->locked = false;
> -
> -	/*
> -	 * Wait until nobody is running the invalidation notifier, and
> -	 * since we're exiting the loop holding the notifier lock,
> -	 * nobody can proceed invalidating either.
> -	 *
> -	 * Note that we don't update the vma->userptr.notifier_seq since
> -	 * we don't update the userptr pages.
> -	 */
> -	do {
> -		down_read(&vm->userptr.notifier_lock);
> -		if (!mmu_interval_read_retry(&vma->userptr.notifier,
> -					     notifier_seq))
> -			break;

You need to keep the mmu_interval_read_retry() check, and it needs to 
be done under the notifier lock; otherwise page-table updates may race 
with page-table zapping. But if we don't do the repin dance, you need 
to return 0 when mmu_interval_read_retry() returns true, rather than 
-EAGAIN. A rough sketch follows below.
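
Roughly what I have in mind for the tail of xe_pt_userptr_pre_commit() 
(untested sketch, only to illustrate the locking; both paths end up 
returning 0, the retry case merely tells us the freshly built binding 
is already stale and will be repaired by the next rebind):

	down_read(&vm->userptr.notifier_lock);
	userptr_update->locked = true;

	/*
	 * Keep the retry check, but do it while holding the notifier
	 * lock, so that this page-table update can't race with the
	 * invalidation notifier zapping the same page tables.
	 */
	if (mmu_interval_read_retry(&vma->userptr.notifier,
				    vma->userptr.notifier_seq)) {
		/*
		 * A racing invalidation has already queued the VMA on
		 * vm->userptr.invalidated, so the next exec, preempt
		 * rebind worker or page fault will rebind it. Hence
		 * return 0 rather than the old -EAGAIN.
		 */
		return 0;
	}

	return 0;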

> -
> -		up_read(&vm->userptr.notifier_lock);
> -
> -		if (userptr_update->bind)
> -			return -EAGAIN;
> -
> -		notifier_seq = mmu_interval_read_begin(&vma->userptr.notifier);
> -	} while (true);
> -
> -	/* Inject errors to test_whether they are handled correctly */
> -	if (userptr_update->bind && xe_pt_userptr_inject_eagain(vma)) {
> -		up_read(&vm->userptr.notifier_lock);
> -		return -EAGAIN;
Just return zero here if we don't do the repin.
> -	}
>   
> +	down_read(&vm->userptr.notifier_lock);
>   	userptr_update->locked = true;
>   
>   	return 0;
> diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> index 861d050871bb..147d5f901f24 100644
> --- a/drivers/gpu/drm/xe/xe_vm.c
> +++ b/drivers/gpu/drm/xe/xe_vm.c
> @@ -2757,7 +2757,6 @@ static int __xe_vma_op_execute(struct xe_vm *vm, struct xe_vma *vma,
>   	struct drm_exec exec;
>   	int err;
>   
> -retry_userptr:
>   	drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT);
>   	drm_exec_until_all_locked(&exec) {
>   		err = op_execute(&exec, vm, vma, op);
> @@ -2767,15 +2766,6 @@ static int __xe_vma_op_execute(struct xe_vm *vm, struct xe_vma *vma,
>   	}
>   	drm_exec_fini(&exec);
>   
> -	if (err == -EAGAIN && xe_vma_is_userptr(vma)) {
> -		lockdep_assert_held_write(&vm->lock);
> -		err = xe_vma_userptr_pin_pages(vma);
> -		if (!err)
> -			goto retry_userptr;
> -
> -		trace_xe_vma_fail(vma);
> -	}
> -
>   	return err;
>   }
>   

