[CI] drm/xe: Fix fault mode invalidation with unbind

Matthew Brost matthew.brost at intel.com
Tue Feb 25 20:40:04 UTC 2025


On Tue, Feb 25, 2025 at 02:44:26PM +0100, Thomas Hellström wrote:
> Fix fault mode invalidation racing with unbind leading to the
> PTE zapping potentially traversing an invalid page-table tree.
> Do this by holding the notifier lock across PTE zapping. This
> might transfer any contention waiting on the notifier seqlock
> read side to the notifier lock read side, but that shouldn't be
> a major problem.
> 
> At the same time get rid of the open-coded invalidation in the bind
> code by relying on the notifier even when the vma bind is not
> yet committed.
> 
> Finally let userptr invalidation injections during binding
> unconditionally return -EAGAIN.
> 
> Signed-off-by: Thomas Hellström <thomas.hellstrom at linux.intel.com>
> ---
>  drivers/gpu/drm/xe/xe_pt.c | 38 +++++++++-----------------------------
>  drivers/gpu/drm/xe/xe_vm.c | 13 +++----------
>  2 files changed, 12 insertions(+), 39 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
> index 1ddcc7e79a93..e9f90887b30c 100644
> --- a/drivers/gpu/drm/xe/xe_pt.c
> +++ b/drivers/gpu/drm/xe/xe_pt.c
> @@ -1213,42 +1213,22 @@ static int vma_check_userptr(struct xe_vm *vm, struct xe_vma *vma,
>  		return 0;
>  
>  	uvma = to_userptr_vma(vma);
> -	notifier_seq = uvma->userptr.notifier_seq;
> +	if (xe_pt_userptr_inject_eagain(uvma))
> +		return -EAGAIN;
>  
> -	if (uvma->userptr.initial_bind && !xe_vm_in_fault_mode(vm))
> -		return 0;
> +	notifier_seq = uvma->userptr.notifier_seq;
>  
>  	if (!mmu_interval_read_retry(&uvma->userptr.notifier,
> -				     notifier_seq) &&
> -	    !xe_pt_userptr_inject_eagain(uvma))
> +				     notifier_seq))
>  		return 0;
>  
> -	if (xe_vm_in_fault_mode(vm)) {
> +	if (xe_vm_in_fault_mode(vm))
>  		return -EAGAIN;
> -	} else {
> -		spin_lock(&vm->userptr.invalidated_lock);
> -		list_move_tail(&uvma->userptr.invalidate_link,
> -			       &vm->userptr.invalidated);
> -		spin_unlock(&vm->userptr.invalidated_lock);
> -
> -		if (xe_vm_in_preempt_fence_mode(vm)) {
> -			struct dma_resv_iter cursor;
> -			struct dma_fence *fence;
> -			long err;
> -
> -			dma_resv_iter_begin(&cursor, xe_vm_resv(vm),
> -					    DMA_RESV_USAGE_BOOKKEEP);
> -			dma_resv_for_each_fence_unlocked(&cursor, fence)
> -				dma_fence_enable_sw_signaling(fence);
> -			dma_resv_iter_end(&cursor);
> -
> -			err = dma_resv_wait_timeout(xe_vm_resv(vm),
> -						    DMA_RESV_USAGE_BOOKKEEP,
> -						    false, MAX_SCHEDULE_TIMEOUT);
> -			XE_WARN_ON(err <= 0);
> -		}
> -	}
>  
> +	/*
> +	 * Just continue the operation since exec or rebind worker
> +	 * will take care of rebinding.
> +	 */

Ah, ok. I see what you mean here by deferring this rebind naturally to
the notifier and moving the error injection earlier. This does look
better and accomplishes the same thing. Looks good to me with the error
injection fix, along with the prefetch fix.
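To make sure we're reading it the same way, here's a rough sketch of
the locking contract this ends up establishing (simplified from the
hunks here; the down_read() side is my reading of how the bind path is
expected to serialize against the notifier, not a literal quote of the
code):

	/* Invalidation side (vma_userptr_invalidate): zap the PTEs
	 * while still holding the notifier lock, so the zap can't
	 * traverse a page-table tree that an unbind is tearing down. */
	down_write(&vm->userptr.notifier_lock);
	mmu_interval_set_seq(mni, cur_seq);
	...
	if (xe_vm_in_fault_mode(vm) && userptr->initial_bind)
		XE_WARN_ON(xe_vm_invalidate_vma(vma));
	up_write(&vm->userptr.notifier_lock);

	/* Bind/unbind side: commit page-table changes under the read
	 * side of the same rwsem. Contention that used to show up as
	 * seqlock read-side retries now waits on the notifier lock
	 * read side instead, as the commit message notes. */
	down_read(&vm->userptr.notifier_lock);
	/* ... commit PTEs, mmu_interval_read_retry() checks ... */
	up_read(&vm->userptr.notifier_lock);

Assuming that's the intended contract, the approach makes sense.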

Matt 

>  	return 0;
>  }
>  
> diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> index 996000f2424e..047f17b230b2 100644
> --- a/drivers/gpu/drm/xe/xe_vm.c
> +++ b/drivers/gpu/drm/xe/xe_vm.c
> @@ -605,26 +605,18 @@ static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni,
>  	down_write(&vm->userptr.notifier_lock);
>  	mmu_interval_set_seq(mni, cur_seq);
>  
> -	/* No need to stop gpu access if the userptr is not yet bound. */
> -	if (!userptr->initial_bind) {
> -		up_write(&vm->userptr.notifier_lock);
> -		return true;
> -	}
> -
>  	/*
>  	 * Tell exec and rebind worker they need to repin and rebind this
>  	 * userptr.
>  	 */
>  	if (!xe_vm_in_fault_mode(vm) &&
> -	    !(vma->gpuva.flags & XE_VMA_DESTROYED) && vma->tile_present) {
> +	    !(vma->gpuva.flags & XE_VMA_DESTROYED)) {
>  		spin_lock(&vm->userptr.invalidated_lock);
>  		list_move_tail(&userptr->invalidate_link,
>  			       &vm->userptr.invalidated);
>  		spin_unlock(&vm->userptr.invalidated_lock);
>  	}
>  
> -	up_write(&vm->userptr.notifier_lock);
> -
>  	/*
>  	 * Preempt fences turn into schedule disables, pipeline these.
>  	 * Note that even in fault mode, we need to wait for binds and
> @@ -642,11 +634,12 @@ static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni,
>  				    false, MAX_SCHEDULE_TIMEOUT);
>  	XE_WARN_ON(err <= 0);
>  
> -	if (xe_vm_in_fault_mode(vm)) {
> +	if (xe_vm_in_fault_mode(vm) && userptr->initial_bind) {
>  		err = xe_vm_invalidate_vma(vma);
>  		XE_WARN_ON(err);
>  	}
>  
> +	up_write(&vm->userptr.notifier_lock);
>  	trace_xe_vma_userptr_invalidate_complete(vma);
>  
>  	return true;
> -- 
> 2.48.1
> 
