[PATCH v2 2/3] drm/xe: Userptr invalidation race with binds fixes

Matthew Brost matthew.brost at intel.com
Mon Feb 24 16:30:36 UTC 2025


On Mon, Feb 24, 2025 at 05:22:40PM +0100, Thomas Hellström wrote:
> On Mon, 2025-02-24 at 07:35 -0800, Matthew Brost wrote:
> > On Mon, Feb 24, 2025 at 04:20:07PM +0100, Thomas Hellström wrote:
> > > On Mon, 2025-02-24 at 07:06 -0800, Matthew Brost wrote:
> > > > On Mon, Feb 24, 2025 at 09:21:09AM +0100, Thomas Hellström wrote:
> > > > > On Sun, 2025-02-23 at 20:05 -0800, Matthew Brost wrote:
> > > > > > Squash a bind operation that follows a userptr invalidation into a
> > > > > > clearing of PTEs. This prevents valid GPU page tables from pointing
> > > > > > to stale CPU pages.
> > > > > > 
> > > > > > Fix up initial bind handling to always add VMAs to the invalidation
> > > > > > list and clear PTEs.
> > > > > > 
> > > > > > Remove the unused rebind variable in xe_pt.
> > > > > > 
> > > > > > Always hold the notifier lock across TLB invalidation in the
> > > > > > notifier to prevent a UAF if an unbind races.
> > > > > > 
> > > > > > All of the above changes are included in a single Fixes patch in
> > > > > > hopes of an easier backport.
> > > > > > 
> > > > > > v2:
> > > > > >  - Wait on dma-resv bookkeep before issuing the PTE zap (Thomas)
> > > > > >  - Support scratch page on invalidation (Thomas)
> > > > > > 
> > > > > > Cc: Thomas Hellström <thomas.hellstrom at linux.intel.com>
> > > > > > Cc: <stable at vger.kernel.org>
> > > > > > Fixes: e8babb280b5e ("drm/xe: Convert multiple bind ops into single job")
> > > > > > Signed-off-by: Matthew Brost <matthew.brost at intel.com>
> > > > > > ---
> > > > > >  drivers/gpu/drm/xe/xe_pt.c       | 146 +++++++++++++++++++++++--------
> > > > > >  drivers/gpu/drm/xe/xe_pt_types.h |   3 +-
> > > > > >  drivers/gpu/drm/xe/xe_vm.c       |   4 +-
> > > > > >  3 files changed, 115 insertions(+), 38 deletions(-)
> > > > > > 
> > > > > > diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
> > > > > > index 1ddcc7e79a93..add521b5c6ae 100644
> > > > > > --- a/drivers/gpu/drm/xe/xe_pt.c
> > > > > > +++ b/drivers/gpu/drm/xe/xe_pt.c
> > > > > > @@ -351,7 +351,8 @@ xe_pt_new_shared(struct xe_walk_update *wupd, struct xe_pt *parent,
> > > > > >   */
> > > > > >  static int
> > > > > >  xe_pt_insert_entry(struct xe_pt_stage_bind_walk *xe_walk, struct xe_pt *parent,
> > > > > > -		   pgoff_t offset, struct xe_pt *xe_child, u64 pte)
> > > > > > +		   pgoff_t offset, struct xe_pt *xe_child, u64 pte,
> > > > > > +		   unsigned int level)
> > > > > >  {
> > > > > >  	struct xe_pt_update *upd = &xe_walk->wupd.updates[parent->level];
> > > > > >  	struct xe_pt_update *child_upd = xe_child ?
> > > > > > @@ -389,6 +390,9 @@ xe_pt_insert_entry(struct xe_pt_stage_bind_walk *xe_walk, struct xe_pt *parent,
> > > > > >  		idx = offset - entry->ofs;
> > > > > >  		entry->pt_entries[idx].pt = xe_child;
> > > > > >  		entry->pt_entries[idx].pte = pte;
> > > > > > +		entry->pt_entries[idx].level = level;
> > > > > > +		if (likely(!xe_child))
> > > > > > +			entry->pt_entries[idx].level |= XE_PT_IS_LEAF;
> > > > > >  		entry->qwords++;
> > > > > >  	}
> > > > > >  
> > > > > > @@ -515,7 +519,8 @@ xe_pt_stage_bind_entry(struct xe_ptw *parent, pgoff_t offset,
> > > > > >  			}
> > > > > >  		}
> > > > > >  
> > > > > > -		ret = xe_pt_insert_entry(xe_walk, xe_parent, offset, NULL, pte);
> > > > > > +		ret = xe_pt_insert_entry(xe_walk, xe_parent, offset, NULL, pte,
> > > > > > +					 level);
> > > > > >  		if (unlikely(ret))
> > > > > >  			return ret;
> > > > > >  
> > > > > > @@ -571,7 +576,7 @@ xe_pt_stage_bind_entry(struct xe_ptw *parent, pgoff_t offset,
> > > > > >  
> > > > > >  		pte = vm->pt_ops->pde_encode_bo(xe_child->bo, 0, pat_index) | flags;
> > > > > >  		ret = xe_pt_insert_entry(xe_walk, xe_parent, offset, xe_child,
> > > > > > -					 pte);
> > > > > > +					 pte, level);
> > > > > >  	}
> > > > > >  
> > > > > >  	*action = ACTION_SUBTREE;
> > > > > > @@ -752,6 +757,10 @@ struct xe_pt_zap_ptes_walk {
> > > > > >  	/* Input parameters for the walk */
> > > > > >  	/** @tile: The tile we're building for */
> > > > > >  	struct xe_tile *tile;
> > > > > > +	/** @vm: VM we're building for */
> > > > > > +	struct xe_vm *vm;
> > > > > > +	/** @scratch: write entries with scratch */
> > > > > > +	bool scratch;
> > > > > >  
> > > > > >  	/* Output */
> > > > > >  	/** @needs_invalidate: Whether we need to invalidate TLB*/
> > > > > > @@ -779,9 +788,18 @@ static int xe_pt_zap_ptes_entry(struct xe_ptw *parent, pgoff_t offset,
> > > > > >  	 */
> > > > > >  	if (xe_pt_nonshared_offsets(addr, next, --level, walk, action, &offset,
> > > > > >  				    &end_offset)) {
> > > > > > -		xe_map_memset(tile_to_xe(xe_walk->tile), &xe_child->bo->vmap,
> > > > > > -			      offset * sizeof(u64), 0,
> > > > > > -			      (end_offset - offset) * sizeof(u64));
> > > > > > +		if (unlikely(xe_walk->scratch)) {
> > > > > > +			u64 pte = __xe_pt_empty_pte(xe_walk->tile, xe_walk->vm,
> > > > > > +						    level);
> > > > > > +
> > > > > > +			for (; offset < end_offset; ++offset)
> > > > > > +				xe_pt_write(tile_to_xe(xe_walk->tile),
> > > > > > +					    &xe_child->bo->vmap, offset, pte);
> > > > > > +		} else {
> > > > > > +			xe_map_memset(tile_to_xe(xe_walk->tile), &xe_child->bo->vmap,
> > > > > > +				      offset * sizeof(u64), 0,
> > > > > > +				      (end_offset - offset) * sizeof(u64));
> > > > > > +		}
> > > > > >  		xe_walk->needs_invalidate = true;
> > > > > >  	}
> > > > > >  
> > > > > > @@ -792,6 +810,31 @@ static const struct xe_pt_walk_ops xe_pt_zap_ptes_ops = {
> > > > > >  	.pt_entry = xe_pt_zap_ptes_entry,
> > > > > >  };
> > > > > >  
> > > > > > +struct xe_pt_zap_ptes_flags {
> > > > > > +	bool scratch:1;
> > > > > > +};
> > > > > > +
> > > > > > +static bool __xe_pt_zap_ptes(struct xe_tile *tile, struct xe_vma *vma,
> > > > > > +			     struct xe_pt_zap_ptes_flags flags)
> > > > > > +{
> > > > > > +	struct xe_pt_zap_ptes_walk xe_walk = {
> > > > > > +		.base = {
> > > > > > +			.ops = &xe_pt_zap_ptes_ops,
> > > > > > +			.shifts = xe_normal_pt_shifts,
> > > > > > +			.max_level = XE_PT_HIGHEST_LEVEL,
> > > > > > +		},
> > > > > > +		.tile = tile,
> > > > > > +		.vm = xe_vma_vm(vma),
> > > > > > +		.scratch = flags.scratch,
> > > > > > +	};
> > > > > > +	struct xe_pt *pt = xe_vma_vm(vma)->pt_root[tile->id];
> > > > > > +
> > > > > > +	(void)xe_pt_walk_shared(&pt->base, pt->level, xe_vma_start(vma),
> > > > > > +				xe_vma_end(vma), &xe_walk.base);
> > > > > > +
> > > > > > +	return xe_walk.needs_invalidate;
> > > > > > +}
> > > > > > +
> > > > > >  /**
> > > > > >   * xe_pt_zap_ptes() - Zap (zero) gpu ptes of an address range
> > > > > >   * @tile: The tile we're zapping for.
> > > > > > @@ -810,24 +853,13 @@ static const struct xe_pt_walk_ops xe_pt_zap_ptes_ops = {
> > > > > >   */
> > > > > >  bool xe_pt_zap_ptes(struct xe_tile *tile, struct xe_vma *vma)
> > > > > >  {
> > > > > > -	struct xe_pt_zap_ptes_walk xe_walk = {
> > > > > > -		.base = {
> > > > > > -			.ops = &xe_pt_zap_ptes_ops,
> > > > > > -			.shifts = xe_normal_pt_shifts,
> > > > > > -			.max_level = XE_PT_HIGHEST_LEVEL,
> > > > > > -		},
> > > > > > -		.tile = tile,
> > > > > > -	};
> > > > > > -	struct xe_pt *pt = xe_vma_vm(vma)->pt_root[tile->id];
> > > > > > +	struct xe_pt_zap_ptes_flags flags = {};
> > > > > >  	u8 pt_mask = (vma->tile_present & ~vma->tile_invalidated);
> > > > > >  
> > > > > >  	if (!(pt_mask & BIT(tile->id)))
> > > > > >  		return false;
> > > > > >  
> > > > > > -	(void)xe_pt_walk_shared(&pt->base, pt->level, xe_vma_start(vma),
> > > > > > -				xe_vma_end(vma), &xe_walk.base);
> > > > > > -
> > > > > > -	return xe_walk.needs_invalidate;
> > > > > > +	return __xe_pt_zap_ptes(tile, vma, flags);
> > > > > >  }
> > > > > >  
> > > > > >  static void
> > > > > > @@ -1201,7 +1233,46 @@ static bool xe_pt_userptr_inject_eagain(struct xe_userptr_vma *uvma)
> > > > > >  
> > > > > >  #endif
> > > > > >  
> > > > > > -static int vma_check_userptr(struct xe_vm *vm, struct xe_vma *vma,
> > > > > > +static void
> > > > > > +vma_convert_to_invalidation(struct xe_tile *tile, struct xe_vma *vma,
> > > > > > +			    struct xe_vm_pgtable_update_ops *pt_update)
> > > > > > +{
> > > > > > +	struct xe_pt_zap_ptes_flags flags = { .scratch = true, };
> > > > > > +	int i, j, k;
> > > > > > +
> > > > > > +	/*
> > > > > > +	 * Need to update this function to bypass scratch setup if in fault mode
> > > > > > +	 */
> > > > > > +	xe_assert(xe_vma_vm(vma)->xe, !xe_vm_in_fault_mode(xe_vma_vm(vma)));
> > > > > > +
> > > > > > +	for (i = 0; i < pt_update->current_op; ++i) {
> > > > > > +		struct xe_vm_pgtable_update_op *op = &pt_update->ops[i];
> > > > > > +
> > > > > > +		if (vma != op->vma || (!op->bind && !op->rebind))
> > > > > > +			continue;
> > > > > > +
> > > > > > +		for (j = 0; j < op->num_entries; ++j) {
> > > > > > +			for (k = 0; k < op->entries[j].qwords; ++k) {
> > > > > > +				struct xe_pt_entry *entry =
> > > > > > +					&op->entries[j].pt_entries[k];
> > > > > > +				unsigned int level = entry->level;
> > > > > > +
> > > > > > +				if (!(level & XE_PT_IS_LEAF))
> > > > > > +					continue;
> > > > > > +
> > > > > > +				level &= ~XE_PT_IS_LEAF;
> > > > > > +				entry->pte = __xe_pt_empty_pte(tile,
> > > > > > +							       xe_vma_vm(vma),
> > > > > > +							       level);
> > > > > > +			}
> > > > > > +		}
> > > > > > +	}
> > > > > > +
> > > > > > +	__xe_pt_zap_ptes(tile, vma, flags);
> > > > > 
> > > > > As mentioned in my previous email, I'm pretty sure if we modify all
> > > > > the ptes in the entry array, not just the leaves, (that's basically
> > > > > all ptes of shared page-table entries) that will be equivalent to a
> > > > > zap.
> > > > > 
> > > > 
> > > > That doesn't work. I had it that way originally, but IGTs fail with
> > > > IOMMU CAT errors (e.g. xe_exec_basic.once-userptr fails like this [1]).
> > > > 
> > > > Let me explain with an example.
> > > > 
> > > > Array of binds: 0x0000-0x1000 (A), 0x1000-0x2000 (B)
> > > > 
> > > > - (A) hits userptr invalidation
> > > > 	- If we modify all ptes in the entry array, the highest level PDE
> > > > 	  is invalidated. Both (A) and (B) are either 0 or scratch
> > > > - (A) does rebind in exec
> > > > 	- We only modify the leaf entry, not the highest level PDE, which
> > > > 	  is still 0 or scratch
> > > > 
> > > > Matt
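
To make the scenario above concrete, here is a tiny self-contained toy
model (plain userspace C, not xe code; all names are made up) of two VMAs
sharing one parent PDE. Converting the shared PDE during (A)'s
invalidation and then rebinding only (A)'s leaf leaves both (A) and (B)
without a valid translation, which is exactly the CAT-error signature:

	#include <stdbool.h>
	#include <stdio.h>

	#define SCRATCH 0UL

	struct toy_pt {
		unsigned long pde;	/* parent entry shared by A and B */
		unsigned long leaf[2];	/* leaf[0] = A, leaf[1] = B */
	};

	/* Invalidation of A: always zap A's leaf, optionally the shared PDE too */
	static void invalidate_a(struct toy_pt *pt, bool touch_shared)
	{
		pt->leaf[0] = SCRATCH;
		if (touch_shared)
			pt->pde = SCRATCH;
	}

	/* Rebind of A in exec: only A's leaf PTE is rewritten */
	static void rebind_a(struct toy_pt *pt)
	{
		pt->leaf[0] = 0xa000;
	}

	static bool translates(const struct toy_pt *pt, int idx)
	{
		return pt->pde != SCRATCH && pt->leaf[idx] != SCRATCH;
	}

	int main(void)
	{
		struct toy_pt pt = { .pde = 0x1000, .leaf = { 0xa000, 0xb000 } };

		invalidate_a(&pt, true);	/* modify *all* ptes, incl. shared PDE */
		rebind_a(&pt);

		/* prints "A ok: 0, B ok: 0" -- both ranges lost their PDE */
		printf("A ok: %d, B ok: %d\n", translates(&pt, 0), translates(&pt, 1));
		return 0;
	}

With touch_shared = false (the leaf-only conversion in the v2 patch),
both ranges translate again after the rebind.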
> > > 
> > > Argh. You're right. What would happen if we don't do anything to the
> > > ptes, then? It looks from the code like in faulting mode we error out
> > > and unwind with -EAGAIN.
> > > 
> > > In preempt-fence mode and !LR we could instead ensure we properly stop
> > > gpu access and put the userptr on the invalidated list in the notifier.
> > > 
> > 
> > In dma-fence mode we'd have a window where existing (previously
> > submitted) GPU jobs could corrupt pages which have been invalidated.
> > Unlikely, but a very subtle security / data corruption issue.
> 
> Agreed, we should not allow that to happen, however small the chance is.
> 
> However, I was thinking of something along the following lines. I can't
> see any dma-fence jobs accessing the invalidated pages here, since we wait
> for already submitted jobs to complete, and no new ones can be submitted
> unless the userptr has been rebound.
> 

Yes, I missed this. After my change to wait on the dma-resv in dma-fence
mode too, there shouldn't be anything running, so it is safe to temporarily
leave the page tables pointing to the invalidated pages as we will fix up
the pages on the next exec. The original code before my patch had this
window. Let me respin this patch to simply wait on the dma-resv and drop
the complicated invalidation.
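
Roughly, I'd expect the respin to boil down to the notifier leaving the
PTEs alone and just waiting on the VM's dma-resv bookkeep slots before
returning, e.g. something like the fragment below (a sketch only, reusing
the existing xe_vm_resv() / dma_resv_wait_timeout() helpers; not the
actual patch):

	/*
	 * Sketch: wait on all bookkeep fences (preempt fences, binds,
	 * in-flight jobs) so nothing previously submitted can still touch
	 * the invalidated pages, then let the next exec repin and rebind.
	 */
	err = dma_resv_wait_timeout(xe_vm_resv(vm), DMA_RESV_USAGE_BOOKKEEP,
				    false, MAX_SCHEDULE_TIMEOUT);
	XE_WARN_ON(err <= 0);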

Matt

> Or are you thinking about the iommu issue where we should really
> dma_unmap() in the notifier, like the SVM code does? 
> 
> Or could you describe the race in more detail?
> 
> /Thomas
> 
> 
> diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> index d664f2e418b2..c90c0687be51 100644
> --- a/drivers/gpu/drm/xe/xe_vm.c
> +++ b/drivers/gpu/drm/xe/xe_vm.c
> @@ -605,12 +605,6 @@ static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni,
>         down_write(&vm->userptr.notifier_lock);
>         mmu_interval_set_seq(mni, cur_seq);
>  
> -       /* No need to stop gpu access if the userptr is not yet bound. */
> -       if (!userptr->initial_bind) {
> -               up_write(&vm->userptr.notifier_lock);
> -               return true;
> -       }
> -
>         /*
>          * Tell exec and rebind worker they need to repin and rebind this
>          * userptr.
> @@ -623,8 +617,6 @@ static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni,
>                 spin_unlock(&vm->userptr.invalidated_lock);
>         }
>  
> -       up_write(&vm->userptr.notifier_lock);
> -
>         /*
>          * Preempt fences turn into schedule disables, pipeline these.
>          * Note that even in fault mode, we need to wait for binds and
> @@ -642,11 +634,13 @@ static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni,
>                                     false, MAX_SCHEDULE_TIMEOUT);
>         XE_WARN_ON(err <= 0);
>  
> -       if (xe_vm_in_fault_mode(vm)) {
> +       if (xe_vm_in_fault_mode(vm) && userptr->initial_bind) {
>                 err = xe_vm_invalidate_vma(vma);
>                 XE_WARN_ON(err);
>         }
>  
> +       up_write(&vm->userptr.notifier_lock);
> +
>         trace_xe_vma_userptr_invalidate_complete(vma);
>  
>         return true;
> 
> 

