[PATCH 2/2] drm/xe: Userptr invalidation race with binds fixes
Matthew Brost
matthew.brost at intel.com
Fri Feb 21 14:43:09 UTC 2025
On Fri, Feb 21, 2025 at 12:34:12PM +0100, Thomas Hellström wrote:
> On Thu, 2025-02-20 at 15:45 -0800, Matthew Brost wrote:
> > Squash a bind operation after a userptr invalidation into a clearing of
> > PTEs. This prevents valid GPU page tables from pointing to stale CPU
> > pages.
> >
> > Fix up initial bind handling to always add VMAs to the invalidation
> > list and clear PTEs.
> >
> > Remove unused rebind variable in xe_pt.
> >
> > Always hold the notifier lock across the TLB invalidation in the
> > notifier to prevent a UAF if an unbind races.
> >
> > All of the above changes are included in a single Fixes patch in hopes
> > of an easier backport.
> >
> > Cc: Thomas Hellström <thomas.hellstrom at linux.intel.com>
> > Cc: <stable at vger.kernel.org>
> > Fixes: e8babb280b5e ("drm/xe: Convert multiple bind ops into single job")
> > Signed-off-by: Matthew Brost <matthew.brost at intel.com>
> > ---
> > drivers/gpu/drm/xe/regs/xe_gtt_defs.h | 1 +
> > drivers/gpu/drm/xe/xe_pt.c | 86 +++++++++++++++++++--------
> > drivers/gpu/drm/xe/xe_pt_types.h | 1 -
> > drivers/gpu/drm/xe/xe_vm.c | 4 +-
> > 4 files changed, 64 insertions(+), 28 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/xe/regs/xe_gtt_defs.h b/drivers/gpu/drm/xe/regs/xe_gtt_defs.h
> > index 4389e5a76f89..ab281e721d94 100644
> > --- a/drivers/gpu/drm/xe/regs/xe_gtt_defs.h
> > +++ b/drivers/gpu/drm/xe/regs/xe_gtt_defs.h
> > @@ -13,6 +13,7 @@
> >
> > #define GUC_GGTT_TOP 0xFEE00000
> >
> > +#define XE_PPGTT_IS_LEAF BIT_ULL(63)
> > #define XELPG_PPGTT_PTE_PAT3 BIT_ULL(62)
> > #define XE2_PPGTT_PTE_PAT4 BIT_ULL(61)
> > #define XE_PPGTT_PDE_PDPE_PAT2 BIT_ULL(12)
> > diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
> > index 1ddcc7e79a93..26b513a54113 100644
> > --- a/drivers/gpu/drm/xe/xe_pt.c
> > +++ b/drivers/gpu/drm/xe/xe_pt.c
> > @@ -389,6 +389,8 @@ xe_pt_insert_entry(struct xe_pt_stage_bind_walk *xe_walk, struct xe_pt *parent,
> > idx = offset - entry->ofs;
> > entry->pt_entries[idx].pt = xe_child;
> > entry->pt_entries[idx].pte = pte;
> > + if (!xe_child)
> > + entry->pt_entries[idx].pte |= XE_PPGTT_IS_LEAF;
>
> I don't think this is needed.
>
It is. Will explain below.
> > entry->qwords++;
> > }
> >
> > @@ -792,6 +794,24 @@ static const struct xe_pt_walk_ops xe_pt_zap_ptes_ops = {
> > .pt_entry = xe_pt_zap_ptes_entry,
> > };
> >
> > +static bool __xe_pt_zap_ptes(struct xe_tile *tile, struct xe_vma *vma)
> > +{
> > + struct xe_pt_zap_ptes_walk xe_walk = {
> > + .base = {
> > + .ops = &xe_pt_zap_ptes_ops,
> > + .shifts = xe_normal_pt_shifts,
> > + .max_level = XE_PT_HIGHEST_LEVEL,
> > + },
> > + .tile = tile,
> > + };
> > + struct xe_pt *pt = xe_vma_vm(vma)->pt_root[tile->id];
> > +
> > + (void)xe_pt_walk_shared(&pt->base, pt->level, xe_vma_start(vma),
> > + xe_vma_end(vma), &xe_walk.base);
> > +
> > + return xe_walk.needs_invalidate;
> > +}
> > +
> > /**
> > * xe_pt_zap_ptes() - Zap (zero) gpu ptes of an address range
> > * @tile: The tile we're zapping for.
> > @@ -810,24 +830,12 @@ static const struct xe_pt_walk_ops xe_pt_zap_ptes_ops = {
> > */
> > bool xe_pt_zap_ptes(struct xe_tile *tile, struct xe_vma *vma)
> > {
> > - struct xe_pt_zap_ptes_walk xe_walk = {
> > - .base = {
> > - .ops = &xe_pt_zap_ptes_ops,
> > - .shifts = xe_normal_pt_shifts,
> > - .max_level = XE_PT_HIGHEST_LEVEL,
> > - },
> > - .tile = tile,
> > - };
> > - struct xe_pt *pt = xe_vma_vm(vma)->pt_root[tile->id];
> > u8 pt_mask = (vma->tile_present & ~vma->tile_invalidated);
> >
> > if (!(pt_mask & BIT(tile->id)))
> > return false;
> >
> > - (void)xe_pt_walk_shared(&pt->base, pt->level, xe_vma_start(vma),
> > - xe_vma_end(vma), &xe_walk.base);
> > -
> > - return xe_walk.needs_invalidate;
> > + return __xe_pt_zap_ptes(tile, vma);
> > }
> >
> > static void
> > @@ -843,9 +851,10 @@ xe_vm_populate_pgtable(struct xe_migrate_pt_update *pt_update, struct xe_tile *t
> > for (i = 0; i < num_qwords; i++) {
> > if (map)
> > xe_map_wr(tile_to_xe(tile), map, (qword_ofs + i) *
> > - sizeof(u64), u64, ptes[i].pte);
> > + sizeof(u64), u64, ptes[i].pte &
> > + ~XE_PPGTT_IS_LEAF);
> > else
> > - ptr[i] = ptes[i].pte;
> > + ptr[i] = ptes[i].pte & ~XE_PPGTT_IS_LEAF;
> > }
> > }
> >
> > @@ -1201,7 +1210,30 @@ static bool xe_pt_userptr_inject_eagain(struct xe_userptr_vma *uvma)
> >
> > #endif
> >
> > -static int vma_check_userptr(struct xe_vm *vm, struct xe_vma *vma,
> > +static void
> > +vma_convert_to_invalidation(struct xe_tile *tile, struct xe_vma *vma,
> > + struct xe_vm_pgtable_update_ops *pt_update)
> > +{
> > + int i, j, k;
> > +
> > + for (i = 0; i < pt_update->current_op; ++i) {
> > + struct xe_vm_pgtable_update_op *op = &pt_update->ops[i];
> > +
> > + if (vma == op->vma && (op->bind || op->rebind)) {
> > + for (j = 0; j < op->num_entries; ++j) {
> > + for (k = 0; k < op->entries[j].qwords; ++k)
> > + if (op->entries[j].pt_entries[k].pte & XE_PPGTT_IS_LEAF)
>
> IIRC It's a leaf iff op->entries[j].pt->level == 0 ||
> xe_pte_is_huge(pte) (xe_pte_is_huge(pte) needs to be coded)
>
The 'op->entries[j].pt' here has been overridden at this point by the
code which handles the unwind on error - see xe_pt_commit_prepare_bind.
We can't reason about whether the pte is a leaf without knowing the
level, thus the extra bit here. Of course this could be coded
differently. Open to ideas, but this might not be needed at all if we
just return -EAGAIN - see below.
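
To make the leaf problem concrete, here is roughly what a level-based
check would look like *if* the level were still available at this point
(untested sketch; assumes XE_PDE_PS_2M / XE_PDPE_PS_1G are the right
huge-page bits for their respective levels):

static bool xe_pte_is_huge(u64 pte, unsigned int level)
{
        /* Huge-ness can only be judged relative to the level */
        return (level == 1 && (pte & XE_PDE_PS_2M)) ||
               (level == 2 && (pte & XE_PDPE_PS_1G));
}

static bool xe_pte_is_leaf(u64 pte, unsigned int level)
{
        return level == 0 || xe_pte_is_huge(pte, level);
}

Since the level (via op->entries[j].pt) is already gone when we squash,
the patch stamps XE_PPGTT_IS_LEAF at stage time instead and strips it
again in xe_vm_populate_pgtable.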
> > + op->entries[j].pt_entries[k].pte = 0;
>
> This should be a scratch pte unless it's a faulting VM?
>
Yes, if the VM is in scratch mode, this should be set to the scratch PTE.
>
> > + }
> > + }
> > + }
> > +
> > + __xe_pt_zap_ptes(tile, vma);
>
> Umm, A zap here would race completely with other pipelined updates
> right? Why is it needed?
>
The PT tree is fully updated at this point. The nature of an array of
binds means each operation must fully update the PT tree before the next
operation runs, so it is safe to call zap here.
This is needed to invalidate leaves which are part of an unconnected
subtree always set up by the CPU. e.g. a 4k bind into an empty tree does
this:
- 1 PT op to write in a PTE at the highest level
- (Highest level - 1) down to 0 are part of an unconnected subtree set
  up by the CPU
Here we'd only want to invalidate the level 0 PTE, which is exactly what
__xe_pt_zap_ptes does.
What is missing, as noted above, is that the zap would need to program a
scratch entry if the VM has scratch enabled.
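
Roughly what I have in mind for the scratch case - a sketch only,
assuming the helper that encodes an empty/scratch entry for a given
level (__xe_pt_empty_pte() today) can be reused here and that the level
of the entry being squashed is known:

static u64 xe_invalidation_pte(struct xe_tile *tile, struct xe_vm *vm,
                               unsigned int level)
{
        /* Faulting / non-scratch VMs can simply be cleared */
        if (!xe_vm_has_scratch(vm))
                return 0;

        /* Scratch VMs must point at the scratch tree instead of 0 */
        return __xe_pt_empty_pte(tile, vm, level);
}

Both vma_convert_to_invalidation and the zap path would then write that
value rather than unconditionally zeroing.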
> > +}
> > +
> > +static int vma_check_userptr(struct xe_tile *tile, struct xe_vm *vm,
> > + struct xe_vma *vma,
> > struct xe_vm_pgtable_update_ops *pt_update)
> > {
> > struct xe_userptr_vma *uvma;
> > @@ -1215,9 +1247,6 @@ static int vma_check_userptr(struct xe_vm *vm, struct xe_vma *vma,
> > uvma = to_userptr_vma(vma);
> > notifier_seq = uvma->userptr.notifier_seq;
> >
> > - if (uvma->userptr.initial_bind && !xe_vm_in_fault_mode(vm))
> > - return 0;
> > -
> > if (!mmu_interval_read_retry(&uvma->userptr.notifier,
> > notifier_seq) &&
> > !xe_pt_userptr_inject_eagain(uvma))
> > @@ -1231,6 +1260,8 @@ static int vma_check_userptr(struct xe_vm *vm, struct xe_vma *vma,
> > &vm->userptr.invalidated);
> > spin_unlock(&vm->userptr.invalidated_lock);
> >
> > + vma_convert_to_invalidation(tile, vma, pt_update);
> > +
> > if (xe_vm_in_preempt_fence_mode(vm)) {
> > struct dma_resv_iter cursor;
> > struct dma_fence *fence;
>
> And I really think here we should remove the open-coded and
> undocumented invalidation (It took me like 15 minutes to understand
> what the code did, and then I'm already up-to-date with how
> invalidation works).
>
Yeah, this ended up being more complicated than I thought it would be
and took me several hours to get (mostly) right.
> If just returning -EAGAIN doesn't really work here for the preempt-
I think returning -EAGAIN should work. A quick hack hit what appears to
be a livelock on xe_vm.munmap-style-unbind-userptr-one-partial with
error injection enabled. I need to dig in more, but my guess is this
test does a user bind which results in enough internal ops that the
error injection is hit on every user bind. If that is the case, we could
tweak the injection algorithm to use a counter based on user binds
rather than internal ops.
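
Something along these lines is what I have in mind - sketch only, the
counter field, the increment site and the period are all made up:

/* Hypothetical: inject_bind_count would be bumped once per user bind
 * IOCTL (e.g. in xe_vm_bind_ioctl()), not once per internal PT op, so
 * a bind that expands into many ops only advances the injection state
 * once.
 */
#define XE_USERPTR_INJECT_PERIOD 4      /* trip one bind in every 4 */

static bool xe_pt_userptr_inject_eagain(struct xe_userptr_vma *uvma)
{
        struct xe_vm *vm = xe_vma_vm(&uvma->vma);

        return (atomic_read(&vm->userptr.inject_bind_count) %
                XE_USERPTR_INJECT_PERIOD) == 0;
}

That should avoid the test above tripping the injection on every single
bind it issues, if the theory holds.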
In practice, getting an invalidation between getting the CPU pages and
this step of the bind should be exceedingly rare, so returning -EAGAIN
doesn't seem like a huge deal. But this code path could really help for
SVM prefetches, where perhaps a single page is touched by the CPU, thus
aborting the entire prefetch. If we could just squash that part of the
prefetch into an invalidation, that could be useful. So the complexity
of an invalidation vs. -EAGAIN needs to be weighed.
> fence case, then I suggest that the error injection instead triggers a
> real invalidation outside of the notifier lock.
If the error injection really kicked the notifier, that would be better.
> Perhaps using mmu_notifier_invalidate_range[start|end]()?
>
Off the top of my head I'm unsure how this would be implemented or if it
is even possible. It's generally a bad idea to call low-level core MM
functions directly though, and I could see that upsetting the core MM
people if they later change whatever functions we'd need to call.
> But for now since this is a fix, restrict the -EINVAL inject to cases
> where it works and that we want to test.
>
Not following this.
Matt
> Thanks,
>
> /Thomas
>
>
> > @@ -1252,7 +1283,8 @@ static int vma_check_userptr(struct xe_vm *vm, struct xe_vma *vma,
> > return 0;
> > }
> >
> > -static int op_check_userptr(struct xe_vm *vm, struct xe_vma_op *op,
> > +static int op_check_userptr(struct xe_tile *tile, struct xe_vm *vm,
> > + struct xe_vma_op *op,
> > struct xe_vm_pgtable_update_ops *pt_update)
> > {
> > int err = 0;
> > @@ -1264,18 +1296,21 @@ static int op_check_userptr(struct xe_vm *vm, struct xe_vma_op *op,
> > if (!op->map.immediate && xe_vm_in_fault_mode(vm))
> > break;
> >
> > - err = vma_check_userptr(vm, op->map.vma, pt_update);
> > + err = vma_check_userptr(tile, vm, op->map.vma, pt_update);
> > break;
> > case DRM_GPUVA_OP_REMAP:
> > if (op->remap.prev)
> > - err = vma_check_userptr(vm, op->remap.prev, pt_update);
> > + err = vma_check_userptr(tile, vm, op->remap.prev,
> > + pt_update);
> > if (!err && op->remap.next)
> > - err = vma_check_userptr(vm, op->remap.next, pt_update);
> > + err = vma_check_userptr(tile, vm, op->remap.next,
> > + pt_update);
> > break;
> > case DRM_GPUVA_OP_UNMAP:
> > break;
> > case DRM_GPUVA_OP_PREFETCH:
> > - err = vma_check_userptr(vm, gpuva_to_vma(op->base.prefetch.va),
> > + err = vma_check_userptr(tile, vm,
> > + gpuva_to_vma(op->base.prefetch.va),
> > pt_update);
> > break;
> > default:
> > @@ -1301,7 +1336,8 @@ static int xe_pt_userptr_pre_commit(struct xe_migrate_pt_update *pt_update)
> > down_read(&vm->userptr.notifier_lock);
> >
> > list_for_each_entry(op, &vops->list, link) {
> > - err = op_check_userptr(vm, op, pt_update_ops);
> > + err = op_check_userptr(&vm->xe->tiles[pt_update->tile_id],
> > + vm, op, pt_update_ops);
> > if (err) {
> > up_read(&vm->userptr.notifier_lock);
> > break;
> > diff --git a/drivers/gpu/drm/xe/xe_pt_types.h b/drivers/gpu/drm/xe/xe_pt_types.h
> > index 384cc04de719..0f9b7650cd6f 100644
> > --- a/drivers/gpu/drm/xe/xe_pt_types.h
> > +++ b/drivers/gpu/drm/xe/xe_pt_types.h
> > @@ -29,7 +29,6 @@ struct xe_pt {
> > struct xe_bo *bo;
> > unsigned int level;
> > unsigned int num_live;
> > - bool rebind;
> > bool is_compact;
> > #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)
> > /** addr: Virtual address start address of the PT. */
> > diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> > index ea2e287e6526..f90e5c92010c 100644
> > --- a/drivers/gpu/drm/xe/xe_vm.c
> > +++ b/drivers/gpu/drm/xe/xe_vm.c
> > @@ -623,8 +623,6 @@ static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni,
> > spin_unlock(&vm->userptr.invalidated_lock);
> > }
> >
> > - up_write(&vm->userptr.notifier_lock);
> > -
> > /*
> > * Preempt fences turn into schedule disables, pipeline these.
> > * Note that even in fault mode, we need to wait for binds and
> > @@ -647,6 +645,8 @@ static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni,
> > XE_WARN_ON(err);
> > }
> >
> > + up_write(&vm->userptr.notifier_lock);
> > +
> > trace_xe_vma_userptr_invalidate_complete(vma);
> >
> > return true;
>