[PATCH v2 3/3] drm/xe: Add staging tree for VM binds

Matthew Brost matthew.brost at intel.com
Mon Feb 24 14:41:16 UTC 2025


On Mon, Feb 24, 2025 at 10:22:37AM +0100, Thomas Hellström wrote:
> Hi,
> 
> On Sun, 2025-02-23 at 20:05 -0800, Matthew Brost wrote:
> > Concurrent VM bind staging and zapping of PTEs from a userptr
> > notifier do not work because the view of PTEs is not stable. VM
> > binds cannot acquire the notifier lock during staging, as memory
> > allocations are required. To resolve this race condition, use a
> > staging tree for VM binds that is committed only under the userptr
> > notifier lock during the final step of the bind. This ensures a
> > consistent view of the PTEs in the userptr notifier.
> > 
> > A follow-up may use staging only for VMs in fault mode, as this is
> > the only mode in which the above race exists.
> > 
> > Suggested-by: Thomas Hellström <thomas.hellstrom at linux.intel.com>
> > Cc: <stable at vger.kernel.org>
> > Fixes: e8babb280b5e ("drm/xe: Convert multiple bind ops into single job")
> > Fixes: a708f6501c69 ("drm/xe: Update PT layer with better error handling")
> > Signed-off-by: Matthew Brost <matthew.brost at intel.com>
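
To spell out the ordering described above, the flow is roughly the
following (a simplified sketch; bind_userptr_vma(), stage_bind() and
commit_bind() are illustrative names for this sketch, not the actual
driver entry points):

	static int bind_userptr_vma(struct xe_vm *vm, struct xe_vma *vma)
	{
		int err;

		/*
		 * Staging must allocate memory, so the notifier lock
		 * cannot be held here. All updates land in the staging
		 * tree (pt_dir->staging[]) only; the live tree
		 * (pt_dir->children[]) is untouched.
		 */
		err = stage_bind(vma);
		if (err)
			return err;

		/*
		 * Publish staging[] -> children[] only while holding the
		 * userptr notifier lock, so a concurrent notifier always
		 * sees a stable view of the PTEs.
		 */
		down_read(&vm->userptr.notifier_lock);
		commit_bind(vma);
		up_read(&vm->userptr.notifier_lock);

		return 0;
	}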
> 
> This looks even simpler than what I initially thought. Ideally I think

I was pleasantly surprised by how simple this was.

> the staging tree would exist only under the vm lock during the multi
> bind-unbind operation, but that adds code complication and execution
> time compared to this solution which is more memory-hungry.
>

It does take a bit more memory, but I doubt that is a concern. We only
allocate an extra page of memory for each page directory, which is
pretty minor considering the smallest directory still maps 1G of
address space. As I mention in the commit message, we could even drop
the staging for non-faulting VMs.
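
For concreteness, the overhead is just the second pointer array in
struct xe_pt_dir (quoted in the patch below). The sizes here are my
arithmetic, assuming XE_PDES is 512 (one 4K page of 8-byte entries):

	struct xe_pt_dir {
		struct xe_pt pt;
		/* existing live tree: 512 pointers * 8 bytes = 4K */
		struct xe_ptw *children[XE_PDES];
		/* added by this patch: another 4K per directory */
		struct xe_ptw *staging[XE_PDES];
	};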
 
> Minor comment below.
> 
> > ---
> >  drivers/gpu/drm/xe/xe_pt.c      | 50 +++++++++++++++++++++++++--------
> >  drivers/gpu/drm/xe/xe_pt_walk.c |  3 +-
> >  drivers/gpu/drm/xe/xe_pt_walk.h |  4 +++
> >  3 files changed, 44 insertions(+), 13 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
> > index add521b5c6ae..118b81cd7205 100644
> > --- a/drivers/gpu/drm/xe/xe_pt.c
> > +++ b/drivers/gpu/drm/xe/xe_pt.c
> > @@ -28,6 +28,8 @@ struct xe_pt_dir {
> >  	struct xe_pt pt;
> >  	/** @children: Array of page-table child nodes */
> >  	struct xe_ptw *children[XE_PDES];
> > +	/** @staging: Array of page-table staging nodes */
> > +	struct xe_ptw *staging[XE_PDES];
> >  };
> >  
> >  #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)
> > @@ -50,7 +52,7 @@ static struct xe_pt_dir *as_xe_pt_dir(struct xe_pt *pt)
> >  
> >  static struct xe_pt *xe_pt_entry(struct xe_pt_dir *pt_dir, unsigned int index)
> >  {
> > -	return container_of(pt_dir->children[index], struct xe_pt, base);
> > +	return container_of(pt_dir->staging[index], struct xe_pt, base);
> 
> It looks like the only user here is with staging == true, but I'm a
> little worried about future users. Perhaps include a staging bool in
> the interface and assert that it is true? Or rename to
> xe_pt_entry_staging()?
> 

I had a bool originally but found that xe_pt_entry() is only called
during the staging steps, so I dropped it. I can rename it to
xe_pt_entry_staging() for clarity.
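
i.e. roughly (just the rename applied to the helper as it stands after
this patch; an assert could be added later if a non-staging user ever
shows up):

	static struct xe_pt *xe_pt_entry_staging(struct xe_pt_dir *pt_dir,
						 unsigned int index)
	{
		return container_of(pt_dir->staging[index], struct xe_pt,
				    base);
	}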

> 
> >  }
> >  
> >  static u64 __xe_pt_empty_pte(struct xe_tile *tile, struct xe_vm *vm,
> > @@ -125,6 +127,7 @@ struct xe_pt *xe_pt_create(struct xe_vm *vm, struct xe_tile *tile,
> >  	}
> >  	pt->bo = bo;
> >  	pt->base.children = level ? as_xe_pt_dir(pt)->children : NULL;
> > +	pt->base.staging = level ? as_xe_pt_dir(pt)->staging : NULL;
> >  
> >  	if (vm->xef)
> >  		xe_drm_client_add_bo(vm->xef->client, pt->bo);
> > @@ -377,8 +380,10 @@ xe_pt_insert_entry(struct xe_pt_stage_bind_walk *xe_walk, struct xe_pt *parent,
> >  		/* Continue building a non-connected subtree. */
> >  		struct iosys_map *map = &parent->bo->vmap;
> >  
> > -		if (unlikely(xe_child))
> > +		if (unlikely(xe_child)) {
> >  			parent->base.children[offset] = &xe_child->base;
> > +			parent->base.staging[offset] = &xe_child->base;
> > +		}
> >  
> >  		xe_pt_write(xe_walk->vm->xe, map, offset, pte);
> >  		parent->num_live++;
> > @@ -619,6 +624,7 @@ xe_pt_stage_bind(struct xe_tile *tile, struct xe_vma *vma,
> >  			.ops = &xe_pt_stage_bind_ops,
> >  			.shifts = xe_normal_pt_shifts,
> >  			.max_level = XE_PT_HIGHEST_LEVEL,
> > +			.staging = true,
> >  		},
> >  		.vm = xe_vma_vm(vma),
> >  		.tile = tile,
> > @@ -812,6 +818,7 @@ static const struct xe_pt_walk_ops xe_pt_zap_ptes_ops = {
> >  
> >  struct xe_pt_zap_ptes_flags {
> >  	bool scratch:1;
> > +	bool staging:1;
> >  };
> 
> As mentioned in the comments on the previous commit, I don't think
> zapping is needed when aborting a bind.
> 

Let me reply there, as I think there is still a bit of confusion.

Matt

> Otherwise LGTM
> /Thomas
> 
> >  
> >  static bool __xe_pt_zap_ptes(struct xe_tile *tile, struct xe_vma *vma,
> > @@ -822,6 +829,7 @@ static bool __xe_pt_zap_ptes(struct xe_tile *tile, struct xe_vma *vma,
> >  			.ops = &xe_pt_zap_ptes_ops,
> >  			.shifts = xe_normal_pt_shifts,
> >  			.max_level = XE_PT_HIGHEST_LEVEL,
> > +			.staging = flags.staging,
> >  		},
> >  		.tile = tile,
> >  		.vm = xe_vma_vm(vma),
> > @@ -905,7 +913,7 @@ static void xe_pt_cancel_bind(struct xe_vma *vma,
> >  	}
> >  }
> >  
> > -static void xe_pt_commit_locks_assert(struct xe_vma *vma)
> > +static void xe_pt_commit_prepare_locks_assert(struct xe_vma *vma)
> >  {
> >  	struct xe_vm *vm = xe_vma_vm(vma);
> >  
> > @@ -917,6 +925,16 @@ static void xe_pt_commit_locks_assert(struct xe_vma *vma)
> >  	xe_vm_assert_held(vm);
> >  }
> >  
> > +static void xe_pt_commit_locks_assert(struct xe_vma *vma)
> > +{
> > +	struct xe_vm *vm = xe_vma_vm(vma);
> > +
> > +	xe_pt_commit_prepare_locks_assert(vma);
> > +
> > +	if (xe_vma_is_userptr(vma))
> > +		lockdep_assert_held_read(&vm->userptr.notifier_lock);
> > +}
> > +
> >  static void xe_pt_commit(struct xe_vma *vma,
> >  			 struct xe_vm_pgtable_update *entries,
> >  			 u32 num_entries, struct llist_head *deferred)
> > @@ -927,13 +945,17 @@ static void xe_pt_commit(struct xe_vma *vma,
> >  
> >  	for (i = 0; i < num_entries; i++) {
> >  		struct xe_pt *pt = entries[i].pt;
> > +		struct xe_pt_dir *pt_dir;
> >  
> >  		if (!pt->level)
> >  			continue;
> >  
> > +		pt_dir = as_xe_pt_dir(pt);
> >  		for (j = 0; j < entries[i].qwords; j++) {
> >  			struct xe_pt *oldpte = entries[i].pt_entries[j].pt;
> > +			int j_ = j + entries[i].ofs;
> >  
> > +			pt_dir->children[j_] = pt_dir->staging[j_];
> >  			xe_pt_destroy(oldpte, xe_vma_vm(vma)->flags, deferred);
> >  		}
> >  	}
> > @@ -945,7 +967,7 @@ static void xe_pt_abort_bind(struct xe_vma *vma,
> >  {
> >  	int i, j;
> >  
> > -	xe_pt_commit_locks_assert(vma);
> > +	xe_pt_commit_prepare_locks_assert(vma);
> >  
> >  	for (i = num_entries - 1; i >= 0; --i) {
> >  		struct xe_pt *pt = entries[i].pt;
> > @@ -963,7 +985,7 @@ static void xe_pt_abort_bind(struct xe_vma *vma,
> >  			struct xe_pt *newpte = xe_pt_entry(pt_dir, j_);
> >  			struct xe_pt *oldpte = entries[i].pt_entries[j].pt;
> > 
> > -			pt_dir->children[j_] = oldpte ? &oldpte->base : 0;
> > +			pt_dir->staging[j_] = oldpte ? &oldpte->base : 0;
> >  			xe_pt_destroy(newpte, xe_vma_vm(vma)->flags, NULL);
> >  		}
> >  	}
> > @@ -975,7 +997,7 @@ static void xe_pt_commit_prepare_bind(struct xe_vma *vma,
> >  {
> >  	u32 i, j;
> >  
> > -	xe_pt_commit_locks_assert(vma);
> > +	xe_pt_commit_prepare_locks_assert(vma);
> >  
> >  	for (i = 0; i < num_entries; i++) {
> >  		struct xe_pt *pt = entries[i].pt;
> > @@ -996,7 +1018,7 @@ static void xe_pt_commit_prepare_bind(struct xe_vma *vma,
> >  			if (xe_pt_entry(pt_dir, j_))
> >  				oldpte = xe_pt_entry(pt_dir, j_);
> >  
> > -			pt_dir->children[j_] = &newpte->base;
> > +			pt_dir->staging[j_] = &newpte->base;
> >  			entries[i].pt_entries[j].pt = oldpte;
> >  		}
> >  	}
> > @@ -1237,7 +1259,10 @@ static void
> >  vma_convert_to_invalidation(struct xe_tile *tile, struct xe_vma *vma,
> >  			    struct xe_vm_pgtable_update_ops *pt_update)
> >  {
> > -	struct xe_pt_zap_ptes_flags flags = { .scratch = true, };
> > +	struct xe_pt_zap_ptes_flags flags = {
> > +		.scratch = true,
> > +		.staging = true,
> > +	};
> >  	int i, j, k;
> >  
> >  	/*
> > @@ -1590,6 +1615,7 @@ static unsigned int xe_pt_stage_unbind(struct xe_tile *tile, struct xe_vma *vma,
> >  			.ops = &xe_pt_stage_unbind_ops,
> >  			.shifts = xe_normal_pt_shifts,
> >  			.max_level = XE_PT_HIGHEST_LEVEL,
> > +			.staging = true,
> >  		},
> >  		.tile = tile,
> >  		.modified_start = xe_vma_start(vma),
> > @@ -1631,7 +1657,7 @@ static void xe_pt_abort_unbind(struct xe_vma *vma,
> >  {
> >  	int i, j;
> >  
> > -	xe_pt_commit_locks_assert(vma);
> > +	xe_pt_commit_prepare_locks_assert(vma);
> >  
> >  	for (i = num_entries - 1; i >= 0; --i) {
> >  		struct xe_vm_pgtable_update *entry = &entries[i];
> > @@ -1644,7 +1670,7 @@ static void xe_pt_abort_unbind(struct xe_vma *vma,
> >  			continue;
> >  
> >  		for (j = entry->ofs; j < entry->ofs + entry->qwords; j++)
> > -			pt_dir->children[j] =
> > +			pt_dir->staging[j] =
> >  				entries[i].pt_entries[j - entry->ofs].pt ?
> >  				&entries[i].pt_entries[j - entry->ofs].pt->base : NULL;
> >  	}
> > @@ -1657,7 +1683,7 @@ xe_pt_commit_prepare_unbind(struct xe_vma *vma,
> >  {
> >  	int i, j;
> >  
> > -	xe_pt_commit_locks_assert(vma);
> > +	xe_pt_commit_prepare_locks_assert(vma);
> >  
> >  	for (i = 0; i < num_entries; ++i) {
> >  		struct xe_vm_pgtable_update *entry = &entries[i];
> > @@ -1672,7 +1698,7 @@ xe_pt_commit_prepare_unbind(struct xe_vma *vma,
> >  		for (j = entry->ofs; j < entry->ofs + entry->qwords; j++) {
> >  			entry->pt_entries[j - entry->ofs].pt =
> >  				xe_pt_entry(pt_dir, j);
> > -			pt_dir->children[j] = NULL;
> > +			pt_dir->staging[j] = NULL;
> >  		}
> >  	}
> >  }
> > diff --git a/drivers/gpu/drm/xe/xe_pt_walk.c b/drivers/gpu/drm/xe/xe_pt_walk.c
> > index b8b3d2aea492..be602a763ff3 100644
> > --- a/drivers/gpu/drm/xe/xe_pt_walk.c
> > +++ b/drivers/gpu/drm/xe/xe_pt_walk.c
> > @@ -74,7 +74,8 @@ int xe_pt_walk_range(struct xe_ptw *parent, unsigned int level,
> >  		     u64 addr, u64 end, struct xe_pt_walk *walk)
> >  {
> >  	pgoff_t offset = xe_pt_offset(addr, level, walk);
> > -	struct xe_ptw **entries = parent->children ? parent->children : NULL;
> > +	struct xe_ptw **entries = walk->staging ? (parent->staging ?: NULL) :
> > +		(parent->children ?: NULL);
> >  	const struct xe_pt_walk_ops *ops = walk->ops;
> >  	enum page_walk_action action;
> >  	struct xe_ptw *child;
> > diff --git a/drivers/gpu/drm/xe/xe_pt_walk.h b/drivers/gpu/drm/xe/xe_pt_walk.h
> > index 5ecc4d2f0f65..5c02c244f7de 100644
> > --- a/drivers/gpu/drm/xe/xe_pt_walk.h
> > +++ b/drivers/gpu/drm/xe/xe_pt_walk.h
> > @@ -11,12 +11,14 @@
> >  /**
> >   * struct xe_ptw - base class for driver pagetable subclassing.
> >   * @children: Pointer to an array of children if any.
> > + * @staging: Pointer to an array of staging nodes if any.
> >   *
> >   * Drivers could subclass this, and if it's a page-directory, typically
> >   * embed an array of xe_ptw pointers.
> >   */
> >  struct xe_ptw {
> >  	struct xe_ptw **children;
> > +	struct xe_ptw **staging;
> >  };
> >  
> >  /**
> > @@ -41,6 +43,8 @@ struct xe_pt_walk {
> >  	 * as shared pagetables.
> >  	 */
> >  	bool shared_pt_mode;
> > +	/** @staging: Walk staging PT structure */
> > +	bool staging;
> >  };
> >  
> >  /**
> 

