[PATCH v2 5/5] drm/xe/uapi, drm/xe: Make the PT code handle placement per PTE rather than per vma / range

Matthew Brost matthew.brost at intel.com
Tue Mar 25 17:57:04 UTC 2025


On Tue, Mar 25, 2025 at 03:23:14PM +0100, Thomas Hellström wrote:
> On Fri, 2025-03-21 at 10:16 -0700, Matthew Brost wrote:
> > On Fri, Mar 21, 2025 at 05:34:16PM +0100, Thomas Hellström wrote:
> > > With SVM, ranges forwarded to the PT code for binding can, mostly
> > > due to races when migrating, point to both VRAM and system / foreign
> > > device memory. Make the PT code able to handle that by checking,
> > > for each PTE set up, whether it points to local VRAM or to system
> > > memory.
> > > 
> > > The UAPI is changed implicitly in that before this patch,
> > > global atomics required a bo with VRAM/System placements. With
> > > this patch that is changed to requiring LR mode, and
> > > if the required placement is not available upon GPU atomic access
> > > pagefault, an error will be generated and the VM banned.
> > 
> > I think the queue will get banned if I'm reading the code correctly.
> > 
> > > 
> > > v2:
> > > - Fix system memory GPU atomic access.
> > > 
> > > Signed-off-by: Thomas Hellström <thomas.hellstrom at linux.intel.com>
> > > ---
> > >  drivers/gpu/drm/xe/xe_bo.c |  12 +++--
> > >  drivers/gpu/drm/xe/xe_pt.c | 106 ++++++++++++++++---------------------
> > >  2 files changed, 56 insertions(+), 62 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
> > > index 9d043d2c30fd..3c7c2353d3c8 100644
> > > --- a/drivers/gpu/drm/xe/xe_bo.c
> > > +++ b/drivers/gpu/drm/xe/xe_bo.c
> > > @@ -2116,10 +2116,16 @@ uint64_t vram_region_gpu_offset(struct ttm_resource *res)
> > >  {
> > >  	struct xe_device *xe = ttm_to_xe_device(res->bo->bdev);
> > >  
> > > -	if (res->mem_type == XE_PL_STOLEN)
> > > +	switch (res->mem_type) {
> > > +	case XE_PL_STOLEN:
> > >  		return xe_ttm_stolen_gpu_offset(xe);
> > > -
> > > -	return res_to_mem_region(res)->dpa_base;
> > > +	case XE_PL_TT:
> > > +	case XE_PL_SYSTEM:
> > > +		return 0;
> > > +	default:
> > > +		return res_to_mem_region(res)->dpa_base;
> > > +	}
> > > +	return 0;
> > >  }
> > >  
> > >  /**
> > > diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
> > > index 9e719535a3bb..3ffe135c27f1 100644
> > > --- a/drivers/gpu/drm/xe/xe_pt.c
> > > +++ b/drivers/gpu/drm/xe/xe_pt.c
> > > @@ -278,13 +278,15 @@ struct xe_pt_stage_bind_walk {
> > >  	struct xe_vm *vm;
> > >  	/** @tile: The tile we're building for. */
> > >  	struct xe_tile *tile;
> > > -	/** @default_pte: PTE flag only template. No address is associated */
> > > -	u64 default_pte;
> > > +	/** @default_vram_pte: PTE flag only template for VRAM. No address is associated */
> > > +	u64 default_vram_pte;
> > > +	/** @default_system_pte: PTE flag only template for system memory. No address is associated */
> > > +	u64 default_system_pte;
> > >  	/** @dma_offset: DMA offset to add to the PTE. */
> > >  	u64 dma_offset;
> > >  	/**
> > >  	 * @needs_64k: This address range enforces 64K alignment and
> > > -	 * granularity.
> > > +	 * granularity on VRAM.
> > >  	 */
> > >  	bool needs_64K;
> > >  	/**
> > > @@ -515,13 +517,16 @@ xe_pt_stage_bind_entry(struct xe_ptw *parent, pgoff_t offset,
> > >  	if (level == 0 || xe_pt_hugepte_possible(addr, next, level, xe_walk)) {
> > >  		struct xe_res_cursor *curs = xe_walk->curs;
> > >  		bool is_null = xe_vma_is_null(xe_walk->vma);
> > > +		bool is_vram = is_null ? false : xe_res_is_vram(curs);
> > 
> > So xe_res_is_vram is still per range / VMA. I assume a follow on will
> > change this to be per page? 
> 
> For VMAs it's per VMA, but for ranges it's per mapped page AFAICT (at
> least that's the intention).
> 

Ah, yes it is per page for range.
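For anyone following along, the per-PTE selection boils down to roughly the
standalone sketch below. This is not the driver code: the page_is_vram[]
array and the PTE_FLAG_* values are made up here to stand in for
xe_res_is_vram(curs) and the XE_PPGTT_PTE_DM / XE_USM_PPGTT_PTE_AE bits.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PTE_FLAG_DM (1ull << 11) /* stand-in for XE_PPGTT_PTE_DM */
#define PTE_FLAG_AE (1ull << 10) /* stand-in for XE_USM_PPGTT_PTE_AE */

int main(void)
{
	/* A range whose backing pages are mixed after a racing migration. */
	bool page_is_vram[4] = { true, true, false, true };
	uint64_t default_vram_pte = PTE_FLAG_DM | PTE_FLAG_AE;
	uint64_t default_system_pte = PTE_FLAG_AE; /* AE only if allowed on smem */
	int i;

	for (i = 0; i < 4; i++) {
		uint64_t pte = 0; /* address + PAT bits would go here */

		/* Placement is decided per PTE, not per range / VMA. */
		pte |= page_is_vram[i] ? default_vram_pte : default_system_pte;
		printf("pte[%d] flags: 0x%llx\n", i, (unsigned long long)pte);
	}
	return 0;
}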

> > 
> > Patch LGTM though. With that:
> > Reviewed-by: Matthew Brost <matthew.brost at intel.com>
> 
> Thanks. Does this hold for v3 as well? (I dropped the atomic UAPI
> change).
> 

Yes. The uAPI was the most confusing part of this patch; I reasoned it was
good, but I'm fine with doing that change in a follow-up after a bit more
thought.
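To recap the decision the patch encodes, here is a standalone sketch for
discussion, not the driver code; the vm_model struct below stands in for
IS_DGFX(), xe_vm_in_lr_mode() and info.has_device_atomics_on_smem.

#include <stdbool.h>
#include <stdio.h>

struct vm_model {
	bool is_dgfx;                /* discrete GPU                 */
	bool in_lr_mode;             /* long-running / compute mode  */
	bool device_atomics_on_smem; /* platform supports AE on smem */
};

/* GPU atomics always work against local VRAM. */
static bool atomic_for_vram(const struct vm_model *vm)
{
	return true;
}

/*
 * System memory: iGFX always allows it; on DGFX only non-LR (3D)
 * clients get it, and only if the platform supports it at all.
 */
static bool atomic_for_system(const struct vm_model *vm)
{
	return (!vm->is_dgfx || !vm->in_lr_mode) &&
		vm->device_atomics_on_smem;
}

int main(void)
{
	struct vm_model compute_dgfx = { true, true, true };
	struct vm_model gl_dgfx = { true, false, true };

	printf("DGFX compute: vram=%d system=%d\n",
	       atomic_for_vram(&compute_dgfx), atomic_for_system(&compute_dgfx));
	printf("DGFX 3D:      vram=%d system=%d\n",
	       atomic_for_vram(&gl_dgfx), atomic_for_system(&gl_dgfx));
	return 0;
}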

Matt 

> /Thomas
> 
> 
> > 
> > >  
> > >  		XE_WARN_ON(xe_walk->va_curs_start != addr);
> > >  
> > >  		pte = vm->pt_ops->pte_encode_vma(is_null ? 0 :
> > >  						 xe_res_dma(curs) + xe_walk->dma_offset,
> > >  						 xe_walk->vma, pat_index, level);
> > > -		pte |= xe_walk->default_pte;
> > > +		if (!is_null)
> > > +			pte |= is_vram ? xe_walk->default_vram_pte :
> > > +				xe_walk->default_system_pte;
> > >  
> > >  		/*
> > >  		 * Set the XE_PTE_PS64 hint if possible, otherwise if
> > > @@ -531,7 +536,7 @@ xe_pt_stage_bind_entry(struct xe_ptw *parent, pgoff_t offset,
> > >  			if (xe_pt_is_pte_ps64K(addr, next, xe_walk)) {
> > >  				xe_walk->vma->gpuva.flags |= XE_VMA_PTE_64K;
> > >  				pte |= XE_PTE_PS64;
> > > -			} else if (XE_WARN_ON(xe_walk->needs_64K)) {
> > > +			} else if (XE_WARN_ON(xe_walk->needs_64K && is_vram)) {
> > >  				return -EINVAL;
> > >  			}
> > >  		}
> > > @@ -603,6 +608,31 @@ static const struct xe_pt_walk_ops xe_pt_stage_bind_ops = {
> > >  	.pt_entry = xe_pt_stage_bind_entry,
> > >  };
> > >  
> > > +/* The GPU can always do atomics in VRAM */
> > > +static bool xe_atomic_for_vram(struct xe_vm *vm)
> > > +{
> > > +	return true;
> > > +}
> > > +
> > > +/*
> > > + * iGFX is always expected to be able to do atomics in system memory.
> > > + *
> > > + * For DGFX, 3D clients want to do atomics in system memory that are
> > > + * not coherent with CPU atomics. Compute clients want
> > > + * atomics that look coherent with CPU atomics. We
> > > + * distinguish the two by checking for LR mode. For
> > > + * compute we then disallow atomics in system memory.
> > > + * A compute attempt to perform atomics in system memory would
> > > + * then cause an unrecoverable page-fault in preempt-fence
> > > + * mode, but in fault mode the data would be migrated to VRAM
> > > + * for GPU atomics and to system for CPU atomics.
> > > + */
> > > +static bool xe_atomic_for_system(struct xe_vm *vm)
> > > +{
> > > +	return (!IS_DGFX(vm->xe) || !xe_vm_in_lr_mode(vm)) &&
> > > +		vm->xe->info.has_device_atomics_on_smem;
> > > +}
> > > +
> > >  /**
> > >   * xe_pt_stage_bind() - Build a disconnected page-table tree for a given address
> > >   * range.
> > > @@ -629,9 +659,8 @@ xe_pt_stage_bind(struct xe_tile *tile, struct xe_vma *vma,
> > >  {
> > >  	struct xe_device *xe = tile_to_xe(tile);
> > >  	struct xe_bo *bo = xe_vma_bo(vma);
> > > -	bool is_devmem = !xe_vma_is_userptr(vma) && bo &&
> > > -		(xe_bo_is_vram(bo) || xe_bo_is_stolen_devmem(bo));
> > >  	struct xe_res_cursor curs;
> > > +	struct xe_vm *vm = xe_vma_vm(vma);
> > >  	struct xe_pt_stage_bind_walk xe_walk = {
> > >  		.base = {
> > >  			.ops = &xe_pt_stage_bind_ops,
> > > @@ -639,7 +668,7 @@ xe_pt_stage_bind(struct xe_tile *tile, struct xe_vma *vma,
> > >  			.max_level = XE_PT_HIGHEST_LEVEL,
> > >  			.staging = true,
> > >  		},
> > > -		.vm = xe_vma_vm(vma),
> > > +		.vm = vm,
> > >  		.tile = tile,
> > >  		.curs = &curs,
> > >  		.va_curs_start = range ? range->base.itree.start :
> > > @@ -647,26 +676,22 @@ xe_pt_stage_bind(struct xe_tile *tile, struct xe_vma *vma,
> > >  		.vma = vma,
> > >  		.wupd.entries = entries,
> > >  	};
> > > -	struct xe_pt *pt = xe_vma_vm(vma)->pt_root[tile->id];
> > > +	struct xe_pt *pt = vm->pt_root[tile->id];
> > >  	int ret;
> > >  
> > >  	if (range) {
> > >  		/* Move this entire thing to xe_svm.c? */
> > > -		xe_svm_notifier_lock(xe_vma_vm(vma));
> > > +		xe_svm_notifier_lock(vm);
> > >  		if (!xe_svm_range_pages_valid(range)) {
> > >  			xe_svm_range_debug(range, "BIND PREPARE - RETRY");
> > > -			xe_svm_notifier_unlock(xe_vma_vm(vma));
> > > +			xe_svm_notifier_unlock(vm);
> > >  			return -EAGAIN;
> > >  		}
> > >  		if (xe_svm_range_has_dma_mapping(range)) {
> > >  			xe_res_first_dma(range->base.dma_addr, 0,
> > >  					 range->base.itree.last + 1 - range->base.itree.start,
> > >  					 &curs);
> > > -			is_devmem = xe_res_is_vram(&curs);
> > > -			if (is_devmem)
> > > -				xe_svm_range_debug(range, "BIND PREPARE - DMA VRAM");
> > > -			else
> > > -				xe_svm_range_debug(range, "BIND PREPARE - DMA");
> > > +			xe_svm_range_debug(range, "BIND PREPARE - MIXED");
> > >  		} else {
> > >  			xe_assert(xe, false);
> > >  		}
> > > @@ -674,54 +699,17 @@ xe_pt_stage_bind(struct xe_tile *tile, struct xe_vma *vma,
> > >  		 * Note, when unlocking the resource cursor dma addresses may become
> > >  		 * stale, but the bind will be aborted anyway at commit time.
> > >  		 */
> > > -		xe_svm_notifier_unlock(xe_vma_vm(vma));
> > > +		xe_svm_notifier_unlock(vm);
> > >  	}
> > >  
> > > -	xe_walk.needs_64K = (xe_vma_vm(vma)->flags & XE_VM_FLAG_64K) && is_devmem;
> > > -
> > > -	/**
> > > -	 * Default atomic expectations for different allocation scenarios are as follows:
> > > -	 *
> > > -	 * 1. Traditional API: When the VM is not in LR mode:
> > > -	 *    - Device atomics are expected to function with all allocations.
> > > -	 *
> > > -	 * 2. Compute/SVM API: When the VM is in LR mode:
> > > -	 *    - Device atomics are the default behavior when the bo is placed in a single region.
> > > -	 *    - In all other cases device atomics will be disabled with AE=0 until an application
> > > -	 *      request differently using a ioctl like madvise.
> > > -	 */
> > > +	xe_walk.needs_64K = (vm->flags & XE_VM_FLAG_64K);
> > >  	if (vma->gpuva.flags & XE_VMA_ATOMIC_PTE_BIT) {
> > > -		if (xe_vm_in_lr_mode(xe_vma_vm(vma))) {
> > > -			if (bo && xe_bo_has_single_placement(bo))
> > > -				xe_walk.default_pte |= XE_USM_PPGTT_PTE_AE;
> > > -			/**
> > > -			 * If a SMEM+LMEM allocation is backed by SMEM, a device
> > > -			 * atomics will cause a gpu page fault and which then
> > > -			 * gets migrated to LMEM, bind such allocations with
> > > -			 * device atomics enabled.
> > > -			 */
> > > -			else if (is_devmem)
> > > -				xe_walk.default_pte |= XE_USM_PPGTT_PTE_AE;
> > > -		} else {
> > > -			xe_walk.default_pte |= XE_USM_PPGTT_PTE_AE;
> > > -		}
> > > -
> > > -		/**
> > > -		 * Unset AE if the platform(PVC) doesn't support it on an
> > > -		 * allocation
> > > -		 */
> > > -		if (!xe->info.has_device_atomics_on_smem && !is_devmem)
> > > -			xe_walk.default_pte &= ~XE_USM_PPGTT_PTE_AE;
> > > +		xe_walk.default_vram_pte = xe_atomic_for_vram(vm) ? XE_USM_PPGTT_PTE_AE : 0;
> > > +		xe_walk.default_system_pte = xe_atomic_for_system(vm) ? XE_USM_PPGTT_PTE_AE : 0;
> > >  	}
> > >  
> > > -	if (is_devmem) {
> > > -		xe_walk.default_pte |= XE_PPGTT_PTE_DM;
> > > -		xe_walk.dma_offset = bo ? vram_region_gpu_offset(bo->ttm.resource) : 0;
> > > -	}
> > > -
> > > -	if (!xe_vma_has_no_bo(vma) && xe_bo_is_stolen(bo))
> > > -		xe_walk.dma_offset = xe_ttm_stolen_gpu_offset(xe_bo_device(bo));
> > > -
> > > +	xe_walk.default_vram_pte |= XE_PPGTT_PTE_DM;
> > > +	xe_walk.dma_offset = bo ? vram_region_gpu_offset(bo->ttm.resource) : 0;
> > >  	if (!range)
> > >  		xe_bo_assert_held(bo);
> > >  
> > > -- 
> > > 2.48.1
> > > 
> 

