[PATCH v2 13/32] drm/xe/svm: In case of atomic access ensure get_pages happens from vram

Matthew Brost matthew.brost at intel.com
Tue Apr 22 15:27:51 UTC 2025


On Mon, Apr 21, 2025 at 10:28:25AM +0530, Ghimiray, Himal Prasad wrote:
> 
> 
> On 17-04-2025 09:49, Matthew Brost wrote:
> > On Mon, Apr 07, 2025 at 03:47:00PM +0530, Himal Prasad Ghimiray wrote:
> > > Ranges can be invalidated between VRAM allocation and get_pages;
> > > in the case of atomic access, ensure the DMA mapping happens from
> > > VRAM only. Retry 3 times before failing the fault in case of
> > > concurrent CPU/GPU access.
> > > 
> > 
> > Again, I pulled this patch into a series which will minimally enable
> > atomics per UMD request. See the version of the patch [1] I landed on -
> > that is basically my review feedback. I took ownership but left the SoB
> > by you as it is based on this patch. We will need another reviewer
> > though, as we are both contributors, but feel free to comment there.
> 
> Thanks for the update. Will use the version you modified for the prefetch
> series too. It looks good to me.

I'm going to repost today with your feedback addressed. I'd pull these
patches in as the first patches in your series to avoid conflicts if
those patches merge ahead of your series - I believe we are going to try
to land those patches in 6.15 as fixes, since the UMD doesn't really work
without them.
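
For anyone skimming the diff quoted below, the control flow the commit
message describes boils down to roughly the following standalone sketch.
This is a simplified illustration, not the kernel code: try_alloc_vram()
and try_get_pages() are hypothetical stand-ins for xe_svm_alloc_vram()
and xe_svm_range_get_pages(), and the vram_only argument stands in for
ctx.vram_only, which the patch sets when the fault is atomic.

/*
 * Simplified, standalone sketch of the retry flow -- not the kernel
 * code. try_alloc_vram() and try_get_pages() are hypothetical
 * stand-ins for xe_svm_alloc_vram() and xe_svm_range_get_pages().
 */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define MAX_RETRIES 3

static int try_alloc_vram(int attempt)
{
	/* Pretend allocation succeeds only on the final attempt. */
	return attempt < 2 ? -ENOMEM : 0;
}

static int try_get_pages(bool vram_only)
{
	/* Pretend the CPU mappings stayed stable. */
	return 0;
}

static int handle_fault(bool atomic)
{
	int retry_count = MAX_RETRIES;
	int attempt = 0;
	int err;

retry:
	retry_count--;

	err = try_alloc_vram(attempt++);
	if (err) {
		if (retry_count)
			goto retry;	/* transient failure: retry the fault */
		return err;		/* retry budget exhausted: fail loudly */
	}

	/* Atomic access must be served from VRAM only. */
	err = try_get_pages(atomic);
	if (err == -EFAULT && retry_count)
		goto retry;		/* CPU mappings changed under us */

	return err;
}

int main(void)
{
	printf("fault result: %d\n", handle_fault(true));
	return 0;
}

The key point the sketch captures: the retry counter is decremented at
the retry label, so both the allocation path and the get_pages path draw
from the same budget of 3 attempts before the fault is reported.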

Matt

> >
> > Matt
> > 
> > [1] https://patchwork.freedesktop.org/patch/649010/?series=147846&rev=2
> > 
> > > Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
> > > ---
> > >   drivers/gpu/drm/xe/xe_svm.c | 43 ++++++++++++++++++++++++-------------
> > >   1 file changed, 28 insertions(+), 15 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c
> > > index f4ae3feaf9d3..7ec7ecd7eb1f 100644
> > > --- a/drivers/gpu/drm/xe/xe_svm.c
> > > +++ b/drivers/gpu/drm/xe/xe_svm.c
> > > @@ -778,11 +778,13 @@ int xe_svm_handle_pagefault(struct xe_vm *vm, struct xe_vma *vma,
> > >   			IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR),
> > >   		.check_pages_threshold = IS_DGFX(vm->xe) &&
> > >   			IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR) ? SZ_64K : 0,
> > > +		.vram_only = 0,
> > >   	};
> > >   	struct xe_svm_range *range;
> > >   	struct drm_exec exec;
> > >   	struct dma_fence *fence;
> > >   	struct xe_tile *tile = gt_to_tile(gt);
> > > +	int retry_count = 3;
> > >   	ktime_t end = 0;
> > >   	int err;
> > > @@ -792,6 +794,7 @@ int xe_svm_handle_pagefault(struct xe_vm *vm, struct xe_vma *vma,
> > >   	xe_gt_stats_incr(gt, XE_GT_STATS_ID_SVM_PAGEFAULT_COUNT, 1);
> > >   retry:
> > > +	retry_count--;
> > >   	/* Always process UNMAPs first so view SVM ranges is current */
> > >   	err = xe_svm_garbage_collector(vm);
> > >   	if (err)
> > > @@ -807,30 +810,40 @@ int xe_svm_handle_pagefault(struct xe_vm *vm, struct xe_vma *vma,
> > >   	range_debug(range, "PAGE FAULT");
> > > -	/* XXX: Add migration policy, for now migrate range once */
> > > -	if (!range->skip_migrate &&
> > > -	    xe_svm_range_needs_migrate_to_vram(range, vma, IS_DGFX(vm->xe))) {
> > > -		range->skip_migrate = true;
> > > -
> > > +	if (xe_svm_range_needs_migrate_to_vram(range, vma, IS_DGFX(vm->xe))) {
> > >   		err = xe_svm_alloc_vram(vm, tile, range, &ctx);
> > >   		if (err) {
> > > -			drm_dbg(&vm->xe->drm,
> > > -				"VRAM allocation failed, falling back to "
> > > -				"retrying fault, asid=%u, errno=%pe\n",
> > > -				vm->usm.asid, ERR_PTR(err));
> > > -			goto retry;
> > > +			if (retry_count) {
> > > +				drm_dbg(&vm->xe->drm,
> > > +					"VRAM allocation failed, falling back to retrying fault, asid=%u, errno=%pe\n",
> > > +					vm->usm.asid, ERR_PTR(err));
> > > +				goto retry;
> > > +			} else {
> > > +				drm_err(&vm->xe->drm,
> > > +					"VRAM allocation failed, retry count exceeded, asid=%u, errno=%pe\n",
> > > +					vm->usm.asid, ERR_PTR(err));
> > > +				return err;
> > > +			}
> > >   		}
> > > +
> > >   	}
> > > +	if (atomic)
> > > +		ctx.vram_only = 1;
> > > +
> > >   	range_debug(range, "GET PAGES");
> > >   	err = xe_svm_range_get_pages(vm, range, &ctx);
> > >   	/* Corner where CPU mappings have changed */
> > >   	if (err == -EOPNOTSUPP || err == -EFAULT || err == -EPERM) {
> > > -		drm_dbg(&vm->xe->drm,
> > > -			"Get pages failed, falling back to retrying, asid=%u, gpusvm=%p, errno=%pe\n",
> > > -			vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
> > > -		range_debug(range, "PAGE FAULT - RETRY PAGES");
> > > -		goto retry;
> > > +		if (retry_count) {
> > > +			drm_dbg(&vm->xe->drm, "Get pages failed, falling back to retrying, asid=%u, gpusvm=%p, errno=%pe\n",
> > > +				vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
> > > +			range_debug(range, "PAGE FAULT - RETRY PAGES");
> > > +			goto retry;
> > > +		} else {
> > > +			drm_err(&vm->xe->drm, "Get pages failed,, retry count exceeded, asid=%u,, errno=%pe\n",
> > > +				vm->usm.asid, ERR_PTR(err));
> > > +		}
> > >   	}
> > >   	if (err) {
> > >   		range_debug(range, "PAGE FAULT - FAIL PAGE COLLECT");
> > > -- 
> > > 2.34.1
> > > 
> 