[PATCH v2 13/32] drm/xe/svm: In case of atomic access ensure get_pages happens from vram
Matthew Brost
matthew.brost at intel.com
Tue Apr 22 15:25:41 UTC 2025
On Mon, Apr 21, 2025 at 11:59:30AM +0530, Ghimiray, Himal Prasad wrote:
>
>
> On 21-04-2025 10:28, Ghimiray, Himal Prasad wrote:
> >
> >
> > On 17-04-2025 09:49, Matthew Brost wrote:
> > > On Mon, Apr 07, 2025 at 03:47:00PM +0530, Himal Prasad Ghimiray wrote:
> > > > Ranges can be invalidated in between vram allocation and get_pages,
> > > > ensure the dma mapping is happening from vram only in case of atomic
> > > > access. Retry 3 times before calling out fault in case of concurrent
> > > > cpu/gpu access.
> > > >
> > >
> > > Again I pulled this patch into a series which will minimally enable
> > > atomics per UMD request. See the version of the patch [1] I landed on -
> > > that is basically my review feedback. I took ownership but left SoB by
> > > you as it based on this patch. We will need another reviewer though as
> > > we are both contributors but feel to comment there.
> >
> > Thanks for update. Will use the version you modified for the prefetch
> > series too. It looks good to me.
>
> Actually, I see a retry count check missing for get_pages in
> https://patchwork.freedesktop.org/patch/649010/?series=147846&rev=2 which
> might lead to infinite loop of get_pages retry from vram.
>
Ah, yes indeed that is needed too.
We need something like this on a get_pages() failure.
if (vram_only && migrate_retry_count <= 0)
bail
else
retry
Matt
>
> > >
> > > Matt
> > >
> > > [1] https://patchwork.freedesktop.org/patch/649010/?series=147846&rev=2
> > >
> > > > Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
> > > > ---
> > > > drivers/gpu/drm/xe/xe_svm.c | 43 ++++++++++++++++++++++++-------------
> > > > 1 file changed, 28 insertions(+), 15 deletions(-)
> > > >
> > > > diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c
> > > > index f4ae3feaf9d3..7ec7ecd7eb1f 100644
> > > > --- a/drivers/gpu/drm/xe/xe_svm.c
> > > > +++ b/drivers/gpu/drm/xe/xe_svm.c
> > > > @@ -778,11 +778,13 @@ int xe_svm_handle_pagefault(struct xe_vm
> > > > *vm, struct xe_vma *vma,
> > > > IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR),
> > > > .check_pages_threshold = IS_DGFX(vm->xe) &&
> > > > IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR) ? SZ_64K : 0,
> > > > + .vram_only = 0,
> > > > };
> > > > struct xe_svm_range *range;
> > > > struct drm_exec exec;
> > > > struct dma_fence *fence;
> > > > struct xe_tile *tile = gt_to_tile(gt);
> > > > + int retry_count = 3;
> > > > ktime_t end = 0;
> > > > int err;
> > > > @@ -792,6 +794,7 @@ int xe_svm_handle_pagefault(struct xe_vm
> > > > *vm, struct xe_vma *vma,
> > > > xe_gt_stats_incr(gt, XE_GT_STATS_ID_SVM_PAGEFAULT_COUNT, 1);
> > > > retry:
> > > > + retry_count--;
> > > > /* Always process UNMAPs first so view SVM ranges is current */
> > > > err = xe_svm_garbage_collector(vm);
> > > > if (err)
> > > > @@ -807,30 +810,40 @@ int xe_svm_handle_pagefault(struct xe_vm
> > > > *vm, struct xe_vma *vma,
> > > > range_debug(range, "PAGE FAULT");
> > > > - /* XXX: Add migration policy, for now migrate range once */
> > > > - if (!range->skip_migrate &&
> > > > - xe_svm_range_needs_migrate_to_vram(range, vma,
> > > > IS_DGFX(vm- >xe))) {
> > > > - range->skip_migrate = true;
> > > > -
> > > > + if (xe_svm_range_needs_migrate_to_vram(range, vma,
> > > > IS_DGFX(vm- >xe))) {
> > > > err = xe_svm_alloc_vram(vm, tile, range, &ctx);
> > > > if (err) {
> > > > - drm_dbg(&vm->xe->drm,
> > > > - "VRAM allocation failed, falling back to "
> > > > - "retrying fault, asid=%u, errno=%pe\n",
> > > > - vm->usm.asid, ERR_PTR(err));
> > > > - goto retry;
> > > > + if (retry_count) {
> > > > + drm_dbg(&vm->xe->drm,
> > > > + "VRAM allocation failed, falling back to
> > > > retrying fault, asid=%u, errno=%pe\n",
> > > > + vm->usm.asid, ERR_PTR(err));
> > > > + goto retry;
> > > > + } else {
> > > > + drm_err(&vm->xe->drm,
> > > > + "VRAM allocation failed, retry count
> > > > exceeded, asid=%u, errno=%pe\n",
> > > > + vm->usm.asid, ERR_PTR(err));
> > > > + return err;
> > > > + }
> > > > }
> > > > +
> > > > }
> > > > + if (atomic)
> > > > + ctx.vram_only = 1;
> > > > +
> > > > range_debug(range, "GET PAGES");
> > > > err = xe_svm_range_get_pages(vm, range, &ctx);
> > > > /* Corner where CPU mappings have changed */
> > > > if (err == -EOPNOTSUPP || err == -EFAULT || err == -EPERM) {
> > > > - drm_dbg(&vm->xe->drm,
> > > > - "Get pages failed, falling back to retrying,
> > > > asid=%u, gpusvm=%p, errno=%pe\n",
> > > > - vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
> > > > - range_debug(range, "PAGE FAULT - RETRY PAGES");
> > > > - goto retry;
> > > > + if (retry_count) {
> > > > + drm_dbg(&vm->xe->drm, "Get pages failed, falling
> > > > back to retrying, asid=%u, gpusvm=%p, errno=%pe\n",
> > > > + vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
> > > > + range_debug(range, "PAGE FAULT - RETRY PAGES");
> > > > + goto retry;
> > > > + } else {
> > > > + drm_err(&vm->xe->drm, "Get pages failed, retry
> > > > count exceeded, asid=%u, errno=%pe\n",
> > > > + vm->usm.asid, ERR_PTR(err));
> > > > + }
> > > > }
> > > > if (err) {
> > > > range_debug(range, "PAGE FAULT - FAIL PAGE COLLECT");
> > > > --
> > > > 2.34.1
> > > >
> >
>
More information about the Intel-xe
mailing list