[PATCH v2 13/32] drm/xe/svm: In case of atomic access ensure get_pages happens from vram

Ghimiray, Himal Prasad himal.prasad.ghimiray at intel.com
Mon Apr 21 06:29:30 UTC 2025



On 21-04-2025 10:28, Ghimiray, Himal Prasad wrote:
> 
> 
> On 17-04-2025 09:49, Matthew Brost wrote:
>> On Mon, Apr 07, 2025 at 03:47:00PM +0530, Himal Prasad Ghimiray wrote:
>>> Ranges can be invalidated between VRAM allocation and get_pages, so
>>> ensure the DMA mapping happens from VRAM only in case of atomic
>>> access. Retry 3 times before reporting the fault, to cover concurrent
>>> CPU/GPU access.
>>>
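
For context, a minimal sketch of the atomic-fault flow the commit message
describes (ctx.vram_only and xe_svm_range_get_pages() are taken from the
diff below; the error handling shown is an illustrative assumption, not
the actual gpusvm behaviour):

	struct drm_gpusvm_ctx ctx = {
		.vram_only = 1,	/* atomic access: map pages from VRAM only */
	};
	int err;

	/*
	 * An invalidation can race the VRAM allocation, so the range may be
	 * backed by system memory again by this point. With vram_only set,
	 * get_pages is assumed to fail rather than silently DMA-map system
	 * pages that cannot honor GPU atomics.
	 */
	err = xe_svm_range_get_pages(vm, range, &ctx);
	if (err)
		goto retry;	/* the retry must be bounded, see below */
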
>>
>> Again, I pulled this patch into a series which will minimally enable
>> atomics per UMD request. See the version of the patch [1] I landed on -
>> it is basically my review feedback. I took ownership but kept your SoB
>> as it is based on this patch. We will need another reviewer, though, as
>> we are both contributors, but feel free to comment there.
> 
> Thanks for the update. I will use the version you modified for the
> prefetch series too. It looks good to me.

Actually, I see a retry count check missing for get_pages in
https://patchwork.freedesktop.org/patch/649010/?series=147846&rev=2,
which might lead to an infinite loop of get_pages retries from VRAM. A
sketch of the missing bound follows.
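
Something along the lines of the bound in this patch would close that
hole. This is a sketch only, assuming a limit of 3 and an -EFAULT error
path; the landed version may structure it differently:

	int retry_count = 3;	/* assumed bound for concurrent CPU/GPU races */
	int err;

retry:
	if (--retry_count < 0) {
		drm_err(&vm->xe->drm,
			"Get pages failed, retry count exceeded, asid=%u\n",
			vm->usm.asid);
		return -EFAULT;	/* report the fault instead of looping */
	}

	err = xe_svm_range_get_pages(vm, range, &ctx);
	/* Corner where CPU mappings have changed: retry within the bound */
	if (err == -EOPNOTSUPP || err == -EFAULT || err == -EPERM)
		goto retry;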


>> Matt
>>
>> [1] https://patchwork.freedesktop.org/patch/649010/?series=147846&rev=2
>>
>>> Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
>>> ---
>>>   drivers/gpu/drm/xe/xe_svm.c | 43 ++++++++++++++++++++++++-------------
>>>   1 file changed, 28 insertions(+), 15 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c
>>> index f4ae3feaf9d3..7ec7ecd7eb1f 100644
>>> --- a/drivers/gpu/drm/xe/xe_svm.c
>>> +++ b/drivers/gpu/drm/xe/xe_svm.c
>>> @@ -778,11 +778,13 @@ int xe_svm_handle_pagefault(struct xe_vm *vm, struct xe_vma *vma,
>>>               IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR),
>>>           .check_pages_threshold = IS_DGFX(vm->xe) &&
>>>               IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR) ? SZ_64K : 0,
>>> +        .vram_only = 0,
>>>       };
>>>       struct xe_svm_range *range;
>>>       struct drm_exec exec;
>>>       struct dma_fence *fence;
>>>       struct xe_tile *tile = gt_to_tile(gt);
>>> +    int retry_count = 3;
>>>       ktime_t end = 0;
>>>       int err;
>>> @@ -792,6 +794,7 @@ int xe_svm_handle_pagefault(struct xe_vm *vm, struct xe_vma *vma,
>>>       xe_gt_stats_incr(gt, XE_GT_STATS_ID_SVM_PAGEFAULT_COUNT, 1);
>>>   retry:
>>> +    retry_count--;
>>>       /* Always process UNMAPs first so view SVM ranges is current */
>>>       err = xe_svm_garbage_collector(vm);
>>>       if (err)
>>> @@ -807,30 +810,40 @@ int xe_svm_handle_pagefault(struct xe_vm *vm, struct xe_vma *vma,
>>>       range_debug(range, "PAGE FAULT");
>>> -    /* XXX: Add migration policy, for now migrate range once */
>>> -    if (!range->skip_migrate &&
>>> -        xe_svm_range_needs_migrate_to_vram(range, vma, IS_DGFX(vm->xe))) {
>>> -        range->skip_migrate = true;
>>> -
>>> +    if (xe_svm_range_needs_migrate_to_vram(range, vma, IS_DGFX(vm->xe))) {
>>>           err = xe_svm_alloc_vram(vm, tile, range, &ctx);
>>>           if (err) {
>>> -            drm_dbg(&vm->xe->drm,
>>> -                "VRAM allocation failed, falling back to "
>>> -                "retrying fault, asid=%u, errno=%pe\n",
>>> -                vm->usm.asid, ERR_PTR(err));
>>> -            goto retry;
>>> +            if (retry_count) {
>>> +                drm_dbg(&vm->xe->drm,
>>> +                    "VRAM allocation failed, falling back to retrying fault, asid=%u, errno=%pe\n",
>>> +                    vm->usm.asid, ERR_PTR(err));
>>> +                goto retry;
>>> +            } else {
>>> +                drm_err(&vm->xe->drm,
>>> +                    "VRAM allocation failed, retry count exceeded, asid=%u, errno=%pe\n",
>>> +                    vm->usm.asid, ERR_PTR(err));
>>> +                return err;
>>> +            }
>>>           }
>>> +
>>>       }
>>> +    if (atomic)
>>> +        ctx.vram_only = 1;
>>> +
>>>       range_debug(range, "GET PAGES");
>>>       err = xe_svm_range_get_pages(vm, range, &ctx);
>>>       /* Corner where CPU mappings have changed */
>>>       if (err == -EOPNOTSUPP || err == -EFAULT || err == -EPERM) {
>>> -        drm_dbg(&vm->xe->drm,
>>> -            "Get pages failed, falling back to retrying, asid=%u, gpusvm=%p, errno=%pe\n",
>>> -            vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
>>> -        range_debug(range, "PAGE FAULT - RETRY PAGES");
>>> -        goto retry;
>>> +        if (retry_count) {
>>> +            drm_dbg(&vm->xe->drm, "Get pages failed, falling back to retrying, asid=%u, gpusvm=%p, errno=%pe\n",
>>> +                vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
>>> +            range_debug(range, "PAGE FAULT - RETRY PAGES");
>>> +            goto retry;
>>> +        } else {
>>> +            drm_err(&vm->xe->drm, "Get pages failed, retry count exceeded, asid=%u, errno=%pe\n",
>>> +                vm->usm.asid, ERR_PTR(err));
>>> +        }
>>>       }
>>>       if (err) {
>>>           range_debug(range, "PAGE FAULT - FAIL PAGE COLLECT");
>>> -- 
>>> 2.34.1
>>>
> 


