[PATCH v2 13/32] drm/xe/svm: In case of atomic access ensure get_pages happens from vram
Ghimiray, Himal Prasad
himal.prasad.ghimiray at intel.com
Mon Apr 21 06:29:30 UTC 2025
On 21-04-2025 10:28, Ghimiray, Himal Prasad wrote:
>
>
> On 17-04-2025 09:49, Matthew Brost wrote:
>> On Mon, Apr 07, 2025 at 03:47:00PM +0530, Himal Prasad Ghimiray wrote:
>>> Ranges can be invalidated in between vram allocation and get_pages;
>>> ensure the dma mapping happens from vram only in case of atomic
>>> access. Retry 3 times before reporting a fault in case of concurrent
>>> cpu/gpu access.
>>>
>>
>> Again I pulled this patch into a series which will minimally enable
>> atomics per UMD request. See the version of the patch [1] I landed on -
>> that is basically my review feedback. I took ownership but left SoB by
>> you as it is based on this patch. We will need another reviewer though as
>> we are both contributors, but feel free to comment there.
>
> Thanks for update. Will use the version you modified for the prefetch
> series too. It looks good to me.
Actually, I see a retry count check missing for get_pages in
https://patchwork.freedesktop.org/patch/649010/?series=147846&rev=2
which might lead to an infinite loop of get_pages retries from vram.
> >
>> Matt
>>
>> [1] https://patchwork.freedesktop.org/patch/649010/?series=147846&rev=2
>>
>>> Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
>>> ---
>>> drivers/gpu/drm/xe/xe_svm.c | 43 ++++++++++++++++++++++++-------------
>>> 1 file changed, 28 insertions(+), 15 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c
>>> index f4ae3feaf9d3..7ec7ecd7eb1f 100644
>>> --- a/drivers/gpu/drm/xe/xe_svm.c
>>> +++ b/drivers/gpu/drm/xe/xe_svm.c
>>> @@ -778,11 +778,13 @@ int xe_svm_handle_pagefault(struct xe_vm *vm,
>>> struct xe_vma *vma,
>>> IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR),
>>> .check_pages_threshold = IS_DGFX(vm->xe) &&
>>> IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR) ? SZ_64K : 0,
>>> + .vram_only = 0,
>>> };
>>> struct xe_svm_range *range;
>>> struct drm_exec exec;
>>> struct dma_fence *fence;
>>> struct xe_tile *tile = gt_to_tile(gt);
>>> + int retry_count = 3;
>>> ktime_t end = 0;
>>> int err;
>>> @@ -792,6 +794,7 @@ int xe_svm_handle_pagefault(struct xe_vm *vm,
>>> struct xe_vma *vma,
>>> xe_gt_stats_incr(gt, XE_GT_STATS_ID_SVM_PAGEFAULT_COUNT, 1);
>>> retry:
>>> + retry_count--;
>>> /* Always process UNMAPs first so view SVM ranges is current */
>>> err = xe_svm_garbage_collector(vm);
>>> if (err)
>>> @@ -807,30 +810,40 @@ int xe_svm_handle_pagefault(struct xe_vm *vm,
>>> struct xe_vma *vma,
>>> range_debug(range, "PAGE FAULT");
>>> - /* XXX: Add migration policy, for now migrate range once */
>>> - if (!range->skip_migrate &&
>>> - xe_svm_range_needs_migrate_to_vram(range, vma, IS_DGFX(vm-
>>> >xe))) {
>>> - range->skip_migrate = true;
>>> -
>>> + if (xe_svm_range_needs_migrate_to_vram(range, vma, IS_DGFX(vm-
>>> >xe))) {
>>> err = xe_svm_alloc_vram(vm, tile, range, &ctx);
>>> if (err) {
>>> - drm_dbg(&vm->xe->drm,
>>> - "VRAM allocation failed, falling back to "
>>> - "retrying fault, asid=%u, errno=%pe\n",
>>> - vm->usm.asid, ERR_PTR(err));
>>> - goto retry;
>>> + if (retry_count) {
>>> + drm_dbg(&vm->xe->drm,
>>> + "VRAM allocation failed, falling back to
>>> retrying fault, asid=%u, errno=%pe\n",
>>> + vm->usm.asid, ERR_PTR(err));
>>> + goto retry;
>>> + } else {
>>> + drm_err(&vm->xe->drm,
>>> + "VRAM allocation failed, retry count exceeded,
>>> asid=%u, errno=%pe\n",
>>> + vm->usm.asid, ERR_PTR(err));
>>> + return err;
>>> + }
>>> }
>>> +
>>> }
>>> + if (atomic)
>>> + ctx.vram_only = 1;
>>> +
>>> range_debug(range, "GET PAGES");
>>> err = xe_svm_range_get_pages(vm, range, &ctx);
>>> /* Corner where CPU mappings have changed */
>>> if (err == -EOPNOTSUPP || err == -EFAULT || err == -EPERM) {
>>> - drm_dbg(&vm->xe->drm,
>>> - "Get pages failed, falling back to retrying, asid=%u,
>>> gpusvm=%p, errno=%pe\n",
>>> - vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
>>> - range_debug(range, "PAGE FAULT - RETRY PAGES");
>>> - goto retry;
>>> + if (retry_count) {
>>> + drm_dbg(&vm->xe->drm, "Get pages failed, falling back to
>>> retrying, asid=%u, gpusvm=%p, errno=%pe\n",
>>> + vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
>>> + range_debug(range, "PAGE FAULT - RETRY PAGES");
>>> + goto retry;
>>> + } else {
>>> + drm_err(&vm->xe->drm, "Get pages failed,, retry count
>>> exceeded, asid=%u,, errno=%pe\n",
>>> + vm->usm.asid, ERR_PTR(err));
>>> + }
>>> }
>>> if (err) {
>>> range_debug(range, "PAGE FAULT - FAIL PAGE COLLECT");
>>> --
>>> 2.34.1
>>>
>
More information about the Intel-xe
mailing list