[PATCH v2 23/32] drm/xe: Implement madvise ioctl for xe

Ghimiray, Himal Prasad himal.prasad.ghimiray at intel.com
Wed May 28 05:22:59 UTC 2025



On 20-05-2025 15:45, Ghimiray, Himal Prasad wrote:
> 
> 
> On 15-05-2025 03:11, Matthew Brost wrote:
>> On Mon, Apr 07, 2025 at 03:47:10PM +0530, Himal Prasad Ghimiray wrote:
>>> This driver-specific ioctl enables UMDs to control the memory attributes
>>> for GPU VMAs within a specified input range. If the start or end
>>> addresses fall within an existing VMA, the VMA is split accordingly. The
>>> attributes of the VMA are modified as provided by the users. The old
>>> mappings of the VMAs are invalidated, and TLB invalidation is performed
>>> if necessary.
>>>
>>> Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
>>> ---
>>>   drivers/gpu/drm/xe/Makefile        |   1 +
>>>   drivers/gpu/drm/xe/xe_device.c     |   2 +
>>>   drivers/gpu/drm/xe/xe_vm_madvise.c | 309 +++++++++++++++++++++++++++++
>>>   drivers/gpu/drm/xe/xe_vm_madvise.h |  15 ++
>>>   4 files changed, 327 insertions(+)
>>>   create mode 100644 drivers/gpu/drm/xe/xe_vm_madvise.c
>>>   create mode 100644 drivers/gpu/drm/xe/xe_vm_madvise.h
>>>
>>> diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
>>> index e4fec90bab55..3e83ae8b9dc1 100644
>>> --- a/drivers/gpu/drm/xe/Makefile
>>> +++ b/drivers/gpu/drm/xe/Makefile
>>> @@ -117,6 +117,7 @@ xe-y += xe_bb.o \
>>>       xe_uc.o \
>>>       xe_uc_fw.o \
>>>       xe_vm.o \
>>> +    xe_vm_madvise.o \
>>>       xe_vram.o \
>>>       xe_vram_freq.o \
>>>       xe_vsec.o \
>>> diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
>>> index d8e227ddf255..3e57300014bf 100644
>>> --- a/drivers/gpu/drm/xe/xe_device.c
>>> +++ b/drivers/gpu/drm/xe/xe_device.c
>>> @@ -60,6 +60,7 @@
>>>   #include "xe_ttm_stolen_mgr.h"
>>>   #include "xe_ttm_sys_mgr.h"
>>>   #include "xe_vm.h"
>>> +#include "xe_vm_madvise.h"
>>>   #include "xe_vram.h"
>>>   #include "xe_vsec.h"
>>>   #include "xe_wait_user_fence.h"
>>> @@ -196,6 +197,7 @@ static const struct drm_ioctl_desc xe_ioctls[] = {
>>>       DRM_IOCTL_DEF_DRV(XE_WAIT_USER_FENCE, xe_wait_user_fence_ioctl,
>>>                 DRM_RENDER_ALLOW),
>>>       DRM_IOCTL_DEF_DRV(XE_OBSERVATION, xe_observation_ioctl, DRM_RENDER_ALLOW),
>>> +    DRM_IOCTL_DEF_DRV(XE_MADVISE, xe_vm_madvise_ioctl, DRM_RENDER_ALLOW),
>>>   };
>>>   static long xe_drm_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
>>> diff --git a/drivers/gpu/drm/xe/xe_vm_madvise.c b/drivers/gpu/drm/xe/xe_vm_madvise.c
>>> new file mode 100644
>>> index 000000000000..ef50031649e0
>>> --- /dev/null
>>> +++ b/drivers/gpu/drm/xe/xe_vm_madvise.c
>>> @@ -0,0 +1,309 @@
>>> +// SPDX-License-Identifier: MIT
>>> +/*
>>> + * Copyright © 2024 Intel Corporation
>>> + */
>>> +
>>> +#include "xe_vm_madvise.h"
>>> +
>>> +#include <linux/nospec.h>
>>> +#include <drm/ttm/ttm_tt.h>
>>> +#include <drm/xe_drm.h>
>>> +
>>> +#include "xe_bo.h"
>>> +#include "xe_gt_tlb_invalidation.h"
>>> +#include "xe_pt.h"
>>> +#include "xe_svm.h"
>>> +
>>> +static struct xe_vma **get_vmas(struct xe_vm *vm, int *num_vmas,
>>> +                u64 addr, u64 range)
>>> +{
>>> +    struct xe_vma **vmas, **__vmas;
>>> +    struct drm_gpuva *gpuva;
>>> +    int max_vmas = 8;
>>> +
>>> +    lockdep_assert_held(&vm->lock);
>>
>> lockdep_assert_held_write
> 
> ok
> 
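For completeness, the ioctl itself takes the vm lock with down_write(&vm->lock)
(see below in this patch), so the write-side assert is the accurate one here,
i.e.:

	lockdep_assert_held_write(&vm->lock);
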
>>
>>> +
>>> +    *num_vmas = 0;
>>> +    vmas = kmalloc_array(max_vmas, sizeof(*vmas), GFP_KERNEL);
>>> +    if (!vmas)
>>> +        return NULL;
>>> +
>>> +    vm_dbg(&vm->xe->drm, "VMA's in range: start=0x%016llx, end=0x%016llx", addr, addr + range);
>>> +
>>> +    drm_gpuvm_for_each_va_range(gpuva, &vm->gpuvm, addr, addr + range) {
>>> +        struct xe_vma *vma = gpuva_to_vma(gpuva);
>>> +
>>> +        if (*num_vmas == max_vmas) {
>>> +            max_vmas <<= 1;
>>> +            __vmas = krealloc(vmas, max_vmas * sizeof(*vmas), GFP_KERNEL);
>>> +            if (!__vmas) {
>>> +                kfree(vmas);
>>> +                return NULL;
>>> +            }
>>> +            vmas = __vmas;
>>> +        }
>>> +
>>> +        vmas[*num_vmas] = vma;
>>> +        (*num_vmas)++;
>>> +    }
>>> +
>>> +    vm_dbg(&vm->xe->drm, "*num_vmas = %d\n", *num_vmas);
>>> +
>>> +    if (!*num_vmas) {
>>> +        kfree(vmas);
>>> +        return NULL;
>>> +    }
>>> +
>>> +    return vmas;
>>> +}
>>> +
>>> +static int madvise_preferred_mem_loc(struct xe_device *xe, struct xe_vm *vm,
>>> +                     struct xe_vma **vmas, int num_vmas,
>>> +                     struct drm_xe_madvise_ops ops)
>>> +{
>>> +    /* Implementation pending */
>>> +    return 0;
>>> +}
>>> +
>>> +static int madvise_atomic(struct xe_device *xe, struct xe_vm *vm,
>>> +              struct xe_vma **vmas, int num_vmas,
>>> +              struct drm_xe_madvise_ops ops)
>>> +{
>>> +    /* Implementation pending */
>>> +    return 0;
>>> +}
>>> +
>>> +static int madvise_pat_index(struct xe_device *xe, struct xe_vm *vm,
>>> +                 struct xe_vma **vmas, int num_vmas,
>>> +                 struct drm_xe_madvise_ops ops)
>>> +{
>>> +    /* Implementation pending */
>>> +    return 0;
>>> +}
>>> +
>>> +static int madvise_purgeable_state(struct xe_device *xe, struct xe_vm *vm,
>>> +                   struct xe_vma **vmas, int num_vmas,
>>> +                   struct drm_xe_madvise_ops ops)
>>> +{
>>> +    /* Implementation pending */
>>> +    return 0;
>>> +}
>>> +
>>> +typedef int (*madvise_func)(struct xe_device *xe, struct xe_vm *vm,
>>> +                struct xe_vma **vmas, int num_vmas, struct drm_xe_madvise_ops ops);
>>> +
>>> +static const madvise_func madvise_funcs[] = {
>>> +    [DRM_XE_VMA_ATTR_PREFERRED_LOC] = madvise_preferred_mem_loc,
>>> +    [DRM_XE_VMA_ATTR_ATOMIC] = madvise_atomic,
>>> +    [DRM_XE_VMA_ATTR_PAT] = madvise_pat_index,
>>> +    [DRM_XE_VMA_ATTR_PURGEABLE_STATE] = madvise_purgeable_state,
>>> +};
>>> +
>>> +static void xe_zap_ptes_in_madvise_range(struct xe_vm *vm, u64 start, u64 end, u8 *tile_mask)
>>> +{
>>> +    struct drm_gpusvm_notifier *notifier;
>>> +    struct drm_gpuva *gpuva;
>>> +    struct xe_svm_range *range;
>>> +    struct xe_tile *tile;
>>> +    u64 adj_start, adj_end;
>>> +    u8 id;
>>> +
>>> +    lockdep_assert_held(&vm->lock);
>>
>> lockdep_assert_held_write
>>
>>> +
> 
> ok
> 
>>
>>     /* Waiting on pending binds */
>>
>>> +    if (dma_resv_wait_timeout(xe_vm_resv(vm), DMA_RESV_USAGE_BOOKKEEP,
>>> +                  false, MAX_SCHEDULE_TIMEOUT) <= 0)
>>> +        XE_WARN_ON(1);
>>> +
>>> +    down_write(&vm->svm.gpusvm.notifier_lock);
>>> +
>>
>> xe_svm_notifier_lock
> 
> ok


While testing I remembered that xe_svm_notifier_lock() only takes the read
lock, whereas xe_pt_zap_ptes_range() needs the notifier lock held for write.
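
So for now this has to stay as raw down_write()/up_write() on the rwsem, or
grow write-side wrappers. A rough sketch of the latter (the wrapper names are
made up here, not existing API; the field path is the one this patch already
uses):

static inline void xe_svm_notifier_lock_write(struct xe_vm *vm)
{
	/* xe_pt_zap_ptes_range() requires the notifier lock held for write */
	down_write(&vm->svm.gpusvm.notifier_lock);
}

static inline void xe_svm_notifier_unlock_write(struct xe_vm *vm)
{
	up_write(&vm->svm.gpusvm.notifier_lock);
}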


> 
>>
>>> +    drm_gpusvm_for_each_notifier(notifier, &vm->svm.gpusvm, start, end) {
>>> +        struct drm_gpusvm_range *r = NULL;
>>> +
>>> +        adj_start = max(start, notifier->itree.start);
>>> +        adj_end = min(end, notifier->itree.last + 1);
>>> +        drm_gpusvm_for_each_range(r, notifier, adj_start, adj_end) {
>>> +            range = to_xe_range(r);
>>> +            for_each_tile(tile, vm->xe, id) {
>>> +                if (xe_pt_zap_ptes_range(tile, vm, range)) {
>>> +                    *tile_mask |= BIT(id);
>>> +                    range->tile_invalidated |= BIT(id);
>>> +                }
>>> +            }
>>> +        }
>>> +    }
>>> +
>>> +    up_write(&vm->svm.gpusvm.notifier_lock);
>>> +
>>
>> xe_svm_notifier_unlock
>>
> 
> Hmm
> 
>>> +    drm_gpuvm_for_each_va_range(gpuva, &vm->gpuvm, start, end) {
>>> +        struct xe_vma *vma = gpuva_to_vma(gpuva);
>>> +
>>> +        if (xe_vma_is_cpu_addr_mirror(vma))
>>> +            continue;
>>> +
>>> +        if (xe_vma_is_userptr(vma)) {
>>> +            WARN_ON_ONCE(!mmu_interval_check_retry
>>> +                    (&to_userptr_vma(vma)->userptr.notifier,
>>> +                     to_userptr_vma(vma)->userptr.notifier_seq));
>>> +
>>> +            WARN_ON_ONCE(!dma_resv_test_signaled(xe_vm_resv(xe_vma_vm(vma)),
>>> +                                 DMA_RESV_USAGE_BOOKKEEP));
>>> +        }
>>> +
>>> +        if (xe_vma_bo(vma))
>>> +            xe_bo_lock(xe_vma_bo(vma), false);
>>> +
>>
>> Do you need the BO's dma-resv lock here? I don't think you do. Maybe
>> double check with Thomas on this one as I could be forgetting something here.
> 
> Sure
> 
>>
>>> +        for_each_tile(tile, vm->xe, id) {
>>> +            if (xe_pt_zap_ptes(tile, vma))
>>> +                *tile_mask |= BIT(id);
>>> +        }
>>> +
>>> +        if (xe_vma_bo(vma))
>>> +            xe_bo_unlock(xe_vma_bo(vma));
>>> +    }
>>> +}
>>> +
>>> +static void xe_vm_invalidate_madvise_range(struct xe_vm *vm, u64 start, u64 end)
>>> +{
>>> +    struct xe_gt_tlb_invalidation_fence
>>> +        fence[XE_MAX_TILES_PER_DEVICE * XE_MAX_GT_PER_TILE];
>>> +    struct xe_tile *tile;
>>> +    u32 fence_id = 0;
>>> +    u8 tile_mask = 0;
>>> +    u8 id;
>>> +
>>> +    xe_zap_ptes_in_madvise_range(vm, start, end, &tile_mask);
>>> +    if (!tile_mask)
>>> +        return;
>>> +
>>> +    xe_device_wmb(vm->xe);
>>> +
>>
>> We have the below pattern in a few places in the driver. I wonder if it's
>> time for a helper?
> 
> Makes sense
> 
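Something like this could be the shared piece, reusing the signatures already
used below (the helper name is made up):

static int xe_gt_tlb_inval_range_with_fence(struct xe_gt *gt,
					    struct xe_gt_tlb_invalidation_fence *fence,
					    u64 start, u64 end, u32 asid)
{
	/* Init the stack fence, then issue the ranged invalidation on this GT */
	xe_gt_tlb_invalidation_fence_init(gt, fence, true);

	return xe_gt_tlb_invalidation_range(gt, fence, start, end, asid);
}

The caller would then only keep the error check and fence_id accounting for
the primary and media GT branches.
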
>>
>>> +    for_each_tile(tile, vm->xe, id) {
>>> +        if (tile_mask & BIT(id)) {
>>> +            int err;
>>> +
>>> +            xe_gt_tlb_invalidation_fence_init(tile->primary_gt,
>>> +                              &fence[fence_id], true);
>>> +
>>> +            err = xe_gt_tlb_invalidation_range(tile->primary_gt,
>>> +                               &fence[fence_id],
>>> +                               start,
>>> +                               end,
>>> +                               vm->usm.asid);
>>> +            if (WARN_ON_ONCE(err < 0))
>>> +                goto wait;
>>> +            ++fence_id;
>>> +
>>> +            if (!tile->media_gt)
>>> +                continue;
>>> +
>>> +            xe_gt_tlb_invalidation_fence_init(tile->media_gt,
>>> +                              &fence[fence_id], true);
>>> +
>>> +            err = xe_gt_tlb_invalidation_range(tile->media_gt,
>>> +                               &fence[fence_id],
>>> +                               start,
>>> +                               end,
>>> +                               vm->usm.asid);
>>> +            if (WARN_ON_ONCE(err < 0))
>>> +                goto wait;
>>> +            ++fence_id;
>>> +        }
>>> +    }
>>> +
>>> +wait:
>>> +    for (id = 0; id < fence_id; ++id)
>>> +        xe_gt_tlb_invalidation_fence_wait(&fence[id]);
>>> +}
>>> +
>>> +static int input_ranges_same(struct drm_xe_madvise_ops *old,
>>> +                 struct drm_xe_madvise_ops *new)
>>> +{
>>> +    return (new->start == old->start && new->range == old->range);
>>> +}
>>> +
>>> +int xe_vm_madvise_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>>
>> Kernel doc.
> 
> Sure
> 
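Something along these lines, paraphrasing the commit message:

/**
 * xe_vm_madvise_ioctl - Handle MADVISE ioctl for a VM
 * @dev: DRM device
 * @data: Pointer to struct drm_xe_madvise arguments
 * @file: DRM file
 *
 * Set memory attributes for GPU VMAs within the user-provided range(s).
 * If a range's start or end address falls within an existing VMA, the
 * VMA is split. Old mappings are invalidated and a TLB invalidation is
 * issued if necessary.
 *
 * Return: 0 on success, negative error code on failure.
 */
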
>>
>>> +{
>>> +    struct xe_device *xe = to_xe_device(dev);
>>> +    struct xe_file *xef = to_xe_file(file);
>>> +    struct drm_xe_madvise_ops *advs_ops;
>>> +    struct drm_xe_madvise *args = data;
>>> +    struct xe_vm *vm;
>>> +    struct xe_vma **vmas = NULL;
>>> +    int num_vmas, err = 0;
>>> +    int i, j, attr_type;
>>> +
>>> +    if (XE_IOCTL_DBG(xe, args->num_ops < 1))
>>> +        return -EINVAL;
>>> +
>>> +    vm = xe_vm_lookup(xef, args->vm_id);
>>> +    if (XE_IOCTL_DBG(xe, !vm))
>>> +        return -EINVAL;
>>> +
>>> +    if (XE_IOCTL_DBG(xe, !xe_vm_in_fault_mode(vm))) {
>>
>> Do we want to restrict this fault mode? Maybe check with Mesa if they
>> see any use cases.
> 
> Ok
> 
>>
>>> +        err = -EINVAL;
>>> +        goto put_vm;
>>> +    }
>>> +
>>> +    down_write(&vm->lock);
>>> +
>>> +    if (XE_IOCTL_DBG(xe, xe_vm_is_closed_or_banned(vm))) {
>>> +        err = -ENOENT;
>>> +        goto unlock_vm;
>>> +    }
>>> +
>>> +    if (args->num_ops > 1) {
>>> +        u64 __user *madvise_user = u64_to_user_ptr(args->vector_of_ops);
>>> +
>>> +        advs_ops = kvmalloc_array(args->num_ops, sizeof(struct drm_xe_madvise_ops),
>>> +                      GFP_KERNEL | __GFP_ACCOUNT |
>>> +                      __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
>>> +        if (!advs_ops)
>>> +            return args->num_ops > 1 ? -ENOBUFS : -ENOMEM;
>>> +
>>> +        err = __copy_from_user(advs_ops, madvise_user,
>>> +                       sizeof(struct drm_xe_madvise_ops) *
>>> +                       args->num_ops);
>>> +        if (XE_IOCTL_DBG(xe, err)) {
>>> +            err = -EFAULT;
>>> +            goto free_advs_ops;
>>> +        }
>>> +    } else {
>>> +        advs_ops = &args->ops;
>>> +    }
>>> +
>>> +    for (i = 0; i < args->num_ops; i++) {
>>> +        xe_vm_alloc_madvise_vma(vm, advs_ops[i].start, advs_ops[i].range);
>>> +
>>> +        vmas = get_vmas(vm, &num_vmas, advs_ops[i].start, advs_ops[i].range);
>>> +        if (!vmas) {
>>> +            err = -ENOMEM;
>>> +            goto unlock_vm;
>>> +        }
>>> +
>>> +        attr_type = array_index_nospec(advs_ops[i].type, ARRAY_SIZE(madvise_funcs));
>>> +        err = madvise_funcs[attr_type](xe, vm, vmas, num_vmas, advs_ops[i]);
>>> +
>>> +        kfree(vmas);
>>> +        vmas = NULL;
>>> +
>>> +        if (err)
>>> +            break;
>>> +    }
>>> +
>>> +    for (i = 0; i < args->num_ops; i++) {
>>> +        for (j = i + 1; j < args->num_ops; ++j) {
>>> +            if (input_ranges_same(&advs_ops[j], &advs_ops[i]))
>>> +                break;
>>> +        }
>>
>> The above loop doesn't look like it actually does anything.
> 
> My bad.
> 
> was intending to do:
> 
> if (input_ranges_same(&advs_ops[j], &advs_ops[i])) {
>         needs_invalidation = false;
>         break;
> }
> 
> if (needs_invalidation)
>         xe_vm_invalidate_madvise_range(vm, advs_ops[i].start,
>                                        advs_ops[i].start + advs_ops[i].range);
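Put together, the second loop would then look roughly like this
(needs_invalidation being a new per-iteration local):

	for (i = 0; i < args->num_ops; i++) {
		bool needs_invalidation = true;

		/* Only the last occurrence of a duplicated range triggers invalidation */
		for (j = i + 1; j < args->num_ops; ++j) {
			if (input_ranges_same(&advs_ops[j], &advs_ops[i])) {
				needs_invalidation = false;
				break;
			}
		}

		if (needs_invalidation)
			xe_vm_invalidate_madvise_range(vm, advs_ops[i].start,
						       advs_ops[i].start + advs_ops[i].range);
	}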
> 
>>
>> Matt
>>
>>> +        xe_vm_invalidate_madvise_range(vm, advs_ops[i].start,
>>> +                           advs_ops[i].start + advs_ops[i].range);
>>> +    }
>>> +free_advs_ops:
>>> +    if (args->num_ops > 1)
>>> +        kvfree(advs_ops);
>>> +unlock_vm:
>>> +    up_write(&vm->lock);
>>> +put_vm:
>>> +    xe_vm_put(vm);
>>> +    return err;
>>> +}
>>> diff --git a/drivers/gpu/drm/xe/xe_vm_madvise.h b/drivers/gpu/drm/xe/xe_vm_madvise.h
>>> new file mode 100644
>>> index 000000000000..c5cdd058c322
>>> --- /dev/null
>>> +++ b/drivers/gpu/drm/xe/xe_vm_madvise.h
>>> @@ -0,0 +1,15 @@
>>> +/* SPDX-License-Identifier: MIT */
>>> +/*
>>> + * Copyright © 2024 Intel Corporation
>>> + */
>>> +
>>> +#ifndef _XE_VM_MADVISE_H_
>>> +#define _XE_VM_MADVISE_H_
>>> +
>>> +struct drm_device;
>>> +struct drm_file;
>>> +
>>> +int xe_vm_madvise_ioctl(struct drm_device *dev, void *data,
>>> +            struct drm_file *file);
>>> +
>>> +#endif
>>> -- 
>>> 2.34.1
>>>
> 


