[PATCH v4 09/20] drm/xe: Implement madvise ioctl for xe
Matthew Brost
matthew.brost at intel.com
Mon Jun 23 05:33:52 UTC 2025
On Fri, Jun 13, 2025 at 06:25:47PM +0530, Himal Prasad Ghimiray wrote:
> This driver-specific ioctl enables UMDs to control the memory attributes
> for GPU VMAs within a specified input range. If the start or end
> addresses fall within an existing VMA, the VMA is split accordingly. The
> attributes of the VMA are modified as provided by the users. The old
> mappings of the VMAs are invalidated, and TLB invalidation is performed
> if necessary.
>
> v2 (Matthew Brost)
> - xe_vm_in_fault_mode can't be enabled by Mesa, hence allow the ioctl
> in non-fault mode too
> - fix tlb invalidation skip for same ranges in multiple op
> - use helper for tlb invalidation
> - use xe_svm_notifier_lock/unlock helper
> - s/lockdep_assert_held/lockdep_assert_held_write
> - Add kernel-doc
>
> v3(Matthew Brost)
> - make vfunc fail safe
> - Add sanitizing input args before vfunc
>
> Cc: Matthew Brost <matthew.brost at intel.com>
> Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
> ---
> drivers/gpu/drm/xe/Makefile | 1 +
> drivers/gpu/drm/xe/xe_device.c | 2 +
> drivers/gpu/drm/xe/xe_vm_madvise.c | 282 +++++++++++++++++++++++++++++
> drivers/gpu/drm/xe/xe_vm_madvise.h | 15 ++
> 4 files changed, 300 insertions(+)
> create mode 100644 drivers/gpu/drm/xe/xe_vm_madvise.c
> create mode 100644 drivers/gpu/drm/xe/xe_vm_madvise.h
>
> diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
> index f5f5775acdc0..d375b549c30f 100644
> --- a/drivers/gpu/drm/xe/Makefile
> +++ b/drivers/gpu/drm/xe/Makefile
> @@ -117,6 +117,7 @@ xe-y += xe_bb.o \
> xe_uc.o \
> xe_uc_fw.o \
> xe_vm.o \
> + xe_vm_madvise.o \
> xe_vram.o \
> xe_vram_freq.o \
> xe_vsec.o \
> diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
> index 7d9a31868ea9..632d3ab12392 100644
> --- a/drivers/gpu/drm/xe/xe_device.c
> +++ b/drivers/gpu/drm/xe/xe_device.c
> @@ -61,6 +61,7 @@
> #include "xe_ttm_stolen_mgr.h"
> #include "xe_ttm_sys_mgr.h"
> #include "xe_vm.h"
> +#include "xe_vm_madvise.h"
> #include "xe_vram.h"
> #include "xe_vsec.h"
> #include "xe_wait_user_fence.h"
> @@ -197,6 +198,7 @@ static const struct drm_ioctl_desc xe_ioctls[] = {
> DRM_IOCTL_DEF_DRV(XE_WAIT_USER_FENCE, xe_wait_user_fence_ioctl,
> DRM_RENDER_ALLOW),
> DRM_IOCTL_DEF_DRV(XE_OBSERVATION, xe_observation_ioctl, DRM_RENDER_ALLOW),
> + DRM_IOCTL_DEF_DRV(XE_MADVISE, xe_vm_madvise_ioctl, DRM_RENDER_ALLOW),
I'd make this change the last patch in the series once the
implementation is complete.
> };
>
> static long xe_drm_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
> diff --git a/drivers/gpu/drm/xe/xe_vm_madvise.c b/drivers/gpu/drm/xe/xe_vm_madvise.c
> new file mode 100644
> index 000000000000..ff560914ad7e
> --- /dev/null
> +++ b/drivers/gpu/drm/xe/xe_vm_madvise.c
> @@ -0,0 +1,282 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright © 2025 Intel Corporation
> + */
> +
> +#include "xe_vm_madvise.h"
> +
> +#include <linux/nospec.h>
> +#include <drm/ttm/ttm_tt.h>
Why do you need the ttm_tt include?
> +#include <drm/xe_drm.h>
> +
> +#include "xe_bo.h"
> +#include "xe_gt_tlb_invalidation.h"
I don't think you need the xe_gt_tlb_invalidation include.
> +#include "xe_pt.h"
> +#include "xe_svm.h"
> +
> +struct xe_vmas_in_madvise_range {
> + u64 addr;
> + u64 range;
> + struct xe_vma **vmas;
> + int num_vmas;
> + bool has_svm_vmas;
> + bool has_bo_vmas;
> + bool has_userptr_vmas;
> +};
> +
> +static int get_vmas(struct xe_vm *vm, struct xe_vmas_in_madvise_range *madvise_range)
> +{
> + u64 addr = madvise_range->addr;
> + u64 range = madvise_range->range;
> +
> + struct xe_vma **__vmas;
> + struct drm_gpuva *gpuva;
> + int max_vmas = 8;
> +
> + lockdep_assert_held(&vm->lock);
> +
> + madvise_range->num_vmas = 0;
> + madvise_range->vmas = kmalloc_array(max_vmas, sizeof(*madvise_range->vmas), GFP_KERNEL);
> + if (!madvise_range->vmas)
> + return -ENOMEM;
> +
> + vm_dbg(&vm->xe->drm, "VMAs in range: start=0x%016llx, end=0x%016llx", addr, addr + range);
> +
> + drm_gpuvm_for_each_va_range(gpuva, &vm->gpuvm, addr, addr + range) {
> + struct xe_vma *vma = gpuva_to_vma(gpuva);
> +
> + if (xe_vma_bo(vma))
> + madvise_range->has_bo_vmas = true;
> + else if (xe_vma_is_cpu_addr_mirror(vma))
> + madvise_range->has_svm_vmas = true;
> + else if (xe_vma_is_userptr(vma))
> + madvise_range->has_userptr_vmas = true;
> + else
> + XE_WARN_ON("UNEXPECTED VMA");
A NULL range would be odd to set madvise for, but I don't think it
warrants a WARN_ON; rather, just ignore it.
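Something like this (untested sketch, just dropping the WARN_ON branch):

	if (xe_vma_bo(vma))
		madvise_range->has_bo_vmas = true;
	else if (xe_vma_is_cpu_addr_mirror(vma))
		madvise_range->has_svm_vmas = true;
	else if (xe_vma_is_userptr(vma))
		madvise_range->has_userptr_vmas = true;
	else
		continue;	/* silently skip unexpected VMA types */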
> +
> + if (madvise_range->num_vmas == max_vmas) {
> + max_vmas <<= 1;
> + __vmas = krealloc(madvise_range->vmas,
> + max_vmas * sizeof(*madvise_range->vmas),
> + GFP_KERNEL);
> + if (!__vmas) {
> + kfree(madvise_range->vmas);
> + return -ENOMEM;
> + }
> + madvise_range->vmas = __vmas;
> + }
> +
> + madvise_range->vmas[madvise_range->num_vmas] = vma;
> + (madvise_range->num_vmas)++;
> + }
> +
> + if (!madvise_range->num_vmas)
> + kfree(madvise_range->vmas);
> +
> + vm_dbg(&vm->xe->drm, "madvise_range->num_vmas = %d\n", madvise_range->num_vmas);
> +
> + return 0;
> +}
> +
> +static void madvise_preferred_mem_loc(struct xe_device *xe, struct xe_vm *vm,
> + struct xe_vma **vmas, int num_vmas,
> + struct drm_xe_madvise *op)
> +{
> + /* Implementation pending */
> +}
> +
> +static void madvise_atomic(struct xe_device *xe, struct xe_vm *vm,
> + struct xe_vma **vmas, int num_vmas,
> + struct drm_xe_madvise *op)
> +{
> + /* Implementation pending */
> +}
> +
> +static void madvise_pat_index(struct xe_device *xe, struct xe_vm *vm,
> + struct xe_vma **vmas, int num_vmas,
> + struct drm_xe_madvise *op)
> +{
> + /* Implementation pending */
> +}
> +
> +typedef void (*madvise_func)(struct xe_device *xe, struct xe_vm *vm,
> + struct xe_vma **vmas, int num_vmas,
> + struct drm_xe_madvise *op);
> +
> +static const madvise_func madvise_funcs[] = {
> + [DRM_XE_VMA_ATTR_PREFERRED_LOC] = madvise_preferred_mem_loc,
> + [DRM_XE_VMA_ATTR_ATOMIC] = madvise_atomic,
> + [DRM_XE_VMA_ATTR_PAT] = madvise_pat_index,
> +};
> +
> +static void xe_zap_ptes_in_madvise_range(struct xe_vm *vm, u64 start, u64 end, u8 *tile_mask)
Return a tile_mask?
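i.e., something like this sketch (function body otherwise unchanged):

	static u8 xe_zap_ptes_in_madvise_range(struct xe_vm *vm, u64 start,
					       u64 end)
	{
		u8 tile_mask = xe_svm_ranges_zap_ptes_in_range(vm, start, end);

		/* ... walk the VMAs, OR BIT(id) into tile_mask as today ... */

		return tile_mask;
	}

with the caller becoming:

	tile_mask = xe_zap_ptes_in_madvise_range(vm, start, end);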
> +{
> + struct drm_gpuva *gpuva;
> + struct xe_tile *tile;
> + u8 id;
> +
> + lockdep_assert_held_write(&vm->lock);
> +
/* Wait for pending binds */
> + if (dma_resv_wait_timeout(xe_vm_resv(vm), DMA_RESV_USAGE_BOOKKEEP,
> + false, MAX_SCHEDULE_TIMEOUT) <= 0)
> + XE_WARN_ON(1);
> +
> + *tile_mask = xe_svm_ranges_zap_ptes_in_range(vm, start, end);
> +
> + drm_gpuvm_for_each_va_range(gpuva, &vm->gpuvm, start, end) {
> + struct xe_vma *vma = gpuva_to_vma(gpuva);
> +
> + if (xe_vma_is_cpu_addr_mirror(vma))
> + continue;
> +
> + if (xe_vma_is_userptr(vma)) {
> + WARN_ON_ONCE(!dma_resv_test_signaled(xe_vm_resv(xe_vma_vm(vma)),
> + DMA_RESV_USAGE_BOOKKEEP));
> + }
You don't need the braces here.
Also I think BO move could make this pop. So this might be incorrect in
xe_vm_invalidate_vma too. Maybe let's check with Thomas on this as he
added this to xe_vm_invalidate_vma originally.
> +
> + for_each_tile(tile, vm->xe, id) {
> + if (xe_pt_zap_ptes(tile, vma)) {
> + *tile_mask |= BIT(id);
> + vma->tile_invalidated |= BIT(id);
Use WRITE_ONCE here, with a comment noting the READ_ONCE it pairs with.
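i.e., roughly (sketch; point the comment at wherever the matching
READ_ONCE of tile_invalidated lives):

	/* Pairs with READ_ONCE of vma->tile_invalidated */
	WRITE_ONCE(vma->tile_invalidated,
		   vma->tile_invalidated | BIT(id));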
> + }
> + }
> + }
> +}
> +
> +static int xe_vm_invalidate_madvise_range(struct xe_vm *vm, u64 start, u64 end)
> +{
> + u8 tile_mask = 0;
> +
> + xe_zap_ptes_in_madvise_range(vm, start, end, &tile_mask);
> + if (!tile_mask)
> + return 0;
> +
> + xe_device_wmb(vm->xe);
> +
> + return xe_vm_range_tilemask_tlb_invalidation(vm, start, end, tile_mask);
> +}
> +
> +static int drm_xe_madvise_args_are_sane(struct xe_device *xe, const struct drm_xe_madvise *args)
> +{
> + if (XE_IOCTL_DBG(xe, !args))
> + return -EINVAL;
> +
> + if (XE_IOCTL_DBG(xe, !IS_ALIGNED(args->start, SZ_4K)))
> + return -EINVAL;
> +
> + if (XE_IOCTL_DBG(xe, !IS_ALIGNED(args->range, SZ_4K)))
> + return -EINVAL;
> +
> + if (XE_IOCTL_DBG(xe, args->range < SZ_4K))
> + return -EINVAL;
> +
> + switch (args->type) {
> + case DRM_XE_VMA_ATTR_ATOMIC:
> + if (XE_IOCTL_DBG(xe, args->atomic.val > DRM_XE_VMA_ATOMIC_CPU))
> + return -EINVAL;
Ensure atomic.reserved is zero.
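e.g. (assuming the field is a single atomic.reserved member in the
uAPI):

	if (XE_IOCTL_DBG(xe, args->atomic.reserved))
		return -EINVAL;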
> + break;
> + case DRM_XE_VMA_ATTR_PAT:
> + /*TODO: Add valid pat check */
> + break;
> + case DRM_XE_VMA_ATTR_PREFERRED_LOC:
> + if (XE_IOCTL_DBG(xe, args->preferred_mem_loc.migration_policy >
> + DRM_XE_MIGRATE_ONLY_SYSTEM_PAGES))
> + return -EINVAL;
> + break;
> + default:
> + if (XE_IOCTL_DBG(xe, 1))
> + return -EINVAL;
> + }
> +
> + if (XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
> + return -EINVAL;
> +
> + return 0;
> +}
> +
> +/**
> + * xe_vm_madvise_ioctl - Handle MADVISE ioctl for a VM
> + * @dev: DRM device pointer
> + * @data: Pointer to ioctl data (drm_xe_madvise*)
> + * @file: DRM file pointer
> + *
> + * Handles the MADVISE ioctl to provide memory advice for VMAs within
> + * the input range.
> + *
> + * Return: 0 on success or a negative error code on failure.
> + */
> +int xe_vm_madvise_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> +{
> + struct xe_device *xe = to_xe_device(dev);
> + struct xe_file *xef = to_xe_file(file);
> + struct drm_xe_madvise *args = data;
> + struct xe_vm *vm;
> + struct xe_bo *bo;
> + struct drm_exec exec;
> + int err = 0;
> + int attr_type;
> +
> + vm = xe_vm_lookup(xef, args->vm_id);
> + if (XE_IOCTL_DBG(xe, !vm))
> + return -EINVAL;
> +
> + if (drm_xe_madvise_args_are_sane(vm->xe, args))
> + return -EINVAL;
> +
> + down_write(&vm->lock);
down_write_killable + error check.
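i.e. (sketch; needs a label that only drops the VM ref):

	err = down_write_killable(&vm->lock);
	if (err)
		goto put_vm;
	...
put_vm:
	xe_vm_put(vm);
	return err;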
> +
> + if (XE_IOCTL_DBG(xe, xe_vm_is_closed_or_banned(vm))) {
> + err = -ENOENT;
> + goto unlock_vm;
> + }
> +
> + xe_vm_alloc_madvise_vma(vm, args->start, args->range);
This function has a return value; any reason you are not checking it
and erroring out on failure?
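i.e.:

	err = xe_vm_alloc_madvise_vma(vm, args->start, args->range);
	if (err)
		goto unlock_vm;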
> +
> + struct xe_vmas_in_madvise_range madvise_range = {.addr = args->start,
> + .range = args->range, };
> + err = get_vmas(vm, &madvise_range);
> + if (err || !madvise_range.num_vmas)
> + goto unlock_vm;
> +
> + if (madvise_range.has_bo_vmas) {
> + drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0);
This should also include DRM_EXEC_INTERRUPTIBLE_WAIT.
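i.e. (the same init call with the extra flag):

	drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES |
		      DRM_EXEC_INTERRUPTIBLE_WAIT, 0);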
> + drm_exec_until_all_locked(&exec) {
> + for (int i = 0; i < madvise_range.num_vmas; i++) {
> + bo = xe_vma_bo(madvise_range.vmas[i]);
Since the BO is only used locally, declare it here: struct xe_bo *bo =...
Then delete the struct xe_bo declaration at the top of the function.
> + if (!bo)
> + continue;
> + err = drm_exec_lock_obj(&exec, &bo->ttm.base);
> + drm_exec_retry_on_contention(&exec);
> + if (err)
> + goto err_fini;
> + }
> + }
> + }
> +
> + if (madvise_range.has_userptr_vmas)
> + down_read(&vm->userptr.notifier_lock);
> +
> + if (madvise_range.has_svm_vmas)
> + xe_svm_notifier_lock(vm);
These likely should be interruptible locks too...
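For the userptr side, something like this sketch (I'm not sure an
interruptible variant of the SVM notifier lock helper exists, so that
one may need a new helper):

	if (madvise_range.has_userptr_vmas) {
		err = down_read_killable(&vm->userptr.notifier_lock);
		if (err)
			goto err_fini;
	}

(and note a goto err_fini from here would also need to free
madvise_range.vmas, see below)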
> +
> + attr_type = array_index_nospec(args->type, ARRAY_SIZE(madvise_funcs));
> + madvise_funcs[attr_type](xe, vm, madvise_range.vmas, madvise_range.num_vmas, args);
> +
> + kfree(madvise_range.vmas);
> + madvise_range.vmas = NULL;
> +
> + err = xe_vm_invalidate_madvise_range(vm, args->start, args->start + args->range);
> +
> + if (madvise_range.has_svm_vmas)
> + xe_svm_notifier_unlock(vm);
> +
> + if (madvise_range.has_userptr_vmas)
> + up_read(&vm->userptr.notifier_lock);
> +err_fini:
The error handling is wrong here. You'd leak madvise_range.vmas on this
error path.
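e.g., since vmas is set to NULL after the kfree on the success path, an
unconditional kfree at err_fini would be safe (sketch):

err_fini:
	kfree(madvise_range.vmas);	/* NULL on the success path */
	if (madvise_range.has_bo_vmas)
		drm_exec_fini(&exec);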
Matt
> + if (madvise_range.has_bo_vmas)
> + drm_exec_fini(&exec);
> +unlock_vm:
> + up_write(&vm->lock);
> + xe_vm_put(vm);
> + return err;
> +}
> diff --git a/drivers/gpu/drm/xe/xe_vm_madvise.h b/drivers/gpu/drm/xe/xe_vm_madvise.h
> new file mode 100644
> index 000000000000..b0e1fc445f23
> --- /dev/null
> +++ b/drivers/gpu/drm/xe/xe_vm_madvise.h
> @@ -0,0 +1,15 @@
> +/* SPDX-License-Identifier: MIT */
> +/*
> + * Copyright © 2025 Intel Corporation
> + */
> +
> +#ifndef _XE_VM_MADVISE_H_
> +#define _XE_VM_MADVISE_H_
> +
> +struct drm_device;
> +struct drm_file;
> +
> +int xe_vm_madvise_ioctl(struct drm_device *dev, void *data,
> + struct drm_file *file);
> +
> +#endif
> --
> 2.34.1
>