[PATCH v2 23/32] drm/xe: Implement madvise ioctl for xe

Tue May 20 10:15:02 UTC 2025

On 15-05-2025 03:11, Matthew Brost wrote:
> On Mon, Apr 07, 2025 at 03:47:10PM +0530, Himal Prasad Ghimiray wrote:
>> This driver-specific ioctl enables UMDs to control the memory attributes
>> for GPU VMAs within a specified input range. If the start or end
>> addresses fall within an existing VMA, the VMA is split accordingly. The
>> attributes of the VMA are modified as provided by the users. The old
>> mappings of the VMAs are invalidated, and TLB invalidation is performed
>> if necessary.
>>
>> Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
>> ---
>>   drivers/gpu/drm/xe/Makefile        |   1 +
>>   drivers/gpu/drm/xe/xe_device.c     |   2 +
>>   drivers/gpu/drm/xe/xe_vm_madvise.c | 309 +++++++++++++++++++++++++++++
>>   drivers/gpu/drm/xe/xe_vm_madvise.h |  15 ++
>>   4 files changed, 327 insertions(+)
>>   create mode 100644 drivers/gpu/drm/xe/xe_vm_madvise.c
>>   create mode 100644 drivers/gpu/drm/xe/xe_vm_madvise.h
>>
>> diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
>> index e4fec90bab55..3e83ae8b9dc1 100644
>> --- a/drivers/gpu/drm/xe/Makefile
>> +++ b/drivers/gpu/drm/xe/Makefile
>> @@ -117,6 +117,7 @@ xe-y += xe_bb.o \
>>   	xe_uc.o \
>>   	xe_uc_fw.o \
>>   	xe_vm.o \
>> +	xe_vm_madvise.o \
>>   	xe_vram.o \
>>   	xe_vram_freq.o \
>>   	xe_vsec.o \
>> diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
>> index d8e227ddf255..3e57300014bf 100644
>> --- a/drivers/gpu/drm/xe/xe_device.c
>> +++ b/drivers/gpu/drm/xe/xe_device.c
>> @@ -60,6 +60,7 @@
>>   #include "xe_ttm_stolen_mgr.h"
>>   #include "xe_ttm_sys_mgr.h"
>>   #include "xe_vm.h"
>> +#include "xe_vm_madvise.h"
>>   #include "xe_vram.h"
>>   #include "xe_vsec.h"
>>   #include "xe_wait_user_fence.h"
>> @@ -196,6 +197,7 @@ static const struct drm_ioctl_desc xe_ioctls[] = {
>>   	DRM_IOCTL_DEF_DRV(XE_WAIT_USER_FENCE, xe_wait_user_fence_ioctl,
>>   			  DRM_RENDER_ALLOW),
>>   	DRM_IOCTL_DEF_DRV(XE_OBSERVATION, xe_observation_ioctl, DRM_RENDER_ALLOW),
>> +	DRM_IOCTL_DEF_DRV(XE_MADVISE, xe_vm_madvise_ioctl, DRM_RENDER_ALLOW),
>>   };
>>   
>>   static long xe_drm_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
>> diff --git a/drivers/gpu/drm/xe/xe_vm_madvise.c b/drivers/gpu/drm/xe/xe_vm_madvise.c
>> new file mode 100644
>> index 000000000000..ef50031649e0
>> --- /dev/null
>> +++ b/drivers/gpu/drm/xe/xe_vm_madvise.c
>> @@ -0,0 +1,309 @@
>> +// SPDX-License-Identifier: MIT
>> +/*
>> + * Copyright © 2024 Intel Corporation
>> + */
>> +
>> +#include "xe_vm_madvise.h"
>> +
>> +#include <linux/nospec.h>
>> +#include <drm/ttm/ttm_tt.h>
>> +#include <drm/xe_drm.h>
>> +
>> +#include "xe_bo.h"
>> +#include "xe_gt_tlb_invalidation.h"
>> +#include "xe_pt.h"
>> +#include "xe_svm.h"
>> +
>> +static struct xe_vma **get_vmas(struct xe_vm *vm, int *num_vmas,
>> +				u64 addr, u64 range)
>> +{
>> +	struct xe_vma **vmas, **__vmas;
>> +	struct drm_gpuva *gpuva;
>> +	int max_vmas = 8;
>> +
>> +	lockdep_assert_held(&vm->lock);
> 
> lockdep_assert_held_write

ok

> 
>> +
>> +	*num_vmas = 0;
>> +	vmas = kmalloc_array(max_vmas, sizeof(*vmas), GFP_KERNEL);
>> +	if (!vmas)
>> +		return NULL;
>> +
>> +	vm_dbg(&vm->xe->drm, "VMA's in range: start=0x%016llx, end=0x%016llx", addr, addr + range);
>> +
>> +	drm_gpuvm_for_each_va_range(gpuva, &vm->gpuvm, addr, addr + range) {
>> +		struct xe_vma *vma = gpuva_to_vma(gpuva);
>> +
>> +		if (*num_vmas == max_vmas) {
>> +			max_vmas <<= 1;
>> +			__vmas = krealloc(vmas, max_vmas * sizeof(*vmas), GFP_KERNEL);
>> +			if (!__vmas) {
>> +				kfree(vmas);
>> +				return NULL;
>> +			}
>> +			vmas = __vmas;
>> +		}
>> +
>> +		vmas[*num_vmas] = vma;
>> +		(*num_vmas)++;
>> +	}
>> +
>> +	vm_dbg(&vm->xe->drm, "*num_vmas = %d\n", *num_vmas);
>> +
>> +	if (!*num_vmas) {
>> +		kfree(vmas);
>> +		return NULL;
>> +	}
>> +
>> +	return vmas;
>> +}
>> +
>> +static int madvise_preferred_mem_loc(struct xe_device *xe, struct xe_vm *vm,
>> +				     struct xe_vma **vmas, int num_vmas,
>> +				     struct drm_xe_madvise_ops ops)
>> +{
>> +	/* Implementation pending */
>> +	return 0;
>> +}
>> +
>> +static int madvise_atomic(struct xe_device *xe, struct xe_vm *vm,
>> +			  struct xe_vma **vmas, int num_vmas,
>> +			  struct drm_xe_madvise_ops ops)
>> +{
>> +	/* Implementation pending */
>> +	return 0;
>> +}
>> +
>> +static int madvise_pat_index(struct xe_device *xe, struct xe_vm *vm,
>> +			     struct xe_vma **vmas, int num_vmas,
>> +			     struct drm_xe_madvise_ops ops)
>> +{
>> +	/* Implementation pending */
>> +	return 0;
>> +}
>> +
>> +static int madvise_purgeable_state(struct xe_device *xe, struct xe_vm *vm,
>> +				   struct xe_vma **vmas, int num_vmas,
>> +				   struct drm_xe_madvise_ops ops)
>> +{
>> +	/* Implementation pending */
>> +	return 0;
>> +}
>> +
>> +typedef int (*madvise_func)(struct xe_device *xe, struct xe_vm *vm,
>> +			    struct xe_vma **vmas, int num_vmas, struct drm_xe_madvise_ops ops);
>> +
>> +static const madvise_func madvise_funcs[] = {
>> +	[DRM_XE_VMA_ATTR_PREFERRED_LOC] = madvise_preferred_mem_loc,
>> +	[DRM_XE_VMA_ATTR_ATOMIC] = madvise_atomic,
>> +	[DRM_XE_VMA_ATTR_PAT] = madvise_pat_index,
>> +	[DRM_XE_VMA_ATTR_PURGEABLE_STATE] = madvise_purgeable_state,
>> +};
>> +
>> +static void xe_zap_ptes_in_madvise_range(struct xe_vm *vm, u64 start, u64 end, u8 *tile_mask)
>> +{
>> +	struct drm_gpusvm_notifier *notifier;
>> +	struct drm_gpuva *gpuva;
>> +	struct xe_svm_range *range;
>> +	struct xe_tile *tile;
>> +	u64 adj_start, adj_end;
>> +	u8 id;
>> +
>> +	lockdep_assert_held(&vm->lock);
> 
> lockdep_assert_held_write
> 
>> +

ok

> 
> 	/* Waiting on pending binds */
> 
>> +	if (dma_resv_wait_timeout(xe_vm_resv(vm), DMA_RESV_USAGE_BOOKKEEP,
>> +				  false, MAX_SCHEDULE_TIMEOUT) <= 0)
>> +		XE_WARN_ON(1);
>> +
>> +	down_write(&vm->svm.gpusvm.notifier_lock);
>> +
> 
> xe_svm_notifier_lock

ok

> 
>> +	drm_gpusvm_for_each_notifier(notifier, &vm->svm.gpusvm, start, end) {
>> +		struct drm_gpusvm_range *r = NULL;
>> +
>> +		adj_start = max(start, notifier->itree.start);
>> +		adj_end = min(end, notifier->itree.last + 1);
>> +		drm_gpusvm_for_each_range(r, notifier, adj_start, adj_end) {
>> +			range = to_xe_range(r);
>> +			for_each_tile(tile, vm->xe, id) {
>> +				if (xe_pt_zap_ptes_range(tile, vm, range)) {
>> +					*tile_mask |= BIT(id);
>> +					range->tile_invalidated |= BIT(id);
>> +				}
>> +			}
>> +		}
>> +	}
>> +
>> +	up_write(&vm->svm.gpusvm.notifier_lock);
>> +
> 
> xe_svm_notifier_unlock
> 

Hmm

>> +	drm_gpuvm_for_each_va_range(gpuva, &vm->gpuvm, start, end) {
>> +		struct xe_vma *vma = gpuva_to_vma(gpuva);
>> +
>> +		if (xe_vma_is_cpu_addr_mirror(vma))
>> +			continue;
>> +
>> +		if (xe_vma_is_userptr(vma)) {
>> +			WARN_ON_ONCE(!mmu_interval_check_retry
>> +				    (&to_userptr_vma(vma)->userptr.notifier,
>> +				     to_userptr_vma(vma)->userptr.notifier_seq));
>> +
>> +			WARN_ON_ONCE(!dma_resv_test_signaled(xe_vm_resv(xe_vma_vm(vma)),
>> +							     DMA_RESV_USAGE_BOOKKEEP));
>> +		}
>> +
>> +		if (xe_vma_bo(vma))
>> +			xe_bo_lock(xe_vma_bo(vma), false);
>> +
> 
> Do you need the BO's dma-resv lock here? I don't think you do. Maybe double
> check with Thomas on this one as I could be forgetting something here.

Sure

> 
>> +		for_each_tile(tile, vm->xe, id) {
>> +			if (xe_pt_zap_ptes(tile, vma))
>> +				*tile_mask |= BIT(id);
>> +		}
>> +
>> +		if (xe_vma_bo(vma))
>> +			xe_bo_unlock(xe_vma_bo(vma));
>> +	}
>> +}
>> +
>> +static void xe_vm_invalidate_madvise_range(struct xe_vm *vm, u64 start, u64 end)
>> +{
>> +	struct xe_gt_tlb_invalidation_fence
>> +		fence[XE_MAX_TILES_PER_DEVICE * XE_MAX_GT_PER_TILE];
>> +	struct xe_tile *tile;
>> +	u32 fence_id = 0;
>> +	u8 tile_mask = 0;
>> +	u8 id;
>> +
>> +	xe_zap_ptes_in_madvise_range(vm, start, end, &tile_mask);
>> +	if (!tile_mask)
>> +		return;
>> +
>> +	xe_device_wmb(vm->xe);
>> +
> 
> We have the below pattern in a few places in the driver. I wonder if it
> is time for a helper?

Makes sense

> 
>> +	for_each_tile(tile, vm->xe, id) {
>> +		if (tile_mask & BIT(id)) {
>> +			int err;
>> +
>> +			xe_gt_tlb_invalidation_fence_init(tile->primary_gt,
>> +							  &fence[fence_id], true);
>> +
>> +			err = xe_gt_tlb_invalidation_range(tile->primary_gt,
>> +							   &fence[fence_id],
>> +							   start,
>> +							   end,
>> +							   vm->usm.asid);
>> +			if (WARN_ON_ONCE(err < 0))
>> +				goto wait;
>> +			++fence_id;
>> +
>> +			if (!tile->media_gt)
>> +				continue;
>> +
>> +			xe_gt_tlb_invalidation_fence_init(tile->media_gt,
>> +							  &fence[fence_id], true);
>> +
>> +			err = xe_gt_tlb_invalidation_range(tile->media_gt,
>> +							   &fence[fence_id],
>> +							   start,
>> +							   end,
>> +							   vm->usm.asid);
>> +			if (WARN_ON_ONCE(err < 0))
>> +				goto wait;
>> +			++fence_id;
>> +		}
>> +	}
>> +
>> +wait:
>> +	for (id = 0; id < fence_id; ++id)
>> +		xe_gt_tlb_invalidation_fence_wait(&fence[id]);
>> +}
>> +
>> +static int input_ranges_same(struct drm_xe_madvise_ops *old,
>> +			     struct drm_xe_madvise_ops *new)
>> +{
>> +	return (new->start == old->start && new->range == old->range);
>> +}
>> +
>> +int xe_vm_madvise_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> 
> Kernel doc.

Sure

> 
>> +{
>> +	struct xe_device *xe = to_xe_device(dev);
>> +	struct xe_file *xef = to_xe_file(file);
>> +	struct drm_xe_madvise_ops *advs_ops;
>> +	struct drm_xe_madvise *args = data;
>> +	struct xe_vm *vm;
>> +	struct xe_vma **vmas = NULL;
>> +	int num_vmas, err = 0;
>> +	int i, j, attr_type;
>> +
>> +	if (XE_IOCTL_DBG(xe, args->num_ops < 1))
>> +		return -EINVAL;
>> +
>> +	vm = xe_vm_lookup(xef, args->vm_id);
>> +	if (XE_IOCTL_DBG(xe, !vm))
>> +		return -EINVAL;
>> +
>> +	if (XE_IOCTL_DBG(xe, !xe_vm_in_fault_mode(vm))) {
> 
> Do we want to restrict this to fault mode? Maybe check with Mesa if they
> see any use cases.

Ok

> 
>> +		err = -EINVAL;
>> +		goto put_vm;
>> +	}
>> +
>> +	down_write(&vm->lock);
>> +
>> +	if (XE_IOCTL_DBG(xe, xe_vm_is_closed_or_banned(vm))) {
>> +		err = -ENOENT;
>> +		goto unlock_vm;
>> +	}
>> +
>> +	if (args->num_ops > 1) {
>> +		u64 __user *madvise_user = u64_to_user_ptr(args->vector_of_ops);
>> +
>> +		advs_ops = kvmalloc_array(args->num_ops, sizeof(struct drm_xe_madvise_ops),
>> +					  GFP_KERNEL | __GFP_ACCOUNT |
>> +					  __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
>> +		if (!advs_ops)
>> +			return args->num_ops > 1 ? -ENOBUFS : -ENOMEM;
>> +
>> +		err = __copy_from_user(advs_ops, madvise_user,
>> +				       sizeof(struct drm_xe_madvise_ops) *
>> +				       args->num_ops);
>> +		if (XE_IOCTL_DBG(xe, err)) {
>> +			err = -EFAULT;
>> +			goto free_advs_ops;
>> +		}
>> +	} else {
>> +		advs_ops = &args->ops;
>> +	}
>> +
>> +	for (i = 0; i < args->num_ops; i++) {
>> +		xe_vm_alloc_madvise_vma(vm, advs_ops[i].start, advs_ops[i].range);
>> +
>> +		vmas = get_vmas(vm, &num_vmas, advs_ops[i].start, advs_ops[i].range);
>> +		if (!vmas) {
>> +			err = -ENOMEM;
>> +			goto unlock_vm;
>> +		}
>> +
>> +		attr_type = array_index_nospec(advs_ops[i].type, ARRAY_SIZE(madvise_funcs));
>> +		err = madvise_funcs[attr_type](xe, vm, vmas, num_vmas, advs_ops[i]);
>> +
>> +		kfree(vmas);
>> +		vmas = NULL;
>> +
>> +		if (err)
>> +			break;
>> +	}
>> +
>> +	for (i = 0; i < args->num_ops; i++) {
>> +		for (j = i + 1; j < args->num_ops; ++j) {
>> +			if (input_ranges_same(&advs_ops[j], &advs_ops[i]))
>> +				break;
>> +		}
> 
> The above loop doesn't look like it actually does anything.

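My bad.

I was intending to do the following, so that the invalidation is skipped
for an op when a later op has the same start and range: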

if (input_ranges_same(&advs_ops[j], &advs_ops[i])) {
                        needs_invalidation = false;
 >> +				break;
}
              if(needs_invalidation)
                  xe_vm_invalidate_madvise_range(vm, advs_ops[i].start,
				       advs_ops[i].start + advs_ops[i].range);
 >> +	}

> 
> Matt
> 
>> +		xe_vm_invalidate_madvise_range(vm, advs_ops[i].start,
>> +					       advs_ops[i].start + advs_ops[i].range);
>> +	}
>> +free_advs_ops:
>> +	if (args->num_ops > 1)
>> +		kvfree(advs_ops);
>> +unlock_vm:
>> +	up_write(&vm->lock);
>> +put_vm:
>> +	xe_vm_put(vm);
>> +	return err;
>> +}
>> diff --git a/drivers/gpu/drm/xe/xe_vm_madvise.h b/drivers/gpu/drm/xe/xe_vm_madvise.h
>> new file mode 100644
>> index 000000000000..c5cdd058c322
>> --- /dev/null
>> +++ b/drivers/gpu/drm/xe/xe_vm_madvise.h
>> @@ -0,0 +1,15 @@
>> +/* SPDX-License-Identifier: MIT */
>> +/*
>> + * Copyright © 2024 Intel Corporation
>> + */
>> +
>> +#ifndef _XE_VM_MADVISE_H_
>> +#define _XE_VM_MADVISE_H_
>> +
>> +struct drm_device;
>> +struct drm_file;
>> +
>> +int xe_vm_madvise_ioctl(struct drm_device *dev, void *data,
>> +			struct drm_file *file);
>> +
>> +#endif
>> -- 
>> 2.34.1
>>