[PATCH v2 14/32] drm/xe/svm: Implement prefetch support for SVM ranges
Matthew Brost
matthew.brost at intel.com
Thu Apr 17 04:54:57 UTC 2025
On Mon, Apr 07, 2025 at 03:47:01PM +0530, Himal Prasad Ghimiray wrote:
> This commit adds prefetch support for SVM ranges, utilizing the
> existing ioctl vm_bind functionality to achieve this.
>
> v2: rebase
>
> Cc: Matthew Brost <matthew.brost at intel.com>
> Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
> ---
> drivers/gpu/drm/xe/xe_pt.c | 61 +++++++++---
> drivers/gpu/drm/xe/xe_vm.c | 185 +++++++++++++++++++++++++++++++++++--
> 2 files changed, 222 insertions(+), 24 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
> index de4e3edda758..59dc065fae93 100644
> --- a/drivers/gpu/drm/xe/xe_pt.c
> +++ b/drivers/gpu/drm/xe/xe_pt.c
> @@ -1458,7 +1458,8 @@ static int xe_pt_svm_pre_commit(struct xe_migrate_pt_update *pt_update)
> struct xe_vm *vm = pt_update->vops->vm;
> struct xe_vma_ops *vops = pt_update->vops;
> struct xe_vma_op *op;
> - int err;
> + int ranges_count;
> + int err, i;
>
> err = xe_pt_pre_commit(pt_update);
> if (err)
> @@ -1467,20 +1468,33 @@ static int xe_pt_svm_pre_commit(struct xe_migrate_pt_update *pt_update)
> xe_svm_notifier_lock(vm);
>
> list_for_each_entry(op, &vops->list, link) {
> - struct xe_svm_range *range = op->map_range.range;
> + struct xe_svm_range *range = NULL;
>
> if (op->subop == XE_VMA_SUBOP_UNMAP_RANGE)
> continue;
>
> - xe_svm_range_debug(range, "PRE-COMMIT");
> -
> - xe_assert(vm->xe, xe_vma_is_cpu_addr_mirror(op->map_range.vma));
> - xe_assert(vm->xe, op->subop == XE_VMA_SUBOP_MAP_RANGE);
> + if (op->base.op == DRM_GPUVA_OP_PREFETCH) {
> + xe_assert(vm->xe,
> + xe_vma_is_cpu_addr_mirror(gpuva_to_vma(op->base.prefetch.va)));
> + ranges_count = op->prefetch_range.ranges_count;
> + } else {
> + xe_assert(vm->xe, xe_vma_is_cpu_addr_mirror(op->map_range.vma));
> + xe_assert(vm->xe, op->subop == XE_VMA_SUBOP_MAP_RANGE);
> + ranges_count = 1;
> + }
>
> - if (!xe_svm_range_pages_valid(range)) {
> - xe_svm_range_debug(range, "PRE-COMMIT - RETRY");
> - xe_svm_notifier_unlock(vm);
> - return -EAGAIN;
> + for (i = 0; i < ranges_count; i++) {
Use xa_for_each here as it doesn't make any assumptions about the key (e.g. the value of i).
> + if (op->base.op == DRM_GPUVA_OP_PREFETCH)
> + range = xa_load(&op->prefetch_range.range, i);
I'd move this logic above... So I'd write it like this...
if (op->base.op == DRM_GPUVA_OP_PREFETCH) {
	assert
	xa_for_each
		do_pages_check()
} else {
	assert
	do_pages_check();
}
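i.e. something roughly like this (untested, just to show the shape; whether prefetch should bail with -EAGAIN or -ENODATA is discussed further down):

	list_for_each_entry(op, &vops->list, link) {
		struct xe_svm_range *range;
		unsigned long j;

		if (op->subop == XE_VMA_SUBOP_UNMAP_RANGE)
			continue;

		if (op->base.op == DRM_GPUVA_OP_PREFETCH) {
			xe_assert(vm->xe,
				  xe_vma_is_cpu_addr_mirror(gpuva_to_vma(op->base.prefetch.va)));
			xa_for_each(&op->prefetch_range.range, j, range) {
				xe_svm_range_debug(range, "PRE-COMMIT PREFETCH");
				if (!xe_svm_range_pages_valid(range)) {
					xe_svm_notifier_unlock(vm);
					return -ENODATA;
				}
			}
		} else {
			xe_assert(vm->xe, xe_vma_is_cpu_addr_mirror(op->map_range.vma));
			xe_assert(vm->xe, op->subop == XE_VMA_SUBOP_MAP_RANGE);
			range = op->map_range.range;
			xe_svm_range_debug(range, "PRE-COMMIT");

			if (!xe_svm_range_pages_valid(range)) {
				xe_svm_range_debug(range, "PRE-COMMIT - RETRY");
				xe_svm_notifier_unlock(vm);
				return -EAGAIN;
			}
		}
	}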
> + else
> + range = op->map_range.range;
> + xe_svm_range_debug(range, "PRE-COMMIT");
> +
> + if (!xe_svm_range_pages_valid(range)) {
> + xe_svm_range_debug(range, "PRE-COMMIT - RETRY");
> + xe_svm_notifier_unlock(vm);
> + return -EAGAIN;
So in the prefetch case this is a bit inconsistent: below, when things
race, you return -ENODATA, which is converted to 0 in the IOCTL.
-EAGAIN here could result in a livelock under the right conditions, as
-EAGAIN means 'must retry'. I think maybe just return -ENODATA if a
prefetch fails... If there are any other binds in the IOCTL's array
they will just fault, I guess. Maybe not a concern as only VK uses an
array of binds at the moment.
> + }
> }
> }
>
> @@ -2065,11 +2079,21 @@ static int op_prepare(struct xe_vm *vm,
> {
> struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
>
> - if (xe_vma_is_cpu_addr_mirror(vma))
> - break;
> + if (xe_vma_is_cpu_addr_mirror(vma)) {
> + struct xe_svm_range *range;
> + int i;
>
> - err = bind_op_prepare(vm, tile, pt_update_ops, vma, false);
> - pt_update_ops->wait_vm_kernel = true;
> + for (i = 0; i < op->prefetch_range.ranges_count; i++) {
> + range = xa_load(&op->prefetch_range.range, i);
Again, xa_for_each...
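i.e. something like (untested):

	if (xe_vma_is_cpu_addr_mirror(vma)) {
		struct xe_svm_range *range;
		unsigned long j;

		xa_for_each(&op->prefetch_range.range, j, range) {
			err = bind_range_prepare(vm, tile, pt_update_ops,
						 vma, range);
			if (err)
				return err;
		}
	}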
> + err = bind_range_prepare(vm, tile, pt_update_ops,
> + vma, range);
> + if (err)
> + return err;
> + }
> + } else {
> + err = bind_op_prepare(vm, tile, pt_update_ops, vma, false);
> + pt_update_ops->wait_vm_kernel = true;
> + }
> break;
> }
> case DRM_GPUVA_OP_DRIVER:
> @@ -2273,9 +2297,16 @@ static void op_commit(struct xe_vm *vm,
> {
> struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
>
> - if (!xe_vma_is_cpu_addr_mirror(vma))
> + if (xe_vma_is_cpu_addr_mirror(vma)) {
> + for (int i = 0 ; i < op->prefetch_range.ranges_count; i++) {
Again, xa_for_each...
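e.g. (untested):

	struct xe_svm_range *range;
	unsigned long j;

	xa_for_each(&op->prefetch_range.range, j, range)
		range_present_and_invalidated_tile(vm, range, tile->id);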
> + struct xe_svm_range *range = xa_load(&op->prefetch_range.range, i);
> +
> + range_present_and_invalidated_tile(vm, range, tile->id);
> + }
> + } else {
> bind_op_commit(vm, tile, pt_update_ops, vma, fence,
> fence2, false);
> + }
> break;
> }
> case DRM_GPUVA_OP_DRIVER:
> diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> index 57af2c37f927..ffd7ad664921 100644
> --- a/drivers/gpu/drm/xe/xe_vm.c
> +++ b/drivers/gpu/drm/xe/xe_vm.c
> @@ -798,10 +798,36 @@ static int xe_vma_ops_alloc(struct xe_vma_ops *vops, bool array_of_binds)
> }
> ALLOW_ERROR_INJECTION(xe_vma_ops_alloc, ERRNO);
>
> +static void clean_svm_prefetch_op(struct xe_vma_op *op)
> +{
Can we rename this to follow the fini convention, matching xe_vma_ops_fini?
> + struct xe_vma *vma;
> +
> + vma = gpuva_to_vma(op->base.prefetch.va);
> +
> + if (op->base.op == DRM_GPUVA_OP_PREFETCH && xe_vma_is_cpu_addr_mirror(vma)) {
> + xa_destroy(&op->prefetch_range.range);
> + op->prefetch_range.ranges_count = 0;
Do you need to set 'op->prefetch_range.ranges_count' to zero here?
> + }
> +}
> +
> +static void clean_svm_prefetch_in_vma_ops(struct xe_vma_ops *vops)
> +{
Same here, fini convention?
> + struct xe_vma_op *op;
> +
> + if (!(vops->flags & XE_VMA_OPS_HAS_SVM_PREFETCH))
> + return;
> +
> + list_for_each_entry(op, &vops->list, link) {
> + clean_svm_prefetch_op(op);
> + }
Braces not needed.
> +}
> +
> static void xe_vma_ops_fini(struct xe_vma_ops *vops)
> {
> int i;
>
> + clean_svm_prefetch_in_vma_ops(vops);
> +
> for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i)
> kfree(vops->pt_update_ops[i].ops);
> }
> @@ -2248,13 +2274,25 @@ static bool __xe_vm_needs_clear_scratch_pages(struct xe_vm *vm, u32 bind_flags)
> return true;
> }
>
> +static void clean_svm_prefetch_in_gpuva_ops(struct drm_gpuva_ops *ops)
> +{
Same here, fini convention?
> + struct drm_gpuva_op *__op;
> +
> + drm_gpuva_for_each_op(__op, ops) {
> + struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
> +
> + clean_svm_prefetch_op(op);
> + }
> +}
> +
> /*
> * Create operations list from IOCTL arguments, setup operations fields so parse
> * and commit steps are decoupled from IOCTL arguments. This step can fail.
> */
> static struct drm_gpuva_ops *
> -vm_bind_ioctl_ops_create(struct xe_vm *vm, struct xe_bo *bo,
> - u64 bo_offset_or_userptr, u64 addr, u64 range,
> +vm_bind_ioctl_ops_create(struct xe_vm *vm, struct xe_vma_ops *vops,
> + struct xe_bo *bo, u64 bo_offset_or_userptr,
> + u64 addr, u64 range,
> u32 operation, u32 flags,
> u32 prefetch_region, u16 pat_index)
> {
> @@ -2262,6 +2300,7 @@ vm_bind_ioctl_ops_create(struct xe_vm *vm, struct xe_bo *bo,
> struct drm_gpuva_ops *ops;
> struct drm_gpuva_op *__op;
> struct drm_gpuvm_bo *vm_bo;
> + u64 range_end = addr + range;
> int err;
>
> lockdep_assert_held_write(&vm->lock);
> @@ -2323,14 +2362,61 @@ vm_bind_ioctl_ops_create(struct xe_vm *vm, struct xe_bo *bo,
> op->map.invalidate_on_bind =
> __xe_vm_needs_clear_scratch_pages(vm, flags);
> } else if (__op->op == DRM_GPUVA_OP_PREFETCH) {
> - op->prefetch.region = prefetch_region;
> - }
> + struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
> +
> + if (!xe_vma_is_cpu_addr_mirror(vma)) {
> + op->prefetch.region = prefetch_region;
> + break;
> + }
>
> + struct drm_gpusvm_ctx ctx = {
> + .read_only = xe_vma_read_only(vma),
> + .devmem_possible = IS_DGFX(vm->xe) &&
> + IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR),
> + .check_pages_threshold = IS_DGFX(vm->xe) &&
> + IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR) ?
> + SZ_64K : 0,
The alignment looks weird here.
Also, you don't technically need to set check_pages_threshold here given
it is only used by get_pages, which is not called here.
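i.e. this could just be:

	struct drm_gpusvm_ctx ctx = {
		.read_only = xe_vma_read_only(vma),
		.devmem_possible = IS_DGFX(vm->xe) &&
			IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR),
	};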
> + };
> +
> + op->prefetch_range.region = prefetch_region;
> + struct xe_svm_range *svm_range;
> + int i = 0;
I don't think you need 'i' here; you can probably just use xa_alloc
rather than xa_store if you use xa_for_each everywhere else.
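e.g. something like this (assumes the xarray is initialized with
XA_FLAGS_ALLOC so xa_alloc can hand out the index):

	u32 id;

	xa_init_flags(&op->prefetch_range.range, XA_FLAGS_ALLOC);
	...
	err = xa_alloc(&op->prefetch_range.range, &id, svm_range,
		       xa_limit_32b, GFP_KERNEL);
	if (err)
		goto unwind_prefetch_ops;
	op->prefetch_range.ranges_count++;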
> +
> + xa_init(&op->prefetch_range.range);
> + op->prefetch_range.ranges_count = 0;
> +alloc_next_range:
> + svm_range = xe_svm_range_find_or_insert(vm, addr, vma, &ctx);
> +
I think you want to check if the range already has a mapping and is in
its preferred location; if it is, don't add it to the xarray, as there
is no reason to try to migrate it or rebind the GPU pages.
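Something like this, building on the xa_alloc above (xe_svm_range_resident()
is a made-up name for whatever check we settle on; the range_end/addr walk
below stays as is):

	/* skip ranges that already have valid pages in the requested
	 * placement, nothing to migrate or rebind for them */
	if (!xe_svm_range_resident(svm_range, prefetch_region)) {
		err = xa_alloc(&op->prefetch_range.range, &id, svm_range,
			       xa_limit_32b, GFP_KERNEL);
		if (err)
			goto unwind_prefetch_ops;
		op->prefetch_range.ranges_count++;
		vops->flags |= XE_VMA_OPS_HAS_SVM_PREFETCH;
	}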
> + if (PTR_ERR(svm_range) == -ENOENT)
> + break;
> +
> + if (IS_ERR(svm_range)) {
> + err = PTR_ERR(svm_range);
> + goto unwind_prefetch_ops;
> + }
> +
> + xa_store(&op->prefetch_range.range, i, svm_range, GFP_KERNEL);
> + op->prefetch_range.ranges_count++;
> + vops->flags |= XE_VMA_OPS_HAS_SVM_PREFETCH;
> +
> + if (range_end > xe_svm_range_end(svm_range) &&
> + xe_svm_range_end(svm_range) < xe_vma_end(vma)) {
> + i++;
> + addr = xe_svm_range_end(svm_range);
> + goto alloc_next_range;
> + }
> + }
> print_op(vm->xe, __op);
> }
>
> return ops;
> +
> +unwind_prefetch_ops:
> + clean_svm_prefetch_in_gpuva_ops(ops);
> + drm_gpuva_ops_free(&vm->gpuvm, ops);
> + return ERR_PTR(err);
> }
> +
> ALLOW_ERROR_INJECTION(vm_bind_ioctl_ops_create, ERRNO);
>
> static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
> @@ -2645,8 +2731,12 @@ static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct drm_gpuva_ops *ops,
> return err;
> }
>
> - if (!xe_vma_is_cpu_addr_mirror(vma))
> + if (xe_vma_is_cpu_addr_mirror(vma))
> + xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask,
> + op->prefetch_range.ranges_count);
> + else
> xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask, 1);
> +
> break;
> default:
> drm_warn(&vm->xe->drm, "NOT POSSIBLE");
> @@ -2772,6 +2862,58 @@ static int check_ufence(struct xe_vma *vma)
> return 0;
> }
>
> +static int prefetch_ranges_lock_and_prep(struct xe_vm *vm,
> + struct xe_vma_op *op)
> +{
> + int err = 0;
> +
> + if (op->base.op == DRM_GPUVA_OP_PREFETCH) {
if (op->base.op != DRM_GPUVA_OP_PREFETCH || !xe_vma_is_cpu_addr_mirror(vma))
	return 0;
Will help with spacing. Or do this check at the caller.
> + struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
> + struct drm_gpusvm_ctx ctx = {
> + .read_only = xe_vma_read_only(vma),
> + .devmem_possible = IS_DGFX(vm->xe) &&
> + IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR),
> + .check_pages_threshold = IS_DGFX(vm->xe) &&
> + IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR) ?
> + SZ_64K : 0,
> + };
> + struct xe_svm_range *svm_range;
> + struct xe_tile *tile;
> + u32 region;
> + int i;
> +
> + if (!xe_vma_is_cpu_addr_mirror(vma))
> + return 0;
> +
> + region = op->prefetch_range.region;
> +
> + /* TODO: Threading the migration */
> + for (i = 0; i < op->prefetch_range.ranges_count; i++) {
Again xa_for_each...
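i.e. (untested; 'i' becomes unsigned long, and drm_dbg rather than drm_err
per the comments below):

	xa_for_each(&op->prefetch_range.range, i, svm_range) {
		if (xe_svm_range_needs_migrate_to_vram(svm_range, vma, region)) {
			tile = &vm->xe->tiles[region_to_mem_type[region] - XE_PL_VRAM0];
			err = xe_svm_alloc_vram(vm, tile, svm_range, &ctx);
			if (err) {
				drm_dbg(&vm->xe->drm, "VRAM allocation failed, can be retried from userspace, asid=%u, gpusvm=%p, errno=%pe\n",
					vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
				return -ENODATA;
			}
		}

		err = xe_svm_range_get_pages(vm, svm_range, &ctx);
		if (err) {
			if (err == -EOPNOTSUPP || err == -EFAULT || err == -EPERM)
				err = -ENODATA;

			drm_dbg(&vm->xe->drm, "Get pages failed, asid=%u, gpusvm=%p, errno=%pe\n",
				vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
			return err;
		}
	}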
> + svm_range = xa_load(&op->prefetch_range.range, i);
> + if (xe_svm_range_needs_migrate_to_vram(svm_range, vma, region)) {
> + tile = &vm->xe->tiles[region_to_mem_type[region] - XE_PL_VRAM0];
> + err = xe_svm_alloc_vram(vm, tile, svm_range, &ctx);
> + if (err) {
> + drm_err(&vm->xe->drm, "VRAM allocation failed, can be retried from userspace, asid=%u, gpusvm=%p, errno=%pe\n",
> + vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
Not drm_err here but drm_dbg, as user space can easily retrigger this.
> + return -ENODATA;
So this gets squashed into return 0, which I think is correct for now.
Same explanation as above wrt the error codes.
> + }
> + }
> +
> + err = xe_svm_range_get_pages(vm, svm_range, &ctx);
> + if (err) {
> + if (err == -EOPNOTSUPP || err == -EFAULT || err == -EPERM)
> + err = -ENODATA;
> +
> + drm_err(&vm->xe->drm, "Get pages failed, asid=%u, gpusvm=%p, errno=%pe\n",
> + vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
Same as above.
Btw, we are going to want IGTs to test these error paths: issue a
prefetch, then have another thread immediately touch some of the memory
to abort the prefetch.
> + return err;
> + }
> + }
> + }
> + return err;
> +}
> +
> static int op_lock_and_prep(struct drm_exec *exec, struct xe_vm *vm,
> struct xe_vma_op *op)
> {
> @@ -2809,7 +2951,12 @@ static int op_lock_and_prep(struct drm_exec *exec, struct xe_vm *vm,
> case DRM_GPUVA_OP_PREFETCH:
> {
> struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
> - u32 region = op->prefetch.region;
> + u32 region;
> +
> + if (xe_vma_is_cpu_addr_mirror(vma))
> + region = op->prefetch_range.region;
> + else
> + region = op->prefetch.region;
>
> xe_assert(vm->xe, region <= ARRAY_SIZE(region_to_mem_type));
>
> @@ -2828,6 +2975,23 @@ static int op_lock_and_prep(struct drm_exec *exec, struct xe_vm *vm,
> return err;
> }
>
> +static int xe_vma_ops_execute_ready(struct xe_vm *vm, struct xe_vma_ops *vops)
> +{
Let's make these names consistent.
How about...
s/xe_vma_ops_execute_ready/vm_bind_ioctl_ops_prefetch_ranges
s/prefetch_ranges_lock_and_prep/prefetch_ranges
> + struct xe_vma_op *op;
> + int err;
> +
> + if (!(vops->flags & XE_VMA_OPS_HAS_SVM_PREFETCH))
> + return 0;
> +
> + list_for_each_entry(op, &vops->list, link) {
> + err = prefetch_ranges_lock_and_prep(vm, op);
> + if (err)
> + return err;
> + }
> +
> + return 0;
> +}
> +
> static int vm_bind_ioctl_ops_lock_and_prep(struct drm_exec *exec,
> struct xe_vm *vm,
> struct xe_vma_ops *vops)
> @@ -2850,7 +3014,6 @@ static int vm_bind_ioctl_ops_lock_and_prep(struct drm_exec *exec,
> vm->xe->vm_inject_error_position == FORCE_OP_ERROR_LOCK)
> return -ENOSPC;
> #endif
> -
Looks unrelated.
Matt
> return 0;
> }
>
> @@ -3492,7 +3655,7 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> u32 prefetch_region = bind_ops[i].prefetch_mem_region_instance;
> u16 pat_index = bind_ops[i].pat_index;
>
> - ops[i] = vm_bind_ioctl_ops_create(vm, bos[i], obj_offset,
> + ops[i] = vm_bind_ioctl_ops_create(vm, &vops, bos[i], obj_offset,
> addr, range, op, flags,
> prefetch_region, pat_index);
> if (IS_ERR(ops[i])) {
> @@ -3525,6 +3688,10 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> if (err)
> goto unwind_ops;
>
> + err = xe_vma_ops_execute_ready(vm, &vops);
> + if (err)
> + goto unwind_ops;
> +
> fence = vm_bind_ioctl_ops_execute(vm, &vops);
> if (IS_ERR(fence))
> err = PTR_ERR(fence);
> @@ -3594,7 +3761,7 @@ struct dma_fence *xe_vm_bind_kernel_bo(struct xe_vm *vm, struct xe_bo *bo,
>
> xe_vma_ops_init(&vops, vm, q, NULL, 0);
>
> - ops = vm_bind_ioctl_ops_create(vm, bo, 0, addr, bo->size,
> + ops = vm_bind_ioctl_ops_create(vm, &vops, bo, 0, addr, bo->size,
> DRM_XE_VM_BIND_OP_MAP, 0, 0,
> vm->xe->pat.idx[cache_lvl]);
> if (IS_ERR(ops)) {
> --
> 2.34.1
>