[PATCH 08/15] drm/xe: Convert existing drm_exec transactions for exhaustive eviction
Matthew Brost
matthew.brost at intel.com
Thu Aug 14 02:48:13 UTC 2025
On Wed, Aug 13, 2025 at 12:51:14PM +0200, Thomas Hellström wrote:
> Convert existing drm_exec transactions, like GT pagefault validation,
> the non-LR exec() IOCTL and the rebind worker, to support
> exhaustive eviction using xe_validation_guard().
>
> Signed-off-by: Thomas Hellström <thomas.hellstrom at linux.intel.com>
Reviewed-by: Matthew Brost <matthew.brost at intel.com>
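
One note for anyone skimming the series: the open-coded conversions
below all reduce to the same shape. A minimal sketch of the pattern
(the helper name and body are made up for illustration; the macro
arguments mirror the hunks below):

static int lock_vm_for_validation(struct xe_vm *vm)
{
	struct xe_validation_ctx ctx;
	struct drm_exec exec;
	int err = 0;

	/*
	 * xe_validation_guard() replaces the open-coded
	 * drm_exec_init()/drm_exec_fini() pair plus the ktime-based
	 * xe_vm_validate_should_retry() retry loop.
	 */
	xe_validation_guard(&ctx, &vm->xe->val, &exec,
			    DRM_EXEC_INTERRUPTIBLE_WAIT, err, false) {
		err = drm_exec_lock_obj(&exec, xe_vm_obj(vm));
		drm_exec_retry_on_contention(&exec);
		/*
		 * As I read it, -ENOMEM from validation restarts the
		 * transaction in exhaustive-eviction mode instead of
		 * erroring out.
		 */
		xe_validation_retry_on_oom(&ctx, &err);
	}
	return err;
}
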
> ---
> drivers/gpu/drm/xe/xe_exec.c | 20 ++--
> drivers/gpu/drm/xe/xe_gt_pagefault.c | 20 ++--
> drivers/gpu/drm/xe/xe_svm.c | 4 -
> drivers/gpu/drm/xe/xe_vm.c | 132 +++++++++++----------------
> drivers/gpu/drm/xe/xe_vm.h | 2 -
> 5 files changed, 70 insertions(+), 108 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
> index 0bcb4fb9a10e..cdc3ff931a90 100644
> --- a/drivers/gpu/drm/xe/xe_exec.c
> +++ b/drivers/gpu/drm/xe/xe_exec.c
> @@ -119,10 +119,10 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> struct drm_gpuvm_exec vm_exec = {.extra.fn = xe_exec_fn};
> struct drm_exec *exec = &vm_exec.exec;
> u32 i, num_syncs, num_ufence = 0;
> + struct xe_validation_ctx ctx;
> struct xe_sched_job *job;
> struct xe_vm *vm;
> bool write_locked, skip_retry = false;
> - ktime_t end = 0;
> int err = 0;
> struct xe_hw_engine_group *group;
> enum xe_hw_engine_group_execution_mode mode, previous_mode;
> @@ -241,17 +241,12 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> goto err_unlock_list;
> }
>
> - vm_exec.vm = &vm->gpuvm;
> - vm_exec.flags = DRM_EXEC_INTERRUPTIBLE_WAIT;
> - if (xe_vm_in_lr_mode(vm)) {
> - drm_exec_init(exec, vm_exec.flags, 0);
> - } else {
> - err = drm_gpuvm_exec_lock(&vm_exec);
> - if (err) {
> - if (xe_vm_validate_should_retry(exec, err, &end))
> - err = -EAGAIN;
> + if (!xe_vm_in_lr_mode(vm)) {
> + vm_exec.vm = &vm->gpuvm;
> + vm_exec.flags = DRM_EXEC_INTERRUPTIBLE_WAIT;
> + err = xe_validation_exec_lock(&ctx, &vm_exec, &xe->val);
> + if (err)
> goto err_unlock_list;
> - }
> }
>
> if (xe_vm_is_closed_or_banned(q->vm)) {
> @@ -345,7 +340,8 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> if (err)
> xe_sched_job_put(job);
> err_exec:
> - drm_exec_fini(exec);
> + if (!xe_vm_in_lr_mode(vm))
> + xe_validation_ctx_fini(&ctx);
> err_unlock_list:
> up_read(&vm->lock);
> if (err == -EAGAIN && !skip_retry)
> diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c
> index 2c7f10cc423f..67dc503d6e04 100644
> --- a/drivers/gpu/drm/xe/xe_gt_pagefault.c
> +++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c
> @@ -112,9 +112,9 @@ static int handle_vma_pagefault(struct xe_gt *gt, struct xe_vma *vma,
> {
> struct xe_vm *vm = xe_vma_vm(vma);
> struct xe_tile *tile = gt_to_tile(gt);
> + struct xe_validation_ctx ctx;
> struct drm_exec exec;
> struct dma_fence *fence;
> - ktime_t end = 0;
> int err;
>
> lockdep_assert_held_write(&vm->lock);
> @@ -139,12 +139,11 @@ static int handle_vma_pagefault(struct xe_gt *gt, struct xe_vma *vma,
> }
>
> /* Lock VM and BOs dma-resv */
> - drm_exec_init(&exec, 0, 0);
> + xe_validation_ctx_init(&ctx, &vm->xe->val, &exec, 0, 0, false);
> drm_exec_until_all_locked(&exec) {
> err = xe_pf_begin(&exec, vma, atomic, tile->mem.vram);
> drm_exec_retry_on_contention(&exec);
> - if (xe_vm_validate_should_retry(&exec, err, &end))
> - err = -EAGAIN;
> + xe_validation_retry_on_oom(&ctx, &err);
> if (err)
> goto unlock_dma_resv;
>
> @@ -153,8 +152,7 @@ static int handle_vma_pagefault(struct xe_gt *gt, struct xe_vma *vma,
> fence = xe_vma_rebind(vm, vma, BIT(tile->id));
> if (IS_ERR(fence)) {
> err = PTR_ERR(fence);
> - if (xe_vm_validate_should_retry(&exec, err, &end))
> - err = -EAGAIN;
> + xe_validation_retry_on_oom(&ctx, &err);
> goto unlock_dma_resv;
> }
> }
> @@ -163,7 +161,7 @@ static int handle_vma_pagefault(struct xe_gt *gt, struct xe_vma *vma,
> dma_fence_put(fence);
>
> unlock_dma_resv:
> - drm_exec_fini(&exec);
> + xe_validation_ctx_fini(&ctx);
> if (err == -EAGAIN)
> goto retry_userptr;
>
> @@ -545,6 +543,7 @@ static int handle_acc(struct xe_gt *gt, struct acc *acc)
> {
> struct xe_device *xe = gt_to_xe(gt);
> struct xe_tile *tile = gt_to_tile(gt);
> + struct xe_validation_ctx ctx;
> struct drm_exec exec;
> struct xe_vm *vm;
> struct xe_vma *vma;
> @@ -574,15 +573,14 @@ static int handle_acc(struct xe_gt *gt, struct acc *acc)
> goto unlock_vm;
>
> /* Lock VM and BOs dma-resv */
> - drm_exec_init(&exec, 0, 0);
> + xe_validation_ctx_init(&ctx, &vm->xe->val, &exec, 0, 0, false);
> drm_exec_until_all_locked(&exec) {
> ret = xe_pf_begin(&exec, vma, true, tile->mem.vram);
> drm_exec_retry_on_contention(&exec);
> - if (ret)
> - break;
> + xe_validation_retry_on_oom(&ctx, &ret);
> }
>
> - drm_exec_fini(&exec);
> + xe_validation_ctx_fini(&ctx);
> unlock_vm:
> up_read(&vm->lock);
> xe_vm_put(vm);
> diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c
> index ba85665d85d4..93d10f0b81cb 100644
> --- a/drivers/gpu/drm/xe/xe_svm.c
> +++ b/drivers/gpu/drm/xe/xe_svm.c
> @@ -821,7 +821,6 @@ int xe_svm_handle_pagefault(struct xe_vm *vm, struct xe_vma *vma,
> struct dma_fence *fence;
> struct xe_tile *tile = gt_to_tile(gt);
> int migrate_try_count = ctx.devmem_only ? 3 : 1;
> - ktime_t end = 0;
> int err;
>
> lockdep_assert_held_write(&vm->lock);
> @@ -891,7 +890,6 @@ int xe_svm_handle_pagefault(struct xe_vm *vm, struct xe_vma *vma,
>
> range_debug(range, "PAGE FAULT - BIND");
>
> -retry_bind:
> xe_vm_lock(vm, false);
> fence = xe_vm_range_rebind(vm, vma, range, BIT(tile->id));
> if (IS_ERR(fence)) {
> @@ -902,8 +900,6 @@ int xe_svm_handle_pagefault(struct xe_vm *vm, struct xe_vma *vma,
> range_debug(range, "PAGE FAULT - RETRY BIND");
> goto retry;
> }
> - if (xe_vm_validate_should_retry(NULL, err, &end))
> - goto retry_bind;
> goto err_out;
> }
> xe_vm_unlock(vm);
> diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> index 1c2d9d9065c6..989d84c2e82f 100644
> --- a/drivers/gpu/drm/xe/xe_vm.c
> +++ b/drivers/gpu/drm/xe/xe_vm.c
> @@ -241,6 +241,7 @@ int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
> .num_fences = 1,
> };
> struct drm_exec *exec = &vm_exec.exec;
> + struct xe_validation_ctx ctx;
> struct dma_fence *pfence;
> int err;
> bool wait;
> @@ -248,7 +249,7 @@ int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
> xe_assert(vm->xe, xe_vm_in_preempt_fence_mode(vm));
>
> down_write(&vm->lock);
> - err = drm_gpuvm_exec_lock(&vm_exec);
> + err = xe_validation_exec_lock(&ctx, &vm_exec, &vm->xe->val);
> if (err)
> goto out_up_write;
>
> @@ -280,7 +281,7 @@ int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
> up_read(&vm->userptr.notifier_lock);
>
> out_fini:
> - drm_exec_fini(exec);
> + xe_validation_ctx_fini(&ctx);
> out_up_write:
> up_write(&vm->lock);
>
> @@ -363,39 +364,6 @@ void xe_vm_kill(struct xe_vm *vm, bool unlocked)
> /* TODO: Inform user the VM is banned */
> }
>
> -/**
> - * xe_vm_validate_should_retry() - Whether to retry after a validate error.
> - * @exec: The drm_exec object used for locking before validation.
> - * @err: The error returned from ttm_bo_validate().
> - * @end: A ktime_t cookie that should be set to 0 before first use and
> - * that should be reused on subsequent calls.
> - *
> - * With multiple active VMs, under memory pressure, it is possible that
> - * ttm_bo_validate() run into -EDEADLK and in such case returns -ENOMEM.
> - * Until ttm properly handles locking in such scenarios, best thing the
> - * driver can do is retry with a timeout. Check if that is necessary, and
> - * if so unlock the drm_exec's objects while keeping the ticket to prepare
> - * for a rerun.
> - *
> - * Return: true if a retry after drm_exec_init() is recommended;
> - * false otherwise.
> - */
> -bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, ktime_t *end)
> -{
> - ktime_t cur;
> -
> - if (err != -ENOMEM)
> - return false;
> -
> - cur = ktime_get();
> - *end = *end ? : ktime_add_ms(cur, XE_VM_REBIND_RETRY_TIMEOUT_MS);
> - if (!ktime_before(cur, *end))
> - return false;
> -
> - msleep(20);
> - return true;
> -}
> -
> static int xe_gpuvm_validate(struct drm_gpuvm_bo *vm_bo, struct drm_exec *exec)
> {
> struct xe_vm *vm = gpuvm_to_vm(vm_bo->vm);
> @@ -497,10 +465,10 @@ static int xe_preempt_work_begin(struct drm_exec *exec, struct xe_vm *vm,
> static void preempt_rebind_work_func(struct work_struct *w)
> {
> struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
> + struct xe_validation_ctx ctx;
> struct drm_exec exec;
> unsigned int fence_count = 0;
> LIST_HEAD(preempt_fences);
> - ktime_t end = 0;
> int err = 0;
> long wait;
> int __maybe_unused tries = 0;
> @@ -523,19 +491,20 @@ static void preempt_rebind_work_func(struct work_struct *w)
> goto out_unlock_outer;
> }
>
> - drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
> + err = xe_validation_ctx_init(&ctx, &vm->xe->val,
> + &exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0, false);
> + if (err)
> + goto out_unlock_outer;
>
> drm_exec_until_all_locked(&exec) {
> bool done = false;
>
> err = xe_preempt_work_begin(&exec, vm, &done);
> drm_exec_retry_on_contention(&exec);
> + xe_validation_retry_on_oom(&ctx, &err);
> if (err || done) {
> xe_vm_set_validation_exec(vm, NULL);
> - drm_exec_fini(&exec);
> - if (err && xe_vm_validate_should_retry(&exec, err, &end))
> - err = -EAGAIN;
> -
> + xe_validation_ctx_fini(&ctx);
> goto out_unlock_outer;
> }
> }
> @@ -582,7 +551,7 @@ static void preempt_rebind_work_func(struct work_struct *w)
>
> out_unlock:
> xe_vm_set_validation_exec(vm, NULL);
> - drm_exec_fini(&exec);
> + xe_validation_ctx_fini(&ctx);
> out_unlock_outer:
> if (err == -EAGAIN) {
> trace_xe_vm_rebind_worker_retry(vm);
> @@ -1400,20 +1369,19 @@ int xe_vm_lock_vma(struct drm_exec *exec, struct xe_vma *vma)
>
> static void xe_vma_destroy_unlocked(struct xe_vma *vma)
> {
> + struct xe_device *xe = xe_vma_vm(vma)->xe;
> + struct xe_validation_ctx ctx;
> struct drm_exec exec;
> - int err;
> + int err = 0;
>
> - drm_exec_init(&exec, 0, 0);
> - drm_exec_until_all_locked(&exec) {
> + xe_validation_guard(&ctx, &xe->val, &exec, 0, err, false) {
> err = xe_vm_lock_vma(&exec, vma);
> drm_exec_retry_on_contention(&exec);
> if (XE_WARN_ON(err))
> break;
> + xe_vma_destroy(vma, NULL);
> }
> -
> - xe_vma_destroy(vma, NULL);
> -
> - drm_exec_fini(&exec);
> + xe_assert(xe, !err);
> }
>
> struct xe_vma *
> @@ -2490,6 +2458,7 @@ static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
> u16 pat_index, unsigned int flags)
> {
> struct xe_bo *bo = op->gem.obj ? gem_to_xe_bo(op->gem.obj) : NULL;
> + struct xe_validation_ctx ctx;
> struct drm_exec exec;
> struct xe_vma *vma;
> int err = 0;
> @@ -2497,9 +2466,9 @@ static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
> lockdep_assert_held_write(&vm->lock);
>
> if (bo) {
> - drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
> - drm_exec_until_all_locked(&exec) {
> - err = 0;
> + err = 0;
> + xe_validation_guard(&ctx, &vm->xe->val, &exec,
> + DRM_EXEC_INTERRUPTIBLE_WAIT, err, false) {
> if (!bo->vm) {
> err = drm_exec_lock_obj(&exec, xe_vm_obj(vm));
> drm_exec_retry_on_contention(&exec);
> @@ -2508,27 +2477,34 @@ static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
> err = drm_exec_lock_obj(&exec, &bo->ttm.base);
> drm_exec_retry_on_contention(&exec);
> }
> - if (err) {
> - drm_exec_fini(&exec);
> + if (err)
> return ERR_PTR(err);
> +
> + vma = xe_vma_create(vm, bo, op->gem.offset,
> + op->va.addr, op->va.addr +
> + op->va.range - 1, pat_index, flags);
> + if (IS_ERR(vma))
> + return vma;
> +
> + if (!bo->vm) {
> + err = add_preempt_fences(vm, bo);
> + goto out_err;
> }
> }
> + if (err)
> + return ERR_PTR(err);
> + } else {
> + vma = xe_vma_create(vm, NULL, op->gem.offset,
> + op->va.addr, op->va.addr +
> + op->va.range - 1, pat_index, flags);
> + if (IS_ERR(vma))
> + return vma;
> +
> + if (xe_vma_is_userptr(vma))
> + err = xe_vma_userptr_pin_pages(to_userptr_vma(vma));
> }
> - vma = xe_vma_create(vm, bo, op->gem.offset,
> - op->va.addr, op->va.addr +
> - op->va.range - 1, pat_index, flags);
> - if (IS_ERR(vma))
> - goto err_unlock;
> -
> - if (xe_vma_is_userptr(vma))
> - err = xe_vma_userptr_pin_pages(to_userptr_vma(vma));
> - else if (!xe_vma_has_no_bo(vma) && !bo->vm)
> - err = add_preempt_fences(vm, bo);
> -
> -err_unlock:
> - if (bo)
> - drm_exec_fini(&exec);
>
> +out_err:
> if (err) {
> prep_vma_destroy(vm, vma, false);
> xe_vma_destroy_unlocked(vma);
> @@ -3296,34 +3272,32 @@ static void vm_bind_ioctl_ops_fini(struct xe_vm *vm, struct xe_vma_ops *vops,
> static struct dma_fence *vm_bind_ioctl_ops_execute(struct xe_vm *vm,
> struct xe_vma_ops *vops)
> {
> + struct xe_validation_ctx ctx;
> struct drm_exec exec;
> struct dma_fence *fence;
> - int err;
> + int err = 0;
>
> lockdep_assert_held_write(&vm->lock);
>
> - drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT |
> - DRM_EXEC_IGNORE_DUPLICATES, 0);
> - drm_exec_until_all_locked(&exec) {
> + xe_validation_guard(&ctx, &vm->xe->val, &exec,
> + DRM_EXEC_INTERRUPTIBLE_WAIT |
> + DRM_EXEC_IGNORE_DUPLICATES, err, true) {
> err = vm_bind_ioctl_ops_lock_and_prep(&exec, vm, vops);
> drm_exec_retry_on_contention(&exec);
> - if (err) {
> - fence = ERR_PTR(err);
> - goto unlock;
> - }
> + xe_validation_retry_on_oom(&ctx, &err);
> + if (err)
> + return ERR_PTR(err);
>
> fence = ops_execute(vm, vops);
> if (IS_ERR(fence)) {
> if (PTR_ERR(fence) == -ENODATA)
> vm_bind_ioctl_ops_fini(vm, vops, NULL);
> - goto unlock;
> + return fence;
> }
>
> vm_bind_ioctl_ops_fini(vm, vops, fence);
> }
>
> -unlock:
> - drm_exec_fini(&exec);
> return fence;
> }
> ALLOW_ERROR_INJECTION(vm_bind_ioctl_ops_execute, ERRNO);
> diff --git a/drivers/gpu/drm/xe/xe_vm.h b/drivers/gpu/drm/xe/xe_vm.h
> index 3b6e7234dac4..418940222690 100644
> --- a/drivers/gpu/drm/xe/xe_vm.h
> +++ b/drivers/gpu/drm/xe/xe_vm.h
> @@ -262,8 +262,6 @@ int xe_vma_userptr_pin_pages(struct xe_userptr_vma *uvma);
>
> int xe_vma_userptr_check_repin(struct xe_userptr_vma *uvma);
>
> -bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, ktime_t *end);
> -
> int xe_vm_lock_vma(struct drm_exec *exec, struct xe_vma *vma);
>
> int xe_vm_validate_rebind(struct xe_vm *vm, struct drm_exec *exec,
> --
> 2.50.1
>
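
For completeness, the drm_gpuvm_exec-based call sites (xe_exec.c and
xe_vm_add_compute_exec_queue()) use the other entry point instead of
the guard; roughly (again a sketch, helper name made up, fields taken
from the hunks above):

static int lock_gpuvm_for_validation(struct xe_vm *vm)
{
	struct xe_validation_ctx ctx;
	struct drm_gpuvm_exec vm_exec = {
		.vm = &vm->gpuvm,
		.flags = DRM_EXEC_INTERRUPTIBLE_WAIT,
		.num_fences = 1,
	};
	int err;

	/*
	 * xe_validation_exec_lock() subsumes drm_gpuvm_exec_lock() and
	 * the old retry-on-ENOMEM handling; it pairs with
	 * xe_validation_ctx_fini() rather than drm_exec_fini().
	 */
	err = xe_validation_exec_lock(&ctx, &vm_exec, &vm->xe->val);
	if (err)
		return err;

	/* ... work under the locked dma-resvs ... */

	xe_validation_ctx_fini(&ctx);
	return 0;
}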