[Intel-xe] [PATCH 1/2] drm/xe: Handle -EDEADLK case in preempt worker
Hellstrom, Thomas
thomas.hellstrom at intel.com
Tue May 9 05:46:48 UTC 2023
On Mon, 2023-05-08 at 22:24 -0700, Niranjana Vishwanathapura wrote:
> With multiple active VMs, under memory pressure, it is possible that
> ttm_bo_validate() run into -EDEADLK in ttm_mem_evict_wait_busy() and
> return -ENOMEM.
>
> Until ttm properly handles locking in such scenarios, best thing the
> driver can do is unwind the lock and retry.
>
> Update preempt worker to retry validating BOs with a timeout upon
> -ENOMEM and while at it, apply the timeout for retries upon -EAGAIN
> as well.
>
> Signed-off-by: Niranjana Vishwanathapura
> <niranjana.vishwanathapura at intel.com>
> ---
> drivers/gpu/drm/xe/xe_vm.c | 25 ++++++++++++++++++++++---
> 1 file changed, 22 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> index 91576cec000d..d92c9f659ab3 100644
> --- a/drivers/gpu/drm/xe/xe_vm.c
> +++ b/drivers/gpu/drm/xe/xe_vm.c
> @@ -10,6 +10,7 @@
> #include <drm/ttm/ttm_execbuf_util.h>
> #include <drm/ttm/ttm_tt.h>
> #include <drm/xe_drm.h>
> +#include <linux/delay.h>
> #include <linux/kthread.h>
> #include <linux/mm.h>
> #include <linux/swap.h>
> @@ -508,6 +509,8 @@ void xe_vm_unlock_dma_resv(struct xe_vm *vm,
> kvfree(tv);
> }
>
> +#define XE_VM_REBIND_RETRY_TIMEOUT_MS 1000
> +
> static void preempt_rebind_work_func(struct work_struct *w)
> {
> struct xe_vm *vm = container_of(w, struct xe_vm,
> preempt.rebind_work);
> @@ -519,6 +522,7 @@ static void preempt_rebind_work_func(struct
> work_struct *w)
> struct dma_fence *rebind_fence;
> unsigned int fence_count = 0;
> LIST_HEAD(preempt_fences);
> + ktime_t end = 0;
> int err;
> long wait;
> int __maybe_unused tries = 0;
> @@ -633,9 +637,24 @@ static void preempt_rebind_work_func(struct
> work_struct *w)
> out_unlock:
> xe_vm_unlock_dma_resv(vm, tv_onstack, tv, &ww, &objs);
> out_unlock_outer:
> - if (err == -EAGAIN) {
> - trace_xe_vm_rebind_worker_retry(vm);
> - goto retry;
> + /*
> + * With multiple active VMs, under memory pressure, it is
> possible that
> + * ttm_bo_validate() run into -EDEADLK and in such case
> returns -ENOMEM.
> + * Until ttm properly handles locking in such scenarios, best
> thing the
> + * driver can do is retry with a timeout. Apply timeout for
> retries upon
> + * -EAGAIN as well instead of indefinitely retrying. Killing
> the VM or
> + * putting it in error state after timeout or other error
> scenarios is
> + * still TBD.
> + */
> + if ((err == -EAGAIN) || (err == -ENOMEM)) {
We should not timeout for userptr -EAGAIN, see previous mail.
Otherwise LGTM.
> + ktime_t cur = ktime_get();
> +
> + end = end ? : ktime_add_ms(cur,
> XE_VM_REBIND_RETRY_TIMEOUT_MS);
> + if (ktime_before(cur, end)) {
> + msleep(20);
> + trace_xe_vm_rebind_worker_retry(vm);
> + goto retry;
> + }
> }
> up_write(&vm->lock);
>
More information about the Intel-xe
mailing list