[Intel-xe] [RFC] drm/xe: Handle -EDEADLK case in preempt worker

Mon May 8 22:34:50 UTC 2023

On Sun, May 07, 2023 at 11:27:30PM -0700, Niranjana Vishwanathapura wrote:
> With multiple active VMs, under memory pressure, it is possible that
> ttm_bo_validate() run into -EDEADLK in ttm_mem_evict_wait_busy() and
> return -ENOMEM.
> 
> Until ttm properly handles locking in such scenarios, best thing the
> driver can do is unwind the lock and retry.
> 
> Signed-off-by: Niranjana Vishwanathapura <niranjana.vishwanathapura at intel.com>
> ---
>  drivers/gpu/drm/xe/xe_vm.c | 18 ++++++++++++++++++
>  1 file changed, 18 insertions(+)
> 
> diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> index 91576cec000d..df1214f00a06 100644
> --- a/drivers/gpu/drm/xe/xe_vm.c
> +++ b/drivers/gpu/drm/xe/xe_vm.c
> @@ -10,6 +10,7 @@
>  #include <drm/ttm/ttm_execbuf_util.h>
>  #include <drm/ttm/ttm_tt.h>
>  #include <drm/xe_drm.h>
> +#include <linux/delay.h>
>  #include <linux/kthread.h>
>  #include <linux/mm.h>
>  #include <linux/swap.h>
> @@ -508,11 +509,15 @@ void xe_vm_unlock_dma_resv(struct xe_vm *vm,
>  		kvfree(tv);
>  }
>  
> +#define XE_VM_REBIND_RETRY_TIMEOUT_MS 1000
> +
>  static void preempt_rebind_work_func(struct work_struct *w)
>  {
>  	struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
>  	struct xe_vma *vma;
>  	struct ttm_validate_buffer tv_onstack[XE_ONSTACK_TV];
> +	const ktime_t retry_end = ktime_add_ms(ktime_get(),
> +					       XE_VM_REBIND_RETRY_TIMEOUT_MS);
>  	struct ttm_validate_buffer *tv;
>  	struct ww_acquire_ctx ww;
>  	struct list_head objs;
> @@ -637,6 +642,19 @@ static void preempt_rebind_work_func(struct work_struct *w)
>  		trace_xe_vm_rebind_worker_retry(vm);
>  		goto retry;
>  	}
> +
> +	/*
> +	 * With multiple active VMs, under memory pressure, it is possible that
> +	 * ttm_bo_validate() run into -EDEADLK and in such case returns -ENOMEM.
> +	 * Until ttm properly handles locking in such scenarios, best thing the
> +	 * driver can do is retry with a timeout. Killing the VM or putting it
> +	 * in error state after timeout or other error scenarios is still TBD.
> +	 */
> +	if ((err == -ENOMEM) && ktime_before(ktime_get(), retry_end)) {

I'd combine this timer with the userptr case too (-EAGAIN) above as that
can livelock too, hopefully Thomas agrees? Let's get his opinion on this
too.

Matt

> +		msleep(20);
> +		trace_xe_vm_rebind_worker_retry(vm);
> +		goto retry;
> +	}
>  	up_write(&vm->lock);
>  
>  	free_preempt_fences(&preempt_fences);
> -- 
> 2.21.0.rc0.32.g243a4c7e27
>