[Intel-xe] [PATCH 2/2] drm/xe: Handle -EDEADLK case in exec ioctl
Hellstrom, Thomas
thomas.hellstrom at intel.com
Tue May 9 05:51:53 UTC 2023
On Mon, 2023-05-08 at 22:24 -0700, Niranjana Vishwanathapura wrote:
> With multiple active VMs, under memory pressure, it is possible that
> ttm_bo_validate() runs into -EDEADLK in ttm_mem_evict_wait_busy() and
> returns -ENOMEM.
>
> Until ttm properly handles locking in such scenarios, the best thing the
> driver can do is unwind the lock and retry.
>
> Update xe_exec_begin to retry validating BOs with a timeout upon
> -ENOMEM.
>
> Signed-off-by: Niranjana Vishwanathapura
> <niranjana.vishwanathapura at intel.com>
> ---
> drivers/gpu/drm/xe/xe_exec.c | 27 ++++++++++++++++++++++++---
> 1 file changed, 24 insertions(+), 3 deletions(-)
Reviewed-by: Thomas Hellström <thomas.hellstrom at linux.intel.com>
>
> diff --git a/drivers/gpu/drm/xe/xe_exec.c
> b/drivers/gpu/drm/xe/xe_exec.c
> index ea869f2452ef..3db1b159586e 100644
> --- a/drivers/gpu/drm/xe/xe_exec.c
> +++ b/drivers/gpu/drm/xe/xe_exec.c
> @@ -8,6 +8,7 @@
> #include <drm/drm_device.h>
> #include <drm/drm_file.h>
> #include <drm/xe_drm.h>
> +#include <linux/delay.h>
>
> #include "xe_bo.h"
> #include "xe_device.h"
> @@ -91,6 +92,8 @@
> * Unlock all
> */
>
> +#define XE_EXEC_BIND_RETRY_TIMEOUT_MS 1000
> +
> static int xe_exec_begin(struct xe_engine *e, struct ww_acquire_ctx
> *ww,
> struct ttm_validate_buffer tv_onstack[],
> struct ttm_validate_buffer **tv,
> @@ -99,12 +102,14 @@ static int xe_exec_begin(struct xe_engine *e,
> struct ww_acquire_ctx *ww,
> struct xe_vm *vm = e->vm;
> struct xe_vma *vma;
> LIST_HEAD(dups);
> - int err;
> + ktime_t end = 0;
> + int err = 0;
>
> *tv = NULL;
> if (xe_vm_no_dma_fences(e->vm))
> return 0;
>
> +retry:
> err = xe_vm_lock_dma_resv(vm, ww, tv_onstack, tv, objs, true,
> 1);
> if (err)
> return err;
> @@ -122,11 +127,27 @@ static int xe_exec_begin(struct xe_engine *e,
> struct ww_acquire_ctx *ww,
> if (err) {
> xe_vm_unlock_dma_resv(vm, tv_onstack, *tv,
> ww, objs);
> *tv = NULL;
> - return err;
> + break;
> + }
> + }
> +
> + /*
> + * With multiple active VMs, under memory pressure, it is possible that
> + * ttm_bo_validate() runs into -EDEADLK and in such a case returns -ENOMEM.
> + * Until ttm properly handles locking in such scenarios, the best thing the
> + * driver can do is retry with a timeout.
> + */
> + if (err == -ENOMEM) {
> + ktime_t cur = ktime_get();
> +
> + end = end ? : ktime_add_ms(cur,
> XE_EXEC_BIND_RETRY_TIMEOUT_MS);
> + if (ktime_before(cur, end)) {
> + msleep(20);
> + goto retry;
> }
> }
>
> - return 0;
> + return err;
> }
>
> static void xe_exec_end(struct xe_engine *e,
More information about the Intel-xe
mailing list