[Intel-xe] [RFC] drm/xe: Handle -EDEADLK case in preempt worker
Niranjana Vishwanathapura
niranjana.vishwanathapura at intel.com
Mon May 8 06:27:30 UTC 2023
With multiple active VMs, under memory pressure, it is possible that
ttm_bo_validate() run into -EDEADLK in ttm_mem_evict_wait_busy() and
return -ENOMEM.
Until ttm properly handles locking in such scenarios, best thing the
driver can do is unwind the lock and retry.
Signed-off-by: Niranjana Vishwanathapura <niranjana.vishwanathapura at intel.com>
---
drivers/gpu/drm/xe/xe_vm.c | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)
diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index 91576cec000d..df1214f00a06 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -10,6 +10,7 @@
#include <drm/ttm/ttm_execbuf_util.h>
#include <drm/ttm/ttm_tt.h>
#include <drm/xe_drm.h>
+#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/mm.h>
#include <linux/swap.h>
@@ -508,11 +509,15 @@ void xe_vm_unlock_dma_resv(struct xe_vm *vm,
kvfree(tv);
}
+#define XE_VM_REBIND_RETRY_TIMEOUT_MS 1000
+
static void preempt_rebind_work_func(struct work_struct *w)
{
struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
struct xe_vma *vma;
struct ttm_validate_buffer tv_onstack[XE_ONSTACK_TV];
+ const ktime_t retry_end = ktime_add_ms(ktime_get(),
+ XE_VM_REBIND_RETRY_TIMEOUT_MS);
struct ttm_validate_buffer *tv;
struct ww_acquire_ctx ww;
struct list_head objs;
@@ -637,6 +642,19 @@ static void preempt_rebind_work_func(struct work_struct *w)
trace_xe_vm_rebind_worker_retry(vm);
goto retry;
}
+
+ /*
+ * With multiple active VMs, under memory pressure, it is possible that
+ * ttm_bo_validate() run into -EDEADLK and in such case returns -ENOMEM.
+ * Until ttm properly handles locking in such scenarios, best thing the
+ * driver can do is retry with a timeout. Killing the VM or putting it
+ * in error state after timeout or other error scenarios is still TBD.
+ */
+ if ((err == -ENOMEM) && ktime_before(ktime_get(), retry_end)) {
+ msleep(20);
+ trace_xe_vm_rebind_worker_retry(vm);
+ goto retry;
+ }
up_write(&vm->lock);
free_preempt_fences(&preempt_fences);
--
2.21.0.rc0.32.g243a4c7e27
More information about the Intel-xe
mailing list