[Intel-xe] [PATCH 1/2] drm/xe: Handle -EDEADLK case in preempt worker

Tue May 9 05:24:18 UTC 2023

With multiple active VMs, under memory pressure, it is possible that
ttm_bo_validate() runs into -EDEADLK in ttm_mem_evict_wait_busy() and
returns -ENOMEM.

Until TTM properly handles locking in such scenarios, the best thing the
driver can do is unwind the locks and retry.

Update the preempt rebind worker to retry validating BOs, with a timeout,
upon -ENOMEM and, while at it, apply the same timeout to retries upon
-EAGAIN as well.

Signed-off-by: Niranjana Vishwanathapura <niranjana.vishwanathapura at intel.com>
---
 drivers/gpu/drm/xe/xe_vm.c | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index 91576cec000d..d92c9f659ab3 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -10,6 +10,7 @@
 #include <drm/ttm/ttm_execbuf_util.h>
 #include <drm/ttm/ttm_tt.h>
 #include <drm/xe_drm.h>
+#include <linux/delay.h>
 #include <linux/kthread.h>
 #include <linux/mm.h>
 #include <linux/swap.h>
@@ -508,6 +509,8 @@ void xe_vm_unlock_dma_resv(struct xe_vm *vm,
 		kvfree(tv);
 }
 
+#define XE_VM_REBIND_RETRY_TIMEOUT_MS 1000
+
 static void preempt_rebind_work_func(struct work_struct *w)
 {
 	struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
@@ -519,6 +522,7 @@ static void preempt_rebind_work_func(struct work_struct *w)
 	struct dma_fence *rebind_fence;
 	unsigned int fence_count = 0;
 	LIST_HEAD(preempt_fences);
+	ktime_t end = 0;
 	int err;
 	long wait;
 	int __maybe_unused tries = 0;
@@ -633,9 +637,24 @@ static void preempt_rebind_work_func(struct work_struct *w)
 out_unlock:
 	xe_vm_unlock_dma_resv(vm, tv_onstack, tv, &ww, &objs);
 out_unlock_outer:
-	if (err == -EAGAIN) {
-		trace_xe_vm_rebind_worker_retry(vm);
-		goto retry;
+	/*
+	 * With multiple active VMs, under memory pressure, it is possible that
+	 * ttm_bo_validate() runs into -EDEADLK and in such a case returns
+	 * -ENOMEM. Until ttm properly handles locking in such scenarios, the
+	 * best thing the driver can do is retry with a timeout. Apply the
+	 * timeout for retries upon -EAGAIN as well instead of retrying
+	 * indefinitely. Killing the VM or putting it in an error state after a
+	 * timeout or in other error scenarios is still TBD.
+	 */
+	if ((err == -EAGAIN) || (err == -ENOMEM)) {
+		ktime_t cur = ktime_get();
+
+		end = end ? : ktime_add_ms(cur, XE_VM_REBIND_RETRY_TIMEOUT_MS);
+		if (ktime_before(cur, end)) {
+			msleep(20);
+			trace_xe_vm_rebind_worker_retry(vm);
+			goto retry;
+		}
 	}
 	up_write(&vm->lock);
 
-- 
2.21.0.rc0.32.g243a4c7e27