[Intel-gfx] [PATCH] drm/i915: Fix random -ENOSPC eviction errors due to locked vma objects

Sviatoslav Peleshko sviatoslav.peleshko at globallogic.com
Wed Aug 17 06:55:41 UTC 2022


The i915_gem_object_trylock() call in grab_vma() makes it return false
when vma->obj is already locked. In that case the vma is skipped during
eviction, and we may eventually be forced to return -ENOSPC even though
the vma could have been evicted had we waited briefly for the lock.

To fix this, replace i915_gem_object_trylock() with
i915_gem_object_lock(). Because taking the lock can now run into a
potential deadlock, bubble up the error code (-EDEADLK) so that it is
handled correctly by the WW mechanism.

This fixes the issue https://gitlab.freedesktop.org/drm/intel/-/issues/6564

Fixes: 7e00897be8bf ("drm/i915: Add object locking to i915_gem_evict_for_node and i915_gem_evict_something, v2.")
Signed-off-by: Sviatoslav Peleshko <sviatoslav.peleshko at globallogic.com>
---
 drivers/gpu/drm/i915/i915_gem_evict.c | 69 ++++++++++++++++++---------
 1 file changed, 46 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c b/drivers/gpu/drm/i915/i915_gem_evict.c
index f025ee4fa526..9d43f213f68f 100644
--- a/drivers/gpu/drm/i915/i915_gem_evict.c
+++ b/drivers/gpu/drm/i915/i915_gem_evict.c
@@ -55,49 +55,58 @@ static int ggtt_flush(struct intel_gt *gt)
 	return intel_gt_wait_for_idle(gt, MAX_SCHEDULE_TIMEOUT);
 }
 
-static bool grab_vma(struct i915_vma *vma, struct i915_gem_ww_ctx *ww)
+static int grab_vma(struct i915_vma *vma, struct i915_gem_ww_ctx *ww)
 {
+	int ret = 0;
+
 	/*
 	 * We add the extra refcount so the object doesn't drop to zero until
 	 * after ungrab_vma(), this way trylock is always paired with unlock.
 	 */
 	if (i915_gem_object_get_rcu(vma->obj)) {
-		if (!i915_gem_object_trylock(vma->obj, ww)) {
+		ret = i915_gem_object_lock(vma->obj, ww);
+		if (ret)
 			i915_gem_object_put(vma->obj);
-			return false;
-		}
 	} else {
 		/* Dead objects don't need pins */
 		atomic_and(~I915_VMA_PIN_MASK, &vma->flags);
 	}
 
-	return true;
+	return ret;
 }
 
-static void ungrab_vma(struct i915_vma *vma)
+static void ungrab_vma(struct i915_vma *vma, struct i915_gem_ww_ctx *ww)
 {
 	if (dying_vma(vma))
 		return;
 
-	i915_gem_object_unlock(vma->obj);
+	if (!ww)
+		i915_gem_object_unlock(vma->obj);
+
 	i915_gem_object_put(vma->obj);
 }
 
-static bool
+static int
 mark_free(struct drm_mm_scan *scan,
 	  struct i915_gem_ww_ctx *ww,
 	  struct i915_vma *vma,
 	  unsigned int flags,
 	  struct list_head *unwind)
 {
+	int err;
+
 	if (i915_vma_is_pinned(vma))
-		return false;
+		return -ENOSPC;
 
-	if (!grab_vma(vma, ww))
-		return false;
+	err = grab_vma(vma, ww);
+	if (err)
+		return err;
 
 	list_add(&vma->evict_link, unwind);
-	return drm_mm_scan_add_block(scan, &vma->node);
+	if (!drm_mm_scan_add_block(scan, &vma->node))
+		return -ENOSPC;
+
+	return 0;
 }
 
 static bool defer_evict(struct i915_vma *vma)
@@ -150,6 +159,7 @@ i915_gem_evict_something(struct i915_address_space *vm,
 	enum drm_mm_insert_mode mode;
 	struct i915_vma *active;
 	int ret;
+	int err = 0;
 
 	lockdep_assert_held(&vm->mutex);
 	trace_i915_gem_evict(vm, min_size, alignment, flags);
@@ -210,17 +220,23 @@ i915_gem_evict_something(struct i915_address_space *vm,
 			continue;
 		}
 
-		if (mark_free(&scan, ww, vma, flags, &eviction_list))
+		err = mark_free(&scan, ww, vma, flags, &eviction_list);
+		if (!err)
 			goto found;
+		if (err == -EDEADLK)
+			break;
 	}
 
 	/* Nothing found, clean up and bail out! */
 	list_for_each_entry_safe(vma, next, &eviction_list, evict_link) {
 		ret = drm_mm_scan_remove_block(&scan, &vma->node);
 		BUG_ON(ret);
-		ungrab_vma(vma);
+		ungrab_vma(vma, ww);
 	}
 
+	if (err == -EDEADLK)
+		return err;
+
 	/*
 	 * Can we unpin some objects such as idle hw contents,
 	 * or pending flips? But since only the GGTT has global entries
@@ -267,7 +283,7 @@ i915_gem_evict_something(struct i915_address_space *vm,
 			__i915_vma_pin(vma);
 		} else {
 			list_del(&vma->evict_link);
-			ungrab_vma(vma);
+			ungrab_vma(vma, ww);
 		}
 	}
 
@@ -277,17 +293,21 @@ i915_gem_evict_something(struct i915_address_space *vm,
 		__i915_vma_unpin(vma);
 		if (ret == 0)
 			ret = __i915_vma_unbind(vma);
-		ungrab_vma(vma);
+		ungrab_vma(vma, ww);
 	}
 
 	while (ret == 0 && (node = drm_mm_scan_color_evict(&scan))) {
 		vma = container_of(node, struct i915_vma, node);
 
 		/* If we find any non-objects (!vma), we cannot evict them */
-		if (vma->node.color != I915_COLOR_UNEVICTABLE &&
-		    grab_vma(vma, ww)) {
-			ret = __i915_vma_unbind(vma);
-			ungrab_vma(vma);
+		if (vma->node.color != I915_COLOR_UNEVICTABLE) {
+			ret = grab_vma(vma, ww);
+			if (!ret) {
+				ret = __i915_vma_unbind(vma);
+				ungrab_vma(vma, ww);
+			} else if (ret != -EDEADLK) {
+				ret = -ENOSPC;
+			}
 		} else {
 			ret = -ENOSPC;
 		}
@@ -382,8 +402,11 @@ int i915_gem_evict_for_node(struct i915_address_space *vm,
 			break;
 		}
 
-		if (!grab_vma(vma, ww)) {
-			ret = -ENOSPC;
+		ret = grab_vma(vma, ww);
+		if (ret) {
+			if (ret != -EDEADLK)
+				ret = -ENOSPC;
+
 			break;
 		}
 
@@ -405,7 +428,7 @@ int i915_gem_evict_for_node(struct i915_address_space *vm,
 		if (ret == 0)
 			ret = __i915_vma_unbind(vma);
 
-		ungrab_vma(vma);
+		ungrab_vma(vma, ww);
 	}
 
 	return ret;
-- 
2.37.1



More information about the Intel-gfx mailing list