[PATCH] WIP: drm/i915/gem: Asynchronous GTT unbinding
Nitin Gote
nitin.r.gote at intel.com
Mon Sep 30 08:08:55 UTC 2024
It is reasonably common for userspace (even modern drivers like iris) to
reuse an active address for a new buffer. This would cause the
application to stall under its mutex (originally struct_mutex) until the
old batches were idle and it could synchronously remove the stale PTE.
However, we can instead queue up a job that waits on the signals of the
old nodes and, upon those signals, removes the old nodes, replacing
them with the new ones for the batch. This is still CPU driven, but in
theory we could do the GTT patching from the GPU. The job itself has a
completion signal, allowing the execbuf to wait upon the rebinding, and
also allowing other observers to coordinate with the common VM activity.
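As a rough sketch of the shape of such a job (hypothetical names only;
this is not the real i915 fence-work machinery):

	/* Hypothetical sketch; names do not match the i915 code. */
	#include <linux/workqueue.h>
	#include <linux/dma-fence.h>

	struct rebind_work {
		struct work_struct base;
		struct dma_fence *done;		/* completion signal */
		struct i915_vma *stale, *fresh;
	};

	static void rebind_worker(struct work_struct *wrk)
	{
		struct rebind_work *w = container_of(wrk, typeof(*w), base);

		/* Runs only once the old batches have signalled idle. */
		remove_stale_node(w->stale);	/* hypothetical helper */
		insert_new_node(w->fresh);	/* hypothetical helper */

		dma_fence_signal(w->done);	/* unblocks execbuf + observers */
	}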
Letting userspace queue up more work lets it do more without blocking
other clients. In turn, we take care not to let it queue up too much
concurrent work, creating a small number of queues for each context to
limit the number of concurrent tasks.
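For illustration, such throttling could amount to no more than spreading
the jobs over a small fixed array of queues (the array and its size are
assumptions here, not the patch's actual layout):

	/*
	 * Hypothetical sketch: a handful of queues bounds concurrency,
	 * assuming each queue is ordered (alloc_ordered_workqueue), so
	 * jobs sharing a queue serialise behind one another.
	 */
	#include <linux/workqueue.h>

	#define NUM_VM_QUEUES 4

	static struct workqueue_struct *
	pick_vm_queue(struct workqueue_struct *queues[NUM_VM_QUEUES],
		      unsigned long token)
	{
		return queues[token % NUM_VM_QUEUES];
	}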
The implementation relies on scheduling only one unbind operation per
vma, as we use the unbound vma->node location to track the stale PTE.
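That invariant implies a guard along these lines before queuing another
unbind (the flag bit is a hypothetical stand-in, not a field of the
real struct i915_vma):

	/* Hypothetical sketch: at most one unbind may own vma->node. */
	#include <linux/bitops.h>

	static bool claim_unbind(unsigned long *vma_flags)
	{
		/* Bit 0 stands in for an "unbind queued" flag. */
		return !test_and_set_bit(0, vma_flags);
	}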
Closes: https://gitlab.freedesktop.org/drm/intel/issues/1402
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
Signed-off-by: Nitin Gote <nitin.r.gote at intel.com>
---
.../gpu/drm/i915/gem/i915_gem_execbuffer.c | 81 +++++++++++++++++++
1 file changed, 81 insertions(+)
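As a reviewer aid (not part of the patch): the escalation loop in
best_hole() below can be read in isolation. This toy analogue assumes
an ascending array of hole sizes in place of the drm_mm rbtree:

	/* Toy analogue of best_hole()'s escalation; illustration only. */
	#include <stddef.h>

	/* Smallest hole >= size in an ascending array, or -1 if none. */
	static int toy_best_hole(const unsigned long long *holes, size_t n,
				 unsigned long long size)
	{
		for (size_t i = 0; i < n; i++)
			if (holes[i] >= size)
				return (int)i;
		return -1;
	}

	/* Retry with ever bigger holes until alignment slack also fits. */
	static int toy_reserve(const unsigned long long *holes, size_t n,
			       unsigned long long need,
			       unsigned long long align)
	{
		unsigned long long size = need;

		for (;;) {
			int i = toy_best_hole(holes, n, size);

			if (i < 0)
				return -1;		/* -ENOSPC */
			if (holes[i] >= need + align - 1)
				return i;		/* worst-case slack fits */
			size = holes[i] + align;	/* ask for a bigger hole */
		}
	}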
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index f151640c1d13..98016ae58f93 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -652,12 +652,72 @@ static int use_cpu_reloc(const struct reloc_cache *cache,
 		!i915_gem_object_has_cache_level(obj, I915_CACHE_NONE));
 }
 
+static struct drm_mm_node *__best_hole(struct drm_mm *mm, u64 size)
+{
+	struct rb_node *rb = mm->holes_size.rb_root.rb_node;
+	struct drm_mm_node *best = NULL;
+
+	/* Walk the size-sorted hole tree for the smallest hole that fits */
+	while (rb) {
+		struct drm_mm_node *node =
+			rb_entry(rb, struct drm_mm_node, rb_hole_size);
+
+		if (size <= node->hole_size) {
+			best = node;
+			rb = rb->rb_right;
+		} else {
+			rb = rb->rb_left;
+		}
+	}
+
+	return best;
+}
+
+static int best_hole(struct drm_mm *mm, struct drm_mm_node *node,
+		     u64 start, u64 end, u64 align)
+{
+	struct drm_mm_node *hole;
+	u64 size = node->size;
+
+	do {
+		hole = __best_hole(mm, size);
+		if (!hole)
+			return -ENOSPC;
+
+		node->start = round_up(max(start, drm_mm_hole_node_start(hole)),
+				       align);
+		if (min(drm_mm_hole_node_end(hole), end) >=
+		    node->start + node->size)
+			return drm_mm_reserve_node(mm, node);
+
+		/*
+		 * Too expensive to search for every single hole every time,
+		 * so just look for the next bigger hole, introducing enough
+		 * space for alignments. Finding the smallest hole with ideal
+		 * alignment scales very poorly, so we choose to waste space
+		 * if an alignment is forced. On the other hand, simply
+		 * randomly selecting an offset in 48b space will cause us
+		 * to use the majority of that space and exhaust all memory
+		 * in storing the page directories. Compromise is required.
+		 */
+		size = hole->hole_size + align;
+	} while (1);
+}
+
+/* drm_mm treats an alignment of 0 as "no constraint" */
+static u64 align0(u64 align)
+{
+	return align <= I915_GTT_MIN_ALIGNMENT ? 0 : align;
+}
+
 static int eb_reserve_vma(struct i915_execbuffer *eb,
 			  struct eb_vma *ev,
 			  u64 pin_flags)
 {
 	struct drm_i915_gem_exec_object2 *entry = ev->exec;
+	const unsigned int exec_flags = ev->flags;
 	struct i915_vma *vma = ev->vma;
+	struct i915_address_space *vm = vma->vm;
+	u64 start = 0, end = vm->total;
+	u64 align = entry->alignment ?: I915_GTT_MIN_ALIGNMENT;
 	int err;
 
 	if (drm_mm_node_allocated(&vma->node) &&
@@ -678,6 +738,26 @@ static int eb_reserve_vma(struct i915_execbuffer *eb,
 		eb->args->flags |= __EXEC_HAS_RELOC;
 	}
 
+	if (exec_flags & __EXEC_OBJECT_NEEDS_BIAS)
+		start = BATCH_OFFSET_BIAS;
+
+	/* Try the best-fit hole first */
+	if (!best_hole(&vm->mm, &vma->node, start, end, align))
+		goto pin;
+
+	/* Otherwise search all free space [degrades to O(N^2)] */
+	if (drm_mm_insert_node_in_range(&vm->mm, &vma->node,
+					vma->node.size,
+					align0(align),
+					vma->node.color,
+					start, end,
+					DRM_MM_INSERT_BEST) == 0)
+		goto pin;
+
+	GEM_BUG_ON(eb_vma_misplaced(entry, vma, ev->flags));
+
+	return 0;
+pin:
 	if (unlikely(ev->flags & EXEC_OBJECT_NEEDS_FENCE)) {
 		err = i915_vma_pin_fence(vma);
 		if (unlikely(err))
@@ -688,6 +768,7 @@ static int eb_reserve_vma(struct i915_execbuffer *eb,
 	}
 
 	ev->flags |= __EXEC_OBJECT_HAS_PIN;
+
 	GEM_BUG_ON(eb_vma_misplaced(entry, vma, ev->flags));
 
 	return 0;
--
2.25.1