[PATCH] drm/i915/ttm: stop stalling on paging structures

Matthew Auld <matthew.auld@intel.com>
Fri Dec 2 15:33:17 UTC 2022


If we need to evict lmem objects, we can already pipeline unbinding the
vma(s) and moving the pages, so we don't block. However, soon after
evicting an object, userspace will always want to bind some other
object, and on dgfx the page-tables for that bind must themselves be
allocated in lmem. Since we always sync against allocating page-tables
(and any other kernel internal object), we end up stalling anyway. Try
to pipeline allocating the page-tables instead, when there is a moving
fence to order against, such as from an in-progress async eviction.

Signed-off-by: Matthew Auld <matthew.auld@intel.com>
---
 .../gpu/drm/i915/gem/i915_gem_object_types.h  | 10 +++---
 drivers/gpu/drm/i915/gem/i915_gem_pages.c     | 10 +++---
 drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c  | 11 ++++++-
 drivers/gpu/drm/i915/gt/gen8_ppgtt.c          | 10 ++++++
 drivers/gpu/drm/i915/gt/intel_gtt.c           | 31 +++++++++++++++----
 drivers/gpu/drm/i915/gt/intel_gtt.h           |  3 ++
 drivers/gpu/drm/i915/i915_vma.c               | 27 +++++++++++++++-
 7 files changed, 86 insertions(+), 16 deletions(-)
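
As a sketch of the contract this introduces (illustrative only, not
part of the patch; obj, vm, work, flags, vm_moving and the err_free
label are placeholder names): an object allocated with
I915_BO_ALLOC_NO_SYNC no longer implicitly syncs against its moving
fence, so each consumer must either wait explicitly before CPU access,
or chain the fence into its async work when binding:

	/*
	 * Allocation: skip the implicit wait on the moving fence. The
	 * new object may still be the target of an in-flight async
	 * eviction or clear.
	 */
	obj = __i915_gem_object_create_lmem_with_ps(vm->i915, sz, sz,
						    flags |
						    I915_BO_ALLOC_NO_SYNC);

	/* CPU access (like fill_px()) still needs an explicit wait: */
	err = i915_gem_object_wait_moving_fence(obj, false);
	if (err)
		goto err_free;

	/*
	 * GPU binding can instead pick up the fence and chain it into
	 * the async bind worker, keeping everything pipelined:
	 */
	err = i915_vm_get_moving_fence(vm, &vm_moving);
	if (err)
		goto err_free;
	dma_fence_work_chain(&work->base, vm_moving);
	dma_fence_put(vm_moving);

The explicit waits added in gen8_ppgtt.c below cover the CPU writes
done via fill_px()/fill_page_dma(), while i915_vma_pin_ww() takes the
chaining route.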

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
index d0d6772e6f36..252cdd052c0d 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
@@ -327,16 +327,18 @@ struct drm_i915_gem_object {
  * dealing with userspace objects the CPU fault handler is free to ignore this.
  */
 #define I915_BO_ALLOC_GPU_ONLY	  BIT(6)
+#define I915_BO_ALLOC_NO_SYNC	  BIT(7)
 #define I915_BO_ALLOC_FLAGS (I915_BO_ALLOC_CONTIGUOUS | \
 			     I915_BO_ALLOC_VOLATILE | \
 			     I915_BO_ALLOC_CPU_CLEAR | \
 			     I915_BO_ALLOC_USER | \
 			     I915_BO_ALLOC_PM_VOLATILE | \
 			     I915_BO_ALLOC_PM_EARLY | \
-			     I915_BO_ALLOC_GPU_ONLY)
-#define I915_BO_READONLY          BIT(7)
-#define I915_TILING_QUIRK_BIT     8 /* unknown swizzling; do not release! */
-#define I915_BO_PROTECTED         BIT(9)
+			     I915_BO_ALLOC_GPU_ONLY | \
+			     I915_BO_ALLOC_NO_SYNC)
+#define I915_BO_READONLY          BIT(8)
+#define I915_TILING_QUIRK_BIT     9 /* unknown swizzling; do not release! */
+#define I915_BO_PROTECTED         BIT(10)
 	/**
 	 * @mem_flags - Mutable placement-related flags
 	 *
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pages.c b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
index 05a27723ebb8..fe2a6a7e0496 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_pages.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
@@ -428,10 +428,12 @@ void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj,
 	}
 
 	if (!ptr) {
-		err = i915_gem_object_wait_moving_fence(obj, true);
-		if (err) {
-			ptr = ERR_PTR(err);
-			goto err_unpin;
+		if (!(obj->flags & I915_BO_ALLOC_NO_SYNC)) {
+			err = i915_gem_object_wait_moving_fence(obj, true);
+			if (err) {
+				ptr = ERR_PTR(err);
+				goto err_unpin;
+			}
 		}
 
 		if (GEM_WARN_ON(type == I915_MAP_WC && !pat_enabled()))
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
index f59f812dc6d2..28f73e2bbe61 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
@@ -620,7 +620,16 @@ int i915_ttm_move(struct ttm_buffer_object *bo, bool evict,
 		}
 		dma_fence_put(migration_fence);
 	} else {
-		ttm_bo_move_sync_cleanup(bo, dst_mem);
+		/*
+		 * If there is no actual move then we only need to sync against
+		 * fences that were attached to the resource manager. In the
+		 * case of I915_BO_ALLOC_NO_SYNC the caller can handle this, and
+		 * correctly pipeline the work, like for page-tables.
+		 */
+		if (obj->flags & I915_BO_ALLOC_NO_SYNC)
+			ttm_bo_move_null(bo, dst_mem);
+		else
+			ttm_bo_move_sync_cleanup(bo, dst_mem);
 	}
 
 	i915_ttm_adjust_domains_after_move(obj);
diff --git a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
index 4daaa6f55668..885df010dd61 100644
--- a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
+++ b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
@@ -876,6 +876,12 @@ static int gen8_preallocate_top_level_pdp(struct i915_ppgtt *ppgtt)
 			return err;
 		}
 
+		err = i915_gem_object_wait_moving_fence(pde->pt.base, false);
+		if (err) {
+			free_pd(vm, pde);
+			return err;
+		}
+
 		fill_px(pde, vm->scratch[1]->encode);
 		set_pd_entry(pd, idx, pde);
 		atomic_inc(px_used(pde)); /* keep pinned */
@@ -909,6 +915,10 @@ gen8_alloc_top_pd(struct i915_address_space *vm)
 	if (err)
 		goto err_pd;
 
+	err = i915_gem_object_wait_moving_fence(pd->pt.base, false);
+	if (err)
+		goto err_pd;
+
 	fill_page_dma(px_base(pd), vm->scratch[vm->top]->encode, count);
 	atomic_inc(px_used(pd)); /* mark as pinned */
 	return pd;
diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.c b/drivers/gpu/drm/i915/gt/intel_gtt.c
index e37164a60d37..db44fa725c2d 100644
--- a/drivers/gpu/drm/i915/gt/intel_gtt.c
+++ b/drivers/gpu/drm/i915/gt/intel_gtt.c
@@ -48,7 +48,8 @@ struct drm_i915_gem_object *alloc_pt_lmem(struct i915_address_space *vm, int sz)
 	 * also has the same alignment.
 	 */
 	obj = __i915_gem_object_create_lmem_with_ps(vm->i915, sz, sz,
-						    vm->lmem_pt_obj_flags);
+						    vm->lmem_pt_obj_flags |
+						    I915_BO_ALLOC_NO_SYNC);
 	/*
 	 * Ensure all paging structures for this vm share the same dma-resv
 	 * object underneath, with the idea that one object_lock() will lock
@@ -161,20 +162,38 @@ static void __i915_vm_close(struct i915_address_space *vm)
 	mutex_unlock(&vm->mutex);
 }
 
-/* lock the vm into the current ww, if we lock one, we lock all */
-int i915_vm_lock_objects(struct i915_address_space *vm,
-			 struct i915_gem_ww_ctx *ww)
+struct drm_i915_gem_object *i915_vm_get_shared_object(struct i915_address_space *vm)
 {
 	if (vm->scratch[0]->base.resv == &vm->_resv) {
-		return i915_gem_object_lock(vm->scratch[0], ww);
+		return vm->scratch[0];
 	} else {
 		struct i915_ppgtt *ppgtt = i915_vm_to_ppgtt(vm);
 
 		/* We borrowed the scratch page from ggtt, take the top level object */
-		return i915_gem_object_lock(ppgtt->pd->pt.base, ww);
+		return ppgtt->pd->pt.base;
 	}
 }
+
+/* lock the vm into the current ww, if we lock one, we lock all */
+int i915_vm_lock_objects(struct i915_address_space *vm,
+			 struct i915_gem_ww_ctx *ww)
+{
+	return i915_gem_object_lock(i915_vm_get_shared_object(vm), ww);
+}
+
+int i915_vm_get_moving_fence(struct i915_address_space *vm,
+			     struct dma_fence **moving)
+{
+	return i915_gem_object_get_moving_fence(i915_vm_get_shared_object(vm),
+						moving);
+}
+
+int i915_vm_wait_moving_fence(struct i915_address_space *vm, bool intr)
+{
+	return i915_gem_object_wait_moving_fence(i915_vm_get_shared_object(vm),
+						 intr);
+}
+
 void i915_address_space_fini(struct i915_address_space *vm)
 {
 	drm_mm_takedown(&vm->mm);
diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.h b/drivers/gpu/drm/i915/gt/intel_gtt.h
index d1900fec6cd1..8128df3fdc72 100644
--- a/drivers/gpu/drm/i915/gt/intel_gtt.h
+++ b/drivers/gpu/drm/i915/gt/intel_gtt.h
@@ -409,6 +409,9 @@ bool intel_vm_no_concurrent_access_wa(struct drm_i915_private *i915);
 
 int __must_check
 i915_vm_lock_objects(struct i915_address_space *vm, struct i915_gem_ww_ctx *ww);
+int i915_vm_get_moving_fence(struct i915_address_space *vm,
+			     struct dma_fence **moving);
+int i915_vm_wait_moving_fence(struct i915_address_space *vm, bool intr);
 
 static inline bool
 i915_vm_is_4lvl(const struct i915_address_space *vm)
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index 726705b10637..b3e88679fa0e 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -536,6 +536,10 @@ int i915_vma_bind(struct i915_vma *vma,
 
 			return ret;
 		}
+
+		ret = i915_vm_wait_moving_fence(vma->vm, true);
+		if (ret)
+			return ret;
 		vma->ops->bind_vma(vma->vm, NULL, vma->resource, cache_level,
 				   bind_flags);
 	}
@@ -1360,7 +1364,7 @@ int i915_vma_pin_ww(struct i915_vma *vma, struct i915_gem_ww_ctx *ww,
 		    u64 size, u64 alignment, u64 flags)
 {
 	struct i915_vma_work *work = NULL;
-	struct dma_fence *moving = NULL;
+	struct dma_fence *moving = NULL, *vm_moving = NULL;
 	struct i915_vma_resource *vma_res = NULL;
 	intel_wakeref_t wakeref = 0;
 	unsigned int bound;
@@ -1416,6 +1420,24 @@ int i915_vma_pin_ww(struct i915_vma *vma, struct i915_gem_ww_ctx *ww,
 			err = i915_vm_map_pt_stash(vma->vm, &work->stash);
 			if (err)
 				goto err_fence;
+
+			/*
+			 * We share the same dma-resv object across all paging
+			 * structures of a given ppGTT. We have already locked
+			 * it, and allocated any required stash objects (which
+			 * would have added any potential kernel fences that
+			 * were attached to the resource manager at the time of
+			 * allocation, like with async eviction). The same
+			 * applies to paging structures that are already present
+			 * in the VM. Both will be scooped up here.
+			 */
+			if (vma->obj->flags & I915_BO_ALLOC_NO_SYNC) {
+				err = i915_vm_get_moving_fence(vma->vm, &vm_moving);
+				if (err)
+					goto err_rpm;
+
+				dma_fence_work_chain(&work->base, vm_moving);
+			}
 		}
 	}
 
@@ -1525,6 +1547,9 @@ int i915_vma_pin_ww(struct i915_vma *vma, struct i915_gem_ww_ctx *ww,
 	if (moving)
 		dma_fence_put(moving);
 
+	if (vm_moving)
+		dma_fence_put(vm_moving);
+
 	i915_vma_put_pages(vma);
 	return err;
 }
-- 
2.38.1
