[PATCH 9/9] drm/amdgpu: Reduce lock contention when evicting from visible VRAM

Fri Jun 23 17:39:40 UTC 2017

Performing expensive BO moves asynchronously reduces the direct burden on
the CS path, but it can still indirectly cause occasional stalls: the
worker may keep the BO reserved for a long time during evictions, and if
the CS path needs that BO in the meantime, it has to wait.

Instead of reserving the BO and keeping it reserved while we wait for
ttm_bo_validate() to move it and perform any evictions, we can afford to be
more surgical and re-implement the ttm_bo_validate() path in the worker
function with some changes to make it friendlier to other threads.

Specifically, if evictions are needed when moving a BO to visible VRAM,
unreserve the BO while performing them, so as not to block other tasks for
too long. Also, sleep for 20 ms after each eviction so that the worker
doesn't hog lru_lock.

Signed-off-by: John Brooks <john at fastquake.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 88 +++++++++++++++++++++++++++++-
 drivers/gpu/drm/ttm/ttm_bo.c               | 34 +++++++-----
 include/drm/ttm/ttm_bo_driver.h            | 13 +++++
 include/drm/ttm/ttm_placement.h            |  6 ++
 4 files changed, 125 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index a69441d..854e037 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -337,10 +337,16 @@ static void amdgpu_bo_move_vis_vram_work_func(struct work_struct *work)
 	struct amdgpu_bo *bo = container_of(work, struct amdgpu_bo,
 					    move_vis_vram_work);
 	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
+	struct ttm_placement placement;
+	struct ttm_mem_reg mem;
+	struct ttm_mem_type_manager *man = &bo->tbo.bdev->man[TTM_PL_VRAM];
 	u64 initial_bytes_moved, bytes_moved;
 	uint32_t old_mem;
+	uint32_t new_flags;
 	int r;
 
+	mem.mm_node = NULL;
+
 	spin_lock(&adev->mm_stats.lock);
 	if (adev->mm_stats.accum_us_vis <= 0 ||
 	    adev->mm_stats.accum_us <= 0) {
@@ -359,17 +365,97 @@ static void amdgpu_bo_move_vis_vram_work_func(struct work_struct *work)
 		goto out;
 
 	amdgpu_ttm_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_VRAM);
+	placement = bo->placement;
+
+	if (ttm_bo_mem_compat(&placement, &bo->tbo.mem, &new_flags))
+		goto out;
+
+	mem.num_pages = bo->tbo.num_pages;
+	mem.size = mem.num_pages << PAGE_SHIFT;
+	mem.page_alignment = bo->tbo.mem.page_alignment;
+	mem.bus.io_reserved_vm = false;
+	mem.bus.io_reserved_count = 0;
+
+	placement.num_busy_placement = 0;
+
 	old_mem = bo->tbo.mem.mem_type;
 	initial_bytes_moved = atomic64_read(&adev->num_bytes_moved);
-	ttm_bo_validate(&bo->tbo, &bo->placement, false, false);
+
+	r = ttm_bo_mem_space(&bo->tbo, &placement, &mem, false, false);
+	if (r == -ENOMEM) {
+		/* Unreserve the BO while we make space for it */
+		struct ttm_bo_device *bdev = bo->tbo.bdev;
+
+		amdgpu_bo_unreserve(bo);
+		do {
+			r = (*man->func->get_node)(man, NULL,
+						   &placement.placement[0],
+						   &mem);
+			if (unlikely(r != 0))
+				return;
+
+			if (mem.mm_node)
+				break;
+
+			r = ttm_mem_evict_first(bdev, TTM_PL_VRAM,
+						&placement.placement[0], false,
+						false);
+			if (unlikely(r != 0))
+				return;
+
+			/* Sleep to give other threads the opportunity to grab
+			 * lru_lock
+			 */
+			msleep(20);
+		} while (1);
+
+		if (!kref_read(&bo->tbo.kref)) {
+			/* The BO was deleted since we last held it. Abort. */
+			if (mem.mm_node)
+				(*man->func->put_node)(man, &mem);
+			return;
+		}
+
+		r = amdgpu_bo_reserve(bo, true);
+		if (r != 0) {
+			if (mem.mm_node)
+				(*man->func->put_node)(man, &mem);
+			return;
+		}
+
+		mem.mem_type = TTM_PL_VRAM;
+
+		r = ttm_bo_add_move_fence(&bo->tbo, man, &mem);
+		if (unlikely(r != 0))
+			goto out;
+
+		mem.placement = TTM_PL_FLAG_VRAM;
+		mem.placement |= (placement.placement[0].flags &
+				  man->available_caching);
+		mem.placement |= ttm_bo_select_caching(man,
+						       bo->tbo.mem.placement,
+						       mem.placement);
+		ttm_flag_masked(&mem.placement, placement.placement[0].flags,
+				~TTM_PL_MASK_MEMTYPE);
+	} else if (unlikely(r != 0)) {
+		goto out;
+	}
+
+	r = ttm_bo_handle_move_mem(&bo->tbo, &mem, false, false, false);
+
 	bytes_moved = atomic64_read(&adev->num_bytes_moved) -
 				    initial_bytes_moved;
 	amdgpu_cs_report_moved_bytes(adev, bytes_moved, bytes_moved);
 
+	if (unlikely(r != 0))
+		goto out;
+
 	if (bo->tbo.mem.mem_type != old_mem)
 		bo->last_cs_move_jiffies = jiffies;
 
 out:
+	if (r && mem.mm_node)
+		ttm_bo_mem_put(&bo->tbo, &mem);
 	amdgpu_bo_unreserve(bo);
 }
 
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index a6d7fcb..f783aa3 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -268,10 +268,10 @@ static int ttm_bo_add_ttm(struct ttm_buffer_object *bo, bool zero_alloc)
 	return ret;
 }
 
-static int ttm_bo_handle_move_mem(struct ttm_buffer_object *bo,
-				  struct ttm_mem_reg *mem,
-				  bool evict, bool interruptible,
-				  bool no_wait_gpu)
+int ttm_bo_handle_move_mem(struct ttm_buffer_object *bo,
+			   struct ttm_mem_reg *mem,
+			   bool evict, bool interruptible,
+			   bool no_wait_gpu)
 {
 	struct ttm_bo_device *bdev = bo->bdev;
 	bool old_is_pci = ttm_mem_reg_is_pci(bdev, &bo->mem);
@@ -373,6 +373,7 @@ static int ttm_bo_handle_move_mem(struct ttm_buffer_object *bo,
 
 	return ret;
 }
+EXPORT_SYMBOL(ttm_bo_handle_move_mem);
 
 /**
  * Call bo::reserved.
@@ -695,11 +696,11 @@ bool ttm_bo_eviction_valuable(struct ttm_buffer_object *bo,
 }
 EXPORT_SYMBOL(ttm_bo_eviction_valuable);
 
-static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
-				uint32_t mem_type,
-				const struct ttm_place *place,
-				bool interruptible,
-				bool no_wait_gpu)
+int ttm_mem_evict_first(struct ttm_bo_device *bdev,
+			uint32_t mem_type,
+			const struct ttm_place *place,
+			bool interruptible,
+			bool no_wait_gpu)
 {
 	struct ttm_bo_global *glob = bdev->glob;
 	struct ttm_mem_type_manager *man = &bdev->man[mem_type];
@@ -753,6 +754,7 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
 	kref_put(&bo->list_kref, ttm_bo_release_list);
 	return ret;
 }
+EXPORT_SYMBOL(ttm_mem_evict_first);
 
 void ttm_bo_mem_put(struct ttm_buffer_object *bo, struct ttm_mem_reg *mem)
 {
@@ -766,9 +768,9 @@ EXPORT_SYMBOL(ttm_bo_mem_put);
 /**
  * Add the last move fence to the BO and reserve a new shared slot.
  */
-static int ttm_bo_add_move_fence(struct ttm_buffer_object *bo,
-				 struct ttm_mem_type_manager *man,
-				 struct ttm_mem_reg *mem)
+int ttm_bo_add_move_fence(struct ttm_buffer_object *bo,
+			  struct ttm_mem_type_manager *man,
+			  struct ttm_mem_reg *mem)
 {
 	struct dma_fence *fence;
 	int ret;
@@ -790,6 +792,7 @@ static int ttm_bo_add_move_fence(struct ttm_buffer_object *bo,
 
 	return 0;
 }
+EXPORT_SYMBOL(ttm_bo_add_move_fence);
 
 /**
  * Repeatedly evict memory from the LRU for @mem_type until we create enough
@@ -821,9 +824,9 @@ static int ttm_bo_mem_force_space(struct ttm_buffer_object *bo,
 	return ttm_bo_add_move_fence(bo, man, mem);
 }
 
-static uint32_t ttm_bo_select_caching(struct ttm_mem_type_manager *man,
-				      uint32_t cur_placement,
-				      uint32_t proposed_placement)
+uint32_t ttm_bo_select_caching(struct ttm_mem_type_manager *man,
+			       uint32_t cur_placement,
+			       uint32_t proposed_placement)
 {
 	uint32_t caching = proposed_placement & TTM_PL_MASK_CACHING;
 	uint32_t result = proposed_placement & ~TTM_PL_MASK_CACHING;
@@ -845,6 +848,7 @@ static uint32_t ttm_bo_select_caching(struct ttm_mem_type_manager *man,
 
 	return result;
 }
+EXPORT_SYMBOL(ttm_bo_select_caching);
 
 static bool ttm_bo_mt_compatible(struct ttm_mem_type_manager *man,
 				 uint32_t mem_type,
diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h
index 990d529..3bf267c 100644
--- a/include/drm/ttm/ttm_bo_driver.h
+++ b/include/drm/ttm/ttm_bo_driver.h
@@ -730,6 +730,11 @@ extern int ttm_bo_mem_space(struct ttm_buffer_object *bo,
 				bool interruptible,
 				bool no_wait_gpu);
 
+extern int ttm_mem_evict_first(struct ttm_bo_device *bdev,
+			       uint32_t mem_type,
+			       const struct ttm_place *place,
+			       bool interruptible, bool no_wait_gpu);
+
 extern void ttm_bo_mem_put(struct ttm_buffer_object *bo,
 			   struct ttm_mem_reg *mem);
 extern void ttm_bo_mem_put_locked(struct ttm_buffer_object *bo,
@@ -740,6 +745,14 @@ extern int ttm_bo_global_init(struct drm_global_reference *ref);
 
 extern int ttm_bo_device_release(struct ttm_bo_device *bdev);
 
+extern int ttm_bo_handle_move_mem(struct ttm_buffer_object *bo,
+				  struct ttm_mem_reg *mem,
+				  bool evict, bool interruptible,
+				  bool no_wait_gpu);
+extern int ttm_bo_add_move_fence(struct ttm_buffer_object *bo,
+				 struct ttm_mem_type_manager *man,
+				 struct ttm_mem_reg *mem);
+
 /**
  * ttm_bo_device_init
  *
diff --git a/include/drm/ttm/ttm_placement.h b/include/drm/ttm/ttm_placement.h
index e88a8e3..6455214 100644
--- a/include/drm/ttm/ttm_placement.h
+++ b/include/drm/ttm/ttm_placement.h
@@ -105,4 +105,10 @@ struct ttm_placement {
 	const struct ttm_place	*busy_placement;
 };
 
+struct ttm_mem_type_manager;
+
+extern uint32_t ttm_bo_select_caching(struct ttm_mem_type_manager *man,
+				      uint32_t cur_placement,
+				      uint32_t proposed_placement);
+
 #endif
-- 
2.7.4