[PATCH 8/9] drm/amdgpu: Asynchronously move BOs to visible VRAM

Fri Jun 23 17:39:39 UTC 2017

Moving CPU-accessible BOs from GTT to visible VRAM reduces latency on the
GPU and improves average framerate. However, it's an expensive operation.
When visible VRAM is full and evictions are necessary, it can easily take
tens of milliseconds. On the CS path, that directly increases the frame
time and causes noticeable momentary stutters.

Unlike other BO move operations, moving BOs to visible VRAM is a
housekeeping task and does not have to happen immediately. As a compromise
that still allows evictions to occur and keeps the contents of visible VRAM
fresh without stalling the rendering pipeline, we can defer these moves to
a worker thread.

Add a work function that moves a BO into visible VRAM, evicting other BOs
if necessary. During CS, queue this work function for all requested
CPU_ACCESS_REQUIRED BOs (subject to the usual move throttling).

This decreases the frequency and severity of visible-VRAM-related
stuttering.

Signed-off-by: John Brooks <john at fastquake.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  5 ++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c     | 14 ++++++----
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 45 ++++++++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c    |  7 +++++
 4 files changed, 65 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 81dbb93..a809742 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -436,6 +436,10 @@ struct amdgpu_bo {
 	 * is associated to
 	 */
 	struct list_head		va;
+
+	/* Work item for moving this BO to visible VRAM asynchronously */
+	struct work_struct		move_vis_vram_work;
+
 	/* Constant after initialization */
 	struct drm_gem_object		gem_base;
 	struct amdgpu_bo		*parent;
@@ -1583,6 +1587,7 @@ struct amdgpu_device {
 	struct amdgpu_mman		mman;
 	struct amdgpu_vram_scratch	vram_scratch;
 	struct amdgpu_wb		wb;
+	struct workqueue_struct		*vis_vram_wq;
 	atomic64_t			vram_usage;
 	atomic64_t			vram_vis_usage;
 	atomic64_t			gtt_usage;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 25d6df6..9215611 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -341,14 +341,16 @@ static int amdgpu_cs_bo_validate(struct amdgpu_cs_parser *p,
 	if (p->bytes_moved < p->bytes_moved_threshold) {
 		if (adev->mc.visible_vram_size < adev->mc.real_vram_size &&
 		    (bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED)) {
-			/* And don't move a CPU_ACCESS_REQUIRED BO to limited
-			 * visible VRAM if we've depleted our allowance to do
-			 * that.
+			/* Move CPU_ACCESS_REQUIRED BOs to limited visible VRAM
+			 * asynchronously, if we're allowed.
 			 */
-			if (p->bytes_moved_vis < p->bytes_moved_vis_threshold)
-				domain = bo->prefered_domains;
-			else
+			if (p->bytes_moved_vis < p->bytes_moved_vis_threshold) {
+				queue_work(adev->vis_vram_wq,
+					   &bo->move_vis_vram_work);
+				return 0;
+			} else {
 				domain = bo->allowed_domains;
+			}
 		} else {
 			domain = bo->prefered_domains;
 		}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 27d8c77..a69441d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -93,6 +93,8 @@ static void amdgpu_ttm_bo_destroy(struct ttm_buffer_object *tbo)
 
 	bo = container_of(tbo, struct amdgpu_bo, tbo);
 
+	cancel_work_sync(&bo->move_vis_vram_work);
+
 	amdgpu_update_memory_usage(adev, &bo->tbo.mem, NULL);
 
 	drm_gem_object_release(&bo->gem_base);
@@ -330,6 +332,47 @@ void amdgpu_bo_free_kernel(struct amdgpu_bo **bo, u64 *gpu_addr,
 		*cpu_addr = NULL;
 }
 
+static void amdgpu_bo_move_vis_vram_work_func(struct work_struct *work)
+{
+	struct amdgpu_bo *bo = container_of(work, struct amdgpu_bo,
+					    move_vis_vram_work);
+	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
+	u64 initial_bytes_moved, bytes_moved;
+	uint32_t old_mem;
+	int r;
+
+	spin_lock(&adev->mm_stats.lock);
+	if (adev->mm_stats.accum_us_vis <= 0 ||
+	    adev->mm_stats.accum_us <= 0) {
+		spin_unlock(&adev->mm_stats.lock);
+		return;
+	}
+	spin_unlock(&adev->mm_stats.lock);
+
+	r = amdgpu_bo_reserve(bo, true);
+	if (r != 0)
+		return;
+
+	amdgpu_bo_clear_cpu_access_required(bo);
+
+	if (!(bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED))
+		goto out;
+
+	amdgpu_ttm_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_VRAM);
+	old_mem = bo->tbo.mem.mem_type;
+	initial_bytes_moved = atomic64_read(&adev->num_bytes_moved);
+	ttm_bo_validate(&bo->tbo, &bo->placement, false, false);
+	bytes_moved = atomic64_read(&adev->num_bytes_moved) -
+				    initial_bytes_moved;
+	amdgpu_cs_report_moved_bytes(adev, bytes_moved, bytes_moved);
+
+	if (bo->tbo.mem.mem_type != old_mem)
+		bo->last_cs_move_jiffies = jiffies;
+
+out:
+	amdgpu_bo_unreserve(bo);
+}
+
 int amdgpu_bo_create_restricted(struct amdgpu_device *adev,
 				unsigned long size, int byte_align,
 				bool kernel, u32 domain, u64 flags,
@@ -382,6 +425,8 @@ int amdgpu_bo_create_restricted(struct amdgpu_device *adev,
 
 	bo->flags = flags;
 
+	INIT_WORK(&bo->move_vis_vram_work, amdgpu_bo_move_vis_vram_work_func);
+
 #ifdef CONFIG_X86_32
 	/* XXX: Write-combined CPU mappings of GTT seem broken on 32-bit
 	 * See https://bugs.freedesktop.org/show_bug.cgi?id=84627
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 0676a78..5852ca1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1120,6 +1120,13 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
 		return r;
 	}
 
+	/* Initialize workqueue for asynchronously moving BOs to visible VRAM.
+	 * We want to avoid lock contention (and therefore concurrency), so set
+	 * max_active = 1, and set unbound to disable concurrency management
+	 * (which can interleave sleeping workers).
+	 */
+	adev->vis_vram_wq = alloc_workqueue("amdgpu_vis_vram", WQ_UNBOUND, 1);
+
 	/* Reduce size of CPU-visible VRAM if requested */
 	vis_vram_limit = amdgpu_vis_vram_limit * 1024 * 1024;
 	if (amdgpu_vis_vram_limit > 0 &&
-- 
2.7.4