[PATCH 7/9] drm/amdgpu: Throttle visible VRAM moves separately

Fri Jun 23 17:39:38 UTC 2017

The BO move throttling code is designed to allow VRAM to fill quickly if it
is relatively empty. However, this does not take into account situations
where the visible VRAM is smaller than total VRAM, and total VRAM may not
be close to full but the visible VRAM segment is under pressure. In such
situations, visible VRAM would experience unrestricted swapping and
performance would drop.

Add a separate counter specifically for moves involving visible VRAM, and
check it before moving BOs there.

Fixes: 95844d20ae02 (drm/amdgpu: throttle buffer migrations at CS using a fixed MBps limit (v2))
Signed-off-by: John Brooks <john at fastquake.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  6 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c     | 76 +++++++++++++++++++++++-------
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 10 ++--
 3 files changed, 70 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 245234e..81dbb93 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1161,7 +1161,9 @@ struct amdgpu_cs_parser {
 	struct list_head		validated;
 	struct dma_fence		*fence;
 	uint64_t			bytes_moved_threshold;
+	uint64_t			bytes_moved_vis_threshold;
 	uint64_t			bytes_moved;
+	uint64_t			bytes_moved_vis;
 	struct amdgpu_bo_list_entry	*evictable;
 
 	/* user fence */
@@ -1595,6 +1597,7 @@ struct amdgpu_device {
 		spinlock_t		lock;
 		s64			last_update_us;
 		s64			accum_us; /* accumulated microseconds */
+		s64			accum_us_vis; /* for visible VRAM */
 		u32			log2_max_MBps;
 	} mm_stats;
 
@@ -1930,7 +1933,8 @@ bool amdgpu_need_post(struct amdgpu_device *adev);
 void amdgpu_update_display_priority(struct amdgpu_device *adev);
 
 int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data);
-void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes);
+void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes,
+				  u64 num_vis_bytes);
 void amdgpu_ttm_placement_from_domain(struct amdgpu_bo *abo, u32 domain);
 bool amdgpu_ttm_bo_is_amdgpu_bo(struct ttm_buffer_object *bo);
 int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 73d6882..25d6df6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -223,11 +223,13 @@ static s64 bytes_to_us(struct amdgpu_device *adev, u64 bytes)
  * ticks. The accumulated microseconds (us) are converted to bytes and
  * returned.
  */
-static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev)
+static void amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev,
+					      u64 *max_bytes,
+					      u64 *max_vis_bytes)
 {
 	s64 time_us, increment_us;
-	u64 max_bytes;
 	u64 free_vram, total_vram, used_vram;
+	u64 free_vis_vram, total_vis_vram, used_vis_vram;
 
 	/* Allow a maximum of 200 accumulated ms. This is basically per-IB
 	 * throttling.
@@ -238,13 +240,21 @@ static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev)
 	 */
 	const s64 us_upper_bound = 200000;
 
-	if (!adev->mm_stats.log2_max_MBps)
-		return 0;
+	if (!adev->mm_stats.log2_max_MBps) {
+		*max_bytes = 0;
+		*max_vis_bytes = 0;
+		return;
+	}
 
 	total_vram = adev->mc.real_vram_size - adev->vram_pin_size;
 	used_vram = atomic64_read(&adev->vram_usage);
 	free_vram = used_vram >= total_vram ? 0 : total_vram - used_vram;
 
+	total_vis_vram = adev->mc.visible_vram_size;
+	used_vis_vram = atomic64_read(&adev->vram_vis_usage);
+	free_vis_vram = used_vis_vram >= total_vis_vram ? 0 : total_vis_vram -
+							      used_vis_vram;
+
 	spin_lock(&adev->mm_stats.lock);
 
 	/* Increase the amount of accumulated us. */
@@ -252,7 +262,9 @@ static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev)
 	increment_us = time_us - adev->mm_stats.last_update_us;
 	adev->mm_stats.last_update_us = time_us;
 	adev->mm_stats.accum_us = min(adev->mm_stats.accum_us + increment_us,
-                                      us_upper_bound);
+				      us_upper_bound);
+	adev->mm_stats.accum_us_vis = min(adev->mm_stats.accum_us_vis +
+					  increment_us, us_upper_bound);
 
 	/* This prevents the short period of low performance when the VRAM
 	 * usage is low and the driver is in debt or doesn't have enough
@@ -280,23 +292,32 @@ static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev)
 		adev->mm_stats.accum_us = max(min_us, adev->mm_stats.accum_us);
 	}
 
-	/* This returns 0 if the driver is in debt to disallow (optional)
+	/* Do the same for visible VRAM if half of it is free */
+	if (free_vis_vram >= total_vis_vram / 2) {
+		adev->mm_stats.accum_us_vis = max(bytes_to_us(adev,
+						  free_vis_vram / 2),
+						  adev->mm_stats.accum_us_vis);
+	}
+
+	/* This is set to 0 if the driver is in debt to disallow (optional)
 	 * buffer moves.
 	 */
-	max_bytes = us_to_bytes(adev, adev->mm_stats.accum_us);
+	*max_bytes = us_to_bytes(adev, adev->mm_stats.accum_us);
+	*max_vis_bytes = us_to_bytes(adev, adev->mm_stats.accum_us_vis);
 
 	spin_unlock(&adev->mm_stats.lock);
-	return max_bytes;
 }
 
 /* Report how many bytes have really been moved for the last command
  * submission. This can result in a debt that can stop buffer migrations
  * temporarily.
  */
-void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes)
+void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes,
+				  u64 num_vis_bytes)
 {
 	spin_lock(&adev->mm_stats.lock);
 	adev->mm_stats.accum_us -= bytes_to_us(adev, num_bytes);
+	adev->mm_stats.accum_us_vis -= bytes_to_us(adev, num_vis_bytes);
 	spin_unlock(&adev->mm_stats.lock);
 }
 
@@ -304,7 +325,7 @@ static int amdgpu_cs_bo_validate(struct amdgpu_cs_parser *p,
 				 struct amdgpu_bo *bo)
 {
 	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
-	u64 initial_bytes_moved;
+	u64 initial_bytes_moved, bytes_moved;
 	uint32_t domain;
 	uint32_t old_mem;
 	int r;
@@ -312,22 +333,39 @@ static int amdgpu_cs_bo_validate(struct amdgpu_cs_parser *p,
 	if (bo->pin_count)
 		return 0;
 
+	amdgpu_bo_clear_cpu_access_required(bo);
+
 	/* Don't move this buffer if we have depleted our allowance
 	 * to move it. Don't move anything if the threshold is zero.
 	 */
-	if (p->bytes_moved < p->bytes_moved_threshold)
-		domain = bo->prefered_domains;
-	else
+	if (p->bytes_moved < p->bytes_moved_threshold) {
+		if (adev->mc.visible_vram_size < adev->mc.real_vram_size &&
+		    (bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED)) {
+			/* And don't move a CPU_ACCESS_REQUIRED BO to limited
+			 * visible VRAM if we've depleted our allowance to do
+			 * that.
+			 */
+			if (p->bytes_moved_vis < p->bytes_moved_vis_threshold)
+				domain = bo->prefered_domains;
+			else
+				domain = bo->allowed_domains;
+		} else {
+			domain = bo->prefered_domains;
+		}
+	} else {
 		domain = bo->allowed_domains;
+	}
 
-	amdgpu_bo_clear_cpu_access_required(bo);
 retry:
 	amdgpu_ttm_placement_from_domain(bo, domain);
 	initial_bytes_moved = atomic64_read(&adev->num_bytes_moved);
 	old_mem = bo->tbo.mem.mem_type;
 	r = ttm_bo_validate(&bo->tbo, &bo->placement, true, false);
-	p->bytes_moved += atomic64_read(&adev->num_bytes_moved) -
+	bytes_moved = atomic64_read(&adev->num_bytes_moved) -
 		initial_bytes_moved;
+	p->bytes_moved += bytes_moved;
+	if (bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED)
+		p->bytes_moved_vis += bytes_moved;
 
 	if (unlikely(r == -ENOMEM) && domain != bo->allowed_domains) {
 		domain = bo->allowed_domains;
@@ -560,8 +598,10 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
 		list_splice(&need_pages, &p->validated);
 	}
 
-	p->bytes_moved_threshold = amdgpu_cs_get_threshold_for_moves(p->adev);
+	amdgpu_cs_get_threshold_for_moves(p->adev, &p->bytes_moved_threshold,
+					  &p->bytes_moved_vis_threshold);
 	p->bytes_moved = 0;
+	p->bytes_moved_vis = 0;
 	p->evictable = list_last_entry(&p->validated,
 				       struct amdgpu_bo_list_entry,
 				       tv.head);
@@ -585,8 +625,8 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
 		goto error_validate;
 	}
 
-	amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved);
-
+	amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved,
+				     p->bytes_moved_vis);
 	fpriv->vm.last_eviction_counter =
 		atomic64_read(&p->adev->num_evictions);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index a7d48a7..27d8c77 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -341,7 +341,7 @@ int amdgpu_bo_create_restricted(struct amdgpu_device *adev,
 	struct amdgpu_bo *bo;
 	enum ttm_bo_type type;
 	unsigned long page_align;
-	u64 initial_bytes_moved;
+	u64 initial_bytes_moved, bytes_moved;
 	size_t acc_size;
 	int r;
 
@@ -417,8 +417,12 @@ int amdgpu_bo_create_restricted(struct amdgpu_device *adev,
 	r = ttm_bo_init_reserved(&adev->mman.bdev, &bo->tbo, size, type,
 				 &bo->placement, page_align, !kernel, NULL,
 				 acc_size, sg, resv, &amdgpu_ttm_bo_destroy);
-	amdgpu_cs_report_moved_bytes(adev,
-		atomic64_read(&adev->num_bytes_moved) - initial_bytes_moved);
+	bytes_moved = atomic64_read(&adev->num_bytes_moved) -
+		initial_bytes_moved;
+	if (bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED)
+		amdgpu_cs_report_moved_bytes(adev, bytes_moved, bytes_moved);
+	else
+		amdgpu_cs_report_moved_bytes(adev, bytes_moved, 0);
 
 	if (unlikely(r != 0))
 		return r;
-- 
2.7.4