[PATCH] drm/amdgpu: optimize out a spin lock (v2)
Alex Xie
AlexBin.Xie at amd.com
Thu Jun 22 02:45:54 UTC 2017
Use atomic operations instead of a spin lock.
v2: Adjust commit message
Signed-off-by: Alex Xie <AlexBin.Xie at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 5 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 110 +++++++++++++++++++----------
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1 -
3 files changed, 76 insertions(+), 40 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 7caf514..21d318b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1588,9 +1588,8 @@ struct amdgpu_device {
/* data for buffer migration throttling */
struct {
- spinlock_t lock;
- s64 last_update_us;
- s64 accum_us; /* accumulated microseconds */
+ atomic64_t last_update_us;
+ atomic64_t accum_us; /* accumulated microseconds */
u32 log2_max_MBps;
} mm_stats;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 82131d7..7b6f42e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -225,6 +225,9 @@ static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev)
s64 time_us, increment_us;
u64 max_bytes;
u64 free_vram, total_vram, used_vram;
+ s64 old_update_us, head_time_us;
+ s64 accum_us;
+ s64 old_accum_us, head_accum_us;
/* Allow a maximum of 200 accumulated ms. This is basically per-IB
* throttling.
@@ -242,47 +245,83 @@ static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev)
used_vram = atomic64_read(&adev->vram_usage);
free_vram = used_vram >= total_vram ? 0 : total_vram - used_vram;
- spin_lock(&adev->mm_stats.lock);
-
/* Increase the amount of accumulated us. */
- time_us = ktime_to_us(ktime_get());
- increment_us = time_us - adev->mm_stats.last_update_us;
- adev->mm_stats.last_update_us = time_us;
- adev->mm_stats.accum_us = min(adev->mm_stats.accum_us + increment_us,
- us_upper_bound);
-
- /* This prevents the short period of low performance when the VRAM
- * usage is low and the driver is in debt or doesn't have enough
- * accumulated us to fill VRAM quickly.
- *
- * The situation can occur in these cases:
- * - a lot of VRAM is freed by userspace
- * - the presence of a big buffer causes a lot of evictions
- * (solution: split buffers into smaller ones)
- *
- * If 128 MB or 1/8th of VRAM is free, start filling it now by setting
- * accum_us to a positive number.
- */
- if (free_vram >= 128 * 1024 * 1024 || free_vram >= total_vram / 8) {
- s64 min_us;
-
- /* Be more aggresive on dGPUs. Try to fill a portion of free
- * VRAM now.
- */
- if (!(adev->flags & AMD_IS_APU))
- min_us = bytes_to_us(adev, free_vram / 4);
+ old_update_us = atomic64_read(&adev->mm_stats.last_update_us);
+ for (;;) {
+ time_us = ktime_to_us(ktime_get());
+ head_time_us = atomic64_cmpxchg(&adev->mm_stats.last_update_us,
+ old_update_us, time_us);
+
+ if (likely(head_time_us == old_update_us))
+ /*
+ * No other task modified adev->mm_stats.last_update_us.
+ * Update was successful.
+ */
+ break;
else
- min_us = 0; /* Reset accum_us on APUs. */
+ /* Another task modified the value after we read it.
+ * A rare contention happens, let us retry.
+ * In most cases, one retry can do the job.
+ * See function atomic64_add_unless as a similar idea.
+ */
+ old_update_us = head_time_us;
+ }
+ increment_us = time_us - old_update_us;
+
+ old_accum_us = atomic64_read(&adev->mm_stats.accum_us);
+
+ for (;;) {
+ accum_us = min(old_accum_us + increment_us, us_upper_bound);
+
+ /* This prevents the short period of low performance when the
+ * VRAM usage is low and the driver is in debt or doesn't have
+ * enough accumulated us to fill VRAM quickly.
+ *
+ * The situation can occur in these cases:
+ * - a lot of VRAM is freed by userspace
+ * - the presence of a big buffer causes a lot of evictions
+ * (solution: split buffers into smaller ones)
+ *
+ * If 128 MB or 1/8th of VRAM is free, start filling it now by
+ * setting accum_us to a positive number.
+ */
+ if (free_vram >= 128 * 1024 * 1024 ||
+ free_vram >= total_vram / 8) {
+ s64 min_us;
+
+ /* Be more aggressive on dGPUs. Try to fill a portion of
+ * free VRAM now.
+ */
+ if (!(adev->flags & AMD_IS_APU))
+ min_us = bytes_to_us(adev, free_vram / 4);
+ else
+ min_us = 0; /* Reset accum_us on APUs. */
+
+ accum_us = max(min_us, accum_us);
+ }
+
+ head_accum_us = atomic64_cmpxchg(&adev->mm_stats.accum_us,
+ old_accum_us, accum_us);
- adev->mm_stats.accum_us = max(min_us, adev->mm_stats.accum_us);
+ if (likely(head_accum_us == old_accum_us))
+ /*
+ * No other task modified adev->mm_stats.accum_us.
+ * Update was successful.
+ */
+ break;
+ else
+ /* Another task modified the value after we read it.
+ * A rare contention happens, let us retry.
+ * In most cases, one retry can do the job.
+ * See function atomic64_add_unless as a similar idea.
+ */
+ old_accum_us = head_accum_us;
}
/* This returns 0 if the driver is in debt to disallow (optional)
* buffer moves.
*/
- max_bytes = us_to_bytes(adev, adev->mm_stats.accum_us);
-
- spin_unlock(&adev->mm_stats.lock);
+ max_bytes = us_to_bytes(adev, accum_us);
return max_bytes;
}
@@ -292,9 +331,8 @@ static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev)
*/
void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes)
{
- spin_lock(&adev->mm_stats.lock);
- adev->mm_stats.accum_us -= bytes_to_us(adev, num_bytes);
- spin_unlock(&adev->mm_stats.lock);
+ s64 i = bytes_to_us(adev, num_bytes);
+ atomic64_sub(i, &adev->mm_stats.accum_us);
}
static int amdgpu_cs_bo_validate(struct amdgpu_cs_parser *p,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index ff90f78..9e9d592 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2117,7 +2117,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
spin_lock_init(&adev->didt_idx_lock);
spin_lock_init(&adev->gc_cac_idx_lock);
spin_lock_init(&adev->audio_endpt_idx_lock);
- spin_lock_init(&adev->mm_stats.lock);
INIT_LIST_HEAD(&adev->shadow_list);
mutex_init(&adev->shadow_list_lock);
--
2.7.4
More information about the amd-gfx
mailing list