[PATCH 5/5] drm/amdgpu: Improve the software ring priority scheduler

jiadong.zhu at amd.com
Tue Oct 18 09:08:15 UTC 2022


From: Jiadong Zhu <Jiadong.Zhu at amd.com>

Under the drm scheduler, the scheduling threads of software rings with
different priorities have an equal chance of taking the spinlock and copying
their packets into the real ring. Although the low priority ring may be
preempted, it catches up with the high priority ring by copying more data (the
resubmitted packets plus the current IBs) on the next call to set_wptr. As a
result, the priorities are not very effective.

Improve the effectiveness of the software ring priorities underneath the drm
scheduler by slowing down the low priority thread with the following strategy
(a simplified sketch of the check follows the list):
1. If all the high priority fences are signaled, which indicates the GPU is
   idle, low priority packets are submitted without any delay.
2. When there are unsignaled fences on the high priority rings, account for
   the portion of IBs sent from the low priority ring. If that portion
   exceeds a certain threshold (e.g. 30%), the low priority thread waits
   with a timeout until more high priority IBs are submitted.
3. The mechanism starts the first time MCBP is triggered and ends when all
   the high priority fences are signaled.
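
For illustration only, here is a minimal user space sketch of the ratio check
in rule 2. The names (toy_ring, proceed_low_prio, LOW_PRIO_RATIO_PCT) are
hypothetical and only mirror the per-ring w_cnt bookkeeping added below; the
in-kernel decision is made by proceed_on_ring() in the patch.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for the per-ring bookkeeping in the patch: w_cnt
 * counts wptr updates since the counters were last reset. */
struct toy_ring {
	bool high_prio;   /* above AMDGPU_RING_PRIO_DEFAULT */
	uint64_t w_cnt;   /* wptr updates since the last reset */
};

/* Share of submissions a low priority ring may take, in percent (the patch
 * uses 30 for AMDGPU_RING_PRIO_DEFAULT). */
#define LOW_PRIO_RATIO_PCT 30

/* Return true when a low priority submission may proceed immediately,
 * false when the caller should wait (with a timeout) for more high
 * priority IBs to be submitted. */
static bool proceed_low_prio(const struct toy_ring *rings, int n,
			     uint64_t unsignaled_high_prio_fences)
{
	uint64_t low = 0, total = 0;
	int i;

	/* Rule 1: all high priority fences signaled -> GPU idle, no delay. */
	if (unsignaled_high_prio_fences == 0)
		return true;

	for (i = 0; i < n; i++) {
		if (!rings[i].high_prio)
			low += rings[i].w_cnt;
		total += rings[i].w_cnt;
	}
	if (total == 0)
		return true;

	/* Rule 2: delay once the low priority share reaches the threshold. */
	return low * 100 / total < LOW_PRIO_RATIO_PCT;
}

int main(void)
{
	struct toy_ring rings[2] = {
		{ .high_prio = true,  .w_cnt = 7 },
		{ .high_prio = false, .w_cnt = 3 },
	};

	printf("proceed: %d\n", proceed_low_prio(rings, 2, 1)); /* 0: 30% >= 30% */
	rings[0].w_cnt = 17;
	printf("proceed: %d\n", proceed_low_prio(rings, 2, 1)); /* 1: 15% < 30% */
	return 0;
}

With 3 low priority submissions out of 10, the low priority share sits exactly
at the 30% threshold, so the thread is delayed; after the high priority ring
submits more work, the share drops to 15% and the low priority thread proceeds.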

Cc: Christian Koenig <Christian.Koenig at amd.com>
Cc: Michel Dänzer <michel at daenzer.net>
Signed-off-by: Jiadong Zhu <Jiadong.Zhu at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c | 93 ++++++++++++++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h |  3 +
 2 files changed, 90 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
index 41b057b9358e..eac89094f1d1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
@@ -27,7 +27,13 @@
 #include "amdgpu_ring.h"
 #include "amdgpu.h"
 
+/* The delay in jiffies before the fallback resubmission runs */
 #define AMDGPU_MUX_RESUBMIT_JIFFIES_TIMEOUT (HZ / 2)
+
+/* Maximum number of jiffies a low priority ring thread is delayed */
+#define AMDGPU_MUX_DELAY_JIFFIES_TIMEOUT (HZ / 10)
+
+/* The time threshold in microseconds for an unsignaled fence to trigger MCBP */
 #define AMDGPU_MAX_LAST_UNSIGNALED_THRESHOLD_US 10000
 
 static const struct ring_info {
@@ -47,6 +53,69 @@ static inline struct amdgpu_mux_entry *amdgpu_ring_mux_sw_entry(struct amdgpu_ri
 			&mux->ring_entry[ring->entry_index] : NULL;
 }
 
+static uint32_t ring_priority_to_ratio_pct(unsigned int hw_prio)
+{
+	uint32_t ratio;
+
+	switch (hw_prio) {
+	case AMDGPU_RING_PRIO_DEFAULT:
+		ratio = 30;
+		break;
+	case AMDGPU_RING_PRIO_2:
+		ratio = 100;
+		break;
+	default:
+		ratio = 100;
+	}
+	return ratio;
+}
+
+static void reset_wcnt_on_all_rings(struct amdgpu_ring_mux *mux)
+{
+	int i;
+
+	for (i = 0; i < mux->num_ring_entries; i++)
+		mux->ring_entry[i].w_cnt = 0;
+}
+
+/*
+ * Decide whether the low priority ring task should be delayed while high
+ * priority IBs are still in flight. If all the high priority fences are
+ * signaled, the GPU is idle and there is no need to wait. Otherwise,
+ * calculate the percentage of IBs copied from the current ring and compare
+ * it against the threshold for the ring's priority.
+ */
+static bool proceed_on_ring(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
+{
+	struct amdgpu_ring *high_prio_ring = NULL;
+	u64 current_cnt = 0, total_cnt = 0;
+	int i;
+
+	for (i = 0; i < mux->num_ring_entries; i++) {
+		if (mux->ring_entry[i].ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT) {
+			high_prio_ring = mux->ring_entry[i].ring;
+			break;
+		}
+	}
+
+	/* All high priority fences signaled indicates the GPU is idle. */
+	if (!high_prio_ring || amdgpu_fence_count_emitted(high_prio_ring) == 0) {
+		reset_wcnt_on_all_rings(mux);
+		return true;
+	}
+
+	for (i = 0; i < mux->num_ring_entries; i++) {
+		if (mux->ring_entry[i].ring->hw_prio == ring->hw_prio)
+			current_cnt = mux->ring_entry[i].w_cnt;
+		total_cnt += mux->ring_entry[i].w_cnt;
+	}
+
+	if (total_cnt == 0)
+		return true;
+
+	return ring_priority_to_ratio_pct(ring->hw_prio) > current_cnt * 100 / total_cnt;
+}
+
 /* copy packages on sw ring range[begin, end) */
 static void amdgpu_ring_mux_copy_pkt_from_sw_ring(struct amdgpu_ring_mux *mux,
 						  struct amdgpu_ring *ring,
@@ -73,6 +142,13 @@ static void amdgpu_ring_mux_copy_pkt_from_sw_ring(struct amdgpu_ring_mux *mux,
 	}
 }
 
+/* Delay the low priority task depending on the high priority rings' fence state. */
+static void wait_on_ring(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
+{
+	wait_event_interruptible_timeout(mux->wait, proceed_on_ring(mux, ring),
+					 AMDGPU_MUX_DELAY_JIFFIES_TIMEOUT);
+}
+
 static void amdgpu_mux_resubmit_chunks(struct amdgpu_ring_mux *mux)
 {
 	struct amdgpu_mux_entry *e = NULL;
@@ -126,7 +202,6 @@ static void amdgpu_ring_mux_schedule_resubmit(struct amdgpu_ring_mux *mux)
 static void amdgpu_mux_resubmit_fallback(struct timer_list *t)
 {
 	struct amdgpu_ring_mux *mux = from_timer(mux, t, resubmit_timer);
-
 	if (!spin_trylock(&mux->lock)) {
 		amdgpu_ring_mux_schedule_resubmit(mux);
 		DRM_ERROR("reschedule resubmit\n");
@@ -158,6 +233,7 @@ int amdgpu_ring_mux_init(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring,
 	}
 
 	spin_lock_init(&mux->lock);
+	init_waitqueue_head(&mux->wait);
 	timer_setup(&mux->resubmit_timer, amdgpu_mux_resubmit_fallback, 0);
 
 	return 0;
@@ -205,8 +281,10 @@ void amdgpu_ring_mux_set_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *r
 {
 	struct amdgpu_mux_entry *e;
 
-	spin_lock(&mux->lock);
+	if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT && !proceed_on_ring(mux, ring))
+		wait_on_ring(mux, ring);
 
+	spin_lock(&mux->lock);
 	if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT)
 		amdgpu_mux_resubmit_chunks(mux);
 
@@ -238,7 +316,12 @@ void amdgpu_ring_mux_set_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *r
 	} else {
 		e->end_ptr_in_hw_ring = mux->real_ring->wptr;
 	}
+
+	e->w_cnt++;
 	spin_unlock(&mux->lock);
+
+	if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT)
+		wake_up_interruptible(&mux->wait);
 }
 
 u64 amdgpu_ring_mux_get_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
@@ -373,7 +456,9 @@ int amdgpu_mcbp_trigger_preempt(struct amdgpu_ring_mux *mux)
 	spin_lock(&mux->lock);
 	mux->pending_trailing_fence_signaled = true;
 	r = amdgpu_ring_preempt_ib(mux->real_ring);
+	reset_wcnt_on_all_rings(mux);
 	spin_unlock(&mux->lock);
+
 	return r;
 }
 
@@ -408,10 +493,6 @@ void amdgpu_ring_mux_start_ib(struct amdgpu_ring_mux *mux, struct amdgpu_ring *r
 	struct amdgpu_mux_entry *e;
 	struct amdgpu_mux_chunk *chunk;
 
-	spin_lock(&mux->lock);
-	amdgpu_mux_resubmit_chunks(mux);
-	spin_unlock(&mux->lock);
-
 	e = amdgpu_ring_mux_sw_entry(mux, ring);
 	if (!e) {
 		DRM_ERROR("cannot find entry!\n");
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
index aa758ebc86ae..a99647e33c9e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
@@ -39,6 +39,7 @@ struct amdgpu_ring;
  * @sw_rptr: the read pointer in software ring.
  * @sw_wptr: the write pointer in software ring.
  * @list: list head for amdgpu_mux_chunk
+ * @w_cnt: number of wptr updates on this ring since the counters were last reset.
  */
 struct amdgpu_mux_entry {
 	struct                  amdgpu_ring *ring;
@@ -48,6 +49,7 @@ struct amdgpu_mux_entry {
 	u64                     sw_rptr;
 	u64                     sw_wptr;
 	struct list_head        list;
+	u64                     w_cnt;
 };
 
 struct amdgpu_ring_mux {
@@ -64,6 +66,7 @@ struct amdgpu_ring_mux {
 	struct timer_list       resubmit_timer;
 
 	bool                    pending_trailing_fence_signaled;
+	wait_queue_head_t       wait;
 };
 
 /**
-- 
2.25.1


