[Intel-gfx] [PATCH 20/41] drm/i915: Replace priolist rbtree with a skiplist

Wed Jan 27 15:10:43 UTC 2021

On 25/01/2021 14:01, Chris Wilson wrote:
> Replace the priolist rbtree with a skiplist. The crucial difference is
> that walking and removing the first element of a skiplist is O(1), but

I wasn't (and am not) familiar with them, but wikipedia page says 
removal is O(logN) average case to O(N) worst case.

If I understand correctly O(1) could be ignoring the need to traverse 
from top to bottom level and removing the element from all. But since 
I915_PRIOLIST_HEIGHT is fixed maybe it is okay to call it O(1).

I wonder though why this wouldn't mean skip list would be worse for both 
lightly loaded and highly-loaded scenarios? Presumably height would need 
to be balanced to compensate for that.

In summary I have no idea for what number of in-flight requests would 
they be better.

How about putting this patch aside for now since it doesn't sound it is 
critical for deadline scheduling per se?

Regards,

Tvrtko

> O(lgN) for an rbtree, as we need to rebalance on remove. This is a
> hindrance for submission latency as it occurs between picking a request
> for the priolist and submitting it to hardware, as well effectively
> trippling the number of O(lgN) operations required under the irqoff lock.
> This is critical to reducing the latency jitter with multiple clients.
> 
> The downsides to skiplists are that lookup/insertion is only
> probablistically O(lgN) and there is a significant memory penalty to
> as each skip node is larger than the rbtree equivalent. Furthermore, we
> don't use dynamic arrays for the skiplist, so the allocation is fixed,
> and imposes an upper bound on the scalability wrt to the number of
> inflight requests.
> 
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
>   .../drm/i915/gt/intel_execlists_submission.c  |  63 +++--
>   .../gpu/drm/i915/gt/uc/intel_guc_submission.c |  30 +--
>   drivers/gpu/drm/i915/i915_priolist_types.h    |  28 +-
>   drivers/gpu/drm/i915/i915_scheduler.c         | 244 ++++++++++++++----
>   drivers/gpu/drm/i915/i915_scheduler.h         |  11 +-
>   drivers/gpu/drm/i915/i915_scheduler_types.h   |   2 +-
>   .../drm/i915/selftests/i915_mock_selftests.h  |   1 +
>   .../gpu/drm/i915/selftests/i915_scheduler.c   |  53 +++-
>   8 files changed, 316 insertions(+), 116 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> index 1103c8a00af1..129144dd86b0 100644
> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> @@ -244,11 +244,6 @@ static void ring_set_paused(const struct intel_engine_cs *engine, int state)
>   		wmb();
>   }
>   
> -static struct i915_priolist *to_priolist(struct rb_node *rb)
> -{
> -	return rb_entry(rb, struct i915_priolist, node);
> -}
> -
>   static int rq_prio(const struct i915_request *rq)
>   {
>   	return READ_ONCE(rq->sched.attr.priority);
> @@ -272,15 +267,31 @@ static int effective_prio(const struct i915_request *rq)
>   	return prio;
>   }
>   
> -static int queue_prio(const struct i915_sched_engine *se)
> +static struct i915_request *first_request(struct i915_sched_engine *se)
>   {
> -	struct rb_node *rb;
> +	struct i915_priolist *pl;
>   
> -	rb = rb_first_cached(&se->queue);
> -	if (!rb)
> +	for_each_priolist(pl, &se->queue) {
> +		if (likely(!list_empty(&pl->requests)))
> +			return list_first_entry(&pl->requests,
> +						struct i915_request,
> +						sched.link);
> +
> +		i915_priolist_advance(&se->queue, pl);
> +	}
> +
> +	return NULL;
> +}
> +
> +static int queue_prio(struct i915_sched_engine *se)
> +{
> +	struct i915_request *rq;
> +
> +	rq = first_request(se);
> +	if (!rq)
>   		return INT_MIN;
>   
> -	return to_priolist(rb)->priority;
> +	return rq_prio(rq);
>   }
>   
>   static int virtual_prio(const struct intel_engine_execlists *el)
> @@ -290,7 +301,7 @@ static int virtual_prio(const struct intel_engine_execlists *el)
>   	return rb ? rb_entry(rb, struct ve_node, rb)->prio : INT_MIN;
>   }
>   
> -static bool need_preempt(const struct intel_engine_cs *engine,
> +static bool need_preempt(struct intel_engine_cs *engine,
>   			 const struct i915_request *rq)
>   {
>   	int last_prio;
> @@ -1136,6 +1147,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   	struct i915_request ** const last_port = port + execlists->port_mask;
>   	struct i915_request *last, * const *active;
>   	struct virtual_engine *ve;
> +	struct i915_priolist *pl;
>   	struct rb_node *rb;
>   	bool submit = false;
>   
> @@ -1346,11 +1358,10 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   			break;
>   	}
>   
> -	while ((rb = rb_first_cached(&engine->active.queue))) {
> -		struct i915_priolist *p = to_priolist(rb);
> +	for_each_priolist(pl, &engine->active.queue) {
>   		struct i915_request *rq, *rn;
>   
> -		priolist_for_each_request_consume(rq, rn, p) {
> +		priolist_for_each_request_safe(rq, rn, pl) {
>   			bool merge = true;
>   
>   			/*
> @@ -1425,8 +1436,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   			}
>   		}
>   
> -		rb_erase_cached(&p->node, &engine->active.queue);
> -		i915_priolist_free(p);
> +		i915_priolist_advance(&engine->active.queue, pl);
>   	}
>   done:
>   	*port++ = i915_request_get(last);
> @@ -2631,6 +2641,7 @@ static void execlists_reset_cancel(struct intel_engine_cs *engine)
>   {
>   	struct intel_engine_execlists * const execlists = &engine->execlists;
>   	struct i915_request *rq, *rn;
> +	struct i915_priolist *pl;
>   	struct rb_node *rb;
>   	unsigned long flags;
>   
> @@ -2661,16 +2672,12 @@ static void execlists_reset_cancel(struct intel_engine_cs *engine)
>   	intel_engine_signal_breadcrumbs(engine);
>   
>   	/* Flush the queued requests to the timeline list (for retiring). */
> -	while ((rb = rb_first_cached(&engine->active.queue))) {
> -		struct i915_priolist *p = to_priolist(rb);
> -
> -		priolist_for_each_request_consume(rq, rn, p) {
> +	for_each_priolist(pl, &engine->active.queue) {
> +		priolist_for_each_request_safe(rq, rn, pl) {
>   			i915_request_mark_eio(rq);
>   			__i915_request_submit(rq);
>   		}
> -
> -		rb_erase_cached(&p->node, &engine->active.queue);
> -		i915_priolist_free(p);
> +		i915_priolist_advance(&engine->active.queue, pl);
>   	}
>   	GEM_BUG_ON(!i915_sched_is_idle(&engine->active));
>   
> @@ -2703,7 +2710,6 @@ static void execlists_reset_cancel(struct intel_engine_cs *engine)
>   	/* Remaining _unready_ requests will be nop'ed when submitted */
>   
>   	execlists->queue_priority_hint = INT_MIN;
> -	engine->active.queue = RB_ROOT_CACHED;
>   
>   	GEM_BUG_ON(__tasklet_is_enabled(&engine->active.tasklet));
>   	engine->active.tasklet.func = nop_submission_tasklet;
> @@ -3089,6 +3095,8 @@ static void virtual_context_exit(struct intel_context *ce)
>   
>   	for (n = 0; n < ve->num_siblings; n++)
>   		intel_engine_pm_put(ve->siblings[n]);
> +
> +	i915_sched_park_engine(&ve->base.active);
>   }
>   
>   static const struct intel_context_ops virtual_context_ops = {
> @@ -3501,6 +3509,7 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine,
>   {
>   	const struct intel_engine_execlists *execlists = &engine->execlists;
>   	struct i915_request *rq, *last;
> +	struct i915_priolist *pl;
>   	unsigned long flags;
>   	unsigned int count;
>   	struct rb_node *rb;
> @@ -3530,10 +3539,8 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine,
>   
>   	last = NULL;
>   	count = 0;
> -	for (rb = rb_first_cached(&engine->active.queue); rb; rb = rb_next(rb)) {
> -		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
> -
> -		priolist_for_each_request(rq, p) {
> +	for_each_priolist(pl, &engine->active.queue) {
> +		priolist_for_each_request(rq, pl) {
>   			if (count++ < max - 1)
>   				show_request(m, rq, "\t\t", 0);
>   			else
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index 2d7339ef3b4c..8d0c6cd277b3 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -59,11 +59,6 @@
>   
>   #define GUC_REQUEST_SIZE 64 /* bytes */
>   
> -static inline struct i915_priolist *to_priolist(struct rb_node *rb)
> -{
> -	return rb_entry(rb, struct i915_priolist, node);
> -}
> -
>   static struct guc_stage_desc *__get_stage_desc(struct intel_guc *guc, u32 id)
>   {
>   	struct guc_stage_desc *base = guc->stage_desc_pool_vaddr;
> @@ -185,8 +180,8 @@ static void __guc_dequeue(struct intel_engine_cs *engine)
>   	struct i915_request ** const last_port = first + execlists->port_mask;
>   	struct i915_request *last = first[0];
>   	struct i915_request **port;
> +	struct i915_priolist *pl;
>   	bool submit = false;
> -	struct rb_node *rb;
>   
>   	lockdep_assert_held(&engine->active.lock);
>   
> @@ -203,11 +198,10 @@ static void __guc_dequeue(struct intel_engine_cs *engine)
>   	 * event.
>   	 */
>   	port = first;
> -	while ((rb = rb_first_cached(&engine->active.queue))) {
> -		struct i915_priolist *p = to_priolist(rb);
> +	for_each_priolist(pl, &engine->active.queue) {
>   		struct i915_request *rq, *rn;
>   
> -		priolist_for_each_request_consume(rq, rn, p) {
> +		priolist_for_each_request_safe(rq, rn, pl) {
>   			if (last && rq->context != last->context) {
>   				if (port == last_port)
>   					goto done;
> @@ -223,12 +217,11 @@ static void __guc_dequeue(struct intel_engine_cs *engine)
>   			last = rq;
>   		}
>   
> -		rb_erase_cached(&p->node, &engine->active.queue);
> -		i915_priolist_free(p);
> +		i915_priolist_advance(&engine->active.queue, pl);
>   	}
>   done:
>   	execlists->queue_priority_hint =
> -		rb ? to_priolist(rb)->priority : INT_MIN;
> +		pl != &engine->active.queue.sentinel ? pl->priority : INT_MIN;
>   	if (submit) {
>   		*port = schedule_in(last, port - execlists->inflight);
>   		*++port = NULL;
> @@ -327,7 +320,7 @@ static void guc_reset_cancel(struct intel_engine_cs *engine)
>   {
>   	struct intel_engine_execlists * const execlists = &engine->execlists;
>   	struct i915_request *rq, *rn;
> -	struct rb_node *rb;
> +	struct i915_priolist *p;
>   	unsigned long flags;
>   
>   	ENGINE_TRACE(engine, "\n");
> @@ -355,25 +348,20 @@ static void guc_reset_cancel(struct intel_engine_cs *engine)
>   	}
>   
>   	/* Flush the queued requests to the timeline list (for retiring). */
> -	while ((rb = rb_first_cached(&engine->active.queue))) {
> -		struct i915_priolist *p = to_priolist(rb);
> -
> -		priolist_for_each_request_consume(rq, rn, p) {
> +	for_each_priolist(p, &engine->active.queue) {
> +		priolist_for_each_request_safe(rq, rn, p) {
>   			list_del_init(&rq->sched.link);
>   			__i915_request_submit(rq);
>   			dma_fence_set_error(&rq->fence, -EIO);
>   			i915_request_mark_complete(rq);
>   		}
> -
> -		rb_erase_cached(&p->node, &engine->active.queue);
> -		i915_priolist_free(p);
> +		i915_priolist_advance(&engine->active.queue, p);
>   	}
>   	GEM_BUG_ON(!i915_sched_is_idle(&engine->active));
>   
>   	/* Remaining _unready_ requests will be nop'ed when submitted */
>   
>   	execlists->queue_priority_hint = INT_MIN;
> -	engine->active.queue = RB_ROOT_CACHED;
>   
>   	spin_unlock_irqrestore(&engine->active.lock, flags);
>   }
> diff --git a/drivers/gpu/drm/i915/i915_priolist_types.h b/drivers/gpu/drm/i915/i915_priolist_types.h
> index bc2fa84f98a8..1200c3df6a4a 100644
> --- a/drivers/gpu/drm/i915/i915_priolist_types.h
> +++ b/drivers/gpu/drm/i915/i915_priolist_types.h
> @@ -38,10 +38,36 @@ enum {
>   #define I915_PRIORITY_UNPREEMPTABLE INT_MAX
>   #define I915_PRIORITY_BARRIER (I915_PRIORITY_UNPREEMPTABLE - 1)
>   
> +#ifdef CONFIG_64BIT
> +#define I915_PRIOLIST_HEIGHT 12
> +#else
> +#define I915_PRIOLIST_HEIGHT 11
> +#endif
> +
>   struct i915_priolist {
>   	struct list_head requests;
> -	struct rb_node node;
>   	int priority;
> +
> +	int level;
> +	struct i915_priolist *next[I915_PRIOLIST_HEIGHT];
>   };
>   
> +struct i915_priolist_root {
> +	struct i915_priolist sentinel;
> +	u32 prng;
> +};
> +
> +#define i915_priolist_is_empty(root) ((root)->sentinel.level < 0)
> +
> +#define for_each_priolist(p, root) \
> +	for ((p) = (root)->sentinel.next[0]; \
> +	     (p) != &(root)->sentinel; \
> +	     (p) = (p)->next[0])
> +
> +#define priolist_for_each_request(it, plist) \
> +	list_for_each_entry(it, &(plist)->requests, sched.link)
> +
> +#define priolist_for_each_request_safe(it, n, plist) \
> +	list_for_each_entry_safe(it, n, &(plist)->requests, sched.link)
> +
>   #endif /* _I915_PRIOLIST_TYPES_H_ */
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
> index a3ee06cb66d7..74000d3eebb1 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.c
> +++ b/drivers/gpu/drm/i915/i915_scheduler.c
> @@ -4,7 +4,9 @@
>    * Copyright © 2018 Intel Corporation
>    */
>   
> +#include <linux/bitops.h>
>   #include <linux/mutex.h>
> +#include <linux/prandom.h>
>   
>   #include "gt/intel_ring.h"
>   #include "gt/intel_lrc_reg.h"
> @@ -91,15 +93,24 @@ static void i915_sched_init_ipi(struct i915_sched_ipi *ipi)
>   	ipi->list = NULL;
>   }
>   
> +static void init_priolist(struct i915_priolist_root *const root)
> +{
> +	struct i915_priolist *pl = &root->sentinel;
> +
> +	memset_p((void **)pl->next, pl, ARRAY_SIZE(pl->next));
> +	pl->priority = INT_MIN;
> +	pl->level = -1;
> +}
> +
>   void i915_sched_init_engine(struct i915_sched_engine *se,
>   			    unsigned int subclass)
>   {
>   	spin_lock_init(&se->lock);
>   	lockdep_set_subclass(&se->lock, subclass);
>   
> +	init_priolist(&se->queue);
>   	INIT_LIST_HEAD(&se->requests);
>   	INIT_LIST_HEAD(&se->hold);
> -	se->queue = RB_ROOT_CACHED;
>   
>   	i915_sched_init_ipi(&se->ipi);
>   
> @@ -116,8 +127,57 @@ void i915_sched_init_engine(struct i915_sched_engine *se,
>   #endif
>   }
>   
> +__maybe_unused static bool priolist_idle(struct i915_priolist_root *root)
> +{
> +	struct i915_priolist *pl = &root->sentinel;
> +	int lvl;
> +
> +	for (lvl = 0; lvl < ARRAY_SIZE(pl->next); lvl++) {
> +		if (pl->next[lvl] != pl) {
> +			GEM_TRACE_ERR("root[%d] is not empty\n", lvl);
> +			return false;
> +		}
> +	}
> +
> +	if (pl->level != -1) {
> +		GEM_TRACE_ERR("root is not clear: %d\n", pl->level);
> +		return false;
> +	}
> +
> +	return true;
> +}
> +
> +static void pl_push(struct i915_priolist *pl, struct list_head *head)
> +{
> +	pl->requests.next = head->next;
> +	head->next = &pl->requests;
> +}
> +
> +static struct i915_priolist *pl_pop(struct list_head *head)
> +{
> +	struct i915_priolist *pl;
> +
> +	pl = container_of(head->next, typeof(*pl), requests);
> +	head->next = pl->requests.next;
> +
> +	return pl;
> +}
> +
> +static bool pl_empty(struct list_head *head)
> +{
> +	return !head->next;
> +}
> +
>   void i915_sched_park_engine(struct i915_sched_engine *se)
>   {
> +	struct i915_priolist_root *root = &se->queue;
> +	struct list_head *list = &root->sentinel.requests;
> +
> +	GEM_BUG_ON(!priolist_idle(root));
> +
> +	while (!pl_empty(list))
> +		kmem_cache_free(global.slab_priorities, pl_pop(list));
> +
>   	GEM_BUG_ON(!i915_sched_is_idle(se));
>   	se->no_priolist = false;
>   }
> @@ -183,71 +243,55 @@ static inline bool node_signaled(const struct i915_sched_node *node)
>   	return i915_request_completed(node_to_request(node));
>   }
>   
> -static inline struct i915_priolist *to_priolist(struct rb_node *rb)
> +static inline unsigned int random_level(struct i915_priolist_root *root)
>   {
> -	return rb_entry(rb, struct i915_priolist, node);
> -}
> -
> -static void assert_priolists(struct i915_sched_engine * const se)
> -{
> -	struct rb_node *rb;
> -	long last_prio;
> -
> -	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
> -		return;
> -
> -	GEM_BUG_ON(rb_first_cached(&se->queue) !=
> -		   rb_first(&se->queue.rb_root));
> -
> -	last_prio = INT_MAX;
> -	for (rb = rb_first_cached(&se->queue); rb; rb = rb_next(rb)) {
> -		const struct i915_priolist *p = to_priolist(rb);
> -
> -		GEM_BUG_ON(p->priority > last_prio);
> -		last_prio = p->priority;
> -	}
> +	root->prng = next_pseudo_random32(root->prng);
> +	return  __ffs(root->prng) / 2;
>   }
>   
>   static struct list_head *
>   lookup_priolist(struct intel_engine_cs *engine, int prio)
>   {
> +	struct i915_priolist *update[I915_PRIOLIST_HEIGHT];
>   	struct i915_sched_engine * const se = &engine->active;
> -	struct i915_priolist *p;
> -	struct rb_node **parent, *rb;
> -	bool first = true;
> -
> -	lockdep_assert_held(&engine->active.lock);
> -	assert_priolists(se);
> +	struct i915_priolist_root *root = &se->queue;
> +	struct i915_priolist *pl, *tmp;
> +	int lvl;
>   
> +	lockdep_assert_held(&se->lock);
>   	if (unlikely(se->no_priolist))
>   		prio = I915_PRIORITY_NORMAL;
>   
> +	for_each_priolist(pl, root) { /* recycle any empty elements before us */
> +		if (pl->priority >= prio || !list_empty(&pl->requests))
> +			break;
> +
> +		i915_priolist_advance(root, pl);
> +	}
> +
>   find_priolist:
> -	/* most positive priority is scheduled first, equal priorities fifo */
> -	rb = NULL;
> -	parent = &se->queue.rb_root.rb_node;
> -	while (*parent) {
> -		rb = *parent;
> -		p = to_priolist(rb);
> -		if (prio > p->priority) {
> -			parent = &rb->rb_left;
> -		} else if (prio < p->priority) {
> -			parent = &rb->rb_right;
> -			first = false;
> -		} else {
> -			return &p->requests;
> -		}
> +	pl = &root->sentinel;
> +	lvl = pl->level;
> +	while (lvl >= 0) {
> +		while (tmp = pl->next[lvl], tmp->priority >= prio)
> +			pl = tmp;
> +		if (pl->priority == prio)
> +			goto out;
> +		update[lvl--] = pl;
>   	}
>   
>   	if (prio == I915_PRIORITY_NORMAL) {
> -		p = &se->default_priolist;
> +		pl = &se->default_priolist;
> +	} else if (!pl_empty(&root->sentinel.requests)) {
> +		pl = pl_pop(&root->sentinel.requests);
>   	} else {
> -		p = kmem_cache_alloc(global.slab_priorities, GFP_ATOMIC);
> +		pl = kmem_cache_alloc(global.slab_priorities, GFP_ATOMIC);
>   		/* Convert an allocation failure to a priority bump */
> -		if (unlikely(!p)) {
> +		if (unlikely(!pl)) {
>   			prio = I915_PRIORITY_NORMAL; /* recurses just once */
>   
> -			/* To maintain ordering with all rendering, after an
> +			/*
> +			 * To maintain ordering with all rendering, after an
>   			 * allocation failure we have to disable all scheduling.
>   			 * Requests will then be executed in fifo, and schedule
>   			 * will ensure that dependencies are emitted in fifo.
> @@ -260,18 +304,103 @@ lookup_priolist(struct intel_engine_cs *engine, int prio)
>   		}
>   	}
>   
> -	p->priority = prio;
> -	INIT_LIST_HEAD(&p->requests);
> +	pl->priority = prio;
> +	INIT_LIST_HEAD(&pl->requests);
>   
> -	rb_link_node(&p->node, rb, parent);
> -	rb_insert_color_cached(&p->node, &se->queue, first);
> +	lvl = random_level(root);
> +	if (lvl > root->sentinel.level) {
> +		if (root->sentinel.level < I915_PRIOLIST_HEIGHT - 1) {
> +			lvl = ++root->sentinel.level;
> +			update[lvl] = &root->sentinel;
> +		} else {
> +			lvl = I915_PRIOLIST_HEIGHT - 1;
> +		}
> +	}
> +	GEM_BUG_ON(lvl < 0);
> +	GEM_BUG_ON(lvl >= ARRAY_SIZE(pl->next));
>   
> -	return &p->requests;
> +	pl->level = lvl;
> +	do {
> +		tmp = update[lvl];
> +		pl->next[lvl] = update[lvl]->next[lvl];
> +		tmp->next[lvl] = pl;
> +	} while (--lvl >= 0);
> +
> +	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
> +		struct i915_priolist *chk;
> +
> +		chk = &root->sentinel;
> +		lvl = chk->level;
> +		do {
> +			while (tmp = chk->next[lvl], tmp->priority >= prio)
> +				chk = tmp;
> +		} while (--lvl >= 0);
> +
> +		GEM_BUG_ON(chk != pl);
> +	}
> +
> +out:
> +	GEM_BUG_ON(pl == &root->sentinel);
> +	return &pl->requests;
>   }
>   
> -void __i915_priolist_free(struct i915_priolist *p)
> +static void remove_priolist(struct intel_engine_cs *engine,
> +			    struct list_head *plist)
>   {
> -	kmem_cache_free(global.slab_priorities, p);
> +	struct i915_sched_engine * const se = &engine->active;
> +	struct i915_priolist_root *root = &se->queue;
> +	struct i915_priolist *pl, *tmp;
> +	struct i915_priolist *old =
> +		container_of(plist, struct i915_priolist, requests);
> +	int prio = old->priority;
> +	int lvl;
> +
> +	lockdep_assert_held(&se->lock);
> +	GEM_BUG_ON(!list_empty(plist));
> +
> +	pl = &root->sentinel;
> +	lvl = pl->level;
> +	GEM_BUG_ON(lvl < 0);
> +
> +	if (prio != I915_PRIORITY_NORMAL)
> +		pl_push(old, &pl->requests);
> +
> +	do {
> +		while (tmp = pl->next[lvl], tmp->priority > prio)
> +			pl = tmp;
> +		if (lvl <= old->level) {
> +			pl->next[lvl] = old->next[lvl];
> +			if (pl == &root->sentinel && old->next[lvl] == pl) {
> +				GEM_BUG_ON(pl->level != lvl);
> +				pl->level--;
> +			}
> +		}
> +	} while (--lvl >= 0);
> +	GEM_BUG_ON(tmp != old);
> +}
> +
> +void i915_priolist_advance(struct i915_priolist_root *root,
> +			   struct i915_priolist *pl)
> +{
> +	struct i915_priolist * const s = &root->sentinel;
> +	int lvl;
> +
> +	GEM_BUG_ON(!list_empty(&pl->requests));
> +	GEM_BUG_ON(pl != s->next[0]);
> +	GEM_BUG_ON(pl == s);
> +
> +	if (pl->priority != I915_PRIORITY_NORMAL)
> +		pl_push(pl, &s->requests);
> +
> +	lvl = pl->level;
> +	GEM_BUG_ON(lvl < 0);
> +	do {
> +		s->next[lvl] = pl->next[lvl];
> +		if (pl->next[lvl] == s) {
> +			GEM_BUG_ON(s->level != lvl);
> +			s->level--;
> +		}
> +	} while (--lvl >= 0);
>   }
>   
>   static struct i915_request *
> @@ -420,8 +549,13 @@ static void __i915_request_set_priority(struct i915_request *rq, int prio)
>   			continue;
>   
>   		GEM_BUG_ON(rq->engine != engine);
> -		if (i915_request_in_priority_queue(rq))
> +		if (i915_request_in_priority_queue(rq)) {
> +			struct list_head *prev = rq->sched.link.prev;
> +
>   			list_move_tail(&rq->sched.link, plist);
> +			if (list_empty(prev))
> +				remove_priolist(engine, prev);
> +		}
>   
>   		/* Defer (tasklet) submission until after all updates. */
>   		kick_submission(engine, rq, prio);
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.h b/drivers/gpu/drm/i915/i915_scheduler.h
> index 0ab47cbf0e9c..bca89a58d953 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.h
> +++ b/drivers/gpu/drm/i915/i915_scheduler.h
> @@ -16,12 +16,6 @@
>   
>   struct drm_printer;
>   
> -#define priolist_for_each_request(it, plist) \
> -	list_for_each_entry(it, &(plist)->requests, sched.link)
> -
> -#define priolist_for_each_request_consume(it, n, plist) \
> -	list_for_each_entry_safe(it, n, &(plist)->requests, sched.link)
> -
>   void i915_sched_node_init(struct i915_sched_node *node);
>   void i915_sched_node_reinit(struct i915_sched_node *node);
>   
> @@ -69,7 +63,7 @@ static inline void i915_priolist_free(struct i915_priolist *p)
>   
>   static inline bool i915_sched_is_idle(const struct i915_sched_engine *se)
>   {
> -	return RB_EMPTY_ROOT(&se->queue.rb_root);
> +	return i915_priolist_is_empty(&se->queue);
>   }
>   
>   static inline bool
> @@ -99,6 +93,9 @@ static inline void i915_sched_kick(struct i915_sched_engine *se)
>   	tasklet_hi_schedule(&se->tasklet);
>   }
>   
> +void i915_priolist_advance(struct i915_priolist_root *root,
> +			   struct i915_priolist *old);
> +
>   void i915_request_show_with_schedule(struct drm_printer *m,
>   				     const struct i915_request *rq,
>   				     const char *prefix,
> diff --git a/drivers/gpu/drm/i915/i915_scheduler_types.h b/drivers/gpu/drm/i915/i915_scheduler_types.h
> index f668c680d290..e64750be4e77 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler_types.h
> +++ b/drivers/gpu/drm/i915/i915_scheduler_types.h
> @@ -89,7 +89,7 @@ struct i915_sched_engine {
>   	/**
>   	 * @queue: queue of requests, in priority lists
>   	 */
> -	struct rb_root_cached queue;
> +	struct i915_priolist_root queue;
>   
>   	struct i915_sched_ipi ipi;
>   
> diff --git a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
> index 3db34d3eea58..946c93441c1f 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
> +++ b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
> @@ -25,6 +25,7 @@ selftest(ring, intel_ring_mock_selftests)
>   selftest(engine, intel_engine_cs_mock_selftests)
>   selftest(timelines, intel_timeline_mock_selftests)
>   selftest(requests, i915_request_mock_selftests)
> +selftest(scheduler, i915_scheduler_mock_selftests)
>   selftest(objects, i915_gem_object_mock_selftests)
>   selftest(phys, i915_gem_phys_mock_selftests)
>   selftest(dmabuf, i915_gem_dmabuf_mock_selftests)
> diff --git a/drivers/gpu/drm/i915/selftests/i915_scheduler.c b/drivers/gpu/drm/i915/selftests/i915_scheduler.c
> index de44a66210b7..de5b1443129b 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_scheduler.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_scheduler.c
> @@ -12,6 +12,54 @@
>   #include "selftests/igt_spinner.h"
>   #include "selftests/i915_random.h"
>   
> +static int mock_skiplist_levels(void *dummy)
> +{
> +	struct i915_priolist_root root = {};
> +	struct i915_priolist *pl = &root.sentinel;
> +	IGT_TIMEOUT(end_time);
> +	unsigned long total;
> +	int count, lvl;
> +
> +	total = 0;
> +	do {
> +		for (count = 0; count < 16384; count++) {
> +			lvl = random_level(&root);
> +			if (lvl > pl->level) {
> +				if (lvl < I915_PRIOLIST_HEIGHT - 1)
> +					lvl = ++pl->level;
> +				else
> +					lvl = I915_PRIOLIST_HEIGHT - 1;
> +			}
> +
> +			pl->next[lvl] = ptr_inc(pl->next[lvl]);
> +		}
> +		total += count;
> +	} while (!__igt_timeout(end_time, NULL));
> +
> +	pr_info("Total %9lu\n", total);
> +	for (lvl = 0; lvl <= pl->level; lvl++) {
> +		int x = ilog2((unsigned long)pl->next[lvl]);
> +		char row[80];
> +
> +		memset(row, '*', x);
> +		row[x] = '\0';
> +
> +		pr_info(" [%2d] %9lu %s\n",
> +			lvl, (unsigned long)pl->next[lvl], row);
> +	}
> +
> +	return 0;
> +}
> +
> +int i915_scheduler_mock_selftests(void)
> +{
> +	static const struct i915_subtest tests[] = {
> +		SUBTEST(mock_skiplist_levels),
> +	};
> +
> +	return i915_subtests(tests, NULL);
> +}
> +
>   static void scheduling_disable(struct intel_engine_cs *engine)
>   {
>   	engine->props.preempt_timeout_ms = 0;
> @@ -80,9 +128,9 @@ static int all_engines(struct drm_i915_private *i915,
>   static bool check_context_order(struct intel_engine_cs *engine)
>   {
>   	u64 last_seqno, last_context;
> +	struct i915_priolist *p;
>   	unsigned long count;
>   	bool result = false;
> -	struct rb_node *rb;
>   	int last_prio;
>   
>   	/* We expect the execution order to follow ascending fence-context */
> @@ -92,8 +140,7 @@ static bool check_context_order(struct intel_engine_cs *engine)
>   	last_context = 0;
>   	last_seqno = 0;
>   	last_prio = 0;
> -	for (rb = rb_first_cached(&engine->active.queue); rb; rb = rb_next(rb)) {
> -		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
> +	for_each_priolist(p, &engine->active.queue) {
>   		struct i915_request *rq;
>   
>   		priolist_for_each_request(rq, p) {
>