[Intel-gfx] [PATCH 09/31] drm/i915: Replace priolist rbtree with a skiplist

Mon Feb 8 12:29:14 UTC 2021

On 08/02/2021 10:52, Chris Wilson wrote:
> Replace the priolist rbtree with a skiplist. The crucial difference is
> that walking and removing the first element of a skiplist is O(1), but
> O(lgN) for an rbtree, as we need to rebalance on remove. This is a
> hindrance for submission latency as it occurs between picking a request
> for the priolist and submitting it to hardware, as well effectively
> tripling the number of O(lgN) operations required under the irqoff lock.
> This is critical to reducing the latency jitter with multiple clients.
> 
> The downsides to skiplists are that lookup/insertion is only
> probabilistically O(lgN) and there is a significant memory penalty to
> as each skip node is larger than the rbtree equivalent. Furthermore, we
> don't use dynamic arrays for the skiplist, so the allocation is fixed,
> and imposes an upper bound on the scalability wrt to the number of
> inflight requests.
> 
> In the following patches, we introduce a new sort key to the scheduler,
> a virtual deadline. This imposes a different structure to the tree.
> Using a priority sort, we have very few priority levels active at any
> time, most likely just the default priority and so the rbtree degenerates
> to a single elements containing the list of all ready requests. The
> deadlines in contrast are very sparse, and typically each request has a
> unique deadline. Instead of being able to simply walk the list during
> dequeue, with the deadline scheduler we have to iterate through the bst
> on the critical submission path. Skiplists are vastly superior in this
> instance due to the O(1) iteration during dequeue, with very similar
> characteristics [on average] to the rbtree for insertion.
> 
> This means that by using skiplists we can introduce a sparse sort key
> without degrading latency on the critical submission path.
> 
> As an example, one simple case where we try to do lots of
> semi-independent work without any priority management (gem_exec_parallel),
> the lock hold times were:
> [worst]        [total]    [avg]
>   973.05     6301584.84     0.35 # plain rbtree
>   559.82     5424915.25     0.33 # best rbtree with pruning
>   208.21     3898784.09     0.24 # skiplist
>    34.05     5784106.01     0.32 # rbtree without deadlines
>    23.35     4152999.80     0.24 # skiplist without deadlines
> 
> Based on the skiplist implementation by Dr Con Kolivas for MuQSS.
> 
> References: https://en.wikipedia.org/wiki/Skip_list
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
>   .../drm/i915/gt/intel_execlists_submission.c  | 168 +++++-----
>   .../gpu/drm/i915/gt/uc/intel_guc_submission.c |  41 +--
>   drivers/gpu/drm/i915/i915_priolist_types.h    |  64 +++-
>   drivers/gpu/drm/i915/i915_scheduler.c         | 304 +++++++++++++-----
>   drivers/gpu/drm/i915/i915_scheduler.h         |  16 +-
>   drivers/gpu/drm/i915/i915_scheduler_types.h   |   2 +-
>   .../drm/i915/selftests/i915_mock_selftests.h  |   1 +
>   .../gpu/drm/i915/selftests/i915_scheduler.c   |  53 ++-
>   8 files changed, 454 insertions(+), 195 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> index 78fda9b4f626..4a0258347c10 100644
> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> @@ -254,11 +254,6 @@ static void ring_set_paused(const struct intel_engine_cs *engine, int state)
>   		wmb();
>   }
>   
> -static struct i915_priolist *to_priolist(struct rb_node *rb)
> -{
> -	return rb_entry(rb, struct i915_priolist, node);
> -}
> -
>   static int rq_prio(const struct i915_request *rq)
>   {
>   	return READ_ONCE(rq->sched.attr.priority);
> @@ -282,15 +277,27 @@ static int effective_prio(const struct i915_request *rq)
>   	return prio;
>   }
>   
> +static struct i915_request *first_request(const struct i915_sched *se)
> +{
> +	struct i915_priolist *pl = se->queue.sentinel.next[0];
> +
> +	if (pl == &se->queue.sentinel)
> +		return NULL;
> +
> +	return list_first_entry_or_null(&pl->requests,
> +					struct i915_request,
> +					sched.link);
> +}
> +
>   static int queue_prio(const struct i915_sched *se)
>   {
> -	struct rb_node *rb;
> +	struct i915_request *rq;
>   
> -	rb = rb_first_cached(&se->queue);
> -	if (!rb)
> +	rq = first_request(se);
> +	if (!rq)
>   		return INT_MIN;
>   
> -	return to_priolist(rb)->priority;
> +	return rq_prio(rq);
>   }
>   
>   static int virtual_prio(const struct intel_engine_execlists *el)
> @@ -300,7 +307,7 @@ static int virtual_prio(const struct intel_engine_execlists *el)
>   	return rb ? rb_entry(rb, struct ve_node, rb)->prio : INT_MIN;
>   }
>   
> -static bool need_preempt(const struct intel_engine_cs *engine,
> +static bool need_preempt(struct intel_engine_cs *engine,
>   			 const struct i915_request *rq)
>   {
>   	const struct i915_sched *se = &engine->sched;
> @@ -1144,7 +1151,9 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   	struct i915_request **port = execlists->pending;
>   	struct i915_request ** const last_port = port + execlists->port_mask;
>   	struct i915_request *last, * const *active;
> +	struct i915_request *rq, *rn;
>   	struct virtual_engine *ve;
> +	struct i915_priolist *pl;
>   	struct rb_node *rb;
>   	bool submit = false;
>   
> @@ -1355,87 +1364,79 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   			break;
>   	}
>   
> -	while ((rb = rb_first_cached(&se->queue))) {
> -		struct i915_priolist *p = to_priolist(rb);
> -		struct i915_request *rq, *rn;
> +	i915_sched_dequeue(se, pl, rq, rn) {
> +		bool merge = true;
>   
> -		priolist_for_each_request_consume(rq, rn, p) {
> -			bool merge = true;
> +		/*
> +		 * Can we combine this request with the current port?
> +		 * It has to be the same context/ringbuffer and not
> +		 * have any exceptions (e.g. GVT saying never to
> +		 * combine contexts).
> +		 *
> +		 * If we can combine the requests, we can execute both
> +		 * by updating the RING_TAIL to point to the end of the
> +		 * second request, and so we never need to tell the
> +		 * hardware about the first.
> +		 */
> +		if (last && !can_merge_rq(last, rq)) {
> +			/*
> +			 * If we are on the second port and cannot
> +			 * combine this request with the last, then we
> +			 * are done.
> +			 */
> +			if (port == last_port)
> +				goto done;
>   
>   			/*
> -			 * Can we combine this request with the current port?
> -			 * It has to be the same context/ringbuffer and not
> -			 * have any exceptions (e.g. GVT saying never to
> -			 * combine contexts).
> -			 *
> -			 * If we can combine the requests, we can execute both
> -			 * by updating the RING_TAIL to point to the end of the
> -			 * second request, and so we never need to tell the
> -			 * hardware about the first.
> +			 * We must not populate both ELSP[] with the
> +			 * same LRCA, i.e. we must submit 2 different
> +			 * contexts if we submit 2 ELSP.
>   			 */
> -			if (last && !can_merge_rq(last, rq)) {
> -				/*
> -				 * If we are on the second port and cannot
> -				 * combine this request with the last, then we
> -				 * are done.
> -				 */
> -				if (port == last_port)
> -					goto done;
> +			if (last->context == rq->context)
> +				goto done;
>   
> -				/*
> -				 * We must not populate both ELSP[] with the
> -				 * same LRCA, i.e. we must submit 2 different
> -				 * contexts if we submit 2 ELSP.
> -				 */
> -				if (last->context == rq->context)
> -					goto done;
> +			if (i915_request_has_sentinel(last))
> +				goto done;
>   
> -				if (i915_request_has_sentinel(last))
> -					goto done;
> +			/*
> +			 * We avoid submitting virtual requests into
> +			 * the secondary ports so that we can migrate
> +			 * the request immediately to another engine
> +			 * rather than wait for the primary request.
> +			 */
> +			if (rq->execution_mask != engine->mask)
> +				goto done;
>   
> -				/*
> -				 * We avoid submitting virtual requests into
> -				 * the secondary ports so that we can migrate
> -				 * the request immediately to another engine
> -				 * rather than wait for the primary request.
> -				 */
> -				if (rq->execution_mask != engine->mask)
> -					goto done;
> +			/*
> +			 * If GVT overrides us we only ever submit
> +			 * port[0], leaving port[1] empty. Note that we
> +			 * also have to be careful that we don't queue
> +			 * the same context (even though a different
> +			 * request) to the second port.
> +			 */
> +			if (ctx_single_port_submission(last->context) ||
> +			    ctx_single_port_submission(rq->context))
> +				goto done;
>   
> -				/*
> -				 * If GVT overrides us we only ever submit
> -				 * port[0], leaving port[1] empty. Note that we
> -				 * also have to be careful that we don't queue
> -				 * the same context (even though a different
> -				 * request) to the second port.
> -				 */
> -				if (ctx_single_port_submission(last->context) ||
> -				    ctx_single_port_submission(rq->context))
> -					goto done;
> -
> -				merge = false;
> -			}
> -
> -			if (__i915_request_submit(rq)) {
> -				if (!merge) {
> -					*port++ = i915_request_get(last);
> -					last = NULL;
> -				}
> -
> -				GEM_BUG_ON(last &&
> -					   !can_merge_ctx(last->context,
> -							  rq->context));
> -				GEM_BUG_ON(last &&
> -					   i915_seqno_passed(last->fence.seqno,
> -							     rq->fence.seqno));
> -
> -				submit = true;
> -				last = rq;
> -			}
> +			merge = false;
>   		}
>   
> -		rb_erase_cached(&p->node, &se->queue);
> -		i915_priolist_free(p);
> +		if (__i915_request_submit(rq)) {
> +			if (!merge) {
> +				*port++ = i915_request_get(last);
> +				last = NULL;
> +			}
> +
> +			GEM_BUG_ON(last &&
> +				   !can_merge_ctx(last->context,
> +						  rq->context));
> +			GEM_BUG_ON(last &&
> +				   i915_seqno_passed(last->fence.seqno,
> +						     rq->fence.seqno));
> +
> +			submit = true;
> +			last = rq;
> +		}
>   	}
>   done:
>   	*port++ = i915_request_get(last);
> @@ -1456,7 +1457,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   	 * request triggering preemption on the next dequeue (or subsequent
>   	 * interrupt for secondary ports).
>   	 */
> -	execlists->queue_priority_hint = queue_prio(se);
> +	execlists->queue_priority_hint = pl->priority;
>   	spin_unlock(&se->lock);
>   
>   	/*
> @@ -2716,7 +2717,6 @@ static void execlists_reset_cancel(struct intel_engine_cs *engine)
>   	}
>   
>   	execlists->queue_priority_hint = INT_MIN;
> -	se->queue = RB_ROOT_CACHED;
>   
>   	GEM_BUG_ON(__tasklet_is_enabled(&se->tasklet));
>   	se->tasklet.callback = nop_submission_tasklet;
> @@ -3173,6 +3173,8 @@ static void virtual_context_exit(struct intel_context *ce)
>   
>   	for (n = 0; n < ve->num_siblings; n++)
>   		intel_engine_pm_put(ve->siblings[n]);
> +
> +	i915_sched_park(intel_engine_get_scheduler(&ve->base));
>   }
>   
>   static const struct intel_context_ops virtual_context_ops = {
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index d14b9db77df8..c16393df42a0 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -60,11 +60,6 @@
>   
>   #define GUC_REQUEST_SIZE 64 /* bytes */
>   
> -static inline struct i915_priolist *to_priolist(struct rb_node *rb)
> -{
> -	return rb_entry(rb, struct i915_priolist, node);
> -}
> -
>   static struct guc_stage_desc *__get_stage_desc(struct intel_guc *guc, u32 id)
>   {
>   	struct guc_stage_desc *base = guc->stage_desc_pool_vaddr;
> @@ -186,9 +181,10 @@ static void __guc_dequeue(struct intel_engine_cs *engine)
>   	struct i915_request **first = execlists->inflight;
>   	struct i915_request ** const last_port = first + execlists->port_mask;
>   	struct i915_request *last = first[0];
> +	struct i915_request *rq, *rn;
>   	struct i915_request **port;
> +	struct i915_priolist *pl;
>   	bool submit = false;
> -	struct rb_node *rb;
>   
>   	lockdep_assert_held(&se->lock);
>   
> @@ -205,32 +201,22 @@ static void __guc_dequeue(struct intel_engine_cs *engine)
>   	 * event.
>   	 */
>   	port = first;
> -	while ((rb = rb_first_cached(&se->queue))) {
> -		struct i915_priolist *p = to_priolist(rb);
> -		struct i915_request *rq, *rn;
> +	i915_sched_dequeue(se, pl, rq, rn) {
> +		if (last && rq->context != last->context) {
> +			if (port == last_port)
> +				goto done;
>   
> -		priolist_for_each_request_consume(rq, rn, p) {
> -			if (last && rq->context != last->context) {
> -				if (port == last_port)
> -					goto done;
> -
> -				*port = schedule_in(last,
> -						    port - execlists->inflight);
> -				port++;
> -			}
> -
> -			list_del_init(&rq->sched.link);
> -			__i915_request_submit(rq);
> -			submit = true;
> -			last = rq;
> +			*port = schedule_in(last, port - execlists->inflight);
> +			port++;
>   		}
>   
> -		rb_erase_cached(&p->node, &se->queue);
> -		i915_priolist_free(p);
> +		list_del_init(&rq->sched.link);
> +		__i915_request_submit(rq);
> +		submit = true;
> +		last = rq;
>   	}
>   done:
> -	execlists->queue_priority_hint =
> -		rb ? to_priolist(rb)->priority : INT_MIN;
> +	execlists->queue_priority_hint = pl->priority;
>   	if (submit) {
>   		*port = schedule_in(last, port - execlists->inflight);
>   		*++port = NULL;
> @@ -361,7 +347,6 @@ static void guc_reset_cancel(struct intel_engine_cs *engine)
>   	__i915_sched_cancel_queue(se);
>   
>   	execlists->queue_priority_hint = INT_MIN;
> -	se->queue = RB_ROOT_CACHED;
>   
>   	spin_unlock_irqrestore(&se->lock, flags);
>   	intel_engine_signal_breadcrumbs(engine);
> diff --git a/drivers/gpu/drm/i915/i915_priolist_types.h b/drivers/gpu/drm/i915/i915_priolist_types.h
> index bc2fa84f98a8..ee7482b9c813 100644
> --- a/drivers/gpu/drm/i915/i915_priolist_types.h
> +++ b/drivers/gpu/drm/i915/i915_priolist_types.h
> @@ -38,10 +38,72 @@ enum {
>   #define I915_PRIORITY_UNPREEMPTABLE INT_MAX
>   #define I915_PRIORITY_BARRIER (I915_PRIORITY_UNPREEMPTABLE - 1)
>   
> +/*
> + * The slab returns power-of-two chunks of memory, so fill out the
> + * node to the next cacheline.
> + *
> + * We can estimate how many requests the skiplist will scale to based
> + * on its height:
> + *   11 =>  4 million requests
> + *   12 => 16 million requests
> + */
> +#ifdef CONFIG_64BIT
> +#define I915_PRIOLIST_HEIGHT 12
> +#else
> +#define I915_PRIOLIST_HEIGHT 11
> +#endif
> +
> +/*
> + * i915_priolist forms a skiplist. The skiplist is built in layers,
> + * starting at the base [0] is a singly linked list of all i915_priolist.
> + * Each higher layer contains a fraction of the i915_priolist from the
> + * previous layer:
> + *
> + * S[0] 0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF S
> + * E[1] >1>3>5>7>9>B>D>F>1>3>5>7>9>B>D>F>1>3>5>7>9>B>D>F>1>3>5>7>9>B>D>F E
> + * N[2] -->3-->7-->B-->F-->3-->7-->B-->F-->3-->7-->B-->F-->3-->7-->B-->F N
> + * T[3] ------>7------>F-------7------>F------>7------>F------>7------>F T
> + * I[4] -------------->F-------------->F-------------->F-------------->F I
> + * N[5] ------------------------------>F------------------------------>F N
> + * E[6] ------------------------------>F-------------------------------> E
> + * L[7] ---------------------------------------------------------------> L
> + *
> + * To iterate through all active i915_priolist, we only need to follow
> + * the chain in i915_priolist.next[0] (see for_each_priolist()).
> + *
> + * To quickly find a specific key (or insert point), we can perform a binary
> + * search by starting at the highest level and following the linked list
> + * at that level until we either find the node, or have gone passed the key.
> + * Then we descend a level, and start walking the list again starting from
> + * the current position, until eventually we find our key, or we run out of
> + * levels.
> + *
> + * https://en.wikipedia.org/wiki/Skip_list
> + */
>   struct i915_priolist {
>   	struct list_head requests;
> -	struct rb_node node;
>   	int priority;
> +
> +	int level;
> +	struct i915_priolist *next[I915_PRIOLIST_HEIGHT];
>   };
>   
> +struct i915_priolist_root {
> +	struct i915_priolist sentinel;
> +	u32 prng;
> +};
> +
> +#define i915_priolist_is_empty(root) ((root)->sentinel.level < 0)
> +
> +#define for_each_priolist(p, root) \
> +	for ((p) = (root)->sentinel.next[0]; \
> +	     (p) != &(root)->sentinel; \
> +	     (p) = (p)->next[0])
> +
> +#define priolist_for_each_request(it, plist) \
> +	list_for_each_entry(it, &(plist)->requests, sched.link)
> +
> +#define priolist_for_each_request_safe(it, n, plist) \
> +	list_for_each_entry_safe(it, n, &(plist)->requests, sched.link)
> +
>   #endif /* _I915_PRIOLIST_TYPES_H_ */
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
> index 312e1538d001..518eac67959e 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.c
> +++ b/drivers/gpu/drm/i915/i915_scheduler.c
> @@ -4,7 +4,9 @@
>    * Copyright © 2018 Intel Corporation
>    */
>   
> +#include <linux/bitops.h>
>   #include <linux/mutex.h>
> +#include <linux/prandom.h>
>   
>   #include "gt/intel_ring.h"
>   #include "gt/intel_lrc_reg.h"
> @@ -168,6 +170,16 @@ void i915_sched_select_mode(struct i915_sched *se, enum i915_sched_mode mode)
>   	}
>   }
>   
> +static void init_priolist(struct i915_priolist_root *const root)
> +{
> +	struct i915_priolist *pl = &root->sentinel;
> +
> +	memset_p((void **)pl->next, pl, ARRAY_SIZE(pl->next));
> +	pl->requests.prev = NULL;
> +	pl->priority = INT_MIN;
> +	pl->level = -1;
> +}
> +
>   void i915_sched_init(struct i915_sched *se,
>   		     struct device *dev,
>   		     const char *name,
> @@ -183,9 +195,9 @@ void i915_sched_init(struct i915_sched *se,
>   
>   	se->mask = mask;
>   
> +	init_priolist(&se->queue);
>   	INIT_LIST_HEAD(&se->requests);
>   	INIT_LIST_HEAD(&se->hold);
> -	se->queue = RB_ROOT_CACHED;
>   
>   	init_ipi(&se->ipi);
>   
> @@ -194,8 +206,60 @@ void i915_sched_init(struct i915_sched *se,
>   	se->revoke_context = i915_sched_default_revoke_context;
>   }
>   
> +__maybe_unused static bool priolist_idle(struct i915_priolist_root *root)
> +{
> +	struct i915_priolist *pl = &root->sentinel;
> +	int lvl;
> +
> +	for (lvl = 0; lvl < ARRAY_SIZE(pl->next); lvl++) {
> +		if (pl->next[lvl] != pl) {
> +			GEM_TRACE_ERR("root[%d] is not empty\n", lvl);
> +			return false;
> +		}
> +	}
> +
> +	if (pl->level != -1) {
> +		GEM_TRACE_ERR("root is not clear: %d\n", pl->level);
> +		return false;
> +	}
> +
> +	return true;
> +}
> +
> +static bool pl_empty(struct list_head *st)
> +{
> +	return !st->prev;
> +}
> +
> +static void pl_push(struct i915_priolist *pl, struct list_head *st)
> +{
> +	/* Keep list_empty(&pl->requests) valid for concurrent readers */
> +	pl->requests.prev = st->prev;
> +	st->prev = &pl->requests;
> +	GEM_BUG_ON(pl_empty(st));
> +}
> +
> +static struct i915_priolist *pl_pop(struct list_head *st)
> +{
> +	struct i915_priolist *pl;
> +
> +	GEM_BUG_ON(pl_empty(st));
> +	pl = container_of(st->prev, typeof(*pl), requests);
> +	st->prev = pl->requests.prev;
> +
> +	return pl;
> +}
> +
>   void i915_sched_park(struct i915_sched *se)
>   {
> +	struct i915_priolist_root *root = &se->queue;
> +	struct list_head *list = &root->sentinel.requests;
> +
> +	GEM_BUG_ON(!priolist_idle(root));
> +
> +	while (!pl_empty(list))
> +		kmem_cache_free(global.slab_priorities, pl_pop(list));
> +
>   	GEM_BUG_ON(!i915_sched_is_idle(se));
>   	se->no_priolist = false;
>   }
> @@ -251,70 +315,71 @@ static inline bool node_signaled(const struct i915_sched_node *node)
>   	return i915_request_completed(node_to_request(node));
>   }
>   
> -static inline struct i915_priolist *to_priolist(struct rb_node *rb)
> +static inline unsigned int random_level(struct i915_priolist_root *root)
>   {
> -	return rb_entry(rb, struct i915_priolist, node);
> -}
> -
> -static void assert_priolists(struct i915_sched * const se)
> -{
> -	struct rb_node *rb;
> -	long last_prio;
> -
> -	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
> -		return;
> -
> -	GEM_BUG_ON(rb_first_cached(&se->queue) !=
> -		   rb_first(&se->queue.rb_root));
> -
> -	last_prio = INT_MAX;
> -	for (rb = rb_first_cached(&se->queue); rb; rb = rb_next(rb)) {
> -		const struct i915_priolist *p = to_priolist(rb);
> -
> -		GEM_BUG_ON(p->priority > last_prio);
> -		last_prio = p->priority;
> -	}
> +	/*
> +	 * Given a uniform distribution of random numbers over the u32, then
> +	 * the probability each bit being unset is P=0.5. The probability of a
> +	 * successive sequence of bits being unset is P(n) = 0.5^n [n > 0].
> +	 *   P(level:1) = 0.5
> +	 *   P(level:2) = 0.25
> +	 *   P(level:3) = 0.125
> +	 *   P(level:4) = 0.0625
> +	 *   ...
> +	 * So we can use ffs() on a good random number generator to pick our
> +	 * level. We divide by two to reduce the probability of choosing a
> +	 * level to .25, as the cost of descending a level is the same as
> +	 * following an extra link in the chain at that level (so we can
> +	 * pack more nodes into fewer levels without incurring extra cost,
> +	 * and allow scaling to higher volumes of requests without expanding
> +	 * the height of the skiplist).
> +	 */
> +	root->prng = next_pseudo_random32(root->prng);
> +	return  __ffs(root->prng) / 2;
>   }
>   
>   static struct list_head *
>   lookup_priolist(struct i915_sched *se, int prio)
>   {
> -	struct i915_priolist *p;
> -	struct rb_node **parent, *rb;
> -	bool first = true;
> +	struct i915_priolist *update[I915_PRIOLIST_HEIGHT];
> +	struct i915_priolist_root *const root = &se->queue;
> +	struct i915_priolist *pl, *tmp;
> +	int lvl;
>   
>   	lockdep_assert_held(&se->lock);
> -	assert_priolists(se);
> -
>   	if (unlikely(se->no_priolist))
>   		prio = I915_PRIORITY_NORMAL;
>   
> +	for_each_priolist(pl, root) { /* recycle any empty elements before us */
> +		if (pl->priority <= prio || !list_empty(&pl->requests))
> +			break;
> +
> +		__i915_sched_dequeue_next(se);
> +	}
> +
>   find_priolist:
> -	/* most positive priority is scheduled first, equal priorities fifo */
> -	rb = NULL;
> -	parent = &se->queue.rb_root.rb_node;
> -	while (*parent) {
> -		rb = *parent;
> -		p = to_priolist(rb);
> -		if (prio > p->priority) {
> -			parent = &rb->rb_left;
> -		} else if (prio < p->priority) {
> -			parent = &rb->rb_right;
> -			first = false;
> -		} else {
> -			return &p->requests;
> -		}
> +	pl = &root->sentinel;
> +	lvl = pl->level;
> +	while (lvl >= 0) {
> +		while (tmp = pl->next[lvl], tmp->priority >= prio)
> +			pl = tmp;
> +		if (pl->priority == prio)
> +			goto out;
> +		update[lvl--] = pl;
>   	}
>   
>   	if (prio == I915_PRIORITY_NORMAL) {
> -		p = &se->default_priolist;
> +		pl = &se->default_priolist;
> +	} else if (!pl_empty(&root->sentinel.requests)) {
> +		pl = pl_pop(&root->sentinel.requests);
>   	} else {
> -		p = kmem_cache_alloc(global.slab_priorities, GFP_ATOMIC);
> +		pl = kmem_cache_alloc(global.slab_priorities, GFP_ATOMIC);
>   		/* Convert an allocation failure to a priority bump */
> -		if (unlikely(!p)) {
> +		if (unlikely(!pl)) {
>   			prio = I915_PRIORITY_NORMAL; /* recurses just once */
>   
> -			/* To maintain ordering with all rendering, after an
> +			/*
> +			 * To maintain ordering with all rendering, after an
>   			 * allocation failure we have to disable all scheduling.
>   			 * Requests will then be executed in fifo, and schedule
>   			 * will ensure that dependencies are emitted in fifo.
> @@ -327,18 +392,123 @@ lookup_priolist(struct i915_sched *se, int prio)
>   		}
>   	}
>   
> -	p->priority = prio;
> -	INIT_LIST_HEAD(&p->requests);
> +	pl->priority = prio;
> +	INIT_LIST_HEAD(&pl->requests);
>   
> -	rb_link_node(&p->node, rb, parent);
> -	rb_insert_color_cached(&p->node, &se->queue, first);
> +	lvl = random_level(root);
> +	if (lvl > root->sentinel.level) {
> +		if (root->sentinel.level < I915_PRIOLIST_HEIGHT - 1) {
> +			lvl = ++root->sentinel.level;
> +			update[lvl] = &root->sentinel;
> +		} else {
> +			lvl = I915_PRIOLIST_HEIGHT - 1;
> +		}
> +	}
> +	GEM_BUG_ON(lvl < 0);
> +	GEM_BUG_ON(lvl >= ARRAY_SIZE(pl->next));
>   
> -	return &p->requests;
> +	pl->level = lvl;
> +	do {
> +		tmp = update[lvl];
> +		pl->next[lvl] = tmp->next[lvl];
> +		tmp->next[lvl] = pl;
> +	} while (--lvl >= 0);
> +
> +	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
> +		struct i915_priolist *chk;
> +
> +		chk = &root->sentinel;
> +		lvl = chk->level;
> +		do {
> +			while (tmp = chk->next[lvl], tmp->priority >= prio)
> +				chk = tmp;
> +		} while (--lvl >= 0);
> +
> +		GEM_BUG_ON(chk != pl);
> +	}
> +
> +out:
> +	GEM_BUG_ON(pl == &root->sentinel);
> +	return &pl->requests;
>   }
>   
> -void __i915_priolist_free(struct i915_priolist *p)
> +static void __remove_priolist(struct i915_sched *se, struct list_head *plist)
>   {
> -	kmem_cache_free(global.slab_priorities, p);
> +	struct i915_priolist_root *root = &se->queue;
> +	struct i915_priolist *pl, *tmp;
> +	struct i915_priolist *old =
> +		container_of(plist, struct i915_priolist, requests);
> +	int prio = old->priority;
> +	int lvl;
> +
> +	lockdep_assert_held(&se->lock);
> +	GEM_BUG_ON(!list_empty(plist));
> +
> +	pl = &root->sentinel;
> +	lvl = pl->level;
> +	GEM_BUG_ON(lvl < 0);
> +
> +	if (prio != I915_PRIORITY_NORMAL)
> +		pl_push(old, &pl->requests);
> +
> +	do {
> +		while (tmp = pl->next[lvl], tmp->priority > prio)
> +			pl = tmp;
> +		if (lvl <= old->level) {
> +			pl->next[lvl] = old->next[lvl];
> +			if (pl == &root->sentinel && old->next[lvl] == pl) {
> +				GEM_BUG_ON(pl->level != lvl);
> +				pl->level--;
> +			}
> +		}
> +	} while (--lvl >= 0);
> +	GEM_BUG_ON(tmp != old);
> +}
> +
> +static void remove_from_priolist(struct i915_sched *se,
> +				 struct i915_request *rq,
> +				 struct list_head *list,
> +				 bool tail)
> +{
> +	struct list_head *prev = rq->sched.link.prev;

This depends on rq being at the head of it's list?

> +
> +	GEM_BUG_ON(!i915_request_in_priority_queue(rq));
> +
> +	__list_del_entry(&rq->sched.link);
> +	if (tail)
> +		list_add_tail(&rq->sched.link, list);
> +	else
> +		list_add(&rq->sched.link, list);

So it is more move than remove(_from_priolist) ?

> +
> +	/* If we just removed the last element in the old plist, delete it */
> +	if (list_empty(prev))
> +		__remove_priolist(se, prev);
> +}
> +
> +struct i915_priolist *__i915_sched_dequeue_next(struct i915_sched *se)
> +{
> +	struct i915_priolist * const s = &se->queue.sentinel;
> +	struct i915_priolist *pl = s->next[0];
> +	int lvl;
> +
> +	GEM_BUG_ON(!list_empty(&pl->requests));

Lost as to why pl->requests has to be empty at this point. Considering:

+#define i915_sched_dequeue(se, pl, rq, rn) \
+	for ((pl) = (se)->queue.sentinel.next[0]; \
+	     (pl) != &(se)->queue.sentinel; \
+	     (pl) = __i915_sched_dequeue_next(se)) \
+		priolist_for_each_request_safe(rq, rn, pl)
+

I also don't understand what it would de-queue. Whole priolist woth of 
requests at a time? But it can't be empty to dequeue something. And who 
puts any unconsumed requests back on somewhere in this case.

Regards,

Tvrtko

> +	GEM_BUG_ON(pl == s);
> +
> +	/* Keep pl->next[0] valid for for_each_priolist iteration */
> +	if (pl->priority != I915_PRIORITY_NORMAL)
> +		pl_push(pl, &s->requests);
> +
> +	lvl = pl->level;
> +	GEM_BUG_ON(lvl < 0);
> +	do {
> +		s->next[lvl] = pl->next[lvl];
> +		if (pl->next[lvl] == s) {
> +			GEM_BUG_ON(s->level != lvl);
> +			s->level--;
> +		}
> +	} while (--lvl >= 0);
> +
> +	return pl->next[0];
>   }
>   
>   static struct i915_request *
> @@ -491,7 +661,7 @@ static void __i915_request_set_priority(struct i915_request *rq, int prio)
>   
>   		GEM_BUG_ON(rq->engine != engine);
>   		if (i915_request_in_priority_queue(rq))
> -			list_move_tail(&rq->sched.link, plist);
> +			remove_from_priolist(se, rq, plist, true);
>   
>   		/* Defer (tasklet) submission until after all updates. */
>   		kick_submission(engine, rq, prio);
> @@ -627,8 +797,7 @@ void __i915_sched_defer_request(struct intel_engine_cs *engine,
>   
>   		/* Note list is reversed for waiters wrt signal hierarchy */
>   		GEM_BUG_ON(rq->engine != engine);
> -		GEM_BUG_ON(!i915_request_in_priority_queue(rq));
> -		list_move(&rq->sched.link, &dfs);
> +		remove_from_priolist(se, rq, &dfs, false);
>   
>   		/* Track our visit, and prevent duplicate processing */
>   		clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
> @@ -927,7 +1096,7 @@ void i915_sched_resume_request(struct intel_engine_cs *engine,
>   void __i915_sched_cancel_queue(struct i915_sched *se)
>   {
>   	struct i915_request *rq, *rn;
> -	struct rb_node *rb;
> +	struct i915_priolist *pl;
>   
>   	lockdep_assert_held(&se->lock);
>   
> @@ -936,16 +1105,9 @@ void __i915_sched_cancel_queue(struct i915_sched *se)
>   		i915_request_put(i915_request_mark_eio(rq));
>   
>   	/* Flush the queued requests to the timeline list (for retiring). */
> -	while ((rb = rb_first_cached(&se->queue))) {
> -		struct i915_priolist *p = to_priolist(rb);
> -
> -		priolist_for_each_request_consume(rq, rn, p) {
> -			i915_request_put(i915_request_mark_eio(rq));
> -			__i915_request_submit(rq);
> -		}
> -
> -		rb_erase_cached(&p->node, &se->queue);
> -		i915_priolist_free(p);
> +	i915_sched_dequeue(se, pl, rq, rn) {
> +		i915_request_put(i915_request_mark_eio(rq));
> +		__i915_request_submit(rq);
>   	}
>   	GEM_BUG_ON(!i915_sched_is_idle(se));
>   
> @@ -1225,9 +1387,9 @@ void i915_sched_show(struct drm_printer *m,
>   		     unsigned int max)
>   {
>   	const struct i915_request *rq, *last;
> +	struct i915_priolist *pl;
>   	unsigned long flags;
>   	unsigned int count;
> -	struct rb_node *rb;
>   
>   	rcu_read_lock();
>   	spin_lock_irqsave(&se->lock, flags);
> @@ -1282,10 +1444,8 @@ void i915_sched_show(struct drm_printer *m,
>   
>   	last = NULL;
>   	count = 0;
> -	for (rb = rb_first_cached(&se->queue); rb; rb = rb_next(rb)) {
> -		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
> -
> -		priolist_for_each_request(rq, p) {
> +	for_each_priolist(pl, &se->queue) {
> +		priolist_for_each_request(rq, pl) {
>   			if (count++ < max - 1)
>   				show_request(m, rq, "\t", 0);
>   			else
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.h b/drivers/gpu/drm/i915/i915_scheduler.h
> index fe392109b112..872d221f6ba7 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.h
> +++ b/drivers/gpu/drm/i915/i915_scheduler.h
> @@ -24,12 +24,6 @@ struct intel_engine_cs;
>   		  ##__VA_ARGS__);					\
>   } while (0)
>   
> -#define priolist_for_each_request(it, plist) \
> -	list_for_each_entry(it, &(plist)->requests, sched.link)
> -
> -#define priolist_for_each_request_consume(it, n, plist) \
> -	list_for_each_entry_safe(it, n, &(plist)->requests, sched.link)
> -
>   void i915_sched_node_init(struct i915_sched_node *node);
>   void i915_sched_node_reinit(struct i915_sched_node *node);
>   
> @@ -100,7 +94,7 @@ static inline void i915_priolist_free(struct i915_priolist *p)
>   
>   static inline bool i915_sched_is_idle(const struct i915_sched *se)
>   {
> -	return RB_EMPTY_ROOT(&se->queue.rb_root);
> +	return i915_priolist_is_empty(&se->queue);
>   }
>   
>   static inline bool
> @@ -168,6 +162,14 @@ i915_sched_get_active_request(const struct i915_sched *se)
>   	return NULL;
>   }
>   
> +/* Walk the scheduler queue of requests (in submission order) and remove them */
> +struct i915_priolist *__i915_sched_dequeue_next(struct i915_sched *se);
> +#define i915_sched_dequeue(se, pl, rq, rn) \
> +	for ((pl) = (se)->queue.sentinel.next[0]; \
> +	     (pl) != &(se)->queue.sentinel; \
> +	     (pl) = __i915_sched_dequeue_next(se)) \
> +		priolist_for_each_request_safe(rq, rn, pl)
> +
>   void i915_request_show_with_schedule(struct drm_printer *m,
>   				     const struct i915_request *rq,
>   				     const char *prefix,
> diff --git a/drivers/gpu/drm/i915/i915_scheduler_types.h b/drivers/gpu/drm/i915/i915_scheduler_types.h
> index 5ca2dc1b4fb5..bc668f375097 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler_types.h
> +++ b/drivers/gpu/drm/i915/i915_scheduler_types.h
> @@ -115,7 +115,7 @@ struct i915_sched {
>   	 * @queue is only used to transfer requests from the scheduler
>   	 * frontend to the back.
>   	 */
> -	struct rb_root_cached queue;
> +	struct i915_priolist_root queue;
>   
>   	/**
>   	 * @tasklet: softirq tasklet for bottom half
> diff --git a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
> index 3db34d3eea58..946c93441c1f 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
> +++ b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
> @@ -25,6 +25,7 @@ selftest(ring, intel_ring_mock_selftests)
>   selftest(engine, intel_engine_cs_mock_selftests)
>   selftest(timelines, intel_timeline_mock_selftests)
>   selftest(requests, i915_request_mock_selftests)
> +selftest(scheduler, i915_scheduler_mock_selftests)
>   selftest(objects, i915_gem_object_mock_selftests)
>   selftest(phys, i915_gem_phys_mock_selftests)
>   selftest(dmabuf, i915_gem_dmabuf_mock_selftests)
> diff --git a/drivers/gpu/drm/i915/selftests/i915_scheduler.c b/drivers/gpu/drm/i915/selftests/i915_scheduler.c
> index f54bdbeaa48b..2bb2d3d07d06 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_scheduler.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_scheduler.c
> @@ -12,6 +12,54 @@
>   #include "selftests/igt_spinner.h"
>   #include "selftests/i915_random.h"
>   
> +static int mock_skiplist_levels(void *dummy)
> +{
> +	struct i915_priolist_root root = {};
> +	struct i915_priolist *pl = &root.sentinel;
> +	IGT_TIMEOUT(end_time);
> +	unsigned long total;
> +	int count, lvl;
> +
> +	total = 0;
> +	do {
> +		for (count = 0; count < 16384; count++) {
> +			lvl = random_level(&root);
> +			if (lvl > pl->level) {
> +				if (lvl < I915_PRIOLIST_HEIGHT - 1)
> +					lvl = ++pl->level;
> +				else
> +					lvl = I915_PRIOLIST_HEIGHT - 1;
> +			}
> +
> +			pl->next[lvl] = ptr_inc(pl->next[lvl]);
> +		}
> +		total += count;
> +	} while (!__igt_timeout(end_time, NULL));
> +
> +	pr_info("Total %9lu\n", total);
> +	for (lvl = 0; lvl <= pl->level; lvl++) {
> +		int x = ilog2((unsigned long)pl->next[lvl]);
> +		char row[80];
> +
> +		memset(row, '*', x);
> +		row[x] = '\0';
> +
> +		pr_info(" [%2d] %9lu %s\n",
> +			lvl, (unsigned long)pl->next[lvl], row);
> +	}
> +
> +	return 0;
> +}
> +
> +int i915_scheduler_mock_selftests(void)
> +{
> +	static const struct i915_subtest tests[] = {
> +		SUBTEST(mock_skiplist_levels),
> +	};
> +
> +	return i915_subtests(tests, NULL);
> +}
> +
>   static void scheduling_disable(struct intel_engine_cs *engine)
>   {
>   	engine->props.preempt_timeout_ms = 0;
> @@ -80,9 +128,9 @@ static int all_engines(struct drm_i915_private *i915,
>   static bool check_context_order(struct i915_sched *se)
>   {
>   	u64 last_seqno, last_context;
> +	struct i915_priolist *p;
>   	unsigned long count;
>   	bool result = false;
> -	struct rb_node *rb;
>   	int last_prio;
>   
>   	/* We expect the execution order to follow ascending fence-context */
> @@ -92,8 +140,7 @@ static bool check_context_order(struct i915_sched *se)
>   	last_context = 0;
>   	last_seqno = 0;
>   	last_prio = 0;
> -	for (rb = rb_first_cached(&se->queue); rb; rb = rb_next(rb)) {
> -		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
> +	for_each_priolist(p, &se->queue) {
>   		struct i915_request *rq;
>   
>   		priolist_for_each_request(rq, p) {
>