[Intel-gfx] [PATCH 09/31] drm/i915: Replace priolist rbtree with a skiplist
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Mon Feb 8 12:29:14 UTC 2021
On 08/02/2021 10:52, Chris Wilson wrote:
> Replace the priolist rbtree with a skiplist. The crucial difference is
> that walking and removing the first element of a skiplist is O(1), but
> O(lgN) for an rbtree, as we need to rebalance on remove. This is a
> hindrance for submission latency as it occurs between picking a request
> for the priolist and submitting it to hardware, as well effectively
> tripling the number of O(lgN) operations required under the irqoff lock.
> This is critical to reducing the latency jitter with multiple clients.
>
> The downsides to skiplists are that lookup/insertion is only
> probabilistically O(lgN) and there is a significant memory penalty to
> as each skip node is larger than the rbtree equivalent. Furthermore, we
> don't use dynamic arrays for the skiplist, so the allocation is fixed,
> and imposes an upper bound on the scalability wrt to the number of
> inflight requests.
>
> In the following patches, we introduce a new sort key to the scheduler,
> a virtual deadline. This imposes a different structure to the tree.
> Using a priority sort, we have very few priority levels active at any
> time, most likely just the default priority and so the rbtree degenerates
> to a single elements containing the list of all ready requests. The
> deadlines in contrast are very sparse, and typically each request has a
> unique deadline. Instead of being able to simply walk the list during
> dequeue, with the deadline scheduler we have to iterate through the bst
> on the critical submission path. Skiplists are vastly superior in this
> instance due to the O(1) iteration during dequeue, with very similar
> characteristics [on average] to the rbtree for insertion.
>
> This means that by using skiplists we can introduce a sparse sort key
> without degrading latency on the critical submission path.
>
> As an example, one simple case where we try to do lots of
> semi-independent work without any priority management (gem_exec_parallel),
> the lock hold times were:
> [worst] [total] [avg]
> 973.05 6301584.84 0.35 # plain rbtree
> 559.82 5424915.25 0.33 # best rbtree with pruning
> 208.21 3898784.09 0.24 # skiplist
> 34.05 5784106.01 0.32 # rbtree without deadlines
> 23.35 4152999.80 0.24 # skiplist without deadlines
>
> Based on the skiplist implementation by Dr Con Kolivas for MuQSS.
>
> References: https://en.wikipedia.org/wiki/Skip_list
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
> .../drm/i915/gt/intel_execlists_submission.c | 168 +++++-----
> .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 41 +--
> drivers/gpu/drm/i915/i915_priolist_types.h | 64 +++-
> drivers/gpu/drm/i915/i915_scheduler.c | 304 +++++++++++++-----
> drivers/gpu/drm/i915/i915_scheduler.h | 16 +-
> drivers/gpu/drm/i915/i915_scheduler_types.h | 2 +-
> .../drm/i915/selftests/i915_mock_selftests.h | 1 +
> .../gpu/drm/i915/selftests/i915_scheduler.c | 53 ++-
> 8 files changed, 454 insertions(+), 195 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> index 78fda9b4f626..4a0258347c10 100644
> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> @@ -254,11 +254,6 @@ static void ring_set_paused(const struct intel_engine_cs *engine, int state)
> wmb();
> }
>
> -static struct i915_priolist *to_priolist(struct rb_node *rb)
> -{
> - return rb_entry(rb, struct i915_priolist, node);
> -}
> -
> static int rq_prio(const struct i915_request *rq)
> {
> return READ_ONCE(rq->sched.attr.priority);
> @@ -282,15 +277,27 @@ static int effective_prio(const struct i915_request *rq)
> return prio;
> }
>
> +static struct i915_request *first_request(const struct i915_sched *se)
> +{
> + struct i915_priolist *pl = se->queue.sentinel.next[0];
> +
> + if (pl == &se->queue.sentinel)
> + return NULL;
> +
> + return list_first_entry_or_null(&pl->requests,
> + struct i915_request,
> + sched.link);
> +}
> +
> static int queue_prio(const struct i915_sched *se)
> {
> - struct rb_node *rb;
> + struct i915_request *rq;
>
> - rb = rb_first_cached(&se->queue);
> - if (!rb)
> + rq = first_request(se);
> + if (!rq)
> return INT_MIN;
>
> - return to_priolist(rb)->priority;
> + return rq_prio(rq);
> }
>
> static int virtual_prio(const struct intel_engine_execlists *el)
> @@ -300,7 +307,7 @@ static int virtual_prio(const struct intel_engine_execlists *el)
> return rb ? rb_entry(rb, struct ve_node, rb)->prio : INT_MIN;
> }
>
> -static bool need_preempt(const struct intel_engine_cs *engine,
> +static bool need_preempt(struct intel_engine_cs *engine,
> const struct i915_request *rq)
> {
> const struct i915_sched *se = &engine->sched;
> @@ -1144,7 +1151,9 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> struct i915_request **port = execlists->pending;
> struct i915_request ** const last_port = port + execlists->port_mask;
> struct i915_request *last, * const *active;
> + struct i915_request *rq, *rn;
> struct virtual_engine *ve;
> + struct i915_priolist *pl;
> struct rb_node *rb;
> bool submit = false;
>
> @@ -1355,87 +1364,79 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> break;
> }
>
> - while ((rb = rb_first_cached(&se->queue))) {
> - struct i915_priolist *p = to_priolist(rb);
> - struct i915_request *rq, *rn;
> + i915_sched_dequeue(se, pl, rq, rn) {
> + bool merge = true;
>
> - priolist_for_each_request_consume(rq, rn, p) {
> - bool merge = true;
> + /*
> + * Can we combine this request with the current port?
> + * It has to be the same context/ringbuffer and not
> + * have any exceptions (e.g. GVT saying never to
> + * combine contexts).
> + *
> + * If we can combine the requests, we can execute both
> + * by updating the RING_TAIL to point to the end of the
> + * second request, and so we never need to tell the
> + * hardware about the first.
> + */
> + if (last && !can_merge_rq(last, rq)) {
> + /*
> + * If we are on the second port and cannot
> + * combine this request with the last, then we
> + * are done.
> + */
> + if (port == last_port)
> + goto done;
>
> /*
> - * Can we combine this request with the current port?
> - * It has to be the same context/ringbuffer and not
> - * have any exceptions (e.g. GVT saying never to
> - * combine contexts).
> - *
> - * If we can combine the requests, we can execute both
> - * by updating the RING_TAIL to point to the end of the
> - * second request, and so we never need to tell the
> - * hardware about the first.
> + * We must not populate both ELSP[] with the
> + * same LRCA, i.e. we must submit 2 different
> + * contexts if we submit 2 ELSP.
> */
> - if (last && !can_merge_rq(last, rq)) {
> - /*
> - * If we are on the second port and cannot
> - * combine this request with the last, then we
> - * are done.
> - */
> - if (port == last_port)
> - goto done;
> + if (last->context == rq->context)
> + goto done;
>
> - /*
> - * We must not populate both ELSP[] with the
> - * same LRCA, i.e. we must submit 2 different
> - * contexts if we submit 2 ELSP.
> - */
> - if (last->context == rq->context)
> - goto done;
> + if (i915_request_has_sentinel(last))
> + goto done;
>
> - if (i915_request_has_sentinel(last))
> - goto done;
> + /*
> + * We avoid submitting virtual requests into
> + * the secondary ports so that we can migrate
> + * the request immediately to another engine
> + * rather than wait for the primary request.
> + */
> + if (rq->execution_mask != engine->mask)
> + goto done;
>
> - /*
> - * We avoid submitting virtual requests into
> - * the secondary ports so that we can migrate
> - * the request immediately to another engine
> - * rather than wait for the primary request.
> - */
> - if (rq->execution_mask != engine->mask)
> - goto done;
> + /*
> + * If GVT overrides us we only ever submit
> + * port[0], leaving port[1] empty. Note that we
> + * also have to be careful that we don't queue
> + * the same context (even though a different
> + * request) to the second port.
> + */
> + if (ctx_single_port_submission(last->context) ||
> + ctx_single_port_submission(rq->context))
> + goto done;
>
> - /*
> - * If GVT overrides us we only ever submit
> - * port[0], leaving port[1] empty. Note that we
> - * also have to be careful that we don't queue
> - * the same context (even though a different
> - * request) to the second port.
> - */
> - if (ctx_single_port_submission(last->context) ||
> - ctx_single_port_submission(rq->context))
> - goto done;
> -
> - merge = false;
> - }
> -
> - if (__i915_request_submit(rq)) {
> - if (!merge) {
> - *port++ = i915_request_get(last);
> - last = NULL;
> - }
> -
> - GEM_BUG_ON(last &&
> - !can_merge_ctx(last->context,
> - rq->context));
> - GEM_BUG_ON(last &&
> - i915_seqno_passed(last->fence.seqno,
> - rq->fence.seqno));
> -
> - submit = true;
> - last = rq;
> - }
> + merge = false;
> }
>
> - rb_erase_cached(&p->node, &se->queue);
> - i915_priolist_free(p);
> + if (__i915_request_submit(rq)) {
> + if (!merge) {
> + *port++ = i915_request_get(last);
> + last = NULL;
> + }
> +
> + GEM_BUG_ON(last &&
> + !can_merge_ctx(last->context,
> + rq->context));
> + GEM_BUG_ON(last &&
> + i915_seqno_passed(last->fence.seqno,
> + rq->fence.seqno));
> +
> + submit = true;
> + last = rq;
> + }
> }
> done:
> *port++ = i915_request_get(last);
> @@ -1456,7 +1457,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> * request triggering preemption on the next dequeue (or subsequent
> * interrupt for secondary ports).
> */
> - execlists->queue_priority_hint = queue_prio(se);
> + execlists->queue_priority_hint = pl->priority;
> spin_unlock(&se->lock);
>
> /*
> @@ -2716,7 +2717,6 @@ static void execlists_reset_cancel(struct intel_engine_cs *engine)
> }
>
> execlists->queue_priority_hint = INT_MIN;
> - se->queue = RB_ROOT_CACHED;
>
> GEM_BUG_ON(__tasklet_is_enabled(&se->tasklet));
> se->tasklet.callback = nop_submission_tasklet;
> @@ -3173,6 +3173,8 @@ static void virtual_context_exit(struct intel_context *ce)
>
> for (n = 0; n < ve->num_siblings; n++)
> intel_engine_pm_put(ve->siblings[n]);
> +
> + i915_sched_park(intel_engine_get_scheduler(&ve->base));
> }
>
> static const struct intel_context_ops virtual_context_ops = {
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index d14b9db77df8..c16393df42a0 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -60,11 +60,6 @@
>
> #define GUC_REQUEST_SIZE 64 /* bytes */
>
> -static inline struct i915_priolist *to_priolist(struct rb_node *rb)
> -{
> - return rb_entry(rb, struct i915_priolist, node);
> -}
> -
> static struct guc_stage_desc *__get_stage_desc(struct intel_guc *guc, u32 id)
> {
> struct guc_stage_desc *base = guc->stage_desc_pool_vaddr;
> @@ -186,9 +181,10 @@ static void __guc_dequeue(struct intel_engine_cs *engine)
> struct i915_request **first = execlists->inflight;
> struct i915_request ** const last_port = first + execlists->port_mask;
> struct i915_request *last = first[0];
> + struct i915_request *rq, *rn;
> struct i915_request **port;
> + struct i915_priolist *pl;
> bool submit = false;
> - struct rb_node *rb;
>
> lockdep_assert_held(&se->lock);
>
> @@ -205,32 +201,22 @@ static void __guc_dequeue(struct intel_engine_cs *engine)
> * event.
> */
> port = first;
> - while ((rb = rb_first_cached(&se->queue))) {
> - struct i915_priolist *p = to_priolist(rb);
> - struct i915_request *rq, *rn;
> + i915_sched_dequeue(se, pl, rq, rn) {
> + if (last && rq->context != last->context) {
> + if (port == last_port)
> + goto done;
>
> - priolist_for_each_request_consume(rq, rn, p) {
> - if (last && rq->context != last->context) {
> - if (port == last_port)
> - goto done;
> -
> - *port = schedule_in(last,
> - port - execlists->inflight);
> - port++;
> - }
> -
> - list_del_init(&rq->sched.link);
> - __i915_request_submit(rq);
> - submit = true;
> - last = rq;
> + *port = schedule_in(last, port - execlists->inflight);
> + port++;
> }
>
> - rb_erase_cached(&p->node, &se->queue);
> - i915_priolist_free(p);
> + list_del_init(&rq->sched.link);
> + __i915_request_submit(rq);
> + submit = true;
> + last = rq;
> }
> done:
> - execlists->queue_priority_hint =
> - rb ? to_priolist(rb)->priority : INT_MIN;
> + execlists->queue_priority_hint = pl->priority;
> if (submit) {
> *port = schedule_in(last, port - execlists->inflight);
> *++port = NULL;
> @@ -361,7 +347,6 @@ static void guc_reset_cancel(struct intel_engine_cs *engine)
> __i915_sched_cancel_queue(se);
>
> execlists->queue_priority_hint = INT_MIN;
> - se->queue = RB_ROOT_CACHED;
>
> spin_unlock_irqrestore(&se->lock, flags);
> intel_engine_signal_breadcrumbs(engine);
> diff --git a/drivers/gpu/drm/i915/i915_priolist_types.h b/drivers/gpu/drm/i915/i915_priolist_types.h
> index bc2fa84f98a8..ee7482b9c813 100644
> --- a/drivers/gpu/drm/i915/i915_priolist_types.h
> +++ b/drivers/gpu/drm/i915/i915_priolist_types.h
> @@ -38,10 +38,72 @@ enum {
> #define I915_PRIORITY_UNPREEMPTABLE INT_MAX
> #define I915_PRIORITY_BARRIER (I915_PRIORITY_UNPREEMPTABLE - 1)
>
> +/*
> + * The slab returns power-of-two chunks of memory, so fill out the
> + * node to the next cacheline.
> + *
> + * We can estimate how many requests the skiplist will scale to based
> + * on its height:
> + * 11 => 4 million requests
> + * 12 => 16 million requests
> + */
> +#ifdef CONFIG_64BIT
> +#define I915_PRIOLIST_HEIGHT 12
> +#else
> +#define I915_PRIOLIST_HEIGHT 11
> +#endif
> +
> +/*
> + * i915_priolist forms a skiplist. The skiplist is built in layers,
> + * starting at the base [0] is a singly linked list of all i915_priolist.
> + * Each higher layer contains a fraction of the i915_priolist from the
> + * previous layer:
> + *
> + * S[0] 0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF S
> + * E[1] >1>3>5>7>9>B>D>F>1>3>5>7>9>B>D>F>1>3>5>7>9>B>D>F>1>3>5>7>9>B>D>F E
> + * N[2] -->3-->7-->B-->F-->3-->7-->B-->F-->3-->7-->B-->F-->3-->7-->B-->F N
> + * T[3] ------>7------>F-------7------>F------>7------>F------>7------>F T
> + * I[4] -------------->F-------------->F-------------->F-------------->F I
> + * N[5] ------------------------------>F------------------------------>F N
> + * E[6] ------------------------------>F-------------------------------> E
> + * L[7] ---------------------------------------------------------------> L
> + *
> + * To iterate through all active i915_priolist, we only need to follow
> + * the chain in i915_priolist.next[0] (see for_each_priolist()).
> + *
> + * To quickly find a specific key (or insert point), we can perform a binary
> + * search by starting at the highest level and following the linked list
> + * at that level until we either find the node, or have gone passed the key.
> + * Then we descend a level, and start walking the list again starting from
> + * the current position, until eventually we find our key, or we run out of
> + * levels.
> + *
> + * https://en.wikipedia.org/wiki/Skip_list
> + */
> struct i915_priolist {
> struct list_head requests;
> - struct rb_node node;
> int priority;
> +
> + int level;
> + struct i915_priolist *next[I915_PRIOLIST_HEIGHT];
> };
>
> +struct i915_priolist_root {
> + struct i915_priolist sentinel;
> + u32 prng;
> +};
> +
> +#define i915_priolist_is_empty(root) ((root)->sentinel.level < 0)
> +
> +#define for_each_priolist(p, root) \
> + for ((p) = (root)->sentinel.next[0]; \
> + (p) != &(root)->sentinel; \
> + (p) = (p)->next[0])
> +
> +#define priolist_for_each_request(it, plist) \
> + list_for_each_entry(it, &(plist)->requests, sched.link)
> +
> +#define priolist_for_each_request_safe(it, n, plist) \
> + list_for_each_entry_safe(it, n, &(plist)->requests, sched.link)
> +
> #endif /* _I915_PRIOLIST_TYPES_H_ */
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
> index 312e1538d001..518eac67959e 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.c
> +++ b/drivers/gpu/drm/i915/i915_scheduler.c
> @@ -4,7 +4,9 @@
> * Copyright © 2018 Intel Corporation
> */
>
> +#include <linux/bitops.h>
> #include <linux/mutex.h>
> +#include <linux/prandom.h>
>
> #include "gt/intel_ring.h"
> #include "gt/intel_lrc_reg.h"
> @@ -168,6 +170,16 @@ void i915_sched_select_mode(struct i915_sched *se, enum i915_sched_mode mode)
> }
> }
>
> +static void init_priolist(struct i915_priolist_root *const root)
> +{
> + struct i915_priolist *pl = &root->sentinel;
> +
> + memset_p((void **)pl->next, pl, ARRAY_SIZE(pl->next));
> + pl->requests.prev = NULL;
> + pl->priority = INT_MIN;
> + pl->level = -1;
> +}
> +
> void i915_sched_init(struct i915_sched *se,
> struct device *dev,
> const char *name,
> @@ -183,9 +195,9 @@ void i915_sched_init(struct i915_sched *se,
>
> se->mask = mask;
>
> + init_priolist(&se->queue);
> INIT_LIST_HEAD(&se->requests);
> INIT_LIST_HEAD(&se->hold);
> - se->queue = RB_ROOT_CACHED;
>
> init_ipi(&se->ipi);
>
> @@ -194,8 +206,60 @@ void i915_sched_init(struct i915_sched *se,
> se->revoke_context = i915_sched_default_revoke_context;
> }
>
> +__maybe_unused static bool priolist_idle(struct i915_priolist_root *root)
> +{
> + struct i915_priolist *pl = &root->sentinel;
> + int lvl;
> +
> + for (lvl = 0; lvl < ARRAY_SIZE(pl->next); lvl++) {
> + if (pl->next[lvl] != pl) {
> + GEM_TRACE_ERR("root[%d] is not empty\n", lvl);
> + return false;
> + }
> + }
> +
> + if (pl->level != -1) {
> + GEM_TRACE_ERR("root is not clear: %d\n", pl->level);
> + return false;
> + }
> +
> + return true;
> +}
> +
> +static bool pl_empty(struct list_head *st)
> +{
> + return !st->prev;
> +}
> +
> +static void pl_push(struct i915_priolist *pl, struct list_head *st)
> +{
> + /* Keep list_empty(&pl->requests) valid for concurrent readers */
> + pl->requests.prev = st->prev;
> + st->prev = &pl->requests;
> + GEM_BUG_ON(pl_empty(st));
> +}
> +
> +static struct i915_priolist *pl_pop(struct list_head *st)
> +{
> + struct i915_priolist *pl;
> +
> + GEM_BUG_ON(pl_empty(st));
> + pl = container_of(st->prev, typeof(*pl), requests);
> + st->prev = pl->requests.prev;
> +
> + return pl;
> +}
> +
> void i915_sched_park(struct i915_sched *se)
> {
> + struct i915_priolist_root *root = &se->queue;
> + struct list_head *list = &root->sentinel.requests;
> +
> + GEM_BUG_ON(!priolist_idle(root));
> +
> + while (!pl_empty(list))
> + kmem_cache_free(global.slab_priorities, pl_pop(list));
> +
> GEM_BUG_ON(!i915_sched_is_idle(se));
> se->no_priolist = false;
> }
> @@ -251,70 +315,71 @@ static inline bool node_signaled(const struct i915_sched_node *node)
> return i915_request_completed(node_to_request(node));
> }
>
> -static inline struct i915_priolist *to_priolist(struct rb_node *rb)
> +static inline unsigned int random_level(struct i915_priolist_root *root)
> {
> - return rb_entry(rb, struct i915_priolist, node);
> -}
> -
> -static void assert_priolists(struct i915_sched * const se)
> -{
> - struct rb_node *rb;
> - long last_prio;
> -
> - if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
> - return;
> -
> - GEM_BUG_ON(rb_first_cached(&se->queue) !=
> - rb_first(&se->queue.rb_root));
> -
> - last_prio = INT_MAX;
> - for (rb = rb_first_cached(&se->queue); rb; rb = rb_next(rb)) {
> - const struct i915_priolist *p = to_priolist(rb);
> -
> - GEM_BUG_ON(p->priority > last_prio);
> - last_prio = p->priority;
> - }
> + /*
> + * Given a uniform distribution of random numbers over the u32, then
> + * the probability each bit being unset is P=0.5. The probability of a
> + * successive sequence of bits being unset is P(n) = 0.5^n [n > 0].
> + * P(level:1) = 0.5
> + * P(level:2) = 0.25
> + * P(level:3) = 0.125
> + * P(level:4) = 0.0625
> + * ...
> + * So we can use ffs() on a good random number generator to pick our
> + * level. We divide by two to reduce the probability of choosing a
> + * level to .25, as the cost of descending a level is the same as
> + * following an extra link in the chain at that level (so we can
> + * pack more nodes into fewer levels without incurring extra cost,
> + * and allow scaling to higher volumes of requests without expanding
> + * the height of the skiplist).
> + */
> + root->prng = next_pseudo_random32(root->prng);
> + return __ffs(root->prng) / 2;
> }
>
> static struct list_head *
> lookup_priolist(struct i915_sched *se, int prio)
> {
> - struct i915_priolist *p;
> - struct rb_node **parent, *rb;
> - bool first = true;
> + struct i915_priolist *update[I915_PRIOLIST_HEIGHT];
> + struct i915_priolist_root *const root = &se->queue;
> + struct i915_priolist *pl, *tmp;
> + int lvl;
>
> lockdep_assert_held(&se->lock);
> - assert_priolists(se);
> -
> if (unlikely(se->no_priolist))
> prio = I915_PRIORITY_NORMAL;
>
> + for_each_priolist(pl, root) { /* recycle any empty elements before us */
> + if (pl->priority <= prio || !list_empty(&pl->requests))
> + break;
> +
> + __i915_sched_dequeue_next(se);
> + }
> +
> find_priolist:
> - /* most positive priority is scheduled first, equal priorities fifo */
> - rb = NULL;
> - parent = &se->queue.rb_root.rb_node;
> - while (*parent) {
> - rb = *parent;
> - p = to_priolist(rb);
> - if (prio > p->priority) {
> - parent = &rb->rb_left;
> - } else if (prio < p->priority) {
> - parent = &rb->rb_right;
> - first = false;
> - } else {
> - return &p->requests;
> - }
> + pl = &root->sentinel;
> + lvl = pl->level;
> + while (lvl >= 0) {
> + while (tmp = pl->next[lvl], tmp->priority >= prio)
> + pl = tmp;
> + if (pl->priority == prio)
> + goto out;
> + update[lvl--] = pl;
> }
>
> if (prio == I915_PRIORITY_NORMAL) {
> - p = &se->default_priolist;
> + pl = &se->default_priolist;
> + } else if (!pl_empty(&root->sentinel.requests)) {
> + pl = pl_pop(&root->sentinel.requests);
> } else {
> - p = kmem_cache_alloc(global.slab_priorities, GFP_ATOMIC);
> + pl = kmem_cache_alloc(global.slab_priorities, GFP_ATOMIC);
> /* Convert an allocation failure to a priority bump */
> - if (unlikely(!p)) {
> + if (unlikely(!pl)) {
> prio = I915_PRIORITY_NORMAL; /* recurses just once */
>
> - /* To maintain ordering with all rendering, after an
> + /*
> + * To maintain ordering with all rendering, after an
> * allocation failure we have to disable all scheduling.
> * Requests will then be executed in fifo, and schedule
> * will ensure that dependencies are emitted in fifo.
> @@ -327,18 +392,123 @@ lookup_priolist(struct i915_sched *se, int prio)
> }
> }
>
> - p->priority = prio;
> - INIT_LIST_HEAD(&p->requests);
> + pl->priority = prio;
> + INIT_LIST_HEAD(&pl->requests);
>
> - rb_link_node(&p->node, rb, parent);
> - rb_insert_color_cached(&p->node, &se->queue, first);
> + lvl = random_level(root);
> + if (lvl > root->sentinel.level) {
> + if (root->sentinel.level < I915_PRIOLIST_HEIGHT - 1) {
> + lvl = ++root->sentinel.level;
> + update[lvl] = &root->sentinel;
> + } else {
> + lvl = I915_PRIOLIST_HEIGHT - 1;
> + }
> + }
> + GEM_BUG_ON(lvl < 0);
> + GEM_BUG_ON(lvl >= ARRAY_SIZE(pl->next));
>
> - return &p->requests;
> + pl->level = lvl;
> + do {
> + tmp = update[lvl];
> + pl->next[lvl] = tmp->next[lvl];
> + tmp->next[lvl] = pl;
> + } while (--lvl >= 0);
> +
> + if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
> + struct i915_priolist *chk;
> +
> + chk = &root->sentinel;
> + lvl = chk->level;
> + do {
> + while (tmp = chk->next[lvl], tmp->priority >= prio)
> + chk = tmp;
> + } while (--lvl >= 0);
> +
> + GEM_BUG_ON(chk != pl);
> + }
> +
> +out:
> + GEM_BUG_ON(pl == &root->sentinel);
> + return &pl->requests;
> }
>
> -void __i915_priolist_free(struct i915_priolist *p)
> +static void __remove_priolist(struct i915_sched *se, struct list_head *plist)
> {
> - kmem_cache_free(global.slab_priorities, p);
> + struct i915_priolist_root *root = &se->queue;
> + struct i915_priolist *pl, *tmp;
> + struct i915_priolist *old =
> + container_of(plist, struct i915_priolist, requests);
> + int prio = old->priority;
> + int lvl;
> +
> + lockdep_assert_held(&se->lock);
> + GEM_BUG_ON(!list_empty(plist));
> +
> + pl = &root->sentinel;
> + lvl = pl->level;
> + GEM_BUG_ON(lvl < 0);
> +
> + if (prio != I915_PRIORITY_NORMAL)
> + pl_push(old, &pl->requests);
> +
> + do {
> + while (tmp = pl->next[lvl], tmp->priority > prio)
> + pl = tmp;
> + if (lvl <= old->level) {
> + pl->next[lvl] = old->next[lvl];
> + if (pl == &root->sentinel && old->next[lvl] == pl) {
> + GEM_BUG_ON(pl->level != lvl);
> + pl->level--;
> + }
> + }
> + } while (--lvl >= 0);
> + GEM_BUG_ON(tmp != old);
> +}
> +
> +static void remove_from_priolist(struct i915_sched *se,
> + struct i915_request *rq,
> + struct list_head *list,
> + bool tail)
> +{
> + struct list_head *prev = rq->sched.link.prev;
This depends on rq being at the head of it's list?
> +
> + GEM_BUG_ON(!i915_request_in_priority_queue(rq));
> +
> + __list_del_entry(&rq->sched.link);
> + if (tail)
> + list_add_tail(&rq->sched.link, list);
> + else
> + list_add(&rq->sched.link, list);
So it is more move than remove(_from_priolist) ?
> +
> + /* If we just removed the last element in the old plist, delete it */
> + if (list_empty(prev))
> + __remove_priolist(se, prev);
> +}
> +
> +struct i915_priolist *__i915_sched_dequeue_next(struct i915_sched *se)
> +{
> + struct i915_priolist * const s = &se->queue.sentinel;
> + struct i915_priolist *pl = s->next[0];
> + int lvl;
> +
> + GEM_BUG_ON(!list_empty(&pl->requests));
Lost as to why pl->requests has to be empty at this point. Considering:
+#define i915_sched_dequeue(se, pl, rq, rn) \
+ for ((pl) = (se)->queue.sentinel.next[0]; \
+ (pl) != &(se)->queue.sentinel; \
+ (pl) = __i915_sched_dequeue_next(se)) \
+ priolist_for_each_request_safe(rq, rn, pl)
+
I also don't understand what it would de-queue. Whole priolist woth of
requests at a time? But it can't be empty to dequeue something. And who
puts any unconsumed requests back on somewhere in this case.
Regards,
Tvrtko
> + GEM_BUG_ON(pl == s);
> +
> + /* Keep pl->next[0] valid for for_each_priolist iteration */
> + if (pl->priority != I915_PRIORITY_NORMAL)
> + pl_push(pl, &s->requests);
> +
> + lvl = pl->level;
> + GEM_BUG_ON(lvl < 0);
> + do {
> + s->next[lvl] = pl->next[lvl];
> + if (pl->next[lvl] == s) {
> + GEM_BUG_ON(s->level != lvl);
> + s->level--;
> + }
> + } while (--lvl >= 0);
> +
> + return pl->next[0];
> }
>
> static struct i915_request *
> @@ -491,7 +661,7 @@ static void __i915_request_set_priority(struct i915_request *rq, int prio)
>
> GEM_BUG_ON(rq->engine != engine);
> if (i915_request_in_priority_queue(rq))
> - list_move_tail(&rq->sched.link, plist);
> + remove_from_priolist(se, rq, plist, true);
>
> /* Defer (tasklet) submission until after all updates. */
> kick_submission(engine, rq, prio);
> @@ -627,8 +797,7 @@ void __i915_sched_defer_request(struct intel_engine_cs *engine,
>
> /* Note list is reversed for waiters wrt signal hierarchy */
> GEM_BUG_ON(rq->engine != engine);
> - GEM_BUG_ON(!i915_request_in_priority_queue(rq));
> - list_move(&rq->sched.link, &dfs);
> + remove_from_priolist(se, rq, &dfs, false);
>
> /* Track our visit, and prevent duplicate processing */
> clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
> @@ -927,7 +1096,7 @@ void i915_sched_resume_request(struct intel_engine_cs *engine,
> void __i915_sched_cancel_queue(struct i915_sched *se)
> {
> struct i915_request *rq, *rn;
> - struct rb_node *rb;
> + struct i915_priolist *pl;
>
> lockdep_assert_held(&se->lock);
>
> @@ -936,16 +1105,9 @@ void __i915_sched_cancel_queue(struct i915_sched *se)
> i915_request_put(i915_request_mark_eio(rq));
>
> /* Flush the queued requests to the timeline list (for retiring). */
> - while ((rb = rb_first_cached(&se->queue))) {
> - struct i915_priolist *p = to_priolist(rb);
> -
> - priolist_for_each_request_consume(rq, rn, p) {
> - i915_request_put(i915_request_mark_eio(rq));
> - __i915_request_submit(rq);
> - }
> -
> - rb_erase_cached(&p->node, &se->queue);
> - i915_priolist_free(p);
> + i915_sched_dequeue(se, pl, rq, rn) {
> + i915_request_put(i915_request_mark_eio(rq));
> + __i915_request_submit(rq);
> }
> GEM_BUG_ON(!i915_sched_is_idle(se));
>
> @@ -1225,9 +1387,9 @@ void i915_sched_show(struct drm_printer *m,
> unsigned int max)
> {
> const struct i915_request *rq, *last;
> + struct i915_priolist *pl;
> unsigned long flags;
> unsigned int count;
> - struct rb_node *rb;
>
> rcu_read_lock();
> spin_lock_irqsave(&se->lock, flags);
> @@ -1282,10 +1444,8 @@ void i915_sched_show(struct drm_printer *m,
>
> last = NULL;
> count = 0;
> - for (rb = rb_first_cached(&se->queue); rb; rb = rb_next(rb)) {
> - struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
> -
> - priolist_for_each_request(rq, p) {
> + for_each_priolist(pl, &se->queue) {
> + priolist_for_each_request(rq, pl) {
> if (count++ < max - 1)
> show_request(m, rq, "\t", 0);
> else
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.h b/drivers/gpu/drm/i915/i915_scheduler.h
> index fe392109b112..872d221f6ba7 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.h
> +++ b/drivers/gpu/drm/i915/i915_scheduler.h
> @@ -24,12 +24,6 @@ struct intel_engine_cs;
> ##__VA_ARGS__); \
> } while (0)
>
> -#define priolist_for_each_request(it, plist) \
> - list_for_each_entry(it, &(plist)->requests, sched.link)
> -
> -#define priolist_for_each_request_consume(it, n, plist) \
> - list_for_each_entry_safe(it, n, &(plist)->requests, sched.link)
> -
> void i915_sched_node_init(struct i915_sched_node *node);
> void i915_sched_node_reinit(struct i915_sched_node *node);
>
> @@ -100,7 +94,7 @@ static inline void i915_priolist_free(struct i915_priolist *p)
>
> static inline bool i915_sched_is_idle(const struct i915_sched *se)
> {
> - return RB_EMPTY_ROOT(&se->queue.rb_root);
> + return i915_priolist_is_empty(&se->queue);
> }
>
> static inline bool
> @@ -168,6 +162,14 @@ i915_sched_get_active_request(const struct i915_sched *se)
> return NULL;
> }
>
> +/* Walk the scheduler queue of requests (in submission order) and remove them */
> +struct i915_priolist *__i915_sched_dequeue_next(struct i915_sched *se);
> +#define i915_sched_dequeue(se, pl, rq, rn) \
> + for ((pl) = (se)->queue.sentinel.next[0]; \
> + (pl) != &(se)->queue.sentinel; \
> + (pl) = __i915_sched_dequeue_next(se)) \
> + priolist_for_each_request_safe(rq, rn, pl)
> +
> void i915_request_show_with_schedule(struct drm_printer *m,
> const struct i915_request *rq,
> const char *prefix,
> diff --git a/drivers/gpu/drm/i915/i915_scheduler_types.h b/drivers/gpu/drm/i915/i915_scheduler_types.h
> index 5ca2dc1b4fb5..bc668f375097 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler_types.h
> +++ b/drivers/gpu/drm/i915/i915_scheduler_types.h
> @@ -115,7 +115,7 @@ struct i915_sched {
> * @queue is only used to transfer requests from the scheduler
> * frontend to the back.
> */
> - struct rb_root_cached queue;
> + struct i915_priolist_root queue;
>
> /**
> * @tasklet: softirq tasklet for bottom half
> diff --git a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
> index 3db34d3eea58..946c93441c1f 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
> +++ b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
> @@ -25,6 +25,7 @@ selftest(ring, intel_ring_mock_selftests)
> selftest(engine, intel_engine_cs_mock_selftests)
> selftest(timelines, intel_timeline_mock_selftests)
> selftest(requests, i915_request_mock_selftests)
> +selftest(scheduler, i915_scheduler_mock_selftests)
> selftest(objects, i915_gem_object_mock_selftests)
> selftest(phys, i915_gem_phys_mock_selftests)
> selftest(dmabuf, i915_gem_dmabuf_mock_selftests)
> diff --git a/drivers/gpu/drm/i915/selftests/i915_scheduler.c b/drivers/gpu/drm/i915/selftests/i915_scheduler.c
> index f54bdbeaa48b..2bb2d3d07d06 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_scheduler.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_scheduler.c
> @@ -12,6 +12,54 @@
> #include "selftests/igt_spinner.h"
> #include "selftests/i915_random.h"
>
> +static int mock_skiplist_levels(void *dummy)
> +{
> + struct i915_priolist_root root = {};
> + struct i915_priolist *pl = &root.sentinel;
> + IGT_TIMEOUT(end_time);
> + unsigned long total;
> + int count, lvl;
> +
> + total = 0;
> + do {
> + for (count = 0; count < 16384; count++) {
> + lvl = random_level(&root);
> + if (lvl > pl->level) {
> + if (lvl < I915_PRIOLIST_HEIGHT - 1)
> + lvl = ++pl->level;
> + else
> + lvl = I915_PRIOLIST_HEIGHT - 1;
> + }
> +
> + pl->next[lvl] = ptr_inc(pl->next[lvl]);
> + }
> + total += count;
> + } while (!__igt_timeout(end_time, NULL));
> +
> + pr_info("Total %9lu\n", total);
> + for (lvl = 0; lvl <= pl->level; lvl++) {
> + int x = ilog2((unsigned long)pl->next[lvl]);
> + char row[80];
> +
> + memset(row, '*', x);
> + row[x] = '\0';
> +
> + pr_info(" [%2d] %9lu %s\n",
> + lvl, (unsigned long)pl->next[lvl], row);
> + }
> +
> + return 0;
> +}
> +
> +int i915_scheduler_mock_selftests(void)
> +{
> + static const struct i915_subtest tests[] = {
> + SUBTEST(mock_skiplist_levels),
> + };
> +
> + return i915_subtests(tests, NULL);
> +}
> +
> static void scheduling_disable(struct intel_engine_cs *engine)
> {
> engine->props.preempt_timeout_ms = 0;
> @@ -80,9 +128,9 @@ static int all_engines(struct drm_i915_private *i915,
> static bool check_context_order(struct i915_sched *se)
> {
> u64 last_seqno, last_context;
> + struct i915_priolist *p;
> unsigned long count;
> bool result = false;
> - struct rb_node *rb;
> int last_prio;
>
> /* We expect the execution order to follow ascending fence-context */
> @@ -92,8 +140,7 @@ static bool check_context_order(struct i915_sched *se)
> last_context = 0;
> last_seqno = 0;
> last_prio = 0;
> - for (rb = rb_first_cached(&se->queue); rb; rb = rb_next(rb)) {
> - struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
> + for_each_priolist(p, &se->queue) {
> struct i915_request *rq;
>
> priolist_for_each_request(rq, p) {
>
More information about the Intel-gfx
mailing list