[Intel-gfx] [PATCH 14/40] drm/i915: Combine multiple internal plists into the same i915_priolist bucket

Tvrtko Ursulin tvrtko.ursulin at linux.intel.com
Mon Sep 24 10:25:37 UTC 2018


On 19/09/2018 20:55, Chris Wilson wrote:
> As we are about to allow ourselves to slightly bump the user priority
> into a few different sublevels, pack those internal priority lists
> into the same i915_priolist to keep the rbtree compact and avoid having
> to allocate the default user priority even after the internal bumping.
> The downside to having a requests[] rather than a node per active list,
> is that we then have to walk over the empty higher priority lists. To
> compensate, we track the active buckets and use a small bitmap to skip
> over any inactive ones.
> 
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/intel_engine_cs.c      |  6 +-
>   drivers/gpu/drm/i915/intel_guc_submission.c | 12 ++-
>   drivers/gpu/drm/i915/intel_lrc.c            | 87 ++++++++++++++-------
>   drivers/gpu/drm/i915/intel_ringbuffer.h     | 13 ++-
>   4 files changed, 80 insertions(+), 38 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
> index 217ed3ee1cab..83f2f7774c1f 100644
> --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> @@ -1534,10 +1534,10 @@ void intel_engine_dump(struct intel_engine_cs *engine,
>   	count = 0;
>   	drm_printf(m, "\t\tQueue priority: %d\n", execlists->queue_priority);
>   	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
> -		struct i915_priolist *p =
> -			rb_entry(rb, typeof(*p), node);
> +		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
> +		int i;
>   
> -		list_for_each_entry(rq, &p->requests, sched.link) {
> +		priolist_for_each_request(rq, p, i) {
>   			if (count++ < MAX_REQUESTS_TO_SHOW - 1)
>   				print_request(m, rq, "\t\tQ ");
>   			else
> diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
> index 6f693ef62c64..8531bd917ec3 100644
> --- a/drivers/gpu/drm/i915/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/intel_guc_submission.c
> @@ -726,30 +726,28 @@ static bool __guc_dequeue(struct intel_engine_cs *engine)
>   	while ((rb = rb_first_cached(&execlists->queue))) {
>   		struct i915_priolist *p = to_priolist(rb);
>   		struct i915_request *rq, *rn;
> +		int i;
>   
> -		list_for_each_entry_safe(rq, rn, &p->requests, sched.link) {
> +		priolist_for_each_request_consume(rq, rn, p, i) {

Hm, the consume variant clears the bitmask bit every time round the 
outer loop, but we are not certain yet that we will actually consume 
the requests.
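
For reference, my reading of how the consume loop expands in this 
function (a rough open-coded sketch, not part of the patch):

	while ((i = ffs(p->used))) {
		struct list_head *bucket = &p->requests[i - 1];

		list_for_each_entry_safe(rq, rn, bucket, sched.link) {
			if (last && rq->hw_context != last->hw_context) {
				if (port == last_port)
					goto done; /* bit i - 1 is left set */

				if (submit)
					port_assign(port, last);
				port++;
			}

			list_del_init(&rq->sched.link);
			__i915_request_submit(rq);
			trace_i915_request_in(rq, port_index(port, execlists));
			last = rq;
			submit = true;
		}

		p->used &= ~BIT(i - 1); /* only once the whole bucket was walked */
	}

If that reading is right, an early goto done leaves the bit set and the 
remaining requests on the bucket, so nothing is lost; worth confirming 
that is the intent.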

>   			if (last && rq->hw_context != last->hw_context) {
> -				if (port == last_port) {
> -					__list_del_many(&p->requests,
> -							&rq->sched.link);
> +				if (port == last_port)
>   					goto done;
> -				}
>   
>   				if (submit)
>   					port_assign(port, last);
>   				port++;
>   			}
>   
> -			INIT_LIST_HEAD(&rq->sched.link);
> +			list_del_init(&rq->sched.link);
>   
>   			__i915_request_submit(rq);
>   			trace_i915_request_in(rq, port_index(port, execlists));
> +
>   			last = rq;
>   			submit = true;
>   		}
>   
>   		rb_erase_cached(&p->node, &execlists->queue);
> -		INIT_LIST_HEAD(&p->requests);
>   		if (p->priority != I915_PRIORITY_NORMAL)
>   			kmem_cache_free(engine->i915->priorities, p);
>   	}
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index e8de250c3413..aeae82b5223c 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -259,14 +259,49 @@ intel_lr_context_descriptor_update(struct i915_gem_context *ctx,
>   	ce->lrc_desc = desc;
>   }
>   
> -static struct i915_priolist *
> +static void assert_priolists(struct intel_engine_execlists * const execlists,
> +			     int queue_priority)
> +{
> +	struct rb_node *rb;
> +	int last_prio, i;
> +
> +	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
> +		return;
> +
> +	GEM_BUG_ON(rb_first_cached(&execlists->queue) !=
> +		   rb_first(&execlists->queue.rb_root));
> +
> +	last_prio = (queue_priority >> I915_USER_PRIORITY_SHIFT) + 1;
> +
> +	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
> +		struct i915_priolist *p = to_priolist(rb);
> +
> +		GEM_BUG_ON(p->priority >= last_prio);
> +		last_prio = p->priority;
> +
> +		GEM_BUG_ON(!p->used);
> +		for (i = 0; i < ARRAY_SIZE(p->requests); i++) {
> +			if (list_empty(&p->requests[i]))
> +				continue;

Would it not also be interesting to assert that the bitmask bit is 
clear when the list is empty?
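
Something like this is what I had in mind (sketch only):

	for (i = 0; i < ARRAY_SIZE(p->requests); i++) {
		if (list_empty(&p->requests[i])) {
			/* also check the bookkeeping in the other direction */
			GEM_BUG_ON(p->used & BIT(i));
			continue;
		}

		GEM_BUG_ON(!(p->used & BIT(i)));
	}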

> +
> +			GEM_BUG_ON(!(p->used & BIT(i)));
> +		}
> +	}

In general, won't this be a tiny bit too expensive on debug builds and 
with priority/request stress tests? It walks absolutely all priority 
levels times the bucket count on every lookup and dequeue. Or maybe we 
don't have any tests at the moment which instantiate that many levels.

> +}
> +
> +static struct list_head *
>   lookup_priolist(struct intel_engine_cs *engine, int prio)
>   {
>   	struct intel_engine_execlists * const execlists = &engine->execlists;
>   	struct i915_priolist *p;
>   	struct rb_node **parent, *rb;
>   	bool first = true;
> +	int idx, i;
> +
> +	assert_priolists(execlists, INT_MAX);
>   
> +	/* buckets sorted from highest [in slot 0] to lowest priority */
> +	idx = I915_PRIORITY_COUNT - (prio & ~I915_PRIORITY_MASK) - 1;

0 - (prio & ~0) - 1 = -prio - 1, hm..?

Hm no.. with zero reserved internal bits I915_PRIORITY_COUNT is 
actually 1, so this evaluates to 1 - (prio & 0x0) - 1 = 0, ok phew..

It is a bit unintuitive for the count to be one when there are no 
reserved bits, although that is only a temporary stage in the series.
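
For my own sanity, the bucket index maths with a hypothetical two 
reserved internal bits (so I915_PRIORITY_COUNT == 4 and 
~I915_PRIORITY_MASK == 0x3) would be:

	internal bits (prio & ~I915_PRIORITY_MASK)   idx = 4 - bits - 1
	3                                            0  (highest, slot 0)
	2                                            1
	1                                            2
	0                                            3  (lowest)

i.e. the highest internal level lands in slot 0, matching the "buckets 
sorted from highest [in slot 0] to lowest priority" comment, and with 
today's zero reserved bits it degenerates to the single bucket at idx 0.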

> +	prio >>= I915_USER_PRIORITY_SHIFT;
>   	if (unlikely(execlists->no_priolist))
>   		prio = I915_PRIORITY_NORMAL;
>   
> @@ -283,7 +318,7 @@ lookup_priolist(struct intel_engine_cs *engine, int prio)
>   			parent = &rb->rb_right;
>   			first = false;
>   		} else {
> -			return p;
> +			goto out;
>   		}
>   	}
>   
> @@ -309,11 +344,15 @@ lookup_priolist(struct intel_engine_cs *engine, int prio)
>   	}
>   
>   	p->priority = prio;
> -	INIT_LIST_HEAD(&p->requests);
> +	for (i = 0; i < ARRAY_SIZE(p->requests); i++)
> +		INIT_LIST_HEAD(&p->requests[i]);
>   	rb_link_node(&p->node, rb, parent);
>   	rb_insert_color_cached(&p->node, &execlists->queue, first);
> +	p->used = 0;
>   
> -	return p;
> +out:
> +	p->used |= BIT(idx);
> +	return &p->requests[idx];
>   }
>   
>   static void unwind_wa_tail(struct i915_request *rq)
> @@ -325,7 +364,7 @@ static void unwind_wa_tail(struct i915_request *rq)
>   static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
>   {
>   	struct i915_request *rq, *rn;
> -	struct i915_priolist *uninitialized_var(p);
> +	struct list_head *uninitialized_var(pl);
>   	int last_prio = I915_PRIORITY_INVALID;
>   
>   	lockdep_assert_held(&engine->timeline.lock);
> @@ -342,12 +381,11 @@ static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
>   		GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
>   		if (rq_prio(rq) != last_prio) {
>   			last_prio = rq_prio(rq);
> -			p = lookup_priolist(engine, last_prio);
> +			pl = lookup_priolist(engine, last_prio);
>   		}
>   		GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
>   
> -		GEM_BUG_ON(p->priority != rq_prio(rq));
> -		list_add(&rq->sched.link, &p->requests);
> +		list_add(&rq->sched.link, pl);
>   	}
>   }
>   
> @@ -665,8 +703,9 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   	while ((rb = rb_first_cached(&execlists->queue))) {
>   		struct i915_priolist *p = to_priolist(rb);
>   		struct i915_request *rq, *rn;
> +		int i;
>   
> -		list_for_each_entry_safe(rq, rn, &p->requests, sched.link) {
> +		priolist_for_each_request_consume(rq, rn, p, i) {
>   			/*
>   			 * Can we combine this request with the current port?
>   			 * It has to be the same context/ringbuffer and not
> @@ -685,11 +724,8 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   				 * combine this request with the last, then we
>   				 * are done.
>   				 */
> -				if (port == last_port) {
> -					__list_del_many(&p->requests,
> -							&rq->sched.link);
> +				if (port == last_port)
>   					goto done;
> -				}
>   
>   				/*
>   				 * If GVT overrides us we only ever submit
> @@ -699,11 +735,8 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   				 * request) to the second port.
>   				 */
>   				if (ctx_single_port_submission(last->hw_context) ||
> -				    ctx_single_port_submission(rq->hw_context)) {
> -					__list_del_many(&p->requests,
> -							&rq->sched.link);
> +				    ctx_single_port_submission(rq->hw_context))
>   					goto done;
> -				}
>   
>   				GEM_BUG_ON(last->hw_context == rq->hw_context);
>   
> @@ -714,15 +747,16 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   				GEM_BUG_ON(port_isset(port));
>   			}
>   
> -			INIT_LIST_HEAD(&rq->sched.link);
> +			list_del_init(&rq->sched.link);
> +
>   			__i915_request_submit(rq);
>   			trace_i915_request_in(rq, port_index(port, execlists));
> +
>   			last = rq;
>   			submit = true;
>   		}
>   
>   		rb_erase_cached(&p->node, &execlists->queue);
> -		INIT_LIST_HEAD(&p->requests);
>   		if (p->priority != I915_PRIORITY_NORMAL)
>   			kmem_cache_free(engine->i915->priorities, p);
>   	}
> @@ -746,6 +780,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   	 */
>   	execlists->queue_priority =
>   		port != execlists->port ? rq_prio(last) : INT_MIN;
> +	assert_priolists(execlists, execlists->queue_priority);
>   
>   	if (submit) {
>   		port_assign(port, last);
> @@ -857,16 +892,16 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
>   	/* Flush the queued requests to the timeline list (for retiring). */
>   	while ((rb = rb_first_cached(&execlists->queue))) {
>   		struct i915_priolist *p = to_priolist(rb);
> +		int i;
>   
> -		list_for_each_entry_safe(rq, rn, &p->requests, sched.link) {
> -			INIT_LIST_HEAD(&rq->sched.link);
> +		priolist_for_each_request_consume(rq, rn, p, i) {
> +			list_del_init(&rq->sched.link);
>   
>   			dma_fence_set_error(&rq->fence, -EIO);
>   			__i915_request_submit(rq);
>   		}
>   
>   		rb_erase_cached(&p->node, &execlists->queue);
> -		INIT_LIST_HEAD(&p->requests);
>   		if (p->priority != I915_PRIORITY_NORMAL)
>   			kmem_cache_free(engine->i915->priorities, p);
>   	}
> @@ -1072,8 +1107,7 @@ static void queue_request(struct intel_engine_cs *engine,
>   			  struct i915_sched_node *node,
>   			  int prio)
>   {
> -	list_add_tail(&node->link,
> -		      &lookup_priolist(engine, prio)->requests);
> +	list_add_tail(&node->link, lookup_priolist(engine, prio));
>   }
>   
>   static void __update_queue(struct intel_engine_cs *engine, int prio)
> @@ -1143,7 +1177,7 @@ sched_lock_engine(struct i915_sched_node *node, struct intel_engine_cs *locked)
>   static void execlists_schedule(struct i915_request *request,
>   			       const struct i915_sched_attr *attr)
>   {
> -	struct i915_priolist *uninitialized_var(pl);
> +	struct list_head *uninitialized_var(pl);
>   	struct intel_engine_cs *engine, *last;
>   	struct i915_dependency *dep, *p;
>   	struct i915_dependency stack;
> @@ -1242,8 +1276,7 @@ static void execlists_schedule(struct i915_request *request,
>   				pl = lookup_priolist(engine, prio);
>   				last = engine;
>   			}
> -			GEM_BUG_ON(pl->priority != prio);
> -			list_move_tail(&node->link, &pl->requests);
> +			list_move_tail(&node->link, pl);
>   		} else {
>   			/*
>   			 * If the request is not in the priolist queue because
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 2dfa585712c2..1534de5bb852 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -190,11 +190,22 @@ enum intel_engine_id {
>   };
>   
>   struct i915_priolist {
> +	struct list_head requests[I915_PRIORITY_COUNT];
>   	struct rb_node node;
> -	struct list_head requests;
> +	unsigned long used;
>   	int priority;
>   };
>   
> +#define priolist_for_each_request(it, plist, idx) \
> +	for (idx = 0; idx < ARRAY_SIZE((plist)->requests); idx++) \
> +		list_for_each_entry(it, &(plist)->requests[idx], sched.link)
> +
> +#define priolist_for_each_request_consume(it, n, plist, idx) \
> +	for (; (idx = ffs((plist)->used)); (plist)->used &= ~BIT(idx - 1)) \
> +		list_for_each_entry_safe(it, n, \
> +					 &(plist)->requests[idx - 1], \
> +					 sched.link)
> +
>   struct st_preempt_hang {
>   	struct completion completion;
>   	bool inject_hang;
> 

Regards,

Tvrtko

