[Intel-gfx] [PATCH 4/7] drm/i915: Combine multiple internal plists into the same i915_priolist bucket

Tvrtko Ursulin tvrtko.ursulin at linux.intel.com
Tue Sep 25 09:48:44 UTC 2018


On 25/09/2018 09:32, Chris Wilson wrote:
> As we are about to allow ourselves to slightly bump the user priority
> into a few different sublevels, pack those internal priority lists
> into the same i915_priolist to keep the rbtree compact and avoid having
> to allocate the default user priority even after the internal bumping.
> The downside to having a requests[] rather than a node per active list,
> is that we then have to walk over the empty higher priority lists. To
> compensate, we track the active buckets and use a small bitmap to skip
> over any inactive ones.
> 
> v2: Use MASK of internal levels to simplify our usage.
> 
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
>   drivers/gpu/drm/i915/intel_engine_cs.c      |  6 +-
>   drivers/gpu/drm/i915/intel_guc_submission.c | 12 ++-
>   drivers/gpu/drm/i915/intel_lrc.c            | 87 ++++++++++++++-------
>   drivers/gpu/drm/i915/intel_ringbuffer.h     | 13 ++-
>   4 files changed, 80 insertions(+), 38 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
> index 217ed3ee1cab..83f2f7774c1f 100644
> --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> @@ -1534,10 +1534,10 @@ void intel_engine_dump(struct intel_engine_cs *engine,
>   	count = 0;
>   	drm_printf(m, "\t\tQueue priority: %d\n", execlists->queue_priority);
>   	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
> -		struct i915_priolist *p =
> -			rb_entry(rb, typeof(*p), node);
> +		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
> +		int i;
>   
> -		list_for_each_entry(rq, &p->requests, sched.link) {
> +		priolist_for_each_request(rq, p, i) {
>   			if (count++ < MAX_REQUESTS_TO_SHOW - 1)
>   				print_request(m, rq, "\t\tQ ");
>   			else
> diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
> index 4874a212754c..ac862b42f6a1 100644
> --- a/drivers/gpu/drm/i915/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/intel_guc_submission.c
> @@ -746,30 +746,28 @@ static bool __guc_dequeue(struct intel_engine_cs *engine)
>   	while ((rb = rb_first_cached(&execlists->queue))) {
>   		struct i915_priolist *p = to_priolist(rb);
>   		struct i915_request *rq, *rn;
> +		int i;
>   
> -		list_for_each_entry_safe(rq, rn, &p->requests, sched.link) {
> +		priolist_for_each_request_consume(rq, rn, p, i) {
>   			if (last && rq->hw_context != last->hw_context) {
> -				if (port == last_port) {
> -					__list_del_many(&p->requests,
> -							&rq->sched.link);
> +				if (port == last_port)
>   					goto done;
> -				}
>   
>   				if (submit)
>   					port_assign(port, last);
>   				port++;
>   			}
>   
> -			INIT_LIST_HEAD(&rq->sched.link);
> +			list_del_init(&rq->sched.link);
>   
>   			__i915_request_submit(rq);
>   			trace_i915_request_in(rq, port_index(port, execlists));
> +
>   			last = rq;
>   			submit = true;
>   		}
>   
>   		rb_erase_cached(&p->node, &execlists->queue);
> -		INIT_LIST_HEAD(&p->requests);
>   		if (p->priority != I915_PRIORITY_NORMAL)
>   			kmem_cache_free(engine->i915->priorities, p);
>   	}
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index e8de250c3413..b1b3f67d1120 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -259,14 +259,49 @@ intel_lr_context_descriptor_update(struct i915_gem_context *ctx,
>   	ce->lrc_desc = desc;
>   }
>   
> -static struct i915_priolist *
> +static void assert_priolists(struct intel_engine_execlists * const execlists,
> +			     int queue_priority)
> +{
> +	struct rb_node *rb;
> +	int last_prio, i;
> +
> +	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
> +		return;
> +
> +	GEM_BUG_ON(rb_first_cached(&execlists->queue) !=
> +		   rb_first(&execlists->queue.rb_root));
> +
> +	last_prio = (queue_priority >> I915_USER_PRIORITY_SHIFT) + 1;
> +	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
> +		struct i915_priolist *p = to_priolist(rb);
> +
> +		GEM_BUG_ON(p->priority >= last_prio);
> +		last_prio = p->priority;
> +
> +		GEM_BUG_ON(!p->used);
> +		for (i = 0; i < ARRAY_SIZE(p->requests); i++) {
> +			if (list_empty(&p->requests[i]))
> +				continue;
> +
> +			GEM_BUG_ON(!(p->used & BIT(i)));
> +		}
> +	}
> +}
> +
> +static struct list_head *
>   lookup_priolist(struct intel_engine_cs *engine, int prio)
>   {
>   	struct intel_engine_execlists * const execlists = &engine->execlists;
>   	struct i915_priolist *p;
>   	struct rb_node **parent, *rb;
>   	bool first = true;
> +	int idx, i;
> +
> +	assert_priolists(execlists, INT_MAX);
>   
> +	/* buckets sorted from highest [in slot 0] to lowest priority */
> +	idx = I915_PRIORITY_MASK - (prio & I915_PRIORITY_MASK);

How about:

#define I915_INTERNAL_PRIO(p) ((p) & I915_PRIORITY_MASK)

idx = I915_PRIORITY_COUNT - 1 - I915_INTERNAL_PRIO(prio); ?

Regards,

Tvrtko

> +	prio >>= I915_USER_PRIORITY_SHIFT;
>   	if (unlikely(execlists->no_priolist))
>   		prio = I915_PRIORITY_NORMAL;
>   
> @@ -283,7 +318,7 @@ lookup_priolist(struct intel_engine_cs *engine, int prio)
>   			parent = &rb->rb_right;
>   			first = false;
>   		} else {
> -			return p;
> +			goto out;
>   		}
>   	}
>   
> @@ -309,11 +344,15 @@ lookup_priolist(struct intel_engine_cs *engine, int prio)
>   	}
>   
>   	p->priority = prio;
> -	INIT_LIST_HEAD(&p->requests);
> +	for (i = 0; i < ARRAY_SIZE(p->requests); i++)
> +		INIT_LIST_HEAD(&p->requests[i]);
>   	rb_link_node(&p->node, rb, parent);
>   	rb_insert_color_cached(&p->node, &execlists->queue, first);
> +	p->used = 0;
>   
> -	return p;
> +out:
> +	p->used |= BIT(idx);
> +	return &p->requests[idx];
>   }
>   
>   static void unwind_wa_tail(struct i915_request *rq)
> @@ -325,7 +364,7 @@ static void unwind_wa_tail(struct i915_request *rq)
>   static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
>   {
>   	struct i915_request *rq, *rn;
> -	struct i915_priolist *uninitialized_var(p);
> +	struct list_head *uninitialized_var(pl);
>   	int last_prio = I915_PRIORITY_INVALID;
>   
>   	lockdep_assert_held(&engine->timeline.lock);
> @@ -342,12 +381,11 @@ static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
>   		GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
>   		if (rq_prio(rq) != last_prio) {
>   			last_prio = rq_prio(rq);
> -			p = lookup_priolist(engine, last_prio);
> +			pl = lookup_priolist(engine, last_prio);
>   		}
>   		GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
>   
> -		GEM_BUG_ON(p->priority != rq_prio(rq));
> -		list_add(&rq->sched.link, &p->requests);
> +		list_add(&rq->sched.link, pl);
>   	}
>   }
>   
> @@ -665,8 +703,9 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   	while ((rb = rb_first_cached(&execlists->queue))) {
>   		struct i915_priolist *p = to_priolist(rb);
>   		struct i915_request *rq, *rn;
> +		int i;
>   
> -		list_for_each_entry_safe(rq, rn, &p->requests, sched.link) {
> +		priolist_for_each_request_consume(rq, rn, p, i) {
>   			/*
>   			 * Can we combine this request with the current port?
>   			 * It has to be the same context/ringbuffer and not
> @@ -685,11 +724,8 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   				 * combine this request with the last, then we
>   				 * are done.
>   				 */
> -				if (port == last_port) {
> -					__list_del_many(&p->requests,
> -							&rq->sched.link);
> +				if (port == last_port)
>   					goto done;
> -				}
>   
>   				/*
>   				 * If GVT overrides us we only ever submit
> @@ -699,11 +735,8 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   				 * request) to the second port.
>   				 */
>   				if (ctx_single_port_submission(last->hw_context) ||
> -				    ctx_single_port_submission(rq->hw_context)) {
> -					__list_del_many(&p->requests,
> -							&rq->sched.link);
> +				    ctx_single_port_submission(rq->hw_context))
>   					goto done;
> -				}
>   
>   				GEM_BUG_ON(last->hw_context == rq->hw_context);
>   
> @@ -714,15 +747,16 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   				GEM_BUG_ON(port_isset(port));
>   			}
>   
> -			INIT_LIST_HEAD(&rq->sched.link);
> +			list_del_init(&rq->sched.link);
> +
>   			__i915_request_submit(rq);
>   			trace_i915_request_in(rq, port_index(port, execlists));
> +
>   			last = rq;
>   			submit = true;
>   		}
>   
>   		rb_erase_cached(&p->node, &execlists->queue);
> -		INIT_LIST_HEAD(&p->requests);
>   		if (p->priority != I915_PRIORITY_NORMAL)
>   			kmem_cache_free(engine->i915->priorities, p);
>   	}
> @@ -746,6 +780,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   	 */
>   	execlists->queue_priority =
>   		port != execlists->port ? rq_prio(last) : INT_MIN;
> +	assert_priolists(execlists, execlists->queue_priority);
>   
>   	if (submit) {
>   		port_assign(port, last);
> @@ -857,16 +892,16 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
>   	/* Flush the queued requests to the timeline list (for retiring). */
>   	while ((rb = rb_first_cached(&execlists->queue))) {
>   		struct i915_priolist *p = to_priolist(rb);
> +		int i;
>   
> -		list_for_each_entry_safe(rq, rn, &p->requests, sched.link) {
> -			INIT_LIST_HEAD(&rq->sched.link);
> +		priolist_for_each_request_consume(rq, rn, p, i) {
> +			list_del_init(&rq->sched.link);
>   
>   			dma_fence_set_error(&rq->fence, -EIO);
>   			__i915_request_submit(rq);
>   		}
>   
>   		rb_erase_cached(&p->node, &execlists->queue);
> -		INIT_LIST_HEAD(&p->requests);
>   		if (p->priority != I915_PRIORITY_NORMAL)
>   			kmem_cache_free(engine->i915->priorities, p);
>   	}
> @@ -1072,8 +1107,7 @@ static void queue_request(struct intel_engine_cs *engine,
>   			  struct i915_sched_node *node,
>   			  int prio)
>   {
> -	list_add_tail(&node->link,
> -		      &lookup_priolist(engine, prio)->requests);
> +	list_add_tail(&node->link, lookup_priolist(engine, prio));
>   }
>   
>   static void __update_queue(struct intel_engine_cs *engine, int prio)
> @@ -1143,7 +1177,7 @@ sched_lock_engine(struct i915_sched_node *node, struct intel_engine_cs *locked)
>   static void execlists_schedule(struct i915_request *request,
>   			       const struct i915_sched_attr *attr)
>   {
> -	struct i915_priolist *uninitialized_var(pl);
> +	struct list_head *uninitialized_var(pl);
>   	struct intel_engine_cs *engine, *last;
>   	struct i915_dependency *dep, *p;
>   	struct i915_dependency stack;
> @@ -1242,8 +1276,7 @@ static void execlists_schedule(struct i915_request *request,
>   				pl = lookup_priolist(engine, prio);
>   				last = engine;
>   			}
> -			GEM_BUG_ON(pl->priority != prio);
> -			list_move_tail(&node->link, &pl->requests);
> +			list_move_tail(&node->link, pl);
>   		} else {
>   			/*
>   			 * If the request is not in the priolist queue because
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 2dfa585712c2..1534de5bb852 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -190,11 +190,22 @@ enum intel_engine_id {
>   };
>   
>   struct i915_priolist {
> +	struct list_head requests[I915_PRIORITY_COUNT];
>   	struct rb_node node;
> -	struct list_head requests;
> +	unsigned long used;
>   	int priority;
>   };
>   
> +#define priolist_for_each_request(it, plist, idx) \
> +	for (idx = 0; idx < ARRAY_SIZE((plist)->requests); idx++) \
> +		list_for_each_entry(it, &(plist)->requests[idx], sched.link)
> +
> +#define priolist_for_each_request_consume(it, n, plist, idx) \
> +	for (; (idx = ffs((plist)->used)); (plist)->used &= ~BIT(idx - 1)) \
> +		list_for_each_entry_safe(it, n, \
> +					 &(plist)->requests[idx - 1], \
> +					 sched.link)
> +
>   struct st_preempt_hang {
>   	struct completion completion;
>   	bool inject_hang;
> 


More information about the Intel-gfx mailing list