[Intel-gfx] [PATCH 15/27] drm/i915: Split execlist priority queue into rbtree + linked list

Mon Apr 24 10:28:32 UTC 2017

On 19/04/2017 10:41, Chris Wilson wrote:
> All the requests at the same priority are executed in FIFO order. They
> do not need to be stored in the rbtree themselves, as they are a simple
> list within a level. If we move the requests at one priority into a list,
> we can then reduce the rbtree to the set of priorities. This should keep
> the height of the rbtree small, as the number of active priorities can not
> exceed the number of active requests and should be typically only a few.
>
> Currently, we have ~2k possible different priority levels, that may
> increase to allow even more fine grained selection. Allocating those in
> advance seems a waste (and may be impossible), so we opt for allocating
> upon first use, and freeing after its requests are depleted. To avoid
> the possibility of an allocation failure causing us to lose a request,
> we preallocate the default priority (0) and bump any request to that
> priority if we fail to allocate it the appropriate plist. Having a
> request (that is ready to run, so not leading to corruption) execute
> out-of-order is better than leaking the request (and its dependency
> tree) entirely.
>
> There should be a benefit to reducing execlists_dequeue() to principally
> using a simple list (and reducing the frequency of both rbtree iteration
> and balancing on erase) but for typical workloads, request coalescing
> should be small enough that we don't notice any change. The main gain is
> from improving PI calls to schedule, and the explicit list within a
> level should make request unwinding simpler (we just need to insert at
> the head of the list rather than the tail and not have to make the
> rbtree search more complicated).

Sounds attractive! What workloads show the benefit and how much?

> v2: Avoid use-after-free when deleting a depleted priolist
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Michał Winiarski <michal.winiarski at intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c        | 12 +++--
>  drivers/gpu/drm/i915/i915_gem_request.c    |  4 +-
>  drivers/gpu/drm/i915/i915_gem_request.h    |  2 +-
>  drivers/gpu/drm/i915/i915_guc_submission.c | 20 ++++++--
>  drivers/gpu/drm/i915/intel_lrc.c           | 75 ++++++++++++++++++++++--------
>  drivers/gpu/drm/i915/intel_ringbuffer.h    |  7 +++
>  6 files changed, 90 insertions(+), 30 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 0b5d7142d8d9..a8c7788d986e 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -3314,7 +3314,6 @@ static int i915_engine_info(struct seq_file *m, void *unused)
>
>  		if (i915.enable_execlists) {
>  			u32 ptr, read, write;
> -			struct rb_node *rb;
>  			unsigned int idx;
>
>  			seq_printf(m, "\tExeclist status: 0x%08x %08x\n",
> @@ -3358,9 +3357,14 @@ static int i915_engine_info(struct seq_file *m, void *unused)
>  			rcu_read_unlock();
>
>  			spin_lock_irq(&engine->timeline->lock);
> -			for (rb = engine->execlist_first; rb; rb = rb_next(rb)) {
> -				rq = rb_entry(rb, typeof(*rq), priotree.node);
> -				print_request(m, rq, "\t\tQ ");
> +			for (rb = engine->execlist_first; rb; rb = rb_next(rb)){
> +				struct execlist_priolist *plist =
> +					rb_entry(rb, typeof(*plist), node);
> +
> +				list_for_each_entry(rq,
> +						    &plist->requests,
> +						    priotree.link)
> +					print_request(m, rq, "\t\tQ ");
>  			}
>  			spin_unlock_irq(&engine->timeline->lock);
>  		} else if (INTEL_GEN(dev_priv) > 6) {
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index 83b1584b3deb..59c0e0b00028 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -159,7 +159,7 @@ i915_priotree_fini(struct drm_i915_private *i915, struct i915_priotree *pt)
>  {
>  	struct i915_dependency *dep, *next;
>
> -	GEM_BUG_ON(!RB_EMPTY_NODE(&pt->node));
> +	GEM_BUG_ON(!list_empty(&pt->link));
>
>  	/* Everyone we depended upon (the fences we wait to be signaled)
>  	 * should retire before us and remove themselves from our list.
> @@ -185,7 +185,7 @@ i915_priotree_init(struct i915_priotree *pt)
>  {
>  	INIT_LIST_HEAD(&pt->signalers_list);
>  	INIT_LIST_HEAD(&pt->waiters_list);
> -	RB_CLEAR_NODE(&pt->node);
> +	INIT_LIST_HEAD(&pt->link);
>  	pt->priority = INT_MIN;
>  }
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
> index 4ccab5affd3c..0a1d717b9fa7 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.h
> +++ b/drivers/gpu/drm/i915/i915_gem_request.h
> @@ -67,7 +67,7 @@ struct i915_dependency {
>  struct i915_priotree {
>  	struct list_head signalers_list; /* those before us, we depend upon */
>  	struct list_head waiters_list; /* those after us, they depend upon us */
> -	struct rb_node node;
> +	struct list_head link;
>  	int priority;
>  #define I915_PRIORITY_MAX 1024
>  #define I915_PRIORITY_MIN (-I915_PRIORITY_MAX)
> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> index 370373c97b81..69b39729003b 100644
> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> @@ -664,9 +664,15 @@ static bool i915_guc_dequeue(struct intel_engine_cs *engine)
>
>  	spin_lock_irq(&engine->timeline->lock);
>  	rb = engine->execlist_first;
> +	GEM_BUG_ON(rb_first(&engine->execlist_queue) != rb);
>  	while (rb) {
> +		struct execlist_priolist *plist =
> +			rb_entry(rb, typeof(*plist), node);
>  		struct drm_i915_gem_request *rq =
> -			rb_entry(rb, typeof(*rq), priotree.node);
> +			list_first_entry(&plist->requests,
> +					 typeof(*rq),
> +					 priotree.link);
> +		GEM_BUG_ON(list_empty(&plist->requests));
>
>  		if (last && rq->ctx != last->ctx) {
>  			if (port != engine->execlist_port)
> @@ -677,9 +683,15 @@ static bool i915_guc_dequeue(struct intel_engine_cs *engine)
>  			port++;
>  		}
>
> -		rb = rb_next(rb);
> -		rb_erase(&rq->priotree.node, &engine->execlist_queue);
> -		RB_CLEAR_NODE(&rq->priotree.node);
> +		if (rq->priotree.link.next == rq->priotree.link.prev) {
> +			rb = rb_next(rb);
> +			rb_erase(&plist->node, &engine->execlist_queue);
> +			if (plist->priority)
> +				kfree(plist);
> +		} else {
> +			__list_del_entry(&rq->priotree.link);
> +		}
> +		INIT_LIST_HEAD(&rq->priotree.link);
>  		rq->priotree.priority = INT_MAX;
>
>  		i915_guc_submit(rq);
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 69299fbab4f9..f96d7980ac16 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -442,9 +442,15 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>
>  	spin_lock_irq(&engine->timeline->lock);
>  	rb = engine->execlist_first;
> +	GEM_BUG_ON(rb_first(&engine->execlist_queue) != rb);
>  	while (rb) {
> +		struct execlist_priolist *plist =
> +			rb_entry(rb, typeof(*plist), node);
>  		struct drm_i915_gem_request *cursor =
> -			rb_entry(rb, typeof(*cursor), priotree.node);
> +			list_first_entry(&plist->requests,
> +					 typeof(*cursor),
> +					 priotree.link);
> +		GEM_BUG_ON(list_empty(&plist->requests));
>
>  		/* Can we combine this request with the current port? It has to
>  		 * be the same context/ringbuffer and not have any exceptions
> @@ -479,9 +485,15 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  			port++;
>  		}
>
> -		rb = rb_next(rb);
> -		rb_erase(&cursor->priotree.node, &engine->execlist_queue);
> -		RB_CLEAR_NODE(&cursor->priotree.node);
> +		if (cursor->priotree.link.next == cursor->priotree.link.prev) {
> +			rb = rb_next(rb);
> +			rb_erase(&plist->node, &engine->execlist_queue);
> +			if (plist->priority)
> +				kfree(plist);
> +		} else {
> +			__list_del_entry(&cursor->priotree.link);
> +		}
> +		INIT_LIST_HEAD(&cursor->priotree.link);
>  		cursor->priotree.priority = INT_MAX;
>
>  		__i915_gem_request_submit(cursor);
> @@ -621,28 +633,53 @@ static void intel_lrc_irq_handler(unsigned long data)
>  	intel_uncore_forcewake_put(dev_priv, engine->fw_domains);
>  }
>
> -static bool insert_request(struct i915_priotree *pt, struct rb_root *root)
> +static bool
> +insert_request(struct intel_engine_cs *engine,
> +	       struct i915_priotree *pt,
> +	       int prio)
>  {
> +	struct execlist_priolist *plist;
>  	struct rb_node **p, *rb;
>  	bool first = true;
>
> +find_plist:
>  	/* most positive priority is scheduled first, equal priorities fifo */
>  	rb = NULL;
> -	p = &root->rb_node;
> +	p = &engine->execlist_queue.rb_node;
>  	while (*p) {
> -		struct i915_priotree *pos;
> -
>  		rb = *p;
> -		pos = rb_entry(rb, typeof(*pos), node);
> -		if (pt->priority > pos->priority) {
> +		plist = rb_entry(rb, typeof(*plist), node);
> +		if (prio > plist->priority) {
>  			p = &rb->rb_left;
> -		} else {
> +		} else if (prio < plist->priority) {
>  			p = &rb->rb_right;
>  			first = false;
> +		} else {
> +			list_add_tail(&pt->link, &plist->requests);
> +			return false;
>  		}
>  	}
> -	rb_link_node(&pt->node, rb, p);
> -	rb_insert_color(&pt->node, root);
> +
> +	if (!prio) {
> +		plist = &engine->default_priolist;

Should be "prio == I915_PRIO_DEFAULT" (give or take).

But I am not completely happy with special casing the default priority 
for two reasons.

Firstly, userspace can opt to lower its priority and completely defeat 
this path.

Secondly, we already have flip priority which perhaps should have it's 
own fast path / avoid allocation as well.

Those two combined make me unsure whether the optimisation is worth it. 
What would be the pros and cons of three steps:

1. No optimisation.
2. prio == default optimisation like above.
3. Better system with caching of frequently used levels.

Last is definitely complicated, second is not, but is the second much 
better than the first?

Perhaps a simplification of 3) where we would defer the freeing of 
unused priority levels until the busy to idle transition? That would 
also drop the existence and need for special handling of 
engine->default_prio.

> +	} else {
> +		plist = kmalloc(sizeof(*plist), GFP_ATOMIC);
> +		/* Convert an allocation failure to a priority bump */

Where is the priority bump? It looks like it can be the opposite for 
high prio requests below.

I don't think it matters what happens with priorities hugely when small 
allocations start to go bad but would like to understand the comment.

And perhaps this would be worthy of a dedicated slab cache?

> +		if (unlikely(!plist)) {
> +			prio = 0; /* recurses just once */
> +			goto find_plist;
> +		}
> +	}
> +
> +	plist->priority = prio;
> +	rb_link_node(&plist->node, rb, p);
> +	rb_insert_color(&plist->node, &engine->execlist_queue);
> +
> +	INIT_LIST_HEAD(&plist->requests);
> +	list_add_tail(&pt->link, &plist->requests);
> +
> +	if (first)
> +		engine->execlist_first = &plist->node;
>
>  	return first;
>  }
> @@ -655,8 +692,9 @@ static void execlists_submit_request(struct drm_i915_gem_request *request)
>  	/* Will be called from irq-context when using foreign fences. */
>  	spin_lock_irqsave(&engine->timeline->lock, flags);
>
> -	if (insert_request(&request->priotree, &engine->execlist_queue)) {
> -		engine->execlist_first = &request->priotree.node;
> +	if (insert_request(engine,
> +			   &request->priotree,
> +			   request->priotree.priority)) {
>  		if (execlists_elsp_ready(engine))
>  			tasklet_hi_schedule(&engine->irq_tasklet);
>  	}
> @@ -745,10 +783,9 @@ static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
>  			continue;
>
>  		pt->priority = prio;
> -		if (!RB_EMPTY_NODE(&pt->node)) {
> -			rb_erase(&pt->node, &engine->execlist_queue);
> -			if (insert_request(pt, &engine->execlist_queue))
> -				engine->execlist_first = &pt->node;
> +		if (!list_empty(&pt->link)) {
> +			__list_del_entry(&pt->link);
> +			insert_request(engine, pt, prio);
>  		}
>  	}
>
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 39b733e5cfd3..1ff41bd9e89a 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -187,6 +187,12 @@ enum intel_engine_id {
>  	VECS
>  };
>
> +struct execlist_priolist {
> +	struct rb_node node;
> +	struct list_head requests;
> +	int priority;
> +};
> +
>  #define INTEL_ENGINE_CS_MAX_NAME 8
>
>  struct intel_engine_cs {
> @@ -376,6 +382,7 @@ struct intel_engine_cs {
>
>  	/* Execlists */
>  	struct tasklet_struct irq_tasklet;
> +	struct execlist_priolist default_priolist;
>  	struct execlist_port {
>  		struct drm_i915_gem_request *request_count;
>  #define EXECLIST_COUNT_BITS 2
>

Regards,

Tvrtko