[Intel-gfx] [PATCH 06/12] drm/i915/scheduler: Execute requests in order of priorities
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Thu Nov 3 16:21:25 UTC 2016
Hi,
1st pass comments only for now.
On 02/11/2016 17:50, Chris Wilson wrote:
> Track the priority of each request and use it to determine the order in
> which we submit requests to the hardware via execlists.
>
> The priority of the request is determined by the user (eventually via
> the context) but may be overridden at any time by the driver. When we set
> the priority of the request, we bump the priority of all of its
> dependencies to match - so that a high priority drawing operation is not
> stuck behind a background task.
>
> When the request is ready to execute (i.e. we have signaled the submit
> fence following completion of all its dependencies, including third
> party fences), we put the request into a priority sorted rbtree to be
> submitted to the hardware. If the request is higher priority than all
> pending requests, it will be submitted on the next context-switch
> interrupt as soon as the hardware has completed the current request. We
> do not preempt the currently executing request to immediately run a very
> high priority request, at least not yet.
>
> One more limitation is that this first implementation is for execlists
> only, so it is currently limited to gen8/gen9.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
> drivers/gpu/drm/i915/i915_debugfs.c | 7 +-
> drivers/gpu/drm/i915/i915_gem.c | 3 +-
> drivers/gpu/drm/i915/i915_gem_request.c | 4 ++
> drivers/gpu/drm/i915/i915_gem_request.h | 6 +-
> drivers/gpu/drm/i915/i915_guc_submission.c | 4 ++
> drivers/gpu/drm/i915/intel_engine_cs.c | 3 +-
> drivers/gpu/drm/i915/intel_lrc.c | 104 ++++++++++++++++++++++++-----
> drivers/gpu/drm/i915/intel_ringbuffer.h | 3 +-
> 8 files changed, 109 insertions(+), 25 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 3cb96d260dfb..dac435680e98 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -631,8 +631,9 @@ static void print_request(struct seq_file *m,
> struct drm_i915_gem_request *rq,
> const char *prefix)
> {
> - seq_printf(m, "%s%x [%x:%x] @ %d: %s\n", prefix,
> + seq_printf(m, "%s%x [%x:%x] prio=%d @ %dms: %s\n", prefix,
Noticed this is quite unreadable due to the range of INT_MIN to INT_MAX.
Do we need such granularity, or could we use something smaller so it
looks nicer here? I know, the argument is quite poor. :)
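Just to illustrate the point (helper name invented, not a concrete ask):
if the value printed here were clamped into the +/-I915_PRIORITY_MAX
range this patch adds (but does not yet use), the column would stay
narrow even for the INT_MIN/INT_MAX sentinels:

static int user_priority(const struct drm_i915_gem_request *rq)
{
        /* clamp() from <linux/kernel.h>; purely for display purposes */
        return clamp(rq->priotree.priority,
                     -I915_PRIORITY_MAX, I915_PRIORITY_MAX);
}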
> rq->global_seqno, rq->ctx->hw_id, rq->fence.seqno,
> + rq->priotree.priority,
> jiffies_to_msecs(jiffies - rq->emitted_jiffies),
> rq->timeline->common->name);
> }
> @@ -3218,6 +3219,7 @@ static int i915_engine_info(struct seq_file *m, void *unused)
>
> if (i915.enable_execlists) {
> u32 ptr, read, write;
> + struct rb_node *rb;
>
> seq_printf(m, "\tExeclist status: 0x%08x %08x\n",
> I915_READ(RING_EXECLIST_STATUS_LO(engine)),
> @@ -3257,7 +3259,8 @@ static int i915_engine_info(struct seq_file *m, void *unused)
> rcu_read_unlock();
>
> spin_lock_irq(&engine->timeline->lock);
> - list_for_each_entry(rq, &engine->execlist_queue, execlist_link) {
> + for (rb = engine->execlist_first; rb; rb = rb_next(rb)) {
> + rq = rb_entry(rb, typeof(*rq), priotree.node);
> print_request(m, rq, "\t\tQ ");
> }
> spin_unlock_irq(&engine->timeline->lock);
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index cf28666021f8..4697848ecfd9 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2687,10 +2687,11 @@ static void i915_gem_cleanup_engine(struct intel_engine_cs *engine)
>
> if (i915.enable_execlists) {
> spin_lock_irq(&engine->timeline->lock);
> - INIT_LIST_HEAD(&engine->execlist_queue);
> i915_gem_request_put(engine->execlist_port[0].request);
> i915_gem_request_put(engine->execlist_port[1].request);
> memset(engine->execlist_port, 0, sizeof(engine->execlist_port));
> + engine->execlist_queue = RB_ROOT;
> + engine->execlist_first = NULL;
> spin_unlock_irq(&engine->timeline->lock);
> }
> }
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index 13090f226203..5f0068fac3e9 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -141,6 +141,8 @@ i915_priotree_fini(struct i915_priotree *pt)
> {
> struct i915_dependency *dep, *next;
>
> + GEM_BUG_ON(!RB_EMPTY_NODE(&pt->node));
> +
> /* Everyone we depended upon should retire before us and remove
> * themselves from our list. However, retirement is run independently
> * on each timeline and so we may be called out-of-order.
> @@ -164,6 +166,8 @@ i915_priotree_init(struct i915_priotree *pt)
> {
> INIT_LIST_HEAD(&pt->pre_list);
> INIT_LIST_HEAD(&pt->post_list);
> + RB_CLEAR_NODE(&pt->node);
> + pt->priority = INT_MIN;
> }
>
> void i915_gem_retire_noop(struct i915_gem_active *active,
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
> index 0cb2ba546062..4288f512ac51 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.h
> +++ b/drivers/gpu/drm/i915/i915_gem_request.h
> @@ -51,6 +51,9 @@ struct i915_dependency {
> struct i915_priotree {
> struct list_head pre_list; /* who is before us, we depend upon */
> struct list_head post_list; /* who is after us, they depend upon us */
> + struct rb_node node;
> + int priority;
> +#define I915_PRIORITY_MAX 1024
Unused in this patch.
> };
>
> /**
> @@ -168,9 +171,6 @@ struct drm_i915_gem_request {
> struct drm_i915_file_private *file_priv;
> /** file_priv list entry for this request */
> struct list_head client_list;
> -
> - /** Link in the execlist submission queue, guarded by execlist_lock. */
> - struct list_head execlist_link;
> };
>
> extern const struct dma_fence_ops i915_fence_ops;
> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> index ddb574d5ceda..b31573a825fa 100644
> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> @@ -640,6 +640,9 @@ static void i915_guc_submit(struct drm_i915_gem_request *rq)
> struct i915_guc_client *client = guc->execbuf_client;
> int b_ret;
>
> + rq->previous_context = rq->engine->last_context;
> + rq->engine->last_context = rq->ctx;
> +
> __i915_gem_request_submit(rq);
>
> spin_lock(&client->wq_lock);
> @@ -1522,6 +1525,7 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
> /* Take over from manual control of ELSP (execlists) */
> for_each_engine(engine, dev_priv, id) {
> engine->submit_request = i915_guc_submit;
> + engine->schedule = NULL;
>
> /* Replay the current set of previously submitted requests */
> list_for_each_entry(request,
> diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
> index c9171a058478..3da4d466e332 100644
> --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> @@ -239,7 +239,8 @@ static void intel_engine_init_timeline(struct intel_engine_cs *engine)
> */
> void intel_engine_setup_common(struct intel_engine_cs *engine)
> {
> - INIT_LIST_HEAD(&engine->execlist_queue);
> + engine->execlist_queue = RB_ROOT;
> + engine->execlist_first = NULL;
>
> intel_engine_init_timeline(engine);
> intel_engine_init_hangcheck(engine);
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 186c3ccb2c34..70ac74c959bd 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -432,9 +432,10 @@ static bool can_merge_ctx(const struct i915_gem_context *prev,
>
> static void execlists_dequeue(struct intel_engine_cs *engine)
> {
> - struct drm_i915_gem_request *cursor, *last;
> + struct drm_i915_gem_request *last;
> struct execlist_port *port = engine->execlist_port;
> unsigned long flags;
> + struct rb_node *rb;
> bool submit = false;
>
> last = port->request;
> @@ -471,7 +472,11 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> */
>
> spin_lock_irqsave(&engine->timeline->lock, flags);
> - list_for_each_entry(cursor, &engine->execlist_queue, execlist_link) {
> + rb = engine->execlist_first;
> + while (rb) {
> + struct drm_i915_gem_request *cursor =
> + rb_entry(rb, typeof(*cursor), priotree.node);
> +
> /* Can we combine this request with the current port? It has to
> * be the same context/ringbuffer and not have any exceptions
> * (e.g. GVT saying never to combine contexts).
> @@ -503,16 +508,27 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> port++;
> }
>
> + rb = rb_next(rb);
> + rb_erase(&cursor->priotree.node, &engine->execlist_queue);
> + RB_CLEAR_NODE(&cursor->priotree.node);
> + cursor->priotree.priority = INT_MAX;
> +
> + /* We keep the previous context alive until we retire the
> + * following request. This ensures that any the context object
> + * is still pinned for any residual writes the HW makes into
> + * it on the context switch into the next object following the
> + * breadcrumb. Otherwise, we may retire the context too early.
> + */
> + cursor->previous_context = engine->last_context;
> + engine->last_context = cursor->ctx;
> +
Will we need a knob to control the amount of context merging? Until
preemption arrives, that is; similar to the GuC queue depth "nerfing"
from the other patch.
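Something very rough like the below is what I have in mind; the knob
name, default and helper are all invented, just to show the shape of it:

/*
 * Cap how many requests we coalesce into a single ELSP port per dequeue,
 * so a long run of same-context requests cannot shut out a later, higher
 * priority context until preemption support lands.
 */
static unsigned int execlist_merge_limit __read_mostly = 8; /* made up */

static bool can_merge_request(const struct drm_i915_gem_request *last,
                              const struct drm_i915_gem_request *cursor,
                              unsigned int nr_merged)
{
        if (nr_merged >= execlist_merge_limit)
                return false;

        return can_merge_ctx(cursor->ctx, last->ctx);
}

execlists_dequeue() would then count how many requests it has merged into
the current port and open a new port (or stop dequeuing) once the limit
is reached.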
> __i915_gem_request_submit(cursor);
> last = cursor;
> submit = true;
> }
> if (submit) {
> - /* Decouple all the requests submitted from the queue */
> - engine->execlist_queue.next = &cursor->execlist_link;
> - cursor->execlist_link.prev = &engine->execlist_queue;
> -
> i915_gem_request_assign(&port->request, last);
> + engine->execlist_first = rb;
> }
> spin_unlock_irqrestore(&engine->timeline->lock, flags);
>
> @@ -595,26 +611,77 @@ static void intel_lrc_irq_handler(unsigned long data)
> intel_uncore_forcewake_put(dev_priv, engine->fw_domains);
> }
>
> +static bool insert_request(struct i915_priotree *pt, struct rb_root *root)
> +{
> + struct rb_node **p, *rb;
> + bool first = true;
> +
> + /* most positive priority is scheduled first, equal priorities fifo */
> + rb = NULL;
> + p = &root->rb_node;
> + while (*p) {
> + struct i915_priotree *pos;
> +
> + rb = *p;
> + pos = rb_entry(rb, typeof(*pos), node);
> + if (pt->priority > pos->priority) {
> + p = &rb->rb_left;
> + } else {
> + p = &rb->rb_right;
> + first = false;
> + }
> + }
> + rb_link_node(&pt->node, rb, p);
> + rb_insert_color(&pt->node, root);
> +
> + return first;
> +}
> +
> static void execlists_submit_request(struct drm_i915_gem_request *request)
> {
> struct intel_engine_cs *engine = request->engine;
>
> assert_spin_locked(&engine->timeline->lock);
>
> - /* We keep the previous context alive until we retire the following
> - * request. This ensures that any the context object is still pinned
> - * for any residual writes the HW makes into it on the context switch
> - * into the next object following the breadcrumb. Otherwise, we may
> - * retire the context too early.
> - */
> - request->previous_context = engine->last_context;
> - engine->last_context = request->ctx;
> -
> - list_add_tail(&request->execlist_link, &engine->execlist_queue);
> + if (insert_request(&request->priotree, &engine->execlist_queue))
> + engine->execlist_first = &request->priotree.node;
> if (execlists_elsp_idle(engine))
> tasklet_hi_schedule(&engine->irq_tasklet);
> }
>
> +static void update_priorities(struct i915_priotree *pt, int prio)
> +{
> + struct drm_i915_gem_request *request =
> + container_of(pt, struct drm_i915_gem_request, priotree);
> + struct intel_engine_cs *engine = request->engine;
> + struct i915_dependency *dep;
> +
> + if (prio <= READ_ONCE(pt->priority))
> + return;
> +
> + /* Recursively bump all dependent priorities to match the new request */
> + list_for_each_entry(dep, &pt->pre_list, pre_link)
> + update_priorities(dep->signal, prio);
John ran into trouble with recursion in his scheduler, used for the same
thing AFAIR. Or was it the priority bumping? Either way, it may be best
to avoid recursion here.
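For example, a worklist based walk would avoid unbounded stack usage.
Very rough sketch, assuming struct i915_dependency can spare a dedicated
list_head for the walk (dfs_link below is invented), that we run under
whatever lock protects priotree.priority, and leaving out the per-node
rbtree re-insertion which would stay as in the patch:

static void bump_priorities(struct i915_priotree *pt, int prio)
{
        struct i915_dependency *dep, *p;
        LIST_HEAD(dfs);

        if (prio <= pt->priority)
                return;
        pt->priority = prio;

        /* Seed the worklist with our direct dependencies. */
        list_for_each_entry(dep, &pt->pre_list, pre_link)
                list_move_tail(&dep->dfs_link, &dfs);

        /* Appending to the tail while iterating is fine here; each node
         * is expanded at most once because its priority is raised first.
         */
        list_for_each_entry(dep, &dfs, dfs_link) {
                struct i915_priotree *node = dep->signal;

                if (prio <= node->priority)
                        continue;
                node->priority = prio;

                list_for_each_entry(p, &node->pre_list, pre_link)
                        list_move_tail(&p->dfs_link, &dfs);
        }
}

(dfs_link would need INIT_LIST_HEAD when the dependency is created so
that the first list_move_tail is safe.)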
> +
> + /* Fifo and depth-first replacement ensure our deps execute before us */
> + spin_lock_irq(&engine->timeline->lock);
> + pt->priority = prio;
> + if (!RB_EMPTY_NODE(&pt->node)) {
> + rb_erase(&pt->node, &engine->execlist_queue);
> + if (insert_request(pt, &engine->execlist_queue))
> + engine->execlist_first = &pt->node;
> + }
> + spin_unlock_irq(&engine->timeline->lock);
> +}
> +
> +static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
> +{
> + /* Make sure that our entire dependency chain shares our prio */
> + update_priorities(&request->priotree, prio);
> +
> + /* XXX Do we need to preempt to make room for us and our deps? */
> +}
Hm, why isn't the scheduling logic backend agnostic? Is it just that it
makes the first implementation much simpler to do it like this?
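What I would have expected is the dependency walk living in common code,
with the backend hook reduced to just re-sorting its own queue, roughly
like this (hook name invented):

static void execlists_requeue(struct drm_i915_gem_request *rq)
{
        struct intel_engine_cs *engine = rq->engine;

        /* The rbtree re-sort from update_priorities() above, split out
         * as the execlists specific part.
         */
        spin_lock_irq(&engine->timeline->lock);
        if (!RB_EMPTY_NODE(&rq->priotree.node)) {
                rb_erase(&rq->priotree.node, &engine->execlist_queue);
                if (insert_request(&rq->priotree, &engine->execlist_queue))
                        engine->execlist_first = &rq->priotree.node;
        }
        spin_unlock_irq(&engine->timeline->lock);
}

Not a blocker for the first implementation, just wondering.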
> +
> int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request)
> {
> struct intel_engine_cs *engine = request->engine;
> @@ -1651,8 +1718,10 @@ void intel_execlists_enable_submission(struct drm_i915_private *dev_priv)
> struct intel_engine_cs *engine;
> enum intel_engine_id id;
>
> - for_each_engine(engine, dev_priv, id)
> + for_each_engine(engine, dev_priv, id) {
> engine->submit_request = execlists_submit_request;
> + engine->schedule = execlists_schedule;
> + }
> }
>
> static void
> @@ -1665,6 +1734,7 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
> engine->emit_breadcrumb = gen8_emit_breadcrumb;
> engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_sz;
> engine->submit_request = execlists_submit_request;
> + engine->schedule = execlists_schedule;
>
> engine->irq_enable = gen8_logical_ring_enable_irq;
> engine->irq_disable = gen8_logical_ring_disable_irq;
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 75991a3c694b..cbc148863a03 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -348,7 +348,8 @@ struct intel_engine_cs {
> struct drm_i915_gem_request *request;
> unsigned int count;
> } execlist_port[2];
> - struct list_head execlist_queue;
> + struct rb_root execlist_queue;
> + struct rb_node *execlist_first;
> unsigned int fw_domains;
> bool disable_lite_restore_wa;
> bool preempt_wa;
>
Regards,
Tvrtko