[Intel-gfx] [PATCH v5] drm/i915/scheduler: emulate a scheduler for guc

Wed Feb 1 12:09:40 UTC 2017

On 01/02/2017 10:24, Chris Wilson wrote:
> This emulates execlists on top of the GuC in order to defer submission of
> requests to the hardware. This deferral allows time for high priority
> requests to gazump their way to the head of the queue, however it nerfs
> the GuC by converting it back into a simple execlist (where the CPU has
> to wake up after every request to feed new commands into the GuC).

Not every request since it does have two virtual ports, or even a bit 
better with request merging. Just saying so it sounds less dramatic for 
people reading commit messages.

> v2: Drop hack status - though iirc there is still a lockdep inversion
> between fence and engine->timeline->lock (which is impossible as the
> nesting only occurs on different fences - hopefully just requires some
> judicious lockdep annotation)
> v3: Apply lockdep nesting to enabling signaling on the request, using
> the pattern we already have in __i915_gem_request_submit();
> v4: Replaying requests after a hang also now needs the timeline
> spinlock, to disable the interrupts at least
> v5: Hold wq lock for completeness, and emit a tracepoint for enabling signal
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at linux.intel.com>
> ---
>  drivers/gpu/drm/i915/i915_guc_submission.c | 109 +++++++++++++++++++++++++++--
>  drivers/gpu/drm/i915/i915_irq.c            |   4 +-
>  drivers/gpu/drm/i915/intel_lrc.c           |   5 +-
>  3 files changed, 106 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> index 8ced9e26f075..68c32da28094 100644
> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> @@ -25,6 +25,8 @@
>  #include "i915_drv.h"
>  #include "intel_uc.h"
>
> +#include <trace/events/dma_fence.h>
> +
>  /**
>   * DOC: GuC-based command submission
>   *
> @@ -348,7 +350,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
>  	u32 freespace;
>  	int ret;
>
> -	spin_lock(&client->wq_lock);
> +	spin_lock_irq(&client->wq_lock);
>  	freespace = CIRC_SPACE(client->wq_tail, desc->head, client->wq_size);
>  	freespace -= client->wq_rsvd;
>  	if (likely(freespace >= wqi_size)) {
> @@ -358,7 +360,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
>  		client->no_wq_space++;
>  		ret = -EAGAIN;
>  	}
> -	spin_unlock(&client->wq_lock);
> +	spin_unlock_irq(&client->wq_lock);
>
>  	return ret;
>  }
> @@ -370,9 +372,9 @@ void i915_guc_wq_unreserve(struct drm_i915_gem_request *request)
>
>  	GEM_BUG_ON(READ_ONCE(client->wq_rsvd) < wqi_size);
>
> -	spin_lock(&client->wq_lock);
> +	spin_lock_irq(&client->wq_lock);
>  	client->wq_rsvd -= wqi_size;
> -	spin_unlock(&client->wq_lock);
> +	spin_unlock_irq(&client->wq_lock);
>  }
>
>  /* Construct a Work Item and append it to the GuC's Work Queue */
> @@ -532,10 +534,96 @@ static void __i915_guc_submit(struct drm_i915_gem_request *rq)
>
>  static void i915_guc_submit(struct drm_i915_gem_request *rq)
>  {
> -	i915_gem_request_submit(rq);
> +	__i915_gem_request_submit(rq);
>  	__i915_guc_submit(rq);
>  }
>
> +static void nested_enable_signaling(struct drm_i915_gem_request *rq)
> +{
> +	/* If we use dma_fence_enable_sw_signaling() directly, lockdep
> +	 * detects an ordering issue between the fence lockclass and the
> +	 * global_timeline. This circular dependency can only occur via 2
> +	 * different fences (but same fence lockclass), so we use the nesting
> +	 * annotation here to prevent the warn, equivalent to the nesting
> +	 * inside i915_gem_request_submit() for when we also enable the
> +	 * signaler.
> +	 */
> +
> +	if (test_and_set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
> +			     &rq->fence.flags))
> +		return;
> +
> +	GEM_BUG_ON(test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &rq->fence.flags));
> +	trace_dma_fence_enable_signal(&rq->fence);
> +
> +	spin_lock_nested(&rq->lock, SINGLE_DEPTH_NESTING);
> +	intel_engine_enable_signaling(rq);
> +	spin_unlock(&rq->lock);
> +}
> +
> +static bool i915_guc_dequeue(struct intel_engine_cs *engine)
> +{
> +	struct execlist_port *port = engine->execlist_port;
> +	struct drm_i915_gem_request *last = port[0].request;
> +	unsigned long flags;
> +	struct rb_node *rb;
> +	bool submit = false;
> +
> +	spin_lock_irqsave(&engine->timeline->lock, flags);
> +	rb = engine->execlist_first;
> +	while (rb) {
> +		struct drm_i915_gem_request *cursor =
> +			rb_entry(rb, typeof(*cursor), priotree.node);
> +
> +		if (last && cursor->ctx != last->ctx) {
> +			if (port != engine->execlist_port)
> +				break;
> +
> +			i915_gem_request_assign(&port->request, last);
> +			nested_enable_signaling(last);
> +			port++;
> +		}
> +
> +		rb = rb_next(rb);
> +		rb_erase(&cursor->priotree.node, &engine->execlist_queue);
> +		RB_CLEAR_NODE(&cursor->priotree.node);
> +		cursor->priotree.priority = INT_MAX;
> +
> +		i915_guc_submit(cursor);
> +		last = cursor;
> +		submit = true;
> +	}
> +	if (submit) {
> +		i915_gem_request_assign(&port->request, last);
> +		nested_enable_signaling(last);
> +		engine->execlist_first = rb;
> +	}
> +	spin_unlock_irqrestore(&engine->timeline->lock, flags);
> +
> +	return submit;
> +}
> +
> +static void i915_guc_irq_handler(unsigned long data)
> +{
> +	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
> +	struct execlist_port *port = engine->execlist_port;
> +	struct drm_i915_gem_request *rq;
> +	bool submit;
> +
> +	do {
> +		rq = port[0].request;
> +		while (rq && i915_gem_request_completed(rq)) {
> +			i915_gem_request_put(rq);
> +			port[0].request = port[1].request;
> +			port[1].request = NULL;
> +		}
> +
> +		submit = false;
> +		if (!port[1].request)
> +			submit = i915_guc_dequeue(engine);
> +	} while (submit);
> +}
> +
>  /*
>   * Everything below here is concerned with setup & teardown, and is
>   * therefore not part of the somewhat time-critical batch-submission
> @@ -944,15 +1032,22 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
>  	/* Take over from manual control of ELSP (execlists) */
>  	for_each_engine(engine, dev_priv, id) {
>  		struct drm_i915_gem_request *rq;
> +		unsigned long flags;
>
> -		engine->submit_request = i915_guc_submit;
> -		engine->schedule = NULL;
> +		tasklet_init(&engine->irq_tasklet,
> +			     i915_guc_irq_handler,
> +			     (unsigned long)engine);
>
>  		/* Replay the current set of previously submitted requests */
> +		spin_lock_irqsave(&engine->timeline->lock, flags);
>  		list_for_each_entry(rq, &engine->timeline->requests, link) {
> +			spin_lock(&client->wq_lock);
>  			client->wq_rsvd += sizeof(struct guc_wq_item);
> +			spin_unlock(&client->wq_lock);
> +
>  			__i915_guc_submit(rq);
>  		}
> +		spin_unlock_irqrestore(&engine->timeline->lock, flags);
>  	}
>
>  	return 0;
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 0ff75f2282fa..e0cec254faa5 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1350,8 +1350,10 @@ static void snb_gt_irq_handler(struct drm_i915_private *dev_priv,
>  static __always_inline void
>  gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir, int test_shift)
>  {
> -	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift))
> +	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift)) {
> +		tasklet_schedule(&engine->irq_tasklet);
>  		notify_ring(engine);
> +	}

Would this be better:

if (iir & (CTX | USR)) {
	set_bit();
	tasklet_hi_schedule();

	if (iir & USR) {
		notify_ring();
	}
}

?

>
>  	if (iir & (GT_CONTEXT_SWITCH_INTERRUPT << test_shift)) {
>  		set_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 0e99d53d5523..1b7fc0ffa7ad 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -1272,7 +1272,7 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
>
>  	/* After a GPU reset, we may have requests to replay */
>  	clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
> -	if (!execlists_elsp_idle(engine)) {
> +	if (!i915.enable_guc_submission && !execlists_elsp_idle(engine)) {
>  		engine->execlist_port[0].count = 0;
>  		engine->execlist_port[1].count = 0;
>  		execlists_submit_ports(engine);
> @@ -1340,9 +1340,6 @@ static void reset_common_ring(struct intel_engine_cs *engine,
>  	request->ring->last_retired_head = -1;
>  	intel_ring_update_space(request->ring);
>
> -	if (i915.enable_guc_submission)
> -		return;
> -
>  	/* Catch up with any missed context-switch interrupts */
>  	I915_WRITE(RING_CONTEXT_STATUS_PTR(engine), _MASKED_FIELD(0xffff, 0));

Should we put this write under the i915.enable_guc_submission perhasp? 
Maybe it makes no difference since this is after the reset but don't 
know, perhaps it is more logical.

>  	if (request->ctx != port[0].request->ctx) {
>

Regards,

Tvrtko