[Intel-gfx] [PATCH v3 11/14] HACK drm/i915/scheduler: emulate a scheduler for guc

Tvrtko Ursulin tvrtko.ursulin at linux.intel.com
Thu Dec 1 10:45:51 UTC 2016


On 14/11/2016 08:57, Chris Wilson wrote:
> This emulates execlists on top of the GuC in order to defer submission of
> requests to the hardware. This deferral allows time for high priority
> requests to gazump their way to the head of the queue, however it nerfs
> the GuC by converting it back into a simple execlist (where the CPU has
> to wake up after every request to feed new commands into the GuC).

As it is starting to sink in that we'll have to add this hack sooner or 
later, review comments are below.

Also, would you be OK with rebasing this, or would you prefer to delegate it?

> ---
>  drivers/gpu/drm/i915/i915_guc_submission.c | 85 +++++++++++++++++++++++++-----
>  drivers/gpu/drm/i915/i915_irq.c            |  4 +-
>  drivers/gpu/drm/i915/intel_lrc.c           |  3 --
>  3 files changed, 76 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> index 4462112725ef..088f5a99ecfc 100644
> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> @@ -469,7 +469,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
>  	u32 freespace;
>  	int ret;
>
> -	spin_lock(&gc->wq_lock);
> +	spin_lock_irq(&gc->wq_lock);
>  	freespace = CIRC_SPACE(gc->wq_tail, desc->head, gc->wq_size);
>  	freespace -= gc->wq_rsvd;
>  	if (likely(freespace >= wqi_size)) {
> @@ -479,7 +479,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
>  		gc->no_wq_space++;
>  		ret = -EAGAIN;
>  	}
> -	spin_unlock(&gc->wq_lock);
> +	spin_unlock_irq(&gc->wq_lock);
>
>  	return ret;
>  }
> @@ -491,9 +491,9 @@ void i915_guc_wq_unreserve(struct drm_i915_gem_request *request)
>
>  	GEM_BUG_ON(READ_ONCE(gc->wq_rsvd) < wqi_size);
>
> -	spin_lock(&gc->wq_lock);
> +	spin_lock_irq(&gc->wq_lock);
>  	gc->wq_rsvd -= wqi_size;
> -	spin_unlock(&gc->wq_lock);
> +	spin_unlock_irq(&gc->wq_lock);
>  }
>
>  /* Construct a Work Item and append it to the GuC's Work Queue */
> @@ -644,7 +644,7 @@ static void i915_guc_submit(struct drm_i915_gem_request *rq)
>  	rq->previous_context = engine->last_context;
>  	engine->last_context = rq->ctx;
>
> -	i915_gem_request_submit(rq);
> +	__i915_gem_request_submit(rq);
>
>  	spin_lock(&client->wq_lock);
>  	guc_wq_item_append(client, rq);
> @@ -665,6 +665,70 @@ static void i915_guc_submit(struct drm_i915_gem_request *rq)
>  	spin_unlock(&client->wq_lock);
>  }

This confused me at first, until I noticed that engine->submit_request 
will later become execlist_submit_request. Perhaps it would be good to 
rename a number of things now, for example engine->request_queue, 
intel_engine_submit_request and maybe more?
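Roughly what I have in mind (the names are purely illustrative, not 
taken from the patch):

	/* sketch only -- hypothetical renames */
	struct intel_engine_cs {
		...
		/* queue a request with the scheduler
		 * (was engine->submit_request) */
		void (*request_queue)(struct drm_i915_gem_request *rq);
	};

	/* hand a queued request to the backend
	 * (was i915_gem_request_submit) */
	void intel_engine_submit_request(struct drm_i915_gem_request *rq);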

>
> +static bool i915_guc_dequeue(struct intel_engine_cs *engine)
> +{
> +	struct execlist_port *port = engine->execlist_port;
> +	struct drm_i915_gem_request *last = port[0].request;
> +	unsigned long flags;
> +	struct rb_node *rb;
> +	bool submit = false;
> +
> +	spin_lock_irqsave(&engine->timeline->lock, flags);
> +	rb = engine->execlist_first;
> +	while (rb) {
> +		struct drm_i915_gem_request *cursor =
> +			rb_entry(rb, typeof(*cursor), priotree.node);
> +
> +		if (last && cursor->ctx != last->ctx) {

Not sure if GVT comes into the picture here, but it does not sound like 
it would do any harm to use can_merge_ctx here?
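Something like this is what I mean, assuming can_merge_ctx() from 
intel_lrc.c were made visible to the GuC path (sketch only):

	/* sketch: reuse the execlists merge rule instead of
	 * open-coding the context comparison */
	if (last && !can_merge_ctx(cursor->ctx, last->ctx)) {
		if (port != engine->execlist_port)
			break;

		i915_gem_request_assign(&port->request, last);
		dma_fence_enable_sw_signaling(&last->fence);
		port++;
	}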

> +			if (port != engine->execlist_port)
> +				break;

It may be overkill for the first version, but I was thinking that we 
don't have to limit it to two at a time. It would depend on measuring, 
of course, but perhaps it would make sense to generalise the number of 
supported ports straight away.
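For example (a hypothetical sketch; the port count and names are made 
up and would need measuring):

	/* sketch: make the number of ports a constant instead of the
	 * implicit two */
	#define EXECLIST_MAX_PORTS 4 /* made-up value */

	struct intel_engine_cs {
		...
		struct execlist_port execlist_port[EXECLIST_MAX_PORTS];
	};

	/* the dequeue loop would then stop once the last port is
	 * occupied: */
	if (last && !can_merge_ctx(cursor->ctx, last->ctx)) {
		if (port == &engine->execlist_port[EXECLIST_MAX_PORTS - 1])
			break;
		...
	}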

> +
> +			i915_gem_request_assign(&port->request, last);
> +			dma_fence_enable_sw_signaling(&last->fence);
> +			port++;
> +		}
> +
> +		rb = rb_next(rb);
> +		rb_erase(&cursor->priotree.node, &engine->execlist_queue);
> +		RB_CLEAR_NODE(&cursor->priotree.node);
> +		cursor->priotree.priority = INT_MAX;
> +
> +		i915_guc_submit(cursor);
> +		last = cursor;
> +		submit = true;
> +	}
> +	if (submit) {
> +		i915_gem_request_assign(&port->request, last);
> +		dma_fence_enable_sw_signaling(&last->fence);
> +		engine->execlist_first = rb;
> +	}
> +	spin_unlock_irqrestore(&engine->timeline->lock, flags);
> +
> +	return submit;
> +}

We could theoretically share most of execlist_dequeue and just do a 
couple of things differently depending on the mode.

Looks like one of those differences could be a new engine->submit_ports 
vfunc. There is also the lite restore WA and sw signalling to design in 
nicely, but it may be worth sharing the code. The shared function could 
then be renamed to something like scheduler_dequeue.
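Very roughly (the vfunc name and the split are illustrative only):

	/* sketch: common dequeue with a per-backend submission hook */
	struct intel_engine_cs {
		...
		void (*submit_ports)(struct intel_engine_cs *engine);
	};

	static void scheduler_dequeue(struct intel_engine_cs *engine)
	{
		/* shared rbtree walk, port assignment, lite restore
		 * handling, sw signalling ... */

		if (submit)
			/* ELSP writes for execlists, wq item plus
			 * doorbell for the GuC */
			engine->submit_ports(engine);
	}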

> +
> +static void i915_guc_irq_handler(unsigned long data)
> +{
> +	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
> +	struct execlist_port *port = engine->execlist_port;
> +	struct drm_i915_gem_request *rq;
> +	bool submit;
> +
> +	do {
> +		rq = port[0].request;
> +		while (rq && i915_gem_request_completed(rq)) {
> +			i915_gem_request_put(rq);
> +			rq = port[1].request;
> +			port[0].request = rq;
> +			port[1].request = NULL;
> +		}
> +
> +		submit = false;
> +		if (!port[1].request)
> +			submit = i915_guc_dequeue(engine);
> +	} while (submit);
> +}
> +
>  /*
>   * Everything below here is concerned with setup & teardown, and is
>   * therefore not part of the somewhat time-critical batch-submission
> @@ -1531,16 +1595,13 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
>
>  	/* Take over from manual control of ELSP (execlists) */
>  	for_each_engine(engine, dev_priv, id) {
> -		engine->submit_request = i915_guc_submit;
> -		engine->schedule = NULL;
> +		tasklet_init(&engine->irq_tasklet,
> +			     i915_guc_irq_handler,
> +			     (unsigned long)engine);
>
>  		/* Replay the current set of previously submitted requests */
> -		list_for_each_entry(request,
> -				    &engine->timeline->requests, link) {
> +		list_for_each_entry(request, &engine->timeline->requests, link)
>  			client->wq_rsvd += sizeof(struct guc_wq_item);
> -			if (i915_sw_fence_done(&request->submit))
> -				i915_guc_submit(request);
> -		}
>  	}
>
>  	return 0;
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index cb8a75f6ca16..18dce4c66d56 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1341,8 +1341,10 @@ static void snb_gt_irq_handler(struct drm_i915_private *dev_priv,
>  static __always_inline void
>  gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir, int test_shift)
>  {
> -	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift))
> +	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift)) {
> +		tasklet_schedule(&engine->irq_tasklet);

This would be better made conditional on GuC submission, to avoid 
(occasionally) calling tasklet_schedule twice in execlists mode.
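Something along these lines, assuming the existing 
i915.enable_guc_submission modparam is the right gate (sketch only):

	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift)) {
		/* execlists schedules the tasklet from the
		 * context-switch interrupt below, so only poke it
		 * here for the GuC */
		if (i915.enable_guc_submission)
			tasklet_schedule(&engine->irq_tasklet);
		notify_ring(engine);
	}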

>  		notify_ring(engine);
> +	}
>  	if (iir & (GT_CONTEXT_SWITCH_INTERRUPT << test_shift))
>  		tasklet_schedule(&engine->irq_tasklet);
>  }
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index d13a335ad83a..ffab255e55a7 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -1425,9 +1425,6 @@ static void reset_common_ring(struct intel_engine_cs *engine,
>  	request->ring->last_retired_head = -1;
>  	intel_ring_update_space(request->ring);
>
> -	if (i915.enable_guc_submission)
> -		return;
> -
>  	/* Catch up with any missed context-switch interrupts */
>  	I915_WRITE(RING_CONTEXT_STATUS_PTR(engine), _MASKED_FIELD(0xffff, 0));
>  	if (request->ctx != port[0].request->ctx) {
>

Regards,

Tvrtko

