[Intel-gfx] [PATCH v8] drm/i915/scheduler: emulate a scheduler for guc

Zheng, Xiao xiao.zheng at intel.com
Tue Feb 14 01:08:54 UTC 2017


Hi Chris
Just checking with you on whether this patch will be checked in, because it is essential for GVT once GuC submission is enabled.

> -----Original Message-----
> From: Intel-gfx [mailto:intel-gfx-bounces at lists.freedesktop.org] On Behalf
> Of Zheng, Xiao
> Sent: Monday, February 13, 2017 4:06 PM
> To: Chris Wilson <chris at chris-wilson.co.uk>; intel-gfx at lists.freedesktop.org
> Cc: Wang, Zhenyu Z <zhenyu.z.wang at intel.com>
> Subject: Re: [Intel-gfx] [PATCH v8] drm/i915/scheduler: emulate a scheduler
> for guc
> 
> Thanks Chris.
> This is Xiao from GVT-g team. GVT-g is actually looking for this patch once
> GUC submission enabled in host/native side.  GVT-g requires a single
> submission (only one request in GUC FW anytime) to GUC FW because we
> need to do additional setup/cleanup for VM switch.
> One thing needs to be mentioned: we need an 'execlists_context_status_change'
> notification before i915_guc_submit(), just as we did for the ELSP port write.
> In any case, we can submit another patch for that based on this one.
> The question is:  when will this patch be merged to upstream?
> 
> > -----Original Message-----
> > From: Intel-gfx [mailto:intel-gfx-bounces at lists.freedesktop.org] On
> > Behalf Of Chris Wilson
> > Sent: Thursday, February 2, 2017 6:36 PM
> > To: intel-gfx at lists.freedesktop.org
> > Subject: [Intel-gfx] [PATCH v8] drm/i915/scheduler: emulate a
> > scheduler for guc
> >
> > This emulates execlists on top of the GuC in order to defer submission
> > of requests to the hardware. This deferral allows time for high
> > priority requests to gazump their way to the head of the queue,
> > however it nerfs the GuC by converting it back into a simple execlist
> > (where the CPU has to wake up after every request to feed new
> commands into the GuC).
> >
> > v2: Drop hack status - though iirc there is still a lockdep inversion
> > between fence and engine->timeline->lock (which is impossible as the
> > nesting only occurs on different fences - hopefully just requires some
> > judicious lockdep
> > annotation)
> > v3: Apply lockdep nesting to enabling signaling on the request, using
> > the pattern we already have in __i915_gem_request_submit();
> > v4: Replaying requests after a hang also now needs the timeline
> > spinlock, to disable the interrupts at least
> > v5: Hold wq lock for completeness, and emit a tracepoint for enabling
> > signal
> > v6: Reorder interrupt checking for a happier gcc.
> > v7: Only signal the tasklet after a user-interrupt if using guc
> > scheduling
> > v8: Restore lost update of rq through the i915_guc_irq_handler
> > (Tvrtko)
> >
> > Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin at linux.intel.com>
> > Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin at linux.intel.com>
> > ---
> >  drivers/gpu/drm/i915/i915_guc_submission.c | 110
> > +++++++++++++++++++++++++++--
> >  drivers/gpu/drm/i915/i915_irq.c            |  13 +++-
> >  drivers/gpu/drm/i915/intel_lrc.c           |   5 +-
> >  3 files changed, 114 insertions(+), 14 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c
> > b/drivers/gpu/drm/i915/i915_guc_submission.c
> > index 8ced9e26f075..d99185aafe58 100644
> > --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> > +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> > @@ -25,6 +25,8 @@
> >  #include "i915_drv.h"
> >  #include "intel_uc.h"
> >
> > +#include <trace/events/dma_fence.h>
> > +
> >  /**
> >   * DOC: GuC-based command submission
> >   *
> > @@ -348,7 +350,7 @@ int i915_guc_wq_reserve(struct
> > drm_i915_gem_request *request)
> >  	u32 freespace;
> >  	int ret;
> >
> > -	spin_lock(&client->wq_lock);
> > +	spin_lock_irq(&client->wq_lock);
> >  	freespace = CIRC_SPACE(client->wq_tail, desc->head, client-
> > >wq_size);
> >  	freespace -= client->wq_rsvd;
> >  	if (likely(freespace >= wqi_size)) { @@ -358,7 +360,7 @@ int
> > i915_guc_wq_reserve(struct drm_i915_gem_request *request)
> >  		client->no_wq_space++;
> >  		ret = -EAGAIN;
> >  	}
> > -	spin_unlock(&client->wq_lock);
> > +	spin_unlock_irq(&client->wq_lock);
> >
> >  	return ret;
> >  }
> > @@ -370,9 +372,9 @@ void i915_guc_wq_unreserve(struct
> > drm_i915_gem_request *request)
> >
> >  	GEM_BUG_ON(READ_ONCE(client->wq_rsvd) < wqi_size);
> >
> > -	spin_lock(&client->wq_lock);
> > +	spin_lock_irq(&client->wq_lock);
> >  	client->wq_rsvd -= wqi_size;
> > -	spin_unlock(&client->wq_lock);
> > +	spin_unlock_irq(&client->wq_lock);
> >  }
> >
> >  /* Construct a Work Item and append it to the GuC's Work Queue */ @@
> > -
> > 532,10 +534,97 @@ static void __i915_guc_submit(struct
> > drm_i915_gem_request *rq)
> >
> >  static void i915_guc_submit(struct drm_i915_gem_request *rq)  {
> > -	i915_gem_request_submit(rq);
> > +	__i915_gem_request_submit(rq);
> >  	__i915_guc_submit(rq);
> >  }
> >
> > +static void nested_enable_signaling(struct drm_i915_gem_request *rq) {
> > +	/* If we use dma_fence_enable_sw_signaling() directly, lockdep
> > +	 * detects an ordering issue between the fence lockclass and the
> > +	 * global_timeline. This circular dependency can only occur via 2
> > +	 * different fences (but same fence lockclass), so we use the nesting
> > +	 * annotation here to prevent the warn, equivalent to the nesting
> > +	 * inside i915_gem_request_submit() for when we also enable the
> > +	 * signaler.
> > +	 */
> > +
> > +	if (test_and_set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
> > +			     &rq->fence.flags))
> > +		return;
> > +
> > +	GEM_BUG_ON(test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &rq-
> > >fence.flags));
> > +	trace_dma_fence_enable_signal(&rq->fence);
> > +
> > +	spin_lock_nested(&rq->lock, SINGLE_DEPTH_NESTING);
> > +	intel_engine_enable_signaling(rq);
> > +	spin_unlock(&rq->lock);
> > +}
> > +
> > +static bool i915_guc_dequeue(struct intel_engine_cs *engine) {
> > +	struct execlist_port *port = engine->execlist_port;
> > +	struct drm_i915_gem_request *last = port[0].request;
> > +	unsigned long flags;
> > +	struct rb_node *rb;
> > +	bool submit = false;
> > +
> > +	spin_lock_irqsave(&engine->timeline->lock, flags);
> > +	rb = engine->execlist_first;
> > +	while (rb) {
> > +		struct drm_i915_gem_request *cursor =
> > +			rb_entry(rb, typeof(*cursor), priotree.node);
> > +
> > +		if (last && cursor->ctx != last->ctx) {
> > +			if (port != engine->execlist_port)
> > +				break;
> > +
> > +			i915_gem_request_assign(&port->request, last);
> > +			nested_enable_signaling(last);
> > +			port++;
> > +		}
> > +
> > +		rb = rb_next(rb);
> > +		rb_erase(&cursor->priotree.node, &engine-
> > >execlist_queue);
> > +		RB_CLEAR_NODE(&cursor->priotree.node);
> > +		cursor->priotree.priority = INT_MAX;
> > +
> > +		i915_guc_submit(cursor);
> > +		last = cursor;
> > +		submit = true;
> > +	}
> > +	if (submit) {
> > +		i915_gem_request_assign(&port->request, last);
> > +		nested_enable_signaling(last);
> > +		engine->execlist_first = rb;
> > +	}
> > +	spin_unlock_irqrestore(&engine->timeline->lock, flags);
> > +
> > +	return submit;
> > +}
> > +
> > +static void i915_guc_irq_handler(unsigned long data) {
> > +	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
> > +	struct execlist_port *port = engine->execlist_port;
> > +	struct drm_i915_gem_request *rq;
> > +	bool submit;
> > +
> > +	do {
> > +		rq = port[0].request;
> > +		while (rq && i915_gem_request_completed(rq)) {
> > +			i915_gem_request_put(rq);
> > +			port[0].request = port[1].request;
> > +			port[1].request = NULL;
> > +			rq = port[0].request;
> > +		}
> > +
> > +		submit = false;
> > +		if (!port[1].request)
> > +			submit = i915_guc_dequeue(engine);
> > +	} while (submit);
> > +}
> > +
> >  /*
> >   * Everything below here is concerned with setup & teardown, and is
> >   * therefore not part of the somewhat time-critical batch-submission
> > @@ -
> > 944,15 +1033,22 @@ int i915_guc_submission_enable(struct
> > drm_i915_private *dev_priv)
> >  	/* Take over from manual control of ELSP (execlists) */
> >  	for_each_engine(engine, dev_priv, id) {
> >  		struct drm_i915_gem_request *rq;
> > +		unsigned long flags;
> >
> > -		engine->submit_request = i915_guc_submit;
> > -		engine->schedule = NULL;
> > +		tasklet_init(&engine->irq_tasklet,
> > +			     i915_guc_irq_handler,
> > +			     (unsigned long)engine);
> >
> >  		/* Replay the current set of previously submitted requests */
> > +		spin_lock_irqsave(&engine->timeline->lock, flags);
> >  		list_for_each_entry(rq, &engine->timeline->requests, link) {
> > +			spin_lock(&client->wq_lock);
> >  			client->wq_rsvd += sizeof(struct guc_wq_item);
> > +			spin_unlock(&client->wq_lock);
> > +
> >  			__i915_guc_submit(rq);
> >  		}
> > +		spin_unlock_irqrestore(&engine->timeline->lock, flags);
> >  	}
> >
> >  	return 0;
> > diff --git a/drivers/gpu/drm/i915/i915_irq.c
> > b/drivers/gpu/drm/i915/i915_irq.c index 436b5baf38c2..72f9497518bc
> > 100644
> > --- a/drivers/gpu/drm/i915/i915_irq.c
> > +++ b/drivers/gpu/drm/i915/i915_irq.c
> > @@ -1350,13 +1350,20 @@ static void snb_gt_irq_handler(struct
> > drm_i915_private *dev_priv,  static __always_inline void
> > gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir, int test_shift)
> {
> > -	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift))
> > -		notify_ring(engine);
> > +	bool tasklet = false;
> >
> >  	if (iir & (GT_CONTEXT_SWITCH_INTERRUPT << test_shift)) {
> >  		set_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
> > -		tasklet_hi_schedule(&engine->irq_tasklet);
> > +		tasklet = true;
> >  	}
> > +
> > +	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift)) {
> > +		notify_ring(engine);
> > +		tasklet |= i915.enable_guc_submission;
> > +	}
> > +
> > +	if (tasklet)
> > +		tasklet_hi_schedule(&engine->irq_tasklet);
> >  }
> >
> >  static irqreturn_t gen8_gt_irq_ack(struct drm_i915_private *dev_priv,
> > diff --git a/drivers/gpu/drm/i915/intel_lrc.c
> > b/drivers/gpu/drm/i915/intel_lrc.c
> > index 44a92ea464ba..c81d86afb52f 100644
> > --- a/drivers/gpu/drm/i915/intel_lrc.c
> > +++ b/drivers/gpu/drm/i915/intel_lrc.c
> > @@ -1278,7 +1278,7 @@ static int gen8_init_common_ring(struct
> > intel_engine_cs *engine)
> >
> >  	/* After a GPU reset, we may have requests to replay */
> >  	clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
> > -	if (!execlists_elsp_idle(engine)) {
> > +	if (!i915.enable_guc_submission && !execlists_elsp_idle(engine)) {
> >  		engine->execlist_port[0].count = 0;
> >  		engine->execlist_port[1].count = 0;
> >  		execlists_submit_ports(engine);
> > @@ -1345,9 +1345,6 @@ static void reset_common_ring(struct
> > intel_engine_cs *engine,
> >  	request->ring->last_retired_head = -1;
> >  	intel_ring_update_space(request->ring);
> >
> > -	if (i915.enable_guc_submission)
> > -		return;
> > -
> >  	/* Catch up with any missed context-switch interrupts */
> >  	if (request->ctx != port[0].request->ctx) {
> >  		i915_gem_request_put(port[0].request);
> > --
> > 2.11.0
> >
> > _______________________________________________
> > Intel-gfx mailing list
> > Intel-gfx at lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/intel-gfx
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx


More information about the Intel-gfx mailing list