[Intel-gfx] [PATCH v2 03/11] drm/i915: Defer transfer onto execution timeline to actual hw submission

Thu Nov 10 10:43:29 UTC 2016

On 07/11/2016 13:59, Chris Wilson wrote:
> Defer the transfer from the client's timeline onto the execution
> timeline from the point of readiness to the point of actual submission.
> For example, in execlists, a request is finally submitted to hardware
> when the hardware is ready, and only put onto the hardware queue when
> the request is ready. By deferring the transfer, we ensure that the
> timeline is maintained in retirement order if we decide to queue the
> requests onto the hardware in a different order than fifo.
>
> v2: Rebased onto distinct global/user timeline lock classes.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_gem_request.c    | 31 +++++++++++++++++-------------
>  drivers/gpu/drm/i915/i915_gem_request.h    |  2 ++
>  drivers/gpu/drm/i915/i915_guc_submission.c | 14 +++++++++++++-
>  drivers/gpu/drm/i915/intel_lrc.c           | 23 +++++++++++++---------
>  drivers/gpu/drm/i915/intel_ringbuffer.c    |  2 ++
>  5 files changed, 49 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index e41d51a68ed8..19c29fafb07a 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -307,25 +307,16 @@ static u32 timeline_get_seqno(struct i915_gem_timeline *tl)
>  	return atomic_inc_return(&tl->next_seqno);
>  }
>
> -static int __i915_sw_fence_call
> -submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
> +void __i915_gem_request_submit(struct drm_i915_gem_request *request)
>  {
> -	struct drm_i915_gem_request *request =
> -		container_of(fence, typeof(*request), submit);
>  	struct intel_engine_cs *engine = request->engine;
>  	struct intel_timeline *timeline;
> -	unsigned long flags;
>  	u32 seqno;
>
> -	if (state != FENCE_COMPLETE)
> -		return NOTIFY_DONE;
> -
>  	/* Transfer from per-context onto the global per-engine timeline */
>  	timeline = engine->timeline;
>  	GEM_BUG_ON(timeline == request->timeline);
> -
> -	/* Will be called from irq-context when using foreign DMA fences */
> -	spin_lock_irqsave(&timeline->lock, flags);
> +	assert_spin_locked(&timeline->lock);
>
>  	seqno = timeline_get_seqno(timeline->common);
>  	GEM_BUG_ON(!seqno);
> @@ -345,15 +336,29 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
>  	GEM_BUG_ON(!request->global_seqno);
>  	engine->emit_breadcrumb(request,
>  				request->ring->vaddr + request->postfix);
> -	engine->submit_request(request);
>
>  	spin_lock(&request->timeline->lock);
>  	list_move_tail(&request->link, &timeline->requests);
>  	spin_unlock(&request->timeline->lock);
>
>  	i915_sw_fence_commit(&request->execute);
> +}
>
> -	spin_unlock_irqrestore(&timeline->lock, flags);
> +static int __i915_sw_fence_call
> +submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
> +{
> +	if (state == FENCE_COMPLETE) {
> +		struct drm_i915_gem_request *request =
> +			container_of(fence, typeof(*request), submit);
> +		struct intel_engine_cs *engine = request->engine;
> +		unsigned long flags;
> +
> +		/* Will be called from irq-context when using foreign fences. */
> +		spin_lock_irqsave_nested(&engine->timeline->lock, flags,
> +					 SINGLE_DEPTH_NESTING);
> +		engine->submit_request(request);
> +		spin_unlock_irqrestore(&engine->timeline->lock, flags);

Would it be cleaner to move the lock taking to engine->submit_request?

And is the _nested lock annotation still required? I thought you said it
is not. I can't find any signalling done under the timeline lock either.

Regards,

Tvrtko

> +	}
>
>  	return NOTIFY_DONE;
>  }
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
> index c8547d9b9004..d8904863d3d9 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.h
> +++ b/drivers/gpu/drm/i915/i915_gem_request.h
> @@ -241,6 +241,8 @@ void __i915_add_request(struct drm_i915_gem_request *req, bool flush_caches);
>  #define i915_add_request_no_flush(req) \
>  	__i915_add_request(req, false)
>
> +void __i915_gem_request_submit(struct drm_i915_gem_request *request);
> +
>  struct intel_rps_client;
>  #define NO_WAITBOOST ERR_PTR(-1)
>  #define IS_RPS_CLIENT(p) (!IS_ERR(p))
> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> index 666dab7a675a..83438c6a8864 100644
> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> @@ -629,11 +629,23 @@ static int guc_ring_doorbell(struct i915_guc_client *gc)
>  static void i915_guc_submit(struct drm_i915_gem_request *rq)
>  {
>  	struct drm_i915_private *dev_priv = rq->i915;
> -	unsigned int engine_id = rq->engine->id;
> +	struct intel_engine_cs *engine = rq->engine;
> +	unsigned int engine_id = engine->id;
>  	struct intel_guc *guc = &rq->i915->guc;
>  	struct i915_guc_client *client = guc->execbuf_client;
>  	int b_ret;
>
> +	/* We keep the previous context alive until we retire the following
> +	 * request. This ensures that the context object is still pinned
> +	 * for any residual writes the HW makes into it on the context switch
> +	 * into the next object following the breadcrumb. Otherwise, we may
> +	 * retire the context too early.
> +	 */
> +	rq->previous_context = engine->last_context;
> +	engine->last_context = rq->ctx;
> +
> +	__i915_gem_request_submit(rq);
> +
>  	spin_lock(&client->wq_lock);
>  	guc_wq_item_append(client, rq);
>
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index fa3012c342cc..fa4920e72e93 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -434,6 +434,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  {
>  	struct drm_i915_gem_request *cursor, *last;
>  	struct execlist_port *port = engine->execlist_port;
> +	unsigned long flags;
>  	bool submit = false;
>
>  	last = port->request;
> @@ -469,6 +470,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  	 * and context switches) submission.
>  	 */
>
> +	spin_lock_irqsave(&engine->timeline->lock, flags);
>  	spin_lock(&engine->execlist_lock);
>  	list_for_each_entry(cursor, &engine->execlist_queue, execlist_link) {
>  		/* Can we combine this request with the current port? It has to
> @@ -501,6 +503,17 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  			i915_gem_request_assign(&port->request, last);
>  			port++;
>  		}
> +
> +		/* We keep the previous context alive until we retire the
> +		 * following request. This ensures that the context object
> +		 * is still pinned for any residual writes the HW makes into it
> +		 * on the context switch into the next object following the
> +		 * breadcrumb. Otherwise, we may retire the context too early.
> +		 */
> +		cursor->previous_context = engine->last_context;
> +		engine->last_context = cursor->ctx;
> +
> +		__i915_gem_request_submit(cursor);
>  		last = cursor;
>  		submit = true;
>  	}
> @@ -512,6 +525,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  		i915_gem_request_assign(&port->request, last);
>  	}
>  	spin_unlock(&engine->execlist_lock);
> +	spin_unlock_irqrestore(&engine->timeline->lock, flags);
>
>  	if (submit)
>  		execlists_submit_ports(engine);
> @@ -599,15 +613,6 @@ static void execlists_submit_request(struct drm_i915_gem_request *request)
>
>  	spin_lock_irqsave(&engine->execlist_lock, flags);
>
> -	/* We keep the previous context alive until we retire the following
> -	 * request. This ensures that any the context object is still pinned
> -	 * for any residual writes the HW makes into it on the context switch
> -	 * into the next object following the breadcrumb. Otherwise, we may
> -	 * retire the context too early.
> -	 */
> -	request->previous_context = engine->last_context;
> -	engine->last_context = request->ctx;
> -
>  	list_add_tail(&request->execlist_link, &engine->execlist_queue);
>  	if (execlists_elsp_idle(engine))
>  		tasklet_hi_schedule(&engine->irq_tasklet);
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 700e93d80616..f91ee24e2763 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -1294,6 +1294,8 @@ static void i9xx_submit_request(struct drm_i915_gem_request *request)
>  {
>  	struct drm_i915_private *dev_priv = request->i915;
>
> +	__i915_gem_request_submit(request);
> +
>  	I915_WRITE_TAIL(request->engine, request->tail);
>  }
>
>