[Intel-gfx] [PATCH 1/4] drm/i915: Unify execlist and legacy request life-cycles

Wed Oct 7 09:03:48 PDT 2015

On Tue, Oct 06, 2015 at 03:52:01PM +0100, Nick Hoath wrote:
> There is a desire to simplify the i915 driver by reducing the number of
> different code paths introduced by the LRC / execlists support.  As the
> execlists request is now part of the gem request it is possible and
> desirable to unify the request life-cycles for execlist and legacy
> requests.
> 
> Added a context complete flag to a request which gets set during the
> context switch interrupt.
> 
> Added a function i915_gem_request_retireable().  A request is considered
> retireable if its seqno passed (i.e. the request has completed) and either
> it was never submitted to the ELSP or its context completed.  This ensures
> that context save is carried out before the last request for a context is
> considered retireable.  retire_requests_ring() now uses
> i915_gem_request_retireable() rather than request_complete() when deciding
> which requests to retire. Requests that were not waiting for a context
> switch interrupt (either as a result of being merged into a following
> request or by being a legacy request) will be considered retireable as
> soon as their seqno has passed.
> 
> Removed the extra request reference held for the execlist request.
> 
> Removed intel_execlists_retire_requests() and all references to
> intel_engine_cs.execlist_retired_req_list.
> 
> Moved context unpinning into retire_requests_ring() for now.  Further work
> is pending for the context pinning - this patch should allow us to use the
> active list to track context and ring buffer objects later.
> 
> Changed gen8_cs_irq_handler() so that notify_ring() is called when
> contexts complete as well as when a user interrupt occurs so that
> notification happens when a request is complete and context save has
> finished.
> 
> v2: Rebase over the read-read optimisation changes
> 
> v3: Reworked IRQ handler after removing IRQ handler cleanup patch
> 
> v4: Fixed various pin leaks
> 
> Issue: VIZ-4277
> Signed-off-by: Thomas Daniel <thomas.daniel at intel.com>
> Signed-off-by: Nick Hoath <nicholas.hoath at intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h         |  6 +++
>  drivers/gpu/drm/i915/i915_gem.c         | 67 +++++++++++++++++++++------
>  drivers/gpu/drm/i915/i915_irq.c         | 81 +++++++++++++++++++++------------
>  drivers/gpu/drm/i915/intel_lrc.c        | 43 +++--------------
>  drivers/gpu/drm/i915/intel_lrc.h        |  2 +-
>  drivers/gpu/drm/i915/intel_ringbuffer.h |  1 -
>  6 files changed, 118 insertions(+), 82 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index fbf0ae9..3d217f9 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2262,6 +2262,12 @@ struct drm_i915_gem_request {
>  	/** Execlists no. of times this request has been sent to the ELSP */
>  	int elsp_submitted;
>  
> +	/**
> +	 * Execlists: whether this requests's context has completed after
> +	 * submission to the ELSP
> +	 */
> +	bool ctx_complete;
> +
>  };
>  
>  int i915_gem_request_alloc(struct intel_engine_cs *ring,
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 52642af..fc82171 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1386,6 +1386,24 @@ __i915_gem_request_retire__upto(struct drm_i915_gem_request *req)
>  				       typeof(*tmp), list);
>  
>  		i915_gem_request_retire(tmp);
> +
> +		if (i915.enable_execlists) {
> +			struct intel_context *ctx = tmp->ctx;
> +			struct drm_i915_private *dev_priv =
> +				engine->dev->dev_private;
> +			unsigned long flags;
> +			struct drm_i915_gem_object *ctx_obj =
> +				ctx->engine[engine->id].state;
> +
> +			spin_lock_irqsave(&engine->execlist_lock, flags);
> +
> +			if (ctx_obj && (ctx != engine->default_context))
> +				intel_lr_context_unpin(tmp);
> +
> +			intel_runtime_pm_put(dev_priv);
> +			spin_unlock_irqrestore(&engine->execlist_lock, flags);
> +		}
> +
>  	} while (tmp != req);
>  
>  	WARN_ON(i915_verify_lists(engine->dev));
> @@ -2359,6 +2377,12 @@ void i915_vma_move_to_active(struct i915_vma *vma,
>  	list_move_tail(&vma->mm_list, &vma->vm->active_list);
>  }
>  
> +static bool i915_gem_request_retireable(struct drm_i915_gem_request *req)
> +{
> +	return (i915_gem_request_completed(req, true) &&
> +		(!req->elsp_submitted || req->ctx_complete));
> +}
> +
>  static void
>  i915_gem_object_retire__write(struct drm_i915_gem_object *obj)
>  {
> @@ -2829,10 +2853,28 @@ i915_gem_retire_requests_ring(struct intel_engine_cs *ring)
>  					   struct drm_i915_gem_request,
>  					   list);
>  
> -		if (!i915_gem_request_completed(request, true))
> +		if (!i915_gem_request_retireable(request))
>  			break;
>  
>  		i915_gem_request_retire(request);
> +
> +		if (i915.enable_execlists) {
> +			struct intel_context *ctx = request->ctx;
> +			struct drm_i915_private *dev_priv =
> +				ring->dev->dev_private;
> +			unsigned long flags;
> +			struct drm_i915_gem_object *ctx_obj =
> +				ctx->engine[ring->id].state;
> +
> +			spin_lock_irqsave(&ring->execlist_lock, flags);
> +
> +			if (ctx_obj && (ctx != ring->default_context))
> +				intel_lr_context_unpin(request);
> +
> +			intel_runtime_pm_put(dev_priv);
> +			spin_unlock_irqrestore(&ring->execlist_lock, flags);
> +		}
> +
>  	}
>  
>  	/* Move any buffers on the active list that are no longer referenced
> @@ -2848,12 +2890,14 @@ i915_gem_retire_requests_ring(struct intel_engine_cs *ring)
>  
>  		if (!list_empty(&obj->last_read_req[ring->id]->list))
>  			break;
> +		if (!i915_gem_request_retireable(obj->last_read_req[ring->id]))
> +			break;
>  
>  		i915_gem_object_retire__read(obj, ring->id);
>  	}
>  
>  	if (unlikely(ring->trace_irq_req &&
> -		     i915_gem_request_completed(ring->trace_irq_req, true))) {
> +		     i915_gem_request_retireable(ring->trace_irq_req))) {
>  		ring->irq_put(ring);
>  		i915_gem_request_assign(&ring->trace_irq_req, NULL);
>  	}
> @@ -2872,15 +2916,6 @@ i915_gem_retire_requests(struct drm_device *dev)
>  	for_each_ring(ring, dev_priv, i) {
>  		i915_gem_retire_requests_ring(ring);
>  		idle &= list_empty(&ring->request_list);
> -		if (i915.enable_execlists) {
> -			unsigned long flags;
> -
> -			spin_lock_irqsave(&ring->execlist_lock, flags);
> -			idle &= list_empty(&ring->execlist_queue);
> -			spin_unlock_irqrestore(&ring->execlist_lock, flags);
> -
> -			intel_execlists_retire_requests(ring);
> -		}
>  	}
>  
>  	if (idle)
> @@ -2956,12 +2991,14 @@ i915_gem_object_flush_active(struct drm_i915_gem_object *obj)
>  		if (req == NULL)
>  			continue;
>  
> -		if (list_empty(&req->list))
> -			goto retire;
> +		if (list_empty(&req->list)) {
> +			if (i915_gem_request_retireable(req))
> +				i915_gem_object_retire__read(obj, i);
> +			continue;
> +		}
>  
> -		if (i915_gem_request_completed(req, true)) {
> +		if (i915_gem_request_retireable(req)) {
>  			__i915_gem_request_retire__upto(req);
> -retire:
>  			i915_gem_object_retire__read(obj, i);
>  		}
>  	}
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 8ca772d..ebbafac 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1291,66 +1291,91 @@ static irqreturn_t gen8_gt_irq_handler(struct drm_i915_private *dev_priv,
>  				       u32 master_ctl)
>  {
>  	irqreturn_t ret = IRQ_NONE;
> +	bool need_notify;
>  
>  	if (master_ctl & (GEN8_GT_RCS_IRQ | GEN8_GT_BCS_IRQ)) {
> -		u32 tmp = I915_READ_FW(GEN8_GT_IIR(0));
> -		if (tmp) {
> -			I915_WRITE_FW(GEN8_GT_IIR(0), tmp);
> +		u32 iir = I915_READ_FW(GEN8_GT_IIR(0));
> +
> +		if (iir) {
> +			I915_WRITE_FW(GEN8_GT_IIR(0), iir);
>  			ret = IRQ_HANDLED;
>  
> -			if (tmp & (GT_CONTEXT_SWITCH_INTERRUPT << GEN8_RCS_IRQ_SHIFT))
> -				intel_lrc_irq_handler(&dev_priv->ring[RCS]);
> -			if (tmp & (GT_RENDER_USER_INTERRUPT << GEN8_RCS_IRQ_SHIFT))
> +			need_notify = false;
> +			if (iir & (GT_CONTEXT_SWITCH_INTERRUPT <<
> +					GEN8_RCS_IRQ_SHIFT))
> +				need_notify = intel_lrc_irq_handler(
> +						&dev_priv->ring[RCS]);
> +			if (iir & (GT_RENDER_USER_INTERRUPT <<
> +					GEN8_RCS_IRQ_SHIFT) || need_notify)
>  				notify_ring(&dev_priv->ring[RCS]);
>  
> -			if (tmp & (GT_CONTEXT_SWITCH_INTERRUPT << GEN8_BCS_IRQ_SHIFT))
> -				intel_lrc_irq_handler(&dev_priv->ring[BCS]);
> -			if (tmp & (GT_RENDER_USER_INTERRUPT << GEN8_BCS_IRQ_SHIFT))
> +			need_notify = false;
> +			if (iir & (GT_CONTEXT_SWITCH_INTERRUPT <<
> +					GEN8_BCS_IRQ_SHIFT))
> +				need_notify = intel_lrc_irq_handler(
> +						&dev_priv->ring[BCS]);
> +			if (iir & (GT_RENDER_USER_INTERRUPT <<
> +					GEN8_BCS_IRQ_SHIFT) || need_notify)
>  				notify_ring(&dev_priv->ring[BCS]);
>  		} else
>  			DRM_ERROR("The master control interrupt lied (GT0)!\n");
>  	}
>  
>  	if (master_ctl & (GEN8_GT_VCS1_IRQ | GEN8_GT_VCS2_IRQ)) {
> -		u32 tmp = I915_READ_FW(GEN8_GT_IIR(1));
> -		if (tmp) {
> -			I915_WRITE_FW(GEN8_GT_IIR(1), tmp);
> +		u32 iir = I915_READ_FW(GEN8_GT_IIR(1));
> +
> +		if (iir) {
> +			I915_WRITE_FW(GEN8_GT_IIR(1), iir);
>  			ret = IRQ_HANDLED;
>  
> -			if (tmp & (GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS1_IRQ_SHIFT))
> -				intel_lrc_irq_handler(&dev_priv->ring[VCS]);
> -			if (tmp & (GT_RENDER_USER_INTERRUPT << GEN8_VCS1_IRQ_SHIFT))
> +			need_notify = false;
> +			if (iir & (GT_CONTEXT_SWITCH_INTERRUPT <<
> +					GEN8_VCS1_IRQ_SHIFT))
> +				need_notify = intel_lrc_irq_handler(
> +						&dev_priv->ring[VCS]);
> +			if (iir & (GT_RENDER_USER_INTERRUPT <<
> +					GEN8_VCS1_IRQ_SHIFT) || need_notify)
>  				notify_ring(&dev_priv->ring[VCS]);
>  
> -			if (tmp & (GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS2_IRQ_SHIFT))
> -				intel_lrc_irq_handler(&dev_priv->ring[VCS2]);
> -			if (tmp & (GT_RENDER_USER_INTERRUPT << GEN8_VCS2_IRQ_SHIFT))
> +			need_notify = false;
> +			if (iir & (GT_CONTEXT_SWITCH_INTERRUPT <<
> +					GEN8_VCS2_IRQ_SHIFT))
> +				need_notify = intel_lrc_irq_handler(
> +						&dev_priv->ring[VCS2]);
> +			if (iir & (GT_RENDER_USER_INTERRUPT <<
> +					GEN8_VCS2_IRQ_SHIFT) || need_notify)
>  				notify_ring(&dev_priv->ring[VCS2]);
>  		} else
>  			DRM_ERROR("The master control interrupt lied (GT1)!\n");
>  	}
>  
>  	if (master_ctl & GEN8_GT_VECS_IRQ) {
> -		u32 tmp = I915_READ_FW(GEN8_GT_IIR(3));
> -		if (tmp) {
> -			I915_WRITE_FW(GEN8_GT_IIR(3), tmp);
> +		u32 iir = I915_READ_FW(GEN8_GT_IIR(3));
> +
> +		if (iir) {
> +			I915_WRITE_FW(GEN8_GT_IIR(3), iir);
>  			ret = IRQ_HANDLED;
>  
> -			if (tmp & (GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VECS_IRQ_SHIFT))
> -				intel_lrc_irq_handler(&dev_priv->ring[VECS]);
> -			if (tmp & (GT_RENDER_USER_INTERRUPT << GEN8_VECS_IRQ_SHIFT))
> +			need_notify = false;
> +			if (iir & (GT_CONTEXT_SWITCH_INTERRUPT <<
> +					GEN8_VECS_IRQ_SHIFT))
> +				need_notify = intel_lrc_irq_handler(
> +						&dev_priv->ring[VECS]);
> +			if (iir & (GT_RENDER_USER_INTERRUPT <<
> +					GEN8_VECS_IRQ_SHIFT) || need_notify)
>  				notify_ring(&dev_priv->ring[VECS]);
>  		} else
>  			DRM_ERROR("The master control interrupt lied (GT3)!\n");
>  	}
>  
>  	if (master_ctl & GEN8_GT_PM_IRQ) {
> -		u32 tmp = I915_READ_FW(GEN8_GT_IIR(2));
> -		if (tmp & dev_priv->pm_rps_events) {
> +		u32 iir = I915_READ_FW(GEN8_GT_IIR(2));
> +
> +		if (iir & dev_priv->pm_rps_events) {
>  			I915_WRITE_FW(GEN8_GT_IIR(2),
> -				      tmp & dev_priv->pm_rps_events);
> +				      iir & dev_priv->pm_rps_events);
>  			ret = IRQ_HANDLED;
> -			gen6_rps_irq_handler(dev_priv, tmp);
> +			gen6_rps_irq_handler(dev_priv, iir);
>  		} else
>  			DRM_ERROR("The master control interrupt lied (PM)!\n");
>  	}

I had a really hard time reading this patch until I realized what you've
done. And this is definitely getting too complicated. I think it'd be good
to first extract a little helper which takes the iir and the shift and
then calls both intel_lrc_irq_handler and notify ring.

Also if you do large-scale replacement like s/tmp/iir/ that needs to be a
separate patch.

With that technicality out of the way let's look at the larger picture.
Let me first describe in more detail how the legacy ctx retiring works,
since it doesn't work like what you've done here really. Let's look at the
case where we have some batch running on ctx A and then switch to a batch
running on ctx B:

| currently ctx A active on the hw
| MI_BB_START/flushes/...
| seqno write + interrupt for the batch just submitted, tracked in request 1
<- ctx A is still used by the hardware and not flushed out
| MI_SET_CTX to switch to B
| batch/flush
| seqno write/irq for request 2
v

What you seem to be doing is changing the logical retire point for request
1 to include the MI_SET_CTX (but in execlist terms, i.e. when the hw has
confirmed the ctx switch to the next lrc in the elsp).

But what the legacy stuff does is wait until request 2 with retiring the
ctx object for ctx B. So what happens in that context switch function
(which is called synchronously from execbuf ioctl code) is that we
directly unpin ctx A and pin ctx B. And then we throw the ctx A object
onto the active list with request 2 attached as a write fence to make sure
the eviction code waits for request 2 to complete, which necessarily means
the ctx switch has completed.

Of course this isn't entirely perfect since we could evict ctx A even
before request 2 has completed, but since throwing out active objects is
really a last resort thing that doesn't really matter.

Now we could do the exact same design for execlist since we never reorder
execlist ctx in upstream. And that's what I originally suggested. But that
would seriously piss of John with his scheduler patches.

Instead I think the proper design would be to create a new, separate
request object every time we submit a new batch for a different context
(we don't want to do this unconditionally for overheads) which will
complete exactly when the ctx switch completes. Since we have full-blown
request abstractio nowadays that should be possible without touching
generic code or adding special-cases to request or active object tracking
code.
-Daniel

> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 825fa7a..e8f5b6c 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -426,9 +426,8 @@ static void execlists_context_unqueue(struct intel_engine_cs *ring)
>  			/* Same ctx: ignore first request, as second request
>  			 * will update tail past first request's workload */
>  			cursor->elsp_submitted = req0->elsp_submitted;
> +			req0->elsp_submitted = 0;
>  			list_del(&req0->execlist_link);
> -			list_add_tail(&req0->execlist_link,
> -				&ring->execlist_retired_req_list);
>  			req0 = cursor;
>  		} else {
>  			req1 = cursor;
> @@ -478,11 +477,9 @@ static bool execlists_check_remove_request(struct intel_engine_cs *ring,
>  		if (intel_execlists_ctx_id(ctx_obj) == request_id) {
>  			WARN(head_req->elsp_submitted == 0,
>  			     "Never submitted head request\n");
> -
>  			if (--head_req->elsp_submitted <= 0) {
> +				head_req->ctx_complete = 1;
>  				list_del(&head_req->execlist_link);
> -				list_add_tail(&head_req->execlist_link,
> -					&ring->execlist_retired_req_list);
>  				return true;
>  			}
>  		}
> @@ -497,8 +494,9 @@ static bool execlists_check_remove_request(struct intel_engine_cs *ring,
>   *
>   * Check the unread Context Status Buffers and manage the submission of new
>   * contexts to the ELSP accordingly.
> + * @return whether a context completed
>   */
> -void intel_lrc_irq_handler(struct intel_engine_cs *ring)
> +bool intel_lrc_irq_handler(struct intel_engine_cs *ring)
>  {
>  	struct drm_i915_private *dev_priv = ring->dev->dev_private;
>  	u32 status_pointer;
> @@ -558,6 +556,8 @@ void intel_lrc_irq_handler(struct intel_engine_cs *ring)
>  		   _MASKED_FIELD(GEN8_CSB_PTR_MASK << 8,
>  				 ((u32)ring->next_context_status_buffer &
>  				  GEN8_CSB_PTR_MASK) << 8));
> +
> +	return (submit_contexts != 0);
>  }
>  
>  static int execlists_context_queue(struct drm_i915_gem_request *request)
> @@ -569,8 +569,6 @@ static int execlists_context_queue(struct drm_i915_gem_request *request)
>  	if (request->ctx != ring->default_context)
>  		intel_lr_context_pin(request);
>  
> -	i915_gem_request_reference(request);
> -
>  	spin_lock_irq(&ring->execlist_lock);
>  
>  	list_for_each_entry(cursor, &ring->execlist_queue, execlist_link)
> @@ -588,8 +586,6 @@ static int execlists_context_queue(struct drm_i915_gem_request *request)
>  			WARN(tail_req->elsp_submitted != 0,
>  				"More than 2 already-submitted reqs queued\n");
>  			list_del(&tail_req->execlist_link);
> -			list_add_tail(&tail_req->execlist_link,
> -				&ring->execlist_retired_req_list);
>  		}
>  	}
>  
> @@ -958,32 +954,6 @@ int intel_execlists_submission(struct i915_execbuffer_params *params,
>  	return 0;
>  }
>  
> -void intel_execlists_retire_requests(struct intel_engine_cs *ring)
> -{
> -	struct drm_i915_gem_request *req, *tmp;
> -	struct list_head retired_list;
> -
> -	WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex));
> -	if (list_empty(&ring->execlist_retired_req_list))
> -		return;
> -
> -	INIT_LIST_HEAD(&retired_list);
> -	spin_lock_irq(&ring->execlist_lock);
> -	list_replace_init(&ring->execlist_retired_req_list, &retired_list);
> -	spin_unlock_irq(&ring->execlist_lock);
> -
> -	list_for_each_entry_safe(req, tmp, &retired_list, execlist_link) {
> -		struct intel_context *ctx = req->ctx;
> -		struct drm_i915_gem_object *ctx_obj =
> -				ctx->engine[ring->id].state;
> -
> -		if (ctx_obj && (ctx != ring->default_context))
> -			intel_lr_context_unpin(req);
> -		list_del(&req->execlist_link);
> -		i915_gem_request_unreference(req);
> -	}
> -}
> -
>  void intel_logical_ring_stop(struct intel_engine_cs *ring)
>  {
>  	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> @@ -1938,7 +1908,6 @@ static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *rin
>  	init_waitqueue_head(&ring->irq_queue);
>  
>  	INIT_LIST_HEAD(&ring->execlist_queue);
> -	INIT_LIST_HEAD(&ring->execlist_retired_req_list);
>  	spin_lock_init(&ring->execlist_lock);
>  
>  	ret = i915_cmd_parser_init_ring(ring);
> diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
> index 4e60d54..e6a4900 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.h
> +++ b/drivers/gpu/drm/i915/intel_lrc.h
> @@ -95,7 +95,7 @@ int intel_execlists_submission(struct i915_execbuffer_params *params,
>  			       struct list_head *vmas);
>  u32 intel_execlists_ctx_id(struct drm_i915_gem_object *ctx_obj);
>  
> -void intel_lrc_irq_handler(struct intel_engine_cs *ring);
> +bool intel_lrc_irq_handler(struct intel_engine_cs *ring);
>  void intel_execlists_retire_requests(struct intel_engine_cs *ring);
>  
>  #endif /* _INTEL_LRC_H_ */
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 49fa41d..d99b167 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -264,7 +264,6 @@ struct  intel_engine_cs {
>  	/* Execlists */
>  	spinlock_t execlist_lock;
>  	struct list_head execlist_queue;
> -	struct list_head execlist_retired_req_list;
>  	u8 next_context_status_buffer;
>  	u32             irq_keep_mask; /* bitmask for interrupts that should not be masked */
>  	int		(*emit_request)(struct drm_i915_gem_request *request);
> -- 
> 1.9.1
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch