[Intel-gfx] [RFC 2/5] drm/i915: Unify execlist and legacy request life-cycles

Yu Dai yu.dai at intel.com
Thu Jul 9 09:54:59 PDT 2015



On 07/09/2015 03:57 AM, Nick Hoath wrote:
> There is a desire to simplify the i915 driver by reducing the number of
> different code paths introduced by the LRC / execlists support.  As the
> execlists request is now part of the gem request it is possible and
> desirable to unify the request life-cycles for execlist and legacy
> requests.
>
> Added a context complete flag to a request which gets set during the
> context switch interrupt.
>
> Added a function i915_gem_request_retireable().  A request is considered
> retireable if its seqno passed (i.e. the request has completed) and either
> it was never submitted to the ELSP or its context completed.  This ensures
> that context save is carried out before the last request for a context is
> considered retireable.  retire_requests_ring() now uses
> i915_gem_request_retireable() rather than request_complete() when deciding
> which requests to retire. Requests that were not waiting for a context
> switch interrupt (either as a result of being merged into a following
> request or by being a legacy request) will be considered retireable as
> soon as their seqno has passed.
>
> Removed the extra request reference held for the execlist request.
>
> Removed intel_execlists_retire_requests() and all references to
> intel_engine_cs.execlist_retired_req_list.
>
> Moved context unpinning into retire_requests_ring() for now.  Further work
> is pending for the context pinning - this patch should allow us to use the
> active list to track context and ring buffer objects later.

Just a heads up on a potential performance drop on certain workloads. Since 
retire_requests_ring is called before each submission, for those CPU-bound 
workloads, you will see context pin & unpin very often. The 
ioremap_wc / iounmap for the ring buffer consumes more CPU time. I found 
this issue during the GuC implementation because GuC does not use the execlist 
request queue but the legacy one. On SKL, there is about a 3~5% performance 
drop in workloads such as SynMark2 oglbatch5/6/7.

Thanks,
Alex
> Changed gen8_cs_irq_handler() so that notify_ring() is called when
> contexts complete as well as when a user interrupt occurs so that
> notification happens when a request is complete and context save has
> finished.
>
> v2: Rebase over the read-read optimisation changes
>
> Signed-off-by: Thomas Daniel <thomas.daniel at intel.com>
> Signed-off-by: Nick Hoath <nicholas.hoath at intel.com>
> ---
>   drivers/gpu/drm/i915/i915_drv.h         |  6 ++++
>   drivers/gpu/drm/i915/i915_gem.c         | 49 +++++++++++++++++++++++----------
>   drivers/gpu/drm/i915/i915_irq.c         |  6 ++--
>   drivers/gpu/drm/i915/intel_lrc.c        | 44 ++++++-----------------------
>   drivers/gpu/drm/i915/intel_lrc.h        |  2 +-
>   drivers/gpu/drm/i915/intel_ringbuffer.h |  1 -
>   6 files changed, 53 insertions(+), 55 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 1dbd957..ef02378 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2213,6 +2213,12 @@ struct drm_i915_gem_request {
>   	/** Execlists no. of times this request has been sent to the ELSP */
>   	int elsp_submitted;
>   
> +	/**
> +	 * Execlists: whether this requests's context has completed after
> +	 * submission to the ELSP
> +	 */
> +	bool ctx_complete;
> +
>   };
>   
>   int i915_gem_request_alloc(struct intel_engine_cs *ring,
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 49016e0..3681a33 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2368,6 +2368,12 @@ void i915_vma_move_to_active(struct i915_vma *vma,
>   	list_move_tail(&vma->mm_list, &vma->vm->active_list);
>   }
>   
> +static bool i915_gem_request_retireable(struct drm_i915_gem_request *req)
> +{
> +	return (i915_gem_request_completed(req, true) &&
> +		(!req->elsp_submitted || req->ctx_complete));
> +}
> +
>   static void
>   i915_gem_object_retire__write(struct drm_i915_gem_object *obj)
>   {
> @@ -2853,10 +2859,28 @@ i915_gem_retire_requests_ring(struct intel_engine_cs *ring)
>   					   struct drm_i915_gem_request,
>   					   list);
>   
> -		if (!i915_gem_request_completed(request, true))
> +		if (!i915_gem_request_retireable(request))
>   			break;
>   
>   		i915_gem_request_retire(request);
> +
> +		if (i915.enable_execlists) {
> +			struct intel_context *ctx = request->ctx;
> +			struct drm_i915_private *dev_priv =
> +				ring->dev->dev_private;
> +			unsigned long flags;
> +			struct drm_i915_gem_object *ctx_obj =
> +				ctx->engine[ring->id].state;
> +
> +			spin_lock_irqsave(&ring->execlist_lock, flags);
> +
> +			if (ctx_obj && (ctx != ring->default_context))
> +				intel_lr_context_unpin(ring, ctx);
> +
> +			intel_runtime_pm_put(dev_priv);
> +			spin_unlock_irqrestore(&ring->execlist_lock, flags);
> +		}
> +
>   	}
>   
>   	/* Move any buffers on the active list that are no longer referenced
> @@ -2872,12 +2896,14 @@ i915_gem_retire_requests_ring(struct intel_engine_cs *ring)
>   
>   		if (!list_empty(&obj->last_read_req[ring->id]->list))
>   			break;
> +		if (!i915_gem_request_retireable(obj->last_read_req[ring->id]))
> +			break;
>   
>   		i915_gem_object_retire__read(obj, ring->id);
>   	}
>   
>   	if (unlikely(ring->trace_irq_req &&
> -		     i915_gem_request_completed(ring->trace_irq_req, true))) {
> +		     i915_gem_request_retireable(ring->trace_irq_req))) {
>   		ring->irq_put(ring);
>   		i915_gem_request_assign(&ring->trace_irq_req, NULL);
>   	}
> @@ -2896,15 +2922,6 @@ i915_gem_retire_requests(struct drm_device *dev)
>   	for_each_ring(ring, dev_priv, i) {
>   		i915_gem_retire_requests_ring(ring);
>   		idle &= list_empty(&ring->request_list);
> -		if (i915.enable_execlists) {
> -			unsigned long flags;
> -
> -			spin_lock_irqsave(&ring->execlist_lock, flags);
> -			idle &= list_empty(&ring->execlist_queue);
> -			spin_unlock_irqrestore(&ring->execlist_lock, flags);
> -
> -			intel_execlists_retire_requests(ring);
> -		}
>   	}
>   
>   	if (idle)
> @@ -2980,12 +2997,14 @@ i915_gem_object_flush_active(struct drm_i915_gem_object *obj)
>   		if (req == NULL)
>   			continue;
>   
> -		if (list_empty(&req->list))
> -			goto retire;
> +		if (list_empty(&req->list)) {
> +			if (i915_gem_request_retireable(req))
> +				i915_gem_object_retire__read(obj, i);
> +			continue;
> +		}
>   
> -		if (i915_gem_request_completed(req, true)) {
> +		if (i915_gem_request_retireable(req)) {
>   			__i915_gem_request_retire__upto(req);
> -retire:
>   			i915_gem_object_retire__read(obj, i);
>   		}
>   	}
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 3ac30b8..dfa2379 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1158,9 +1158,11 @@ static void snb_gt_irq_handler(struct drm_device *dev,
>   
>   static void gen8_cs_irq_handler(struct intel_engine_cs *ring, u32 iir)
>   {
> +	bool need_notify = false;
> +
>   	if (iir & GT_CONTEXT_SWITCH_INTERRUPT)
> -		intel_lrc_irq_handler(ring);
> -	if (iir & GT_RENDER_USER_INTERRUPT)
> +		need_notify = intel_lrc_irq_handler(ring);
> +	if ((iir & GT_RENDER_USER_INTERRUPT) || need_notify)
>   		notify_ring(ring);
>   }
>   
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 9b928ab..8373900 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -411,9 +411,8 @@ static void execlists_context_unqueue(struct intel_engine_cs *ring)
>   			/* Same ctx: ignore first request, as second request
>   			 * will update tail past first request's workload */
>   			cursor->elsp_submitted = req0->elsp_submitted;
> +			req0->elsp_submitted = 0;
>   			list_del(&req0->execlist_link);
> -			list_add_tail(&req0->execlist_link,
> -				&ring->execlist_retired_req_list);
>   			req0 = cursor;
>   		} else {
>   			req1 = cursor;
> @@ -450,6 +449,7 @@ static void execlists_context_unqueue(struct intel_engine_cs *ring)
>   	req0->elsp_submitted++;
>   	if (req1)
>   		req1->elsp_submitted++;
> +
>   }
>   
>   static bool execlists_check_remove_request(struct intel_engine_cs *ring,
> @@ -469,11 +469,9 @@ static bool execlists_check_remove_request(struct intel_engine_cs *ring,
>   		if (intel_execlists_ctx_id(ctx_obj) == request_id) {
>   			WARN(head_req->elsp_submitted == 0,
>   			     "Never submitted head request\n");
> -
>   			if (--head_req->elsp_submitted <= 0) {
> +				head_req->ctx_complete = 1;
>   				list_del(&head_req->execlist_link);
> -				list_add_tail(&head_req->execlist_link,
> -					&ring->execlist_retired_req_list);
>   				return true;
>   			}
>   		}
> @@ -488,8 +486,9 @@ static bool execlists_check_remove_request(struct intel_engine_cs *ring,
>    *
>    * Check the unread Context Status Buffers and manage the submission of new
>    * contexts to the ELSP accordingly.
> + * @return whether a context completed
>    */
> -void intel_lrc_irq_handler(struct intel_engine_cs *ring)
> +bool intel_lrc_irq_handler(struct intel_engine_cs *ring)
>   {
>   	struct drm_i915_private *dev_priv = ring->dev->dev_private;
>   	u32 status_pointer;
> @@ -540,6 +539,8 @@ void intel_lrc_irq_handler(struct intel_engine_cs *ring)
>   
>   	I915_WRITE(RING_CONTEXT_STATUS_PTR(ring),
>   		   ((u32)ring->next_context_status_buffer & 0x07) << 8);
> +
> +	return (submit_contexts != 0);
>   }
>   
>   static int execlists_context_queue(struct drm_i915_gem_request *request)
> @@ -551,7 +552,7 @@ static int execlists_context_queue(struct drm_i915_gem_request *request)
>   	if (request->ctx != ring->default_context)
>   		intel_lr_context_pin(ring, request->ctx);
>   
> -	i915_gem_request_reference(request);
> +	i915_gem_context_reference(request->ctx);
>   
>   	request->tail = request->ringbuf->tail;
>   
> @@ -572,8 +573,6 @@ static int execlists_context_queue(struct drm_i915_gem_request *request)
>   			WARN(tail_req->elsp_submitted != 0,
>   				"More than 2 already-submitted reqs queued\n");
>   			list_del(&tail_req->execlist_link);
> -			list_add_tail(&tail_req->execlist_link,
> -				&ring->execlist_retired_req_list);
>   		}
>   	}
>   
> @@ -938,32 +937,6 @@ int intel_execlists_submission(struct i915_execbuffer_params *params,
>   	return 0;
>   }
>   
> -void intel_execlists_retire_requests(struct intel_engine_cs *ring)
> -{
> -	struct drm_i915_gem_request *req, *tmp;
> -	struct list_head retired_list;
> -
> -	WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex));
> -	if (list_empty(&ring->execlist_retired_req_list))
> -		return;
> -
> -	INIT_LIST_HEAD(&retired_list);
> -	spin_lock_irq(&ring->execlist_lock);
> -	list_replace_init(&ring->execlist_retired_req_list, &retired_list);
> -	spin_unlock_irq(&ring->execlist_lock);
> -
> -	list_for_each_entry_safe(req, tmp, &retired_list, execlist_link) {
> -		struct intel_context *ctx = req->ctx;
> -		struct drm_i915_gem_object *ctx_obj =
> -				ctx->engine[ring->id].state;
> -
> -		if (ctx_obj && (ctx != ring->default_context))
> -			intel_lr_context_unpin(ring, ctx);
> -		list_del(&req->execlist_link);
> -		i915_gem_request_unreference(req);
> -	}
> -}
> -
>   void intel_logical_ring_stop(struct intel_engine_cs *ring)
>   {
>   	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> @@ -1706,7 +1679,6 @@ static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *rin
>   	init_waitqueue_head(&ring->irq_queue);
>   
>   	INIT_LIST_HEAD(&ring->execlist_queue);
> -	INIT_LIST_HEAD(&ring->execlist_retired_req_list);
>   	spin_lock_init(&ring->execlist_lock);
>   
>   	ret = i915_cmd_parser_init_ring(ring);
> diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
> index f59940a..9d2e98a 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.h
> +++ b/drivers/gpu/drm/i915/intel_lrc.h
> @@ -82,7 +82,7 @@ int intel_execlists_submission(struct i915_execbuffer_params *params,
>   			       struct list_head *vmas);
>   u32 intel_execlists_ctx_id(struct drm_i915_gem_object *ctx_obj);
>   
> -void intel_lrc_irq_handler(struct intel_engine_cs *ring);
> +bool intel_lrc_irq_handler(struct intel_engine_cs *ring);
>   void intel_execlists_retire_requests(struct intel_engine_cs *ring);
>   
>   #endif /* _INTEL_LRC_H_ */
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 304cac4..73b0bd8 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -263,7 +263,6 @@ struct  intel_engine_cs {
>   	/* Execlists */
>   	spinlock_t execlist_lock;
>   	struct list_head execlist_queue;
> -	struct list_head execlist_retired_req_list;
>   	u8 next_context_status_buffer;
>   	u32             irq_keep_mask; /* bitmask for interrupts that should not be masked */
>   	int		(*emit_request)(struct drm_i915_gem_request *request);



More information about the Intel-gfx mailing list