[Intel-gfx] [PATCH] drm/i915/preemption: Allow preemption between submission ports

Fri Feb 23 14:06:06 UTC 2018

Chris Wilson <chris at chris-wilson.co.uk> writes:

> Sometimes we need to boost the priority of an in-flight request, which
> may lead to the situation where the second submission port then contains
> a higher priority context than the first and so we need to inject a
> preemption event. To do so we must always check inside
> execlists_dequeue() whether there is a priority inversion between the
> ports themselves as well as the head of the priority sorted queue, and we
> cannot just skip dequeuing if the queue is empty.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Michał Winiarski <michal.winiarski at intel.com>
> Cc: Michel Thierry <michel.thierry at intel.com>
> Cc: Mika Kuoppala <mika.kuoppala at linux.intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
> Rebase for conflicts
> -Chris
> ---
>  drivers/gpu/drm/i915/intel_engine_cs.c      |   2 +
>  drivers/gpu/drm/i915/intel_guc_submission.c |  17 +--
>  drivers/gpu/drm/i915/intel_lrc.c            | 161 ++++++++++++++++------------
>  drivers/gpu/drm/i915/intel_ringbuffer.h     |   5 +
>  4 files changed, 107 insertions(+), 78 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
> index c31544406974..ce7fcf55ba18 100644
> --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> @@ -423,6 +423,7 @@ static void intel_engine_init_execlist(struct intel_engine_cs *engine)
>  	BUILD_BUG_ON_NOT_POWER_OF_2(execlists_num_ports(execlists));
>  	GEM_BUG_ON(execlists_num_ports(execlists) > EXECLIST_MAX_PORTS);
>  
> +	execlists->queue_priority = INT_MIN;
>  	execlists->queue = RB_ROOT;
>  	execlists->first = NULL;
>  }
> @@ -1903,6 +1904,7 @@ void intel_engine_dump(struct intel_engine_cs *engine,
>  	spin_lock_irq(&engine->timeline->lock);
>  	list_for_each_entry(rq, &engine->timeline->requests, link)
>  		print_request(m, rq, "\t\tE ");
> +	drm_printf(m, "\t\tQueue priority: %d\n", execlists->queue_priority);
>  	for (rb = execlists->first; rb; rb = rb_next(rb)) {
>  		struct i915_priolist *p =
>  			rb_entry(rb, typeof(*p), node);
> diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
> index 649113c7a3c2..586dde579903 100644
> --- a/drivers/gpu/drm/i915/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/intel_guc_submission.c
> @@ -75,6 +75,11 @@
>   *
>   */
>  
> +static inline struct i915_priolist *to_priolist(struct rb_node *rb)
> +{
> +	return rb_entry(rb, struct i915_priolist, node);
> +}
> +
>  static inline bool is_high_priority(struct intel_guc_client *client)
>  {
>  	return (client->priority == GUC_CLIENT_PRIORITY_KMD_HIGH ||
> @@ -682,15 +687,12 @@ static void guc_dequeue(struct intel_engine_cs *engine)
>  	rb = execlists->first;
>  	GEM_BUG_ON(rb_first(&execlists->queue) != rb);
>  
> -	if (!rb)
> -		goto unlock;
> -
>  	if (port_isset(port)) {
>  		if (engine->i915->preempt_context) {
>  			struct guc_preempt_work *preempt_work =
>  				&engine->i915->guc.preempt_work[engine->id];
>  
> -			if (rb_entry(rb, struct i915_priolist, node)->priority >
> +			if (execlists->queue_priority >
>  			    max(port_request(port)->priotree.priority, 0)) {
>  				execlists_set_active(execlists,
>  						     EXECLISTS_ACTIVE_PREEMPT);

This is the priority inversion part? We preempt and clear the ports
to rearrange if the last port has a higher priority request?

> @@ -706,8 +708,8 @@ static void guc_dequeue(struct intel_engine_cs *engine)
>  	}
>  	GEM_BUG_ON(port_isset(port));
>  
> -	do {
> -		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
> +	while (rb) {
> +		struct i915_priolist *p = to_priolist(rb);
>  		struct i915_request *rq, *rn;
>  
>  		list_for_each_entry_safe(rq, rn, &p->requests, priotree.link) {
> @@ -736,8 +738,9 @@ static void guc_dequeue(struct intel_engine_cs *engine)
>  		INIT_LIST_HEAD(&p->requests);
>  		if (p->priority != I915_PRIORITY_NORMAL)
>  			kmem_cache_free(engine->i915->priorities, p);
> -	} while (rb);
> +	}
>  done:
> +	execlists->queue_priority = rb ? to_priolist(rb)->priority : INT_MIN;
>  	execlists->first = rb;
>  	if (submit) {
>  		port_assign(port, last);
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 964885b5d7cb..4bc72fbaf793 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -169,6 +169,23 @@ static void execlists_init_reg_state(u32 *reg_state,
>  				     struct intel_engine_cs *engine,
>  				     struct intel_ring *ring);
>  
> +static inline struct i915_priolist *to_priolist(struct rb_node *rb)
> +{
> +	return rb_entry(rb, struct i915_priolist, node);
> +}
> +
> +static inline int rq_prio(const struct i915_request *rq)
> +{
> +	return rq->priotree.priority;
> +}
> +
> +static inline bool need_preempt(const struct intel_engine_cs *engine,
> +                                const struct i915_request *last,
> +                                int prio)
> +{
> +	return engine->i915->preempt_context && prio > max(rq_prio(last), 0);
> +}
> +
>  /**
>   * intel_lr_context_descriptor_update() - calculate & cache the descriptor
>   * 					  descriptor for a pinned context
> @@ -224,7 +241,7 @@ lookup_priolist(struct intel_engine_cs *engine,
>  	parent = &execlists->queue.rb_node;
>  	while (*parent) {
>  		rb = *parent;
> -		p = rb_entry(rb, typeof(*p), node);
> +		p = to_priolist(rb);
>  		if (prio > p->priority) {
>  			parent = &rb->rb_left;
>  		} else if (prio < p->priority) {
> @@ -264,7 +281,7 @@ lookup_priolist(struct intel_engine_cs *engine,
>  	if (first)
>  		execlists->first = &p->node;
>  
> -	return ptr_pack_bits(p, first, 1);
> +	return p;

Hmm there is no need for decode first as we always resubmit
the queue depending on the prio?

>  }
>  
>  static void unwind_wa_tail(struct i915_request *rq)
> @@ -290,14 +307,10 @@ static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
>  		__i915_request_unsubmit(rq);
>  		unwind_wa_tail(rq);
>  
> -		GEM_BUG_ON(rq->priotree.priority == I915_PRIORITY_INVALID);
> -		if (rq->priotree.priority != last_prio) {
> -			p = lookup_priolist(engine,
> -					    &rq->priotree,
> -					    rq->priotree.priority);
> -			p = ptr_mask_bits(p, 1);
> -
> -			last_prio = rq->priotree.priority;
> +		GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
> +		if (rq_prio(rq) != last_prio) {
> +			last_prio = rq_prio(rq);
> +			p = lookup_priolist(engine, &rq->priotree, last_prio);
>  		}
>  
>  		list_add(&rq->priotree.link, &p->requests);
> @@ -397,10 +410,11 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
>  			desc = execlists_update_context(rq);
>  			GEM_DEBUG_EXEC(port[n].context_id = upper_32_bits(desc));
>  
> -			GEM_TRACE("%s in[%d]:  ctx=%d.%d, seqno=%x\n",
> +			GEM_TRACE("%s in[%d]:  ctx=%d.%d, seqno=%x, prio=%d\n",
>  				  engine->name, n,
>  				  port[n].context_id, count,
> -				  rq->global_seqno);
> +				  rq->global_seqno,
> +				  rq_prio(rq));
>  		} else {
>  			GEM_BUG_ON(!n);
>  			desc = 0;
> @@ -453,12 +467,17 @@ static void inject_preempt_context(struct intel_engine_cs *engine)
>  		   _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
>  				      CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT));
>  
> +	/*
> +	 * Switch to our empty preempt context so
> +	 * the state of the GPU is known (idle).
> +	 */
>  	GEM_TRACE("%s\n", engine->name);
>  	for (n = execlists_num_ports(&engine->execlists); --n; )
>  		elsp_write(0, engine->execlists.elsp);
>  
>  	elsp_write(ce->lrc_desc, engine->execlists.elsp);
>  	execlists_clear_active(&engine->execlists, EXECLISTS_ACTIVE_HWACK);
> +	execlists_set_active(&engine->execlists, EXECLISTS_ACTIVE_PREEMPT);

Surely a better place. Now looking at this would it be more prudent to
move both the clear and set right before the last elsp write?

Well I guess it really doesn't matter as we hold the timeline lock.

-Mika

>  }
>  
>  static void execlists_dequeue(struct intel_engine_cs *engine)
> @@ -495,8 +514,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  	spin_lock_irq(&engine->timeline->lock);
>  	rb = execlists->first;
>  	GEM_BUG_ON(rb_first(&execlists->queue) != rb);
> -	if (!rb)
> -		goto unlock;
>  
>  	if (last) {
>  		/*
> @@ -519,54 +536,48 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  		if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_HWACK))
>  			goto unlock;
>  
> -		if (engine->i915->preempt_context &&
> -		    rb_entry(rb, struct i915_priolist, node)->priority >
> -		    max(last->priotree.priority, 0)) {
> -			/*
> -			 * Switch to our empty preempt context so
> -			 * the state of the GPU is known (idle).
> -			 */
> +		if (need_preempt(engine, last, execlists->queue_priority)) {
>  			inject_preempt_context(engine);
> -			execlists_set_active(execlists,
> -					     EXECLISTS_ACTIVE_PREEMPT);
>  			goto unlock;
> -		} else {
> -			/*
> -			 * In theory, we could coalesce more requests onto
> -			 * the second port (the first port is active, with
> -			 * no preemptions pending). However, that means we
> -			 * then have to deal with the possible lite-restore
> -			 * of the second port (as we submit the ELSP, there
> -			 * may be a context-switch) but also we may complete
> -			 * the resubmission before the context-switch. Ergo,
> -			 * coalescing onto the second port will cause a
> -			 * preemption event, but we cannot predict whether
> -			 * that will affect port[0] or port[1].
> -			 *
> -			 * If the second port is already active, we can wait
> -			 * until the next context-switch before contemplating
> -			 * new requests. The GPU will be busy and we should be
> -			 * able to resubmit the new ELSP before it idles,
> -			 * avoiding pipeline bubbles (momentary pauses where
> -			 * the driver is unable to keep up the supply of new
> -			 * work).
> -			 */
> -			if (port_count(&port[1]))
> -				goto unlock;
> -
> -			/* WaIdleLiteRestore:bdw,skl
> -			 * Apply the wa NOOPs to prevent
> -			 * ring:HEAD == rq:TAIL as we resubmit the
> -			 * request. See gen8_emit_breadcrumb() for
> -			 * where we prepare the padding after the
> -			 * end of the request.
> -			 */
> -			last->tail = last->wa_tail;
>  		}
> +
> +		/*
> +		 * In theory, we could coalesce more requests onto
> +		 * the second port (the first port is active, with
> +		 * no preemptions pending). However, that means we
> +		 * then have to deal with the possible lite-restore
> +		 * of the second port (as we submit the ELSP, there
> +		 * may be a context-switch) but also we may complete
> +		 * the resubmission before the context-switch. Ergo,
> +		 * coalescing onto the second port will cause a
> +		 * preemption event, but we cannot predict whether
> +		 * that will affect port[0] or port[1].
> +		 *
> +		 * If the second port is already active, we can wait
> +		 * until the next context-switch before contemplating
> +		 * new requests. The GPU will be busy and we should be
> +		 * able to resubmit the new ELSP before it idles,
> +		 * avoiding pipeline bubbles (momentary pauses where
> +		 * the driver is unable to keep up the supply of new
> +		 * work). However, we have to double check that the
> +		 * priorities of the ports haven't been switch.
> +		 */
> +		if (port_count(&port[1]))
> +			goto unlock;
> +
> +		/*
> +		 * WaIdleLiteRestore:bdw,skl
> +		 * Apply the wa NOOPs to prevent
> +		 * ring:HEAD == rq:TAIL as we resubmit the
> +		 * request. See gen8_emit_breadcrumb() for
> +		 * where we prepare the padding after the
> +		 * end of the request.
> +		 */
> +		last->tail = last->wa_tail;
>  	}
>  
> -	do {
> -		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
> +	while (rb) {
> +		struct i915_priolist *p = to_priolist(rb);
>  		struct i915_request *rq, *rn;
>  
>  		list_for_each_entry_safe(rq, rn, &p->requests, priotree.link) {
> @@ -628,8 +639,9 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  		INIT_LIST_HEAD(&p->requests);
>  		if (p->priority != I915_PRIORITY_NORMAL)
>  			kmem_cache_free(engine->i915->priorities, p);
> -	} while (rb);
> +	}
>  done:
> +	execlists->queue_priority = rb ? to_priolist(rb)->priority : INT_MIN;
>  	execlists->first = rb;
>  	if (submit)
>  		port_assign(port, last);
> @@ -690,7 +702,7 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
>  	/* Flush the queued requests to the timeline list (for retiring). */
>  	rb = execlists->first;
>  	while (rb) {
> -		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
> +		struct i915_priolist *p = to_priolist(rb);
>  
>  		list_for_each_entry_safe(rq, rn, &p->requests, priotree.link) {
>  			INIT_LIST_HEAD(&rq->priotree.link);
> @@ -708,7 +720,7 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
>  
>  	/* Remaining _unready_ requests will be nop'ed when submitted */
>  
> -
> +	execlists->queue_priority = INT_MIN;
>  	execlists->queue = RB_ROOT;
>  	execlists->first = NULL;
>  	GEM_BUG_ON(port_isset(execlists->port));
> @@ -864,10 +876,11 @@ static void execlists_submission_tasklet(unsigned long data)
>  							EXECLISTS_ACTIVE_USER));
>  
>  			rq = port_unpack(port, &count);
> -			GEM_TRACE("%s out[0]: ctx=%d.%d, seqno=%x\n",
> +			GEM_TRACE("%s out[0]: ctx=%d.%d, seqno=%x, prio=%d\n",
>  				  engine->name,
>  				  port->context_id, count,
> -				  rq ? rq->global_seqno : 0);
> +				  rq ? rq->global_seqno : 0,
> +				  rq ? rq_prio(rq) : 0);
>  
>  			/* Check the context/desc id for this event matches */
>  			GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id);
> @@ -912,15 +925,19 @@ static void execlists_submission_tasklet(unsigned long data)
>  		intel_uncore_forcewake_put(dev_priv, execlists->fw_domains);
>  }
>  
> -static void insert_request(struct intel_engine_cs *engine,
> -			   struct i915_priotree *pt,
> -			   int prio)
> +static void queue_request(struct intel_engine_cs *engine,
> +			  struct i915_priotree *pt,
> +			  int prio)
>  {
> -	struct i915_priolist *p = lookup_priolist(engine, pt, prio);
> +	list_add_tail(&pt->link, &lookup_priolist(engine, pt, prio)->requests);
> +}
>  
> -	list_add_tail(&pt->link, &ptr_mask_bits(p, 1)->requests);
> -	if (ptr_unmask_bits(p, 1))
> +static void submit_queue(struct intel_engine_cs *engine, int prio)
> +{
> +	if (prio > engine->execlists.queue_priority) {
> +		engine->execlists.queue_priority = prio;
>  		tasklet_hi_schedule(&engine->execlists.tasklet);
> +	}
>  }
>  
>  static void execlists_submit_request(struct i915_request *request)
> @@ -931,7 +948,8 @@ static void execlists_submit_request(struct i915_request *request)
>  	/* Will be called from irq-context when using foreign fences. */
>  	spin_lock_irqsave(&engine->timeline->lock, flags);
>  
> -	insert_request(engine, &request->priotree, request->priotree.priority);
> +	queue_request(engine, &request->priotree, rq_prio(request));
> +	submit_queue(engine, rq_prio(request));
>  
>  	GEM_BUG_ON(!engine->execlists.first);
>  	GEM_BUG_ON(list_empty(&request->priotree.link));
> @@ -987,7 +1005,7 @@ static void execlists_schedule(struct i915_request *request, int prio)
>  	 * static void update_priorities(struct i915_priotree *pt, prio) {
>  	 *	list_for_each_entry(dep, &pt->signalers_list, signal_link)
>  	 *		update_priorities(dep->signal, prio)
> -	 *	insert_request(pt);
> +	 *	queue_request(pt);
>  	 * }
>  	 * but that may have unlimited recursion depth and so runs a very
>  	 * real risk of overunning the kernel stack. Instead, we build
> @@ -1050,8 +1068,9 @@ static void execlists_schedule(struct i915_request *request, int prio)
>  		pt->priority = prio;
>  		if (!list_empty(&pt->link)) {
>  			__list_del_entry(&pt->link);
> -			insert_request(engine, pt, prio);
> +			queue_request(engine, pt, prio);
>  		}
> +		submit_queue(engine, prio);
>  	}
>  
>  	spin_unlock_irq(&engine->timeline->lock);
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index a9b83bf7e837..c4e9022b34e3 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -257,6 +257,11 @@ struct intel_engine_execlists {
>  	 */
>  	unsigned int port_mask;
>  
> +	/**
> +	 * @queue_priority: Highest pending priority.
> +	 */
> +	int queue_priority;
> +
>  	/**
>  	 * @queue: queue of requests, in priority lists
>  	 */
> -- 
> 2.16.1