[Intel-gfx] [PATCH 13/27] drm/i915/execlists: Pack the count into the low bits of the port.request
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Thu Apr 20 14:58:19 UTC 2017
On 19/04/2017 10:41, Chris Wilson wrote:
> add/remove: 1/1 grow/shrink: 5/4 up/down: 391/-578 (-187)
> function                                     old     new   delta
> execlists_submit_ports                       262     471    +209
> port_assign.isra                               -     136    +136
> capture                                     6344    6359     +15
> reset_common_ring                            438     452     +14
> execlists_submit_request                     228     238     +10
> gen8_init_common_ring                        334     341      +7
> intel_engine_is_idle                         106     105      -1
> i915_engine_info                            2314    2290     -24
> __i915_gem_set_wedged_BKL                    485     411     -74
> intel_lrc_irq_handler                       1789    1604    -185
> execlists_update_context                     294       -    -294
>
> The most important change there is the improvement to
> intel_lrc_irq_handler and execlists_submit_ports (a net improvement
> since execlists_update_context is now inlined).
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala at intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
> drivers/gpu/drm/i915/i915_debugfs.c        |  32 ++++---
> drivers/gpu/drm/i915/i915_gem.c            |   6 +-
> drivers/gpu/drm/i915/i915_gpu_error.c      |  13 ++-
> drivers/gpu/drm/i915/i915_guc_submission.c |  18 ++--
> drivers/gpu/drm/i915/intel_engine_cs.c     |   2 +-
> drivers/gpu/drm/i915/intel_lrc.c           | 133 ++++++++++++++++-------------
> drivers/gpu/drm/i915/intel_ringbuffer.h    |   8 +-
> 7 files changed, 120 insertions(+), 92 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 870c470177b5..0b5d7142d8d9 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -3315,6 +3315,7 @@ static int i915_engine_info(struct seq_file *m, void *unused)
> if (i915.enable_execlists) {
> u32 ptr, read, write;
> struct rb_node *rb;
> + unsigned int idx;
>
> seq_printf(m, "\tExeclist status: 0x%08x %08x\n",
> I915_READ(RING_EXECLIST_STATUS_LO(engine)),
> @@ -3332,8 +3333,7 @@ static int i915_engine_info(struct seq_file *m, void *unused)
> if (read > write)
> write += GEN8_CSB_ENTRIES;
> while (read < write) {
> - unsigned int idx = ++read % GEN8_CSB_ENTRIES;
> -
> + idx = ++read % GEN8_CSB_ENTRIES;
> seq_printf(m, "\tExeclist CSB[%d]: 0x%08x, context: %d\n",
> idx,
> I915_READ(RING_CONTEXT_STATUS_BUF_LO(engine, idx)),
> @@ -3341,21 +3341,19 @@ static int i915_engine_info(struct seq_file *m, void *unused)
> }
>
> rcu_read_lock();
> - rq = READ_ONCE(engine->execlist_port[0].request);
> - if (rq) {
> - seq_printf(m, "\t\tELSP[0] count=%d, ",
> - engine->execlist_port[0].count);
> - print_request(m, rq, "rq: ");
> - } else {
> - seq_printf(m, "\t\tELSP[0] idle\n");
> - }
> - rq = READ_ONCE(engine->execlist_port[1].request);
> - if (rq) {
> - seq_printf(m, "\t\tELSP[1] count=%d, ",
> - engine->execlist_port[1].count);
> - print_request(m, rq, "rq: ");
> - } else {
> - seq_printf(m, "\t\tELSP[1] idle\n");
> + for (idx = 0; idx < ARRAY_SIZE(engine->execlist_port); idx++) {
> + unsigned int count;
> +
> + rq = port_unpack(&engine->execlist_port[idx],
> + &count);
> + if (rq) {
> + seq_printf(m, "\t\tELSP[%d] count=%d, ",
> + idx, count);
> + print_request(m, rq, "rq: ");
> + } else {
> + seq_printf(m, "\t\tELSP[%d] idle\n",
> + idx);
> + }
> }
> rcu_read_unlock();
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 2bc72314cdd1..f6df402a5247 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3039,12 +3039,14 @@ static void engine_set_wedged(struct intel_engine_cs *engine)
> */
>
> if (i915.enable_execlists) {
> + struct execlist_port *port = engine->execlist_port;
> unsigned long flags;
> + unsigned int n;
>
> spin_lock_irqsave(&engine->timeline->lock, flags);
>
> - i915_gem_request_put(engine->execlist_port[0].request);
> - i915_gem_request_put(engine->execlist_port[1].request);
> + for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++)
> + i915_gem_request_put(port_request(&port[n]));
> memset(engine->execlist_port, 0, sizeof(engine->execlist_port));
> engine->execlist_queue = RB_ROOT;
> engine->execlist_first = NULL;
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 4b247b050dcd..c5cdc6611d7f 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -1324,12 +1324,17 @@ static void engine_record_requests(struct intel_engine_cs *engine,
> static void error_record_engine_execlists(struct intel_engine_cs *engine,
> struct drm_i915_error_engine *ee)
> {
> + const struct execlist_port *port = engine->execlist_port;
> unsigned int n;
>
> - for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++)
> - if (engine->execlist_port[n].request)
> - record_request(engine->execlist_port[n].request,
> - &ee->execlist[n]);
> + for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++) {
> + struct drm_i915_gem_request *rq = port_request(&port[n]);
> +
> + if (!rq)
> + break;
> +
> + record_request(rq, &ee->execlist[n]);
> + }
> }
>
> static void record_context(struct drm_i915_error_context *e,
> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> index 1642fff9cf13..370373c97b81 100644
> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> @@ -658,7 +658,7 @@ static void nested_enable_signaling(struct drm_i915_gem_request *rq)
> static bool i915_guc_dequeue(struct intel_engine_cs *engine)
> {
> struct execlist_port *port = engine->execlist_port;
> - struct drm_i915_gem_request *last = port[0].request;
> + struct drm_i915_gem_request *last = port[0].request_count;
It's confusing that in this new scheme we sometimes access the request
directly and sometimes have to go through the port_request macro. So
maybe we should always use the port_request macro. Hm, could we invent
a new type to help enforce that? Like:
struct drm_i915_gem_port_request_slot {
        struct drm_i915_gem_request *req_count;
};
And then the execlist port array would contain these, and the helpers
would need to become functions?
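Roughly like this, purely as an illustration (names and helpers are
mine and not compile-tested; the bit ops are open-coded here only to
show the intent, the real thing would presumably keep using the
ptr_*_bits helpers):

struct drm_i915_gem_port_request_slot {
        /* Low EXECLIST_COUNT_BITS bits carry the submission count. */
        unsigned long req_count;
};

static inline struct drm_i915_gem_request *
port_slot_request(const struct drm_i915_gem_port_request_slot *slot)
{
        return (struct drm_i915_gem_request *)
                (slot->req_count & ~((1ul << EXECLIST_COUNT_BITS) - 1));
}

static inline unsigned int
port_slot_count(const struct drm_i915_gem_port_request_slot *slot)
{
        return slot->req_count & ((1ul << EXECLIST_COUNT_BITS) - 1);
}

static inline void
port_slot_set(struct drm_i915_gem_port_request_slot *slot,
              struct drm_i915_gem_request *rq,
              unsigned int count)
{
        /* The pointer must leave the low bits free for the count. */
        GEM_BUG_ON((unsigned long)rq & ((1ul << EXECLIST_COUNT_BITS) - 1));
        GEM_BUG_ON(count >= (1u << EXECLIST_COUNT_BITS));
        slot->req_count = (unsigned long)rq | count;
}

That way nothing outside the helpers can dereference the slot by
accident and the compiler forces everything through the accessors.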
I've also noticed some GVT/GuC patches which sound like they add the
same single-submission constraints, so maybe now is the time to unify
the dequeue? (I haven't looked at those patches deeper than the
subject line, so I might be wrong.)
Not 100% sure about all of the above, I would need to sketch it out.
What are your thoughts?
> struct rb_node *rb;
> bool submit = false;
>
> @@ -672,7 +672,7 @@ static bool i915_guc_dequeue(struct intel_engine_cs *engine)
> if (port != engine->execlist_port)
> break;
>
> - i915_gem_request_assign(&port->request, last);
> + i915_gem_request_assign(&port->request_count, last);
> nested_enable_signaling(last);
> port++;
> }
> @@ -688,7 +688,7 @@ static bool i915_guc_dequeue(struct intel_engine_cs *engine)
> submit = true;
> }
> if (submit) {
> - i915_gem_request_assign(&port->request, last);
> + i915_gem_request_assign(&port->request_count, last);
> nested_enable_signaling(last);
> engine->execlist_first = rb;
> }
> @@ -705,17 +705,19 @@ static void i915_guc_irq_handler(unsigned long data)
> bool submit;
>
> do {
> - rq = port[0].request;
> + rq = port[0].request_count;
> while (rq && i915_gem_request_completed(rq)) {
> trace_i915_gem_request_out(rq);
> i915_gem_request_put(rq);
> - port[0].request = port[1].request;
> - port[1].request = NULL;
> - rq = port[0].request;
> +
> + port[0].request_count = port[1].request_count;
> + port[1].request_count = NULL;
> +
> + rq = port[0].request_count;
> }
>
> submit = false;
> - if (!port[1].request)
> + if (!port[1].request_count)
> submit = i915_guc_dequeue(engine);
> } while (submit);
> }
> diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
> index 402769d9d840..10027d0a09b5 100644
> --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> @@ -1148,7 +1148,7 @@ bool intel_engine_is_idle(struct intel_engine_cs *engine)
> return false;
>
> /* Both ports drained, no more ELSP submission? */
> - if (engine->execlist_port[0].request)
> + if (port_request(&engine->execlist_port[0]))
> return false;
>
> /* Ring stopped? */
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 7df278fe492e..69299fbab4f9 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -342,39 +342,32 @@ static u64 execlists_update_context(struct drm_i915_gem_request *rq)
>
> static void execlists_submit_ports(struct intel_engine_cs *engine)
> {
> - struct drm_i915_private *dev_priv = engine->i915;
> struct execlist_port *port = engine->execlist_port;
> u32 __iomem *elsp =
> - dev_priv->regs + i915_mmio_reg_offset(RING_ELSP(engine));
> - u64 desc[2];
> -
> - GEM_BUG_ON(port[0].count > 1);
> - if (!port[0].count)
> - execlists_context_status_change(port[0].request,
> - INTEL_CONTEXT_SCHEDULE_IN);
> - desc[0] = execlists_update_context(port[0].request);
> - GEM_DEBUG_EXEC(port[0].context_id = upper_32_bits(desc[0]));
> - port[0].count++;
> -
> - if (port[1].request) {
> - GEM_BUG_ON(port[1].count);
> - execlists_context_status_change(port[1].request,
> - INTEL_CONTEXT_SCHEDULE_IN);
> - desc[1] = execlists_update_context(port[1].request);
> - GEM_DEBUG_EXEC(port[1].context_id = upper_32_bits(desc[1]));
> - port[1].count = 1;
> - } else {
> - desc[1] = 0;
> - }
> - GEM_BUG_ON(desc[0] == desc[1]);
> -
> - /* You must always write both descriptors in the order below. */
> - writel(upper_32_bits(desc[1]), elsp);
> - writel(lower_32_bits(desc[1]), elsp);
> + engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
> + unsigned int n;
> +
> + for (n = ARRAY_SIZE(engine->execlist_port); n--; ) {
We could also add for_each_req_port or something, to iterate the ports
and unpack either just the request or the count as well?
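Something along these lines perhaps (untested, the exact shape is just
illustrative), reusing port_unpack from this patch:

#define for_each_req_port(port__, rq__, count__, engine__) \
        for ((port__) = (engine__)->execlist_port; \
             (port__) < (engine__)->execlist_port + \
                        ARRAY_SIZE((engine__)->execlist_port) && \
             (((rq__) = port_unpack((port__), &(count__))), 1); \
             (port__)++)

with callers doing something like:

        struct execlist_port *port;
        struct drm_i915_gem_request *rq;
        unsigned int count;

        for_each_req_port(port, rq, count, engine) {
                if (!rq)
                        break;
                /* use rq and count for this port */
        }

The debugfs dump would keep walking empty ports to print "idle", the
other users could just break on the first empty port as above.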
Preliminary pass only before the mtg. :)
Regards,
Tvrtko
> + struct drm_i915_gem_request *rq;
> + unsigned int count;
> + u64 desc;
> +
> + rq = port_unpack(&port[n], &count);
> + if (rq) {
> + GEM_BUG_ON(count > !n);
> + if (!count++)
> + execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
> + port[n].request_count = port_pack(rq, count);
> + desc = execlists_update_context(rq);
> + GEM_DEBUG_EXEC(port[n].context_id = upper_32_bits(desc));
> + } else {
> + GEM_BUG_ON(!n);
> + desc = 0;
> + }
>
> - writel(upper_32_bits(desc[0]), elsp);
> - /* The context is automatically loaded after the following */
> - writel(lower_32_bits(desc[0]), elsp);
> + writel(upper_32_bits(desc), elsp);
> + writel(lower_32_bits(desc), elsp);
> + }
> }
>
> static bool ctx_single_port_submission(const struct i915_gem_context *ctx)
> @@ -395,6 +388,18 @@ static bool can_merge_ctx(const struct i915_gem_context *prev,
> return true;
> }
>
> +static void port_assign(struct execlist_port *port,
> + struct drm_i915_gem_request *rq)
> +{
> + GEM_BUG_ON(rq == port_request(port));
> +
> + if (port->request_count)
> + i915_gem_request_put(port_request(port));
> +
> + port->request_count =
> + port_pack(i915_gem_request_get(rq), port_count(port));
> +}
> +
> static void execlists_dequeue(struct intel_engine_cs *engine)
> {
> struct drm_i915_gem_request *last;
> @@ -402,7 +407,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> struct rb_node *rb;
> bool submit = false;
>
> - last = port->request;
> + last = port_request(port);
> if (last)
> /* WaIdleLiteRestore:bdw,skl
> * Apply the wa NOOPs to prevent ring:HEAD == req:TAIL
> @@ -412,7 +417,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> */
> last->tail = last->wa_tail;
>
> - GEM_BUG_ON(port[1].request);
> + GEM_BUG_ON(port[1].request_count);
>
> /* Hardware submission is through 2 ports. Conceptually each port
> * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
> @@ -469,7 +474,8 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>
> GEM_BUG_ON(last->ctx == cursor->ctx);
>
> - i915_gem_request_assign(&port->request, last);
> + if (submit)
> + port_assign(port, last);
> port++;
> }
>
> @@ -484,7 +490,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> submit = true;
> }
> if (submit) {
> - i915_gem_request_assign(&port->request, last);
> + port_assign(port, last);
> engine->execlist_first = rb;
> }
> spin_unlock_irq(&engine->timeline->lock);
> @@ -495,14 +501,14 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>
> static bool execlists_elsp_idle(struct intel_engine_cs *engine)
> {
> - return !engine->execlist_port[0].request;
> + return !port_count(&engine->execlist_port[0]);
> }
>
> static bool execlists_elsp_ready(const struct intel_engine_cs *engine)
> {
> const struct execlist_port *port = engine->execlist_port;
>
> - return port[0].count + port[1].count < 2;
> + return port_count(&port[0]) + port_count(&port[1]) < 2;
> }
>
> /*
> @@ -552,7 +558,9 @@ static void intel_lrc_irq_handler(unsigned long data)
> tail = GEN8_CSB_WRITE_PTR(head);
> head = GEN8_CSB_READ_PTR(head);
> while (head != tail) {
> + struct drm_i915_gem_request *rq;
> unsigned int status;
> + unsigned int count;
>
> if (++head == GEN8_CSB_ENTRIES)
> head = 0;
> @@ -582,20 +590,24 @@ static void intel_lrc_irq_handler(unsigned long data)
> GEM_DEBUG_BUG_ON(readl(buf + 2 * head + 1) !=
> port[0].context_id);
>
> - GEM_BUG_ON(port[0].count == 0);
> - if (--port[0].count == 0) {
> + rq = port_unpack(&port[0], &count);
> + GEM_BUG_ON(count == 0);
> + if (--count == 0) {
> GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED);
> - GEM_BUG_ON(!i915_gem_request_completed(port[0].request));
> - execlists_context_status_change(port[0].request,
> - INTEL_CONTEXT_SCHEDULE_OUT);
> + GEM_BUG_ON(!i915_gem_request_completed(rq));
> + execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
> +
> + trace_i915_gem_request_out(rq);
> + i915_gem_request_put(rq);
>
> - trace_i915_gem_request_out(port[0].request);
> - i915_gem_request_put(port[0].request);
> port[0] = port[1];
> memset(&port[1], 0, sizeof(port[1]));
> + } else {
> + port[0].request_count = port_pack(rq, count);
> }
>
> - GEM_BUG_ON(port[0].count == 0 &&
> + /* After the final element, the hw should be idle */
> + GEM_BUG_ON(port_count(&port[0]) == 0 &&
> !(status & GEN8_CTX_STATUS_ACTIVE_IDLE));
> }
>
> @@ -1148,11 +1160,6 @@ static int intel_init_workaround_bb(struct intel_engine_cs *engine)
> return ret;
> }
>
> -static u32 port_seqno(struct execlist_port *port)
> -{
> - return port->request ? port->request->global_seqno : 0;
> -}
> -
> static int gen8_init_common_ring(struct intel_engine_cs *engine)
> {
> struct drm_i915_private *dev_priv = engine->i915;
> @@ -1177,12 +1184,22 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
> /* After a GPU reset, we may have requests to replay */
> clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
> if (!i915.enable_guc_submission && !execlists_elsp_idle(engine)) {
> - DRM_DEBUG_DRIVER("Restarting %s from requests [0x%x, 0x%x]\n",
> - engine->name,
> - port_seqno(&engine->execlist_port[0]),
> - port_seqno(&engine->execlist_port[1]));
> - engine->execlist_port[0].count = 0;
> - engine->execlist_port[1].count = 0;
> + struct execlist_port *port = engine->execlist_port;
> + unsigned int n;
> +
> + for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++) {
> + if (!port[n].request_count)
> + break;
> +
> + DRM_DEBUG_DRIVER("Restarting %s from 0x%x [%d]\n",
> + engine->name,
> + port_request(&port[n])->global_seqno,
> + n);
> +
> + /* Discard the current inflight count */
> + port[n].request_count = port_request(&port[n]);
> + }
> +
> execlists_submit_ports(engine);
> }
>
> @@ -1261,13 +1278,13 @@ static void reset_common_ring(struct intel_engine_cs *engine,
> intel_ring_update_space(request->ring);
>
> /* Catch up with any missed context-switch interrupts */
> - if (request->ctx != port[0].request->ctx) {
> - i915_gem_request_put(port[0].request);
> + if (request->ctx != port_request(&port[0])->ctx) {
> + i915_gem_request_put(port_request(&port[0]));
> port[0] = port[1];
> memset(&port[1], 0, sizeof(port[1]));
> }
>
> - GEM_BUG_ON(request->ctx != port[0].request->ctx);
> + GEM_BUG_ON(request->ctx != port_request(&port[0])->ctx);
>
> /* Reset WaIdleLiteRestore:bdw,skl as well */
> request->tail =
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index d25b88467e5e..39b733e5cfd3 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -377,8 +377,12 @@ struct intel_engine_cs {
> /* Execlists */
> struct tasklet_struct irq_tasklet;
> struct execlist_port {
> - struct drm_i915_gem_request *request;
> - unsigned int count;
> + struct drm_i915_gem_request *request_count;
Would req(uest)_slot maybe be a better name?
> +#define EXECLIST_COUNT_BITS 2
> +#define port_request(p) ptr_mask_bits((p)->request_count, EXECLIST_COUNT_BITS)
> +#define port_count(p) ptr_unmask_bits((p)->request_count, EXECLIST_COUNT_BITS)
> +#define port_pack(rq, count) ptr_pack_bits(rq, count, EXECLIST_COUNT_BITS)
> +#define port_unpack(p, count) ptr_unpack_bits((p)->request_count, count, EXECLIST_COUNT_BITS)
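Maybe also worth a short comment here spelling out for future readers
why this is safe: the packing relies on the request pointers being at
least 4-byte aligned (they come from the slab allocator, so that
should hold), which leaves the low EXECLIST_COUNT_BITS bits always
zero and free to carry the count. Conceptually (illustration only, the
real accessors above go through the ptr_*_bits helpers):

        packed = (unsigned long)rq | count;     /* count fits in 2 bits */
        rq = (struct drm_i915_gem_request *)(packed & ~3ul);
        count = packed & 3;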
> GEM_DEBUG_DECL(u32 context_id);
> } execlist_port[2];
> struct rb_root execlist_queue;
>