[Intel-gfx] [PATCH 3/9] drm/i915/execlists: Pack the count into the low bits of the port.request
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Fri May 5 10:49:21 UTC 2017
On 03/05/2017 12:37, Chris Wilson wrote:
> add/remove: 1/1 grow/shrink: 5/4 up/down: 391/-578 (-187)
> function                                  old     new   delta
> execlists_submit_ports                    262     471    +209
> port_assign.isra                            -     136    +136
> capture                                  6344    6359     +15
> reset_common_ring                         438     452     +14
> execlists_submit_request                  228     238     +10
> gen8_init_common_ring                     334     341      +7
> intel_engine_is_idle                      106     105      -1
> i915_engine_info                         2314    2290     -24
> __i915_gem_set_wedged_BKL                 485     411     -74
> intel_lrc_irq_handler                    1789    1604    -185
> execlists_update_context                  294       -    -294
>
> The most important change there is the improvement to
> intel_lrc_irq_handler and execlists_submit_ports (a net improvement since
> execlists_update_context is now inlined).
>
> v2: Use the port accessors for the GuC as well (even though we do not
> pack any counters in there yet) and hide all port->request_count
> accesses inside the helpers.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala at intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
> drivers/gpu/drm/i915/i915_debugfs.c | 32 ++++----
> drivers/gpu/drm/i915/i915_gem.c | 6 +-
> drivers/gpu/drm/i915/i915_gpu_error.c | 13 +++-
> drivers/gpu/drm/i915/i915_guc_submission.c | 32 +++++---
> drivers/gpu/drm/i915/intel_engine_cs.c | 2 +-
> drivers/gpu/drm/i915/intel_lrc.c | 117 ++++++++++++++++-------------
> drivers/gpu/drm/i915/intel_ringbuffer.h | 10 ++-
> 7 files changed, 122 insertions(+), 90 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 870c470177b5..0b5d7142d8d9 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -3315,6 +3315,7 @@ static int i915_engine_info(struct seq_file *m, void *unused)
> if (i915.enable_execlists) {
> u32 ptr, read, write;
> struct rb_node *rb;
> + unsigned int idx;
>
> seq_printf(m, "\tExeclist status: 0x%08x %08x\n",
> I915_READ(RING_EXECLIST_STATUS_LO(engine)),
> @@ -3332,8 +3333,7 @@ static int i915_engine_info(struct seq_file *m, void *unused)
> if (read > write)
> write += GEN8_CSB_ENTRIES;
> while (read < write) {
> - unsigned int idx = ++read % GEN8_CSB_ENTRIES;
> -
> + idx = ++read % GEN8_CSB_ENTRIES;
> seq_printf(m, "\tExeclist CSB[%d]: 0x%08x, context: %d\n",
> idx,
> I915_READ(RING_CONTEXT_STATUS_BUF_LO(engine, idx)),
> @@ -3341,21 +3341,19 @@ static int i915_engine_info(struct seq_file *m, void *unused)
> }
>
> rcu_read_lock();
> - rq = READ_ONCE(engine->execlist_port[0].request);
> - if (rq) {
> - seq_printf(m, "\t\tELSP[0] count=%d, ",
> - engine->execlist_port[0].count);
> - print_request(m, rq, "rq: ");
> - } else {
> - seq_printf(m, "\t\tELSP[0] idle\n");
> - }
> - rq = READ_ONCE(engine->execlist_port[1].request);
> - if (rq) {
> - seq_printf(m, "\t\tELSP[1] count=%d, ",
> - engine->execlist_port[1].count);
> - print_request(m, rq, "rq: ");
> - } else {
> - seq_printf(m, "\t\tELSP[1] idle\n");
> + for (idx = 0; idx < ARRAY_SIZE(engine->execlist_port); idx++) {
> + unsigned int count;
> +
> + rq = port_unpack(&engine->execlist_port[idx],
> + &count);
> + if (rq) {
> + seq_printf(m, "\t\tELSP[%d] count=%d, ",
> + idx, count);
> + print_request(m, rq, "rq: ");
> + } else {
> + seq_printf(m, "\t\tELSP[%d] idle\n",
> + idx);
> + }
> }
> rcu_read_unlock();
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index f79079f2e265..c1df4b9d2661 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3038,12 +3038,14 @@ static void engine_set_wedged(struct intel_engine_cs *engine)
> */
>
> if (i915.enable_execlists) {
> + struct execlist_port *port = engine->execlist_port;
> unsigned long flags;
> + unsigned int n;
>
> spin_lock_irqsave(&engine->timeline->lock, flags);
>
> - i915_gem_request_put(engine->execlist_port[0].request);
> - i915_gem_request_put(engine->execlist_port[1].request);
> + for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++)
> + i915_gem_request_put(port_request(&port[n]));
> memset(engine->execlist_port, 0, sizeof(engine->execlist_port));
> engine->execlist_queue = RB_ROOT;
> engine->execlist_first = NULL;
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index ec526d92f908..e18f350bc364 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -1324,12 +1324,17 @@ static void engine_record_requests(struct intel_engine_cs *engine,
> static void error_record_engine_execlists(struct intel_engine_cs *engine,
> struct drm_i915_error_engine *ee)
> {
> + const struct execlist_port *port = engine->execlist_port;
> unsigned int n;
>
> - for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++)
> - if (engine->execlist_port[n].request)
> - record_request(engine->execlist_port[n].request,
> - &ee->execlist[n]);
> + for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++) {
> + struct drm_i915_gem_request *rq = port_request(&port[n]);
> +
> + if (!rq)
> + break;
> +
> + record_request(rq, &ee->execlist[n]);
> + }
> }
>
> static void record_context(struct drm_i915_error_context *e,
> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> index 7e85b5ab8ae2..62d3831a8a0d 100644
> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> @@ -653,10 +653,22 @@ static void nested_enable_signaling(struct drm_i915_gem_request *rq)
> spin_unlock(&rq->lock);
> }
>
> +static void port_assign(struct execlist_port *port,
> + struct drm_i915_gem_request *rq)
> +{
> + GEM_BUG_ON(rq == port_request(port));
> +
> + if (port_isset(port))
> + i915_gem_request_put(port_request(port));
> +
> + port_set(port, i915_gem_request_get(rq));
> + nested_enable_signaling(rq);
> +}
> +
> static bool i915_guc_dequeue(struct intel_engine_cs *engine)
> {
> struct execlist_port *port = engine->execlist_port;
> - struct drm_i915_gem_request *last = port[0].request;
> + struct drm_i915_gem_request *last = port_request(&port[0]);
> struct rb_node *rb;
> bool submit = false;
>
> @@ -670,8 +682,7 @@ static bool i915_guc_dequeue(struct intel_engine_cs *engine)
> if (port != engine->execlist_port)
> break;
>
> - i915_gem_request_assign(&port->request, last);
> - nested_enable_signaling(last);
> + port_assign(port, last);
> port++;
> }
>
> @@ -686,8 +697,7 @@ static bool i915_guc_dequeue(struct intel_engine_cs *engine)
> submit = true;
> }
> if (submit) {
> - i915_gem_request_assign(&port->request, last);
> - nested_enable_signaling(last);
> + port_assign(port, last);
> engine->execlist_first = rb;
> }
> spin_unlock_irq(&engine->timeline->lock);
> @@ -703,17 +713,19 @@ static void i915_guc_irq_handler(unsigned long data)
> bool submit;
>
> do {
> - rq = port[0].request;
> + rq = port_request(&port[0]);
> while (rq && i915_gem_request_completed(rq)) {
> trace_i915_gem_request_out(rq);
> i915_gem_request_put(rq);
> - port[0].request = port[1].request;
> - port[1].request = NULL;
> - rq = port[0].request;
> +
> + port[0] = port[1];
> + memset(&port[1], 0, sizeof(port[1]));
> +
> + rq = port_request(&port[0]);
> }
>
> submit = false;
> - if (!port[1].request)
> + if (!port_count(&port[1]))
> submit = i915_guc_dequeue(engine);
> } while (submit);
> }
> diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
> index 6d3d83876da9..24659788e7a3 100644
> --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> @@ -1230,7 +1230,7 @@ bool intel_engine_is_idle(struct intel_engine_cs *engine)
> return false;
>
> /* Both ports drained, no more ELSP submission? */
> - if (engine->execlist_port[0].request)
> + if (port_request(&engine->execlist_port[0]))
> return false;
>
> /* Ring stopped? */
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 0909549ad320..9f7b6112c53a 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -337,39 +337,32 @@ static u64 execlists_update_context(struct drm_i915_gem_request *rq)
>
> static void execlists_submit_ports(struct intel_engine_cs *engine)
> {
> - struct drm_i915_private *dev_priv = engine->i915;
> struct execlist_port *port = engine->execlist_port;
> u32 __iomem *elsp =
> - dev_priv->regs + i915_mmio_reg_offset(RING_ELSP(engine));
> - u64 desc[2];
> -
> - GEM_BUG_ON(port[0].count > 1);
> - if (!port[0].count)
> - execlists_context_status_change(port[0].request,
> - INTEL_CONTEXT_SCHEDULE_IN);
> - desc[0] = execlists_update_context(port[0].request);
> - GEM_DEBUG_EXEC(port[0].context_id = upper_32_bits(desc[0]));
> - port[0].count++;
> -
> - if (port[1].request) {
> - GEM_BUG_ON(port[1].count);
> - execlists_context_status_change(port[1].request,
> - INTEL_CONTEXT_SCHEDULE_IN);
> - desc[1] = execlists_update_context(port[1].request);
> - GEM_DEBUG_EXEC(port[1].context_id = upper_32_bits(desc[1]));
> - port[1].count = 1;
> - } else {
> - desc[1] = 0;
> - }
> - GEM_BUG_ON(desc[0] == desc[1]);
> + engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
> + unsigned int n;
>
> - /* You must always write both descriptors in the order below. */
> - writel(upper_32_bits(desc[1]), elsp);
> - writel(lower_32_bits(desc[1]), elsp);
> + for (n = ARRAY_SIZE(engine->execlist_port); n--; ) {
> + struct drm_i915_gem_request *rq;
> + unsigned int count;
> + u64 desc;
> +
> + rq = port_unpack(&port[n], &count);
> + if (rq) {
> + GEM_BUG_ON(count > !n);
> + if (!count++)
> + execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
> + port_set(&port[n], port_pack(rq, count));
> + desc = execlists_update_context(rq);
> + GEM_DEBUG_EXEC(port[n].context_id = upper_32_bits(desc));
> + } else {
> + GEM_BUG_ON(!n);
> + desc = 0;
> + }
>
> - writel(upper_32_bits(desc[0]), elsp);
> - /* The context is automatically loaded after the following */
> - writel(lower_32_bits(desc[0]), elsp);
> + writel(upper_32_bits(desc), elsp);
> + writel(lower_32_bits(desc), elsp);
> + }
> }
>
> static bool ctx_single_port_submission(const struct i915_gem_context *ctx)
> @@ -390,6 +383,17 @@ static bool can_merge_ctx(const struct i915_gem_context *prev,
> return true;
> }
>
> +static void port_assign(struct execlist_port *port,
> + struct drm_i915_gem_request *rq)
> +{
> + GEM_BUG_ON(rq == port_request(port));
> +
> + if (port_isset(port))
> + i915_gem_request_put(port_request(port));
> +
> + port_set(port, port_pack(i915_gem_request_get(rq), port_count(port)));
> +}
Is there a reason not to have a single port_assign() implementation, with
nested_enable_signaling() called outside it in the GuC case?
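Something like this, perhaps (untested sketch, reusing the names from this
patch), with the GuC dequeue doing nested_enable_signaling(last) itself
right after the call:

	/* One shared, count-preserving port_assign() for both backends. */
	static void port_assign(struct execlist_port *port,
				struct drm_i915_gem_request *rq)
	{
		GEM_BUG_ON(rq == port_request(port));

		if (port_isset(port))
			i915_gem_request_put(port_request(port));

		port_set(port,
			 port_pack(i915_gem_request_get(rq), port_count(port)));
	}

Since the GuC path never packs a non-zero count yet, preserving
port_count() there should be a no-op.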
> +
> static void execlists_dequeue(struct intel_engine_cs *engine)
> {
> struct drm_i915_gem_request *last;
> @@ -397,7 +401,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> struct rb_node *rb;
> bool submit = false;
>
> - last = port->request;
> + last = port_request(port);
> if (last)
> /* WaIdleLiteRestore:bdw,skl
> * Apply the wa NOOPs to prevent ring:HEAD == req:TAIL
> @@ -407,7 +411,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> */
> last->tail = last->wa_tail;
>
> - GEM_BUG_ON(port[1].request);
> + GEM_BUG_ON(port_isset(&port[1]));
>
> /* Hardware submission is through 2 ports. Conceptually each port
> * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
> @@ -464,7 +468,8 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>
> GEM_BUG_ON(last->ctx == cursor->ctx);
>
> - i915_gem_request_assign(&port->request, last);
> + if (submit)
> + port_assign(port, last);
Is the conditional assignment an optimisation? If so, perhaps add a
GEM_BUG_ON(last != port_request(port)) for the case where the assignment
is skipped?
> port++;
> }
>
> @@ -479,7 +484,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> submit = true;
> }
> if (submit) {
> - i915_gem_request_assign(&port->request, last);
> + port_assign(port, last);
> engine->execlist_first = rb;
> }
> spin_unlock_irq(&engine->timeline->lock);
> @@ -488,16 +493,11 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> execlists_submit_ports(engine);
> }
>
> -static bool execlists_elsp_idle(struct intel_engine_cs *engine)
> -{
> - return !engine->execlist_port[0].request;
> -}
> -
> static bool execlists_elsp_ready(const struct intel_engine_cs *engine)
> {
> const struct execlist_port *port = engine->execlist_port;
>
> - return port[0].count + port[1].count < 2;
> + return port_count(&port[0]) + port_count(&port[1]) < 2;
> }
>
> /*
> @@ -547,7 +547,9 @@ static void intel_lrc_irq_handler(unsigned long data)
> tail = GEN8_CSB_WRITE_PTR(head);
> head = GEN8_CSB_READ_PTR(head);
> while (head != tail) {
> + struct drm_i915_gem_request *rq;
> unsigned int status;
> + unsigned int count;
>
> if (++head == GEN8_CSB_ENTRIES)
> head = 0;
> @@ -577,20 +579,24 @@ static void intel_lrc_irq_handler(unsigned long data)
> GEM_DEBUG_BUG_ON(readl(buf + 2 * head + 1) !=
> port[0].context_id);
>
> - GEM_BUG_ON(port[0].count == 0);
> - if (--port[0].count == 0) {
> + rq = port_unpack(&port[0], &count);
> + GEM_BUG_ON(count == 0);
> + if (--count == 0) {
If you changed this to:

	count--;
	port_set(...);
	if (count > 0)
		continue;

it would perhaps be a bit more readable, at the cost of a potentially
useless port_set. Not sure.
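i.e. roughly (sketch only, just reshuffling the lines from this hunk; when
count stays above zero the trailing ACTIVE_IDLE assert is trivially true
anyway, so skipping it via the continue looks harmless):

	rq = port_unpack(&port[0], &count);
	GEM_BUG_ON(count == 0);

	count--;
	port_set(&port[0], port_pack(rq, count));
	if (count > 0)
		continue;

	/* count hit zero: this request has completed on the hardware */
	GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED);
	GEM_BUG_ON(!i915_gem_request_completed(rq));
	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);

	trace_i915_gem_request_out(rq);
	i915_gem_request_put(rq);

	port[0] = port[1];
	memset(&port[1], 0, sizeof(port[1]));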
> GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED);
> - GEM_BUG_ON(!i915_gem_request_completed(port[0].request));
> - execlists_context_status_change(port[0].request,
> - INTEL_CONTEXT_SCHEDULE_OUT);
> + GEM_BUG_ON(!i915_gem_request_completed(rq));
> + execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
> +
> + trace_i915_gem_request_out(rq);
> + i915_gem_request_put(rq);
>
> - trace_i915_gem_request_out(port[0].request);
> - i915_gem_request_put(port[0].request);
> port[0] = port[1];
> memset(&port[1], 0, sizeof(port[1]));
> + } else {
> + port_set(&port[0], port_pack(rq, count));
> }
>
> - GEM_BUG_ON(port[0].count == 0 &&
> + /* After the final element, the hw should be idle */
> + GEM_BUG_ON(port_count(&port[0]) == 0 &&
> !(status & GEN8_CTX_STATUS_ACTIVE_IDLE));
> }
>
> @@ -1148,6 +1154,7 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
> struct drm_i915_private *dev_priv = engine->i915;
> struct execlist_port *port = engine->execlist_port;
> unsigned int n;
> + bool submit;
> int ret;
>
> ret = intel_mocs_init_engine(engine);
> @@ -1169,19 +1176,21 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
> /* After a GPU reset, we may have requests to replay */
> clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
>
> + submit = false;
> for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++) {
> - if (!port[n].request)
> + if (!port_isset(&port[n]))
> break;
>
> DRM_DEBUG_DRIVER("Restarting %s:%d from 0x%x\n",
> engine->name, n,
> - port[n].request->global_seqno);
> + port_request(&port[n])->global_seqno);
>
> /* Discard the current inflight count */
> - port[n].count = 0;
> + port_set(&port[n], port_request(&port[n]));
> + submit = true;
> }
>
> - if (!i915.enable_guc_submission && !execlists_elsp_idle(engine))
> + if (submit && !i915.enable_guc_submission)
> execlists_submit_ports(engine);
>
> return 0;
> @@ -1259,13 +1268,13 @@ static void reset_common_ring(struct intel_engine_cs *engine,
> intel_ring_update_space(request->ring);
>
> /* Catch up with any missed context-switch interrupts */
> - if (request->ctx != port[0].request->ctx) {
> - i915_gem_request_put(port[0].request);
> + if (request->ctx != port_request(&port[0])->ctx) {
> + i915_gem_request_put(port_request(&port[0]));
> port[0] = port[1];
> memset(&port[1], 0, sizeof(port[1]));
> }
>
> - GEM_BUG_ON(request->ctx != port[0].request->ctx);
> + GEM_BUG_ON(request->ctx != port_request(&port[0])->ctx);
>
> /* Reset WaIdleLiteRestore:bdw,skl as well */
> request->tail =
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 4a599e3480a9..68765d45116b 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -368,8 +368,14 @@ struct intel_engine_cs {
> /* Execlists */
> struct tasklet_struct irq_tasklet;
> struct execlist_port {
> - struct drm_i915_gem_request *request;
> - unsigned int count;
> + struct drm_i915_gem_request *request_count;
> +#define EXECLIST_COUNT_BITS 2
> +#define port_request(p) ptr_mask_bits((p)->request_count, EXECLIST_COUNT_BITS)
> +#define port_count(p) ptr_unmask_bits((p)->request_count, EXECLIST_COUNT_BITS)
> +#define port_pack(rq, count) ptr_pack_bits(rq, count, EXECLIST_COUNT_BITS)
> +#define port_unpack(p, count) ptr_unpack_bits((p)->request_count, count, EXECLIST_COUNT_BITS)
> +#define port_set(p, packed) ((p)->request_count = (packed))
> +#define port_isset(p) ((p)->request_count)
> GEM_DEBUG_DECL(u32 context_id);
> } execlist_port[2];
> struct rb_root execlist_queue;
>
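For readers not familiar with ptr_pack_bits(), the request_count trick is
just stashing a small counter in the pointer's alignment bits, roughly
along these lines (a standalone illustration only, not the actual i915
helpers):

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	#define COUNT_BITS 2
	#define COUNT_MASK ((1UL << COUNT_BITS) - 1)

	/* Pack a small count into the low (alignment) bits of a pointer. */
	static void *pack(void *ptr, unsigned long count)
	{
		assert(((uintptr_t)ptr & COUNT_MASK) == 0); /* >= 4-byte aligned */
		assert(count <= COUNT_MASK);
		return (void *)((uintptr_t)ptr | count);
	}

	/* Recover the original pointer and the packed count. */
	static void *unpack(void *packed, unsigned long *count)
	{
		*count = (uintptr_t)packed & COUNT_MASK;
		return (void *)((uintptr_t)packed & ~(uintptr_t)COUNT_MASK);
	}

	int main(void)
	{
		static long dummy; /* stands in for a request, >= 4-byte aligned */
		unsigned long count;
		void *packed = pack(&dummy, 1);
		void *ptr = unpack(packed, &count);

		printf("ptr ok: %d, count=%lu\n", ptr == (void *)&dummy, count);
		return 0;
	}

So port_count() is cheap to read, but every real dereference has to go
through port_request() to strip the low bits first, which is what the
extra helpers are hiding.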
Looks correct, but I am still having trouble accepting that the structure
shrink and code savings are worth the hit to readability.
Regards,
Tvrtko