[Intel-gfx] [PATCH 13/27] drm/i915/execlists: Pack the count into the low bits of the port.request
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Thu Apr 20 14:58:19 UTC 2017
On 19/04/2017 10:41, Chris Wilson wrote:
> add/remove: 1/1 grow/shrink: 5/4 up/down: 391/-578 (-187)
> function                                     old     new   delta
> execlists_submit_ports                       262     471    +209
> port_assign.isra                               -     136    +136
> capture                                     6344    6359     +15
> reset_common_ring                            438     452     +14
> execlists_submit_request                     228     238     +10
> gen8_init_common_ring                        334     341      +7
> intel_engine_is_idle                         106     105      -1
> i915_engine_info                            2314    2290     -24
> __i915_gem_set_wedged_BKL                    485     411     -74
> intel_lrc_irq_handler                       1789    1604    -185
> execlists_update_context                     294       -    -294
>
> The most important change there is the improvement to
> intel_lrc_irq_handler and execlists_submit_ports (a net improvement
> since execlists_update_context is now inlined).
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala at intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
> drivers/gpu/drm/i915/i915_debugfs.c        |  32 ++++---
> drivers/gpu/drm/i915/i915_gem.c            |   6 +-
> drivers/gpu/drm/i915/i915_gpu_error.c      |  13 ++-
> drivers/gpu/drm/i915/i915_guc_submission.c |  18 ++--
> drivers/gpu/drm/i915/intel_engine_cs.c     |   2 +-
> drivers/gpu/drm/i915/intel_lrc.c           | 133 ++++++++++++++++-------------
> drivers/gpu/drm/i915/intel_ringbuffer.h    |   8 +-
> 7 files changed, 120 insertions(+), 92 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 870c470177b5..0b5d7142d8d9 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -3315,6 +3315,7 @@ static int i915_engine_info(struct seq_file *m, void *unused)
> if (i915.enable_execlists) {
> u32 ptr, read, write;
> struct rb_node *rb;
> + unsigned int idx;
>
> seq_printf(m, "\tExeclist status: 0x%08x %08x\n",
> I915_READ(RING_EXECLIST_STATUS_LO(engine)),
> @@ -3332,8 +3333,7 @@ static int i915_engine_info(struct seq_file *m, void *unused)
> if (read > write)
> write += GEN8_CSB_ENTRIES;
> while (read < write) {
> - unsigned int idx = ++read % GEN8_CSB_ENTRIES;
> -
> + idx = ++read % GEN8_CSB_ENTRIES;
> seq_printf(m, "\tExeclist CSB[%d]: 0x%08x, context: %d\n",
> idx,
> I915_READ(RING_CONTEXT_STATUS_BUF_LO(engine, idx)),
> @@ -3341,21 +3341,19 @@ static int i915_engine_info(struct seq_file *m, void *unused)
> }
>
> rcu_read_lock();
> - rq = READ_ONCE(engine->execlist_port[0].request);
> - if (rq) {
> - seq_printf(m, "\t\tELSP[0] count=%d, ",
> - engine->execlist_port[0].count);
> - print_request(m, rq, "rq: ");
> - } else {
> - seq_printf(m, "\t\tELSP[0] idle\n");
> - }
> - rq = READ_ONCE(engine->execlist_port[1].request);
> - if (rq) {
> - seq_printf(m, "\t\tELSP[1] count=%d, ",
> - engine->execlist_port[1].count);
> - print_request(m, rq, "rq: ");
> - } else {
> - seq_printf(m, "\t\tELSP[1] idle\n");
> + for (idx = 0; idx < ARRAY_SIZE(engine->execlist_port); idx++) {
> + unsigned int count;
> +
> + rq = port_unpack(&engine->execlist_port[idx],
> + &count);
> + if (rq) {
> + seq_printf(m, "\t\tELSP[%d] count=%d, ",
> + idx, count);
> + print_request(m, rq, "rq: ");
> + } else {
> + seq_printf(m, "\t\tELSP[%d] idle\n",
> + idx);
> + }
> }
> rcu_read_unlock();
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 2bc72314cdd1..f6df402a5247 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3039,12 +3039,14 @@ static void engine_set_wedged(struct intel_engine_cs *engine)
> */
>
> if (i915.enable_execlists) {
> + struct execlist_port *port = engine->execlist_port;
> unsigned long flags;
> + unsigned int n;
>
> spin_lock_irqsave(&engine->timeline->lock, flags);
>
> - i915_gem_request_put(engine->execlist_port[0].request);
> - i915_gem_request_put(engine->execlist_port[1].request);
> + for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++)
> + i915_gem_request_put(port_request(&port[n]));
> memset(engine->execlist_port, 0, sizeof(engine->execlist_port));
> engine->execlist_queue = RB_ROOT;
> engine->execlist_first = NULL;
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 4b247b050dcd..c5cdc6611d7f 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -1324,12 +1324,17 @@ static void engine_record_requests(struct intel_engine_cs *engine,
> static void error_record_engine_execlists(struct intel_engine_cs *engine,
> struct drm_i915_error_engine *ee)
> {
> + const struct execlist_port *port = engine->execlist_port;
> unsigned int n;
>
> - for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++)
> - if (engine->execlist_port[n].request)
> - record_request(engine->execlist_port[n].request,
> - &ee->execlist[n]);
> + for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++) {
> + struct drm_i915_gem_request *rq = port_request(&port[n]);
> +
> + if (!rq)
> + break;
> +
> + record_request(rq, &ee->execlist[n]);
> + }
> }
>
> static void record_context(struct drm_i915_error_context *e,
> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> index 1642fff9cf13..370373c97b81 100644
> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> @@ -658,7 +658,7 @@ static void nested_enable_signaling(struct drm_i915_gem_request *rq)
> static bool i915_guc_dequeue(struct intel_engine_cs *engine)
> {
> struct execlist_port *port = engine->execlist_port;
> - struct drm_i915_gem_request *last = port[0].request;
> + struct drm_i915_gem_request *last = port[0].request_count;
It's confusing that in this new scheme we sometimes access the request
directly and sometimes have to go through the port_request macro. So
maybe we should always use the port_request macro. Hm, could we invent
a new type to help enforce that? Like:
struct drm_i915_gem_port_request_slot {
        struct drm_i915_gem_request *req_count;
};
And then the execlist port array would contain these, and the helpers
would need to become functions?
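Roughly like this, purely as an illustration (names and helpers are
mine and not compile-tested; the bit ops are open-coded here only to
show the intent, the real thing would presumably keep using the
ptr_*_bits helpers):

struct drm_i915_gem_port_request_slot {
        /* Low EXECLIST_COUNT_BITS bits carry the submission count. */
        unsigned long req_count;
};

static inline struct drm_i915_gem_request *
port_slot_request(const struct drm_i915_gem_port_request_slot *slot)
{
        return (struct drm_i915_gem_request *)
                (slot->req_count & ~((1ul << EXECLIST_COUNT_BITS) - 1));
}

static inline unsigned int
port_slot_count(const struct drm_i915_gem_port_request_slot *slot)
{
        return slot->req_count & ((1ul << EXECLIST_COUNT_BITS) - 1);
}

static inline void
port_slot_set(struct drm_i915_gem_port_request_slot *slot,
              struct drm_i915_gem_request *rq,
              unsigned int count)
{
        /* The pointer must leave the low bits free for the count. */
        GEM_BUG_ON((unsigned long)rq & ((1ul << EXECLIST_COUNT_BITS) - 1));
        GEM_BUG_ON(count >= (1u << EXECLIST_COUNT_BITS));
        slot->req_count = (unsigned long)rq | count;
}

That way nothing outside the helpers can dereference the slot by
accident and the compiler forces everything through the accessors.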
I've also noticed some GVT/GuC patches which sound like they add the
same single-submission constraints, so maybe now is the time to unify
the dequeue? (I haven't looked at those patches deeper than the
subject line, so I might be wrong.)
Not 100% sure about all of the above, I would need to sketch it out.
What are your thoughts?
> struct rb_node *rb;
> bool submit = false;
>
> @@ -672,7 +672,7 @@ static bool i915_guc_dequeue(struct intel_engine_cs *engine)
> if (port != engine->execlist_port)
> break;
>
> - i915_gem_request_assign(&port->request, last);
> + i915_gem_request_assign(&port->request_count, last);
> nested_enable_signaling(last);
> port++;
> }
> @@ -688,7 +688,7 @@ static bool i915_guc_dequeue(struct intel_engine_cs *engine)
> submit = true;
> }
> if (submit) {
> - i915_gem_request_assign(&port->request, last);
> + i915_gem_request_assign(&port->request_count, last);
> nested_enable_signaling(last);
> engine->execlist_first = rb;
> }
> @@ -705,17 +705,19 @@ static void i915_guc_irq_handler(unsigned long data)
> bool submit;
>
> do {
> - rq = port[0].request;
> + rq = port[0].request_count;
> while (rq && i915_gem_request_completed(rq)) {
> trace_i915_gem_request_out(rq);
> i915_gem_request_put(rq);
> - port[0].request = port[1].request;
> - port[1].request = NULL;
> - rq = port[0].request;
> +
> + port[0].request_count = port[1].request_count;
> + port[1].request_count = NULL;
> +
> + rq = port[0].request_count;
> }
>
> submit = false;
> - if (!port[1].request)
> + if (!port[1].request_count)
> submit = i915_guc_dequeue(engine);
> } while (submit);
> }
> diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
> index 402769d9d840..10027d0a09b5 100644
> --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> @@ -1148,7 +1148,7 @@ bool intel_engine_is_idle(struct intel_engine_cs *engine)
> return false;
>
> /* Both ports drained, no more ELSP submission? */
> - if (engine->execlist_port[0].request)
> + if (port_request(&engine->execlist_port[0]))
> return false;
>
> /* Ring stopped? */
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 7df278fe492e..69299fbab4f9 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -342,39 +342,32 @@ static u64 execlists_update_context(struct drm_i915_gem_request *rq)
>
> static void execlists_submit_ports(struct intel_engine_cs *engine)
> {
> - struct drm_i915_private *dev_priv = engine->i915;
> struct execlist_port *port = engine->execlist_port;
> u32 __iomem *elsp =
> - dev_priv->regs + i915_mmio_reg_offset(RING_ELSP(engine));
> - u64 desc[2];
> -
> - GEM_BUG_ON(port[0].count > 1);
> - if (!port[0].count)
> - execlists_context_status_change(port[0].request,
> - INTEL_CONTEXT_SCHEDULE_IN);
> - desc[0] = execlists_update_context(port[0].request);
> - GEM_DEBUG_EXEC(port[0].context_id = upper_32_bits(desc[0]));
> - port[0].count++;
> -
> - if (port[1].request) {
> - GEM_BUG_ON(port[1].count);
> - execlists_context_status_change(port[1].request,
> - INTEL_CONTEXT_SCHEDULE_IN);
> - desc[1] = execlists_update_context(port[1].request);
> - GEM_DEBUG_EXEC(port[1].context_id = upper_32_bits(desc[1]));
> - port[1].count = 1;
> - } else {
> - desc[1] = 0;
> - }
> - GEM_BUG_ON(desc[0] == desc[1]);
> -
> - /* You must always write both descriptors in the order below. */
> - writel(upper_32_bits(desc[1]), elsp);
> - writel(lower_32_bits(desc[1]), elsp);
> + engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
> + unsigned int n;
> +
> + for (n = ARRAY_SIZE(engine->execlist_port); n--; ) {
We could also add for_each_req_port or something, to iterate the ports
and unpack either just the request or the count as well?
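Something along these lines perhaps (untested, the exact shape is just
illustrative), reusing port_unpack from this patch:

#define for_each_req_port(port__, rq__, count__, engine__) \
        for ((port__) = (engine__)->execlist_port; \
             (port__) < (engine__)->execlist_port + \
                        ARRAY_SIZE((engine__)->execlist_port) && \
             (((rq__) = port_unpack((port__), &(count__))), 1); \
             (port__)++)

with callers doing something like:

        struct execlist_port *port;
        struct drm_i915_gem_request *rq;
        unsigned int count;

        for_each_req_port(port, rq, count, engine) {
                if (!rq)
                        break;
                /* use rq and count for this port */
        }

The debugfs dump would keep walking empty ports to print "idle", the
other users could just break on the first empty port as above.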
Preliminary pass only before the mtg. :)
Regards,
Tvrtko
> + struct drm_i915_gem_request *rq;
> + unsigned int count;
> + u64 desc;
> +
> + rq = port_unpack(&port[n], &count);
> + if (rq) {
> + GEM_BUG_ON(count > !n);
> + if (!count++)
> + execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
> + port[n].request_count = port_pack(rq, count);
> + desc = execlists_update_context(rq);
> + GEM_DEBUG_EXEC(port[n].context_id = upper_32_bits(desc));
> + } else {
> + GEM_BUG_ON(!n);
> + desc = 0;
> + }
>
> - writel(upper_32_bits(desc[0]), elsp);
> - /* The context is automatically loaded after the following */
> - writel(lower_32_bits(desc[0]), elsp);
> + writel(upper_32_bits(desc), elsp);
> + writel(lower_32_bits(desc), elsp);
> + }
> }
>
> static bool ctx_single_port_submission(const struct i915_gem_context *ctx)
> @@ -395,6 +388,18 @@ static bool can_merge_ctx(const struct i915_gem_context *prev,
> return true;
> }
>
> +static void port_assign(struct execlist_port *port,
> + struct drm_i915_gem_request *rq)
> +{
> + GEM_BUG_ON(rq == port_request(port));
> +
> + if (port->request_count)
> + i915_gem_request_put(port_request(port));
> +
> + port->request_count =
> + port_pack(i915_gem_request_get(rq), port_count(port));
> +}
> +
> static void execlists_dequeue(struct intel_engine_cs *engine)
> {
> struct drm_i915_gem_request *last;
> @@ -402,7 +407,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> struct rb_node *rb;
> bool submit = false;
>
> - last = port->request;
> + last = port_request(port);
> if (last)
> /* WaIdleLiteRestore:bdw,skl
> * Apply the wa NOOPs to prevent ring:HEAD == req:TAIL
> @@ -412,7 +417,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> */
> last->tail = last->wa_tail;
>
> - GEM_BUG_ON(port[1].request);
> + GEM_BUG_ON(port[1].request_count);
>
> /* Hardware submission is through 2 ports. Conceptually each port
> * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
> @@ -469,7 +474,8 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>
> GEM_BUG_ON(last->ctx == cursor->ctx);
>
> - i915_gem_request_assign(&port->request, last);
> + if (submit)
> + port_assign(port, last);
> port++;
> }
>
> @@ -484,7 +490,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> submit = true;
> }
> if (submit) {
> - i915_gem_request_assign(&port->request, last);
> + port_assign(port, last);
> engine->execlist_first = rb;
> }
> spin_unlock_irq(&engine->timeline->lock);
> @@ -495,14 +501,14 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>
> static bool execlists_elsp_idle(struct intel_engine_cs *engine)
> {
> - return !engine->execlist_port[0].request;
> + return !port_count(&engine->execlist_port[0]);
> }
>
> static bool execlists_elsp_ready(const struct intel_engine_cs *engine)
> {
> const struct execlist_port *port = engine->execlist_port;
>
> - return port[0].count + port[1].count < 2;
> + return port_count(&port[0]) + port_count(&port[1]) < 2;
> }
>
> /*
> @@ -552,7 +558,9 @@ static void intel_lrc_irq_handler(unsigned long data)
> tail = GEN8_CSB_WRITE_PTR(head);
> head = GEN8_CSB_READ_PTR(head);
> while (head != tail) {
> + struct drm_i915_gem_request *rq;
> unsigned int status;
> + unsigned int count;
>
> if (++head == GEN8_CSB_ENTRIES)
> head = 0;
> @@ -582,20 +590,24 @@ static void intel_lrc_irq_handler(unsigned long data)
> GEM_DEBUG_BUG_ON(readl(buf + 2 * head + 1) !=
> port[0].context_id);
>
> - GEM_BUG_ON(port[0].count == 0);
> - if (--port[0].count == 0) {
> + rq = port_unpack(&port[0], &count);
> + GEM_BUG_ON(count == 0);
> + if (--count == 0) {
> GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED);
> - GEM_BUG_ON(!i915_gem_request_completed(port[0].request));
> - execlists_context_status_change(port[0].request,
> - INTEL_CONTEXT_SCHEDULE_OUT);
> + GEM_BUG_ON(!i915_gem_request_completed(rq));
> + execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
> +
> + trace_i915_gem_request_out(rq);
> + i915_gem_request_put(rq);
>
> - trace_i915_gem_request_out(port[0].request);
> - i915_gem_request_put(port[0].request);
> port[0] = port[1];
> memset(&port[1], 0, sizeof(port[1]));
> + } else {
> + port[0].request_count = port_pack(rq, count);
> }
>
> - GEM_BUG_ON(port[0].count == 0 &&
> + /* After the final element, the hw should be idle */
> + GEM_BUG_ON(port_count(&port[0]) == 0 &&
> !(status & GEN8_CTX_STATUS_ACTIVE_IDLE));
> }
>
> @@ -1148,11 +1160,6 @@ static int intel_init_workaround_bb(struct intel_engine_cs *engine)
> return ret;
> }
>
> -static u32 port_seqno(struct execlist_port *port)
> -{
> - return port->request ? port->request->global_seqno : 0;
> -}
> -
> static int gen8_init_common_ring(struct intel_engine_cs *engine)
> {
> struct drm_i915_private *dev_priv = engine->i915;
> @@ -1177,12 +1184,22 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
> /* After a GPU reset, we may have requests to replay */
> clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
> if (!i915.enable_guc_submission && !execlists_elsp_idle(engine)) {
> - DRM_DEBUG_DRIVER("Restarting %s from requests [0x%x, 0x%x]\n",
> - engine->name,
> - port_seqno(&engine->execlist_port[0]),
> - port_seqno(&engine->execlist_port[1]));
> - engine->execlist_port[0].count = 0;
> - engine->execlist_port[1].count = 0;
> + struct execlist_port *port = engine->execlist_port;
> + unsigned int n;
> +
> + for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++) {
> + if (!port[n].request_count)
> + break;
> +
> + DRM_DEBUG_DRIVER("Restarting %s from 0x%x [%d]\n",
> + engine->name,
> + port_request(&port[n])->global_seqno,
> + n);
> +
> + /* Discard the current inflight count */
> + port[n].request_count = port_request(&port[n]);
> + }
> +
> execlists_submit_ports(engine);
> }
>
> @@ -1261,13 +1278,13 @@ static void reset_common_ring(struct intel_engine_cs *engine,
> intel_ring_update_space(request->ring);
>
> /* Catch up with any missed context-switch interrupts */
> - if (request->ctx != port[0].request->ctx) {
> - i915_gem_request_put(port[0].request);
> + if (request->ctx != port_request(&port[0])->ctx) {
> + i915_gem_request_put(port_request(&port[0]));
> port[0] = port[1];
> memset(&port[1], 0, sizeof(port[1]));
> }
>
> - GEM_BUG_ON(request->ctx != port[0].request->ctx);
> + GEM_BUG_ON(request->ctx != port_request(&port[0])->ctx);
>
> /* Reset WaIdleLiteRestore:bdw,skl as well */
> request->tail =
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index d25b88467e5e..39b733e5cfd3 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -377,8 +377,12 @@ struct intel_engine_cs {
> /* Execlists */
> struct tasklet_struct irq_tasklet;
> struct execlist_port {
> - struct drm_i915_gem_request *request;
> - unsigned int count;
> + struct drm_i915_gem_request *request_count;
Would req(uest)_slot maybe be a better name?
> +#define EXECLIST_COUNT_BITS 2
> +#define port_request(p) ptr_mask_bits((p)->request_count, EXECLIST_COUNT_BITS)
> +#define port_count(p) ptr_unmask_bits((p)->request_count, EXECLIST_COUNT_BITS)
> +#define port_pack(rq, count) ptr_pack_bits(rq, count, EXECLIST_COUNT_BITS)
> +#define port_unpack(p, count) ptr_unpack_bits((p)->request_count, count, EXECLIST_COUNT_BITS)
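Maybe also worth a short comment here spelling out for future readers
why this is safe: the packing relies on the request pointers being at
least 4-byte aligned (they come from the slab allocator, so that
should hold), which leaves the low EXECLIST_COUNT_BITS bits always
zero and free to carry the count. Conceptually (illustration only, the
real accessors above go through the ptr_*_bits helpers):

        packed = (unsigned long)rq | count;     /* count fits in 2 bits */
        rq = (struct drm_i915_gem_request *)(packed & ~3ul);
        count = packed & 3;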
> GEM_DEBUG_DECL(u32 context_id);
> } execlist_port[2];
> struct rb_root execlist_queue;
>