[Intel-gfx] [PATCH 3/9] drm/i915/execlists: Pack the count into the low bits of the port.request
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Fri May 5 10:49:21 UTC 2017
On 03/05/2017 12:37, Chris Wilson wrote:
> add/remove: 1/1 grow/shrink: 5/4 up/down: 391/-578 (-187)
> function                                  old     new   delta
> execlists_submit_ports                    262     471    +209
> port_assign.isra                            -     136    +136
> capture                                  6344    6359     +15
> reset_common_ring                         438     452     +14
> execlists_submit_request                  228     238     +10
> gen8_init_common_ring                     334     341      +7
> intel_engine_is_idle                      106     105      -1
> i915_engine_info                         2314    2290     -24
> __i915_gem_set_wedged_BKL                 485     411     -74
> intel_lrc_irq_handler                    1789    1604    -185
> execlists_update_context                  294       -    -294
>
> The most important change there is the improvement to
> intel_lrc_irq_handler and execlists_submit_ports (a net improvement since
> execlists_update_context is now inlined).
>
> v2: Use the port accessors for the GuC as well (even though we do not
> pack any counters in there yet) and hide all port->request_count
> accesses inside the helpers.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala at intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
> drivers/gpu/drm/i915/i915_debugfs.c | 32 ++++----
> drivers/gpu/drm/i915/i915_gem.c | 6 +-
> drivers/gpu/drm/i915/i915_gpu_error.c | 13 +++-
> drivers/gpu/drm/i915/i915_guc_submission.c | 32 +++++---
> drivers/gpu/drm/i915/intel_engine_cs.c | 2 +-
> drivers/gpu/drm/i915/intel_lrc.c | 117 ++++++++++++++++-------------
> drivers/gpu/drm/i915/intel_ringbuffer.h | 10 ++-
> 7 files changed, 122 insertions(+), 90 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 870c470177b5..0b5d7142d8d9 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -3315,6 +3315,7 @@ static int i915_engine_info(struct seq_file *m, void *unused)
> if (i915.enable_execlists) {
> u32 ptr, read, write;
> struct rb_node *rb;
> + unsigned int idx;
>
> seq_printf(m, "\tExeclist status: 0x%08x %08x\n",
> I915_READ(RING_EXECLIST_STATUS_LO(engine)),
> @@ -3332,8 +3333,7 @@ static int i915_engine_info(struct seq_file *m, void *unused)
> if (read > write)
> write += GEN8_CSB_ENTRIES;
> while (read < write) {
> - unsigned int idx = ++read % GEN8_CSB_ENTRIES;
> -
> + idx = ++read % GEN8_CSB_ENTRIES;
> seq_printf(m, "\tExeclist CSB[%d]: 0x%08x, context: %d\n",
> idx,
> I915_READ(RING_CONTEXT_STATUS_BUF_LO(engine, idx)),
> @@ -3341,21 +3341,19 @@ static int i915_engine_info(struct seq_file *m, void *unused)
> }
>
> rcu_read_lock();
> - rq = READ_ONCE(engine->execlist_port[0].request);
> - if (rq) {
> - seq_printf(m, "\t\tELSP[0] count=%d, ",
> - engine->execlist_port[0].count);
> - print_request(m, rq, "rq: ");
> - } else {
> - seq_printf(m, "\t\tELSP[0] idle\n");
> - }
> - rq = READ_ONCE(engine->execlist_port[1].request);
> - if (rq) {
> - seq_printf(m, "\t\tELSP[1] count=%d, ",
> - engine->execlist_port[1].count);
> - print_request(m, rq, "rq: ");
> - } else {
> - seq_printf(m, "\t\tELSP[1] idle\n");
> + for (idx = 0; idx < ARRAY_SIZE(engine->execlist_port); idx++) {
> + unsigned int count;
> +
> + rq = port_unpack(&engine->execlist_port[idx],
> + &count);
> + if (rq) {
> + seq_printf(m, "\t\tELSP[%d] count=%d, ",
> + idx, count);
> + print_request(m, rq, "rq: ");
> + } else {
> + seq_printf(m, "\t\tELSP[%d] idle\n",
> + idx);
> + }
> }
> rcu_read_unlock();
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index f79079f2e265..c1df4b9d2661 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3038,12 +3038,14 @@ static void engine_set_wedged(struct intel_engine_cs *engine)
> */
>
> if (i915.enable_execlists) {
> + struct execlist_port *port = engine->execlist_port;
> unsigned long flags;
> + unsigned int n;
>
> spin_lock_irqsave(&engine->timeline->lock, flags);
>
> - i915_gem_request_put(engine->execlist_port[0].request);
> - i915_gem_request_put(engine->execlist_port[1].request);
> + for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++)
> + i915_gem_request_put(port_request(&port[n]));
> memset(engine->execlist_port, 0, sizeof(engine->execlist_port));
> engine->execlist_queue = RB_ROOT;
> engine->execlist_first = NULL;
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index ec526d92f908..e18f350bc364 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -1324,12 +1324,17 @@ static void engine_record_requests(struct intel_engine_cs *engine,
> static void error_record_engine_execlists(struct intel_engine_cs *engine,
> struct drm_i915_error_engine *ee)
> {
> + const struct execlist_port *port = engine->execlist_port;
> unsigned int n;
>
> - for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++)
> - if (engine->execlist_port[n].request)
> - record_request(engine->execlist_port[n].request,
> - &ee->execlist[n]);
> + for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++) {
> + struct drm_i915_gem_request *rq = port_request(&port[n]);
> +
> + if (!rq)
> + break;
> +
> + record_request(rq, &ee->execlist[n]);
> + }
> }
>
> static void record_context(struct drm_i915_error_context *e,
> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> index 7e85b5ab8ae2..62d3831a8a0d 100644
> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> @@ -653,10 +653,22 @@ static void nested_enable_signaling(struct drm_i915_gem_request *rq)
> spin_unlock(&rq->lock);
> }
>
> +static void port_assign(struct execlist_port *port,
> + struct drm_i915_gem_request *rq)
> +{
> + GEM_BUG_ON(rq == port_request(port));
> +
> + if (port_isset(port))
> + i915_gem_request_put(port_request(port));
> +
> + port_set(port, i915_gem_request_get(rq));
> + nested_enable_signaling(rq);
> +}
> +
> static bool i915_guc_dequeue(struct intel_engine_cs *engine)
> {
> struct execlist_port *port = engine->execlist_port;
> - struct drm_i915_gem_request *last = port[0].request;
> + struct drm_i915_gem_request *last = port_request(&port[0]);
> struct rb_node *rb;
> bool submit = false;
>
> @@ -670,8 +682,7 @@ static bool i915_guc_dequeue(struct intel_engine_cs *engine)
> if (port != engine->execlist_port)
> break;
>
> - i915_gem_request_assign(&port->request, last);
> - nested_enable_signaling(last);
> + port_assign(port, last);
> port++;
> }
>
> @@ -686,8 +697,7 @@ static bool i915_guc_dequeue(struct intel_engine_cs *engine)
> submit = true;
> }
> if (submit) {
> - i915_gem_request_assign(&port->request, last);
> - nested_enable_signaling(last);
> + port_assign(port, last);
> engine->execlist_first = rb;
> }
> spin_unlock_irq(&engine->timeline->lock);
> @@ -703,17 +713,19 @@ static void i915_guc_irq_handler(unsigned long data)
> bool submit;
>
> do {
> - rq = port[0].request;
> + rq = port_request(&port[0]);
> while (rq && i915_gem_request_completed(rq)) {
> trace_i915_gem_request_out(rq);
> i915_gem_request_put(rq);
> - port[0].request = port[1].request;
> - port[1].request = NULL;
> - rq = port[0].request;
> +
> + port[0] = port[1];
> + memset(&port[1], 0, sizeof(port[1]));
> +
> + rq = port_request(&port[0]);
> }
>
> submit = false;
> - if (!port[1].request)
> + if (!port_count(&port[1]))
> submit = i915_guc_dequeue(engine);
> } while (submit);
> }
> diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
> index 6d3d83876da9..24659788e7a3 100644
> --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> @@ -1230,7 +1230,7 @@ bool intel_engine_is_idle(struct intel_engine_cs *engine)
> return false;
>
> /* Both ports drained, no more ELSP submission? */
> - if (engine->execlist_port[0].request)
> + if (port_request(&engine->execlist_port[0]))
> return false;
>
> /* Ring stopped? */
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 0909549ad320..9f7b6112c53a 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -337,39 +337,32 @@ static u64 execlists_update_context(struct drm_i915_gem_request *rq)
>
> static void execlists_submit_ports(struct intel_engine_cs *engine)
> {
> - struct drm_i915_private *dev_priv = engine->i915;
> struct execlist_port *port = engine->execlist_port;
> u32 __iomem *elsp =
> - dev_priv->regs + i915_mmio_reg_offset(RING_ELSP(engine));
> - u64 desc[2];
> -
> - GEM_BUG_ON(port[0].count > 1);
> - if (!port[0].count)
> - execlists_context_status_change(port[0].request,
> - INTEL_CONTEXT_SCHEDULE_IN);
> - desc[0] = execlists_update_context(port[0].request);
> - GEM_DEBUG_EXEC(port[0].context_id = upper_32_bits(desc[0]));
> - port[0].count++;
> -
> - if (port[1].request) {
> - GEM_BUG_ON(port[1].count);
> - execlists_context_status_change(port[1].request,
> - INTEL_CONTEXT_SCHEDULE_IN);
> - desc[1] = execlists_update_context(port[1].request);
> - GEM_DEBUG_EXEC(port[1].context_id = upper_32_bits(desc[1]));
> - port[1].count = 1;
> - } else {
> - desc[1] = 0;
> - }
> - GEM_BUG_ON(desc[0] == desc[1]);
> + engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
> + unsigned int n;
>
> - /* You must always write both descriptors in the order below. */
> - writel(upper_32_bits(desc[1]), elsp);
> - writel(lower_32_bits(desc[1]), elsp);
> + for (n = ARRAY_SIZE(engine->execlist_port); n--; ) {
> + struct drm_i915_gem_request *rq;
> + unsigned int count;
> + u64 desc;
> +
> + rq = port_unpack(&port[n], &count);
> + if (rq) {
> + GEM_BUG_ON(count > !n);
> + if (!count++)
> + execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
> + port_set(&port[n], port_pack(rq, count));
> + desc = execlists_update_context(rq);
> + GEM_DEBUG_EXEC(port[n].context_id = upper_32_bits(desc));
> + } else {
> + GEM_BUG_ON(!n);
> + desc = 0;
> + }
>
> - writel(upper_32_bits(desc[0]), elsp);
> - /* The context is automatically loaded after the following */
> - writel(lower_32_bits(desc[0]), elsp);
> + writel(upper_32_bits(desc), elsp);
> + writel(lower_32_bits(desc), elsp);
> + }
> }
>
> static bool ctx_single_port_submission(const struct i915_gem_context *ctx)
> @@ -390,6 +383,17 @@ static bool can_merge_ctx(const struct i915_gem_context *prev,
> return true;
> }
>
> +static void port_assign(struct execlist_port *port,
> + struct drm_i915_gem_request *rq)
> +{
> + GEM_BUG_ON(rq == port_request(port));
> +
> + if (port_isset(port))
> + i915_gem_request_put(port_request(port));
> +
> + port_set(port, port_pack(i915_gem_request_get(rq), port_count(port)));
> +}
Is there a reason not to have a single port_assign() implementation, with
nested_enable_signaling() called outside it in the GuC case?
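Something like this, perhaps (untested sketch, reusing the names from this
patch), with the GuC dequeue doing nested_enable_signaling(last) itself
right after the call:

	/* One shared, count-preserving port_assign() for both backends. */
	static void port_assign(struct execlist_port *port,
				struct drm_i915_gem_request *rq)
	{
		GEM_BUG_ON(rq == port_request(port));

		if (port_isset(port))
			i915_gem_request_put(port_request(port));

		port_set(port,
			 port_pack(i915_gem_request_get(rq), port_count(port)));
	}

Since the GuC path never packs a non-zero count yet, preserving
port_count() there should be a no-op.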
> +
> static void execlists_dequeue(struct intel_engine_cs *engine)
> {
> struct drm_i915_gem_request *last;
> @@ -397,7 +401,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> struct rb_node *rb;
> bool submit = false;
>
> - last = port->request;
> + last = port_request(port);
> if (last)
> /* WaIdleLiteRestore:bdw,skl
> * Apply the wa NOOPs to prevent ring:HEAD == req:TAIL
> @@ -407,7 +411,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> */
> last->tail = last->wa_tail;
>
> - GEM_BUG_ON(port[1].request);
> + GEM_BUG_ON(port_isset(&port[1]));
>
> /* Hardware submission is through 2 ports. Conceptually each port
> * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
> @@ -464,7 +468,8 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>
> GEM_BUG_ON(last->ctx == cursor->ctx);
>
> - i915_gem_request_assign(&port->request, last);
> + if (submit)
> + port_assign(port, last);
Is the conditional assignment an optimisation? If so, perhaps add a
GEM_BUG_ON(last != port_request(port)) for the case where the assignment
is skipped?
> port++;
> }
>
> @@ -479,7 +484,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> submit = true;
> }
> if (submit) {
> - i915_gem_request_assign(&port->request, last);
> + port_assign(port, last);
> engine->execlist_first = rb;
> }
> spin_unlock_irq(&engine->timeline->lock);
> @@ -488,16 +493,11 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> execlists_submit_ports(engine);
> }
>
> -static bool execlists_elsp_idle(struct intel_engine_cs *engine)
> -{
> - return !engine->execlist_port[0].request;
> -}
> -
> static bool execlists_elsp_ready(const struct intel_engine_cs *engine)
> {
> const struct execlist_port *port = engine->execlist_port;
>
> - return port[0].count + port[1].count < 2;
> + return port_count(&port[0]) + port_count(&port[1]) < 2;
> }
>
> /*
> @@ -547,7 +547,9 @@ static void intel_lrc_irq_handler(unsigned long data)
> tail = GEN8_CSB_WRITE_PTR(head);
> head = GEN8_CSB_READ_PTR(head);
> while (head != tail) {
> + struct drm_i915_gem_request *rq;
> unsigned int status;
> + unsigned int count;
>
> if (++head == GEN8_CSB_ENTRIES)
> head = 0;
> @@ -577,20 +579,24 @@ static void intel_lrc_irq_handler(unsigned long data)
> GEM_DEBUG_BUG_ON(readl(buf + 2 * head + 1) !=
> port[0].context_id);
>
> - GEM_BUG_ON(port[0].count == 0);
> - if (--port[0].count == 0) {
> + rq = port_unpack(&port[0], &count);
> + GEM_BUG_ON(count == 0);
> + if (--count == 0) {
If you changed this to:

	count--;
	port_set(...);
	if (count > 0)
		continue;

it would perhaps be a bit more readable, at the cost of a potentially
useless port_set. Not sure.
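i.e. roughly (sketch only, just reshuffling the lines from this hunk; when
count stays above zero the trailing ACTIVE_IDLE assert is trivially true
anyway, so skipping it via the continue looks harmless):

	rq = port_unpack(&port[0], &count);
	GEM_BUG_ON(count == 0);

	count--;
	port_set(&port[0], port_pack(rq, count));
	if (count > 0)
		continue;

	/* count hit zero: this request has completed on the hardware */
	GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED);
	GEM_BUG_ON(!i915_gem_request_completed(rq));
	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);

	trace_i915_gem_request_out(rq);
	i915_gem_request_put(rq);

	port[0] = port[1];
	memset(&port[1], 0, sizeof(port[1]));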
> GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED);
> - GEM_BUG_ON(!i915_gem_request_completed(port[0].request));
> - execlists_context_status_change(port[0].request,
> - INTEL_CONTEXT_SCHEDULE_OUT);
> + GEM_BUG_ON(!i915_gem_request_completed(rq));
> + execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
> +
> + trace_i915_gem_request_out(rq);
> + i915_gem_request_put(rq);
>
> - trace_i915_gem_request_out(port[0].request);
> - i915_gem_request_put(port[0].request);
> port[0] = port[1];
> memset(&port[1], 0, sizeof(port[1]));
> + } else {
> + port_set(&port[0], port_pack(rq, count));
> }
>
> - GEM_BUG_ON(port[0].count == 0 &&
> + /* After the final element, the hw should be idle */
> + GEM_BUG_ON(port_count(&port[0]) == 0 &&
> !(status & GEN8_CTX_STATUS_ACTIVE_IDLE));
> }
>
> @@ -1148,6 +1154,7 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
> struct drm_i915_private *dev_priv = engine->i915;
> struct execlist_port *port = engine->execlist_port;
> unsigned int n;
> + bool submit;
> int ret;
>
> ret = intel_mocs_init_engine(engine);
> @@ -1169,19 +1176,21 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
> /* After a GPU reset, we may have requests to replay */
> clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
>
> + submit = false;
> for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++) {
> - if (!port[n].request)
> + if (!port_isset(&port[n]))
> break;
>
> DRM_DEBUG_DRIVER("Restarting %s:%d from 0x%x\n",
> engine->name, n,
> - port[n].request->global_seqno);
> + port_request(&port[n])->global_seqno);
>
> /* Discard the current inflight count */
> - port[n].count = 0;
> + port_set(&port[n], port_request(&port[n]));
> + submit = true;
> }
>
> - if (!i915.enable_guc_submission && !execlists_elsp_idle(engine))
> + if (submit && !i915.enable_guc_submission)
> execlists_submit_ports(engine);
>
> return 0;
> @@ -1259,13 +1268,13 @@ static void reset_common_ring(struct intel_engine_cs *engine,
> intel_ring_update_space(request->ring);
>
> /* Catch up with any missed context-switch interrupts */
> - if (request->ctx != port[0].request->ctx) {
> - i915_gem_request_put(port[0].request);
> + if (request->ctx != port_request(&port[0])->ctx) {
> + i915_gem_request_put(port_request(&port[0]));
> port[0] = port[1];
> memset(&port[1], 0, sizeof(port[1]));
> }
>
> - GEM_BUG_ON(request->ctx != port[0].request->ctx);
> + GEM_BUG_ON(request->ctx != port_request(&port[0])->ctx);
>
> /* Reset WaIdleLiteRestore:bdw,skl as well */
> request->tail =
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 4a599e3480a9..68765d45116b 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -368,8 +368,14 @@ struct intel_engine_cs {
> /* Execlists */
> struct tasklet_struct irq_tasklet;
> struct execlist_port {
> - struct drm_i915_gem_request *request;
> - unsigned int count;
> + struct drm_i915_gem_request *request_count;
> +#define EXECLIST_COUNT_BITS 2
> +#define port_request(p) ptr_mask_bits((p)->request_count, EXECLIST_COUNT_BITS)
> +#define port_count(p) ptr_unmask_bits((p)->request_count, EXECLIST_COUNT_BITS)
> +#define port_pack(rq, count) ptr_pack_bits(rq, count, EXECLIST_COUNT_BITS)
> +#define port_unpack(p, count) ptr_unpack_bits((p)->request_count, count, EXECLIST_COUNT_BITS)
> +#define port_set(p, packed) ((p)->request_count = (packed))
> +#define port_isset(p) ((p)->request_count)
> GEM_DEBUG_DECL(u32 context_id);
> } execlist_port[2];
> struct rb_root execlist_queue;
>
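For readers not familiar with ptr_pack_bits(), the request_count trick is
just stashing a small counter in the pointer's alignment bits, roughly
along these lines (a standalone illustration only, not the actual i915
helpers):

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	#define COUNT_BITS 2
	#define COUNT_MASK ((1UL << COUNT_BITS) - 1)

	/* Pack a small count into the low (alignment) bits of a pointer. */
	static void *pack(void *ptr, unsigned long count)
	{
		assert(((uintptr_t)ptr & COUNT_MASK) == 0); /* >= 4-byte aligned */
		assert(count <= COUNT_MASK);
		return (void *)((uintptr_t)ptr | count);
	}

	/* Recover the original pointer and the packed count. */
	static void *unpack(void *packed, unsigned long *count)
	{
		*count = (uintptr_t)packed & COUNT_MASK;
		return (void *)((uintptr_t)packed & ~(uintptr_t)COUNT_MASK);
	}

	int main(void)
	{
		static long dummy; /* stands in for a request, >= 4-byte aligned */
		unsigned long count;
		void *packed = pack(&dummy, 1);
		void *ptr = unpack(packed, &count);

		printf("ptr ok: %d, count=%lu\n", ptr == (void *)&dummy, count);
		return 0;
	}

So port_count() is cheap to read, but every real dereference has to go
through port_request() to strip the low bits first, which is what the
extra helpers are hiding.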
Looks correct, but I am still having trouble accepting that the structure
shrink and code savings are worth the hit to readability.
Regards,
Tvrtko