[Intel-gfx] [RFC 5/5] drm/i915: introduce intel_execlists_submission.<c/h>
Chris Wilson
chris at chris-wilson.co.uk
Wed Dec 11 21:31:25 UTC 2019
Quoting Daniele Ceraolo Spurio (2019-12-11 21:12:44)
> Split out all the code related to the execlists submission flow to its
> own file to keep it separate from the general context management,
> because the latter will be re-used by the GuC submission flow.
>
> Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
> Cc: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at linux.intel.com>
> Cc: Matthew Brost <matthew.brost at intel.com>
> ---
> drivers/gpu/drm/i915/Makefile | 1 +
> drivers/gpu/drm/i915/gt/intel_engine_cs.c | 1 +
> .../drm/i915/gt/intel_execlists_submission.c | 2485 ++++++++++++++++
> .../drm/i915/gt/intel_execlists_submission.h | 58 +
> drivers/gpu/drm/i915/gt/intel_lrc.c | 2511 +----------------
> drivers/gpu/drm/i915/gt/intel_lrc.h | 34 +-
> .../gpu/drm/i915/gt/intel_virtual_engine.c | 1 +
> drivers/gpu/drm/i915/gt/selftest_execlists.c | 2 +-
> drivers/gpu/drm/i915/gt/selftest_lrc.c | 2 +-
> .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 1 +
> drivers/gpu/drm/i915/gvt/scheduler.c | 1 +
> drivers/gpu/drm/i915/i915_perf.c | 1 +
> 12 files changed, 2584 insertions(+), 2514 deletions(-)
> create mode 100644 drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> create mode 100644 drivers/gpu/drm/i915/gt/intel_execlists_submission.h
>
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index 79f5ef5acd4c..3640e0436c97 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -82,6 +82,7 @@ gt-y += \
> gt/intel_engine_pm.o \
> gt/intel_engine_pool.o \
> gt/intel_engine_user.o \
> + gt/intel_execlists_submission.o \
> gt/intel_gt.o \
> gt/intel_gt_irq.o \
> gt/intel_gt_pm.o \
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> index 49473c25916c..0a23d01b7589 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> @@ -33,6 +33,7 @@
> #include "intel_engine_pm.h"
> #include "intel_engine_pool.h"
> #include "intel_engine_user.h"
> +#include "intel_execlists_submission.h"
> #include "intel_gt.h"
> #include "intel_gt_requests.h"
> #include "intel_lrc.h"
> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> new file mode 100644
> index 000000000000..76b878bf15ad
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> @@ -0,0 +1,2485 @@
> +/*
> + * SPDX-License-Identifier: MIT
> + *
> + * Copyright © 2019 Intel Corporation
> + */
> +
> +#include <linux/interrupt.h>
> +
> +#include "gem/i915_gem_context.h"
> +
> +#include "i915_drv.h"
> +#include "i915_perf.h"
> +#include "i915_trace.h"
> +#include "i915_vgpu.h"
> +#include "intel_engine_pm.h"
> +#include "intel_gt.h"
> +#include "intel_gt_pm.h"
> +#include "intel_gt_requests.h"
> +#include "intel_lrc_reg.h"
> +#include "intel_mocs.h"
> +#include "intel_reset.h"
> +#include "intel_ring.h"
> +#include "intel_virtual_engine.h"
> +#include "intel_workarounds.h"
> +#include "intel_execlists_submission.h"
> +
> +#define RING_EXECLIST_QFULL (1 << 0x2)
> +#define RING_EXECLIST1_VALID (1 << 0x3)
> +#define RING_EXECLIST0_VALID (1 << 0x4)
> +#define RING_EXECLIST_ACTIVE_STATUS (3 << 0xE)
> +#define RING_EXECLIST1_ACTIVE (1 << 0x11)
> +#define RING_EXECLIST0_ACTIVE (1 << 0x12)
> +
> +#define GEN8_CTX_STATUS_IDLE_ACTIVE (1 << 0)
> +#define GEN8_CTX_STATUS_PREEMPTED (1 << 1)
> +#define GEN8_CTX_STATUS_ELEMENT_SWITCH (1 << 2)
> +#define GEN8_CTX_STATUS_ACTIVE_IDLE (1 << 3)
> +#define GEN8_CTX_STATUS_COMPLETE (1 << 4)
> +#define GEN8_CTX_STATUS_LITE_RESTORE (1 << 15)
> +
> +#define GEN8_CTX_STATUS_COMPLETED_MASK \
> + (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
> +
> +#define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
> +
> +#define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE (0x1) /* lower csb dword */
> +#define GEN12_CTX_SWITCH_DETAIL(csb_dw) ((csb_dw) & 0xF) /* upper csb dword */
> +#define GEN12_CSB_SW_CTX_ID_MASK GENMASK(25, 15)
> +#define GEN12_IDLE_CTX_ID 0x7FF
> +#define GEN12_CSB_CTX_VALID(csb_dw) \
> + (FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
> +
> +/* Typical size of the average request (2 pipecontrols and a MI_BB) */
> +#define EXECLISTS_REQUEST_SIZE 64 /* bytes */
> +
> +static void mark_eio(struct i915_request *rq)
> +{
> + if (i915_request_completed(rq))
> + return;
> +
> + GEM_BUG_ON(i915_request_signaled(rq));
> +
> + dma_fence_set_error(&rq->fence, -EIO);
> + i915_request_mark_complete(rq);
> +}
> +
> +static struct i915_request *
> +active_request(const struct intel_timeline * const tl, struct i915_request *rq)
> +{
> + struct i915_request *active = rq;
> +
> + rcu_read_lock();
> + list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
> + if (i915_request_completed(rq))
> + break;
> +
> + active = rq;
> + }
> + rcu_read_unlock();
> +
> + return active;
> +}
> +
> +static inline void
> +ring_set_paused(const struct intel_engine_cs *engine, int state)
> +{
> + /*
> + * We inspect HWS_PREEMPT with a semaphore inside
> + * engine->emit_fini_breadcrumb. If the dword is true,
> + * the ring is paused as the semaphore will busywait
> + * until the dword is false.
> + */
> + engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
> + if (state)
> + wmb();
> +}
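For reference, the consumer of HWS_PREEMPT here is the semaphore poll
emitted by engine->emit_fini_breadcrumb. A minimal sketch of that busywait,
assuming the usual MI_SEMAPHORE_* encodings and the HWS_PREEMPT offset
(the function name below is made up; the real emitter lives with the
breadcrumb code):

static u32 *emit_preempt_busywait_sketch(struct i915_request *rq, u32 *cs)
{
	/* Spin in the CS until the HWS_PREEMPT dword reads back as 0 */
	*cs++ = MI_SEMAPHORE_WAIT |
		MI_SEMAPHORE_GLOBAL_GTT |
		MI_SEMAPHORE_POLL |
		MI_SEMAPHORE_SAD_EQ_SDD;
	*cs++ = 0; /* semaphore data: ring not paused */
	*cs++ = i915_ggtt_offset(rq->engine->status_page.vma) +
		I915_GEM_HWS_PREEMPT_ADDR;
	*cs++ = 0;

	return cs;
}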
> +
> +static inline struct i915_priolist *to_priolist(struct rb_node *rb)
> +{
> + return rb_entry(rb, struct i915_priolist, node);
> +}
> +
> +static inline int rq_prio(const struct i915_request *rq)
> +{
> + return rq->sched.attr.priority;
> +}
> +
> +static int effective_prio(const struct i915_request *rq)
> +{
> + int prio = rq_prio(rq);
> +
> + /*
> + * If this request is special and must not be interrupted at any
> + * cost, so be it. Note we are only checking the most recent request
> + * in the context and so may be masking an earlier vip request. It
> + * is hoped that under the conditions where nopreempt is used, this
> + * will not matter (i.e. all requests to that context will be
> + * nopreempt for as long as desired).
> + */
> + if (i915_request_has_nopreempt(rq))
> + prio = I915_PRIORITY_UNPREEMPTABLE;
> +
> + /*
> + * On unwinding the active request, we give it a priority bump
> + * if it has completed waiting on any semaphore. If we know that
> + * the request has already started, we can prevent an unwanted
> + * preempt-to-idle cycle by taking that into account now.
> + */
> + if (__i915_request_has_started(rq))
> + prio |= I915_PRIORITY_NOSEMAPHORE;
> +
> + /* Restrict mere WAIT boosts from triggering preemption */
> + BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */
> + return prio | __NO_PREEMPTION;
> +}
> +
> +static int queue_prio(const struct intel_engine_execlists *execlists)
> +{
> + struct i915_priolist *p;
> + struct rb_node *rb;
> +
> + rb = rb_first_cached(&execlists->queue);
> + if (!rb)
> + return INT_MIN;
> +
> + /*
> + * As the priolist[] is inverted, with the highest priority in [0],
> + * we have to flip the index value to recover the priority.
> + */
> + p = to_priolist(rb);
> + return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
> +}
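To make that flip concrete (taking I915_USER_PRIORITY_SHIFT == 2 purely for
the sake of the example): a priolist at user priority 0 with used == 0b0010
yields ((0 + 1) << 2) - ffs(0b0010) = 4 - 2 = 2, i.e. the second-highest of
the internal sub-levels folded into that user priority, while used == 0b0001
would give 4 - 1 = 3, the highest.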
> +
> +static inline bool need_preempt(const struct intel_engine_cs *engine,
> + const struct i915_request *rq,
> + struct rb_node *rb)
> +{
> + int last_prio;
> +
> + if (!intel_engine_has_semaphores(engine))
> + return false;
> +
> + /*
> + * Check if the current priority hint merits a preemption attempt.
> + *
> + * We record the highest value priority we saw during rescheduling
> + * prior to this dequeue, therefore we know that if it is strictly
> + * less than the current tail of ELSP[0], we do not need to force
> + * a preempt-to-idle cycle.
> + *
> + * However, the priority hint is a mere hint that we may need to
> + * preempt. If that hint is stale or we may be trying to preempt
> + * ourselves, ignore the request.
> + *
> + * More naturally we would write
> + * prio >= max(0, last);
> + * except that we wish to prevent triggering preemption at the same
> + * priority level: the task that is running should remain running
> + * to preserve FIFO ordering of dependencies.
> + */
> + last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
> + if (engine->execlists.queue_priority_hint <= last_prio)
> + return false;
> +
> + /*
> + * Check against the first request in ELSP[1], it will, thanks to the
> + * power of PI, be the highest priority of that context.
> + */
> + if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
> + rq_prio(list_next_entry(rq, sched.link)) > last_prio)
> + return true;
> +
> + if (rb) {
> + struct intel_virtual_engine *ve =
> + rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
> + bool preempt = false;
> +
> + if (engine == ve->siblings[0]) { /* only preempt one sibling */
> + struct i915_request *next;
> +
> + rcu_read_lock();
> + next = READ_ONCE(ve->request);
> + if (next)
> + preempt = rq_prio(next) > last_prio;
> + rcu_read_unlock();
> + }
> +
> + if (preempt)
> + return preempt;
> + }
> +
> + /*
> + * If the inflight context did not trigger the preemption, then maybe
> + * it was the set of queued requests? Pick the highest priority in
> + * the queue (the first active priolist) and see if it deserves to be
> + * running instead of ELSP[0].
> + *
> + * The highest priority request in the queue cannot be either
> + * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
> + * context, its priority would not exceed ELSP[0] aka last_prio.
> + */
> + return queue_prio(&engine->execlists) > last_prio;
> +}
> +
> +__maybe_unused static inline bool
> +assert_priority_queue(const struct i915_request *prev,
> + const struct i915_request *next)
> +{
> + /*
> + * Without preemption, the prev may refer to the still active element
> + * which we refuse to let go.
> + *
> + * Even with preemption, there are times when we think it is better not
> + * to preempt and leave an ostensibly lower priority request in flight.
> + */
> + if (i915_request_is_active(prev))
> + return true;
> +
> + return rq_prio(prev) >= rq_prio(next);
> +}
> +
> +static struct i915_request *
> +__unwind_incomplete_requests(struct intel_engine_cs *engine)
> +{
> + struct i915_request *rq, *rn, *active = NULL;
> + struct list_head *uninitialized_var(pl);
> + int prio = I915_PRIORITY_INVALID;
> +
> + lockdep_assert_held(&engine->active.lock);
> +
> + list_for_each_entry_safe_reverse(rq, rn,
> + &engine->active.requests,
> + sched.link) {
> + if (i915_request_completed(rq))
> + continue; /* XXX */
> +
> + __i915_request_unsubmit(rq);
> +
> + /*
> + * Push the request back into the queue for later resubmission.
> + * If this request is not native to this physical engine (i.e.
> + * it came from a virtual source), push it back onto the virtual
> + * engine so that it can be moved across onto another physical
> + * engine as load dictates.
> + */
> + if (likely(rq->execution_mask == engine->mask)) {
> + GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
> + if (rq_prio(rq) != prio) {
> + prio = rq_prio(rq);
> + pl = i915_sched_lookup_priolist(engine, prio);
> + }
> + GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
> +
> + list_move(&rq->sched.link, pl);
> + active = rq;
> + } else {
> + struct intel_engine_cs *owner = rq->hw_context->engine;
> +
> + /*
> + * Decouple the virtual breadcrumb before moving it
> + * back to the virtual engine -- we don't want the
> + * request to complete in the background and try
> + * and cancel the breadcrumb on the virtual engine
> + * (instead of the old engine where it is linked)!
> + */
> + if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
> + &rq->fence.flags)) {
> + spin_lock_nested(&rq->lock,
> + SINGLE_DEPTH_NESTING);
> + i915_request_cancel_breadcrumb(rq);
> + spin_unlock(&rq->lock);
> + }
> + rq->engine = owner;
> + owner->submit_request(rq);
> + active = NULL;
> + }
> + }
> +
> + return active;
> +}
> +
> +struct i915_request *
> +execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
There should be no exports from this file... Did you not also make
guc_submission standalone?
> +{
> + struct intel_engine_cs *engine =
> + container_of(execlists, typeof(*engine), execlists);
> +
> + return __unwind_incomplete_requests(engine);
> +}
> +
> +static inline void
> +execlists_context_status_change(struct i915_request *rq, unsigned long status)
> +{
> + /*
> + * This is only used when GVT-g is enabled. When GVT-g is disabled,
> + * the compiler should eliminate this function as dead code.
> + */
> + if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
> + return;
> +
> + atomic_notifier_call_chain(&rq->engine->context_status_notifier,
> + status, rq);
> +}
> +
> +static void intel_engine_context_in(struct intel_engine_cs *engine)
> +{
> + unsigned long flags;
> +
> + if (READ_ONCE(engine->stats.enabled) == 0)
> + return;
> +
> + write_seqlock_irqsave(&engine->stats.lock, flags);
> +
> + if (engine->stats.enabled > 0) {
> + if (engine->stats.active++ == 0)
> + engine->stats.start = ktime_get();
> + GEM_BUG_ON(engine->stats.active == 0);
> + }
> +
> + write_sequnlock_irqrestore(&engine->stats.lock, flags);
> +}
> +
> +static void intel_engine_context_out(struct intel_engine_cs *engine)
> +{
> + unsigned long flags;
> +
> + if (READ_ONCE(engine->stats.enabled) == 0)
> + return;
> +
> + write_seqlock_irqsave(&engine->stats.lock, flags);
> +
> + if (engine->stats.enabled > 0) {
> + ktime_t last;
> +
> + if (engine->stats.active && --engine->stats.active == 0) {
> + /*
> + * The active context count has been decremented and the GPU
> + * is now idle, so add the elapsed time to the running total.
> + */
> + last = ktime_sub(ktime_get(), engine->stats.start);
> +
> + engine->stats.total = ktime_add(engine->stats.total,
> + last);
> + } else if (engine->stats.active == 0) {
> + /*
> + * After turning on engine stats, context out might be
> + * the first event in which case we account from the
> + * time stats gathering was turned on.
> + */
> + last = ktime_sub(ktime_get(), engine->stats.enabled_at);
> +
> + engine->stats.total = ktime_add(engine->stats.total,
> + last);
> + }
> + }
> +
> + write_sequnlock_irqrestore(&engine->stats.lock, flags);
> +}
> +
> +static void
> +execlists_check_context(const struct intel_context *ce,
> + const struct intel_engine_cs *engine)
> +{
> + const struct intel_ring *ring = ce->ring;
> + u32 *regs = ce->lrc_reg_state;
> + bool valid = true;
> + int x;
> +
> + if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
> + pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
> + engine->name,
> + regs[CTX_RING_START],
> + i915_ggtt_offset(ring->vma));
> + regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
> + valid = false;
> + }
> +
> + if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
> + (RING_CTL_SIZE(ring->size) | RING_VALID)) {
> + pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
> + engine->name,
> + regs[CTX_RING_CTL],
> + (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
> + regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
> + valid = false;
> + }
> +
> + x = intel_lrc_ring_mi_mode(engine);
> + if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
> + pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
> + engine->name, regs[x + 1]);
> + regs[x + 1] &= ~STOP_RING;
> + regs[x + 1] |= STOP_RING << 16;
> + valid = false;
> + }
> +
> + WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
> +}
> +
> +static void reset_active(struct i915_request *rq,
> + struct intel_engine_cs *engine)
> +{
> + struct intel_context * const ce = rq->hw_context;
> + u32 head;
> +
> + /*
> + * The executing context has been cancelled. We want to prevent
> + * further execution along this context and propagate the error on
> + * to anything depending on its results.
> + *
> + * In __i915_request_submit(), we apply the -EIO and remove the
> + * requests' payloads for any banned requests. But first, we must
> + * rewind the context back to the start of the incomplete request so
> + * that we do not jump back into the middle of the batch.
> + *
> + * We preserve the breadcrumbs and semaphores of the incomplete
> + * requests so that inter-timeline dependencies (i.e. other timelines)
> + * remain correctly ordered. And we defer to __i915_request_submit()
> + * so that all asynchronous waits are correctly handled.
> + */
> + GEM_TRACE("%s(%s): { rq=%llx:%lld }\n",
> + __func__, engine->name, rq->fence.context, rq->fence.seqno);
> +
> + /* On resubmission of the active request, payload will be scrubbed */
> + if (i915_request_completed(rq))
> + head = rq->tail;
> + else
> + head = active_request(ce->timeline, rq)->head;
> + ce->ring->head = intel_ring_wrap(ce->ring, head);
> + intel_ring_update_space(ce->ring);
> +
> + /* Scrub the context image to prevent replaying the previous batch */
> + intel_lr_context_restore_default_state(ce, engine);
> + intel_lr_context_update_reg_state(ce, engine);
> +
> + /* We've switched away, so this should be a no-op, but intent matters */
> + ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
> +}
> +
> +static inline struct intel_engine_cs *
> +__execlists_schedule_in(struct i915_request *rq)
> +{
> + struct intel_engine_cs * const engine = rq->engine;
> + struct intel_context * const ce = rq->hw_context;
> +
> + intel_context_get(ce);
> +
> + if (unlikely(i915_gem_context_is_banned(ce->gem_context)))
> + reset_active(rq, engine);
> +
> + if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
> + execlists_check_context(ce, engine);
> +
> + if (ce->tag) {
> + /* Use a fixed tag for OA and friends */
> + ce->lrc_desc |= (u64)ce->tag << 32;
> + } else {
> + /* We don't need a strict matching tag, just different values */
> + ce->lrc_desc &= ~GENMASK_ULL(47, 37);
> + ce->lrc_desc |=
> + (u64)(engine->context_tag++ % NUM_CONTEXT_TAG) <<
> + GEN11_SW_CTX_ID_SHIFT;
> + BUILD_BUG_ON(NUM_CONTEXT_TAG > GEN12_MAX_CONTEXT_HW_ID);
> + }
> +
> + __intel_gt_pm_get(engine->gt);
> + execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
> + intel_engine_context_in(engine);
> +
> + return engine;
> +}
> +
> +static inline struct i915_request *
> +execlists_schedule_in(struct i915_request *rq, int idx)
> +{
> + struct intel_context * const ce = rq->hw_context;
> + struct intel_engine_cs *old;
> +
> + GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
> + trace_i915_request_in(rq, idx);
> +
> + old = READ_ONCE(ce->inflight);
> + do {
> + if (!old) {
> + WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
> + break;
> + }
> + } while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
> +
> + GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
> + return i915_request_get(rq);
> +}
> +
> +static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
> +{
> + struct intel_virtual_engine *ve =
> + container_of(ce, typeof(*ve), context);
> + struct i915_request *next = READ_ONCE(ve->request);
> +
> + if (next && next->execution_mask & ~rq->execution_mask)
> + tasklet_schedule(&ve->base.execlists.tasklet);
> +}
> +
> +static inline void
> +__execlists_schedule_out(struct i915_request *rq,
> + struct intel_engine_cs * const engine)
> +{
> + struct intel_context * const ce = rq->hw_context;
> +
> + /*
> + * NB process_csb() is not under the engine->active.lock and hence
> + * schedule_out can race with schedule_in meaning that we should
> + * refrain from doing non-trivial work here.
> + */
> +
> + /*
> + * If we have just completed this context, the engine may now be
> + * idle and we want to re-enter powersaving.
> + */
> + if (list_is_last(&rq->link, &ce->timeline->requests) &&
> + i915_request_completed(rq))
> + intel_engine_add_retire(engine, ce->timeline);
> +
> + intel_engine_context_out(engine);
> + execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
> + intel_gt_pm_put_async(engine->gt);
> +
> + /*
> + * If this is part of a virtual engine, its next request may
> + * have been blocked waiting for access to the active context.
> + * We have to kick all the siblings again in case we need to
> + * switch (e.g. the next request is not runnable on this
> + * engine). Hopefully, we will already have submitted the next
> + * request before the tasklet runs and do not need to rebuild
> + * each virtual tree and kick everyone again.
> + */
> + if (ce->engine != engine)
> + kick_siblings(rq, ce);
> +
> + intel_context_put(ce);
> +}
> +
> +static inline void
> +execlists_schedule_out(struct i915_request *rq)
> +{
> + struct intel_context * const ce = rq->hw_context;
> + struct intel_engine_cs *cur, *old;
> +
> + trace_i915_request_out(rq);
> +
> + old = READ_ONCE(ce->inflight);
> + do
> + cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
> + while (!try_cmpxchg(&ce->inflight, &old, cur));
> + if (!cur)
> + __execlists_schedule_out(rq, old);
> +
> + i915_request_put(rq);
> +}
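The ce->inflight juggling in schedule_in/schedule_out relies on packing a
small port count into the low bits of the (pointer-aligned) engine pointer
via the ptr_inc()/ptr_dec()/ptr_unmask_bits() helpers. A self-contained
illustration of that trick, with hypothetical names rather than the
i915_utils.h macros:

/*
 * The low 2 bits of a sufficiently aligned pointer double as a count of
 * how many additional ELSP ports still reference the context.
 */
#define INFLIGHT_COUNT_BITS 2

static inline void *inflight_inc(void *tagged)
{
	return (void *)((unsigned long)tagged + 1);
}

static inline unsigned long inflight_count(const void *tagged)
{
	return (unsigned long)tagged & (BIT(INFLIGHT_COUNT_BITS) - 1);
}

static inline void *inflight_ptr(const void *tagged)
{
	return (void *)((unsigned long)tagged & ~(BIT(INFLIGHT_COUNT_BITS) - 1));
}

Only when the count drops back to zero does __execlists_schedule_out() run,
which is why the cmpxchg loop above substitutes NULL at that point.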
> +
> +static u64 execlists_update_context(struct i915_request *rq)
> +{
> + struct intel_context *ce = rq->hw_context;
> + u64 desc = ce->lrc_desc;
> + u32 tail;
> +
> + /*
> + * WaIdleLiteRestore:bdw,skl
> + *
> + * We should never submit the context with the same RING_TAIL twice
> + * just in case we submit an empty ring, which confuses the HW.
> + *
> + * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
> + * the normal request to be able to always advance the RING_TAIL on
> + * subsequent resubmissions (for lite restore). Should that fail us,
> + * and we try and submit the same tail again, force the context
> + * reload.
> + */
> + tail = intel_ring_set_tail(rq->ring, rq->tail);
> + if (unlikely(ce->lrc_reg_state[CTX_RING_TAIL] == tail))
> + desc |= CTX_DESC_FORCE_RESTORE;
> + ce->lrc_reg_state[CTX_RING_TAIL] = tail;
> + rq->tail = rq->wa_tail;
> +
> + /*
> + * Make sure the context image is complete before we submit it to HW.
> + *
> + * Ostensibly, writes (including the WCB) should be flushed prior to
> + * an uncached write such as our mmio register access. However, the empirical
> + * evidence (esp. on Braswell) suggests that the WC write into memory
> + * may not be visible to the HW prior to the completion of the UC
> + * register write and that we may begin execution from the context
> + * before its image is complete, leading to invalid PD chasing.
> + */
> + wmb();
> +
> + /* Wa_1607138340:tgl */
> + if (IS_TGL_REVID(rq->i915, TGL_REVID_A0, TGL_REVID_A0))
> + desc |= CTX_DESC_FORCE_RESTORE;
> +
> + ce->lrc_desc &= ~CTX_DESC_FORCE_RESTORE;
> + return desc;
> +}
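The WA_TAIL padding referenced in the WaIdleLiteRestore comment is appended
after every request's breadcrumb so that a lite-restore always has somewhere
new to advance RING_TAIL to. A rough sketch of such a tail helper (the exact
instruction mix used by the driver may differ, e.g. an MI_ARB_CHECK may be
emitted as a preemption point):

static u32 *emit_wa_tail_sketch(struct i915_request *rq, u32 *cs)
{
	/* Pad past the breadcrumb so resubmission can bump RING_TAIL */
	*cs++ = MI_NOOP;
	*cs++ = MI_NOOP;

	/* Remember where the padded tail ends for later resubmission */
	rq->wa_tail = intel_ring_offset(rq, cs);

	return cs;
}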
> +
> +static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
> +{
> + if (execlists->ctrl_reg) {
> + writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
> + writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
> + } else {
> + writel(upper_32_bits(desc), execlists->submit_reg);
> + writel(lower_32_bits(desc), execlists->submit_reg);
> + }
> +}
> +
> +static __maybe_unused void
> +trace_ports(const struct intel_engine_execlists *execlists,
> + const char *msg,
> + struct i915_request * const *ports)
> +{
> + const struct intel_engine_cs *engine =
> + container_of(execlists, typeof(*engine), execlists);
> +
> + if (!ports[0])
> + return;
> +
> + GEM_TRACE("%s: %s { %llx:%lld%s, %llx:%lld }\n",
> + engine->name, msg,
> + ports[0]->fence.context,
> + ports[0]->fence.seqno,
> + i915_request_completed(ports[0]) ? "!" :
> + i915_request_started(ports[0]) ? "*" :
> + "",
> + ports[1] ? ports[1]->fence.context : 0,
> + ports[1] ? ports[1]->fence.seqno : 0);
> +}
> +
> +static __maybe_unused bool
> +assert_pending_valid(const struct intel_engine_execlists *execlists,
> + const char *msg)
> +{
> + struct i915_request * const *port, *rq;
> + struct intel_context *ce = NULL;
> +
> + trace_ports(execlists, msg, execlists->pending);
> +
> + if (!execlists->pending[0]) {
> + GEM_TRACE_ERR("Nothing pending for promotion!\n");
> + return false;
> + }
> +
> + if (execlists->pending[execlists_num_ports(execlists)]) {
> + GEM_TRACE_ERR("Excess pending[%d] for promotion!\n",
> + execlists_num_ports(execlists));
> + return false;
> + }
> +
> + for (port = execlists->pending; (rq = *port); port++) {
> + unsigned long flags;
> + bool ok = true;
> +
> + GEM_BUG_ON(!kref_read(&rq->fence.refcount));
> + GEM_BUG_ON(!i915_request_is_active(rq));
> +
> + if (ce == rq->hw_context) {
> + GEM_TRACE_ERR("Dup context:%llx in pending[%zd]\n",
> + ce->timeline->fence_context,
> + port - execlists->pending);
> + return false;
> + }
> + ce = rq->hw_context;
> +
> + /* Hold tightly onto the lock to prevent concurrent retires! */
> + if (!spin_trylock_irqsave(&rq->lock, flags))
> + continue;
> +
> + if (i915_request_completed(rq))
> + goto unlock;
> +
> + if (i915_active_is_idle(&ce->active) &&
> + !i915_gem_context_is_kernel(ce->gem_context)) {
> + GEM_TRACE_ERR("Inactive context:%llx in pending[%zd]\n",
> + ce->timeline->fence_context,
> + port - execlists->pending);
> + ok = false;
> + goto unlock;
> + }
> +
> + if (!i915_vma_is_pinned(ce->state)) {
> + GEM_TRACE_ERR("Unpinned context:%llx in pending[%zd]\n",
> + ce->timeline->fence_context,
> + port - execlists->pending);
> + ok = false;
> + goto unlock;
> + }
> +
> + if (!i915_vma_is_pinned(ce->ring->vma)) {
> + GEM_TRACE_ERR("Unpinned ring:%llx in pending[%zd]\n",
> + ce->timeline->fence_context,
> + port - execlists->pending);
> + ok = false;
> + goto unlock;
> + }
> +
> +unlock:
> + spin_unlock_irqrestore(&rq->lock, flags);
> + if (!ok)
> + return false;
> + }
> +
> + return ce;
> +}
> +
> +static void execlists_submit_ports(struct intel_engine_cs *engine)
> +{
> + struct intel_engine_execlists *execlists = &engine->execlists;
> + unsigned int n;
> +
> + GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
> +
> + /*
> + * We can skip acquiring intel_runtime_pm_get() here as it was taken
> + * on our behalf by the request (see i915_gem_mark_busy()) and it will
> + * not be relinquished until the device is idle (see
> + * i915_gem_idle_work_handler()). As a precaution, we make sure
> + * that all ELSP are drained, i.e. we have processed the CSB,
> + * before allowing ourselves to idle and calling intel_runtime_pm_put().
> + */
> + GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
> +
> + /*
> + * ELSQ note: the submit queue is not cleared after being submitted
> + * to the HW so we need to make sure we always clean it up. This is
> + * currently ensured by the fact that we always write the same number
> + * of elsq entries, keep this in mind before changing the loop below.
> + */
> + for (n = execlists_num_ports(execlists); n--; ) {
> + struct i915_request *rq = execlists->pending[n];
> +
> + write_desc(execlists,
> + rq ? execlists_update_context(rq) : 0,
> + n);
> + }
> +
> + /* we need to manually load the submit queue */
> + if (execlists->ctrl_reg)
> + writel(EL_CTRL_LOAD, execlists->ctrl_reg);
> +}
> +
> +static bool ctx_single_port_submission(const struct intel_context *ce)
> +{
> + return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
> + i915_gem_context_force_single_submission(ce->gem_context));
> +}
> +
> +static bool can_merge_ctx(const struct intel_context *prev,
> + const struct intel_context *next)
> +{
> + if (prev != next)
> + return false;
> +
> + if (ctx_single_port_submission(prev))
> + return false;
> +
> + return true;
> +}
> +
> +static bool can_merge_rq(const struct i915_request *prev,
> + const struct i915_request *next)
> +{
> + GEM_BUG_ON(prev == next);
> + GEM_BUG_ON(!assert_priority_queue(prev, next));
> +
> + /*
> + * We do not submit known completed requests. Therefore if the next
> + * request is already completed, we can pretend to merge it in
> + * with the previous context (and we will skip updating the ELSP
> + * and tracking). Thus hopefully keeping the ELSP full with active
> + * contexts, despite the best efforts of preempt-to-busy to confuse
> + * us.
> + */
> + if (i915_request_completed(next))
> + return true;
> +
> + if (unlikely((prev->flags ^ next->flags) &
> + (I915_REQUEST_NOPREEMPT | I915_REQUEST_SENTINEL)))
> + return false;
> +
> + if (!can_merge_ctx(prev->hw_context, next->hw_context))
> + return false;
> +
> + return true;
> +}
> +
> +static bool virtual_matches(const struct intel_virtual_engine *ve,
> + const struct i915_request *rq,
> + const struct intel_engine_cs *engine)
> +{
> + const struct intel_engine_cs *inflight;
> +
> + if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
> + return false;
> +
> + /*
> + * We track when the HW has completed saving the context image
> + * (i.e. when we have seen the final CS event switching out of
> + * the context) and must not overwrite the context image before
> + * then. This restricts us to only using the active engine
> + * while the previous virtualized request is inflight (so
> + * we reuse the register offsets). This is a very small
> + * hysteresis on the greedy selection algorithm.
> + */
> + inflight = intel_context_inflight(&ve->context);
> + if (inflight && inflight != engine)
> + return false;
> +
> + return true;
> +}
> +
> +static void virtual_xfer_breadcrumbs(struct intel_virtual_engine *ve,
> + struct intel_engine_cs *engine)
> +{
> + struct intel_engine_cs *old = ve->siblings[0];
> +
> + /* All unattached (rq->engine == old) must already be completed */
> +
> + spin_lock(&old->breadcrumbs.irq_lock);
> + if (!list_empty(&ve->context.signal_link)) {
> + list_move_tail(&ve->context.signal_link,
> + &engine->breadcrumbs.signalers);
> + intel_engine_queue_breadcrumbs(engine);
> + }
> + spin_unlock(&old->breadcrumbs.irq_lock);
> +}
> +
> +static struct i915_request *
> +last_active(const struct intel_engine_execlists *execlists)
> +{
> + struct i915_request * const *last = READ_ONCE(execlists->active);
> +
> + while (*last && i915_request_completed(*last))
> + last++;
> +
> + return *last;
> +}
> +
> +static void defer_request(struct i915_request *rq, struct list_head * const pl)
> +{
> + LIST_HEAD(list);
> +
> + /*
> + * We want to move the interrupted request to the back of
> + * the round-robin list (i.e. its priority level), but
> + * in doing so, we must then move all requests that were in
> + * flight and were waiting for the interrupted request to
> + * be run after it again.
> + */
> + do {
> + struct i915_dependency *p;
> +
> + GEM_BUG_ON(i915_request_is_active(rq));
> + list_move_tail(&rq->sched.link, pl);
> +
> + list_for_each_entry(p, &rq->sched.waiters_list, wait_link) {
> + struct i915_request *w =
> + container_of(p->waiter, typeof(*w), sched);
> +
> + /* Leave semaphores spinning on the other engines */
> + if (w->engine != rq->engine)
> + continue;
> +
> + /* No waiter should start before its signaler */
> + GEM_BUG_ON(i915_request_started(w) &&
> + !i915_request_completed(rq));
> +
> + GEM_BUG_ON(i915_request_is_active(w));
> + if (list_empty(&w->sched.link))
> + continue; /* Not yet submitted; unready */
> +
> + if (rq_prio(w) < rq_prio(rq))
> + continue;
> +
> + GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
> + list_move_tail(&w->sched.link, &list);
> + }
> +
> + rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
> + } while (rq);
> +}
> +
> +static void defer_active(struct intel_engine_cs *engine)
> +{
> + struct i915_request *rq;
> +
> + rq = __unwind_incomplete_requests(engine);
> + if (!rq)
> + return;
> +
> + defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
> +}
> +
> +static bool
> +need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq)
> +{
> + int hint;
> +
> + if (!intel_engine_has_timeslices(engine))
> + return false;
> +
> + if (list_is_last(&rq->sched.link, &engine->active.requests))
> + return false;
> +
> + hint = max(rq_prio(list_next_entry(rq, sched.link)),
> + engine->execlists.queue_priority_hint);
> +
> + return hint >= effective_prio(rq);
> +}
> +
> +static int
> +switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
> +{
> + if (list_is_last(&rq->sched.link, &engine->active.requests))
> + return INT_MIN;
> +
> + return rq_prio(list_next_entry(rq, sched.link));
> +}
> +
> +static inline unsigned long
> +timeslice(const struct intel_engine_cs *engine)
> +{
> + return READ_ONCE(engine->props.timeslice_duration_ms);
> +}
> +
> +static unsigned long
> +active_timeslice(const struct intel_engine_cs *engine)
> +{
> + const struct i915_request *rq = *engine->execlists.active;
> +
> + if (i915_request_completed(rq))
> + return 0;
> +
> + if (engine->execlists.switch_priority_hint < effective_prio(rq))
> + return 0;
> +
> + return timeslice(engine);
> +}
> +
> +static void set_timeslice(struct intel_engine_cs *engine)
> +{
> + if (!intel_engine_has_timeslices(engine))
> + return;
> +
> + set_timer_ms(&engine->execlists.timer, active_timeslice(engine));
> +}
> +
> +static void record_preemption(struct intel_engine_execlists *execlists)
> +{
> + (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
> +}
> +
> +static unsigned long active_preempt_timeout(struct intel_engine_cs *engine)
> +{
> + struct i915_request *rq;
> +
> + rq = last_active(&engine->execlists);
> + if (!rq)
> + return 0;
> +
> + /* Force a fast reset for terminated contexts (ignoring sysfs!) */
> + if (unlikely(i915_gem_context_is_banned(rq->gem_context)))
> + return 1;
> +
> + return READ_ONCE(engine->props.preempt_timeout_ms);
> +}
> +
> +static void set_preempt_timeout(struct intel_engine_cs *engine)
> +{
> + if (!intel_engine_has_preempt_reset(engine))
> + return;
> +
> + set_timer_ms(&engine->execlists.preempt,
> + active_preempt_timeout(engine));
> +}
> +
> +static void execlists_dequeue(struct intel_engine_cs *engine)
> +{
> + struct intel_engine_execlists * const execlists = &engine->execlists;
> + struct i915_request **port = execlists->pending;
> + struct i915_request ** const last_port = port + execlists->port_mask;
> + struct i915_request *last;
> + struct rb_node *rb;
> + bool submit = false;
> +
> + /*
> + * Hardware submission is through 2 ports. Conceptually each port
> + * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
> + * static for a context, and unique to each, so we only execute
> + * requests belonging to a single context from each ring. RING_HEAD
> + * is maintained by the CS in the context image; it marks the place
> + * where it got up to last time, and through RING_TAIL we tell the CS
> + * where we want to execute up to this time.
> + *
> + * In this list the requests are in order of execution. Consecutive
> + * requests from the same context are adjacent in the ringbuffer. We
> + * can combine these requests into a single RING_TAIL update:
> + *
> + * RING_HEAD...req1...req2
> + * ^- RING_TAIL
> + * since to execute req2 the CS must first execute req1.
> + *
> + * Our goal then is to point each port to the end of a consecutive
> + * sequence of requests as being the most optimal (fewest wake ups
> + * and context switches) submission.
> + */
> +
> + for (rb = rb_first_cached(&execlists->virtual); rb; ) {
> + struct intel_virtual_engine *ve =
> + rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
> + struct i915_request *rq = READ_ONCE(ve->request);
> +
> + if (!rq) { /* lazily cleanup after another engine handled rq */
> + rb_erase_cached(rb, &execlists->virtual);
> + RB_CLEAR_NODE(rb);
> + rb = rb_first_cached(&execlists->virtual);
> + continue;
> + }
> +
> + if (!virtual_matches(ve, rq, engine)) {
> + rb = rb_next(rb);
> + continue;
> + }
> +
> + break;
> + }
> +
> + /*
> + * If the queue is higher priority than the last
> + * request in the currently active context, submit afresh.
> + * We will resubmit again afterwards in case we need to split
> + * the active context to interject the preemption request,
> + * i.e. we will retrigger preemption following the ack in case
> + * of trouble.
> + */
> + last = last_active(execlists);
> + if (last) {
> + if (need_preempt(engine, last, rb)) {
> + GEM_TRACE("%s: preempting last=%llx:%lld, prio=%d, hint=%d\n",
> + engine->name,
> + last->fence.context,
> + last->fence.seqno,
> + last->sched.attr.priority,
> + execlists->queue_priority_hint);
> + record_preemption(execlists);
> +
> + /*
> + * Don't let the RING_HEAD advance past the breadcrumb
> + * as we unwind (and until we resubmit) so that we do
> + * not accidentally tell it to go backwards.
> + */
> + ring_set_paused(engine, 1);
> +
> + /*
> + * Note that we have not stopped the GPU at this point,
> + * so we are unwinding the incomplete requests as they
> + * remain inflight and so by the time we do complete
> + * the preemption, some of the unwound requests may
> + * complete!
> + */
> + __unwind_incomplete_requests(engine);
> +
> + /*
> + * If we need to return to the preempted context, we
> + * need to skip the lite-restore and force it to
> + * reload the RING_TAIL. Otherwise, the HW has a
> + * tendency to ignore us rewinding the TAIL to the
> + * end of an earlier request.
> + */
> + last->hw_context->lrc_desc |= CTX_DESC_FORCE_RESTORE;
> + last = NULL;
> + } else if (need_timeslice(engine, last) &&
> + timer_expired(&engine->execlists.timer)) {
> + GEM_TRACE("%s: expired last=%llx:%lld, prio=%d, hint=%d\n",
> + engine->name,
> + last->fence.context,
> + last->fence.seqno,
> + last->sched.attr.priority,
> + execlists->queue_priority_hint);
> +
> + ring_set_paused(engine, 1);
> + defer_active(engine);
> +
> + /*
> + * Unlike for preemption, if we rewind and continue
> + * executing the same context as previously active,
> + * the order of execution will remain the same and
> + * the tail will only advance. We do not need to
> + * force a full context restore, as a lite-restore
> + * is sufficient to resample the monotonic TAIL.
> + *
> + * If we switch to any other context, similarly we
> + * will not rewind TAIL of current context, and
> + * normal save/restore will preserve state and allow
> + * us to later continue executing the same request.
> + */
> + last = NULL;
> + } else {
> + /*
> + * Otherwise if we already have a request pending
> + * for execution after the current one, we can
> + * just wait until the next CS event before
> + * queuing more. In either case we will force a
> + * lite-restore preemption event, but if we wait
> + * we hopefully coalesce several updates into a single
> + * submission.
> + */
> + if (!list_is_last(&last->sched.link,
> + &engine->active.requests)) {
> + /*
> + * Even if ELSP[1] is occupied and not worthy
> + * of timeslices, our queue might be.
> + */
> + if (!execlists->timer.expires &&
> + need_timeslice(engine, last))
> + set_timer_ms(&execlists->timer,
> + timeslice(engine));
> +
> + return;
> + }
> + }
> + }
> +
> + while (rb) { /* XXX virtual is always taking precedence */
> + struct intel_virtual_engine *ve =
> + rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
> + struct i915_request *rq;
> +
> + spin_lock(&ve->base.active.lock);
> +
> + rq = ve->request;
> + if (unlikely(!rq)) { /* lost the race to a sibling */
> + spin_unlock(&ve->base.active.lock);
> + rb_erase_cached(rb, &execlists->virtual);
> + RB_CLEAR_NODE(rb);
> + rb = rb_first_cached(&execlists->virtual);
> + continue;
> + }
> +
> + GEM_BUG_ON(rq != ve->request);
> + GEM_BUG_ON(rq->engine != &ve->base);
> + GEM_BUG_ON(rq->hw_context != &ve->context);
> +
> + if (rq_prio(rq) >= queue_prio(execlists)) {
> + if (!virtual_matches(ve, rq, engine)) {
> + spin_unlock(&ve->base.active.lock);
> + rb = rb_next(rb);
> + continue;
> + }
> +
> + if (last && !can_merge_rq(last, rq)) {
> + spin_unlock(&ve->base.active.lock);
> + return; /* leave this for another */
> + }
> +
> + GEM_TRACE("%s: virtual rq=%llx:%lld%s, new engine? %s\n",
> + engine->name,
> + rq->fence.context,
> + rq->fence.seqno,
> + i915_request_completed(rq) ? "!" :
> + i915_request_started(rq) ? "*" :
> + "",
> + yesno(engine != ve->siblings[0]));
> +
> + ve->request = NULL;
> + ve->base.execlists.queue_priority_hint = INT_MIN;
> + rb_erase_cached(rb, &execlists->virtual);
> + RB_CLEAR_NODE(rb);
> +
> + GEM_BUG_ON(!(rq->execution_mask & engine->mask));
> + rq->engine = engine;
> +
> + if (engine != ve->siblings[0]) {
> + u32 *regs = ve->context.lrc_reg_state;
> + unsigned int n;
> +
> + GEM_BUG_ON(READ_ONCE(ve->context.inflight));
> +
> + if (!intel_engine_has_relative_mmio(engine))
> + intel_lr_context_set_register_offsets(regs,
> + engine);
> +
> + if (!list_empty(&ve->context.signals))
> + virtual_xfer_breadcrumbs(ve, engine);
> +
> + /*
> + * Move the bound engine to the top of the list
> + * for future execution. We then kick this
> + * tasklet first before checking others, so that
> + * we preferentially reuse this set of bound
> + * registers.
> + */
> + for (n = 1; n < ve->num_siblings; n++) {
> + if (ve->siblings[n] == engine) {
> + swap(ve->siblings[n],
> + ve->siblings[0]);
> + break;
> + }
> + }
> +
> + GEM_BUG_ON(ve->siblings[0] != engine);
> + }
> +
> + if (__i915_request_submit(rq)) {
> + submit = true;
> + last = rq;
> + }
> + i915_request_put(rq);
> +
> + /*
> + * Hmm, we have a bunch of virtual engine requests,
> + * but the first one was already completed (thanks
> + * preempt-to-busy!). Keep looking at the veng queue
> + * until we have no more relevant requests (i.e.
> + * the normal submit queue has higher priority).
> + */
> + if (!submit) {
> + spin_unlock(&ve->base.active.lock);
> + rb = rb_first_cached(&execlists->virtual);
> + continue;
> + }
> + }
> +
> + spin_unlock(&ve->base.active.lock);
> + break;
> + }
> +
> + while ((rb = rb_first_cached(&execlists->queue))) {
> + struct i915_priolist *p = to_priolist(rb);
> + struct i915_request *rq, *rn;
> + int i;
> +
> + priolist_for_each_request_consume(rq, rn, p, i) {
> + bool merge = true;
> +
> + /*
> + * Can we combine this request with the current port?
> + * It has to be the same context/ringbuffer and not
> + * have any exceptions (e.g. GVT saying never to
> + * combine contexts).
> + *
> + * If we can combine the requests, we can execute both
> + * by updating the RING_TAIL to point to the end of the
> + * second request, and so we never need to tell the
> + * hardware about the first.
> + */
> + if (last && !can_merge_rq(last, rq)) {
> + /*
> + * If we are on the second port and cannot
> + * combine this request with the last, then we
> + * are done.
> + */
> + if (port == last_port)
> + goto done;
> +
> + /*
> + * We must not populate both ELSP[] with the
> + * same LRCA, i.e. we must submit 2 different
> + * contexts if we submit 2 ELSP.
> + */
> + if (last->hw_context == rq->hw_context)
> + goto done;
> +
> + if (i915_request_has_sentinel(last))
> + goto done;
> +
> + /*
> + * If GVT overrides us we only ever submit
> + * port[0], leaving port[1] empty. Note that we
> + * also have to be careful that we don't queue
> + * the same context (even though a different
> + * request) to the second port.
> + */
> + if (ctx_single_port_submission(last->hw_context) ||
> + ctx_single_port_submission(rq->hw_context))
> + goto done;
> +
> + merge = false;
> + }
> +
> + if (__i915_request_submit(rq)) {
> + if (!merge) {
> + *port = execlists_schedule_in(last, port - execlists->pending);
> + port++;
> + last = NULL;
> + }
> +
> + GEM_BUG_ON(last &&
> + !can_merge_ctx(last->hw_context,
> + rq->hw_context));
> +
> + submit = true;
> + last = rq;
> + }
> + }
> +
> + rb_erase_cached(&p->node, &execlists->queue);
> + i915_priolist_free(p);
> + }
> +
> +done:
> + /*
> + * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
> + *
> + * We choose the priority hint such that if we add a request of greater
> + * priority than this, we kick the submission tasklet to decide on
> + * the right order of submitting the requests to hardware. We must
> + * also be prepared to reorder requests as they are in-flight on the
> + * HW. We derive the priority hint then as the first "hole" in
> + * the HW submission ports and if there are no available slots,
> + * the priority of the lowest executing request, i.e. last.
> + *
> + * When we do receive a higher priority request ready to run from the
> + * user, see queue_request(), the priority hint is bumped to that
> + * request triggering preemption on the next dequeue (or subsequent
> + * interrupt for secondary ports).
> + */
> + execlists->queue_priority_hint = queue_prio(execlists);
> + GEM_TRACE("%s: queue_priority_hint:%d, submit:%s\n",
> + engine->name, execlists->queue_priority_hint,
> + yesno(submit));
> +
> + if (submit) {
> + *port = execlists_schedule_in(last, port - execlists->pending);
> + execlists->switch_priority_hint =
> + switch_prio(engine, *execlists->pending);
> +
> + /*
> + * Skip if we ended up with exactly the same set of requests,
> + * e.g. trying to timeslice a pair of ordered contexts
> + */
> + if (!memcmp(execlists->active, execlists->pending,
> + (port - execlists->pending + 1) * sizeof(*port))) {
> + do
> + execlists_schedule_out(fetch_and_zero(port));
> + while (port-- != execlists->pending);
> +
> + goto skip_submit;
> + }
> +
> + memset(port + 1, 0, (last_port - port) * sizeof(*port));
> + execlists_submit_ports(engine);
> +
> + set_preempt_timeout(engine);
> + } else {
> +skip_submit:
> + ring_set_paused(engine, 0);
> + }
> +}
> +
> +static void
> +cancel_port_requests(struct intel_engine_execlists * const execlists)
> +{
> + struct i915_request * const *port;
> +
> + for (port = execlists->pending; *port; port++)
> + execlists_schedule_out(*port);
> + memset(execlists->pending, 0, sizeof(execlists->pending));
> +
> + /* Mark the end of active before we overwrite *active */
> + for (port = xchg(&execlists->active, execlists->pending); *port; port++)
> + execlists_schedule_out(*port);
> + WRITE_ONCE(execlists->active,
> + memset(execlists->inflight, 0, sizeof(execlists->inflight)));
> +}
> +
> +static inline void
> +invalidate_csb_entries(const u32 *first, const u32 *last)
> +{
> + clflush((void *)first);
> + clflush((void *)last);
> +}
> +
> +static inline bool
> +reset_in_progress(const struct intel_engine_execlists *execlists)
> +{
> + return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
> +}
> +
> +/*
> + * Starting with Gen12, the status has a new format:
> + *
> + * bit 0: switched to new queue
> + * bit 1: reserved
> + * bit 2: semaphore wait mode (poll or signal), only valid when
> + * switch detail is set to "wait on semaphore"
> + * bits 3-5: engine class
> + * bits 6-11: engine instance
> + * bits 12-14: reserved
> + * bits 15-25: sw context id of the lrc the GT switched to
> + * bits 26-31: sw counter of the lrc the GT switched to
> + * bits 32-35: context switch detail
> + * - 0: ctx complete
> + * - 1: wait on sync flip
> + * - 2: wait on vblank
> + * - 3: wait on scanline
> + * - 4: wait on semaphore
> + * - 5: context preempted (not on SEMAPHORE_WAIT or
> + * WAIT_FOR_EVENT)
> + * bit 36: reserved
> + * bits 37-43: wait detail (for switch detail 1 to 4)
> + * bits 44-46: reserved
> + * bits 47-57: sw context id of the lrc the GT switched away from
> + * bits 58-63: sw counter of the lrc the GT switched away from
> + */
> +static inline bool
> +gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
> +{
> + u32 lower_dw = csb[0];
> + u32 upper_dw = csb[1];
> + bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw);
> + bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw);
> + bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
> +
> + /*
> + * The context switch detail is not guaranteed to be 5 when a preemption
> + * occurs, so we can't just check for that. The check below works for
> + * all the cases we care about, including preemptions of WAIT
> + * instructions and lite-restore. Preempt-to-idle via the CTRL register
> + * would require some extra handling, but we don't support that.
> + */
> + if (!ctx_away_valid || new_queue) {
> + GEM_BUG_ON(!ctx_to_valid);
> + return true;
> + }
> +
> + /*
> + * switch detail = 5 is covered by the case above and we do not expect a
> + * context switch on an unsuccessful wait instruction since we always
> + * use polling mode.
> + */
> + GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));
> + return false;
> +}
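For completeness, the sw context ids described in the layout above can be
pulled out of either dword with the mask already defined at the top of this
file; a small illustrative decoder (hypothetical helper, not part of the
patch):

static inline void gen12_csb_decode_ids(const u32 *csb,
					u32 *ctx_to, u32 *ctx_away)
{
	/* bits 15-25 of each dword carry the sw context id */
	*ctx_to = FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb[0]);
	*ctx_away = FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb[1]);
}

An id equal to GEN12_IDLE_CTX_ID (0x7FF) means no context on that side of
the switch, which is exactly what GEN12_CSB_CTX_VALID() tests.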
> +
> +static inline bool
> +gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
> +{
> + return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
> +}
> +
> +static void process_csb(struct intel_engine_cs *engine)
> +{
> + struct intel_engine_execlists * const execlists = &engine->execlists;
> + const u32 * const buf = execlists->csb_status;
> + const u8 num_entries = execlists->csb_size;
> + u8 head, tail;
> +
> + /*
> + * As we modify our execlists state tracking we require exclusive
> + * access. Either we are inside the tasklet, or the tasklet is disabled
> + * and we assume that is only inside the reset paths and so serialised.
> + */
> + GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
> + !reset_in_progress(execlists));
> + GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
> +
> + /*
> + * Note that csb_write, csb_status may be either in HWSP or mmio.
> + * When reading from the csb_write mmio register, we have to be
> + * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
> + * the low 4bits. As it happens we know the next 4bits are always
> + * zero and so we can simply mask off the low u8 of the register
> + * and treat it identically to reading from the HWSP (without having
> + * to use explicit shifting and masking, and probably bifurcating
> + * the code to handle the legacy mmio read).
> + */
> + head = execlists->csb_head;
> + tail = READ_ONCE(*execlists->csb_write);
> + GEM_TRACE("%s cs-irq head=%d, tail=%d\n", engine->name, head, tail);
> + if (unlikely(head == tail))
> + return;
> +
> + /*
> + * Hopefully paired with a wmb() in HW!
> + *
> + * We must complete the read of the write pointer before any reads
> + * from the CSB, so that we do not see stale values. Without an rmb
> + * (lfence) the HW may speculatively perform the CSB[] reads *before*
> + * we perform the READ_ONCE(*csb_write).
> + */
> + rmb();
> +
> + do {
> + bool promote;
> +
> + if (++head == num_entries)
> + head = 0;
> +
> + /*
> + * We are flying near dragons again.
> + *
> + * We hold a reference to the request in execlist_port[]
> + * but no more than that. We are operating in softirq
> + * context and so cannot hold any mutex or sleep. That
> + * prevents us stopping the requests we are processing
> + * in port[] from being retired simultaneously (the
> + * breadcrumb will be complete before we see the
> + * context-switch). As we only hold the reference to the
> + * request, any pointer chasing underneath the request
> + * is subject to a potential use-after-free. Thus we
> + * store all of the bookkeeping within port[] as
> + * required, and avoid using unguarded pointers beneath
> + * request itself. The same applies to the atomic
> + * status notifier.
> + */
> +
> + GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x\n",
> + engine->name, head,
> + buf[2 * head + 0], buf[2 * head + 1]);
> +
> + if (INTEL_GEN(engine->i915) >= 12)
> + promote = gen12_csb_parse(execlists, buf + 2 * head);
> + else
> + promote = gen8_csb_parse(execlists, buf + 2 * head);
> + if (promote) {
> + struct i915_request * const *old = execlists->active;
> +
> + /* Point active to the new ELSP; prevent overwriting */
> + WRITE_ONCE(execlists->active, execlists->pending);
> + set_timeslice(engine);
> +
> + if (!inject_preempt_hang(execlists))
> + ring_set_paused(engine, 0);
> +
> + /* cancel old inflight, prepare for switch */
> + trace_ports(execlists, "preempted", old);
> + while (*old)
> + execlists_schedule_out(*old++);
> +
> + /* switch pending to inflight */
> + GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
> + WRITE_ONCE(execlists->active,
> + memcpy(execlists->inflight,
> + execlists->pending,
> + execlists_num_ports(execlists) *
> + sizeof(*execlists->pending)));
> +
> + WRITE_ONCE(execlists->pending[0], NULL);
> + } else {
> + GEM_BUG_ON(!*execlists->active);
> +
> + /* port0 completed, advanced to port1 */
> + trace_ports(execlists, "completed", execlists->active);
> +
> + /*
> + * We rely on the hardware being strongly
> + * ordered, that the breadcrumb write is
> + * coherent (visible from the CPU) before the
> + * user interrupt and CSB is processed.
> + */
> + GEM_BUG_ON(!i915_request_completed(*execlists->active) &&
> + !reset_in_progress(execlists));
> + execlists_schedule_out(*execlists->active++);
> +
> + GEM_BUG_ON(execlists->active - execlists->inflight >
> + execlists_num_ports(execlists));
> + }
> + } while (head != tail);
> +
> + execlists->csb_head = head;
> +
> + /*
> + * Gen11 has proven to fail wrt global observation point between
> + * entry and tail update, failing on the ordering and thus
> + * we see an old entry in the context status buffer.
> + *
> + * Forcibly evict the entries before the next GPU CSB update,
> + * to increase the odds that we get fresh entries from non-
> + * working hardware. The cost of doing so comes out mostly in
> + * the wash, as hardware, working or not, will need to do the
> + * invalidation beforehand.
> + */
> + invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
> +}
> +
> +static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
> +{
> + lockdep_assert_held(&engine->active.lock);
> + if (!engine->execlists.pending[0]) {
> + rcu_read_lock(); /* protect peeking at execlists->active */
> + execlists_dequeue(engine);
> + rcu_read_unlock();
> + }
> +}
> +
> +static noinline void preempt_reset(struct intel_engine_cs *engine)
> +{
> + const unsigned int bit = I915_RESET_ENGINE + engine->id;
> + unsigned long *lock = &engine->gt->reset.flags;
> +
> + if (i915_modparams.reset < 3)
> + return;
> +
> + if (test_and_set_bit(bit, lock))
> + return;
> +
> + /* Mark this tasklet as disabled to avoid waiting for it to complete */
> + tasklet_disable_nosync(&engine->execlists.tasklet);
> +
> + GEM_TRACE("%s: preempt timeout %lu+%ums\n",
> + engine->name,
> + READ_ONCE(engine->props.preempt_timeout_ms),
> + jiffies_to_msecs(jiffies - engine->execlists.preempt.expires));
> + intel_engine_reset(engine, "preemption time out");
> +
> + tasklet_enable(&engine->execlists.tasklet);
> + clear_and_wake_up_bit(bit, lock);
> +}
> +
> +static bool preempt_timeout(const struct intel_engine_cs *const engine)
> +{
> + const struct timer_list *t = &engine->execlists.preempt;
> +
> + if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
> + return false;
> +
> + if (!timer_expired(t))
> + return false;
> +
> + return READ_ONCE(engine->execlists.pending[0]);
> +}
> +
> +/*
> + * Check the unread Context Status Buffers and manage the submission of new
> + * contexts to the ELSP accordingly.
> + */
> +static void execlists_submission_tasklet(unsigned long data)
> +{
> + struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
> + bool timeout = preempt_timeout(engine);
> +
> + process_csb(engine);
> + if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
> + unsigned long flags;
> +
> + spin_lock_irqsave(&engine->active.lock, flags);
> + __execlists_submission_tasklet(engine);
> + spin_unlock_irqrestore(&engine->active.lock, flags);
> +
> + /* Recheck after serialising with direct-submission */
> + if (timeout && preempt_timeout(engine))
> + preempt_reset(engine);
> + }
> +}
> +
> +static void __execlists_kick(struct intel_engine_execlists *execlists)
> +{
> + /* Kick the tasklet for some interrupt coalescing and reset handling */
> + tasklet_hi_schedule(&execlists->tasklet);
> +}
> +
> +#define execlists_kick(t, member) \
> + __execlists_kick(container_of(t, struct intel_engine_execlists, member))
> +
> +static void execlists_timeslice(struct timer_list *timer)
> +{
> + execlists_kick(timer, timer);
> +}
> +
> +static void execlists_preempt(struct timer_list *timer)
> +{
> + execlists_kick(timer, preempt);
> +}
> +
> +static void queue_request(struct intel_engine_cs *engine,
> + struct i915_sched_node *node,
> + int prio)
> +{
> + GEM_BUG_ON(!list_empty(&node->link));
> + list_add_tail(&node->link, i915_sched_lookup_priolist(engine, prio));
> +}
> +
> +static void __submit_queue_imm(struct intel_engine_cs *engine)
> +{
> + struct intel_engine_execlists * const execlists = &engine->execlists;
> +
> + if (reset_in_progress(execlists))
> + return; /* defer until we restart the engine following reset */
> +
> + if (execlists->tasklet.func == execlists_submission_tasklet)
> + __execlists_submission_tasklet(engine);
> + else
> + tasklet_hi_schedule(&execlists->tasklet);
> +}
> +
> +static void submit_queue(struct intel_engine_cs *engine,
> + const struct i915_request *rq)
> +{
> + struct intel_engine_execlists *execlists = &engine->execlists;
> +
> + if (rq_prio(rq) <= execlists->queue_priority_hint)
> + return;
> +
> + execlists->queue_priority_hint = rq_prio(rq);
> + __submit_queue_imm(engine);
> +}
> +
> +static void execlists_submit_request(struct i915_request *request)
> +{
> + struct intel_engine_cs *engine = request->engine;
> + unsigned long flags;
> +
> + /* Will be called from irq-context when using foreign fences. */
> + spin_lock_irqsave(&engine->active.lock, flags);
> +
> + queue_request(engine, &request->sched, rq_prio(request));
> +
> + GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
> + GEM_BUG_ON(list_empty(&request->sched.link));
> +
> + submit_queue(engine, request);
> +
> + spin_unlock_irqrestore(&engine->active.lock, flags);
> +}
> +
> +static void execlists_context_destroy(struct kref *kref)
> +{
> + struct intel_context *ce = container_of(kref, typeof(*ce), ref);
> +
> + GEM_BUG_ON(!i915_active_is_idle(&ce->active));
> + GEM_BUG_ON(intel_context_is_pinned(ce));
> +
> + if (ce->state)
> + intel_lr_context_fini(ce);
> +
> + intel_context_fini(ce);
> + intel_context_free(ce);
> +}
> +
> +static int execlists_context_pin(struct intel_context *ce)
> +{
> + return intel_lr_context_pin(ce, ce->engine);
> +}
> +
> +static int execlists_context_alloc(struct intel_context *ce)
> +{
> + return intel_lr_context_alloc(ce, ce->engine);
> +}
> +
> +static void execlists_context_reset(struct intel_context *ce)
> +{
> + /*
> + * Because we emit WA_TAIL_DWORDS there may be a disparity
> + * between our bookkeeping in ce->ring->head and ce->ring->tail and
> + * that stored in context. As we only write new commands from
> + * ce->ring->tail onwards, everything before that is junk. If the GPU
> + * starts reading from its RING_HEAD from the context, it may try to
> + * execute that junk and die.
> + *
> + * The contexts that are still pinned on resume belong to the
> + * kernel, and are local to each engine. All other contexts will
> + * have their head/tail sanitized upon pinning before use, so they
> + * will never see garbage.
> + *
> + * So to avoid that we reset the context images upon resume. For
> + * simplicity, we just zero everything out.
> + */
> + intel_ring_reset(ce->ring, 0);
> + intel_lr_context_update_reg_state(ce, ce->engine);
> +}
> +
> +static const struct intel_context_ops execlists_context_ops = {
> + .alloc = execlists_context_alloc,
> +
> + .pin = execlists_context_pin,
> + .unpin = intel_lr_context_unpin,
> +
> + .enter = intel_context_enter_engine,
> + .exit = intel_context_exit_engine,
> +
> + .reset = execlists_context_reset,
> + .destroy = execlists_context_destroy,
> +};
> +
> +static int execlists_request_alloc(struct i915_request *request)
> +{
> + int ret;
> +
> + GEM_BUG_ON(!intel_context_is_pinned(request->hw_context));
> +
> + /*
> + * Flush enough space to reduce the likelihood of waiting after
> + * we start building the request - in which case we will just
> + * have to repeat work.
> + */
> + request->reserved_space += EXECLISTS_REQUEST_SIZE;
> +
> + /*
> + * Note that after this point, we have committed to using
> + * this request as it is being used to both track the
> + * state of engine initialisation and liveness of the
> + * golden renderstate above. Think twice before you try
> + * to cancel/unwind this request now.
> + */
> +
> + /* Unconditionally invalidate GPU caches and TLBs. */
> + ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
> + if (ret)
> + return ret;
> +
> + request->reserved_space -= EXECLISTS_REQUEST_SIZE;
> + return 0;
> +}
> +
> +static void execlists_reset_prepare(struct intel_engine_cs *engine)
> +{
> + struct intel_engine_execlists * const execlists = &engine->execlists;
> + unsigned long flags;
> +
> + GEM_TRACE("%s: depth<-%d\n", engine->name,
> + atomic_read(&execlists->tasklet.count));
> +
> + /*
> + * Prevent request submission to the hardware until we have
> + * completed the reset in i915_gem_reset_finish(). If a request
> + * is completed by one engine, it may then queue a request
> + * to a second via its execlists->tasklet *just* as we are
> + * calling engine->resume() and also writing the ELSP.
> + * Turning off the execlists->tasklet until the reset is over
> + * prevents the race.
> + */
> + __tasklet_disable_sync_once(&execlists->tasklet);
> + GEM_BUG_ON(!reset_in_progress(execlists));
> +
> + /* And flush any current direct submission. */
> + spin_lock_irqsave(&engine->active.lock, flags);
> + spin_unlock_irqrestore(&engine->active.lock, flags);
> +
> + /*
> + * We stop the engines, otherwise we might get a failed reset and a
> + * dead gpu (on elk). Also, a gpu as modern as kbl can suffer
> + * from a system hang if a batchbuffer is progressing when
> + * the reset is issued, regardless of the READY_TO_RESET ack.
> + * Thus assume it is best to stop the engines on all gens
> + * where we have a gpu reset.
> + *
> + * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
> + *
> + * FIXME: Wa for more modern gens needs to be validated
> + */
> + intel_engine_stop_cs(engine);
> +}
> +
> +static void reset_csb_pointers(struct intel_engine_cs *engine)
> +{
> + struct intel_engine_execlists * const execlists = &engine->execlists;
> + const unsigned int reset_value = execlists->csb_size - 1;
> +
> + ring_set_paused(engine, 0);
> +
> + /*
> + * After a reset, the HW starts writing into CSB entry [0]. We
> + * therefore have to set our HEAD pointer back one entry so that
> + * the *first* entry we check is entry 0. To complicate this further,
> + * as we don't wait for the first interrupt after reset, we have to
> + * fake the HW write to point back to the last entry so that our
> + * inline comparison of our cached head position against the last HW
> + * write works even before the first interrupt.
> + */
> + execlists->csb_head = reset_value;
> + WRITE_ONCE(*execlists->csb_write, reset_value);
> + wmb(); /* Make sure this is visible to HW (paranoia?) */
> +
> + /*
> + * Sometimes Icelake forgets to reset its pointers on a GPU reset.
> + * Bludgeon them with a mmio update to be sure.
> + */
> + ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
> + reset_value << 8 | reset_value);
> + ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
> +
> + invalidate_csb_entries(&execlists->csb_status[0],
> + &execlists->csb_status[reset_value]);
> +}
> +
> +static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
> +{
> + struct intel_engine_execlists * const execlists = &engine->execlists;
> + struct intel_context *ce;
> + struct i915_request *rq;
> +
> + mb(); /* paranoia: read the CSB pointers from after the reset */
> + clflush(execlists->csb_write);
> + mb();
> +
> + process_csb(engine); /* drain preemption events */
> +
> + /* Following the reset, we need to reload the CSB read/write pointers */
> + reset_csb_pointers(engine);
> +
> + /*
> + * Save the currently executing context, even if we completed
> + * its request, it was still running at the time of the
> + * reset and will have been clobbered.
> + */
> + rq = execlists_active(execlists);
> + if (!rq)
> + goto unwind;
> +
> + /* We still have requests in-flight; the engine should be active */
> + GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
> +
> + ce = rq->hw_context;
> + GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
> +
> + if (i915_request_completed(rq)) {
> + /* Idle context; tidy up the ring so we can restart afresh */
> + ce->ring->head = intel_ring_wrap(ce->ring, rq->tail);
> + goto out_replay;
> + }
> +
> + /* Context has requests still in-flight; it should not be idle! */
> + GEM_BUG_ON(i915_active_is_idle(&ce->active));
> + rq = active_request(ce->timeline, rq);
> + ce->ring->head = intel_ring_wrap(ce->ring, rq->head);
> + GEM_BUG_ON(ce->ring->head == ce->ring->tail);
> +
> + /*
> + * If this request hasn't started yet, e.g. it is waiting on a
> + * semaphore, we need to avoid skipping the request or else we
> + * break the signaling chain. However, if the context is corrupt
> + * the request will not restart and we will be stuck with a wedged
> + * device. It is quite often the case that if we issue a reset
> + * while the GPU is loading the context image, the context
> + * image becomes corrupt.
> + *
> + * Otherwise, if we have not started yet, the request should replay
> + * perfectly and we do not need to flag the result as being erroneous.
> + */
> + if (!i915_request_started(rq))
> + goto out_replay;
> +
> + /*
> + * If the request was innocent, we leave the request in the ELSP
> + * and will try to replay it on restarting. The context image may
> + * have been corrupted by the reset, in which case we may have
> + * to service a new GPU hang, but more likely we can continue on
> + * without impact.
> + *
> + * If the request was guilty, we presume the context is corrupt
> + * and have to at least restore the RING register in the context
> + * image back to the expected values to skip over the guilty request.
> + */
> + __i915_request_reset(rq, stalled);
> + if (!stalled)
> + goto out_replay;
> +
> + /*
> + * We want a simple context + ring to execute the breadcrumb update.
> + * We cannot rely on the context being intact across the GPU hang,
> + * so clear it and rebuild just what we need for the breadcrumb.
> + * All pending requests for this context will be zapped, and any
> + * future request will be after userspace has had the opportunity
> + * to recreate its own state.
> + */
> + GEM_BUG_ON(!intel_context_is_pinned(ce));
> + intel_lr_context_restore_default_state(ce, engine);
> +
> +out_replay:
> + GEM_TRACE("%s replay {head:%04x, tail:%04x}\n",
> + engine->name, ce->ring->head, ce->ring->tail);
> + intel_ring_update_space(ce->ring);
> + intel_lr_context_reset_reg_state(ce, engine);
> + intel_lr_context_update_reg_state(ce, engine);
> + ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
> +
> +unwind:
> + /* Push back any incomplete requests for replay after the reset. */
> + cancel_port_requests(execlists);
> + __unwind_incomplete_requests(engine);
> +}
> +
> +static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
> +{
> + unsigned long flags;
> +
> + GEM_TRACE("%s\n", engine->name);
> +
> + spin_lock_irqsave(&engine->active.lock, flags);
> +
> + __execlists_reset(engine, stalled);
> +
> + spin_unlock_irqrestore(&engine->active.lock, flags);
> +}
> +
> +static void nop_submission_tasklet(unsigned long data)
> +{
> + /* The driver is wedged; don't process any more events. */
> +}
> +
> +static void execlists_cancel_requests(struct intel_engine_cs *engine)
> +{
> + struct intel_engine_execlists * const execlists = &engine->execlists;
> + struct i915_request *rq, *rn;
> + struct rb_node *rb;
> + unsigned long flags;
> +
> + GEM_TRACE("%s\n", engine->name);
> +
> + /*
> + * Before we call engine->cancel_requests(), we should have exclusive
> + * access to the submission state. This is arranged for us by the
> + * caller disabling the interrupt generation, the tasklet and other
> + * threads that may then access the same state, giving us a free hand
> + * to reset state. However, we still need to let lockdep be aware that
> + * we know this state may be accessed in hardirq context, so we
> + * disable the irq around this manipulation and we want to keep
> + * the spinlock focused on its duties and not accidentally conflate
> + * coverage to the submission's irq state. (Similarly, although we
> + * shouldn't need to disable irq around the manipulation of the
> + * submission's irq state, we also wish to remind ourselves that
> + * it is irq state.)
> + */
> + spin_lock_irqsave(&engine->active.lock, flags);
> +
> + __execlists_reset(engine, true);
> +
> + /* Mark all executing requests as skipped. */
> + list_for_each_entry(rq, &engine->active.requests, sched.link)
> + mark_eio(rq);
> +
> + /* Flush the queued requests to the timeline list (for retiring). */
> + while ((rb = rb_first_cached(&execlists->queue))) {
> + struct i915_priolist *p = to_priolist(rb);
> + int i;
> +
> + priolist_for_each_request_consume(rq, rn, p, i) {
> + mark_eio(rq);
> + __i915_request_submit(rq);
> + }
> +
> + rb_erase_cached(&p->node, &execlists->queue);
> + i915_priolist_free(p);
> + }
> +
> + /* Cancel all attached virtual engines */
> + while ((rb = rb_first_cached(&execlists->virtual))) {
> + struct intel_virtual_engine *ve =
> + rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
> +
> + rb_erase_cached(rb, &execlists->virtual);
> + RB_CLEAR_NODE(rb);
> +
> + spin_lock(&ve->base.active.lock);
> + rq = fetch_and_zero(&ve->request);
> + if (rq) {
> + mark_eio(rq);
> +
> + rq->engine = engine;
> + __i915_request_submit(rq);
> + i915_request_put(rq);
> +
> + ve->base.execlists.queue_priority_hint = INT_MIN;
> + }
> + spin_unlock(&ve->base.active.lock);
> + }
> +
> + /* Remaining _unready_ requests will be nop'ed when submitted */
> +
> + execlists->queue_priority_hint = INT_MIN;
> + execlists->queue = RB_ROOT_CACHED;
> +
> + GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
> + execlists->tasklet.func = nop_submission_tasklet;
> +
> + spin_unlock_irqrestore(&engine->active.lock, flags);
> +}
> +
> +static void execlists_reset_finish(struct intel_engine_cs *engine)
> +{
> + struct intel_engine_execlists * const execlists = &engine->execlists;
> +
> + /*
> + * After a GPU reset, we may have requests to replay. Do so now while
> + * we still have the forcewake to be sure that the GPU is not allowed
> + * to sleep before we restart and reload a context.
> + */
> + GEM_BUG_ON(!reset_in_progress(execlists));
> + if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
> + execlists->tasklet.func(execlists->tasklet.data);
> +
> + if (__tasklet_enable(&execlists->tasklet))
> + /* And kick in case we missed a new request submission. */
> + tasklet_hi_schedule(&execlists->tasklet);
> + GEM_TRACE("%s: depth->%d\n", engine->name,
> + atomic_read(&execlists->tasklet.count));
> +}
> +
> +static void execlists_park(struct intel_engine_cs *engine)
> +{
> + cancel_timer(&engine->execlists.timer);
> + cancel_timer(&engine->execlists.preempt);
> +}
> +
> +static void execlists_destroy(struct intel_engine_cs *engine)
> +{
> + /* Synchronise with residual timers and any softirq they raise */
> + del_timer_sync(&engine->execlists.timer);
> + del_timer_sync(&engine->execlists.preempt);
> + tasklet_kill(&engine->execlists.tasklet);
> +
> + intel_logical_ring_destroy(engine);
> +}
> +
> +void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
> +{
> + engine->request_alloc = execlists_request_alloc;
> + engine->submit_request = execlists_submit_request;
> + engine->cancel_requests = execlists_cancel_requests;
> + engine->schedule = i915_schedule;
> + engine->execlists.tasklet.func = execlists_submission_tasklet;
> +
> + engine->reset.prepare = execlists_reset_prepare;
> + engine->reset.reset = execlists_reset;
> + engine->reset.finish = execlists_reset_finish;
> +
> + engine->destroy = execlists_destroy;
> + engine->park = execlists_park;
> + engine->unpark = NULL;
> +
> + engine->flags |= I915_ENGINE_SUPPORTS_STATS;
> + if (!intel_vgpu_active(engine->i915)) {
> + engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
> + if (HAS_LOGICAL_RING_PREEMPTION(engine->i915))
> + engine->flags |= I915_ENGINE_HAS_PREEMPTION;
> + }
> +
> + if (INTEL_GEN(engine->i915) >= 12)
> + engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
> +}
> +
> +int intel_execlists_submission_setup(struct intel_engine_cs *engine)
> +{
> + tasklet_init(&engine->execlists.tasklet,
> + execlists_submission_tasklet, (unsigned long)engine);
> + timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
> + timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
> +
> + intel_logical_ring_setup(engine);
> +
> + engine->set_default_submission = intel_execlists_set_default_submission;
> + engine->cops = &execlists_context_ops;
> +
> + return 0;
> +}
> +
> +int intel_execlists_submission_init(struct intel_engine_cs *engine)
> +{
> + struct intel_engine_execlists * const execlists = &engine->execlists;
> + struct drm_i915_private *i915 = engine->i915;
> + struct intel_uncore *uncore = engine->uncore;
> + u32 base = engine->mmio_base;
> + int ret;
> +
> + ret = intel_logical_ring_init(engine);
> + if (ret)
> + return ret;
> +
> + if (HAS_LOGICAL_RING_ELSQ(i915)) {
> + execlists->submit_reg = uncore->regs +
> + i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
> + execlists->ctrl_reg = uncore->regs +
> + i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
> + } else {
> + execlists->submit_reg = uncore->regs +
> + i915_mmio_reg_offset(RING_ELSP(base));
> + }
> +
> + execlists->csb_status =
> + &engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
> +
> + execlists->csb_write =
> + &engine->status_page.addr[intel_hws_csb_write_index(i915)];
> +
> + if (INTEL_GEN(i915) < 11)
> + execlists->csb_size = GEN8_CSB_ENTRIES;
> + else
> + execlists->csb_size = GEN11_CSB_ENTRIES;
> +
> + reset_csb_pointers(engine);
> +
> + return 0;
> +}
> +
> +static intel_engine_mask_t
> +virtual_submission_mask(struct intel_virtual_engine *ve)
> +{
> + struct i915_request *rq;
> + intel_engine_mask_t mask;
> +
> + rq = READ_ONCE(ve->request);
> + if (!rq)
> + return 0;
> +
> + /* The rq is ready for submission; rq->execution_mask is now stable. */
> + mask = rq->execution_mask;
> + if (unlikely(!mask)) {
> + /* Invalid selection, submit to a random engine in error */
> + i915_request_skip(rq, -ENODEV);
> + mask = ve->siblings[0]->mask;
> + }
> +
> + GEM_TRACE("%s: rq=%llx:%lld, mask=%x, prio=%d\n",
> + ve->base.name,
> + rq->fence.context, rq->fence.seqno,
> + mask, ve->base.execlists.queue_priority_hint);
> +
> + return mask;
> +}
> +
> +static void virtual_submission_tasklet(unsigned long data)
> +{
> + struct intel_virtual_engine * const ve =
> + (struct intel_virtual_engine *)data;
> + const int prio = ve->base.execlists.queue_priority_hint;
> + intel_engine_mask_t mask;
> + unsigned int n;
> +
> + rcu_read_lock();
> + mask = virtual_submission_mask(ve);
> + rcu_read_unlock();
> + if (unlikely(!mask))
> + return;
> +
> + local_irq_disable();
> + for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) {
> + struct intel_engine_cs *sibling = ve->siblings[n];
> + struct ve_node * const node = &ve->nodes[sibling->id];
> + struct rb_node **parent, *rb;
> + bool first;
> +
> + if (unlikely(!(mask & sibling->mask))) {
> + if (!RB_EMPTY_NODE(&node->rb)) {
> + spin_lock(&sibling->active.lock);
> + rb_erase_cached(&node->rb,
> + &sibling->execlists.virtual);
> + RB_CLEAR_NODE(&node->rb);
> + spin_unlock(&sibling->active.lock);
> + }
> + continue;
> + }
> +
> + spin_lock(&sibling->active.lock);
> +
> + if (!RB_EMPTY_NODE(&node->rb)) {
> + /*
> + * Cheat and avoid rebalancing the tree if we can
> + * reuse this node in situ.
> + */
> + first = rb_first_cached(&sibling->execlists.virtual) ==
> + &node->rb;
> + if (prio == node->prio || (prio > node->prio && first))
> + goto submit_engine;
> +
> + rb_erase_cached(&node->rb, &sibling->execlists.virtual);
> + }
> +
> + rb = NULL;
> + first = true;
> + parent = &sibling->execlists.virtual.rb_root.rb_node;
> + while (*parent) {
> + struct ve_node *other;
> +
> + rb = *parent;
> + other = rb_entry(rb, typeof(*other), rb);
> + if (prio > other->prio) {
> + parent = &rb->rb_left;
> + } else {
> + parent = &rb->rb_right;
> + first = false;
> + }
> + }
> +
> + rb_link_node(&node->rb, rb, parent);
> + rb_insert_color_cached(&node->rb,
> + &sibling->execlists.virtual,
> + first);
> +
> +submit_engine:
> + GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
> + node->prio = prio;
> + if (first && prio > sibling->execlists.queue_priority_hint) {
> + sibling->execlists.queue_priority_hint = prio;
> + tasklet_hi_schedule(&sibling->execlists.tasklet);
> + }
> +
> + spin_unlock(&sibling->active.lock);
> + }
> + local_irq_enable();
> +}
> +
> +static void virtual_submit_request(struct i915_request *rq)
> +{
> + struct intel_virtual_engine *ve = to_virtual_engine(rq->engine);
> + struct i915_request *old;
> + unsigned long flags;
> +
> + GEM_TRACE("%s: rq=%llx:%lld\n",
> + ve->base.name,
> + rq->fence.context,
> + rq->fence.seqno);
> +
> + GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
> +
> + spin_lock_irqsave(&ve->base.active.lock, flags);
> +
> + old = ve->request;
> + if (old) { /* background completion event from preempt-to-busy */
> + GEM_BUG_ON(!i915_request_completed(old));
> + __i915_request_submit(old);
> + i915_request_put(old);
> + }
> +
> + if (i915_request_completed(rq)) {
> + __i915_request_submit(rq);
> +
> + ve->base.execlists.queue_priority_hint = INT_MIN;
> + ve->request = NULL;
> + } else {
> + ve->base.execlists.queue_priority_hint = rq_prio(rq);
> + ve->request = i915_request_get(rq);
> +
> + GEM_BUG_ON(!list_empty(intel_virtual_engine_queue(ve)));
> + list_move_tail(&rq->sched.link, intel_virtual_engine_queue(ve));
> +
> + tasklet_schedule(&ve->base.execlists.tasklet);
> + }
> +
> + spin_unlock_irqrestore(&ve->base.active.lock, flags);
> +}
> +
> +static void
> +virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
> +{
> + struct intel_virtual_engine *ve = to_virtual_engine(rq->engine);
> + intel_engine_mask_t allowed, exec;
> + struct ve_bond *bond;
> +
> + allowed = ~to_request(signal)->engine->mask;
> +
> + bond = intel_virtual_engine_find_bond(ve, to_request(signal)->engine);
> + if (bond)
> + allowed &= bond->sibling_mask;
> +
> + /* Restrict the bonded request to run on only the available engines */
> + exec = READ_ONCE(rq->execution_mask);
> + while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
> + ;
> +
> + /* Prevent the master from being re-run on the bonded engines */
> + to_request(signal)->execution_mask &= ~allowed;
> +}
> +
> +void intel_execlists_virtual_submission_init(struct intel_virtual_engine *ve)
> +{
> + ve->base.request_alloc = execlists_request_alloc;
> + ve->base.submit_request = virtual_submit_request;
> + ve->base.bond_execute = virtual_bond_execute;
> + tasklet_init(&ve->base.execlists.tasklet,
> + virtual_submission_tasklet,
> + (unsigned long)ve);
> +}
> +
> +void intel_execlists_show_requests(struct intel_engine_cs *engine,
> + struct drm_printer *m,
> + void (*show_request)(struct drm_printer *m,
> + struct i915_request *rq,
> + const char *prefix),
> + unsigned int max)
> +{
> + const struct intel_engine_execlists *execlists = &engine->execlists;
> + struct i915_request *rq, *last;
> + unsigned long flags;
> + unsigned int count;
> + struct rb_node *rb;
> +
> + spin_lock_irqsave(&engine->active.lock, flags);
> +
> + last = NULL;
> + count = 0;
> + list_for_each_entry(rq, &engine->active.requests, sched.link) {
> + if (count++ < max - 1)
> + show_request(m, rq, "\t\tE ");
> + else
> + last = rq;
> + }
> + if (last) {
> + if (count > max) {
> + drm_printf(m,
> + "\t\t...skipping %d executing requests...\n",
> + count - max);
> + }
> + show_request(m, last, "\t\tE ");
> + }
> +
> + last = NULL;
> + count = 0;
> + if (execlists->queue_priority_hint != INT_MIN)
> + drm_printf(m, "\t\tQueue priority hint: %d\n",
> + execlists->queue_priority_hint);
> + for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
> + struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
> + int i;
> +
> + priolist_for_each_request(rq, p, i) {
> + if (count++ < max - 1)
> + show_request(m, rq, "\t\tQ ");
> + else
> + last = rq;
> + }
> + }
> + if (last) {
> + if (count > max) {
> + drm_printf(m,
> + "\t\t...skipping %d queued requests...\n",
> + count - max);
> + }
> + show_request(m, last, "\t\tQ ");
> + }
> +
> + last = NULL;
> + count = 0;
> + for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
> + struct intel_virtual_engine *ve =
> + rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
> + struct i915_request *rq = READ_ONCE(ve->request);
> +
> + if (rq) {
> + if (count++ < max - 1)
> + show_request(m, rq, "\t\tV ");
> + else
> + last = rq;
> + }
> + }
> + if (last) {
> + if (count > max) {
> + drm_printf(m,
> + "\t\t...skipping %d virtual requests...\n",
> + count - max);
> + }
> + show_request(m, last, "\t\tV ");
> + }
> +
> + spin_unlock_irqrestore(&engine->active.lock, flags);
> +}
> +
> +bool
> +intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
> +{
> + return engine->set_default_submission ==
> + intel_execlists_set_default_submission;
> +}
The breadcrumb submission code is specialised to execlists and should
not be shared (that split leaves emit_flush and emit_bb_start as the
common code in gen8_submission.c). The reset code is specialised to
execlists and should not be shared. The virtual engine is specialised
to execlists and should not be shared. Even submit_request should be
distinct between guc and execlists, and especially request_alloc
(which you may like to put on the context_ops rather than on the
engine).
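
As a rough sketch of that last suggestion (only the hooks already shown
in the patch above are real; the request_alloc member and its call site
are hypothetical):

	/* Sketch only: intel_context_ops has no request_alloc hook today. */
	struct intel_context_ops {
		int (*alloc)(struct intel_context *ce);

		int (*pin)(struct intel_context *ce);
		void (*unpin)(struct intel_context *ce);

		/* hypothetical: per-backend request setup, replacing engine->request_alloc */
		int (*request_alloc)(struct i915_request *rq);

		void (*enter)(struct intel_context *ce);
		void (*exit)(struct intel_context *ce);

		void (*reset)(struct intel_context *ce);
		void (*destroy)(struct kref *kref);
	};

	static const struct intel_context_ops execlists_context_ops = {
		.alloc = execlists_context_alloc,
		.pin = execlists_context_pin,
		.unpin = intel_lr_context_unpin,
		.request_alloc = execlists_request_alloc, /* hypothetical hook */
		.enter = intel_context_enter_engine,
		.exit = intel_context_exit_engine,
		.reset = execlists_context_reset,
		.destroy = execlists_context_destroy,
	};

The guc backend would then register its own table with a guc-specific
request_alloc, and request construction would call through
rq->hw_context->ops->request_alloc(rq) rather than the engine vfunc, so
each submission backend keeps its request setup private.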
-Chris