[Intel-gfx] [RFC 5/5] drm/i915: introduce intel_execlists_submission.<c/h>
Chris Wilson
chris at chris-wilson.co.uk
Wed Dec 11 21:31:25 UTC 2019
Quoting Daniele Ceraolo Spurio (2019-12-11 21:12:44)
> Split out all the code related to the execlists submission flow to its
> own file to keep it separate from the general context management,
> because the latter will be re-used by the GuC submission flow.
>
> Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
> Cc: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at linux.intel.com>
> Cc: Matthew Brost <matthew.brost at intel.com>
> ---
> drivers/gpu/drm/i915/Makefile | 1 +
> drivers/gpu/drm/i915/gt/intel_engine_cs.c | 1 +
> .../drm/i915/gt/intel_execlists_submission.c | 2485 ++++++++++++++++
> .../drm/i915/gt/intel_execlists_submission.h | 58 +
> drivers/gpu/drm/i915/gt/intel_lrc.c | 2511 +----------------
> drivers/gpu/drm/i915/gt/intel_lrc.h | 34 +-
> .../gpu/drm/i915/gt/intel_virtual_engine.c | 1 +
> drivers/gpu/drm/i915/gt/selftest_execlists.c | 2 +-
> drivers/gpu/drm/i915/gt/selftest_lrc.c | 2 +-
> .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 1 +
> drivers/gpu/drm/i915/gvt/scheduler.c | 1 +
> drivers/gpu/drm/i915/i915_perf.c | 1 +
> 12 files changed, 2584 insertions(+), 2514 deletions(-)
> create mode 100644 drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> create mode 100644 drivers/gpu/drm/i915/gt/intel_execlists_submission.h
>
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index 79f5ef5acd4c..3640e0436c97 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -82,6 +82,7 @@ gt-y += \
> gt/intel_engine_pm.o \
> gt/intel_engine_pool.o \
> gt/intel_engine_user.o \
> + gt/intel_execlists_submission.o \
> gt/intel_gt.o \
> gt/intel_gt_irq.o \
> gt/intel_gt_pm.o \
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> index 49473c25916c..0a23d01b7589 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> @@ -33,6 +33,7 @@
> #include "intel_engine_pm.h"
> #include "intel_engine_pool.h"
> #include "intel_engine_user.h"
> +#include "intel_execlists_submission.h"
> #include "intel_gt.h"
> #include "intel_gt_requests.h"
> #include "intel_lrc.h"
> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> new file mode 100644
> index 000000000000..76b878bf15ad
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> @@ -0,0 +1,2485 @@
> +/*
> + * SPDX-License-Identifier: MIT
> + *
> + * Copyright © 2019 Intel Corporation
> + */
> +
> +#include <linux/interrupt.h>
> +
> +#include "gem/i915_gem_context.h"
> +
> +#include "i915_drv.h"
> +#include "i915_perf.h"
> +#include "i915_trace.h"
> +#include "i915_vgpu.h"
> +#include "intel_engine_pm.h"
> +#include "intel_gt.h"
> +#include "intel_gt_pm.h"
> +#include "intel_gt_requests.h"
> +#include "intel_lrc_reg.h"
> +#include "intel_mocs.h"
> +#include "intel_reset.h"
> +#include "intel_ring.h"
> +#include "intel_virtual_engine.h"
> +#include "intel_workarounds.h"
> +#include "intel_execlists_submission.h"
> +
> +#define RING_EXECLIST_QFULL (1 << 0x2)
> +#define RING_EXECLIST1_VALID (1 << 0x3)
> +#define RING_EXECLIST0_VALID (1 << 0x4)
> +#define RING_EXECLIST_ACTIVE_STATUS (3 << 0xE)
> +#define RING_EXECLIST1_ACTIVE (1 << 0x11)
> +#define RING_EXECLIST0_ACTIVE (1 << 0x12)
> +
> +#define GEN8_CTX_STATUS_IDLE_ACTIVE (1 << 0)
> +#define GEN8_CTX_STATUS_PREEMPTED (1 << 1)
> +#define GEN8_CTX_STATUS_ELEMENT_SWITCH (1 << 2)
> +#define GEN8_CTX_STATUS_ACTIVE_IDLE (1 << 3)
> +#define GEN8_CTX_STATUS_COMPLETE (1 << 4)
> +#define GEN8_CTX_STATUS_LITE_RESTORE (1 << 15)
> +
> +#define GEN8_CTX_STATUS_COMPLETED_MASK \
> + (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
> +
> +#define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
> +
> +#define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE (0x1) /* lower csb dword */
> +#define GEN12_CTX_SWITCH_DETAIL(csb_dw) ((csb_dw) & 0xF) /* upper csb dword */
> +#define GEN12_CSB_SW_CTX_ID_MASK GENMASK(25, 15)
> +#define GEN12_IDLE_CTX_ID 0x7FF
> +#define GEN12_CSB_CTX_VALID(csb_dw) \
> + (FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
> +
> +/* Typical size of the average request (2 pipecontrols and a MI_BB) */
> +#define EXECLISTS_REQUEST_SIZE 64 /* bytes */
> +
> +static void mark_eio(struct i915_request *rq)
> +{
> + if (i915_request_completed(rq))
> + return;
> +
> + GEM_BUG_ON(i915_request_signaled(rq));
> +
> + dma_fence_set_error(&rq->fence, -EIO);
> + i915_request_mark_complete(rq);
> +}
> +
> +static struct i915_request *
> +active_request(const struct intel_timeline * const tl, struct i915_request *rq)
> +{
> + struct i915_request *active = rq;
> +
> + rcu_read_lock();
> + list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
> + if (i915_request_completed(rq))
> + break;
> +
> + active = rq;
> + }
> + rcu_read_unlock();
> +
> + return active;
> +}
> +
> +static inline void
> +ring_set_paused(const struct intel_engine_cs *engine, int state)
> +{
> + /*
> + * We inspect HWS_PREEMPT with a semaphore inside
> + * engine->emit_fini_breadcrumb. If the dword is true,
> + * the ring is paused as the semaphore will busywait
> + * until the dword is false.
> + */
> + engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
> + if (state)
> + wmb();
> +}
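For reference, the consumer of HWS_PREEMPT here is the semaphore poll
emitted by engine->emit_fini_breadcrumb. A minimal sketch of that busywait,
assuming the usual MI_SEMAPHORE_* encodings and the HWS_PREEMPT offset
(the function name below is made up; the real emitter lives with the
breadcrumb code):

static u32 *emit_preempt_busywait_sketch(struct i915_request *rq, u32 *cs)
{
	/* Spin in the CS until the HWS_PREEMPT dword reads back as 0 */
	*cs++ = MI_SEMAPHORE_WAIT |
		MI_SEMAPHORE_GLOBAL_GTT |
		MI_SEMAPHORE_POLL |
		MI_SEMAPHORE_SAD_EQ_SDD;
	*cs++ = 0; /* semaphore data: ring not paused */
	*cs++ = i915_ggtt_offset(rq->engine->status_page.vma) +
		I915_GEM_HWS_PREEMPT_ADDR;
	*cs++ = 0;

	return cs;
}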
> +
> +static inline struct i915_priolist *to_priolist(struct rb_node *rb)
> +{
> + return rb_entry(rb, struct i915_priolist, node);
> +}
> +
> +static inline int rq_prio(const struct i915_request *rq)
> +{
> + return rq->sched.attr.priority;
> +}
> +
> +static int effective_prio(const struct i915_request *rq)
> +{
> + int prio = rq_prio(rq);
> +
> + /*
> + * If this request is special and must not be interrupted at any
> + * cost, so be it. Note we are only checking the most recent request
> + * in the context and so may be masking an earlier vip request. It
> + * is hoped that under the conditions where nopreempt is used, this
> + * will not matter (i.e. all requests to that context will be
> + * nopreempt for as long as desired).
> + */
> + if (i915_request_has_nopreempt(rq))
> + prio = I915_PRIORITY_UNPREEMPTABLE;
> +
> + /*
> + * On unwinding the active request, we give it a priority bump
> + * if it has completed waiting on any semaphore. If we know that
> + * the request has already started, we can prevent an unwanted
> + * preempt-to-idle cycle by taking that into account now.
> + */
> + if (__i915_request_has_started(rq))
> + prio |= I915_PRIORITY_NOSEMAPHORE;
> +
> + /* Restrict mere WAIT boosts from triggering preemption */
> + BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */
> + return prio | __NO_PREEMPTION;
> +}
> +
> +static int queue_prio(const struct intel_engine_execlists *execlists)
> +{
> + struct i915_priolist *p;
> + struct rb_node *rb;
> +
> + rb = rb_first_cached(&execlists->queue);
> + if (!rb)
> + return INT_MIN;
> +
> + /*
> + * As the priolist[] is inverted, with the highest priority in [0],
> + * we have to flip the index value to recover the priority.
> + */
> + p = to_priolist(rb);
> + return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
> +}
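To make that flip concrete (taking I915_USER_PRIORITY_SHIFT == 2 purely for
the sake of the example): a priolist at user priority 0 with used == 0b0010
yields ((0 + 1) << 2) - ffs(0b0010) = 4 - 2 = 2, i.e. the second-highest of
the internal sub-levels folded into that user priority, while used == 0b0001
would give 4 - 1 = 3, the highest.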
> +
> +static inline bool need_preempt(const struct intel_engine_cs *engine,
> + const struct i915_request *rq,
> + struct rb_node *rb)
> +{
> + int last_prio;
> +
> + if (!intel_engine_has_semaphores(engine))
> + return false;
> +
> + /*
> + * Check if the current priority hint merits a preemption attempt.
> + *
> + * We record the highest value priority we saw during rescheduling
> + * prior to this dequeue, therefore we know that if it is strictly
> + * less than the current tail of ELSP[0], we do not need to force
> + * a preempt-to-idle cycle.
> + *
> + * However, the priority hint is a mere hint that we may need to
> + * preempt. If that hint is stale or we may be trying to preempt
> + * ourselves, ignore the request.
> + *
> + * More naturally we would write
> + * prio >= max(0, last);
> + * except that we wish to prevent triggering preemption at the same
> + * priority level: the task that is running should remain running
> + * to preserve FIFO ordering of dependencies.
> + */
> + last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
> + if (engine->execlists.queue_priority_hint <= last_prio)
> + return false;
> +
> + /*
> + * Check against the first request in ELSP[1], it will, thanks to the
> + * power of PI, be the highest priority of that context.
> + */
> + if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
> + rq_prio(list_next_entry(rq, sched.link)) > last_prio)
> + return true;
> +
> + if (rb) {
> + struct intel_virtual_engine *ve =
> + rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
> + bool preempt = false;
> +
> + if (engine == ve->siblings[0]) { /* only preempt one sibling */
> + struct i915_request *next;
> +
> + rcu_read_lock();
> + next = READ_ONCE(ve->request);
> + if (next)
> + preempt = rq_prio(next) > last_prio;
> + rcu_read_unlock();
> + }
> +
> + if (preempt)
> + return preempt;
> + }
> +
> + /*
> + * If the inflight context did not trigger the preemption, then maybe
> + * it was the set of queued requests? Pick the highest priority in
> + * the queue (the first active priolist) and see if it deserves to be
> + * running instead of ELSP[0].
> + *
> + * The highest priority request in the queue cannot be either
> + * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
> + * context, its priority would not exceed ELSP[0] aka last_prio.
> + */
> + return queue_prio(&engine->execlists) > last_prio;
> +}
> +
> +__maybe_unused static inline bool
> +assert_priority_queue(const struct i915_request *prev,
> + const struct i915_request *next)
> +{
> + /*
> + * Without preemption, the prev may refer to the still active element
> + * which we refuse to let go.
> + *
> + * Even with preemption, there are times when we think it is better not
> + * to preempt and leave an ostensibly lower priority request in flight.
> + */
> + if (i915_request_is_active(prev))
> + return true;
> +
> + return rq_prio(prev) >= rq_prio(next);
> +}
> +
> +static struct i915_request *
> +__unwind_incomplete_requests(struct intel_engine_cs *engine)
> +{
> + struct i915_request *rq, *rn, *active = NULL;
> + struct list_head *uninitialized_var(pl);
> + int prio = I915_PRIORITY_INVALID;
> +
> + lockdep_assert_held(&engine->active.lock);
> +
> + list_for_each_entry_safe_reverse(rq, rn,
> + &engine->active.requests,
> + sched.link) {
> + if (i915_request_completed(rq))
> + continue; /* XXX */
> +
> + __i915_request_unsubmit(rq);
> +
> + /*
> + * Push the request back into the queue for later resubmission.
> + * If this request is not native to this physical engine (i.e.
> + * it came from a virtual source), push it back onto the virtual
> + * engine so that it can be moved across onto another physical
> + * engine as load dictates.
> + */
> + if (likely(rq->execution_mask == engine->mask)) {
> + GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
> + if (rq_prio(rq) != prio) {
> + prio = rq_prio(rq);
> + pl = i915_sched_lookup_priolist(engine, prio);
> + }
> + GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
> +
> + list_move(&rq->sched.link, pl);
> + active = rq;
> + } else {
> + struct intel_engine_cs *owner = rq->hw_context->engine;
> +
> + /*
> + * Decouple the virtual breadcrumb before moving it
> + * back to the virtual engine -- we don't want the
> + * request to complete in the background and try
> + * and cancel the breadcrumb on the virtual engine
> + * (instead of the old engine where it is linked)!
> + */
> + if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
> + &rq->fence.flags)) {
> + spin_lock_nested(&rq->lock,
> + SINGLE_DEPTH_NESTING);
> + i915_request_cancel_breadcrumb(rq);
> + spin_unlock(&rq->lock);
> + }
> + rq->engine = owner;
> + owner->submit_request(rq);
> + active = NULL;
> + }
> + }
> +
> + return active;
> +}
> +
> +struct i915_request *
> +execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
There should be no exports from this file... Did you not also make
guc_submission standalone?
> +{
> + struct intel_engine_cs *engine =
> + container_of(execlists, typeof(*engine), execlists);
> +
> + return __unwind_incomplete_requests(engine);
> +}
> +
> +static inline void
> +execlists_context_status_change(struct i915_request *rq, unsigned long status)
> +{
> + /*
> + * This is only used when GVT-g is enabled. When GVT-g is disabled,
> + * the compiler should eliminate this function as dead code.
> + */
> + if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
> + return;
> +
> + atomic_notifier_call_chain(&rq->engine->context_status_notifier,
> + status, rq);
> +}
> +
> +static void intel_engine_context_in(struct intel_engine_cs *engine)
> +{
> + unsigned long flags;
> +
> + if (READ_ONCE(engine->stats.enabled) == 0)
> + return;
> +
> + write_seqlock_irqsave(&engine->stats.lock, flags);
> +
> + if (engine->stats.enabled > 0) {
> + if (engine->stats.active++ == 0)
> + engine->stats.start = ktime_get();
> + GEM_BUG_ON(engine->stats.active == 0);
> + }
> +
> + write_sequnlock_irqrestore(&engine->stats.lock, flags);
> +}
> +
> +static void intel_engine_context_out(struct intel_engine_cs *engine)
> +{
> + unsigned long flags;
> +
> + if (READ_ONCE(engine->stats.enabled) == 0)
> + return;
> +
> + write_seqlock_irqsave(&engine->stats.lock, flags);
> +
> + if (engine->stats.enabled > 0) {
> + ktime_t last;
> +
> + if (engine->stats.active && --engine->stats.active == 0) {
> + /*
> + * The active context count has been decremented and the GPU
> + * is now idle, so add the elapsed time to the running total.
> + */
> + last = ktime_sub(ktime_get(), engine->stats.start);
> +
> + engine->stats.total = ktime_add(engine->stats.total,
> + last);
> + } else if (engine->stats.active == 0) {
> + /*
> + * After turning on engine stats, context out might be
> + * the first event in which case we account from the
> + * time stats gathering was turned on.
> + */
> + last = ktime_sub(ktime_get(), engine->stats.enabled_at);
> +
> + engine->stats.total = ktime_add(engine->stats.total,
> + last);
> + }
> + }
> +
> + write_sequnlock_irqrestore(&engine->stats.lock, flags);
> +}
> +
> +static void
> +execlists_check_context(const struct intel_context *ce,
> + const struct intel_engine_cs *engine)
> +{
> + const struct intel_ring *ring = ce->ring;
> + u32 *regs = ce->lrc_reg_state;
> + bool valid = true;
> + int x;
> +
> + if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
> + pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
> + engine->name,
> + regs[CTX_RING_START],
> + i915_ggtt_offset(ring->vma));
> + regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
> + valid = false;
> + }
> +
> + if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
> + (RING_CTL_SIZE(ring->size) | RING_VALID)) {
> + pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
> + engine->name,
> + regs[CTX_RING_CTL],
> + (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
> + regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
> + valid = false;
> + }
> +
> + x = intel_lrc_ring_mi_mode(engine);
> + if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
> + pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
> + engine->name, regs[x + 1]);
> + regs[x + 1] &= ~STOP_RING;
> + regs[x + 1] |= STOP_RING << 16;
> + valid = false;
> + }
> +
> + WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
> +}
> +
> +static void reset_active(struct i915_request *rq,
> + struct intel_engine_cs *engine)
> +{
> + struct intel_context * const ce = rq->hw_context;
> + u32 head;
> +
> + /*
> + * The executing context has been cancelled. We want to prevent
> + * further execution along this context and propagate the error on
> + * to anything depending on its results.
> + *
> + * In __i915_request_submit(), we apply the -EIO and remove the
> + * requests' payloads for any banned requests. But first, we must
> + * rewind the context back to the start of the incomplete request so
> + * that we do not jump back into the middle of the batch.
> + *
> + * We preserve the breadcrumbs and semaphores of the incomplete
> + * requests so that inter-timeline dependencies (i.e. other timelines)
> + * remain correctly ordered. And we defer to __i915_request_submit()
> + * so that all asynchronous waits are correctly handled.
> + */
> + GEM_TRACE("%s(%s): { rq=%llx:%lld }\n",
> + __func__, engine->name, rq->fence.context, rq->fence.seqno);
> +
> + /* On resubmission of the active request, payload will be scrubbed */
> + if (i915_request_completed(rq))
> + head = rq->tail;
> + else
> + head = active_request(ce->timeline, rq)->head;
> + ce->ring->head = intel_ring_wrap(ce->ring, head);
> + intel_ring_update_space(ce->ring);
> +
> + /* Scrub the context image to prevent replaying the previous batch */
> + intel_lr_context_restore_default_state(ce, engine);
> + intel_lr_context_update_reg_state(ce, engine);
> +
> + /* We've switched away, so this should be a no-op, but intent matters */
> + ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
> +}
> +
> +static inline struct intel_engine_cs *
> +__execlists_schedule_in(struct i915_request *rq)
> +{
> + struct intel_engine_cs * const engine = rq->engine;
> + struct intel_context * const ce = rq->hw_context;
> +
> + intel_context_get(ce);
> +
> + if (unlikely(i915_gem_context_is_banned(ce->gem_context)))
> + reset_active(rq, engine);
> +
> + if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
> + execlists_check_context(ce, engine);
> +
> + if (ce->tag) {
> + /* Use a fixed tag for OA and friends */
> + ce->lrc_desc |= (u64)ce->tag << 32;
> + } else {
> + /* We don't need a strict matching tag, just different values */
> + ce->lrc_desc &= ~GENMASK_ULL(47, 37);
> + ce->lrc_desc |=
> + (u64)(engine->context_tag++ % NUM_CONTEXT_TAG) <<
> + GEN11_SW_CTX_ID_SHIFT;
> + BUILD_BUG_ON(NUM_CONTEXT_TAG > GEN12_MAX_CONTEXT_HW_ID);
> + }
> +
> + __intel_gt_pm_get(engine->gt);
> + execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
> + intel_engine_context_in(engine);
> +
> + return engine;
> +}
> +
> +static inline struct i915_request *
> +execlists_schedule_in(struct i915_request *rq, int idx)
> +{
> + struct intel_context * const ce = rq->hw_context;
> + struct intel_engine_cs *old;
> +
> + GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
> + trace_i915_request_in(rq, idx);
> +
> + old = READ_ONCE(ce->inflight);
> + do {
> + if (!old) {
> + WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
> + break;
> + }
> + } while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
> +
> + GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
> + return i915_request_get(rq);
> +}
> +
> +static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
> +{
> + struct intel_virtual_engine *ve =
> + container_of(ce, typeof(*ve), context);
> + struct i915_request *next = READ_ONCE(ve->request);
> +
> + if (next && next->execution_mask & ~rq->execution_mask)
> + tasklet_schedule(&ve->base.execlists.tasklet);
> +}
> +
> +static inline void
> +__execlists_schedule_out(struct i915_request *rq,
> + struct intel_engine_cs * const engine)
> +{
> + struct intel_context * const ce = rq->hw_context;
> +
> + /*
> + * NB process_csb() is not under the engine->active.lock and hence
> + * schedule_out can race with schedule_in meaning that we should
> + * refrain from doing non-trivial work here.
> + */
> +
> + /*
> + * If we have just completed this context, the engine may now be
> + * idle and we want to re-enter powersaving.
> + */
> + if (list_is_last(&rq->link, &ce->timeline->requests) &&
> + i915_request_completed(rq))
> + intel_engine_add_retire(engine, ce->timeline);
> +
> + intel_engine_context_out(engine);
> + execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
> + intel_gt_pm_put_async(engine->gt);
> +
> + /*
> + * If this is part of a virtual engine, its next request may
> + * have been blocked waiting for access to the active context.
> + * We have to kick all the siblings again in case we need to
> + * switch (e.g. the next request is not runnable on this
> + * engine). Hopefully, we will already have submitted the next
> + * request before the tasklet runs and do not need to rebuild
> + * each virtual tree and kick everyone again.
> + */
> + if (ce->engine != engine)
> + kick_siblings(rq, ce);
> +
> + intel_context_put(ce);
> +}
> +
> +static inline void
> +execlists_schedule_out(struct i915_request *rq)
> +{
> + struct intel_context * const ce = rq->hw_context;
> + struct intel_engine_cs *cur, *old;
> +
> + trace_i915_request_out(rq);
> +
> + old = READ_ONCE(ce->inflight);
> + do
> + cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
> + while (!try_cmpxchg(&ce->inflight, &old, cur));
> + if (!cur)
> + __execlists_schedule_out(rq, old);
> +
> + i915_request_put(rq);
> +}
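The ce->inflight juggling in schedule_in/schedule_out relies on packing a
small port count into the low bits of the (pointer-aligned) engine pointer
via the ptr_inc()/ptr_dec()/ptr_unmask_bits() helpers. A self-contained
illustration of that trick, with hypothetical names rather than the
i915_utils.h macros:

/*
 * The low 2 bits of a sufficiently aligned pointer double as a count of
 * how many additional ELSP ports still reference the context.
 */
#define INFLIGHT_COUNT_BITS 2

static inline void *inflight_inc(void *tagged)
{
	return (void *)((unsigned long)tagged + 1);
}

static inline unsigned long inflight_count(const void *tagged)
{
	return (unsigned long)tagged & (BIT(INFLIGHT_COUNT_BITS) - 1);
}

static inline void *inflight_ptr(const void *tagged)
{
	return (void *)((unsigned long)tagged & ~(BIT(INFLIGHT_COUNT_BITS) - 1));
}

Only when the count drops back to zero does __execlists_schedule_out() run,
which is why the cmpxchg loop above substitutes NULL at that point.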
> +
> +static u64 execlists_update_context(struct i915_request *rq)
> +{
> + struct intel_context *ce = rq->hw_context;
> + u64 desc = ce->lrc_desc;
> + u32 tail;
> +
> + /*
> + * WaIdleLiteRestore:bdw,skl
> + *
> + * We should never submit the context with the same RING_TAIL twice
> + * just in case we submit an empty ring, which confuses the HW.
> + *
> + * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
> + * the normal request to be able to always advance the RING_TAIL on
> + * subsequent resubmissions (for lite restore). Should that fail us,
> + * and we try and submit the same tail again, force the context
> + * reload.
> + */
> + tail = intel_ring_set_tail(rq->ring, rq->tail);
> + if (unlikely(ce->lrc_reg_state[CTX_RING_TAIL] == tail))
> + desc |= CTX_DESC_FORCE_RESTORE;
> + ce->lrc_reg_state[CTX_RING_TAIL] = tail;
> + rq->tail = rq->wa_tail;
> +
> + /*
> + * Make sure the context image is complete before we submit it to HW.
> + *
> + * Ostensibly, writes (including the WCB) should be flushed prior to
> + * an uncached write such as our mmio register access. However, the empirical
> + * evidence (esp. on Braswell) suggests that the WC write into memory
> + * may not be visible to the HW prior to the completion of the UC
> + * register write and that we may begin execution from the context
> + * before its image is complete, leading to invalid PD chasing.
> + */
> + wmb();
> +
> + /* Wa_1607138340:tgl */
> + if (IS_TGL_REVID(rq->i915, TGL_REVID_A0, TGL_REVID_A0))
> + desc |= CTX_DESC_FORCE_RESTORE;
> +
> + ce->lrc_desc &= ~CTX_DESC_FORCE_RESTORE;
> + return desc;
> +}
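The WA_TAIL padding referenced in the WaIdleLiteRestore comment is appended
after every request's breadcrumb so that a lite-restore always has somewhere
new to advance RING_TAIL to. A rough sketch of such a tail helper (the exact
instruction mix used by the driver may differ, e.g. an MI_ARB_CHECK may be
emitted as a preemption point):

static u32 *emit_wa_tail_sketch(struct i915_request *rq, u32 *cs)
{
	/* Pad past the breadcrumb so resubmission can bump RING_TAIL */
	*cs++ = MI_NOOP;
	*cs++ = MI_NOOP;

	/* Remember where the padded tail ends for later resubmission */
	rq->wa_tail = intel_ring_offset(rq, cs);

	return cs;
}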
> +
> +static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
> +{
> + if (execlists->ctrl_reg) {
> + writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
> + writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
> + } else {
> + writel(upper_32_bits(desc), execlists->submit_reg);
> + writel(lower_32_bits(desc), execlists->submit_reg);
> + }
> +}
> +
> +static __maybe_unused void
> +trace_ports(const struct intel_engine_execlists *execlists,
> + const char *msg,
> + struct i915_request * const *ports)
> +{
> + const struct intel_engine_cs *engine =
> + container_of(execlists, typeof(*engine), execlists);
> +
> + if (!ports[0])
> + return;
> +
> + GEM_TRACE("%s: %s { %llx:%lld%s, %llx:%lld }\n",
> + engine->name, msg,
> + ports[0]->fence.context,
> + ports[0]->fence.seqno,
> + i915_request_completed(ports[0]) ? "!" :
> + i915_request_started(ports[0]) ? "*" :
> + "",
> + ports[1] ? ports[1]->fence.context : 0,
> + ports[1] ? ports[1]->fence.seqno : 0);
> +}
> +
> +static __maybe_unused bool
> +assert_pending_valid(const struct intel_engine_execlists *execlists,
> + const char *msg)
> +{
> + struct i915_request * const *port, *rq;
> + struct intel_context *ce = NULL;
> +
> + trace_ports(execlists, msg, execlists->pending);
> +
> + if (!execlists->pending[0]) {
> + GEM_TRACE_ERR("Nothing pending for promotion!\n");
> + return false;
> + }
> +
> + if (execlists->pending[execlists_num_ports(execlists)]) {
> + GEM_TRACE_ERR("Excess pending[%d] for promotion!\n",
> + execlists_num_ports(execlists));
> + return false;
> + }
> +
> + for (port = execlists->pending; (rq = *port); port++) {
> + unsigned long flags;
> + bool ok = true;
> +
> + GEM_BUG_ON(!kref_read(&rq->fence.refcount));
> + GEM_BUG_ON(!i915_request_is_active(rq));
> +
> + if (ce == rq->hw_context) {
> + GEM_TRACE_ERR("Dup context:%llx in pending[%zd]\n",
> + ce->timeline->fence_context,
> + port - execlists->pending);
> + return false;
> + }
> + ce = rq->hw_context;
> +
> + /* Hold tightly onto the lock to prevent concurrent retires! */
> + if (!spin_trylock_irqsave(&rq->lock, flags))
> + continue;
> +
> + if (i915_request_completed(rq))
> + goto unlock;
> +
> + if (i915_active_is_idle(&ce->active) &&
> + !i915_gem_context_is_kernel(ce->gem_context)) {
> + GEM_TRACE_ERR("Inactive context:%llx in pending[%zd]\n",
> + ce->timeline->fence_context,
> + port - execlists->pending);
> + ok = false;
> + goto unlock;
> + }
> +
> + if (!i915_vma_is_pinned(ce->state)) {
> + GEM_TRACE_ERR("Unpinned context:%llx in pending[%zd]\n",
> + ce->timeline->fence_context,
> + port - execlists->pending);
> + ok = false;
> + goto unlock;
> + }
> +
> + if (!i915_vma_is_pinned(ce->ring->vma)) {
> + GEM_TRACE_ERR("Unpinned ring:%llx in pending[%zd]\n",
> + ce->timeline->fence_context,
> + port - execlists->pending);
> + ok = false;
> + goto unlock;
> + }
> +
> +unlock:
> + spin_unlock_irqrestore(&rq->lock, flags);
> + if (!ok)
> + return false;
> + }
> +
> + return ce;
> +}
> +
> +static void execlists_submit_ports(struct intel_engine_cs *engine)
> +{
> + struct intel_engine_execlists *execlists = &engine->execlists;
> + unsigned int n;
> +
> + GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
> +
> + /*
> + * We can skip acquiring intel_runtime_pm_get() here as it was taken
> + * on our behalf by the request (see i915_gem_mark_busy()) and it will
> + * not be relinquished until the device is idle (see
> + * i915_gem_idle_work_handler()). As a precaution, we make sure
> + * that all ELSP are drained, i.e. we have processed the CSB,
> + * before allowing ourselves to idle and calling intel_runtime_pm_put().
> + */
> + GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
> +
> + /*
> + * ELSQ note: the submit queue is not cleared after being submitted
> + * to the HW so we need to make sure we always clean it up. This is
> + * currently ensured by the fact that we always write the same number
> + * of elsq entries, keep this in mind before changing the loop below.
> + */
> + for (n = execlists_num_ports(execlists); n--; ) {
> + struct i915_request *rq = execlists->pending[n];
> +
> + write_desc(execlists,
> + rq ? execlists_update_context(rq) : 0,
> + n);
> + }
> +
> + /* we need to manually load the submit queue */
> + if (execlists->ctrl_reg)
> + writel(EL_CTRL_LOAD, execlists->ctrl_reg);
> +}
> +
> +static bool ctx_single_port_submission(const struct intel_context *ce)
> +{
> + return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
> + i915_gem_context_force_single_submission(ce->gem_context));
> +}
> +
> +static bool can_merge_ctx(const struct intel_context *prev,
> + const struct intel_context *next)
> +{
> + if (prev != next)
> + return false;
> +
> + if (ctx_single_port_submission(prev))
> + return false;
> +
> + return true;
> +}
> +
> +static bool can_merge_rq(const struct i915_request *prev,
> + const struct i915_request *next)
> +{
> + GEM_BUG_ON(prev == next);
> + GEM_BUG_ON(!assert_priority_queue(prev, next));
> +
> + /*
> + * We do not submit known completed requests. Therefore if the next
> + * request is already completed, we can pretend to merge it in
> + * with the previous context (and we will skip updating the ELSP
> + * and tracking). Thus hopefully keeping the ELSP full with active
> + * contexts, despite the best efforts of preempt-to-busy to confuse
> + * us.
> + */
> + if (i915_request_completed(next))
> + return true;
> +
> + if (unlikely((prev->flags ^ next->flags) &
> + (I915_REQUEST_NOPREEMPT | I915_REQUEST_SENTINEL)))
> + return false;
> +
> + if (!can_merge_ctx(prev->hw_context, next->hw_context))
> + return false;
> +
> + return true;
> +}
> +
> +static bool virtual_matches(const struct intel_virtual_engine *ve,
> + const struct i915_request *rq,
> + const struct intel_engine_cs *engine)
> +{
> + const struct intel_engine_cs *inflight;
> +
> + if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
> + return false;
> +
> + /*
> + * We track when the HW has completed saving the context image
> + * (i.e. when we have seen the final CS event switching out of
> + * the context) and must not overwrite the context image before
> + * then. This restricts us to only using the active engine
> + * while the previous virtualized request is inflight (so
> + * we reuse the register offsets). This is a very small
> + * hysteresis on the greedy selection algorithm.
> + */
> + inflight = intel_context_inflight(&ve->context);
> + if (inflight && inflight != engine)
> + return false;
> +
> + return true;
> +}
> +
> +static void virtual_xfer_breadcrumbs(struct intel_virtual_engine *ve,
> + struct intel_engine_cs *engine)
> +{
> + struct intel_engine_cs *old = ve->siblings[0];
> +
> + /* All unattached (rq->engine == old) must already be completed */
> +
> + spin_lock(&old->breadcrumbs.irq_lock);
> + if (!list_empty(&ve->context.signal_link)) {
> + list_move_tail(&ve->context.signal_link,
> + &engine->breadcrumbs.signalers);
> + intel_engine_queue_breadcrumbs(engine);
> + }
> + spin_unlock(&old->breadcrumbs.irq_lock);
> +}
> +
> +static struct i915_request *
> +last_active(const struct intel_engine_execlists *execlists)
> +{
> + struct i915_request * const *last = READ_ONCE(execlists->active);
> +
> + while (*last && i915_request_completed(*last))
> + last++;
> +
> + return *last;
> +}
> +
> +static void defer_request(struct i915_request *rq, struct list_head * const pl)
> +{
> + LIST_HEAD(list);
> +
> + /*
> + * We want to move the interrupted request to the back of
> + * the round-robin list (i.e. its priority level), but
> + * in doing so, we must then move all requests that were in
> + * flight and were waiting for the interrupted request to
> + * be run after it again.
> + */
> + do {
> + struct i915_dependency *p;
> +
> + GEM_BUG_ON(i915_request_is_active(rq));
> + list_move_tail(&rq->sched.link, pl);
> +
> + list_for_each_entry(p, &rq->sched.waiters_list, wait_link) {
> + struct i915_request *w =
> + container_of(p->waiter, typeof(*w), sched);
> +
> + /* Leave semaphores spinning on the other engines */
> + if (w->engine != rq->engine)
> + continue;
> +
> + /* No waiter should start before its signaler */
> + GEM_BUG_ON(i915_request_started(w) &&
> + !i915_request_completed(rq));
> +
> + GEM_BUG_ON(i915_request_is_active(w));
> + if (list_empty(&w->sched.link))
> + continue; /* Not yet submitted; unready */
> +
> + if (rq_prio(w) < rq_prio(rq))
> + continue;
> +
> + GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
> + list_move_tail(&w->sched.link, &list);
> + }
> +
> + rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
> + } while (rq);
> +}
> +
> +static void defer_active(struct intel_engine_cs *engine)
> +{
> + struct i915_request *rq;
> +
> + rq = __unwind_incomplete_requests(engine);
> + if (!rq)
> + return;
> +
> + defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
> +}
> +
> +static bool
> +need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq)
> +{
> + int hint;
> +
> + if (!intel_engine_has_timeslices(engine))
> + return false;
> +
> + if (list_is_last(&rq->sched.link, &engine->active.requests))
> + return false;
> +
> + hint = max(rq_prio(list_next_entry(rq, sched.link)),
> + engine->execlists.queue_priority_hint);
> +
> + return hint >= effective_prio(rq);
> +}
> +
> +static int
> +switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
> +{
> + if (list_is_last(&rq->sched.link, &engine->active.requests))
> + return INT_MIN;
> +
> + return rq_prio(list_next_entry(rq, sched.link));
> +}
> +
> +static inline unsigned long
> +timeslice(const struct intel_engine_cs *engine)
> +{
> + return READ_ONCE(engine->props.timeslice_duration_ms);
> +}
> +
> +static unsigned long
> +active_timeslice(const struct intel_engine_cs *engine)
> +{
> + const struct i915_request *rq = *engine->execlists.active;
> +
> + if (i915_request_completed(rq))
> + return 0;
> +
> + if (engine->execlists.switch_priority_hint < effective_prio(rq))
> + return 0;
> +
> + return timeslice(engine);
> +}
> +
> +static void set_timeslice(struct intel_engine_cs *engine)
> +{
> + if (!intel_engine_has_timeslices(engine))
> + return;
> +
> + set_timer_ms(&engine->execlists.timer, active_timeslice(engine));
> +}
> +
> +static void record_preemption(struct intel_engine_execlists *execlists)
> +{
> + (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
> +}
> +
> +static unsigned long active_preempt_timeout(struct intel_engine_cs *engine)
> +{
> + struct i915_request *rq;
> +
> + rq = last_active(&engine->execlists);
> + if (!rq)
> + return 0;
> +
> + /* Force a fast reset for terminated contexts (ignoring sysfs!) */
> + if (unlikely(i915_gem_context_is_banned(rq->gem_context)))
> + return 1;
> +
> + return READ_ONCE(engine->props.preempt_timeout_ms);
> +}
> +
> +static void set_preempt_timeout(struct intel_engine_cs *engine)
> +{
> + if (!intel_engine_has_preempt_reset(engine))
> + return;
> +
> + set_timer_ms(&engine->execlists.preempt,
> + active_preempt_timeout(engine));
> +}
> +
> +static void execlists_dequeue(struct intel_engine_cs *engine)
> +{
> + struct intel_engine_execlists * const execlists = &engine->execlists;
> + struct i915_request **port = execlists->pending;
> + struct i915_request ** const last_port = port + execlists->port_mask;
> + struct i915_request *last;
> + struct rb_node *rb;
> + bool submit = false;
> +
> + /*
> + * Hardware submission is through 2 ports. Conceptually each port
> + * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
> + * static for a context, and unique to each, so we only execute
> + * requests belonging to a single context from each ring. RING_HEAD
> + * is maintained by the CS in the context image; it marks the place
> + * where it got up to last time, and through RING_TAIL we tell the CS
> + * where we want to execute up to this time.
> + *
> + * In this list the requests are in order of execution. Consecutive
> + * requests from the same context are adjacent in the ringbuffer. We
> + * can combine these requests into a single RING_TAIL update:
> + *
> + * RING_HEAD...req1...req2
> + * ^- RING_TAIL
> + * since to execute req2 the CS must first execute req1.
> + *
> + * Our goal then is to point each port to the end of a consecutive
> + * sequence of requests as being the most optimal (fewest wake ups
> + * and context switches) submission.
> + */
> +
> + for (rb = rb_first_cached(&execlists->virtual); rb; ) {
> + struct intel_virtual_engine *ve =
> + rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
> + struct i915_request *rq = READ_ONCE(ve->request);
> +
> + if (!rq) { /* lazily cleanup after another engine handled rq */
> + rb_erase_cached(rb, &execlists->virtual);
> + RB_CLEAR_NODE(rb);
> + rb = rb_first_cached(&execlists->virtual);
> + continue;
> + }
> +
> + if (!virtual_matches(ve, rq, engine)) {
> + rb = rb_next(rb);
> + continue;
> + }
> +
> + break;
> + }
> +
> + /*
> + * If the queue is higher priority than the last
> + * request in the currently active context, submit afresh.
> + * We will resubmit again afterwards in case we need to split
> + * the active context to interject the preemption request,
> + * i.e. we will retrigger preemption following the ack in case
> + * of trouble.
> + */
> + last = last_active(execlists);
> + if (last) {
> + if (need_preempt(engine, last, rb)) {
> + GEM_TRACE("%s: preempting last=%llx:%lld, prio=%d, hint=%d\n",
> + engine->name,
> + last->fence.context,
> + last->fence.seqno,
> + last->sched.attr.priority,
> + execlists->queue_priority_hint);
> + record_preemption(execlists);
> +
> + /*
> + * Don't let the RING_HEAD advance past the breadcrumb
> + * as we unwind (and until we resubmit) so that we do
> + * not accidentally tell it to go backwards.
> + */
> + ring_set_paused(engine, 1);
> +
> + /*
> + * Note that we have not stopped the GPU at this point,
> + * so we are unwinding the incomplete requests as they
> + * remain inflight and so by the time we do complete
> + * the preemption, some of the unwound requests may
> + * complete!
> + */
> + __unwind_incomplete_requests(engine);
> +
> + /*
> + * If we need to return to the preempted context, we
> + * need to skip the lite-restore and force it to
> + * reload the RING_TAIL. Otherwise, the HW has a
> + * tendency to ignore us rewinding the TAIL to the
> + * end of an earlier request.
> + */
> + last->hw_context->lrc_desc |= CTX_DESC_FORCE_RESTORE;
> + last = NULL;
> + } else if (need_timeslice(engine, last) &&
> + timer_expired(&engine->execlists.timer)) {
> + GEM_TRACE("%s: expired last=%llx:%lld, prio=%d, hint=%d\n",
> + engine->name,
> + last->fence.context,
> + last->fence.seqno,
> + last->sched.attr.priority,
> + execlists->queue_priority_hint);
> +
> + ring_set_paused(engine, 1);
> + defer_active(engine);
> +
> + /*
> + * Unlike for preemption, if we rewind and continue
> + * executing the same context as previously active,
> + * the order of execution will remain the same and
> + * the tail will only advance. We do not need to
> + * force a full context restore, as a lite-restore
> + * is sufficient to resample the monotonic TAIL.
> + *
> + * If we switch to any other context, similarly we
> + * will not rewind TAIL of current context, and
> + * normal save/restore will preserve state and allow
> + * us to later continue executing the same request.
> + */
> + last = NULL;
> + } else {
> + /*
> + * Otherwise if we already have a request pending
> + * for execution after the current one, we can
> + * just wait until the next CS event before
> + * queuing more. In either case we will force a
> + * lite-restore preemption event, but if we wait
> + * we hopefully coalesce several updates into a single
> + * submission.
> + */
> + if (!list_is_last(&last->sched.link,
> + &engine->active.requests)) {
> + /*
> + * Even if ELSP[1] is occupied and not worthy
> + * of timeslices, our queue might be.
> + */
> + if (!execlists->timer.expires &&
> + need_timeslice(engine, last))
> + set_timer_ms(&execlists->timer,
> + timeslice(engine));
> +
> + return;
> + }
> + }
> + }
> +
> + while (rb) { /* XXX virtual is always taking precedence */
> + struct intel_virtual_engine *ve =
> + rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
> + struct i915_request *rq;
> +
> + spin_lock(&ve->base.active.lock);
> +
> + rq = ve->request;
> + if (unlikely(!rq)) { /* lost the race to a sibling */
> + spin_unlock(&ve->base.active.lock);
> + rb_erase_cached(rb, &execlists->virtual);
> + RB_CLEAR_NODE(rb);
> + rb = rb_first_cached(&execlists->virtual);
> + continue;
> + }
> +
> + GEM_BUG_ON(rq != ve->request);
> + GEM_BUG_ON(rq->engine != &ve->base);
> + GEM_BUG_ON(rq->hw_context != &ve->context);
> +
> + if (rq_prio(rq) >= queue_prio(execlists)) {
> + if (!virtual_matches(ve, rq, engine)) {
> + spin_unlock(&ve->base.active.lock);
> + rb = rb_next(rb);
> + continue;
> + }
> +
> + if (last && !can_merge_rq(last, rq)) {
> + spin_unlock(&ve->base.active.lock);
> + return; /* leave this for another */
> + }
> +
> + GEM_TRACE("%s: virtual rq=%llx:%lld%s, new engine? %s\n",
> + engine->name,
> + rq->fence.context,
> + rq->fence.seqno,
> + i915_request_completed(rq) ? "!" :
> + i915_request_started(rq) ? "*" :
> + "",
> + yesno(engine != ve->siblings[0]));
> +
> + ve->request = NULL;
> + ve->base.execlists.queue_priority_hint = INT_MIN;
> + rb_erase_cached(rb, &execlists->virtual);
> + RB_CLEAR_NODE(rb);
> +
> + GEM_BUG_ON(!(rq->execution_mask & engine->mask));
> + rq->engine = engine;
> +
> + if (engine != ve->siblings[0]) {
> + u32 *regs = ve->context.lrc_reg_state;
> + unsigned int n;
> +
> + GEM_BUG_ON(READ_ONCE(ve->context.inflight));
> +
> + if (!intel_engine_has_relative_mmio(engine))
> + intel_lr_context_set_register_offsets(regs,
> + engine);
> +
> + if (!list_empty(&ve->context.signals))
> + virtual_xfer_breadcrumbs(ve, engine);
> +
> + /*
> + * Move the bound engine to the top of the list
> + * for future execution. We then kick this
> + * tasklet first before checking others, so that
> + * we preferentially reuse this set of bound
> + * registers.
> + */
> + for (n = 1; n < ve->num_siblings; n++) {
> + if (ve->siblings[n] == engine) {
> + swap(ve->siblings[n],
> + ve->siblings[0]);
> + break;
> + }
> + }
> +
> + GEM_BUG_ON(ve->siblings[0] != engine);
> + }
> +
> + if (__i915_request_submit(rq)) {
> + submit = true;
> + last = rq;
> + }
> + i915_request_put(rq);
> +
> + /*
> + * Hmm, we have a bunch of virtual engine requests,
> + * but the first one was already completed (thanks
> + * preempt-to-busy!). Keep looking at the veng queue
> + * until we have no more relevant requests (i.e.
> + * the normal submit queue has higher priority).
> + */
> + if (!submit) {
> + spin_unlock(&ve->base.active.lock);
> + rb = rb_first_cached(&execlists->virtual);
> + continue;
> + }
> + }
> +
> + spin_unlock(&ve->base.active.lock);
> + break;
> + }
> +
> + while ((rb = rb_first_cached(&execlists->queue))) {
> + struct i915_priolist *p = to_priolist(rb);
> + struct i915_request *rq, *rn;
> + int i;
> +
> + priolist_for_each_request_consume(rq, rn, p, i) {
> + bool merge = true;
> +
> + /*
> + * Can we combine this request with the current port?
> + * It has to be the same context/ringbuffer and not
> + * have any exceptions (e.g. GVT saying never to
> + * combine contexts).
> + *
> + * If we can combine the requests, we can execute both
> + * by updating the RING_TAIL to point to the end of the
> + * second request, and so we never need to tell the
> + * hardware about the first.
> + */
> + if (last && !can_merge_rq(last, rq)) {
> + /*
> + * If we are on the second port and cannot
> + * combine this request with the last, then we
> + * are done.
> + */
> + if (port == last_port)
> + goto done;
> +
> + /*
> + * We must not populate both ELSP[] with the
> + * same LRCA, i.e. we must submit 2 different
> + * contexts if we submit 2 ELSP.
> + */
> + if (last->hw_context == rq->hw_context)
> + goto done;
> +
> + if (i915_request_has_sentinel(last))
> + goto done;
> +
> + /*
> + * If GVT overrides us we only ever submit
> + * port[0], leaving port[1] empty. Note that we
> + * also have to be careful that we don't queue
> + * the same context (even though a different
> + * request) to the second port.
> + */
> + if (ctx_single_port_submission(last->hw_context) ||
> + ctx_single_port_submission(rq->hw_context))
> + goto done;
> +
> + merge = false;
> + }
> +
> + if (__i915_request_submit(rq)) {
> + if (!merge) {
> + *port = execlists_schedule_in(last, port - execlists->pending);
> + port++;
> + last = NULL;
> + }
> +
> + GEM_BUG_ON(last &&
> + !can_merge_ctx(last->hw_context,
> + rq->hw_context));
> +
> + submit = true;
> + last = rq;
> + }
> + }
> +
> + rb_erase_cached(&p->node, &execlists->queue);
> + i915_priolist_free(p);
> + }
> +
> +done:
> + /*
> + * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
> + *
> + * We choose the priority hint such that if we add a request of greater
> + * priority than this, we kick the submission tasklet to decide on
> + * the right order of submitting the requests to hardware. We must
> + * also be prepared to reorder requests as they are in-flight on the
> + * HW. We derive the priority hint then as the first "hole" in
> + * the HW submission ports and if there are no available slots,
> + * the priority of the lowest executing request, i.e. last.
> + *
> + * When we do receive a higher priority request ready to run from the
> + * user, see queue_request(), the priority hint is bumped to that
> + * request triggering preemption on the next dequeue (or subsequent
> + * interrupt for secondary ports).
> + */
> + execlists->queue_priority_hint = queue_prio(execlists);
> + GEM_TRACE("%s: queue_priority_hint:%d, submit:%s\n",
> + engine->name, execlists->queue_priority_hint,
> + yesno(submit));
> +
> + if (submit) {
> + *port = execlists_schedule_in(last, port - execlists->pending);
> + execlists->switch_priority_hint =
> + switch_prio(engine, *execlists->pending);
> +
> + /*
> + * Skip if we ended up with exactly the same set of requests,
> + * e.g. trying to timeslice a pair of ordered contexts
> + */
> + if (!memcmp(execlists->active, execlists->pending,
> + (port - execlists->pending + 1) * sizeof(*port))) {
> + do
> + execlists_schedule_out(fetch_and_zero(port));
> + while (port-- != execlists->pending);
> +
> + goto skip_submit;
> + }
> +
> + memset(port + 1, 0, (last_port - port) * sizeof(*port));
> + execlists_submit_ports(engine);
> +
> + set_preempt_timeout(engine);
> + } else {
> +skip_submit:
> + ring_set_paused(engine, 0);
> + }
> +}
> +
> +static void
> +cancel_port_requests(struct intel_engine_execlists * const execlists)
> +{
> + struct i915_request * const *port;
> +
> + for (port = execlists->pending; *port; port++)
> + execlists_schedule_out(*port);
> + memset(execlists->pending, 0, sizeof(execlists->pending));
> +
> + /* Mark the end of active before we overwrite *active */
> + for (port = xchg(&execlists->active, execlists->pending); *port; port++)
> + execlists_schedule_out(*port);
> + WRITE_ONCE(execlists->active,
> + memset(execlists->inflight, 0, sizeof(execlists->inflight)));
> +}
> +
> +static inline void
> +invalidate_csb_entries(const u32 *first, const u32 *last)
> +{
> + clflush((void *)first);
> + clflush((void *)last);
> +}
> +
> +static inline bool
> +reset_in_progress(const struct intel_engine_execlists *execlists)
> +{
> + return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
> +}
> +
> +/*
> + * Starting with Gen12, the status has a new format:
> + *
> + * bit 0: switched to new queue
> + * bit 1: reserved
> + * bit 2: semaphore wait mode (poll or signal), only valid when
> + * switch detail is set to "wait on semaphore"
> + * bits 3-5: engine class
> + * bits 6-11: engine instance
> + * bits 12-14: reserved
> + * bits 15-25: sw context id of the lrc the GT switched to
> + * bits 26-31: sw counter of the lrc the GT switched to
> + * bits 32-35: context switch detail
> + * - 0: ctx complete
> + * - 1: wait on sync flip
> + * - 2: wait on vblank
> + * - 3: wait on scanline
> + * - 4: wait on semaphore
> + * - 5: context preempted (not on SEMAPHORE_WAIT or
> + * WAIT_FOR_EVENT)
> + * bit 36: reserved
> + * bits 37-43: wait detail (for switch detail 1 to 4)
> + * bits 44-46: reserved
> + * bits 47-57: sw context id of the lrc the GT switched away from
> + * bits 58-63: sw counter of the lrc the GT switched away from
> + */
> +static inline bool
> +gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
> +{
> + u32 lower_dw = csb[0];
> + u32 upper_dw = csb[1];
> + bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw);
> + bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw);
> + bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
> +
> + /*
> + * The context switch detail is not guaranteed to be 5 when a preemption
> + * occurs, so we can't just check for that. The check below works for
> + * all the cases we care about, including preemptions of WAIT
> + * instructions and lite-restore. Preempt-to-idle via the CTRL register
> + * would require some extra handling, but we don't support that.
> + */
> + if (!ctx_away_valid || new_queue) {
> + GEM_BUG_ON(!ctx_to_valid);
> + return true;
> + }
> +
> + /*
> + * switch detail = 5 is covered by the case above and we do not expect a
> + * context switch on an unsuccessful wait instruction since we always
> + * use polling mode.
> + */
> + GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));
> + return false;
> +}
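For completeness, the sw context ids described in the layout above can be
pulled out of either dword with the mask already defined at the top of this
file; a small illustrative decoder (hypothetical helper, not part of the
patch):

static inline void gen12_csb_decode_ids(const u32 *csb,
					u32 *ctx_to, u32 *ctx_away)
{
	/* bits 15-25 of each dword carry the sw context id */
	*ctx_to = FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb[0]);
	*ctx_away = FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb[1]);
}

An id equal to GEN12_IDLE_CTX_ID (0x7FF) means no context on that side of
the switch, which is exactly what GEN12_CSB_CTX_VALID() tests.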
> +
> +static inline bool
> +gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
> +{
> + return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
> +}
> +
> +static void process_csb(struct intel_engine_cs *engine)
> +{
> + struct intel_engine_execlists * const execlists = &engine->execlists;
> + const u32 * const buf = execlists->csb_status;
> + const u8 num_entries = execlists->csb_size;
> + u8 head, tail;
> +
> + /*
> + * As we modify our execlists state tracking we require exclusive
> + * access. Either we are inside the tasklet, or the tasklet is disabled
> + * and we assume that is only inside the reset paths and so serialised.
> + */
> + GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
> + !reset_in_progress(execlists));
> + GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
> +
> + /*
> + * Note that csb_write, csb_status may be either in HWSP or mmio.
> + * When reading from the csb_write mmio register, we have to be
> + * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
> + * the low 4bits. As it happens we know the next 4bits are always
> + * zero and so we can simply mask off the low u8 of the register
> + * and treat it identically to reading from the HWSP (without having
> + * to use explicit shifting and masking, and probably bifurcating
> + * the code to handle the legacy mmio read).
> + */
> + head = execlists->csb_head;
> + tail = READ_ONCE(*execlists->csb_write);
> + GEM_TRACE("%s cs-irq head=%d, tail=%d\n", engine->name, head, tail);
> + if (unlikely(head == tail))
> + return;
> +
> + /*
> + * Hopefully paired with a wmb() in HW!
> + *
> + * We must complete the read of the write pointer before any reads
> + * from the CSB, so that we do not see stale values. Without an rmb
> + * (lfence) the HW may speculatively perform the CSB[] reads *before*
> + * we perform the READ_ONCE(*csb_write).
> + */
> + rmb();
> +
> + do {
> + bool promote;
> +
> + if (++head == num_entries)
> + head = 0;
> +
> + /*
> + * We are flying near dragons again.
> + *
> + * We hold a reference to the request in execlist_port[]
> + * but no more than that. We are operating in softirq
> + * context and so cannot hold any mutex or sleep. That
> + * prevents us stopping the requests we are processing
> + * in port[] from being retired simultaneously (the
> + * breadcrumb will be complete before we see the
> + * context-switch). As we only hold the reference to the
> + * request, any pointer chasing underneath the request
> + * is subject to a potential use-after-free. Thus we
> + * store all of the bookkeeping within port[] as
> + * required, and avoid using unguarded pointers beneath
> + * request itself. The same applies to the atomic
> + * status notifier.
> + */
> +
> + GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x\n",
> + engine->name, head,
> + buf[2 * head + 0], buf[2 * head + 1]);
> +
> + if (INTEL_GEN(engine->i915) >= 12)
> + promote = gen12_csb_parse(execlists, buf + 2 * head);
> + else
> + promote = gen8_csb_parse(execlists, buf + 2 * head);
> + if (promote) {
> + struct i915_request * const *old = execlists->active;
> +
> + /* Point active to the new ELSP; prevent overwriting */
> + WRITE_ONCE(execlists->active, execlists->pending);
> + set_timeslice(engine);
> +
> + if (!inject_preempt_hang(execlists))
> + ring_set_paused(engine, 0);
> +
> + /* cancel old inflight, prepare for switch */
> + trace_ports(execlists, "preempted", old);
> + while (*old)
> + execlists_schedule_out(*old++);
> +
> + /* switch pending to inflight */
> + GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
> + WRITE_ONCE(execlists->active,
> + memcpy(execlists->inflight,
> + execlists->pending,
> + execlists_num_ports(execlists) *
> + sizeof(*execlists->pending)));
> +
> + WRITE_ONCE(execlists->pending[0], NULL);
> + } else {
> + GEM_BUG_ON(!*execlists->active);
> +
> + /* port0 completed, advanced to port1 */
> + trace_ports(execlists, "completed", execlists->active);
> +
> + /*
> + * We rely on the hardware being strongly
> + * ordered, that the breadcrumb write is
> + * coherent (visible from the CPU) before the
> + * user interrupt and CSB is processed.
> + */
> + GEM_BUG_ON(!i915_request_completed(*execlists->active) &&
> + !reset_in_progress(execlists));
> + execlists_schedule_out(*execlists->active++);
> +
> + GEM_BUG_ON(execlists->active - execlists->inflight >
> + execlists_num_ports(execlists));
> + }
> + } while (head != tail);
> +
> + execlists->csb_head = head;
> +
> + /*
> + * Gen11 has proven to fail wrt global observation point between
> + * entry and tail update, failing on the ordering and thus
> + * we see an old entry in the context status buffer.
> + *
> + * Forcibly evict the entries before the next GPU CSB update,
> + * to increase the odds that we get fresh entries from non-
> + * working hardware. The cost of doing so comes out mostly in
> + * the wash, as hardware, working or not, will need to do the
> + * invalidation beforehand.
> + */
> + invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
> +}
> +
> +static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
> +{
> + lockdep_assert_held(&engine->active.lock);
> + if (!engine->execlists.pending[0]) {
> + rcu_read_lock(); /* protect peeking at execlists->active */
> + execlists_dequeue(engine);
> + rcu_read_unlock();
> + }
> +}
> +
> +static noinline void preempt_reset(struct intel_engine_cs *engine)
> +{
> + const unsigned int bit = I915_RESET_ENGINE + engine->id;
> + unsigned long *lock = &engine->gt->reset.flags;
> +
> + if (i915_modparams.reset < 3)
> + return;
> +
> + if (test_and_set_bit(bit, lock))
> + return;
> +
> + /* Mark this tasklet as disabled to avoid waiting for it to complete */
> + tasklet_disable_nosync(&engine->execlists.tasklet);
> +
> + GEM_TRACE("%s: preempt timeout %lu+%ums\n",
> + engine->name,
> + READ_ONCE(engine->props.preempt_timeout_ms),
> + jiffies_to_msecs(jiffies - engine->execlists.preempt.expires));
> + intel_engine_reset(engine, "preemption time out");
> +
> + tasklet_enable(&engine->execlists.tasklet);
> + clear_and_wake_up_bit(bit, lock);
> +}
> +
> +static bool preempt_timeout(const struct intel_engine_cs *const engine)
> +{
> + const struct timer_list *t = &engine->execlists.preempt;
> +
> + if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
> + return false;
> +
> + if (!timer_expired(t))
> + return false;
> +
> + return READ_ONCE(engine->execlists.pending[0]);
> +}
> +
> +/*
> + * Check the unread Context Status Buffers and manage the submission of new
> + * contexts to the ELSP accordingly.
> + */
> +static void execlists_submission_tasklet(unsigned long data)
> +{
> + struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
> + bool timeout = preempt_timeout(engine);
> +
> + process_csb(engine);
> + if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
> + unsigned long flags;
> +
> + spin_lock_irqsave(&engine->active.lock, flags);
> + __execlists_submission_tasklet(engine);
> + spin_unlock_irqrestore(&engine->active.lock, flags);
> +
> + /* Recheck after serialising with direct-submission */
> + if (timeout && preempt_timeout(engine))
> + preempt_reset(engine);
> + }
> +}
> +
> +static void __execlists_kick(struct intel_engine_execlists *execlists)
> +{
> + /* Kick the tasklet for some interrupt coalescing and reset handling */
> + tasklet_hi_schedule(&execlists->tasklet);
> +}
> +
> +#define execlists_kick(t, member) \
> + __execlists_kick(container_of(t, struct intel_engine_execlists, member))
> +
> +static void execlists_timeslice(struct timer_list *timer)
> +{
> + execlists_kick(timer, timer);
> +}
> +
> +static void execlists_preempt(struct timer_list *timer)
> +{
> + execlists_kick(timer, preempt);
> +}
> +
> +static void queue_request(struct intel_engine_cs *engine,
> + struct i915_sched_node *node,
> + int prio)
> +{
> + GEM_BUG_ON(!list_empty(&node->link));
> + list_add_tail(&node->link, i915_sched_lookup_priolist(engine, prio));
> +}
> +
> +static void __submit_queue_imm(struct intel_engine_cs *engine)
> +{
> + struct intel_engine_execlists * const execlists = &engine->execlists;
> +
> + if (reset_in_progress(execlists))
> + return; /* defer until we restart the engine following reset */
> +
> + if (execlists->tasklet.func == execlists_submission_tasklet)
> + __execlists_submission_tasklet(engine);
> + else
> + tasklet_hi_schedule(&execlists->tasklet);
> +}
> +
> +static void submit_queue(struct intel_engine_cs *engine,
> + const struct i915_request *rq)
> +{
> + struct intel_engine_execlists *execlists = &engine->execlists;
> +
> + if (rq_prio(rq) <= execlists->queue_priority_hint)
> + return;
> +
> + execlists->queue_priority_hint = rq_prio(rq);
> + __submit_queue_imm(engine);
> +}
> +
> +static void execlists_submit_request(struct i915_request *request)
> +{
> + struct intel_engine_cs *engine = request->engine;
> + unsigned long flags;
> +
> + /* Will be called from irq-context when using foreign fences. */
> + spin_lock_irqsave(&engine->active.lock, flags);
> +
> + queue_request(engine, &request->sched, rq_prio(request));
> +
> + GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
> + GEM_BUG_ON(list_empty(&request->sched.link));
> +
> + submit_queue(engine, request);
> +
> + spin_unlock_irqrestore(&engine->active.lock, flags);
> +}
> +
> +static void execlists_context_destroy(struct kref *kref)
> +{
> + struct intel_context *ce = container_of(kref, typeof(*ce), ref);
> +
> + GEM_BUG_ON(!i915_active_is_idle(&ce->active));
> + GEM_BUG_ON(intel_context_is_pinned(ce));
> +
> + if (ce->state)
> + intel_lr_context_fini(ce);
> +
> + intel_context_fini(ce);
> + intel_context_free(ce);
> +}
> +
> +static int execlists_context_pin(struct intel_context *ce)
> +{
> + return intel_lr_context_pin(ce, ce->engine);
> +}
> +
> +static int execlists_context_alloc(struct intel_context *ce)
> +{
> + return intel_lr_context_alloc(ce, ce->engine);
> +}
> +
> +static void execlists_context_reset(struct intel_context *ce)
> +{
> + /*
> + * Because we emit WA_TAIL_DWORDS there may be a disparity
> + * between our bookkeeping in ce->ring->head and ce->ring->tail and
> + * that stored in context. As we only write new commands from
> + * ce->ring->tail onwards, everything before that is junk. If the GPU
> + * starts reading from its RING_HEAD from the context, it may try to
> + * execute that junk and die.
> + *
> + * The contexts that are still pinned on resume belong to the
> + * kernel, and are local to each engine. All other contexts will
> + * have their head/tail sanitized upon pinning before use, so they
> + * will never see garbage.
> + *
> + * So to avoid that we reset the context images upon resume. For
> + * simplicity, we just zero everything out.
> + */
> + intel_ring_reset(ce->ring, 0);
> + intel_lr_context_update_reg_state(ce, ce->engine);
> +}
> +
> +static const struct intel_context_ops execlists_context_ops = {
> + .alloc = execlists_context_alloc,
> +
> + .pin = execlists_context_pin,
> + .unpin = intel_lr_context_unpin,
> +
> + .enter = intel_context_enter_engine,
> + .exit = intel_context_exit_engine,
> +
> + .reset = execlists_context_reset,
> + .destroy = execlists_context_destroy,
> +};
> +
> +static int execlists_request_alloc(struct i915_request *request)
> +{
> + int ret;
> +
> + GEM_BUG_ON(!intel_context_is_pinned(request->hw_context));
> +
> + /*
> + * Flush enough space to reduce the likelihood of waiting after
> + * we start building the request - in which case we will just
> + * have to repeat work.
> + */
> + request->reserved_space += EXECLISTS_REQUEST_SIZE;
> +
> + /*
> + * Note that after this point, we have committed to using
> + * this request as it is being used to both track the
> + * state of engine initialisation and liveness of the
> + * golden renderstate above. Think twice before you try
> + * to cancel/unwind this request now.
> + */
> +
> + /* Unconditionally invalidate GPU caches and TLBs. */
> + ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
> + if (ret)
> + return ret;
> +
> + request->reserved_space -= EXECLISTS_REQUEST_SIZE;
> + return 0;
> +}
> +
> +static void execlists_reset_prepare(struct intel_engine_cs *engine)
> +{
> + struct intel_engine_execlists * const execlists = &engine->execlists;
> + unsigned long flags;
> +
> + GEM_TRACE("%s: depth<-%d\n", engine->name,
> + atomic_read(&execlists->tasklet.count));
> +
> + /*
> + * Prevent request submission to the hardware until we have
> + * completed the reset in i915_gem_reset_finish(). If a request
> + * is completed by one engine, it may then queue a request
> + * to a second via its execlists->tasklet *just* as we are
> + * calling engine->resume() and also writing the ELSP.
> + * Turning off the execlists->tasklet until the reset is over
> + * prevents the race.
> + */
> + __tasklet_disable_sync_once(&execlists->tasklet);
> + GEM_BUG_ON(!reset_in_progress(execlists));
> +
> + /* And flush any current direct submission. */
> + spin_lock_irqsave(&engine->active.lock, flags);
> + spin_unlock_irqrestore(&engine->active.lock, flags);
> +
> + /*
> + * We stop the engines, otherwise we might get a failed reset and a
> + * dead gpu (on elk). Also, a gpu as modern as kbl can suffer
> + * from a system hang if a batchbuffer is progressing when
> + * the reset is issued, regardless of the READY_TO_RESET ack.
> + * Thus assume it is best to stop the engines on all gens
> + * where we have a gpu reset.
> + *
> + * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
> + *
> + * FIXME: Wa for more modern gens needs to be validated
> + */
> + intel_engine_stop_cs(engine);
> +}
> +
> +static void reset_csb_pointers(struct intel_engine_cs *engine)
> +{
> + struct intel_engine_execlists * const execlists = &engine->execlists;
> + const unsigned int reset_value = execlists->csb_size - 1;
> +
> + ring_set_paused(engine, 0);
> +
> + /*
> + * After a reset, the HW starts writing into CSB entry [0]. We
> + * therefore have to set our HEAD pointer back one entry so that
> + * the *first* entry we check is entry 0. To complicate this further,
> + * as we don't wait for the first interrupt after reset, we have to
> + * fake the HW write to point back to the last entry so that our
> + * inline comparison of our cached head position against the last HW
> + * write works even before the first interrupt.
> + */
> + execlists->csb_head = reset_value;
> + WRITE_ONCE(*execlists->csb_write, reset_value);
> + wmb(); /* Make sure this is visible to HW (paranoia?) */
> +
> + /*
> + * Sometimes Icelake forgets to reset its pointers on a GPU reset.
> + * Bludgeon them with a mmio update to be sure.
> + */
> + ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
> + reset_value << 8 | reset_value);
> + ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
> +
> + invalidate_csb_entries(&execlists->csb_status[0],
> + &execlists->csb_status[reset_value]);
> +}
> +
> +static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
> +{
> + struct intel_engine_execlists * const execlists = &engine->execlists;
> + struct intel_context *ce;
> + struct i915_request *rq;
> +
> + mb(); /* paranoia: read the CSB pointers from after the reset */
> + clflush(execlists->csb_write);
> + mb();
> +
> + process_csb(engine); /* drain preemption events */
> +
> + /* Following the reset, we need to reload the CSB read/write pointers */
> + reset_csb_pointers(engine);
> +
> + /*
> + * Save the currently executing context, even if we completed
> + * its request, it was still running at the time of the
> + * reset and will have been clobbered.
> + */
> + rq = execlists_active(execlists);
> + if (!rq)
> + goto unwind;
> +
> + /* We still have requests in-flight; the engine should be active */
> + GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
> +
> + ce = rq->hw_context;
> + GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
> +
> + if (i915_request_completed(rq)) {
> + /* Idle context; tidy up the ring so we can restart afresh */
> + ce->ring->head = intel_ring_wrap(ce->ring, rq->tail);
> + goto out_replay;
> + }
> +
> + /* Context has requests still in-flight; it should not be idle! */
> + GEM_BUG_ON(i915_active_is_idle(&ce->active));
> + rq = active_request(ce->timeline, rq);
> + ce->ring->head = intel_ring_wrap(ce->ring, rq->head);
> + GEM_BUG_ON(ce->ring->head == ce->ring->tail);
> +
> + /*
> + * If this request hasn't started yet, e.g. it is waiting on a
> + * semaphore, we need to avoid skipping the request or else we
> + * break the signaling chain. However, if the context is corrupt
> + * the request will not restart and we will be stuck with a wedged
> + * device. It is quite often the case that if we issue a reset
> + * while the GPU is loading the context image, the context
> + * image becomes corrupt.
> + *
> + * Otherwise, if we have not started yet, the request should replay
> + * perfectly and we do not need to flag the result as being erroneous.
> + */
> + if (!i915_request_started(rq))
> + goto out_replay;
> +
> + /*
> + * If the request was innocent, we leave the request in the ELSP
> + * and will try to replay it on restarting. The context image may
> + * have been corrupted by the reset, in which case we may have
> + * to service a new GPU hang, but more likely we can continue on
> + * without impact.
> + *
> + * If the request was guilty, we presume the context is corrupt
> + * and have to at least restore the RING register in the context
> + * image back to the expected values to skip over the guilty request.
> + */
> + __i915_request_reset(rq, stalled);
> + if (!stalled)
> + goto out_replay;
> +
> + /*
> + * We want a simple context + ring to execute the breadcrumb update.
> + * We cannot rely on the context being intact across the GPU hang,
> + * so clear it and rebuild just what we need for the breadcrumb.
> + * All pending requests for this context will be zapped, and any
> + * future request will be after userspace has had the opportunity
> + * to recreate its own state.
> + */
> + GEM_BUG_ON(!intel_context_is_pinned(ce));
> + intel_lr_context_restore_default_state(ce, engine);
> +
> +out_replay:
> + GEM_TRACE("%s replay {head:%04x, tail:%04x}\n",
> + engine->name, ce->ring->head, ce->ring->tail);
> + intel_ring_update_space(ce->ring);
> + intel_lr_context_reset_reg_state(ce, engine);
> + intel_lr_context_update_reg_state(ce, engine);
> + ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
> +
> +unwind:
> + /* Push back any incomplete requests for replay after the reset. */
> + cancel_port_requests(execlists);
> + __unwind_incomplete_requests(engine);
> +}
> +
> +static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
> +{
> + unsigned long flags;
> +
> + GEM_TRACE("%s\n", engine->name);
> +
> + spin_lock_irqsave(&engine->active.lock, flags);
> +
> + __execlists_reset(engine, stalled);
> +
> + spin_unlock_irqrestore(&engine->active.lock, flags);
> +}
> +
> +static void nop_submission_tasklet(unsigned long data)
> +{
> + /* The driver is wedged; don't process any more events. */
> +}
> +
> +static void execlists_cancel_requests(struct intel_engine_cs *engine)
> +{
> + struct intel_engine_execlists * const execlists = &engine->execlists;
> + struct i915_request *rq, *rn;
> + struct rb_node *rb;
> + unsigned long flags;
> +
> + GEM_TRACE("%s\n", engine->name);
> +
> + /*
> + * Before we call engine->cancel_requests(), we should have exclusive
> + * access to the submission state. This is arranged for us by the
> + * caller disabling the interrupt generation, the tasklet and other
> + * threads that may then access the same state, giving us a free hand
> + * to reset state. However, we still need to let lockdep be aware that
> + * we know this state may be accessed in hardirq context, so we
> + * disable the irq around this manipulation and we want to keep
> + * the spinlock focused on its duties and not accidentally conflate
> + * coverage to the submission's irq state. (Similarly, although we
> + * shouldn't need to disable irq around the manipulation of the
> + * submission's irq state, we also wish to remind ourselves that
> + * it is irq state.)
> + */
> + spin_lock_irqsave(&engine->active.lock, flags);
> +
> + __execlists_reset(engine, true);
> +
> + /* Mark all executing requests as skipped. */
> + list_for_each_entry(rq, &engine->active.requests, sched.link)
> + mark_eio(rq);
> +
> + /* Flush the queued requests to the timeline list (for retiring). */
> + while ((rb = rb_first_cached(&execlists->queue))) {
> + struct i915_priolist *p = to_priolist(rb);
> + int i;
> +
> + priolist_for_each_request_consume(rq, rn, p, i) {
> + mark_eio(rq);
> + __i915_request_submit(rq);
> + }
> +
> + rb_erase_cached(&p->node, &execlists->queue);
> + i915_priolist_free(p);
> + }
> +
> + /* Cancel all attached virtual engines */
> + while ((rb = rb_first_cached(&execlists->virtual))) {
> + struct intel_virtual_engine *ve =
> + rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
> +
> + rb_erase_cached(rb, &execlists->virtual);
> + RB_CLEAR_NODE(rb);
> +
> + spin_lock(&ve->base.active.lock);
> + rq = fetch_and_zero(&ve->request);
> + if (rq) {
> + mark_eio(rq);
> +
> + rq->engine = engine;
> + __i915_request_submit(rq);
> + i915_request_put(rq);
> +
> + ve->base.execlists.queue_priority_hint = INT_MIN;
> + }
> + spin_unlock(&ve->base.active.lock);
> + }
> +
> + /* Remaining _unready_ requests will be nop'ed when submitted */
> +
> + execlists->queue_priority_hint = INT_MIN;
> + execlists->queue = RB_ROOT_CACHED;
> +
> + GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
> + execlists->tasklet.func = nop_submission_tasklet;
> +
> + spin_unlock_irqrestore(&engine->active.lock, flags);
> +}
> +
> +static void execlists_reset_finish(struct intel_engine_cs *engine)
> +{
> + struct intel_engine_execlists * const execlists = &engine->execlists;
> +
> + /*
> + * After a GPU reset, we may have requests to replay. Do so now while
> + * we still have the forcewake to be sure that the GPU is not allowed
> + * to sleep before we restart and reload a context.
> + */
> + GEM_BUG_ON(!reset_in_progress(execlists));
> + if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
> + execlists->tasklet.func(execlists->tasklet.data);
> +
> + if (__tasklet_enable(&execlists->tasklet))
> + /* And kick in case we missed a new request submission. */
> + tasklet_hi_schedule(&execlists->tasklet);
> + GEM_TRACE("%s: depth->%d\n", engine->name,
> + atomic_read(&execlists->tasklet.count));
> +}
> +
> +static void execlists_park(struct intel_engine_cs *engine)
> +{
> + cancel_timer(&engine->execlists.timer);
> + cancel_timer(&engine->execlists.preempt);
> +}
> +
> +static void execlists_destroy(struct intel_engine_cs *engine)
> +{
> + /* Synchronise with residual timers and any softirq they raise */
> + del_timer_sync(&engine->execlists.timer);
> + del_timer_sync(&engine->execlists.preempt);
> + tasklet_kill(&engine->execlists.tasklet);
> +
> + intel_logical_ring_destroy(engine);
> +}
> +
> +void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
> +{
> + engine->request_alloc = execlists_request_alloc;
> + engine->submit_request = execlists_submit_request;
> + engine->cancel_requests = execlists_cancel_requests;
> + engine->schedule = i915_schedule;
> + engine->execlists.tasklet.func = execlists_submission_tasklet;
> +
> + engine->reset.prepare = execlists_reset_prepare;
> + engine->reset.reset = execlists_reset;
> + engine->reset.finish = execlists_reset_finish;
> +
> + engine->destroy = execlists_destroy;
> + engine->park = execlists_park;
> + engine->unpark = NULL;
> +
> + engine->flags |= I915_ENGINE_SUPPORTS_STATS;
> + if (!intel_vgpu_active(engine->i915)) {
> + engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
> + if (HAS_LOGICAL_RING_PREEMPTION(engine->i915))
> + engine->flags |= I915_ENGINE_HAS_PREEMPTION;
> + }
> +
> + if (INTEL_GEN(engine->i915) >= 12)
> + engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
> +}
> +
> +int intel_execlists_submission_setup(struct intel_engine_cs *engine)
> +{
> + tasklet_init(&engine->execlists.tasklet,
> + execlists_submission_tasklet, (unsigned long)engine);
> + timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
> + timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
> +
> + intel_logical_ring_setup(engine);
> +
> + engine->set_default_submission = intel_execlists_set_default_submission;
> + engine->cops = &execlists_context_ops;
> +
> + return 0;
> +}
> +
> +int intel_execlists_submission_init(struct intel_engine_cs *engine)
> +{
> + struct intel_engine_execlists * const execlists = &engine->execlists;
> + struct drm_i915_private *i915 = engine->i915;
> + struct intel_uncore *uncore = engine->uncore;
> + u32 base = engine->mmio_base;
> + int ret;
> +
> + ret = intel_logical_ring_init(engine);
> + if (ret)
> + return ret;
> +
> + if (HAS_LOGICAL_RING_ELSQ(i915)) {
> + execlists->submit_reg = uncore->regs +
> + i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
> + execlists->ctrl_reg = uncore->regs +
> + i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
> + } else {
> + execlists->submit_reg = uncore->regs +
> + i915_mmio_reg_offset(RING_ELSP(base));
> + }
> +
> + execlists->csb_status =
> + &engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
> +
> + execlists->csb_write =
> + &engine->status_page.addr[intel_hws_csb_write_index(i915)];
> +
> + if (INTEL_GEN(i915) < 11)
> + execlists->csb_size = GEN8_CSB_ENTRIES;
> + else
> + execlists->csb_size = GEN11_CSB_ENTRIES;
> +
> + reset_csb_pointers(engine);
> +
> + return 0;
> +}
> +
> +static intel_engine_mask_t
> +virtual_submission_mask(struct intel_virtual_engine *ve)
> +{
> + struct i915_request *rq;
> + intel_engine_mask_t mask;
> +
> + rq = READ_ONCE(ve->request);
> + if (!rq)
> + return 0;
> +
> + /* The rq is ready for submission; rq->execution_mask is now stable. */
> + mask = rq->execution_mask;
> + if (unlikely(!mask)) {
> + /* Invalid selection, submit to a random engine in error */
> + i915_request_skip(rq, -ENODEV);
> + mask = ve->siblings[0]->mask;
> + }
> +
> + GEM_TRACE("%s: rq=%llx:%lld, mask=%x, prio=%d\n",
> + ve->base.name,
> + rq->fence.context, rq->fence.seqno,
> + mask, ve->base.execlists.queue_priority_hint);
> +
> + return mask;
> +}
> +
> +static void virtual_submission_tasklet(unsigned long data)
> +{
> + struct intel_virtual_engine * const ve =
> + (struct intel_virtual_engine *)data;
> + const int prio = ve->base.execlists.queue_priority_hint;
> + intel_engine_mask_t mask;
> + unsigned int n;
> +
> + rcu_read_lock();
> + mask = virtual_submission_mask(ve);
> + rcu_read_unlock();
> + if (unlikely(!mask))
> + return;
> +
> + local_irq_disable();
> + for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) {
> + struct intel_engine_cs *sibling = ve->siblings[n];
> + struct ve_node * const node = &ve->nodes[sibling->id];
> + struct rb_node **parent, *rb;
> + bool first;
> +
> + if (unlikely(!(mask & sibling->mask))) {
> + if (!RB_EMPTY_NODE(&node->rb)) {
> + spin_lock(&sibling->active.lock);
> + rb_erase_cached(&node->rb,
> + &sibling->execlists.virtual);
> + RB_CLEAR_NODE(&node->rb);
> + spin_unlock(&sibling->active.lock);
> + }
> + continue;
> + }
> +
> + spin_lock(&sibling->active.lock);
> +
> + if (!RB_EMPTY_NODE(&node->rb)) {
> + /*
> + * Cheat and avoid rebalancing the tree if we can
> + * reuse this node in situ.
> + */
> + first = rb_first_cached(&sibling->execlists.virtual) ==
> + &node->rb;
> + if (prio == node->prio || (prio > node->prio && first))
> + goto submit_engine;
> +
> + rb_erase_cached(&node->rb, &sibling->execlists.virtual);
> + }
> +
> + rb = NULL;
> + first = true;
> + parent = &sibling->execlists.virtual.rb_root.rb_node;
> + while (*parent) {
> + struct ve_node *other;
> +
> + rb = *parent;
> + other = rb_entry(rb, typeof(*other), rb);
> + if (prio > other->prio) {
> + parent = &rb->rb_left;
> + } else {
> + parent = &rb->rb_right;
> + first = false;
> + }
> + }
> +
> + rb_link_node(&node->rb, rb, parent);
> + rb_insert_color_cached(&node->rb,
> + &sibling->execlists.virtual,
> + first);
> +
> +submit_engine:
> + GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
> + node->prio = prio;
> + if (first && prio > sibling->execlists.queue_priority_hint) {
> + sibling->execlists.queue_priority_hint = prio;
> + tasklet_hi_schedule(&sibling->execlists.tasklet);
> + }
> +
> + spin_unlock(&sibling->active.lock);
> + }
> + local_irq_enable();
> +}
> +
> +static void virtual_submit_request(struct i915_request *rq)
> +{
> + struct intel_virtual_engine *ve = to_virtual_engine(rq->engine);
> + struct i915_request *old;
> + unsigned long flags;
> +
> + GEM_TRACE("%s: rq=%llx:%lld\n",
> + ve->base.name,
> + rq->fence.context,
> + rq->fence.seqno);
> +
> + GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
> +
> + spin_lock_irqsave(&ve->base.active.lock, flags);
> +
> + old = ve->request;
> + if (old) { /* background completion event from preempt-to-busy */
> + GEM_BUG_ON(!i915_request_completed(old));
> + __i915_request_submit(old);
> + i915_request_put(old);
> + }
> +
> + if (i915_request_completed(rq)) {
> + __i915_request_submit(rq);
> +
> + ve->base.execlists.queue_priority_hint = INT_MIN;
> + ve->request = NULL;
> + } else {
> + ve->base.execlists.queue_priority_hint = rq_prio(rq);
> + ve->request = i915_request_get(rq);
> +
> + GEM_BUG_ON(!list_empty(intel_virtual_engine_queue(ve)));
> + list_move_tail(&rq->sched.link, intel_virtual_engine_queue(ve));
> +
> + tasklet_schedule(&ve->base.execlists.tasklet);
> + }
> +
> + spin_unlock_irqrestore(&ve->base.active.lock, flags);
> +}
> +
> +static void
> +virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
> +{
> + struct intel_virtual_engine *ve = to_virtual_engine(rq->engine);
> + intel_engine_mask_t allowed, exec;
> + struct ve_bond *bond;
> +
> + allowed = ~to_request(signal)->engine->mask;
> +
> + bond = intel_virtual_engine_find_bond(ve, to_request(signal)->engine);
> + if (bond)
> + allowed &= bond->sibling_mask;
> +
> + /* Restrict the bonded request to run on only the available engines */
> + exec = READ_ONCE(rq->execution_mask);
> + while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
> + ;
> +
> + /* Prevent the master from being re-run on the bonded engines */
> + to_request(signal)->execution_mask &= ~allowed;
> +}
> +
> +void intel_execlists_virtual_submission_init(struct intel_virtual_engine *ve)
> +{
> + ve->base.request_alloc = execlists_request_alloc;
> + ve->base.submit_request = virtual_submit_request;
> + ve->base.bond_execute = virtual_bond_execute;
> + tasklet_init(&ve->base.execlists.tasklet,
> + virtual_submission_tasklet,
> + (unsigned long)ve);
> +}
> +
> +void intel_execlists_show_requests(struct intel_engine_cs *engine,
> + struct drm_printer *m,
> + void (*show_request)(struct drm_printer *m,
> + struct i915_request *rq,
> + const char *prefix),
> + unsigned int max)
> +{
> + const struct intel_engine_execlists *execlists = &engine->execlists;
> + struct i915_request *rq, *last;
> + unsigned long flags;
> + unsigned int count;
> + struct rb_node *rb;
> +
> + spin_lock_irqsave(&engine->active.lock, flags);
> +
> + last = NULL;
> + count = 0;
> + list_for_each_entry(rq, &engine->active.requests, sched.link) {
> + if (count++ < max - 1)
> + show_request(m, rq, "\t\tE ");
> + else
> + last = rq;
> + }
> + if (last) {
> + if (count > max) {
> + drm_printf(m,
> + "\t\t...skipping %d executing requests...\n",
> + count - max);
> + }
> + show_request(m, last, "\t\tE ");
> + }
> +
> + last = NULL;
> + count = 0;
> + if (execlists->queue_priority_hint != INT_MIN)
> + drm_printf(m, "\t\tQueue priority hint: %d\n",
> + execlists->queue_priority_hint);
> + for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
> + struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
> + int i;
> +
> + priolist_for_each_request(rq, p, i) {
> + if (count++ < max - 1)
> + show_request(m, rq, "\t\tQ ");
> + else
> + last = rq;
> + }
> + }
> + if (last) {
> + if (count > max) {
> + drm_printf(m,
> + "\t\t...skipping %d queued requests...\n",
> + count - max);
> + }
> + show_request(m, last, "\t\tQ ");
> + }
> +
> + last = NULL;
> + count = 0;
> + for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
> + struct intel_virtual_engine *ve =
> + rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
> + struct i915_request *rq = READ_ONCE(ve->request);
> +
> + if (rq) {
> + if (count++ < max - 1)
> + show_request(m, rq, "\t\tV ");
> + else
> + last = rq;
> + }
> + }
> + if (last) {
> + if (count > max) {
> + drm_printf(m,
> + "\t\t...skipping %d virtual requests...\n",
> + count - max);
> + }
> + show_request(m, last, "\t\tV ");
> + }
> +
> + spin_unlock_irqrestore(&engine->active.lock, flags);
> +}
> +
> +bool
> +intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
> +{
> + return engine->set_default_submission ==
> + intel_execlists_set_default_submission;
> +}
The breadcrumb submission code is specialised to execlists and should
not be shared (that split leaves emit_flush and emit_bb_start as the
common code in gen8_submission.c). The reset code is specialised to
execlists and should not be shared. The virtual engine is specialised
to execlists and should not be shared. Even submit_request should be
distinct between guc and execlists, and especially request_alloc
(which you may like to put on the context_ops rather than on the
engine).
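
As a rough sketch of that last suggestion (only the hooks already shown
in the patch above are real; the request_alloc member and its call site
are hypothetical):

	/* Sketch only: intel_context_ops has no request_alloc hook today. */
	struct intel_context_ops {
		int (*alloc)(struct intel_context *ce);

		int (*pin)(struct intel_context *ce);
		void (*unpin)(struct intel_context *ce);

		/* hypothetical: per-backend request setup, replacing engine->request_alloc */
		int (*request_alloc)(struct i915_request *rq);

		void (*enter)(struct intel_context *ce);
		void (*exit)(struct intel_context *ce);

		void (*reset)(struct intel_context *ce);
		void (*destroy)(struct kref *kref);
	};

	static const struct intel_context_ops execlists_context_ops = {
		.alloc = execlists_context_alloc,
		.pin = execlists_context_pin,
		.unpin = intel_lr_context_unpin,
		.request_alloc = execlists_request_alloc, /* hypothetical hook */
		.enter = intel_context_enter_engine,
		.exit = intel_context_exit_engine,
		.reset = execlists_context_reset,
		.destroy = execlists_context_destroy,
	};

The guc backend would then register its own table with a guc-specific
request_alloc, and request construction would call through
rq->hw_context->ops->request_alloc(rq) rather than the engine vfunc, so
each submission backend keeps its request setup private.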
-Chris